1
2
3 __version__ = "0.1.2"
4 __url__ = "http://trac.defuze.org/wiki/amplee"
5 __authors__ = ["Sylvain Hellegouarch (sh@defuze.org)"]
6 __date__ = "2007/27/12"
7 __copyright__ = """
8 Copyright (c) 2007 Sylvain Hellegouarch
9 All rights reserved.
10 """
11 __license__ = """
12 Redistribution and use in source and binary forms, with or without modification,
13 are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17 * Redistributions in binary form must reproduce the above copyright notice,
18 this list of conditions and the following disclaimer in the documentation
19 and/or other materials provided with the distribution.
20 * Neither the name of Sylvain Hellegouarch nor the names of his contributors
21 may be used to endorse or promote products derived from this software
22 without specific prior written permission.
23
24 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
25 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
26 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
27 DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
28 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
30 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
31 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
32 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
33 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34 """
35
36 __doc__ = """
37 Crawls through an Atom Publishing protocol service document and
38 will perform some validation, testing against the AtomPub remote service.
39 """
40
41
42
43
44
45
46
47
48
49
50 try:
51 import cStringIO as StringIO
52 except ImportError:
53 import StringIO
54
55 import sys
56 import logging
57 import random
58 from ConfigParser import ConfigParser, NoOptionError
59 from urlparse import urljoin, urlparse
60 from urllib import quote
61 from xml.parsers.expat import ExpatError
62
63 import string
64 CHARS = string.letters + string.octdigits + ' _-'
65 SLUG_CHARS = CHARS.decode('utf-8') + u'\xe9\xe8\xf9\xf6'
66
67
68 try:
69 import httplib2
70 except ImportError:
71 print "You miss the httplib2 package. You must install it first."
72 print "Get it at http://bitworking.org/projects/httplib2/"
73 sys.exit(2)
74
75 try:
76 from bridge import Element as E
77 from bridge.filter.atom import lookup_links
78 from bridge.common import XML_NS, ATOM10_NS, ATOM10_PREFIX, ATOMPUB_NS, ATOMPUB_PREFIX,\
79 atom_as_attr, atom_as_list, atom_attribute_of_element
80
81 atom_attribute_of_element[XML_NS] = ['lang', 'base', 'id']
82 except ImportError:
83 print "You miss the bridge package. You must install it first."
84 print "Get it at http://trac.defuze.org/wiki/bridge"
85 sys.exit(2)
86
87
88 try:
89 from atomschematron import validate as schematron_validate
90 except ImportError:
91 schematron_validate = None
92
93 __all__ = ['Options', 'Crawler']
94
97 """
98 Dummy class to carry different options used throughout the process.
99
100 When you use the Crawler class from your own code and not as a module
101 you must pass an instance of this class.
102
103 The ``service_uri`` is the only compulsory value to be set.
104 """
105 self.service_uri = service_uri
106 self.username = None
107 self.password = None
108 self.validate = True
109 self.output = None
110 self.test_mapping = None
111 self.debuglevel = 0
112 self.base_uri = ''
113
115 """
116 Dummy wrapper to pass to the validator which expects a stream.
117 """
120
122 self.logger.debug(msg)
123
126 self.options = options
127 self.create_logger()
128
129 httplib2.debuglevel = options.debuglevel
130
131 if not schematron_validate:
132 self.log("Could not import the schematron validator."\
133 "You may be missing the Amara package.\n"\
134 "Validation is disabled.\n")
135 options.validate = False
136
137 if self.options.test_mapping:
138 self.prepare_test_mapping()
139
141 self.logger = logging.getLogger("crawler.logger")
142 if not self.options.output:
143 hdlr = logging.StreamHandler()
144 else:
145 hdlr = logging.FileHandler(self.options.output, 'w')
146
147 self.logger.addHandler(hdlr)
148 self.logger.setLevel(logging.DEBUG)
149
151 if isinstance(self.options.test_mapping, dict):
152 return
153
154 if isinstance(self.options.test_mapping, str):
155 data = file(self.options.test_mapping, 'r')
156
157 c = ConfigParser()
158 c.readfp(data)
159
160 mapping = {}
161 for media_type in c.sections():
162 post = put = None
163 try:
164 post = c.get(media_type, 'post')
165 except NoOptionError:
166 pass
167
168 mapping[media_type] = {'post': post}
169
170 self.options.test_mapping = mapping
171
173 if isinstance(slug, unicode):
174 return quote(slug.encode(encoding), safe=' /')
175 return quote(slug, safe=' /')
176
179
def get_random_text(self):
    """
    Build a random 20-character string.

    Each character is picked independently with ``random.choice`` from the
    module-level ``CHARS`` alphabet, so characters may repeat.
    """
    picked = []
    for _ in range(20):
        picked.append(random.choice(CHARS))
    return ''.join(picked)
182
def log(self, msg, level=logging.DEBUG):
    """
    Forward ``msg`` to ``self.logger`` at the given ``level``.

    ``level`` defaults to ``logging.DEBUG``. Nothing is emitted when
    ``self.logger`` is falsy (for instance ``None``).
    """
    sink = self.logger
    if not sink:
        return
    sink.log(level, msg)
186
def request(self, url, method='GET', body=None, headers=None):
    """
    Issue an HTTP request through a fresh ``httplib2.Http`` object.

    ``url`` may be a UTF-8 encoded byte string or a ``unicode`` object.
    ``method``, ``body`` and ``headers`` are handed to ``Http.request``
    unchanged. When ``self.options.username`` is set, the username and
    password from the options are registered as credentials first.

    Returns whatever ``Http.request`` returns; callers in this module
    unpack it as a ``(response, content)`` pair.
    """
    client = httplib2.Http()
    if self.options.username:
        client.add_credentials(self.options.username, self.options.password)
    # Only byte strings need decoding. Calling .decode() on a unicode object
    # makes Python 2 encode it to ASCII first, which raises
    # UnicodeEncodeError as soon as the URL holds a non-ASCII character.
    if not isinstance(url, unicode):
        url = url.decode('utf-8')
    return client.request(url, method=method, body=body, headers=headers)
193
195 tokens = urlparse(href)
196 if not tokens[0]:
197 return urljoin(base_uri, href)
198 return href
199
201 try:
202 return E.load(r, as_attribute=atom_as_attr, as_list=atom_as_list,
203 as_attribute_of_element=atom_attribute_of_element).xml_root
204 except ExpatError:
205 print r
206 self.log(r, logging.ERROR)
207 raise
208
def text(self, e, name, ns):
    """
    Return the ``xml_text`` of the child of ``e`` named ``name`` in
    namespace ``ns``.

    This is a best-effort accessor used when logging: if the lookup fails
    for any ordinary reason (for example ``get_child`` hands back an object
    without ``xml_text``), an empty string is returned instead of raising.
    """
    try:
        return e.get_child(name, ns).xml_text
    except Exception:
        # Deliberately broad, but no longer a bare ``except``: interpreter
        # exits and Ctrl-C (SystemExit, KeyboardInterrupt) must propagate.
        return ''
214
def check(self, token, expected):
    """
    Compare ``token`` with ``expected`` and return a marker string.

    * When ``expected`` is a ``str``, ``token`` must be equal to it.
    * When ``expected`` is a ``list``, ``token`` must be one of its items.
    * Any other kind of ``expected`` is not checked at all.

    A mismatch is logged at ``logging.ERROR`` level through ``self.log``
    and yields ``"[INVALID]"``; every other case yields ``"[VALID]"``.
    """
    problem = None
    if isinstance(expected, str):
        if token != expected:
            problem = "ERROR: Expected %s but found %s" % (expected, token)
    elif isinstance(expected, list):
        if token not in expected:
            problem = "ERROR: Expected to be in [%s] but found %s" % (', '.join(expected), token)

    if problem is None:
        return "[VALID]"

    self.log(problem, logging.ERROR)
    return "[INVALID]"
225
227 r, c = self.request(self.options.service_uri)
228
229 s = self.parse_response(c)
230
231 self.log("|-- Service")
232 self.log(" Media-type: %s %s" % (r['content-type'],
233 self.check(r['content-type'],
234 'application/atomsvc+xml')))
235
236 self.log(" xml:id: %s" % s.get_attribute_value('id', ''))
237 self.log(" xml:lang: %s" % s.get_attribute_value('lang', ''))
238 base_uri = s.get_attribute_value('base', '')
239 self.log(" xml:base: %s" % base_uri)
240 self.log(" Number of workspaces: %d" % len(s.workspace))
241
242 if self.options.validate:
243
244 schematron_validate(StringIO.StringIO(c),
245 LoggerWrap(self.logger),
246 phase="AtomPubService")
247
248 self.log("\n")
249
250 for w in s.workspace:
251 self.handle_workspace(w, base_uri=base_uri)
252
253
255 self.log(" |-- Workspace")
256 self.log(" xml:lang: %s" % w.get_attribute_value('lang', ''))
257 self.log(" xml:id: %s" % w.get_attribute_value('id', ''))
258 base = w.get_attribute_value('base', self.options.base_uri)
259 self.log(" xml:base: %s" % base)
260 self.log(" Title: %s" % self.text(w, 'title', ATOM10_NS))
261 collections = w.get_children('collection', ATOMPUB_NS)
262 self.log(" Number of collections: %d" % len(collections))
263 for c in w.collection:
264 self.handle_collection(c, base or base_uri)
265 self.log("\n")
266
268 self.log(" |-- Collection")
269 self.log(" xml:lang: %s" % c.get_attribute_value('lang', ''))
270 self.log(" xml:id: %s" % c.get_attribute_value('id', ''))
271 self.log(" xml:base: %s" % c.get_attribute_value('base', ''))
272 self.log(" Title: %s" % self.text(c, 'title', ATOM10_NS))
273 self.log(" Href: %s" % str(c.href))
274
275 accepts = c.get_children('accept', ATOMPUB_NS)
276 if len(accepts) == 1 and accepts[0].xml_text == None:
277 self.log(" The only app:accept element is empty, the collection may reject POST requests")
278 else:
279 for a in accepts:
280 self.handle_accept(a)
281
282 categories = c.get_children('categories', ATOMPUB_NS)
283 for categories_elem in categories:
284 self.handle_categories(categories_elem, base_uri)
285
286 self.handle_collection_feed(c, base_uri)
287
288 if self.options.test_mapping and accepts:
289 self.perform_tests(c, accepts, base_uri)
290
292 r, c = self.request(self.build_url(str(c.href), base_uri))
293 self.log(" |-- Feed")
294 self.log(" Media-type: %s" % r['content-type'])
295
296 f = self.parse_response(c)
297 entries = f.get_children('entry', ATOM10_NS)
298 self.log(" Number of members: %d" % len(entries))
299 self.log(" ID: %s" % self.text(f, 'id', ATOM10_NS))
300 self.log(" Title: %s" % self.text(f, 'title', ATOM10_NS))
301 self.log(" Updated: %s" % self.text(f, 'updated', ATOM10_NS))
302
303 if self.options.validate:
304 schematron_validate(StringIO.StringIO(c),
305 LoggerWrap(self.logger),
306 phase="AtomFeed")
307
309 self.log(" Accept: %s" % a.xml_text)
310
319
321 self.log(" |-- Category")
322 self.log(" Term: %s" % c.get_attribute_value('term'))
323 self.log(" Label: %s" % c.get_attribute_value('label'))
324 self.log(" Scheme: %s" % c.get_attribute_value('scheme'))
325