Package amplee :: Package contrib :: Module crawler
[hide private]
[frames | no frames]

Source Code for Module amplee.contrib.crawler

  1  # -*- coding: utf-8 -*- 
  2   
  3  __version__ = "0.1.2" 
  4  __url__ = "http://trac.defuze.org/wiki/amplee" 
  5  __authors__ = ["Sylvain Hellegouarch (sh@defuze.org)"] 
  6  __date__ = "2007/27/12" 
  7  __copyright__ = """ 
  8  Copyright (c) 2007 Sylvain Hellegouarch 
  9  All rights reserved. 
 10  """ 
 11  __license__ = """ 
 12  Redistribution and use in source and binary forms, with or without modification,  
 13  are permitted provided that the following conditions are met: 
 14    
 15       * Redistributions of source code must retain the above copyright notice,  
 16         this list of conditions and the following disclaimer. 
 17       * Redistributions in binary form must reproduce the above copyright notice,  
 18         this list of conditions and the following disclaimer in the documentation  
 19         and/or other materials provided with the distribution. 
 20       * Neither the name of Sylvain Hellegouarch nor the names of his contributors  
 21         may be used to endorse or promote products derived from this software  
 22         without specific prior written permission. 
 23    
 24  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND  
 25  ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED  
 26  WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE  
 27  DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE  
 28  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL  
 29  DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR  
 30  SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER  
 31  CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,  
 32  OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE  
 33  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
 34  """ 
 35   
 36  __doc__ = """ 
 37  Crawls through an Atom Publishing protocol service document and 
 38  will perform some validation, testing against the AtomPub remote service. 
 39  """ 
 40   
 41  # 
 42  # 0.1.2 - Fixed some unicode issues 
 43  #  
 44  # 0.1.1 - Thanks to Luis Peralta for the patch to read categories that would not be inline 
 45  # 
 46  # 0.1.0 - Initial version 
 47  # 
 48   
 49  # stdlib imports 
 50  try: 
 51      import cStringIO as StringIO 
 52  except ImportError: 
 53      import StringIO 
 54   
 55  import sys 
 56  import logging 
 57  import random 
 58  from ConfigParser import ConfigParser, NoOptionError 
 59  from urlparse import urljoin, urlparse 
 60  from urllib import quote 
 61  from xml.parsers.expat import ExpatError 
 62   
import string

# Alphabet used by Crawler.get_random_text: ASCII letters, the octal digits
# 0-7, plus space, underscore and hyphen.
# NOTE(review): string.octdigits leaves out '8' and '9' -- possibly
# string.digits was intended; confirm before changing.
CHARS = string.letters + string.octdigits + ' _-'
# Unicode alphabet used by Crawler.get_random_slug: CHARS plus four accented
# Latin-1 letters (e-acute, e-grave, u-grave, o-diaeresis), so generated
# slugs can contain non-ASCII characters.
SLUG_CHARS = CHARS.decode('utf-8') + u'\xe9\xe8\xf9\xf6'
 66   
 67  # third-party products 
 68  try: 
 69      import httplib2 
 70  except ImportError: 
 71      print "You miss the httplib2 package. You must install it first." 
 72      print "Get it at http://bitworking.org/projects/httplib2/" 
 73      sys.exit(2) 
 74   
 75  try: 
 76      from bridge import Element as E 
 77      from bridge.filter.atom import lookup_links 
 78      from bridge.common import XML_NS, ATOM10_NS, ATOM10_PREFIX, ATOMPUB_NS, ATOMPUB_PREFIX,\ 
 79           atom_as_attr, atom_as_list, atom_attribute_of_element 
 80   
 81      atom_attribute_of_element[XML_NS] = ['lang', 'base', 'id'] 
 82  except ImportError: 
 83      print "You miss the bridge package. You must install it first." 
 84      print "Get it at http://trac.defuze.org/wiki/bridge" 
 85      sys.exit(2) 
 86       
 87  # local imports 
 88  try: 
 89      from atomschematron import validate as schematron_validate 
 90  except ImportError: 
 91      schematron_validate = None 
 92       
 93  __all__ = ['Options', 'Crawler'] 
 94   
class Options(object):
    """
    Plain container for the settings consulted throughout a crawl.

    An instance of this class must be handed to ``Crawler`` whenever the
    crawler is driven from your own code and not as a module.

    Only ``service_uri`` is compulsory; every other attribute starts with
    a default and can be overridden after construction.
    """

    def __init__(self, service_uri):
        # Optional settings, all initialised to their defaults.
        self.base_uri = ''          # fallback xml:base for workspaces
        self.debuglevel = 0         # copied to httplib2.debuglevel
        self.test_mapping = None    # dict, or path of an INI mapping file
        self.output = None          # log file path; falsy -> StreamHandler
        self.validate = True        # run the schematron validation
        self.password = None        # HTTP credentials, used only when
        self.username = None        # a username is set

        # The one value the caller has to provide.
        self.service_uri = service_uri
113
class LoggerWrap(object):
    """
    Minimal stream look-alike around a logger.

    The validator expects an object with a ``write`` method; this adapter
    forwards whatever is written to the wrapped logger at DEBUG level.
    """

    def __init__(self, logger):
        # Any object exposing ``debug(msg)`` will do.
        self.logger = logger

    def write(self, msg):
        """Forward ``msg`` to the wrapped logger's ``debug`` method."""
        emit = self.logger.debug
        emit(msg)
123
124 -class Crawler(object):
def __init__(self, options):
    """
    Set up a crawler driven by ``options`` (an ``Options``-like object).

    Creates the logger and copies ``options.debuglevel`` to
    ``httplib2.debuglevel``. When the schematron validator could not be
    imported, a notice is logged and ``options.validate`` is switched off
    on the object that was passed in. If ``options.test_mapping`` is set,
    it is normalised through ``prepare_test_mapping``.
    """
    self.options = options
    self.create_logger()

    httplib2.debuglevel = options.debuglevel

    if not schematron_validate:
        # Adjacent literals are concatenated verbatim, so the first
        # sentence needs its own trailing space.
        self.log("Could not import the schematron validator. "
                 "You may be missing the Amara package.\n"
                 "Validation is disabled.\n")
        options.validate = False

    if options.test_mapping:
        self.prepare_test_mapping()
139
def create_logger(self):
    """
    Attach a handler to the shared "crawler.logger" logger.

    Messages go to a ``logging.StreamHandler`` unless ``options.output``
    is set, in which case it is opened as a log file in 'w' mode. The
    logger level is set to DEBUG.

    The logger is looked up by a fixed name, so every call gets the same
    logger object. The handler installed by an earlier call is therefore
    replaced rather than stacked; stacking would repeat each message once
    per call. Handlers attached by other code are left alone.
    """
    self.logger = logging.getLogger("crawler.logger")

    # Iterate over a copy: removeHandler() mutates logger.handlers.
    for stale in list(self.logger.handlers):
        if getattr(stale, '_crawler_owned', False):
            self.logger.removeHandler(stale)
            stale.close()

    if self.options.output:
        hdlr = logging.FileHandler(self.options.output, 'w')
    else:
        hdlr = logging.StreamHandler()
    # Marker used above to recognise our own handler on the next call.
    hdlr._crawler_owned = True

    self.logger.addHandler(hdlr)
    self.logger.setLevel(logging.DEBUG)
149
def prepare_test_mapping(self):
    """
    Normalise ``self.options.test_mapping`` into a dictionary.

    A dictionary is left untouched. Anything else is read as an INI-style
    description: either a path, or an already opened file-like object.
    Each section is named after a media type and may carry a ``post``
    option. The stored result has the shape
    ``{media_type: {'post': value_or_None}}``.

    Raises IOError when a path is given and the file cannot be read.
    """
    source = self.options.test_mapping
    if isinstance(source, dict):
        return

    parser = ConfigParser()
    if hasattr(source, 'readline'):
        # Caller-owned stream: parse it and leave closing to the caller.
        parser.readfp(source)
    elif not parser.read([source]):
        # read() opens and closes the file itself but silently skips
        # paths it cannot open, so a miss is reported explicitly.
        raise IOError("Cannot read test mapping file: %r" % (source,))

    mapping = {}
    for media_type in parser.sections():
        try:
            post = parser.get(media_type, 'post')
        except NoOptionError:
            # Section without a 'post' entry: nothing to POST for it.
            post = None
        mapping[media_type] = {'post': post}

    self.options.test_mapping = mapping
171
def encode_slug(self, slug, encoding='utf-8'):
    """
    Percent-encode ``slug``.

    A unicode slug is first encoded to a byte string with ``encoding``;
    a byte string is quoted as is. Spaces and slashes are kept unescaped.
    """
    raw = slug
    if isinstance(raw, unicode):
        raw = raw.encode(encoding)
    return quote(raw, safe=' /')
176
def get_random_slug(self):
    """
    Return a percent-encoded slug built from 20 characters drawn at
    random from ``SLUG_CHARS``.
    """
    picked = []
    for _ in range(20):
        picked.append(random.choice(SLUG_CHARS))
    return self.encode_slug(''.join(picked))
179
def get_random_text(self):
    """Return 20 characters drawn at random from ``CHARS``."""
    picked = []
    for _ in range(20):
        picked.append(random.choice(CHARS))
    return ''.join(picked)
182
def log(self, msg, level=logging.DEBUG):
    """
    Send ``msg`` to the crawler's logger at ``level`` (DEBUG by default).

    Nothing happens when ``self.logger`` is falsy.
    """
    if not self.logger:
        return
    self.logger.log(level, msg)
186
def request(self, url, method='GET', body=None, headers=None):
    """
    Issue one HTTP request through a fresh ``httplib2.Http`` client.

    Credentials from the options are registered on the client when
    ``options.username`` is set. ``url`` may be a UTF-8 byte string or a
    unicode string. Returns whatever ``Http.request`` returns; callers in
    this module unpack it as a ``(response, content)`` pair.
    """
    h = httplib2.Http()
    if self.options.username:
        h.add_credentials(self.options.username, self.options.password)

    # Decode byte strings only. Calling .decode() on a unicode object makes
    # Python 2 ASCII-encode it first, which raises UnicodeEncodeError as
    # soon as the URL holds a non-ASCII character.
    if not isinstance(url, unicode):
        url = url.decode('utf-8')

    return h.request(url, method=method, body=body, headers=headers)
193
def build_url(self, href, base_uri=None):
    """
    Return ``href`` unchanged when it carries a URL scheme; otherwise
    return it joined onto ``base_uri`` with ``urljoin``.
    """
    scheme = urlparse(href)[0]
    if scheme:
        return href
    return urljoin(base_uri, href)
199
def parse_response(self, r):
    """
    Parse the XML payload ``r`` with bridge and return the ``xml_root``
    of the loaded document.

    On an ``ExpatError`` the payload is printed and logged at ERROR level
    before the exception is re-raised.
    """
    try:
        document = E.load(r,
                          as_attribute=atom_as_attr,
                          as_list=atom_as_list,
                          as_attribute_of_element=atom_attribute_of_element)
        return document.xml_root
    except ExpatError:
        # Show the offending payload before propagating the error.
        print(r)
        self.log(r, logging.ERROR)
        raise
208
def text(self, e, name, ns):
    """
    Return the ``xml_text`` of the child ``name`` (namespace ``ns``) of
    element ``e``, or '' when it cannot be obtained.

    Lookup errors are treated as "no text". KeyboardInterrupt and
    SystemExit are deliberately not swallowed.
    """
    try:
        return e.get_child(name, ns).xml_text
    except Exception:
        # Best-effort accessor: a failed lookup or a missing child simply
        # yields an empty string.
        return ''
214
def check(self, token, expected):
    """
    Compare ``token`` with the expected value(s) and return a marker.

    ``expected`` is either a list or tuple of acceptable values, or a
    single value compared for equality (byte or unicode string alike).
    A mismatch is logged at ERROR level.

    Returns "[VALID]" on a match and "[INVALID]" otherwise.
    """
    if isinstance(expected, (list, tuple)):
        if token not in expected:
            self.log("ERROR: Expected to be in [%s] but found %s" % (', '.join(expected), token), logging.ERROR)
            return "[INVALID]"
    elif token != expected:
        # Every non-sequence value is compared, so a unicode ``expected``
        # is not silently reported as valid.
        self.log("ERROR: Expected %s but found %s" % (expected, token), logging.ERROR)
        return "[INVALID]"
    return "[VALID]"
225
def handle_service(self):
    """
    Entry point of the crawl: fetch and report on the service document.

    The document at ``options.service_uri`` is requested and parsed, its
    media type is checked against 'application/atomsvc+xml', and its
    xml:id, xml:lang, xml:base and workspace count are logged. When
    ``options.validate`` is set the raw document is run through the
    schematron validator (phase "AtomPubService"). Each workspace is then
    handed to ``handle_workspace`` together with the service xml:base.
    """
    response, content = self.request(self.options.service_uri)
    service = self.parse_response(content)

    self.log("|-- Service")
    media_type = response['content-type']
    verdict = self.check(response['content-type'], 'application/atomsvc+xml')
    self.log(" Media-type: %s %s" % (media_type, verdict))

    self.log(" xml:id: %s" % service.get_attribute_value('id', ''))
    self.log(" xml:lang: %s" % service.get_attribute_value('lang', ''))
    base_uri = service.get_attribute_value('base', '')
    self.log(" xml:base: %s" % base_uri)
    self.log(" Number of workspaces: %d" % len(service.workspace))

    if self.options.validate:
        # Validate the service document with schematron; the validator
        # writes its findings to the logger through LoggerWrap.
        report = LoggerWrap(self.logger)
        schematron_validate(StringIO.StringIO(content), report,
                            phase="AtomPubService")

    self.log("\n")

    for workspace in service.workspace:
        self.handle_workspace(workspace, base_uri=base_uri)
252 253
def handle_workspace(self, w, base_uri=None):
    """
    Log the details of workspace element ``w`` and visit its collections.

    The workspace xml:base defaults to ``options.base_uri``. Collections
    are handled with that base when it is truthy, and with the
    ``base_uri`` argument otherwise.
    """
    self.log(" |-- Workspace")
    self.log(" xml:lang: %s" % w.get_attribute_value('lang', ''))
    self.log(" xml:id: %s" % w.get_attribute_value('id', ''))

    own_base = w.get_attribute_value('base', self.options.base_uri)
    self.log(" xml:base: %s" % own_base)
    self.log(" Title: %s" % self.text(w, 'title', ATOM10_NS))

    found = w.get_children('collection', ATOMPUB_NS)
    self.log(" Number of collections: %d" % len(found))

    effective_base = own_base or base_uri
    # NOTE(review): the count above comes from get_children() while the
    # loop walks the ``collection`` attribute -- presumably the same
    # elements; confirm against bridge.
    for member in w.collection:
        self.handle_collection(member, effective_base)
    self.log("\n")
266
def handle_collection(self, c, base_uri=None):
    """
    Log the details of collection element ``c`` and explore it.

    Reports the xml attributes, title and href, then the app:accept
    elements, with a notice when the only one is empty. Next it walks
    each app:categories element and inspects the collection feed. When a
    test mapping is configured and the collection has accept elements,
    the tests are run through ``perform_tests``.
    """
    self.log(" |-- Collection")
    self.log(" xml:lang: %s" % c.get_attribute_value('lang', ''))
    self.log(" xml:id: %s" % c.get_attribute_value('id', ''))
    self.log(" xml:base: %s" % c.get_attribute_value('base', ''))
    self.log(" Title: %s" % self.text(c, 'title', ATOM10_NS))
    self.log(" Href: %s" % str(c.href))

    accepts = c.get_children('accept', ATOMPUB_NS)
    # Identity test: None is a singleton and ``==`` could be overridden.
    if len(accepts) == 1 and accepts[0].xml_text is None:
        self.log(" The only app:accept element is empty, the collection may reject POST requests")
    else:
        for a in accepts:
            self.handle_accept(a)

    for categories_elem in c.get_children('categories', ATOMPUB_NS):
        self.handle_categories(categories_elem, base_uri)

    self.handle_collection_feed(c, base_uri)

    if self.options.test_mapping and accepts:
        self.perform_tests(c, accepts, base_uri)
290
def handle_collection_feed(self, c, base_uri=None):
    """
    Fetch the feed behind collection ``c`` and log a summary of it.

    The collection href is resolved against ``base_uri`` and requested.
    The response media type, number of entries, and the feed id, title
    and updated values are logged. When ``options.validate`` is set the
    raw feed is run through the schematron validator (phase "AtomFeed").
    """
    feed_url = self.build_url(str(c.href), base_uri)
    response, content = self.request(feed_url)

    self.log(" |-- Feed")
    self.log(" Media-type: %s" % response['content-type'])

    feed = self.parse_response(content)
    members = feed.get_children('entry', ATOM10_NS)
    self.log(" Number of members: %d" % len(members))
    for template, child in ((" ID: %s", 'id'),
                            (" Title: %s", 'title'),
                            (" Updated: %s", 'updated')):
        self.log(template % self.text(feed, child, ATOM10_NS))

    if not self.options.validate:
        return

    schematron_validate(StringIO.StringIO(content),
                        LoggerWrap(self.logger),
                        phase="AtomFeed")
307
def handle_accept(self, a):
    """Log the text content of one app:accept element."""
    accepted = a.xml_text
    self.log(" Accept: %s" % accepted)
310
def handle_categories(self, c, base_uri=None):
    """
    Log every category listed by one app:categories element ``c``.

    When the element carries an ``href`` attribute, the category document
    it points to is fetched and parsed, and its categories are reported
    instead. The reference is resolved against ``base_uri`` so that a
    relative href can be requested.
    """
    self.log(" |-- Categories")
    if c.get_attribute_value('href', None):
        # Out-of-line categories: resolve the link like every other href
        # in this crawler before requesting it.
        r, c2 = self.request(self.build_url(str(c.href), base_uri))
        c = self.parse_response(c2)

    for category in c.get_children('category', ATOM10_NS):
        self.handle_category(category)
319
def handle_category(self, c):
    """Log the term, label and scheme attributes of category element ``c``."""
    self.log(" |-- Category")
    # Each attribute is read and logged in turn, in this order.
    for template, attribute in ((" Term: %s", 'term'),
                                (" Label: %s", 'label'),
                                (" Scheme: %s", 'scheme')):
        self.log(template % c.get_attribute_value(attribute))
325
326 - def perform_tests(self, collection, accepts, base_uri):
327 url = self.build_url(str(collection.href), base_uri) 328 self.log(" Performing tests against %s" % url) 329 types = [a.xml_text for a in accepts] 330 passed_tests = 0 331 332 for media_type in types: 333 if media_type in self.options.test_mapping: 334 path = self.options.test_mapping[media_type]['post'] 335 if path: 336 passed_tests += 1 337 338 self.log(" POST: %s" % media_type) 339 body = file(path, 'r').read() 340 slug = self.get_random_slug() 341 self.log(" Slug submitted: %s" % slug) 342 r, c = self.request(url, method='POST', body=body, 343 headers={'content-type': media_type, 344 'slug': slug}) 345 self.log(" Status: %s %s" % (r['status'], self.check(r['status'], '201'))) 346 self.log(" Type: %s %s" % (r['content-type'], self.check(r['content-type'], 347 ['application/atom+xml', 348 'application/atom+xml;type=entry']))) 349 350 e = self.parse_response(c) 351 if e: 352 links = e.filtrate(lookup_links, rel='edit') 353 if not links: 354 self.log(' Missing <atom:link rel="edit" /> element') 355 else: 356 link = links[0] 357 self.log(' rel="edit" href: %s' % link.get_attribute_value('href', '')) 358 self.log(' rel="edit" type: %s' % link.get_attribute_value('type', '')) 359 360 if 'etag' in r: 361 self.log(" Etag: %s" % r['etag']) 362 363 if 'location' not in r: 364 self.log(" Missing required Location header. Will not test further.") 365 else: 366 url = self.build_url(r['location'], base_uri) 367 self.