
Atom, related and Wikipedia

I recently needed to grab an Atom feed of entries containing link elements with a rel attribute set to “related”. For some reason such feeds are not that common, so I decided to make my own by scraping a few pages from Wikipedia and generating said link elements from the “See also” section of each article.
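For the record, each generated entry carries one atom:link per related article, along these lines (an illustrative fragment; the href values are just examples of what the crawl produces):

<atom:entry>
  ...
  <atom:link rel="self" type="text/html" href="http://en.wikipedia.org/wiki/Castle"/>
  <atom:link rel="related" type="text/html" href="http://en.wikipedia.org/wiki/Fortification"/>
  <atom:link rel="related" type="text/html" href="http://en.wikipedia.org/wiki/Moat"/>
</atom:entry>

Here is the quick and dirty script that performs this task.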

# -*- coding: utf-8 -*-
from datetime import datetime
from urlparse import urljoin, urlparse
import uuid
from xml.sax.saxutils import escape

import amara
from BeautifulSoup import BeautifulSoup
import httplib2

# URLs already turned into entries, so we never visit a page twice
visited_links = []

ATOM10_NS = u'http://www.w3.org/2005/Atom'
ATOM10_PREFIX = u'atom'

def qname(local_name, prefix=None):
    if not prefix:
        return local_name
    return u"%s:%s" % (prefix, local_name)

def init_feed():
    # Skeleton document: an atom:feed carrying the mandatory
    # atom:id and atom:updated elements
    d = amara.create_document(prefixes={ATOM10_PREFIX: ATOM10_NS})
    feed = d.xml_create_element(qname(u"feed", ATOM10_PREFIX), ns=ATOM10_NS)
    d.xml_append(feed)
    feed.xml_append(d.xml_create_element(qname(u"id", ATOM10_PREFIX), ns=ATOM10_NS,
                                         content=u'urn:uuid:' + unicode(uuid.uuid4())))
    feed.xml_append(d.xml_create_element(qname(u"updated", ATOM10_PREFIX), ns=ATOM10_NS,
                                         content=unicode(datetime.utcnow().isoformat())))
    return d, feed

d, feed = init_feed()

def run(url):
    print "Visiting: %s" % url
    entry = init_entry(url)

    h = httplib2.Http('.cache')
    r, c = h.request(url)
    soup = BeautifulSoup(c)

    # The entry content is the Wikipedia page itself, by reference
    entry.xml_append(d.xml_create_element(qname(u"content", ATOM10_PREFIX), ns=ATOM10_NS,
                                          attributes={u'type': u'text/html',
                                                      u'src': unicode(url)}))

    # The "See also" heading carries an anchor with id="See_also";
    # the related links live in the first <ul> that follows it
    see_also = soup.find(name='a', attrs={'id': 'See_also'})
    if see_also:
        see_also_links = see_also.parent.findNextSibling(name='ul')
        if see_also_links:  # some pages have the section empty :-(
            see_also_links = see_also_links.findAll('li')
            if see_also_links:
                next_to_visits = []
                for link in see_also_links:
                    link = str(link.a['href'])
                    # relative link, make it absolute
                    if urlparse(link)[1] == '':
                        link = urljoin('http://en.wikipedia.org', link)
                    entry.xml_append(d.xml_create_element(qname(u"link", ATOM10_PREFIX), ns=ATOM10_NS,
                                                          attributes={u'rel': u'related', u'type': u'text/html',
                                                                      u'href': unicode(link)}))
                    if link not in visited_links:
                        visited_links.append(link)
                        next_to_visits.append(link)

                # recurse into the related pages we haven't seen yet
                for link in next_to_visits:
                    run(link)

def init_entry(url):
    entry = d.xml_create_element(qname(u"entry", ATOM10_PREFIX), ns=ATOM10_NS)
    feed.xml_append(entry)

    entry.xml_append(d.xml_create_element(qname(u"id", ATOM10_PREFIX), ns=ATOM10_NS,
                                          content=u'urn:uuid:' + unicode(uuid.uuid5(uuid.NAMESPACE_URL, url))))

    entry.xml_append(d.xml_create_element(qname(u"title", ATOM10_PREFIX), ns=ATOM10_NS,
                                          content=unicode(url), attributes={u'type': u'text'}))

    entry.xml_append(d.xml_create_element(qname(u"updated", ATOM10_PREFIX), ns=ATOM10_NS,
                                          content=unicode(datetime.utcnow().isoformat())))

    entry.xml_append(d.xml_create_element(qname(u"link", ATOM10_PREFIX), ns=ATOM10_NS,
                                          attributes={u'rel': u'self', u'type': u'text/html',
                                                      u'href': unicode(url)}))

    author = d.xml_create_element(qname(u"author", ATOM10_PREFIX), ns=ATOM10_NS)
    author.xml_append(d.xml_create_element(qname(u"uri", ATOM10_PREFIX), ns=ATOM10_NS,
                                           content=u'http://en.wikipedia.org'))
    entry.xml_append(author)

    return entry

if __name__ == '__main__':
    try:
        print "Ctrl-C to stop the scraping"
        run('http://en.wikipedia.org/wiki/Castle')
    except KeyboardInterrupt:
        # on Ctrl-C, serialize whatever has been gathered so far
        file('wikipedia_related_feed.atom', 'wb').write(feed.xml(indent=True))
        print "Saved as wikipedia_related_feed.atom"
    except Exception, ex:
        file('wikipedia_related_feed.atom', 'wb').write(feed.xml(indent=True))
        print "Saved as wikipedia_related_feed.atom"
        raise
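If you then want to consume the saved feed, here is a minimal sketch using nothing but the standard library's ElementTree (the file name matches what the script above writes; everything else is illustrative):

from xml.etree import ElementTree

ATOM10_NS = 'http://www.w3.org/2005/Atom'

# print the href of every rel="related" link of each entry
tree = ElementTree.parse('wikipedia_related_feed.atom')
for entry in tree.findall('{%s}entry' % ATOM10_NS):
    for link in entry.findall('{%s}link' % ATOM10_NS):
        if link.get('rel') == 'related':
            print link.get('href')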