Source

scatter-share / balisage09 / schedule.py

Full commit
#25 lines to scrape Balisage schedule and generate tweets (including a URL shortener)
#Note: run time is dominated by round trips to the URL shortener service (one request per abstract link)
#Usage:
#curl http://www.balisage.net/2009/At-A-Glance.html | python schedule.py 
import sys, urllib, urllib2
from amara.bindery import html
from amara.lib import iri

#Base IRI passed to iri.absolutize() when resolving each abstract link's href
BASE = 'http://www.balisage.net/2009/At-A-Glance.html'
#Text placed at the end of every generated message
SUFFIX = '#balisage2009'
#Endpoint that shorten_url() sends the long URL to
URL_SHORTENER = 'http://ur1.ca/'

#The HTML to scrape is read from standard input (see the usage line above)
doc = html.parse(sys.stdin)
#Every <a class="abstract"> element in the document; the loop below emits one message per element
abstracts = doc.xml_select(u'//a[@class="abstract"]')

def shorten_url(url):
    """Return the shortened form of *url* as reported by URL_SHORTENER.

    The URL is sent as the ``longurl`` form field; the HTML reply is parsed
    and the string value of the link inside ``<p class="success">`` is
    returned.  Per XPath ``string()`` semantics an empty string is expected
    when the reply contains no such element (e.g. the service reported an
    error) -- callers should be prepared for that.

    Errors raised by ``urllib2.urlopen`` are not caught here.
    """
    data = urllib.urlencode({'longurl': url})
    #Passing a data argument makes urllib2 send a POST instead of a GET
    request = urllib2.Request(URL_SHORTENER, data)
    response = urllib2.urlopen(request)
    try:
        body = response.read()
    finally:
        #Always release the connection, even if the read fails part-way
        response.close()
    resultdoc = html.parse(body)
    return resultdoc.xml_select(u'string(//p[@class="success"]/a)')

for ab in abstracts:
    chunks = []
    #Many other ways to do this but wanted to demo XPath fullnes :)
    chunks.append('"'+unicode(ab)+'"')
    chunks.append(ab.xml_select(u'string(following-sibling::*//a[@class="biolink"])'))
    chunks.append(shorten_url(iri.absolutize(ab.href, BASE)))
    chunks.append(SUFFIX)
    msg = ' '.join(chunks)
    print msg
    print 'Length:', len(msg)