1. Benoit Boissinot
  2. scrapers

Source

scrapers / fr / senat.fr / scrutin / scrutin.py

# encoding: utf8
from lxml.html import parse
import json
import urllib2
import os.path

# URL template for a vote ("scrutin") page on www.senat.fr.  It is filled in
# with a mapping providing an integer ``year`` (used twice in the path) and
# an integer ``id``.
BASE_URL = 'http://www.senat.fr/scrupub/%(year)d/scr%(year)d-%(id)d.html'

# Maps the French heading text that scrutin() looks for on a page to the key
# under which the senators listed below that heading are stored in the
# result: "voted for", "voted against", "abstentions" and "did not take part
# in the vote" respectively.  Headings not listed here are ignored.
votechoices = {
    u'Ont voté pour':               'for',
    u'Ont voté contre':             'against',
    u'Abstentions':                 'abstention',
    u"N'ont pas pris part au vote": 'novote',
}

def scrutin(ref):
    """Download one vote page and return the senators listed per choice.

    ``ref`` is the URL of the page.  The page is fetched with
    ``urllib2.urlopen`` and parsed as HTML.  Every ``<b>`` element found
    under ``div/p`` whose text is a key of ``votechoices`` is treated as a
    heading; the senators are read from the ``tr/td/a`` links of the
    element that follows the heading's parent ``<p>``.

    Returns a dict containing ``'ref'`` (the URL that was passed in) and,
    for each recognised heading, the matching ``votechoices`` value mapped
    to a list of senator identifiers.  An identifier is the link target
    with the ``http://www.senat.fr/senfic/`` prefix and the ``.html``
    extension removed.  Headings that do not appear on the page have no
    entry in the result.

    Raises ``ValueError`` when a recognised heading is not followed by any
    element, or when a senator link has no ``href`` or an ``href`` of an
    unexpected form.  Errors from ``urllib2.urlopen`` and from the HTML
    parser propagate unchanged.
    """
    senator_base = 'http://www.senat.fr/senfic/'

    vote = {'ref': ref}

    response = urllib2.urlopen(ref)
    try:
        page = parse(response)
    finally:
        # The tree is built at this point; release the connection explicitly
        # instead of relying on garbage collection.
        response.close()

    for heading in page.findall("//div/p/b"):
        try:
            votechoice = votechoices[heading.text]
        except KeyError:
            # Not one of the headings we care about.
            continue

        listing = heading.getparent().getnext()
        if listing is None:
            raise ValueError('no senator list after heading %r'
                             % (heading.text,))

        names = []
        for link in listing.findall('tr/td/a'):
            href = link.get('href')
            # Validate explicitly: the data comes from a scraped page, and
            # ``assert`` statements are removed when running with -O.
            if href is None or not href.startswith(senator_base):
                raise ValueError('unexpected senator link: %r' % (href,))
            senator_id, ext = os.path.splitext(href[len(senator_base):])
            if ext != '.html':
                raise ValueError('unexpected senator link: %r' % (href,))
            names.append(senator_id)
        vote[votechoice] = names
    return vote

# Fetch every vote page for the years 2006 through 2014 and store each result
# as JSON in a file named '<year>-<id>' (relative path, so it lands in the
# current working directory).  Files that already exist are not fetched
# again, which lets an interrupted run be resumed.
for year in xrange(2006, 2015):
    i = 0
    while True:
        # Vote ids are tried in sequence, starting from 1, within each year.
        i += 1
        url = BASE_URL % {'year': year, 'id': i}
        try:
            fname = '%d-%d' % (year, i)
            if os.path.exists(fname):
                print('skipping' + fname)
                continue
            print('fetching' + fname)
            v = scrutin(url)
            # Close the file deterministically so the JSON is fully flushed
            # to disk before moving on to the next page.
            with open(fname, 'w') as out:
                json.dump(v, out)
        except Exception as e:
            # Any failure ends the current year.  NOTE(review): presumably
            # the usual trigger is the HTTP error for the first id that does
            # not exist, but parse or validation errors also stop the year
            # here -- confirm that this is intended.
            print(e)
            break