Source

amdt hadopi / amdt.py

Full commit
# coding:utf8

# uses svn version from http://opendocumentfellowship.com/projects/odfpy
import sys
from urllib import urlopen
from re import compile
import re
from warnings import warn
from BeautifulSoup import BeautifulSoup
from odf.opendocument import OpenDocumentText
from odf.text import P, Span
from odf.style import Style, TextProperties


BASE_URL = 'http://www.assemblee-nationale.fr/13/amendements/1240/124%06d.asp'
SOURCES = 'http://www.laquadrature.net/amendements.txt'

def clean_html(s):
    """Turn an HTML fragment into unicode text with real line breaks.

    The fragment is passed through BeautifulSoup (called with
    ``convertEntities='html'``), CRLF pairs are removed, every ``<br>`` tag
    is replaced by a newline and HTML comments are stripped.

    :param s: HTML fragment to clean.
    :returns: the cleaned fragment as a ``unicode`` string.
    """
    s = unicode(BeautifulSoup(s, convertEntities='html'))
    s = s.replace('\r\n', '')
    # Match exactly one <br> tag.  A greedy '<br.*/?>' would run from the
    # first '<br' to the last '>' on the line and delete everything between.
    s = re.sub(r'<br\b[^>]*>', '\n', s)
    # Non-greedy so that text between two separate comments survives; the
    # inline (?s) flag lets a single comment span several lines.
    s = re.sub(r'(?s)<!--.*?-->', '', s)
    return s

def single_finder(regex, flags=0):
    """Build a finder that extracts the first capture group of a pattern.

    :param regex: pattern source; its first group is the value of interest.
    :param flags: optional ``re`` flags used when compiling the pattern.
    :returns: a callable taking the text to search.  It returns the first
        group of the first match, passed through ``clean_html``, or ``None``
        when the pattern does not match.
    """
    pattern = compile(regex, flags)

    def finder(data):
        found = pattern.search(data)
        if found is None:
            return None
        return clean_html(found.group(1))

    return finder

def many_finder(regex, flags=0):
    """Build a finder that collects every occurrence of a pattern.

    :param regex: pattern source, handed to ``findall``.
    :param flags: optional ``re`` flags used when compiling the pattern.
    :returns: a callable taking the text to search.  It returns the list of
        all ``findall`` results, each passed through ``clean_html``, or
        ``None`` when nothing matches.
    """
    pattern = compile(regex, flags)

    def finder(data):
        found = pattern.findall(data)
        if not found:
            return None
        cleaned = []
        for item in found:
            cleaned.append(clean_html(item))
        return cleaned

    return finder

def _meta_finder(name):
    """Finder for the ``content`` attribute of the ``<meta>`` tag *name*."""
    return single_finder('<meta name="%s" content="([^"]+)">' % name)


def _paragraphs_finder(tag):
    """Finder for every ``<p ...>`` body wrapped directly in ``<tag>``."""
    return many_finder('<%s><p [^>]*>(.*)</p></%s>' % (tag, tag))


def _row_finder(label):
    """Finder for the cell that follows the table cell labelled *label*."""
    return single_finder(
        r'<tr>\s*<td[^>]*><NOEXTRACT><div>%s</div></NOEXTRACT>\s*</td>'
        r'\s*<td[^>]*>([^<]*)\s*</td></tr>' % label,
        re.M | re.S)


# Field name -> finder applied to the raw page content by Amdt.fill().
RE = {
    "number": _meta_finder('NUM_AMENDG'),
    "alinea": _meta_finder('DESIGNATION_ALINEA'),
    "article": _meta_finder('DESIGNATION_ARTICLE'),
    "fate": _meta_finder('SORT_EN_SEANCE'),
    "order": _meta_finder('ORDRE_TEXTE'),
    "date": single_finder('<DATE_AMEND>([^>]*)</DATE_AMEND>'),
    "author": single_finder('<AUTEURS><p[^>]*>(.*)</p></AUTEURS>',
                            re.M | re.S),
    "detail": _paragraphs_finder('DISPOSITIF'),
    "expose": _paragraphs_finder('EXPOSE'),
    "gov": _row_finder('Gouvernement'),
    "com": _row_finder('Commission'),
    # Byte escapes are kept as-is: the pattern is matched against the raw
    # page content, before any decoding.
    "sous_amdt": single_finder("\xe0 l\'amendement n\xb0 ([0-9]+)"),
}

class Amdt(object):
    """One amendment page, scraped into a dictionary of fields."""

    def __init__(self, url, fill=True):
        """Remember the page URL and optionally fetch it straight away.

        :param url: address of the amendment page.
        :param fill: when true (the default), call :meth:`fill` immediately.
        """
        # Address of the page this amendment is read from.
        self.url = url
        # Field name -> extracted value; only fields that were found.
        self.data = {}
        if fill:
            self.fill()

    def fill(self):
        """Download the page and run every finder of ``RE`` over it.

        Each value found is stored in ``self.data`` under the finder's key.
        A finder returning ``None`` only triggers a warning, so ``self.data``
        may lack some keys afterwards.
        """
        response = urlopen(self.url)
        try:
            content = response.read()
        finally:
            # Release the connection even if reading fails.
            response.close()

        for key, finder in RE.items():
            value = finder(content)
            if value is not None:
                self.data[key] = value
            else:
                warn("Unable to find amendement's %s for %s" % (key, self.url))



if __name__ == '__main__':
    # Script entry point: fetch the requested amendments, dump each one to
    # stdout and write them all to an OpenDocument text file.
    from pprint import pprint

    # Amendment numbers given on the command line take precedence;
    # otherwise the list of URLs is downloaded from SOURCES, one per line.
    if sys.argv[1:]:
        urls = [BASE_URL % int(n) for n in sys.argv[1:]]
    else:
        listing = urlopen(SOURCES)
        try:
            # Skip blank lines: an empty URL cannot be fetched.
            urls = [line.strip() for line in listing if line.strip()]
        finally:
            listing.close()

    # Document with a single bold text style used for the headings.
    textdoc = OpenDocumentText()
    gras = Style(name="Gras", family="text")
    gras.addElement(TextProperties(fontweight="bold"))
    textdoc.styles.addElement(gras)

    amdts = [Amdt(url) for url in urls]

    for amdt in amdts:
        # Amdt.fill() only warns about fields it could not find, so every
        # lookup needs a fallback instead of raising KeyError.
        number = amdt.data.get("number", u"?")
        author = amdt.data.get("author", u"?")
        details = amdt.data.get("detail") or []
        exposes = amdt.data.get("expose") or []

        # Title line.
        p = P(text=(u"Amendement "))
        p.addElement(Span(stylename=gras,
                          text=(u"%s, de %s" % (number, author))))
        textdoc.text.addElement(p)

        # "Détail" heading, then one paragraph per item followed by a blank.
        p = P()
        p.addElement(Span(text=(u"Détail : "), stylename=gras))
        textdoc.text.addElement(p)
        for detail in details:
            p = P()
            p.addElement(Span(text=(u"%s\r\n" % detail)))
            textdoc.text.addElement(p)
            textdoc.text.addElement(P())

        # "Exposé" heading in its own paragraph, rather than appended to
        # whichever paragraph the previous section happened to leave behind.
        p = P()
        p.addElement(Span(stylename=gras, text=(u"Exposé : ")))
        textdoc.text.addElement(p)
        for expose in exposes:
            p = P()
            p.addElement(Span(text=(u"%s\n" % expose)))
            textdoc.text.addElement(p)
            textdoc.text.addElement(P())

        # Blank paragraph separating this amendment from the next one.
        textdoc.text.addElement(P())
        pprint(amdt.data)

    # NOTE(review): the second argument presumably asks odfpy to append the
    # ".odt" suffix to the file name -- confirm against the odfpy version used.
    textdoc.save("amendements", True)