# Benoit Boissinot / amdt hadopi / amdt.py
# coding:utf8

import sys
from urllib import urlopen
from re import compile
import re
from optparse import OptionParser
from pprint import pprint
from warnings import warn
from BeautifulSoup import BeautifulSoup

# URL template of a single amendment page; it is %-formatted with the integer
# amendment number, which %06d zero-pads to six digits.
BASE_URL = 'http://www.assemblee-nationale.fr/13/amendements/1240/124%06d.asp'
# Text resource read line by line with urlopen; each stripped line is used as
# the URL of one amendment page.
SOURCES = 'http://www.laquadrature.net/amendements.txt'

def clean_html(s):
    """Normalise an HTML fragment extracted from an amendment page.

    The fragment is re-serialised through BeautifulSoup (asked to convert HTML
    entities), CR/LF pairs are dropped, every ``<br>`` tag is turned into a
    newline and HTML comments are removed.

    :param s: HTML fragment as a string.
    :returns: the cleaned fragment as a ``unicode`` string.
    """
    s = unicode(BeautifulSoup(s, convertEntities='html'))
    s = s.replace('\r\n', '')
    # Match exactly one tag at a time.  A greedy ``.*`` would run up to the
    # last '>' of the line and delete the text between two <br /> tags
    # (e.g. the second of three co-authors).
    s = re.sub(r'<br\b[^>]*>', '\n', s)
    # Non-greedy so that text between two comments survives; (?s) lets a
    # comment span several lines.
    s = re.sub(r'(?s)<!--.*?-->', '', s)
    return s

def single_finder(regex, flags=0):
    """Build a finder that extracts the first capture group of *regex*.

    :param regex: pattern source; its first group is the value of interest.
    :param flags: optional ``re`` flags used when compiling the pattern.
    :returns: a callable taking the text to search.  It returns capture
        group 1 passed through ``clean_html`` on a hit, or None when the
        pattern is not found.
    """
    pattern = re.compile(regex, flags)

    def finder(data):
        found = pattern.search(data)
        if found is None:
            return None
        return clean_html(found.group(1))

    return finder

def many_finder(regex, flags=0):
    """Build a finder that extracts every occurrence of *regex*.

    :param regex: pattern source handed to ``findall``.
    :param flags: optional ``re`` flags used when compiling the pattern.
    :returns: a callable taking the text to search.  It returns the list of
        ``findall`` results, each passed through ``clean_html``, or None
        when nothing matches.
    """
    pattern = re.compile(regex, flags)

    def finder(data):
        hits = pattern.findall(data)
        if not hits:
            return None
        return [clean_html(hit) for hit in hits]

    return finder

# Field name -> finder callable.  Each finder is applied to the raw content of
# an amendment page and returns the extracted value(s), or None when the field
# is absent.  Patterns containing regex escapes are raw strings so that the
# backslashes reach the regex engine untouched.
RE = {
        "number"    : single_finder('<meta name="NUM_AMENDG" content="([^"]+)">'),
        "alinea"    : single_finder('<meta name="DESIGNATION_ALINEA" content="([^"]+)">'),
        "article"   : single_finder('<meta name="DESIGNATION_ARTICLE" content="([^"]+)">'),
        "fate"      : single_finder('<meta name="SORT_EN_SEANCE" content="([^"]+)">'),
        "order"     : single_finder('<meta name="ORDRE_TEXTE" content="([^"]+)">'),
        "date"      : single_finder('<DATE_AMEND>([^>]*)</DATE_AMEND>'),
        "author"    : single_finder('<AUTEURS><p[^>]*>(.*)</p></AUTEURS>', re.M|re.S),
        "detail"    : many_finder('<DISPOSITIF><p [^>]*>(.*)</p></DISPOSITIF>'),
        "expose"    : many_finder('<EXPOSE><p [^>]*>(.*)</p></EXPOSE>'),
        "gov"       : single_finder(r'<tr>\s*<td[^>]*><NOEXTRACT><div>Gouvernement</div></NOEXTRACT>\s*</td>\s*<td[^>]*>([^<]*)\s*</td></tr>', re.M|re.S),
        "com"       : single_finder(r'<tr>\s*<td[^>]*><NOEXTRACT><div>Commission</div></NOEXTRACT>\s*</td>\s*<td[^>]*>([^<]*)\s*</td></tr>', re.M|re.S),
        # \xe0 and \xb0 are 'a grave' and the degree sign in Latin-1 --
        # presumably the encoding of the fetched pages; confirm.
        "sous_amdt" : single_finder("\xe0 l\'amendement n\xb0 ([0-9]+)"),
}


class Amdt(object):
    """One amendment, identified by the URL of its page.

    Attributes:
        url: address of the amendment page.
        data: dict filled by ``fill``; it maps the keys of ``RE`` to the
            values their finders extracted.  Fields whose finder returned
            None are left out.
    """

    def __init__(self, url, fill=True):
        """Store *url* and, unless *fill* is false, download and parse it.

        :param url: address of the amendment page.
        :param fill: when true (the default) ``fill()`` is called
            immediately, which performs a network request.
        """
        self.url = url
        self.data = {}
        if fill:
            self.fill()

    def fill(self):
        """Download the page at ``self.url`` and populate ``self.data``.

        Every finder registered in ``RE`` is run against the raw page
        content; only the fields that were found are stored.
        """
        page = urlopen(self.url)
        try:
            content = page.read()
        finally:
            # Release the connection even when reading fails.
            page.close()

        for key, finder in RE.items():
            value = finder(content)
            if value is not None:
                self.data[key] = value

def print_odt(amdts):
    """Write the amendments to an OpenDocument text file called "amendements".

    For every amendment the document receives a heading paragraph (number and
    author in bold), a bold "Détail" label followed by one paragraph per
    detail entry, a bold "Exposé" label followed by one paragraph per expose
    entry, and an empty paragraph as separator.

    :param amdts: iterable of objects exposing a ``data`` dict with the
        optional keys "number", "author", "detail" and "expose".
    """
    # uses svn version from http://opendocumentfellowship.com/projects/odfpy
    from odf.opendocument import OpenDocumentText
    from odf.text import P, Span
    from odf.style import Style, TextProperties

    textdoc = OpenDocumentText()

    # Bold character style.  It has to own its TextProperties and be
    # registered with the document, otherwise spans that reference it are
    # rendered without any formatting.
    gras = Style(name="Gras", family="text")
    gras.addElement(TextProperties(fontweight="bold"))
    textdoc.styles.addElement(gras)

    # Paragraphs only show up in the saved file once attached to the body.
    body = textdoc.text

    for amdt in amdts:
        p = P(text=(u"Amendement "), )
        p.addElement(Span(stylename=gras, text=(u"%s, de %s" % (amdt.data.get("number",""), amdt.data.get("author","")))))
        body.addElement(p)

        p = P()
        p.addElement(Span(text=(u"Détail : "),stylename=gras))
        body.addElement(p)
        for detail in amdt.data.get("detail", ()):
            p = P()
            p.addElement(Span(text=(u"%s\r\n" % detail)))
            body.addElement(p)

        p = P()
        p.addElement(Span(stylename=gras, text=(u"Exposé : ")))
        body.addElement(p)
        for expose in amdt.data.get("expose", ()):
            p = P()
            p.addElement(Span(text=(u"%s\n" % expose)))
            body.addElement(p)

        # Empty paragraph separating two amendments.
        body.addElement(P())

    # The second argument asks odfpy to append the ".odt" suffix.
    textdoc.save("amendements", True)

def print_wiki(amdts):
    """Print a wiki-markup stub for every amendment in *amdts*.

    Each stub consists of a level-3 heading, the author line, a link to the
    amendment page built from ``BASE_URL`` and a placeholder opinion.

    :param amdts: iterable of objects exposing a ``data`` dict.
    :raises KeyError: if an amendment lacks "number", "article" or "author".
    """
    for amdt in amdts:
        data = amdt.data
        if 'sous_amdt' in data:
            header = u"Sous-amendement n° %s à l'am. %s - %s" % (data['number'], data['sous_amdt'], data['article'])
        else:
            # Without this branch the sub-amendment header was overwritten
            # and plain amendments had no header at all.
            header = u'Amendement n° %s - %s' % (data['number'], data['article'])
            if 'alinea' in data:
                header += u', %s' % data['alinea']
        # Single-argument parenthesised print: identical output under the
        # Python 2 print statement.
        print(u"=== %s ===" % header)
        print(u"(%s) <br />" % data['author'].replace("\n", " "))
        print(u"[%s Amendement non encore analysé.]<br />" % (BASE_URL % int(data['number'].split()[0])))
        print(u"''Sans opinion.''")
        print('')

def print_pprint(amdt):
    """Pretty-print the ``data`` dict of every amendment.

    :param amdt: iterable of objects exposing a ``data`` attribute.  The
        singular name is kept so existing callers keep working; the function
        iterates over this argument rather than over a global list.
    """
    for entry in amdt:
        pprint(entry.data)

# Renderer lookup table: output format name -> function taking the list of
# amendments.  The keys are the names listed in the --output option's help
# text; presumably the table is indexed with that option's value -- confirm.
outputs = {'wiki': print_wiki, 'odt': print_odt, 'pprint': print_pprint}

if __name__ == '__main__':
    # Command-line entry point.  The amendment URLs come from one of three
    # sources: a listing page (--url-list), amendment numbers given as
    # positional arguments, or the default SOURCES list.
    usage = "usage: %prog [options] arg"
    parser = OptionParser(usage)
    parser.add_option("-l", "--url-list", dest="url_list",
                      help="get amendements list from URL")
    parser.add_option("--stop-number", dest="stop",
                      help="stop reading url list at NUMBER")
    parser.add_option("--output", dest="output", default='pprint',
                      help="output (wiki, odt, pprint), default pprint")
    (options, args) = parser.parse_args()
    if args and options.url_list:
        parser.error("--url-list not compatible with amdt number as args")

    if options.url_list:
        stop = options.stop or None
        urls = []
        # [^\"]+ keeps the URL inside its own HREF attribute even when a line
        # of the listing holds several links.
        regexp = (r"<td class='TexteColonnePlace'><A HREF=\"(?P<url>http[^\"]+)\">"
                  r"(?P<num>\d+)")
        listing = urlopen(options.url_list)
        try:
            data = listing.read()
        finally:
            listing.close()
        for m in re.finditer(regexp, data):
            # NOTE(review): the amendment whose number equals --stop-number
            # is itself excluded; confirm this is the intended boundary.
            if stop is not None and m.group('num') == stop:
                break
            urls.append(m.group('url'))
    elif args:
        urls = [BASE_URL % int(n) for n in args]
    else:
        # Default: one URL per non-empty line of the SOURCES list.
        urls = [line.strip() for line in urlopen(SOURCES) if line.strip()]

    # init amdts
    # NOTE(review): nothing visible here hands amdts to
    # outputs[options.output]; confirm the dispatch follows this point.
    amdts = [Amdt(url) for url in urls]