Source

kalapea / riigikogu.py

Full commit
"""
riigikogu.py contains handlers, parsers and crawlers for managing Riigikogu data.
"""
from pyparsing import *
import time
import rdflib

# Basic grammar building blocks shared by the parsers in this module.
comma = Literal(",")
endpunc = oneOf(".!?")
num = Word(nums)
# Three dot-separated digit groups glued into a single token, e.g. "24.05.1954".
date = Combine(num + "." + num + "." + num)
# pyparsing's ``alphas`` holds ASCII letters only, so the Estonian letters are
# added explicitly.  They are spelled as escapes so the module does not depend
# on the encoding of the source file, and they are accepted as the FIRST
# character too -- otherwise a word beginning with o-tilde or u-umlaut could
# never match.
_estonian_letters = (u"\u00f5\u00fc\u00e4\u00f6\u0161\u017e"
                     u"\u00d5\u00dc\u00c4\u00d6\u0160\u017d")
uniword = Word(alphas + _estonian_letters + u"-")
# Namespace under which every RDF term produced by this module is created.
POLNS = rdflib.Namespace("http://example.com/politician/")


def concanate(tokens):
    """Parse action: merge the matched tokens into one space-separated string."""
    return u" ".join(tokens)

def validate_date(tokens, template = "%d.%m.%Y"):
    """Check that the first parsed token is a valid date string.

    tokens   -- sequence whose first item is the date string to check
    template -- ``time.strptime`` format the string has to follow

    Returns None when the string is valid.
    Raises ParseException when ``time.strptime`` rejects the string.
    """
    try:
        time.strptime(tokens[0], template)
    # ``except ... as`` is accepted by Python 2.6+ and Python 3; the older
    # comma form is a syntax error on Python 3.
    except ValueError as msg:
        raise ParseException("Invalid date string (%s) \n %s" % (tokens[0], msg))

# Register through a one-argument lambda: pyparsing looks at how many
# positional arguments a parse action accepts to choose between the
# (s, loc, toks), (loc, toks) and (toks) calling conventions.  validate_date's
# optional ``template`` parameter would make it look like a (loc, toks)
# callback, so it would receive the integer location as ``tokens``.
date.setParseAction(lambda tokens: validate_date(tokens))


def parse_data(member, member_graph):
    """Print the RDF triples derived from one member record.

    member       -- dict describing one member; it is handed to parse_name
                    and parse_birthplace, so it needs the keys those read
    member_graph -- accepted but not used here; nothing is added to it

    A new blank node is created and used as the subject of every triple.
    Returns None.
    """
    subject = rdflib.BNode()
    # Single-argument print(...) behaves the same as a Python 2 statement and
    # as a Python 3 function call.
    print(parse_name(member, subject))
    print(parse_birthplace(member, subject))

def parse_name(member, subject):
    """Build the name triple for a member.

    Returns a ``(subject, POLNS["name"], literal)`` tuple in which the literal
    wraps the value stored under the ``u"name"`` key of ``member``.
    """
    name_literal = rdflib.Literal(member[u"name"])
    return (subject, POLNS["name"], name_literal)

#-- Parsers -------------------------------------------------------------------------
def parse_birthplace(member, subject):
    """Build the birth date and birth place triples for a member.

    member  -- dict; the first item of the list stored under ``field_key``
               is parsed as ``<d.m.y date> <place>``
    subject -- node used as the subject of both triples

    Returns a tuple of two triples, ``(subject, POLNS["birthdate"], literal)``
    and ``(subject, POLNS["birthplace"], literal)``.  The parsed tokens are
    also printed.  Raises pyparsing's ParseException when the text does not
    match, and KeyError/IndexError when the field is missing or empty.
    """
    field_key = u"S\u00fcnniaeg ja -koht"
    # ``alphas`` is ASCII-only; Estonian letters are added (as escapes, so the
    # source encoding does not matter) to keep names with diacritics from
    # being cut off at the first non-ASCII character.
    place_letters = (alphas +
                     u"\u00f5\u00fc\u00e4\u00f6\u0161\u017e"
                     u"\u00d5\u00dc\u00c4\u00d6\u0160\u017d")
    birthtime = Combine(num + "." + num + "." + num)
    # A hyphen may appear inside a place name but not at its start.
    birthplace = Word(place_letters, place_letters + u"-")
    birthtimeplace = Group(birthtime.setResultsName("date") +
                           birthplace.setResultsName("place"))
    tokens = birthtimeplace.parseString(member[field_key][0])
    print(tokens)
    # The result names live inside the Group, so they have to be read from the
    # group's own results (tokens[0]); looked up on the outer ``tokens`` they
    # come back as empty strings.
    record = tokens[0]
    # Wrapped in rdflib.Literal, as parse_name does for its object.
    result = ((subject, POLNS["birthdate"], rdflib.Literal(record.date)),
              (subject, POLNS["birthplace"], rdflib.Literal(record.place)))
    return result

def parse_workplace(string, member_node):
    """Parse one work-history entry and return its RDF triples.

    The text is expected to look like
    ``<workplace words>[, <profession words>]... <since><dash>[<until>]``
    where the years have at most four digits and the dash is either "-" or
    an en dash.

    string      -- the entry text to parse
    member_node -- node of the member the entry belongs to

    Returns a tuple of triples describing a new Workplace blank node (type,
    name, since, until) linked from ``member_node`` via ``hasWorked``.  When
    a profession was parsed, a Profession blank node (type, name) linked via
    ``hasProfession`` is appended.  A missing end year is recorded as "1900".
    Raises pyparsing's ParseException when the text does not match.
    """
    # Parse actions are attached BEFORE the expressions are given result
    # names: naming an expression returns a copy, and a copy does not pick up
    # actions set on the original afterwards.
    workplace = OneOrMore(uniword).setParseAction(concanate)
    profession = OneOrMore(uniword).setParseAction(concanate)
    year = Word(nums, max = 4)
    separator = oneOf(u"\u2013 -")
    period = Combine(year("since") + separator + Optional(year("until")))
    # ``Literal`` here is pyparsing's (from the star import); the RDF literals
    # below are spelled rdflib.Literal to keep the two apart.
    experience = (workplace("workplace") +
                  ZeroOrMore(Literal(",") + profession("profession")) +
                  period)
    tokens = experience.parseString(string)

    # Built locally rather than read from the private ``rdflib._XSD_NS``.
    xsd_date = rdflib.Namespace("http://www.w3.org/2001/XMLSchema#")["date"]
    workplace_node = rdflib.BNode()
    triples = [
        (workplace_node, rdflib.RDF.type, POLNS["Workplace"]),
        (workplace_node, POLNS["name"],
            rdflib.Literal(tokens.workplace, lang = "et")),
        (member_node, POLNS["hasWorked"], workplace_node),
        (workplace_node, POLNS["since"],
            rdflib.Literal(tokens.since, datatype = xsd_date)),
        (workplace_node, POLNS["until"],
            rdflib.Literal(tokens.until or "1900", datatype = xsd_date)),
    ]
    # Only describe a profession when one was actually present in the text.
    if tokens.profession:
        profession_node = rdflib.BNode()
        triples.extend([
            (profession_node, rdflib.RDF.type, POLNS["Profession"]),
            (profession_node, POLNS["name"],
                rdflib.Literal(tokens.profession, lang = "et")),
            (member_node, POLNS["hasProfession"], profession_node),
        ])
    return tuple(triples)

def parse_workplaces(member_dict, member_node):
    """Parse every work-history entry of a member.

    member_dict -- dict whose ``field_key`` entry is an iterable of entry
                   strings
    member_node -- node of the member, passed on to parse_workplace

    Returns a list with one parse_workplace result per entry, in input order
    (an empty list when there are no entries).  Raises KeyError when the
    field is missing.
    """
    field_key = u"T\u00f6\u00f6kohad"
    # A list comprehension yields a real list on Python 2 and 3 alike.
    return [parse_workplace(entry, member_node)
            for entry in member_dict[field_key]]

#TODO: check whether the list values have been joined into one string
def parse_occupations(member_dict, member_node):
    """Parse every occupation entry of a member into RDF triples.

    member_dict -- dict whose ``field_key`` entry is an iterable of entry
                   strings shaped like ``<place words>[, <role words>]...``
    member_node -- node of the member the occupations belong to

    Returns a list with one tuple of four triples per entry, in input order
    (an empty list when there are no entries).  Raises KeyError when the
    field is missing and pyparsing's ParseException when an entry does not
    match.
    """
    field_key =  u"Tegevusalad"

    def _parse_occupation(string):
        """Parse one occupation entry and produce its RDF statements."""
        place = OneOrMore(uniword)
        role = OneOrMore(uniword)
        # Actions are set before the expressions are named below, so the
        # named copies carry them.
        role.setParseAction(concanate)
        place.setParseAction(concanate)
        occupation = (place("place") + ZeroOrMore(Literal(",") + role("role")))
        tokens = occupation.parseString(string)

        occupation_node = rdflib.BNode()
        # ``tokens.role`` is an empty string when no role was given; fall
        # back to "liige" (Estonian for "member") in that case.
        role_val = tokens.role or "liige"
        return ((occupation_node, rdflib.RDF.type, POLNS["Occupation"]),
                (occupation_node, POLNS["name"], rdflib.Literal(role_val, lang = "et")),
                (member_node, POLNS["hasRole"], occupation_node),
                (member_node, POLNS["roleAt"], rdflib.Literal(tokens.place)))

    # A list comprehension yields a real list on Python 2 and 3 alike.
    return [_parse_occupation(entry) for entry in member_dict[field_key]]


# Estonian field labels mapped to their English names.
# TODO: move these into a dedicated translation file (et-en.py).
field_translations = {
    u"S\u00fcnniaeg ja -koht": u"Birthtimeplace",
    u"Haridus": u"Education",
}

if __name__ == "__main__":
    # Script entry point: load the member list from JSON and print the
    # triples of its first member.
    import sys
    import simplejson
    # The input file may be given as the first command-line argument; without
    # one the original hard-coded location is used.
    default_path = "/home/timgluz/Projects/kalapea/result.json"
    json_path = sys.argv[1] if len(sys.argv) > 1 else default_path
    member_graph = rdflib.ConjunctiveGraph()
    # ``with`` closes the file even when the JSON cannot be loaded.
    with open(json_path) as fp:
        data = simplejson.load(fp)
    member = data[0]
    parse_data(member, member_graph)