Commits

Anonymous committed af6ff4a

parsers/extractor for income declaration. little bit buggy.

Comments (0)

Files changed (1)

riigikogu/interests.py

+"""
+Extracts information from "Ametiisikute majanduslike huvide deklaratsioonid"
+
+"""
+import re
+import requests
+import logging
+from StringIO import StringIO
+
+from lxml import etree
+from lxml import html
+
+def fetch_doc():
+    """Download the declarations XML and return it as an in-memory buffer.
+
+    Returns:
+        A ``StringIO`` holding the raw response body, positioned at the
+        start so it can be handed straight to a parser.
+
+    Raises:
+        requests.exceptions.RequestException: if the request fails, times
+            out, or the server answers with an HTTP error status.
+    """
+    url = "https://www.riigiteataja.ee/akt/13330154.xml"
+    # A timeout keeps the call from hanging forever on a stalled connection.
+    response = requests.get(url, timeout=30)
+    logging.debug("fetched public interests: status %s", response.ok)
+    # Fail here on 4xx/5xx instead of feeding an error page to the XML parser.
+    response.raise_for_status()
+    # A freshly constructed StringIO already starts at position 0.
+    return StringIO(response.content)
+
+
+def extract_data(dtree):
+    """Extract declaration rows from the HTML embedded in the XML document.
+
+    The text of the first ``HTMLKonteiner`` element is parsed as HTML, and
+    every direct child ``<p class="MsoNormal">`` of the parsed tree is
+    cleaned and passed on to be converted into a row.
+
+    Args:
+        dtree: parsed lxml element tree of the document.
+
+    Returns:
+        A list of rows; empty when the document has no ``HTMLKonteiner``
+        element or that element carries no text.
+
+    Raises:
+        NotImplementedError: as soon as a paragraph has to be converted
+            into a row, because that conversion is not written yet.
+    """
+    root = dtree.getroot()
+    containers = root.xpath("//*[local-name() = 'HTMLKonteiner']")
+    if not containers or not containers[0].text:
+        logging.warning("no HTMLKonteiner content found in document")
+        return []
+
+    def _fix_html(html_tree):
+        """Mark the first ``<p>`` that directly contains a ``<b>`` as a row.
+
+        The paragraph gets ``class="MsoNormal"`` so the row lookup below
+        finds it.
+        """
+        bold = html_tree.find(".//p/b")
+        if bold is None:
+            return
+        # NOTE(review): the original comment asked to add the class to the
+        # "ergma" paragraph, so the class is set on the parent <p> rather
+        # than on the <b> itself -- confirm against the real document.
+        bold.getparent().set("class", "MsoNormal")
+
+    def _clean_string(text):
+        """Strip noise from a paragraph's text.
+
+        Removes literal backslash-n sequences, non-breaking spaces and
+        runs of two or more whitespace characters.
+        """
+        return re.sub(r"(\\n)|(\xa0)|([\s]{2,})", "", text)
+
+    # Number of fields each person has, minus one; not used yet.
+    step = 11
+
+    def _parse_row(row_str):
+        """Convert one cleaned paragraph string into a row (not written yet)."""
+        raise NotImplementedError("data to string")
+
+    def _generate_row(html_tree):
+        """Yield one row for each ``MsoNormal`` paragraph of *html_tree*."""
+        _fix_html(html_tree)
+        for element in html_tree.findall("p[@class='MsoNormal']"):
+            yield _parse_row(_clean_string(element.text_content()))
+
+    html_tree = html.fromstring(containers[0].text)
+    logging.debug("before generator")
+    return list(_generate_row(html_tree))
+
+def extract_meta(dtree):
+    """Placeholder for metadata extraction; currently always returns None."""
+    return None
+
+def main():
+    """Fetch the document, parse it and return ``(meta_data, data)``."""
+    document = fetch_doc()
+    dtree = etree.parse(document)
+    # Tuple items are evaluated left to right: metadata first, then data.
+    return extract_meta(dtree), extract_data(dtree)
+
+# Run the extraction when the file is executed as a script; the value
+# returned by main() is not used here.
+if __name__ == "__main__":
+    main()