1. Barnaby Walters
  2. mf2py

Commits

Tom Morris committed 733855d

Got simple property parsing working, without any of the magical inference stuff.

  • Participants
  • Parent commits 80490b3
  • Branches default

Comments (0)

Files changed (3)

File mf2py/parser.py

View file
  • Ignore whitespace
     def __init__(self, *args, **kwargs):
         self.__url__ = None
         self.__doc__ = None
+        self.__parsed__ = {"items": [], "rels": {}}
 
         if len(args) > 0:
             if type(args[0]) is file:
                         if urlparse(poss_base.getAttribute("href")).netloc is not '':
                             self.__url__ = poss_base.getAttribute("href")
 
+        if self.__doc__ is not None:
+            # parse!
+            self.parse()
+
+    def parse(self):
+        def handle_microformat(microformat_name, el, ctx):
+            properties = parse_props(el, {})
+            microformat = {"type": [microformat_name],
+                           "properties": properties}
+            ctx.append(microformat)
+            print microformat
+
+        def parse_props(el, props = {}):
+            if el.hasAttribute("class"):
+                classes = el.getAttribute("class").split(" ")
+
+                # simple property parsing
+                potential_simple_property_signifiers = [x for x in classes if x.startswith("p-")]
+                if len(potential_simple_property_signifiers) > 0:
+                    for prop in potential_simple_property_signifiers:
+                        prop_name = prop.replace("p-", "")
+                        prop_value = props.get(prop_name, [])
+                        prop_value.append(el.firstChild.nodeValue)
+                        props[prop_name] = prop_value
+
+            for child in [x for x in el.childNodes if x.nodeType is 1]:
+                res = parse_props(child)
+                props.update(res)
+            return props
+
+        def parse_el(el, ctx):
+            potential_microformats = []
+
+            if el.hasAttribute("class"):
+                classes = el.getAttribute("class").split(" ")
+                potential_microformats = [x for x in classes if x.startswith("h-")]
+
+            if len(potential_microformats) > 0:
+                for microformat_name in potential_microformats:
+                    handle_microformat(microformat_name, el, ctx)
+
+            for child in [x for x in el.childNodes if x.nodeType is 1]:
+                parse_el(child, ctx)
+
+        ctx = []
+        parse_el(self.__doc__.documentElement, ctx)
+
     def to_dict(self):
-        return { "items": [], "rels": {} }
+        return self.__parsed__
     
     def to_json(self):
         return json.dumps(self.to_dict())

File test/examples/simple_person_reference.html

View file
  • Ignore whitespace
+<!DOCTYPE html>
+<html>
+<head>
+  <meta http-equiv="content-type" content="text/html; charset=utf-8">
+  <title>Simple_person_reference</title>
+</head>
+<body>
+  <span class="h-card">
+  <span class="p-name">Frances Berriman</span>
+  </span>
+</body>
+</html>

File test/test_parser.py

View file
  • Ignore whitespace
 def test_base():
     p = Parser(open("test/examples/base.html"))
     assert p.__url__ == u"http://tantek.com/"
+
+def test_simple_parse():
+    p = Parser(open("test/examples/simple_person_reference.html"))
+    print p.to_dict()
+    assert type(p.to_dict()["classes"]) is list