Commits

Tom Morris committed b02a554

adding simple u- property parsing

Comments (0)

Files changed (3)

                         prop_name = prop.replace("p-", "")
                         prop_value = props.get(prop_name, [])
                         prop_value.append(el.firstChild.nodeValue)
-                        props[prop_name] = prop_value
+
+                        if prop_value is not []:
+                            props[prop_name] = prop_value
+
+                potential_url_property_signifiers = [x for x in classes if x.startswith("u-")]
+                if len(potential_url_property_signifiers) > 0:
+                    for prop in potential_url_property_signifiers:
+                        prop_name = prop.replace("u-", "")
+                        prop_value = props.get(prop_name, [])
+                        if el.nodeName == 'a' and el.hasAttribute("href"):
+                            prop_value.append(el.getAttribute("href"))
+                        elif el.nodeName == 'area' and el.hasAttribute("href"):
+                            prop_value.append(el.getAttribute("href"))
+
+                        if prop_value is not []:
+                            props[prop_name] = prop_value
 
             for child in [x for x in el.childNodes if x.nodeType is 1]:
                 res = parse_props(child)

test/examples/person_with_url.html

+<!DOCTYPE html>
+<html>
+<head>
+  <meta http-equiv="content-type" content="text/html; charset=utf-8">
+  <title>Simple_person_reference</title>
+</head>
+<body>
+  <span class="h-card">
+  <span class="p-name">Tom Morris</span>
+  <a href="http://tommorris.org/" class="u-url">tommorris.org</a>
+  </span>
+</body>
+</html>

test/test_parser.py

     p = Parser(open("test/examples/simple_person_reference_same_element.html"))
     result = p.to_dict()
     assert result["items"][0]["properties"] == {u'name': [u'Frances Berriman']}
+
+def test_person_with_url():
+    """Check that an h-card's p-name and u-url properties are both parsed.
+
+    The fixture holds a single h-card containing a ``p-name`` span and an
+    ``a`` element with class ``u-url``. The expected URL is the link's
+    ``href`` attribute value, not the link's text content.
+    """
+    # A context manager closes the fixture file even when parsing or an
+    # assertion fails. Parsing stays inside the block in case Parser reads
+    # the file lazily.
+    with open("test/examples/person_with_url.html") as fixture:
+        p = Parser(fixture)
+        result = p.to_dict()
+    properties = result["items"][0]["properties"]
+    assert properties["name"] == [u'Tom Morris']
+    assert properties["url"] == [u'http://tommorris.org/']