Commits

Anonymous committed edd11e5

Added basic dt-* parsing with tests, stubbed e-* parsing

Comments (0)

Files changed (3)

                 # TODO: make this handle multiple spaces, tabs(?) separating classnames
                 classes = el.getAttribute("class").split(" ")
                 
-                # nested microformat parsing
+                # Is this element a microformat root?
                 root_classnames = [c for c in classes if c.startswith("h-")]
                 if len(root_classnames) > 0:
                     # this element represents a nested microformat
                         # nested microformat is a child microformat, parse and add to children
                         children.append(handle_microformat(root_classnames, el))
                 else:
-                    # simple property parsing
+                    # Parse plaintext p-* properties.
                     for prop in [c for c in classes if c.startswith("p-")]:
                         # TODO: parse for value-class here
                         prop_name = prop[2:]
                         if prop_value is not []:
                             props[prop_name] = prop_value
 
-                    # url property parsing
+                    # Parse URL u-* properties.
                     for prop in [c for c in classes if c.startswith("u-")]:
                         prop_name = prop[2:]
                         prop_value = props.get(prop_name, [])
 
                         # el/at matching
                         url_matched = False
-                        if el.nodeName == 'a' and el.hasAttribute("href"):
+                        if el.nodeName in ("a", "area") and el.hasAttribute("href"):
                             prop_value.append(url_relative(el.getAttribute("href")))
                             url_matched = True
-                        elif el.nodeName == 'area' and el.hasAttribute("href"):
-                            prop_value.append(url_relative(el.getAttribute("href")))
-                            url_matched = True
-                        elif el.nodeName == 'img' and el.hasAttribute("src"):
+                        elif el.nodeName == "img" and el.hasAttribute("src"):
                             prop_value.append(url_relative(el.getAttribute("src")))
                             url_matched = True
-                        elif el.nodeName == 'object' and el.hasAttribute("data"):
+                        elif el.nodeName == "object" and el.hasAttribute("data"):
                             prop_value.append(url_relative(el.getAttribute("data")))
                             url_matched = True
 
 
                         if prop_value is not []:
                             props[prop_name] = prop_value
+                    
+                    # Parse datetime dt-* properties.
+                    for prop in [c for c in classes if c.startswith("dt-")]:
+                        prop_name = prop[3:]
+                        prop_value = props.get(prop_name, [])
+                        
+                        # TODO: parse value-class pattern including datetime parsing rules.
+                        # http://microformats.org/wiki/value-class-pattern
+                        
+                        if el.nodeName in ("time", "ins", "del") and el.hasAttribute("datetime"):
+                            prop_value.append(el.getAttribute("datetime"))
+                        elif el.nodeName == "abbr" and el.hasAttribute("title"):
+                            prop_value.append(el.getAttribute("title"))
+                        elif el.nodeName in ("data", "input") and el.hasAttribute("value"):
+                            prop_value.append(el.getAttribute("value"))
+                        else:
+                            prop_value.append(el.firstChild.nodeValue)
+                        
+                        props[prop_name] = prop_value
+
+                    # Parse embedded markup e-* properties.
+                    for prop in [c for c in classes if c.startswith("e-")]:
+                        prop_name = prop[2:]
+                        prop_value = props.get(prop_name, [])
+                        
+                        
+                        
+                        props[prop_name] = prop_value
             
             parsed.add(el)
             

test/examples/datetimes.html

+<!DOCTYPE html>
+<html>
+<head>
+ <meta http-equiv="content-type" content="text/html; charset=utf-8">
+ <title>Hello World</title>
+</head>
+<body>
+ <div class="h-event">
+  <h1 class="p-name">Microformats Hootenanny</h1>
+  Arrive <span class="dt-start">2014-01-01T12:00:00+00:00</span>, stay until
+  <time class="dt-end" datetime="3014-01-01T18:00:00+00:00">six pee em GMT on the first day of the first month, year three thousand, four and ten.</time>
+  
+  That’s right, the microformats <del class="dt-updated" datetime="2011-08-26T00:01:21+00:00">party</del><ins class="dt-updated" datetime="2011-08-26T00:01:21+00:00">hootenanny</ins> lasts for <abbr class="dt-duration" title="P1000Y">ONE THOUSAND YEARS</abbr>.
+ </div>
+</body>
+</html>
     assert result["items"][4]["properties"]["photo"][0] == "http://tommorris.org/photo.png"
     assert result["items"][4]["properties"]["name"][0] == "Tom Morris"
 
+def test_datetime_parsing():  # dt-* values from the datetimes.html fixture must come back as the raw strings
+    result = parse_fixture("datetimes.html")
+    pprint(result)  # pretty-prints the whole parsed result
+    assert result["items"][0]["properties"]["start"][0] == "2014-01-01T12:00:00+00:00"  # <span class="dt-start">: value is the element's text
+    assert result["items"][0]["properties"]["end"][0] == "3014-01-01T18:00:00+00:00"  # <time class="dt-end">: datetime attribute, not the prose inside the element
+    assert result["items"][0]["properties"]["duration"][0] == "P1000Y"  # <abbr class="dt-duration">: title attribute
+    assert result["items"][0]["properties"]["updated"][0] == "2011-08-26T00:01:21+00:00"  # <del> and <ins> both carry dt-updated with the same datetime attribute...
+    assert result["items"][0]["properties"]["updated"][1] == "2011-08-26T00:01:21+00:00"  # ...so the property must hold two values
+
 def test_backcompat():
     result = parse_fixture("backcompat.html")
     assert set(result["items"][0]["type"]) == set(["h-card"])