Commits

Toby Inkster committed f5f5ffe

Create XRD::Parser module, improve H:H:M::Parser docs.

  • Participants
  • Parent commits aba8a9b

Comments (0)

Files changed (2)

 use HTML::HTML5::Microdata::Parser;
 
 my $h = <<HTML;
-<html lang="en">
+<html lang="">
+<head>
 	<title >Foo bar</title>
 	<link rel="up up up FOOBLE alternate stylesheet fooble http://EXAMPLE.COM/" href="foo.css">
 	<meta name="http://search.cpan.org/dist/HTML-HTML5-Microdata-Parser/#auto_config"
 	  content="xhtml_time=1" />
-</html>
+	  <meta  http-equiv=content-language content=en-gb-oed>
+	</head>
 
 <div itemscope>
  <p>My name is <span itemprop="name">Elizabeth</span>.</p>
  <p>My name is <span itemprop="name">Neil</span>.</p>
  <p>My band is called <span itemprop="band">Four Parts Water</span>.</p>
  <p>I am <span itemprop="nationality">British</span>.</p>
+ <p itemprop="http://example.net/">Foo</p>
 </div>
 
-<section itemref="fooble" itemscope itemid="#hedral" itemtype="http://example.org/animals#cat">
+<section xml:lang="en-us" itemref="fooble" itemscope itemid="#hedral" itemtype="http://example.org/animals#cat">
  <h1 itemprop="name">Hedral</h1>
  <p itemprop="desc">Hedral is a male american domestic
  shorthair, with a fluffy black fur with white paws and belly.</p>

File lib/HTML/HTML5/Microdata/Parser.pm

 
   * alt_stylesheet  - Magic rel="alternate stylesheet". [1]
   * auto_config     - See section "Auto Config" [0]
+  * mhe_lang        - Process <meta http-equiv=Content-Language>.
+                      [1]
   * prefix_empty    - URI prefix for itemprops of untyped items.
                       [undef]
+  * tdb_service     - thing-described-by.org when possible. [0] 
+  * xhtml_base      - Process <base href> element. [1]
+  * xhtml_lang      - Process @lang. [1]
   * xhtml_time      - Process <time> element nicely. [0]
+  * xml_lang        - Process @xml:lang. [1]
 
 $storage is an RDF::Trine::Storage object. If undef, then a new
 temporary store is created.
 			'options' => {
 				'alt_stylesheet'  => 1,
 				'auto_config'     => 0,
+				'mhe_lang'        => 1,
 				'prefix_empty'    => undef,
+				'tdb_service'     => 0,
+				'xhtml_base'      => 1,
+				'xhtml_lang'      => 1,
+				'xhtml_time'      => 0,
+				'xml_lang'        => 1,
 				},
-			'Graphs'  => {},
+			'default_language' => undef,
 		};
 	bless $this, $class;
 	
 	$this->auto_config;
 	
 	# HTML <base> element.
-	my @bases = $this->{DOM}->getElementsByTagName('base');
-	my $base;
-	foreach my $b (@bases)
+	if ($this->{'options'}->{'xhtml_base'})
 	{
-		if ($b->hasAttribute('href'))
+		my @bases = $this->{DOM}->getElementsByTagName('base');
+		my $base;
+		foreach my $b (@bases)
 		{
-			$base = $b->getAttribute('href');
-			$base =~ s/#.*$//g;
+			if ($b->hasAttribute('href'))
+			{
+				$base = $b->getAttribute('href');
+				$base =~ s/#.*$//g;
+			}
+		}
+		$this->{'baseuri'} = $this->uri($base)
+			if defined $base && length $base;
+	}
+	
+	if ($this->{'options'}->{'mhe_lang'})
+	{
+		my $xpc = XML::LibXML::XPathContext->new;
+		$xpc->registerNs('x', 'http://www.w3.org/1999/xhtml');
+		my $nodes = $xpc->find('//x:meta[translate(@http-equiv,"CONTENT-LANGUAGE","content-language"="content-language")]/@content', $this->{'DOM'}->documentElement);
+		foreach my $node ($nodes->get_nodelist)
+		{
+			if ($node->getValue =~ /^\s*([^\s,]+)/)
+			{
+				my $lang = $1;
+				if (valid_lang($lang))
+				{
+					$this->{'default_language'} = $lang;
+					last;
+				}
+			}
 		}
 	}
-	$this->{'baseuri'} = $this->uri($base)
-		if defined $base && length $base;
-	
+		
 	return $this;
 }
 
 		$this->rdf_triple(
 			$item,
 			$this->{'baseuri'},                             # subject : the document's current address 
-			'http://www.w3.org/1999/xhtml/microdata#item ', # predicate : http://www.w3.org/1999/xhtml/microdata#item  
+			'http://www.w3.org/1999/xhtml/microdata#item',  # predicate : http://www.w3.org/1999/xhtml/microdata#item  
 			$item_address);                                 # object : [item address]
 	}
 	
 		$this->rdf_triple(
 			$item,
 			$item_address,                                      # subject : [item address]
-			'http://www.w3.org/1999/02/22-rdf-syntax-ns#type ', # predicate : http://www.w3.org/1999/02/22-rdf-syntax-ns#type  
+			'http://www.w3.org/1999/02/22-rdf-syntax-ns#type',  # predicate : http://www.w3.org/1999/02/22-rdf-syntax-ns#type  
 			$item_type);                                        # object : [item type]
 	}
 	
 	my $this = shift;
 	my $node = shift;
 
-	my @namespaces = (XML_XML_NS, 'http://www.w3.org/1999/xhtml', undef);
-	foreach my $ns (@namespaces)
+	my $XML_XHTML_NS = 'http://www.w3.org/1999/xhtml';
+
+	if ($this->{'options'}->{'xml_lang'}
+	&&  $node->hasAttributeNS(XML_XML_NS, 'lang'))
 	{
-		if ($node->hasAttributeNS($ns, 'lang')
-		&&  valid_lang($node->getAttributeNS($ns, 'lang')))
-		{
-			return $node->getAttributeNS($ns, 'lang');
-		}
+		return valid_lang($node->getAttributeNS(XML_XML_NS, 'lang')) ?
+			$node->getAttributeNS(XML_XML_NS, 'lang'):
+			undef;
 	}
-	
+
+	if ($this->{'options'}->{'xhtml_lang'}
+	&&  $node->hasAttributeNS($XML_XHTML_NS, 'lang'))
+	{
+		return valid_lang($node->getAttributeNS($XML_XHTML_NS, 'lang')) ?
+			$node->getAttributeNS($XML_XHTML_NS, 'lang'):
+			undef;
+	}
+
+	if ($this->{'options'}->{'xhtml_lang'}
+	&&  $node->hasAttributeNS(undef, 'lang'))
+	{
+		return valid_lang($node->getAttributeNS(undef, 'lang')) ?
+			$node->getAttributeNS(undef, 'lang'):
+			undef;
+	}
+
 	if ($node != $this->{'DOM'}->documentElement
-	&&  defined $node->parentNode)
+	&&  defined $node->parentNode
+	&&  $node->parentNode->nodeType == XML_ELEMENT_NODE)
 	{
 		return $this->get_node_lang($node->parentNode);
 	}
 	
-	return undef;
+	return $this->{'default_language'};
 }
 
 sub rdf_triple
 		 | ($privateUse)
 		 | ($grandfathered)
 		 )$/xi);
+	
 	return $r;
 }
 
 	
 	foreach my $o (keys %$options)
 	{
-		next unless $o=~ /^(alt_stylesheet | prefix_empty | xhtml_time)$/ix;		
+		next unless $o=~ /^(alt_stylesheet | mhe_lang | prefix_empty | 
+			xhtml_base | xhtml_lang | xhtml_time | xml_lang)$/ix;	
 		$count++;
 		$this->{'options'}->{lc $o} = $options->{$o};
 	}