Commits

Toby Inkster committed fac8976

Switch to proper HTML::Selector::XPath

Comments (0)

Files changed (5)

lib/XML/LibXML/QuerySelector.pm

+package XML::LibXML::QuerySelector;
+
 use 5.010;
 use common::sense;
 use utf8;
 
+use HTML::Selector::XPath 0.13 qw//;
+use XML::LibXML 1.70 qw//;
+
+BEGIN
 {
-	package XML::LibXML::QuerySelector;
-
-	use XML::LibXML qw//;
-
-	BEGIN
-	{
-		$XML::LibXML::QuerySelector::AUTHORITY = 'cpan:TOBYINK';
-		$XML::LibXML::QuerySelector::VERSION   = '0.001';
-		
-		push @XML::LibXML::Document::ISA, __PACKAGE__;
-		push @XML::LibXML::DocumentFragment::ISA, __PACKAGE__;
-		push @XML::LibXML::Element::ISA, __PACKAGE__;
-	}
-
-	my $contains = sub 
-	{
-		my ($self, $node) = @_;
-		my $self_path = $self->nodePath;
-		my $node_path = $node->nodePath;
-		my $sub_node_path = substr $node_path, 0, length $self_path;
-		$sub_node_path eq $self_path;
-	};
-
-	sub querySelectorAll
-	{
-		my ($self, $selector_string) = @_;
-		my $selector = XML::LibXML::QuerySelector::ToXPath->new($selector_string);
-		my $xpath = $selector->to_xpath(prefix => 'defaultns');
-		
-		my $document = $self->nodeName =~ /^#/ ? $self : $self->ownerDocument;
-		my $xc = XML::LibXML::XPathContext->new($document);
-		$xc->registerNs(defaultns => $document->documentElement->namespaceURI);
-		
-		if ($document == $self)
-		{
-			return $xc->findnodes($xpath);
-		}
-		
-		my @results = map
-			{ $self->$contains($_) ? ($_) : () }
-			@{[ $xc->findnodes($xpath) ]};
-		
-		wantarray ? @results : XML::LibXML::NodeList->new(@results);
-	}
-
-	sub querySelector
-	{
-		my ($self, $selector_string) = @_;
-		my $results = $self->querySelectorAll($selector_string);
-		return unless $results->size;
-		$results->shift;
-	}
+	$XML::LibXML::QuerySelector::AUTHORITY = 'cpan:TOBYINK';
+	$XML::LibXML::QuerySelector::VERSION   = '0.003';
+	
+	push @XML::LibXML::Document::ISA, __PACKAGE__;
+	push @XML::LibXML::DocumentFragment::ISA, __PACKAGE__;
+	push @XML::LibXML::Element::ISA, __PACKAGE__;
+	
+	eval { require Object::AUTHORITY; Object::AUTHORITY->import };
+	eval { require Object::DOES;      Object::DOES->import };
 }
 
+my $contains = sub 
 {
-	package XML::LibXML::QuerySelector::ToXPath;
+	my ($self, $node) = @_;
+	my $self_path = $self->nodePath;
+	my $node_path = $node->nodePath;
+	my $sub_node_path = substr $node_path, 0, length $self_path;
+	$sub_node_path eq $self_path;
+};
+
+sub querySelectorAll
+{
+	my ($self, $selector_string) = @_;
+	my $selector = HTML::Selector::XPath->new($selector_string);
 	
-	use Carp qw//;
+	my $document = $self->nodeName =~ /^#/ ? $self : $self->ownerDocument;
+	my $nsuri    = $document->documentElement->lookupNamespaceURI('');
 	
-	our @ISA;
-	BEGIN
+	my $xc = XML::LibXML::XPathContext->new($document);
+	$xc->registerNs(defaultns => $nsuri) if $nsuri;
+
+	my $xpath = defined $nsuri
+		? $selector->to_xpath(prefix => 'defaultns')
+		: $selector->to_xpath;
+
+	if ($document == $self)
 	{
-		$XML::LibXML::QuerySelector::ToXPath::AUTHORITY = 'cpan:TOBYINK';
-		$XML::LibXML::QuerySelector::ToXPath::VERSION   = '0.001';
-		
-		require HTML::Selector::XPath;
-		@ISA = qw/HTML::Selector::XPath/;
+		return $xc->findnodes($xpath);
 	}
 	
-	# XXX: Identifiers should also allow any characters U+00A0 and higher, and any
-	# escaped characters.
-	my $ident = qr/(?![0-9]|-[-0-9])[-_a-zA-Z0-9]+/;
+	my @results = map
+		{ $self->$contains($_) ? ($_) : () }
+		@{[ $xc->findnodes($xpath) ]};
 	
-	my $reg = {
-		# tag name/id/class
-		element => qr/^([#.]?)([a-z0-9\\*_-]*)((\|)([a-z0-9\\*_-]*))?/i,
-		# attribute presence
-		attr1   => qr/^\[ \s* ($ident) \s* \]/x,
-		# attribute value match
-		attr2   => qr/^\[ \s* ($ident) \s*
-			( [~|*^\$!]? = ) \s*
-			(?: ($ident) | "([^"]*)" ) \s* \] /x,
-		badattr => qr/^\[/,
-		attrN   => qr/^:not\((.*?)\)/i, # this should be a parentheses matcher instead of a RE!
-		pseudo  => qr/^:([()a-z0-9_+-]+)/i,
-		# adjacency/direct descendance
-		combinator => qr/^(\s*[>+~\s](?!,))/i,
-		# rule separator
-		comma => qr/^\s*,\s*/i,
-	};
-	
-	foreach (qw/
-			selector_to_xpath
-			convert_attribute_match
-			_generate_child
-			nth_child
-			nth_last_child
-			parse_pseudo
-		/)
-	{
-		no strict 'refs';
-		*{$_} = \&{"$ISA[0]\::$_"};
-	}
-	
-	sub to_xpath
-	{
-		my $self = shift;
-		my $rule = $self->{expression} or return;
-		my %parms = @_;
-		my $root = $parms{root} || '/';
-		
-		my @parts = ("$root/");
-		my $last_rule = '';
-		my @next_parts;
-		
-		my $tag;
-		my $wrote_tag;
-		my $tag_index;
-		my $root_index = 0; # points to the current root
-		# Loop through each "unit" of the rule
-		while (length $rule && $rule ne $last_rule)
-		{
-			$last_rule = $rule;
-			
-			$rule =~ s/^\s*|\s*$//g;
-			last unless length $rule;
-			
-			# Prepend explicit first selector if we have an implicit selector
-			# (that is, if we start with a combinator)
-			if ($rule =~ /$reg->{combinator}/)
-			{
-				$rule = "* $rule";
-			}
-			
-			# Match elements
-			if ($rule =~ s/$reg->{element}//)
-			{
-				my ($id_class,$name,$lang) = ($1,$2,$3);
-				
-				# to add *[1]/self:: for follow-sibling
-				if (@next_parts)
-				{
-					push @parts, @next_parts; #, (pop @parts);
-					@next_parts = ();
-				}
-				
-				if ($id_class eq '')
-				{
-					$tag = $name || '*';
-				}
-				else
-				{
-					$tag = '*';
-				}
-				
-				if (defined $parms{prefix} and not $tag =~ /[*:|]/)
-				{
-					$tag = join ':', $parms{prefix}, $tag;
-				}
-				
-				if (! $wrote_tag++)
-				{
-					push @parts, $tag;
-					$tag_index = $#parts;
-				}
-				
-				# XXX Shouldn't the RE allow both, ID and class?
-				if ($id_class eq '#')
-				{ # ID
-					push @parts, "[\@id='$name']";
-				}
-				elsif ($id_class eq '.')
-				{ # class
-					push @parts, "[contains(concat(' ', \@class, ' '), ' $name ')]";
-				}
-			}
-			
-			# Match attribute selectors
-			if ($rule =~ s/$reg->{attr2}//)
-			{
-				push @parts, "[", convert_attribute_match( $1, $2, $^N ), "]";
-			}
-			elsif ($rule =~ s/$reg->{attr1}//)
-			{
-				# If we have no tag output yet, write the tag:
-				if (! $wrote_tag++)
-				{
-					push @parts, '*';
-					$tag_index = $#parts;
-				}
-				push @parts, "[\@$1]";
-			}
-			elsif ($rule =~ $reg->{badattr})
-			{
-				Carp::croak "Invalid attribute-value selector '$rule'";
-			}
-			
-			# Match negation
-			if ($rule =~ s/$reg->{attrN}//)
-			{
-				my $sub_rule = $1;
-				if ($sub_rule =~ s/$reg->{attr2}//)
-				{
-					push @parts, "[not(", convert_attribute_match( $1, $2, $^N ), ")]";
-				}
-				elsif ($sub_rule =~ s/$reg->{attr1}//)
-				{
-					push @parts, "[not(\@$1)]";
-				}
-				elsif ($rule =~ $reg->{badattr})
-				{
-					Carp::croak "Invalid attribute-value selector '$rule'";
-				}
-				else
-				{
-					my $xpath = selector_to_xpath($sub_rule);
-					$xpath =~ s!^//!!;
-					push @parts, "[not(self::$xpath)]";
-				}
-			}
-			
-			# Ignore pseudoclasses/pseudoelements
-			while ($rule =~ s/$reg->{pseudo}//)
-			{
-				if ( my @expr = $self->parse_pseudo($1, \$rule) )
-				{
-					push @parts, @expr;
-				}
-				elsif ( $1 eq 'first-child')
-				{
-					# Translates to :nth-child(1)
-					push @parts, nth_child(1);
-				}
-				elsif ( $1 eq 'last-child')
-				{
-					push @parts, nth_last_child(1);
-				}
-				elsif ( $1 eq 'only-child')
-				{
-					push @parts, nth_child(1), nth_last_child(1);
-				}
-				elsif ($1 =~ /^lang\(([\w\-]+)\)$/)
-				{
-					push @parts, "[\@xml:lang='$1' or starts-with(\@xml:lang, '$1-')]";
-				}
-				elsif ($1 =~ /^nth-child\((\d+)\)$/)
-				{
-					push @parts, nth_child($1);
-				}
-				elsif ($1 =~ /^nth-child\((\d+)n(?:\+(\d+))?\)$/)
-				{
-					push @parts, nth_child($1, $2||0);
-				}
-				elsif ($1 =~ /^nth-last-child\((\d+)\)$/)
-				{
-					push @parts, nth_last_child($1);
-				}
-				elsif ($1 =~ /^nth-last-child\((\d+)n(?:\+(\d+))?\)$/)
-				{
-					push @parts, nth_last_child($1, $2||0);
-				}
-				elsif ($1 =~ /^first-of-type$/)
-				{
-					push @parts, "[1]";
-				}
-				elsif ($1 =~ /^nth-of-type\((\d+)\)$/)
-				{
-					push @parts, "[$1]";
-				}
-				elsif ($1 =~ /^contains\($/)
-				{
-					$rule =~ s/^\s*"([^"]*)"\s*\)\s*$//
-						or die "Malformed string in :contains(): '$rule'";
-					push @parts, qq{[text()[contains(string(.),"$1")]]};
-				}
-				elsif ( $1 eq 'root')
-				{
-					# This will give surprising results if you do E > F:root
-					$parts[$root_index] = $root;
-				}
-				elsif ( $1 eq 'empty')
-				{
-					push @parts, "[not(* or text())]";
-				}
-				else
-				{
-					Carp::croak "Can't translate '$1' pseudo-class";
-				}
-			}
-			
-			# Match combinators (whitespace, >, + and ~)
-			if ($rule =~ s/$reg->{combinator}//)
-			{
-				my $match = $1;
-				if ($match =~ />/)
-				{
-					push @parts, "/";
-				}
-				elsif ($match =~ /\+/)
-				{
-					push @parts, "/following-sibling::*[1]/self::";
-					$tag_index = $#parts;
-				}
-				elsif ($match =~ /\~/)
-				{
-					push @parts, "/following-sibling::";
-				}
-				elsif ($match =~ /^\s*$/)
-				{
-					push @parts, "//"
-				}
-				else
-				{
-					die "Weird combinator '$match'"
-				}
-				
-				# new context
-				undef $tag;
-				undef $wrote_tag;
-			}
-			
-			# Match commas
-			if ($rule =~ s/$reg->{comma}//)
-			{
-				push @parts, " | ", "$root/"; # ending one rule and beginning another
-				$root_index = $#parts;
-				undef $tag;
-				undef $wrote_tag;
-			}
-		}
-		return join '', @parts;
-	}
+	wantarray ? @results : XML::LibXML::NodeList->new(@results);
+}
+
+sub querySelector
+{
+	my ($self, $selector_string) = @_;
+	my $results = $self->querySelectorAll($selector_string);
+	return unless $results->size;
+	$results->shift;
 }
 
 __FILE__
 
 =head1 NAME
 
-XML::LibXML::QuerySelector - add querySelector and querySelectorAll methods to XML::LibXML::Node
+XML::LibXML::QuerySelector - add querySelector and querySelectorAll methods to XML::LibXML nodes
 
 =head1 SYNOPSIS
 
   my $document = XML::LibXML->new->parse_file('my.xhtml');
-  my $warning  = $document->querySelector('p.warning');
+  my $warning  = $document->querySelector('p.warning strong');
   print $warning->toString if defined $warning;
 
 =head1 DESCRIPTION
 
 =back
 
+=head1 CAVEATS
+
+=over
+
+=item * When called on an element, C<querySelectorAll> returns a static
+node list; not a live node list. (Called on a document or document
+fragment, it will return a live node list as specified in the W3C
+Candidate Recommendation.)
+
+=item * Use on mixed-namespace documents is largely untested. The module
+is mostly intended for use with XHTML and HTML documents.
+
+=back
+
 =head1 BUGS
 
 Please report any bugs to
 L<http://rt.cpan.org/Dist/Display.html?Queue=XML-LibXML-QuerySelector>.
 
+=head1 TODO
+
+=over
+
+=item * Consider adding HTML5 DOM traversal methods including
+C<getElementsByClassName>.
+
+=back
+
 =head1 SEE ALSO
 
 L<HTML::Selector::XPath>,
 
 Toby Inkster E<lt>tobyink@cpan.orgE<gt>.
 
+=head1 THANKS
+
+Tatsuhiko Miyagawa and Max Maischein, for HTML::Selector::XPath, and for
+resolving L<https://rt.cpan.org/Ticket/Display.html?id=73719> quickly.
+
 =head1 COPYRIGHT AND LICENCE
 
 This software is copyright (c) 2012 by Toby Inkster.
 dist:project :release dist:v_0-001 .
 dist:v_0-001
 	a               :Version ;
-	dc:issued       "2012-01-03"^^xsd:date ;
+	dc:issued       "2012-01-05"^^xsd:date ;
 	:revision       "0.001"^^xsd:string ;
 	:file-release   <http://backpan.cpan.org/authors/id/T/TO/TOBYINK/XML-LibXML-QuerySelector-0.001.tar.gz> ;
 	rdfs:label      "Initial release" .
 
+dist:project :release dist:v_0-002 .
+dist:v_0-002
+	a               :Version ;
+	dc:issued       "2012-01-07"^^xsd:date ;
+	:revision       "0.002"^^xsd:string ;
+	:file-release   <http://backpan.cpan.org/authors/id/T/TO/TOBYINK/XML-LibXML-QuerySelector-0.002.tar.gz> ;
+	dcs:changeset [
+		dcs:versus dist:v_0-001 ;
+		dcs:item   [ rdfs:label "Remove spurious 'use Data::Printer' from t/02level1.t"@en ; a dcs:Packaging ]
+		] .
+
+dist:project :release dist:v_0-003 .
+dist:v_0-003
+	a               :Version ;
+	dc:issued       "2012-01-17"^^xsd:date ;
+	:revision       "0.003"^^xsd:string ;
+	:file-release   <http://backpan.cpan.org/authors/id/T/TO/TOBYINK/XML-LibXML-QuerySelector-0.003.tar.gz> ;
+	dcs:changeset [
+		dcs:versus dist:v_0-002 ;
+		dcs:item   [ rdfs:label "Dropped XML::LibXML::QuerySelector::ToXPath, which was a bundled and modified version of HTML::Selector::XPath."@en; a dcs:Removal ];
+		dcs:item   [ rdfs:label "use HTML::Selector::XPath 0.13"@en ]
+		] .
 	a               :Project ;
 	:programming-language "Perl" ;
 	:name           "XML-LibXML-QuerySelector" ;
-	:shortdesc      "add querySelector and querySelectorAll methods to XML::LibXML::Node" ;
+	:shortdesc      "add querySelector and querySelectorAll methods to XML::LibXML nodes" ;
 	:homepage       <https://metacpan.org/release/XML-LibXML-QuerySelector> ;
 	:download-page  <https://metacpan.org/release/XML-LibXML-QuerySelector> ;
 	:bug-database   <http://rt.cpan.org/Dist/Display.html?Queue=XML-LibXML-QuerySelector> ;
+	:repository     [ a :HgRepository ; :browse <https://bitbucket.org/tobyink/p5-xml-libxml-queryselector> ] ;
 	:created        "2012-01-03"^^xsd:date ;
 	:license        <http://dev.perl.org/licenses/> ;
 	:developer      [ a foaf:Person ; foaf:name "Toby Inkster" ; foaf:mbox <mailto:tobyink@cpan.org> ] .

meta/makefile.ttl

 	:version_from _:main ;
 	:readme_from _:main ;
 	:test_requires "Test::More 0.61" ;
-	:requires "Carp", "HTML::Selector::XPath", "XML::LibXML 1.70" , "common::sense" .
+	:requires "Carp", "HTML::Selector::XPath 0.13", "XML::LibXML 1.70" , "common::sense" , "utf8" .
 
 _:main <http://www.semanticdesktop.org/ontologies/2007/03/22/nfo#fileName> "lib/XML/LibXML/QuerySelector.pm" .
 
 use Test::More tests => 7;
-use Data::Printer;
 use XML::LibXML::QuerySelector;
 
 my $document = XML::LibXML->new->parse_string(<<'XHTML');