# perl-XML-LibXML / t / 12html.t

# Strictures: catch undeclared or misspelled variables at compile time
# instead of letting them silently become package globals.
use strict;
use warnings;

use Test;
BEGIN { plan tests => 41 }
use XML::LibXML;
use IO::File;

# Reached only if the modules above compiled.
ok(1);

# HTML fixture used by the file, string and filehandle parsing tests below.
my $html = "example/test.html";

my $parser = XML::LibXML->new();

# Parse the fixture directly from its path.
{
    my $doc = $parser->parse_html_file($html);
    ok($doc);
}

my $fh = IO::File->new($html) or die "Can't open $html: $!";

# Slurp the whole fixture; $/ is localised so the change cannot leak out.
my $string;
{
    local $/;
    $string = <$fh>;
}

# Rewind so the same handle can be given to parse_html_fh() further down.
seek($fh, 0, 0) or die "Can't rewind $html: $!";

ok($string);

# Declared at file scope (not inside a bare block) so it stays visible to
# the rest of the script.
my $doc = $parser->parse_html_string($string);

ok($doc);

# Drop the string-parsed document before parsing from the handle.
undef $doc;

$doc = $parser->parse_html_fh($fh);

ok($doc);

$fh->close();

# Parsing HTML that contains a CGI-style link: the href below carries a
# bare '&' in its query string and the <p> element is never closed.

my $htmldoc;

my $strhref = <<"EOHTML";

<html>
    <body>
        <a href="http:/foo.bar/foobar.pl?foo=bar&bar=foo">
            foo
        </a>
        <p>test
    </body>
</html>
EOHTML

# Switch on the parser's recover option; it is left enabled afterwards.
$parser->recover(1);

eval {
    # Discard warnings for the duration of this parse only.
    local $SIG{__WARN__} = sub { };
    $htmldoc = $parser->parse_html_string($strhref);
};

# ok( not $@ );
ok($htmldoc);

print "parse_html_string with encoding...\n";
# Encoding handling of parse_html_string(). All 14 checks in this section
# need the Encode module and are skipped together when it cannot be loaded.
my $have_encode = eval { require Encode; };
if (!$have_encode) {
  skip("Encoding related tests require Encode") for 1..14;
}
else {
  use utf8;

  # Expected text content of the <p> element in every document below.
  my $utf_str = "ěščř";

  # --- document without a 'meta' charset declaration ---
  $strhref = <<"EOHTML";
<html>
  <body>
    <p>$utf_str</p>
  </body>
</html>
EOHTML

  # The interpolated document is a Perl character string.
  ok( Encode::is_utf8($strhref) );

  # No encoding option given.
  $htmldoc = $parser->parse_html_string($strhref);
  ok( $htmldoc && $htmldoc->getDocumentElement );
  ok( $htmldoc->findvalue('//p/text()'), $utf_str );

  # Explicit UTF-8 encoding option.
  $htmldoc = $parser->parse_html_string( $strhref, { encoding => 'UTF-8' } );
  ok( $htmldoc && $htmldoc->getDocumentElement );
  ok( $htmldoc->findvalue('//p/text()'), $utf_str );

  # Same document encoded to iso-8859-2 bytes, with a matching option.
  my $iso_str = Encode::encode( 'iso-8859-2', $strhref );
  $htmldoc = $parser->parse_html_string( $iso_str, { encoding => 'iso-8859-2' } );
  ok( $htmldoc && $htmldoc->getDocumentElement );
  ok( $htmldoc->findvalue('//p/text()'), $utf_str );

  # --- document with a 'meta' charset declaration ---
  $strhref = <<"EOHTML";
<html>
  <head>
    <meta http-equiv="Content-Type" content="text/html;
      charset=iso-8859-2">
  </head>
  <body>
    <p>$utf_str</p>
  </body>
</html>
EOHTML

  # Character string input parsed with an explicit UTF-8 option, although
  # the meta element names iso-8859-2.
  $htmldoc = $parser->parse_html_string( $strhref, { encoding => 'UTF-8' } );
  ok( $htmldoc && $htmldoc->getDocumentElement );
  ok( $htmldoc->findvalue('//p/text()'), $utf_str );

  # iso-8859-2 bytes with no option: only the meta element names the charset.
  $iso_str = Encode::encode( 'iso-8859-2', $strhref );
  $htmldoc = $parser->parse_html_string($iso_str);
  ok( $htmldoc && $htmldoc->getDocumentElement );
  ok( $htmldoc->findvalue('//p/text()'), $utf_str );

  # iso-8859-2 bytes with encoding and URI options; the URI must be
  # readable back from the resulting document.
  my %opts = ( encoding => 'iso-8859-2', URI => 'foo' );
  $htmldoc = $parser->parse_html_string( $iso_str, {%opts} );
  ok( $htmldoc && $htmldoc->getDocumentElement );
  ok( $htmldoc->findvalue('//p/text()'), $utf_str );
  ok( $htmldoc->URI, 'foo' );
}

print "parse example/enc_latin2.html...\n";
# w/ 'meta' charset
# Parses example/enc_latin2.html by file name and by filehandle, with and
# without explicit options, and checks the text of the <p> element each time.
{
  use utf8;
  my $utf_str = "ěščř";
  my $test_file = 'example/enc_latin2.html';
  my $fh;

  # By file name, no options.
  $htmldoc = $parser->parse_html_file( $test_file );
  ok( $htmldoc && $htmldoc->getDocumentElement );
  ok($htmldoc->findvalue('//p/text()'), $utf_str);

  # By file name, with explicit encoding and URI options.
  $htmldoc = $parser->parse_html_file( $test_file, { encoding => 'iso-8859-2',
                                                     URI => 'foo'
                                                   });
  ok( $htmldoc && $htmldoc->getDocumentElement );
  ok($htmldoc->findvalue('//p/text()'), $utf_str);
  ok($htmldoc->URI, 'foo');

  # By filehandle, no options. Three-argument open with an explicit check,
  # so a missing fixture is reported as such rather than as a parse failure.
  open $fh, '<', $test_file or die "Can't open $test_file: $!";
  $htmldoc = $parser->parse_html_fh( $fh );
  close $fh;
  ok( $htmldoc && $htmldoc->getDocumentElement );
  ok($htmldoc->findvalue('//p/text()'), $utf_str);

  # By filehandle, with explicit encoding and URI options.
  open $fh, '<', $test_file or die "Can't open $test_file: $!";
  $htmldoc = $parser->parse_html_fh( $fh, { encoding => 'iso-8859-2',
                                            URI => 'foo',
                                          });
  close $fh;
  ok( $htmldoc && $htmldoc->getDocumentElement );
  ok($htmldoc->URI, 'foo');
  ok($htmldoc->findvalue('//p/text()'), $utf_str);

  # The last two checks are skipped on Perl older than 5.8 and on libxml2
  # older than 2.6.27.
  if (1000*$] < 5008) {
    skip("skipping for Perl < 5.8") for 1..2;
  } elsif (20627 > XML::LibXML::LIBXML_VERSION) {
    skip("skipping for libxml2 < 2.6.27") for 1..2;
  } else {
    # translate to UTF8 on perl-side
    open $fh, '<:encoding(iso-8859-2)', $test_file
      or die "Can't open $test_file: $!";
    $htmldoc = $parser->parse_html_fh( $fh, { encoding => 'UTF-8' });
    close $fh;
    ok( $htmldoc && $htmldoc->getDocumentElement );
    ok($htmldoc->findvalue('//p/text()'), $utf_str);
  }
}

print "parse example/enc2_latin2.html...\n";
# w/o 'meta' charset
# Parses example/enc2_latin2.html by file name and by filehandle; every call
# passes the encoding explicitly as an option.
{
  use utf8;
  my $utf_str = "ěščř";
  my $test_file = 'example/enc2_latin2.html';
  my $fh;

  # By file name.
  $htmldoc = $parser->parse_html_file( $test_file, { encoding => 'iso-8859-2' });
  ok( $htmldoc && $htmldoc->getDocumentElement );
  ok($htmldoc->findvalue('//p/text()'), $utf_str);

  # By filehandle. Three-argument open with an explicit check, so a missing
  # fixture is reported as such rather than as a parse failure.
  open $fh, '<', $test_file or die "Can't open $test_file: $!";
  $htmldoc = $parser->parse_html_fh( $fh, { encoding => 'iso-8859-2' });
  close $fh;
  ok( $htmldoc && $htmldoc->getDocumentElement );
  ok($htmldoc->findvalue('//p/text()'), $utf_str);

  # NOTE(review): unlike the enc_latin2.html section, this branch has no
  # libxml2 version guard - confirm that is intentional.
  if (1000*$] < 5008) {
    skip("skipping for Perl < 5.8") for 1..2;
  } else {
    # translate to UTF8 on perl-side
    open $fh, '<:encoding(iso-8859-2)', $test_file
      or die "Can't open $test_file: $!";
    $htmldoc = $parser->parse_html_fh( $fh, { encoding => 'UTF-8' } );
    close $fh;
    ok( $htmldoc && $htmldoc->getDocumentElement );
    ok($htmldoc->findvalue('//p/text()'), $utf_str);
  }
}


{
  # 44715 (presumably a bug-tracker ticket number - confirm)
  #
  # Parses an XHTML document containing a named entity (&eacute;) and an
  # href with unescaped '&' characters, using a fresh parser with the
  # recover and suppress_errors options, then reads back an attribute value.

  my $html = <<'EOF';
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>Test &amp; Test some more</title>
</head>
<body>
<p>Meet you at the caf&eacute;?</p>
<p>How about <a href="http://example.com?mode=cafe&id=1&ref=foo">this one</a>?
</p>
<input class="wibble" id="foo" value="working" />
</body>
</html>
EOF
  my $parser = XML::LibXML->new;

  # Block-local document: keeps the result out of any outer variable and
  # guarantees it starts undefined if the parse dies.
  my $doc;
  eval {
    $doc = $parser->parse_html_string(
      $html => { recover => 1, suppress_errors => 1 }
     );
  };
  ok(!$@);
  ok($doc);

  # Guard each step so a failed parse yields failing tests, not a crash.
  my $root = $doc && $doc->documentElement;
  my $val = $root && $root->findvalue('//input[@id="foo"]/@value');

  # Two-argument ok() reports got/expected when the comparison fails.
  ok($val, 'working');
}
# NOTE: the lines below are code-browser UI residue, not part of the test
# script; they are commented out so the file remains valid Perl.
# Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
# Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
# Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
# Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
# Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
# Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
# Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.