Commits

Toby Inkster  committed 59ea6ec

better string literals

  • Participants
  • Parent commits 124a03f

Comments (0)

Files changed (2)

File lib/RDF/TrineX/Serializer/MockTurtleSoup.pm

 	$self->{colspace}   //= 20;
 	$self->{indent}     ||= "\t";
 	$self->{repeats}    //= 0;
+	$self->{encoding}   ||= "utf8";
+	$self->{apostrophe} //= 0;
 	
-	croak("Bad indent!") unless $self->{indent} =~ /^\s+$/;
+	croak("Bad indent")
+		unless $self->{indent} =~ /^\s+$/;
+	croak("Bad encoding: expected 'utf8' or 'ascii'")
+		unless $self->{encoding} =~ /^(ascii|utf8)$/;
 	
 	return $self;
 }
 		}
 		elsif (defined $dt)
 		{
-			my $n2 = RDF::Trine::Node::Literal->new($n->literal_value);
-			return sprintf('%s^^%s', $n2->as_ntriples, $dt);
+			return sprintf('%s^^%s', $self->_escaped_quoted_string($n->literal_value), $dt);
 		}
 	}
+	elsif ($n->is_literal and $n->has_language)
+	{
+		return sprintf('%s@%s', $self->_escaped_quoted_string($n->literal_value), $n->literal_value_language);
+	}
+	elsif ($n->is_literal)
+	{
+		return $self->_escaped_quoted_string($n->literal_value);
+	}
 	
 	if ($n->is_blank)
 	{
 	return $n->as_ntriples;
 }
 
+{  # bare block: keeps %ESCAPE lexically private to the sub defined below
+	my %ESCAPE = (  # short backslash escapes, keyed by the raw character
+		"\t"     => "\\t",
+		"\r"     => "\\r",
+		"\n"     => "\\n",
+		"\""     => "\\\"",
+		"\'"     => "\\\'",  # NOTE(review): apostrophe is never added to $chars below, so this entry looks unused
+		"\\"     => "\\\\",
+	);
+	# Method: returns $str wrapped in quote marks, with every character matched by the class built below escaped.
+	sub _escaped_quoted_string
+	{
+		my $self = shift;
+		my ($str) = @_;
+		# Defaults: double-quote delimiter; always escape \x00-\x1F and the backslash (\x5C).
+		my $quote = '"';
+		my $chars = '\x00-\x1F\x5C';
+		# Switch to an apostrophe delimiter only when the option is set and $str has a double quote but no apostrophe.
+		if ($self->{apostrophe} and $str =~ /\"/ and not $str =~ /\'/)
+		{
+			$quote = "'";
+		}
+		else
+		{
+			$chars .= '\x22'  # double-quote delimiter in use, so the double-quote character must be escaped
+		}
+		# In "ascii" mode every code point from U+0080 upwards is added to the escape set as well.
+		if ($self->{encoding} eq "ascii")
+		{
+			$chars .= '\x{0080}-\x{10FFFF}';
+		}
+		# Prefer the short escape from %ESCAPE; otherwise emit \uXXXX up to U+FFFF and \UXXXXXXXX beyond.
+		$str =~ s{([$chars])}{
+			exists($ESCAPE{$1}) ? $ESCAPE{$1} :
+			ord($1) <= 0xFFFF   ? sprintf('\u%04X', ord($1)) : sprintf('\U%08X', ord($1))
+		}xeg;
+		# Implicit return value: the escaped text between the chosen quote marks.
+		"$quote$str$quote";
+	}
+}
+
 sub _serialize_bunch
 {
 	my $self = shift;
 
 =head1 SYNOPSIS
 
- my $ser = RDF::TrineX::Serializer::MockTurtleSoup->new(%opts);
+ use RDF::TrineX::Serializer::MockTurtleSoup;
+ 
+ my $ser = "RDF::TrineX::Serializer::MockTurtleSoup"->new(%opts);
  $ser->serialize_model_to_file($fh, $model);
 
 =head1 DESCRIPTION
 elsewhere. But also allow the user to supply a list of additional
 URIs that will be abbreviated to QNames:
 
- RDF::Trine::Serializer::MockTurtleSoup->new(
+ "RDF::TrineX::Serializer::MockTurtleSoup"->new(
     abbreviate => [
        qr{^http://ontologi\.es/},
        qr{^http://purl\.org/},
 abbreviated anyway. URIs which cannot be abbreviated to a legal QName
 will just be output as URIs.
 
+=item C<apostrophe>
+
+Boolean; if true, then the serializer will quote a literal with apostrophes
+instead of double-quote marks when the literal contains a double-quote mark
+but no apostrophe. This is allowed by recent versions of the Turtle spec,
+but was disallowed by earlier specifications, and is not widely supported
+yet. Defaults to false.
+
 =item C<colspace>
 
 This allows your predicate-object pairs to line up as nice columns. The
 smaller the number, the closer they get. Default is 20.
 
+=item C<encoding>
+
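+Either "ascii" or "utf8". Default is "utf8". With "ascii", characters
+outside the ASCII range in string literals are written as C<\uXXXX> or
+C<\UXXXXXXXX> escapes.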
+
 =item C<indent>
 
 A whitespace string to indent by. The default is one tab character.
 
 use strict;
 use warnings;
+use utf8;
 use Test::More;
 
+BEGIN {  # define is_string() at compile time, before the test code below is compiled
+	*is_string = eval { require Test::LongString }  # true only if the optional module loads
+		? \&Test::LongString::is_string
+		: \&Test::More::is;  # fallback: alias is_string to Test::More's is()
+};
+
+use Encode qw( encode decode );
 use JSON qw( to_json -convert_blessed_universally );
 use RDF::Trine;
+use Unicode::Normalize qw( NFD );
 
 require RDF::Trine::Graph;
 require RDF::Trine::Model;
 		my $mts = "RDF::TrineX::Serializer::MockTurtleSoup"->new(%$opts, priorities => $prio);
 		my $got = $mts->serialize_model_to_string($input);
 		
-		is($got, $expected, "serialized string matches") if $do_str_test;
+		is_string(
+			NFD(decode("utf8", $got)),
+			NFD($expected),
+			"serialized string matches",
+		) if $do_str_test;
 		
 		my $model = "RDF::Trine::Model"->new;
 		"RDF::Trine::Parser::Turtle"->new->parse_into_model(
    doap:maintainer      <http://purl.org/NET/cpan-uri/person/tobyink>;
    doap:name            "RDF-TrineX-Serializer-MockTurtleSoup";
    doap:programming-language "Perl";
-   doap:shortdesc       "he's a bit slow, but he's sure good lookin'".
+   doap:shortdesc       "he's a bit slow, but he's sure good lookin'";
+   doap:xxx1            "foo\"bar";
+   doap:xxx2            "foo'bar";
+   doap:xxx3            "café".
 
 <http://purl.org/NET/cpan-uri/person/tobyink>
    a                    foaf:Person;
 	colspace   => 0,
 	abbreviate => qr(cpan-uri),
 	labelling  => qr((?:title|name)$),
+	encoding   => "ascii",
 	namespaces => { prj => 'http://purl.org/NET/cpan-uri/dist/RDF-TrineX-Serializer-MockTurtleSoup/' }
 }, <<'OUTPUT');
 @prefix dc:    <http://purl.org/dc/terms/> .
 	doap:license <http://dev.perl.org/licenses/>;
 	doap:maintainer person:tobyink;
 	doap:programming-language "Perl";
-	doap:shortdesc "he's a bit slow, but he's sure good lookin'".
+	doap:shortdesc "he's a bit slow, but he's sure good lookin'";
+	doap:xxx1 "foo\"bar";
+	doap:xxx2 "foo'bar";
+	doap:xxx3 "caf\u00E9".
 
 person:tobyink
 	a foaf:Person;
    doap:maintainer      <http://purl.org/NET/cpan-uri/person/tobyink>;
    doap:name            "RDF-TrineX-Serializer-MockTurtleSoup";
    doap:programming-language "Perl";
-   doap:shortdesc       "he's a bit slow, but he's sure good lookin'".
+   doap:shortdesc       "he's a bit slow, but he's sure good lookin'";
+   doap:xxx1            "foo\"bar";
+   doap:xxx2            "foo'bar";
+   doap:xxx3            "café".
 
 OUTPUT
 
 	<http://usefulinc.com/ns/doap#name> "RDF-TrineX-Serializer-MockTurtleSoup" ;
 	<http://usefulinc.com/ns/doap#programming-language> "Perl" ;
 	<http://usefulinc.com/ns/doap#shortdesc> "he's a bit slow, but he's sure good lookin'" ;
+	<http://usefulinc.com/ns/doap#xxx1> "foo\"bar" ;
+	<http://usefulinc.com/ns/doap#xxx2> "foo'bar" ;
+	<http://usefulinc.com/ns/doap#xxx3> "caf\u00e9" ;
 	a <http://usefulinc.com/ns/doap#Project> .
 <http://purl.org/NET/cpan-uri/person/tobyink> a <http://xmlns.com/foaf/0.1/Person> ;
 	<http://xmlns.com/foaf/0.1/nick> "TOBYINK" ;