Commits

Doug Burke  committed d98b382

Improve formatting of string literals in Turtle/N3

  • Participants
  • Parent commits 824a141
  • Tags 0.9.0.5

Comments (0)

Files changed (7)

+0.9.0.5:
+
+  Turtle/N3 output: more changes for string literals containing
+  double-quote characters.
+
+  N3 Parser: parsing of string literals within three quote marks has
+  been updated to match the Turtle parser.
+
+  Turtle Parser: a few more error messages have been added.
+  
 0.9.0.4:
 
   Turtle parser: updated to the Candidate Recommendation (19 February

File src/Swish/RDF/Formatter/Internal.hs

 import Control.Monad (liftM)
 import Control.Monad.State (State, get, gets, modify, put)
 
-import Data.List (foldl', groupBy, isInfixOf, intersperse, partition)
+import Data.List (foldl', groupBy, intersperse, partition)
+import Data.Maybe (isJust)
 import Data.Monoid (Monoid(..), mconcat)
 import Data.Word
 
 
 -- N3-like output
 
--- temporary conversion
+-- temporary conversion, also note that it is not obvious that all
+-- the uses of quoteB are valid (e.g. when formatting a URL for use
+-- in a prefix statement). TODO: review
+--
 quoteB :: Bool -> String -> B.Builder
 quoteB f v = B.fromString $ quote f v
 
+-- Force the "basic" display, that is act as if it is to be
+-- surrounded by "...".
+quoteBString :: String -> B.Builder
+quoteBString = quoteB True
+
 {-|
 Convert text into a format for display in Turtle. The idea
 is to use one double quote unless three are needed, and to
 
 -- The original thinking was that a scan of the string is worthwhile
 -- if it avoids having to quote characters, but we always need to
--- go through and do this anyway, eg for @\t@ or @\@.
---
--- Swish.RDF.Graph.quote, used by quoteB, does not handle
--- strings with three or more consecutive @"@ characters, so
--- we explicitly check for this and fall back to the single-"
--- version, which protects each quote.
+-- scan through to protect certain characters.
 --
 quoteText :: T.Text -> B.Builder
 quoteText txt = 
-  let st = T.unpack txt -- TODO: fix to use Text
-
-      -- assume the magical ghc pixie will fuse all these loops
-      hasNL = '\n' `elem` st
-      hasSQ = '"' `elem` st
-      has3Q = "\"\"\"" `isInfixOf` st
+  let -- assume the magical ghc pixie will fuse all these loops
+      -- (the docs say that T.findIndex can fuse, but that
+      -- T.isInfixOf doesn't)
+      hasNL = isJust $ T.findIndex (== '\n') txt
+      hasSQ = isJust $ T.findIndex (== '"') txt
+      has3Q = "\"\"\"" `T.isInfixOf` txt
         
       n = if has3Q || (not hasNL && not hasSQ) then 1 else 3
             
       qch = B.fromString (replicate n '"')
-      qst = quoteB (n==1) st
+      qst = B.fromText $ quoteT (n==1) txt
 
   in mconcat [qch, qst, qch]
 
 --       not sure the following counts as clever enough ...
 --  
 showScopedName :: ScopedName -> B.Builder
-showScopedName = quoteB True . show
+showScopedName = quoteBString . show
 
 formatScopedName :: ScopedName -> M.Map (Maybe T.Text) URI -> B.Builder
 formatScopedName sn prmap =
   in case findPrefix nsuri prmap of
        Just (Just p) -> B.fromText $ quoteT True $ mconcat [p, ":", local]
        _             -> mconcat [ "<"
-                                , quoteB True (show nsuri ++ T.unpack local)
+                                , quoteBString (show nsuri ++ T.unpack local)
                                 , ">"
                                 ]
 
 --
 -- However, I am moving away from storing a canonical representation
 -- of a datatyped literal in the resource since it is messy and makes
--- some comparisons difficult (unless equality of RDFLabels is made
--- dependent on types, and then it gets messy). I am also not as
--- concerned about issues in the N3 parser/formatter as in the Turtle
--- one.
+-- some comparisons difficult, in particular for the W3C Turtle test
+-- suite [I think] (unless equality of RDFLabels is made dependent on
+-- types, and then it gets messy). I am also not as concerned about
+-- issues in the N3 parser/formatter as in the Turtle one.
 --
 formatTypedLit :: Bool -> T.Text -> ScopedName -> B.Builder
 formatTypedLit n3flag lit dtype
     | dtype `elem` [xsdBoolean, xsdDecimal, xsdInteger] = B.fromText lit
     | otherwise = mconcat [quoteText lit, "^^", showScopedName dtype]
                            
-	       
 {-
 Add a list inline. We are given the labels that constitute
 the list, in order, so just need to display them surrounded
 formatPrefixLines :: NamespaceMap -> [B.Builder]
 formatPrefixLines = map pref . M.assocs
     where
-      pref (Just p,u) = mconcat ["@prefix ", B.fromText p, ": <", quoteB True (show u), "> ."]
-      pref (_,u)      = mconcat ["@prefix : <", quoteB True (show u), "> ."]
+      pref (Just p,u) = mconcat ["@prefix ", B.fromText p, ": <", quoteBString (show u), "> ."]
+      pref (_,u)      = mconcat ["@prefix : <", quoteBString (show u), "> ."]
 
 formatPrefixes_ ::
     (B.Builder -> State a B.Builder)  -- ^ Create a new line

File src/Swish/RDF/Graph.hs

 showCanon (TypedLit st dt)   = quote1Str st ++ "^^" ++ show (getScopedNameURI dt)
 showCanon s                  = show s
 
+-- The Data.Text API points out issues with processing a text
+-- character-by-character, but it's not clear to me how to avoid
+-- that here
+--
+-- One assumption would be that the strings aren't likely to be large,
+-- so that several calls to T.find or similar could be made to
+-- simplify certain cases.
+--
+-- Is it worth scanning through the text to look for characters like \n
+-- or #, or to look for sequences like '##'?
+
+-- Is it worth sending in a flag to indicate the different modes for
+-- handling \n characters, or just leave this complexity in 'quoteT False'?
+--
+processChar ::
+  Char
+  -> (T.Text, Bool) -- ^ the boolean is @True@ if the returned text has been
+  -- expanded so that it begins with @\@
+processChar '"'  = ("\\\"", True)
+processChar '\\' = ("\\\\", True)
+processChar '\n' = ("\\n", True)
+processChar '\r' = ("\\r", True)
+processChar '\t' = ("\\t", True)
+processChar '\b' = ("\\b", True)
+processChar '\f' = ("\\f", True)
+processChar c =
+  let nc = ord c
+      -- lazy ways to convert to hex-encoded strings
+      four = T.append "\\u" . T.pack $ printf "%04X" nc
+      eight = T.append "\\U" . T.pack $ printf "%08X" nc
+  in if nc < 0x20
+     then (four, True)
+     else if nc < 0x7f
+          then (T.singleton c, False)
+          else if nc < 0x10000
+               then (four, True)
+               else (eight, True)
+
+convertChar :: Char -> T.Text
+convertChar = fst . processChar
+
 -- | See `quote`.
 quoteT :: Bool -> T.Text -> T.Text
-quoteT f = T.pack . quote f . T.unpack  -- TODO: avoid conversion to string
+quoteT True txt =
+  -- Output is to be used as "..."
+  let go dl x =
+        case T.uncons x of
+          Just (c, xs) -> go (dl . T.append (convertChar c)) xs
+          _ -> dl T.empty
+                          
+  in go (T.append T.empty) txt
 
-{-| N3-style quoting rules for a string.
+-- One complexity here is my reading of the Turtle grammar
+--    STRING_LITERAL_LONG_QUOTE ::=	'"""' (('"' | '""')? [^"\] | ECHAR | UCHAR)* '"""'
+-- which says that any un-protected double-quote characters can not
+-- be followed by a \ character. One option would be to always use the
+-- 'quoteT True' behavior.
+--
+quoteT _ txt =
+  -- Output is to be used as """...""""
+  let go dl x =
+        case T.uncons x of
+          Just ('"', xs) -> go1 dl xs
+          Just ('\n', xs) -> go (dl . T.cons '\n') xs
+          Just (c, xs) -> go (dl . T.append (convertChar c)) xs
+          _ -> dl T.empty
 
-WARNING: the output is /incorrect/ if the flag is @False@ and
-the text contains 3 or more consecutive @\"@ characters.
--}
+      -- Seen one double quote
+      go1 dl x =
+        case T.uncons x of
+          Just ('"', xs) -> go2 dl xs
+          Just ('\n', xs) -> go (dl . T.append "\"\n") xs
+          Just ('\\', xs) -> go (dl . T.append "\\\"\\\\") xs
+          Just (c, xs) ->
+            let (t, f) = processChar c
+                dl' = if f then T.append "\\\"" else T.cons '"'
+            in go (dl . dl' . T.append t) xs
+          _ -> dl "\\\""
+          
+      -- Seen two double quotes
+      go2 dl x =
+        case T.uncons x of
+          Just ('"', xs) -> go (dl . T.append "\\\"\\\"\\\"") xs
+          Just ('\n', xs) -> go (dl . T.append "\"\"\n") xs
+          Just ('\\', xs) -> go (dl . T.append "\\\"\\\"\\\\") xs
+          Just (c, xs) ->
+            let (t, f) = processChar c
+                dl' = T.append (if f then "\\\"\\\"" else "\"\"")
+            in go (dl . dl' . T.append t) xs
+          _ -> dl "\\\"\\\""
+
+      -- at the start of the string we have 3 quotes, so any
+      -- starting quote characters must be quoted.
+      go0 dl x =
+        case T.uncons x of
+          Just ('"', xs) -> go0 (dl . T.append "\\\"") xs
+          Just ('\n', xs) -> go (dl . T.cons '\n') xs
+          Just (c, xs) -> go (dl . T.append (convertChar c)) xs
+          _ -> dl T.empty
+      
+  in go0 (T.append T.empty) txt
+        
+-- | Turtle-style quoting rules for a string.
+--
+--   At present the choice is between using one or three
+--   double quote (@"@) characters to surround the string; i.e. using
+--   single quote (@'@)  characters is not supported.
 
 quote :: 
   Bool  -- ^ @True@ if the string is to be displayed using one rather than three quotes.
   -> String -- ^ String to quote.
-  -> String
+  -> String -- ^ The string does *not* contain the surrounding quote marks.
+quote f = T.unpack . quoteT f . T.pack
+
+{-
 quote _     []           = ""
 quote False s@(c:'"':[]) | c == '\\'  = s -- handle triple-quoted strings ending in "
                          | otherwise  = [c, '\\', '"']
 
 quote True  ('"': st)    = '\\':'"': quote True  st
 quote True  ('\n':st)    = '\\':'n': quote True  st
+quote True  ('\t':st)    = '\\':'t': quote True  st
 
-quote True  ('\t':st)    = '\\':'t': quote True  st
 quote False ('"': st)    =      '"': quote False st
 quote False ('\n':st)    =     '\n': quote False st
 quote False ('\t':st)    =     '\t': quote False st
           then '\\':'u': drop 4 ustr
           else c : rst
 
--- surround a string with quotes ("...")
+-}
 
+-- surround a string with a single double-quote mark at each end,
+-- e.g. "...".
 quote1Str :: T.Text -> String
-quote1Str t = '"' : quote False (T.unpack t) ++ "\""
+quote1Str t = '"' : T.unpack (quoteT True t) ++ "\""
 
 ---------------------------------------------------------
 --  Selected RDFLabel values

File src/Swish/RDF/Parser/N3.hs

 --------------------------------------------------------------------------------
 -- |
 --  Module      :  N3
---  Copyright   :  (c) 2003, Graham Klyne, 2009 Vasili I Galchin, 2011, 2012 Douglas Burke
+--  Copyright   :  (c) 2003, Graham Klyne, 2009 Vasili I Galchin, 2011, 2012, 2013 Douglas Burke
 --  License     :  GPL V2
 --
 --  Maintainer  :  Douglas Burke
     , notFollowedBy
     , endBy
     , sepEndBy
-    , manyTill
+    -- , manyTill
     , noneOf
     , char
     , ichar
 -}
 
 singleQuoted :: N3Parser T.Text
-singleQuoted = fmap T.pack (bracket sQuot sQuot $ many n3Character)
+singleQuoted = T.pack <$> (bracket sQuot sQuot $ many n3Character)
     
 {-
 tripleQUoted ::=	"""[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""
--}
-tripleQuoted :: N3Parser T.Text
+
+The following may not match the output format we now create (with the
+move to the Turtle Candidate Recommendation), so re-writing as a test,
+but this means pulling in a lot of Turtle productions, which should
+be shared.
+
 tripleQuoted = tQuot *> fmap T.pack (manyTill (n3Character <|> sQuot <|> char '\n') tQuot)
   where
     -- tQuot = try (count 3 sQuot)
     tQuot = exactly 3 sQuot
+-}
+tripleQuoted :: N3Parser T.Text
+tripleQuoted =
+  let sep = exactly 3 sQuot
+  in T.concat <$> bracket sep sep (many _tCharsLong)
+
+{-- Turtle productions: start --}
+oneOrTwo :: N3Parser T.Text
+oneOrTwo = do
+  ignore $ char '"'
+  mb <- optional (char '"')
+  case mb of
+    Just _ -> return $ "\"\""
+    _      -> return $ "\""
+
+_multiQuote :: N3Parser T.Text
+_multiQuote = do
+  mq <- optional (oneOrTwo)
+  r <- noneOf "\"\\"
+  return $ fromMaybe T.empty mq `T.snoc` r
+                
+_tCharsLong :: N3Parser T.Text
+_tCharsLong =
+  T.singleton <$> _protChar
+  <|> _multiQuote
+
+_protChar :: N3Parser Char
+_protChar = char '\\' *> (_echar' <|> _uchar')
+
+_echar' :: N3Parser Char
+_echar' = 
+  (char 't' *> pure '\t') <|>
+  (char 'b' *> pure '\b') <|>
+  (char 'n' *> pure '\n') <|>
+  (char 'r' *> pure '\r') <|>
+  (char 'f' *> pure '\f') <|>
+  (char '\\' *> pure '\\') <|>
+  (char '"' *> pure '"') <|>
+  (char '\'' *> pure '\'')
+
+_uchar' :: N3Parser Char
+_uchar' =
+  (char 'u' *> commit hex4)
+  <|>
+  (char 'U' *> commit hex8)
+
+{-- Turtle productions: end --}
 
 getDefaultPrefix :: N3Parser Namespace
 getDefaultPrefix = do
 --------------------------------------------------------------------------------
 --
 --  Copyright (c) 2003, Graham Klyne, 2009 Vasili I Galchin,
---    2011, 2012 Douglas Burke
+--    2011, 2012, 2013 Douglas Burke
 --  All rights reserved.
 --
 --  This file is part of Swish.

File src/Swish/RDF/Parser/Turtle.hs

 --  Module      :  Turtle
 --  Copyright   :  (c) 2003, Graham Klyne, 2009 Vasili I Galchin, 2011, 2012, 2013 Douglas Burke
 --  License     :  GPL V2
---
+-- 
 --  Maintainer  :  Douglas Burke
 --  Stability   :  experimental
 --  Portability :  OverloadedStrings
---
+-- 
 --  This Module implements a Turtle parser, returning a
 --  new 'RDFGraph' consisting of triples and namespace information parsed from
 --  the supplied input string, or an error indication.
---
+-- 
 -- REFERENCES:
---
+-- 
 --  - \"Turtle, Terse RDF Triple Language\",
 --    W3C Candidate Recommendation 19 February 2013 (<http://www.w3.org/TR/2013/CR-turtle-20130219/L),
 --    <http://www.w3.org/TR/turtle/>
---
+-- 
 -- NOTES:
---
+-- 
 --  - Prior to version @0.9.0.4@, the parser followed the
 --    W3C Working Draft 09 August 2011 (<http://www.w3.org/TR/2011/WD-turtle-20110809/>)
 -- 
 --  - Strings with no language tag are converted to a 'LitTag' not a
 --    'TypedLitTag' with a type of @xsd:string@ (e.g. see
 --    <http://www.w3.org/TR/2011/WD-turtle-20110809/#terms>).
---
+-- 
 --  - If the URI is actually an IRI (Internationalized Resource Identifiers)
 --    then the parser will fail since 'Network.URI.parseURI' fails.
 -- 
 --    @localName_with_nfc_PN_CHARS_BASE_character_boundaries@,
 --    and
 --    @localName_with_non_leading_extras@.
---
+-- 
 --------------------------------------------------------------------------------
 
 -- TODO:
 [2]	statement	::=	directive | triples '.'
 -}
 statement :: TurtleParser ()
-statement = directive <|> (triples *> fullStop)
+statement = directive <|> (triples *> commit fullStop <? "Missing '.' after a statement.")
 
 {-
 [3]	directive	::=	prefixID | base | sparqlPrefix | sparqlBase
 rdfLiteral :: TurtleParser RDFLabel
 rdfLiteral = do
   lbl <- L.toStrict <$> turtleString
-  opt <- optional ((Left <$> _langTag)
+  opt <- optional ((Left <$> (_langTag <? "Unable to parse the language tag"))
                    <|>
-                   (string "^^" *> (Right <$> commit iri)))
+                   (string "^^" *> (Right <$> (commit iri <? "Unable to parse the datatype of the literal"))))
   ignore $ optional whiteSpace
   return $ case opt of
              Just (Left lcode)  -> LangLit lbl lcode
   lexeme (
     _stringLiteralLongQuote <|> _stringLiteralQuote <|>
     _stringLiteralLongSingleQuote <|> _stringLiteralSingleQuote
-  )
+    ) <? "Unable to parse a string literal"
 
 {-
 [135s]	iri	::=	IRIREF | PrefixedName
 [23]	STRING_LITERAL_SINGLE_QUOTE	::=	"'" ([^#x27#x5C#xA#xD] | ECHAR | UCHAR)* "'"
 [24]	STRING_LITERAL_LONG_SINGLE_QUOTE	::=	"'''" (("'" | "''")? [^'\] | ECHAR | UCHAR)* "'''"
 [25]	STRING_LITERAL_LONG_QUOTE	::=	'"""' (('"' | '""')? [^"\] | ECHAR | UCHAR)* '"""'
+
+Since ECHAR | UCHAR is common to all these we pull it out to
+create the _protChar parser.
 -}
 
+_protChar :: TurtleParser Char
+_protChar = char '\\' *> (_echar' <|> _uchar')
+
 _exclSLQ, _exclSLSQ :: String
 _exclSLQ = map chr [0x22, 0x5c, 0x0a, 0x0d]
 _exclSLSQ = map chr [0x27, 0x5c, 0x0a, 0x0d]
 _stringItLong sep chars = L.concat <$> bracket sep sep (many chars)
 
 _tChars :: String -> TurtleParser Char
-_tChars excl = (char '\\' *> (_echar' <|> _uchar'))
-               <|> noneOf excl
+_tChars excl = _protChar <|> noneOf excl
 
 oneOrTwo :: Char -> TurtleParser L.Text
 oneOrTwo c = do
-  a <- char c
+  ignore $ char c
   mb <- optional (char c)
   case mb of
-    Just b -> return $ L.pack [a,b]
-    _      -> return $ L.singleton a
+    Just _ -> return $ L.pack [c,c]
+    _      -> return $ L.singleton c
 
 _multiQuote :: Char -> TurtleParser L.Text
 _multiQuote c = do
                 
 _tCharsLong :: Char -> TurtleParser L.Text
 _tCharsLong c =
-  let conv = (L.singleton `fmap`)
-  in _multiQuote c <|> conv (_echar <|> _uchar)
+  L.singleton <$> _protChar
+  <|> _multiQuote c
 
 {-
 [26]	UCHAR	::=	'\u' HEX HEX HEX HEX | '\U' HEX HEX HEX HEX HEX HEX HEX HEX
 _uchar = char '\\' >> _uchar'
 
 _uchar' :: TurtleParser Char
-_uchar' = (char 'u' *> hex4) <|> (char 'U' *> hex8)
+_uchar' =
+  (char 'u' *> (commit hex4 <? "Expected 4 hex characters after \\u"))
+  <|>
+  (char 'U' *> (commit hex8 <? "Expected 8 hex characters after \\U"))
 
 {-
 [159s]	ECHAR	::=	'\' [tbnrf\"']
--}
+
+Since ECHAR is only used by the string productions
+in the form ECHAR | UCHAR, the check for the leading
+\ has been moved out (see _protChar)
+
 _echar :: TurtleParser Char
 _echar = char '\\' *> _echar'
+-}
 
 _echar' :: TurtleParser Char
 _echar' = 
 Name:               swish
-Version:            0.9.0.4
+Version:            0.9.0.5
 Stability:          experimental
 License:            LGPL
 License-file:       LICENSE 
   .
   * Complete, ready-to-run, command-line and script-driven programs.
   .
+  Changes in version @0.9.0.5@:
+  .
+  * Turtle/N3 output: more changes for string literals containing
+  double-quote characters.
+  .
+  * N3 Parser: parsing of string literals within three quote marks has
+  been updated to match the Turtle parser.
+  .
+  * Turtle Parser: a few more error messages have been added.
+  .
   Changes in version @0.9.0.4@:
   .
   * Turtle parser: updated to the Candidate Recommendation (19 February 2013)

File tests/TurtleTest.hs

 import qualified Data.Text as T
 import qualified Data.Text.Lazy as L
 
+import Data.Char (chr)
 import Data.Maybe (fromMaybe)
 
 import Network.URI (URI, parseURIReference)
 
 -- Cases to try and improve the test coverage
 
--- | This was actually more a problem with output rather than input.
-coverage1 :: T.Text
-coverage1 =
-  T.unlines
-  [ "<urn:a> <urn:b> \"' -D RT @madeupname: \\\"Foo \\u0024 Zone\\\" \\U0000007e:\\\"\\\"\\\"D\" ."
-  ]
-
-resultc1 :: [RDFTriple]
-resultc1 =
-  [ triple
-    (toURI "urn:a")
-    (toURI "urn:b")
-    (Lit "' -D RT @madeupname: \"Foo $ Zone\" ~:\"\"\"D")
-  ]
+trips :: T.Text -> [RDFTriple]
+trips t = [triple (toURI "urn:a") (toURI "urn:b") (Lit t)]
 
 coverageCases :: Test
 coverageCases =
   TestList
-  [ compareExample "p1" coverage1 resultc1
+  [ -- This was actually more a problem with output rather than input.
+    compareExample "cov1"
+    "<urn:a> <urn:b> \"' -D RT @madeupname: \\\"Foo \\u0024 Zone\\\" \\U0000007e:\\\"\\\"\\\"D\" ."
+    (trips "' -D RT @madeupname: \"Foo $ Zone\" ~:\"\"\"D")
+  , compareExample "cov2"
+    "<urn:a> <urn:b> \"\"\"\"Bob \\\"\\uF481\"\"\"."
+    (trips (T.snoc "\"Bob \"" (chr 0xf481)))
+    {-
+      rapper will parse this but I do not think it matches
+      the Turtle grammar
+  , compareExample "cov2-option"
+    "<urn:a> <urn:b> \"\"\"\"Bob \"\\U0001F481\"\"\""
+    (trips "\"Bob \"\\U0001F481")
+    -}
+  , compareExample "cov3-1"
+    "<urn:a> <urn:b> \"\"\"\\\"A quoted string.\\\"\"\"\"."
+    (trips "\"A quoted string.\"")
+  , compareExample "cov3-2"
+    "<urn:a> <urn:b> \"\"\"\\\"\\\"A quoted string.\\\"\"\"\"."
+    (trips "\"\"A quoted string.\"")
+  , compareExample "cov3-3"
+    "<urn:a> <urn:b> \"\"\"\\\"A quoted string.\\\"\\\"\"\"\"."
+    (trips "\"A quoted string.\"\"")
   ]
 
 -- Extracted from failures seen when using the W3C test suite