Commits

Doug Burke  committed 42aeb77

Minor updates to IRI parsing (will still fail since Network.URI does not accept IRIs)

  • Participants
  • Parent commits c4184b9
  • Tags 0.7.0.1

Comments (0)

Files changed (5)

+0.7.0.1:
+
+  URI parsing has changed slightly. The only user-visible change is that
+  error messages will be slightly different, in particular when
+  given an IRI in Turtle or NTriples format. Unfortunately IRIs are
+  still not supported.
+
 0.7.0.0:
 
   For code that uses the Swish script language, the main change is to

File src/Swish/RDF/Parser/N3.hs

 --    
 -- NOTES:
 --
---  UTF-8 handling is not really tested.
+--  - The parser needs to be updated to the latest version
+--    (\"W3C Team Submission 28 March 2011\",
+--    <http://www.w3.org/TeamSubmission/2011/SUBM-n3-20110328/>)
 --
---  No performance testing has been applied.
+--  - UTF-8 handling is not really tested.
 --
---  Not all N3 grammar elements are supported, including:
+--  - No performance testing has been applied.
+--
+--  - Not all N3 grammar elements are supported, including:
 --
 --    - @\@forSome@ (we read it in but ignore the arguments)
 --
 
 explicitURI :: N3Parser URI
 explicitURI = do
-  let lb = char '<'
-      rb = char '>'
-  
-  -- TODO: do the whitespace definitions match?
-  ustr <- between lb rb $ many (satisfy (/= '>'))
-  let uclean = filter (not . isSpace) ustr
-  
-  case parseURIReference uclean of
-    Nothing -> fail $ "Unable to convert <" ++ uclean ++ "> to a URI"
+  ignore $ char '<'
+  ustr <- manyFinally' ((satisfy isSpace *> next) <|> next) (char '>')
+  case parseURIReference ustr of
+    Nothing -> failBad $ "Invalid URI: <" ++ ustr ++ ">"
     Just uref -> do
       s <- stGet
       let base = getSUri s "base"

File src/Swish/RDF/Parser/NTriples.hs

 --    W3C Recommendation 10 February 2004,
 --    <http://www.w3.org/TR/rdf-testcases/#ntriples>
 --
+-- NOTES:
+--
+--  - If the URI is actually an IRI (Internationalized Resource Identifier)
+--    then the parser will fail since 'Network.URI.parseURI' fails.
+--    
 --------------------------------------------------------------------------------
 
 module Swish.RDF.Parser.NTriples
 
 triple :: NTParser ()
 triple = 
-  {- tryin to be fancy but addStatement is a Parser not a pure function
-  addStatement 
-  <$> (subject <* skip1WS)
-  <*> (predicate <* skip1WS)
-  <*> (object <* (skipWS *> fullStop *> skipWS))
-  -}
-  
   do
-    s <- subject
-    skip1WS
-    p <- predicate
-    skip1WS
-    o <- object
-    skipWS
-    fullStop
-    skipWS
+    s <- subject <* skip1WS
+    p <- predicate <* skip1WS
+    o <- object <* (skipWS >> fullStop >> skipWS)
     addStatement s p o
 
 {-
 
 {-
 uriref	::=	'<' absoluteURI '>'	
-absoluteURI	::=	character+ with escapes as defined in section URI References	
+absoluteURI	::=	character+ with escapes as defined below (from section 'URI References')	
+
+The absoluteURI production encodes a Unicode string representing an RDF URI reference as specified in
+[RDF-CONCEPTS]. These are encoded in N-Triples using the escapes described in section Strings.
 
 -}
 
 uriref :: NTParser ScopedName
 uriref = do
-  -- not ideal, as want to reject invalid characters immediately rather than via parseURI
-  ustr <- L.unpack <$> bracket (char '<') (char '>') (many1Satisfy (/= '>'))
-  -- ustr <- bracket (char '<') (char '>') $ many1 character -- looks like need to exclude > from character
-  -- ustr <- char '<' *> manyTill character (char '>')
-  
-  maybe (failBad ("Invalid URI: <" ++ ustr ++ ">"))
+  ignore $ char '<'
+  uri <- manyFinally' character (char '>')
+  maybe (failBad ("Invalid URI: <" ++ uri ++ ">"))
     (return . makeURIScopedName)
-    (parseURI ustr)
+    (parseURI uri)
 
 urirefLbl :: NTParser RDFLabel
 urirefLbl = Res <$> uriref

File src/Swish/RDF/Parser/Turtle.hs

 --    W3C Working Draft 09 August 2011 (<http://www.w3.org/TR/2011/WD-turtle-20110809/>),
 --    <http://www.w3.org/TR/turtle/>
 --
--- Notes:
+-- NOTES:
 --
--- At present there is a lot of overlap with the N3 Parser.
+--  - At present there is a lot of overlap with the N3 Parser.
 --
--- The parser needs to be updated to the latest working draft (10 July 2012,
--- <http://www.w3.org/TR/2012/WD-turtle-20120710/#sec-changelog>).
+--  - The parser needs to be updated to the latest working draft (10 July 2012,
+--    <http://www.w3.org/TR/2012/WD-turtle-20120710/#sec-changelog>).
+--
+--  - Strings with no language tag are converted to a 'LitTag' not a
+--    'TypedLitTag' with a type of @xsd:string@ (e.g. see
+--    <http://www.w3.org/TR/2011/WD-turtle-20110809/#terms>).
+--
+--  - If the URI is actually an IRI (Internationalized Resource Identifier)
+--    then the parser will fail since 'Network.URI.parseURIReference' fails.
 --
 --------------------------------------------------------------------------------
 
 import Control.Applicative
 import Control.Monad (foldM)
 
-import Data.Char (ord, isAsciiLower, isAsciiUpper, isDigit) 
+import Data.Char (chr, ord, isAsciiLower, isAsciiUpper, isDigit) 
 import Data.LookupMap (LookupMap(..), LookupEntryClass(..))
 import Data.LookupMap (mapFindMaybe, mapAdd, mapReplace)
 import Data.Maybe (fromMaybe, fromJust)
 http://lists.w3.org/Archives/Public/public-rdf-comments/2011Aug/0011.html
 
 Unlike N3, whitespace is significant within the surrounding <>.
-
-At present relying on Network.URI to define what characters are valid
-in a URI. This is not necessarily ideal.
 -}
 
 _iriRef :: TurtleParser URI
 _iriRef = do
-  utxt <- bracket (char '<') (char '>') $ manySatisfy (/= '>') -- TODO: fix
-  let ustr = L.unpack utxt
+  ignore $ char '<'
+  ustr <- manyFinally' iriRefChar (char '>')
   case parseURIReference ustr of
-    Nothing -> fail $ "Unable to convert <" ++ ustr ++ "> to a URI"
+    Nothing -> failBad $ "Invalid URI: <" ++ ustr ++ ">"
     Just uref -> do
       s <- stGet
       either fail return $ appendURIs (baseUri s) uref
 
+-- | Parse a single character from the body of an IRI reference
+--   (the text between @\<@ and @\>@): either a character accepted by
+--   'notIRIChar' or whatever '_uchar' parses (presumably the
+--   @UCHAR@ escape production - confirm against its definition).
+iriRefChar :: TurtleParser Char
+iriRefChar = satisfy notIRIChar <|> _uchar
+
+-- | Despite its name, this predicate is 'True' for characters that
+--   /may/ appear unescaped inside an IRI reference.
+--
+--   The Turtle @IRI_REF@ production excludes the characters
+--   @\<@, @\>@, @\"@, @{@, @}@, @|@, @^@, @`@ and @\\@ together with
+--   the range @#x00@ to @#x20@ /inclusive/, so the space character
+--   is rejected (hence the strict comparison).
+notIRIChar :: Char -> Bool
+notIRIChar c = c > chr 0x20
+               &&
+               c `notElem` "<>\"{}|^`\\"
+
 {-
 [71s] <PNAME_NS> ::= (PN_PREFIX)? ":" 
 -}
 Name:               swish
-Version:            0.7.0.0
+Version:            0.7.0.1
 Stability:          experimental
 License:            LGPL
 License-file:       LICENSE 
   .
  Changes in version @0.7.0.1@:
   .
+  * Internal changes to parsing of URI values for NTriples, Turtle, and N3
+  parsers (error messages will be slightly different when IRIs are used).
+  Unfortunately IRIs are still not supported. 
+  .
+  Changes in version @0.7.0.0@:
+  .
   For code that uses the Swish script language, the main change is to import @Swish@ rather
   than @Swish.RDF.SwishMain@, and to note that the other @Swish.RDF.Swish*@ modules are
   now called @Swish.*@.