Bryan O'Sullivan avatar Bryan O'Sullivan committed 8d2c256

Clean up and document the stricter parsing functions.

Comments (0)

Files changed (7)

     (
     -- * Encoding and decoding
       decode
+    , decode'
     , encode
     -- * Core JSON types
     , Value(..)
     , object
     -- * Parsing
     , json
+    , json'
     ) where
 
 import Data.Aeson.Encode (encode)
-import Data.Aeson.Parser (json)
+import Data.Aeson.Parser.Internal (decodeWith, json, json')
 import Data.Aeson.Types
 import qualified Data.ByteString.Lazy as L
-import qualified Data.Attoparsec.Lazy as L
 
 -- | Efficiently deserialize a JSON value from a lazy 'L.ByteString'.
 -- If this fails due to incomplete or invalid input, 'Nothing' is
 -- returned.
+--
+-- This function parses immediately, but defers conversion.  See
+-- 'json' for details.
 decode :: (FromJSON a) => L.ByteString -> Maybe a
-decode s = case L.parse json s of
-             L.Done _ v -> case fromJSON v of
-                             Success a -> Just a
-                             _         -> Nothing
-             _          -> Nothing
+decode = decodeWith json fromJSON
 {-# INLINE decode #-}
+
+-- | Efficiently deserialize a JSON value from a lazy 'L.ByteString'.
+-- If this fails due to incomplete or invalid input, 'Nothing' is
+-- returned.
+--
+-- This function parses and performs conversion immediately.  See
+-- 'json'' for details.
+decode' :: (FromJSON a) => L.ByteString -> Maybe a
+decode' = decodeWith json' fromJSON
+{-# INLINE decode' #-}

Data/Aeson/Generic.hs

     (
     -- * Decoding and encoding
       decode
+    , decode'
     , encode
     -- * Lower-level conversion functions
     , fromJSON
 import Data.Text.Encoding (encodeUtf8)
 import Data.Time.Clock (UTCTime)
 import Data.Word (Word, Word8, Word16, Word32, Word64)
-import Data.Aeson.Parser (json)
+import Data.Aeson.Parser.Internal (decodeWith, json, json')
 import qualified Data.Aeson.Encode as E
 import qualified Data.Aeson.Functions as F
 import qualified Data.Aeson.Types as T
-import qualified Data.Attoparsec.Lazy as L
 import qualified Data.ByteString as B
 import qualified Data.ByteString.Lazy as L
 import qualified Data.HashMap.Strict as H
 -- | Efficiently deserialize a JSON value from a lazy 'L.ByteString'.
 -- If this fails due to incomplete or invalid input, 'Nothing' is
 -- returned.
+--
+-- This function parses immediately, but defers conversion.  See
+-- 'json' for details.
 decode :: (Data a) => L.ByteString -> Maybe a
-decode s = case L.parse json s of
-             L.Done _ v -> case fromJSON v of
-                             Success a -> Just a
-                             _         -> Nothing
-             _          -> Nothing
+decode = decodeWith json fromJSON
+{-# INLINE decode #-}
+
+-- | Efficiently deserialize a JSON value from a lazy 'L.ByteString'.
+-- If this fails due to incomplete or invalid input, 'Nothing' is
+-- returned.
+--
+-- This function parses and performs conversion immediately.  See
+-- 'json'' for details.
+decode' :: (Data a) => L.ByteString -> Maybe a
+decode' = decodeWith json' fromJSON
+{-# INLINE decode' #-}
 
 type T a = a -> Value
 

Data/Aeson/Parser.hs

 --
 -- Efficiently and correctly parse a JSON string.  The string must be
 -- encoded as UTF-8.
+--
+-- It can be useful to think of parsing as occurring in two phases:
+--
+-- * Identification of the textual boundaries of a JSON value.  This
+--   is always strict, so that an invalid JSON document can be
+--   rejected as soon as possible.
+--
+-- * Conversion of a JSON value to a Haskell value.  This may be
+--   either immediate (strict) or deferred (lazy); see below for
+--   details.
 
 module Data.Aeson.Parser
     (
+    -- * Lazy parsers
+    -- $lazy
       json
+    , value
+    , jstring
+    -- * Strict parsers
+    -- $strict
     , json'
-    , value
     , value'
-    , jstring
     ) where
 
-import Blaze.ByteString.Builder (fromByteString, toByteString)
-import Blaze.ByteString.Builder.Char.Utf8 (fromChar)
-import Blaze.ByteString.Builder.Word (fromWord8)
-import Control.Applicative as A
-import Data.Aeson.Types (Value(..))
-import Data.Attoparsec.Char8
-import Data.Bits ((.|.), shiftL)
-import Data.ByteString as B
-import Data.Char (chr)
-import Data.Monoid (mappend, mempty)
-import Data.Text as T
-import Data.Text.Encoding (decodeUtf8)
-import Data.Vector as Vector hiding ((++))
-import Data.Word (Word8)
-import qualified Data.Attoparsec as A
-import qualified Data.Attoparsec.Zepto as Z
-import qualified Data.ByteString.Char8 as B8
-import qualified Data.ByteString.Unsafe as B
-import qualified Data.HashMap.Strict as H
+import Data.Aeson.Parser.Internal (json, json', jstring, value, value')
 
--- | Parse a top-level JSON value.  This must be either an object or
--- an array.
-json :: Parser Value
-json = json_ object_ array_
+-- $lazy
+--
+-- The 'json' and 'value' parsers decouple identification from
+-- conversion.  Identification occurs immediately (so that an invalid
+-- JSON document can be rejected as early as possible), but conversion
+-- to a Haskell value is deferred until that value is needed.
+--
+-- This decoupling can be time-efficient if only a smallish subset of
+-- elements in a JSON value need to be inspected, since the cost of
+-- conversion is zero for uninspected elements.  The trade off is an
+-- increase in memory usage, due to allocation of thunks for values
+-- that have not yet been converted.
 
--- | Parse a top-level JSON value.  This must be either an object or
--- an array.
+-- $strict
 --
--- This is a strict parser version of 'json' which avoids
--- building up thunks during parsing. Prefer this version if most of
--- the JSON data needs to be accessed.
-json' :: Parser Value
-json' = json_ object_' array_'
-
-json_ :: Parser Value -> Parser Value -> Parser Value
-json_ obj ary = do
-  w <- skipSpace *> A.satisfy (\w -> w == 123 || w == 91)
-  if w == 123
-    then obj
-    else ary
-{-# INLINE json_ #-}
-
-object_ :: Parser Value
-object_ = {-# SCC "object_" #-} Object <$> objectValues value
-
-object_' :: Parser Value
-object_' = {-# SCC "object_'" #-} do
-  !vals <- objectValues value'
-  return (Object vals)
-
-objectValues :: Parser Value -> Parser (H.HashMap Text Value)
-objectValues val = do
-  skipSpace
-  let pair = do
-        a <- jstring <* skipSpace
-        b <- char ':' *> skipSpace *> val
-        return (a,b)
-  vals <- ((pair <* skipSpace) `sepBy` (char ',' *> skipSpace)) <* char '}'
-  return (H.fromList vals)
-{-# INLINE objectValues #-}
-
-array_ :: Parser Value
-array_ = {-# SCC "array_" #-} Array <$> arrayValues value
-
-array_' :: Parser Value
-array_' = {-# SCC "array_'" #-} do
-  !vals <- arrayValues value'
-  return (Array vals)
-
-arrayValues :: Parser Value -> Parser (Vector Value)
-arrayValues val = do
-  skipSpace
-  vals <- ((val <* skipSpace) `sepBy` (char ',' *> skipSpace)) <* char ']'
-  return (Vector.fromList vals)
-{-# INLINE arrayValues #-}
-
--- | Parse any JSON value.  Use 'json' in preference to this function
--- if you are parsing data from an untrusted source.
-value :: Parser Value
-value = most <|> (Number <$> number)
- where
-  most = do
-    c <- satisfy (`B8.elem` "{[\"ftn")
-    case c of
-      '{' -> object_
-      '[' -> array_
-      '"' -> String <$> jstring_
-      'f' -> string "alse" *> pure (Bool False)
-      't' -> string "rue" *> pure (Bool True)
-      'n' -> string "ull" *> pure Null
-      _   -> error "attoparsec panic! the impossible happened!"
-
--- | Strict version of 'value'. See also 'json''.
-value' :: Parser Value
-value' = most <|> num
- where
-  most = do
-    c <- satisfy (`B8.elem` "{[\"ftn")
-    case c of
-      '{' -> object_'
-      '[' -> array_'
-      '"' -> do
-          !s <- jstring_
-          return (String s)
-      'f' -> string "alse" *> pure (Bool False)
-      't' -> string "rue" *> pure (Bool True)
-      'n' -> string "ull" *> pure Null
-      _   -> error "attoparsec panic! the impossible happened!"
-  num = do
-    !n <- number
-    return (Number n)
-
-doubleQuote, backslash :: Word8
-doubleQuote = 34
-backslash = 92
-{-# INLINE backslash #-}
-{-# INLINE doubleQuote #-}
-
-jstring :: Parser Text
-jstring = A.word8 doubleQuote *> jstring_
-
--- | Parse a string without a leading quote.
-jstring_ :: Parser Text
-jstring_ = {-# SCC "jstring_" #-} do
-  s <- A.scan False $ \s c -> if s then Just False
-                                   else if c == doubleQuote
-                                        then Nothing
-                                        else Just (c == backslash)
-  _ <- A.word8 doubleQuote
-  if backslash `B.elem` s
-    then case Z.parse unescape s of
-           Right r  -> return (decodeUtf8 r)
-           Left err -> fail err
-    else return (decodeUtf8 s)
-{-# INLINE jstring_ #-}
-
-unescape :: Z.Parser ByteString
-unescape = toByteString <$> go mempty where
-  go acc = do
-    h <- Z.takeWhile (/=backslash)
-    let rest = do
-          start <- Z.take 2
-          let !slash = B.unsafeHead start
-              !t = B.unsafeIndex start 1
-              escape = case B.findIndex (==t) "\"\\/ntbrfu" of
-                         Just i -> i
-                         _      -> 255
-          if slash /= backslash || escape == 255
-            then fail "invalid JSON escape sequence"
-            else do
-            let cont m = go (acc `mappend` fromByteString h `mappend` m)
-                {-# INLINE cont #-}
-            if t /= 117 -- 'u'
-              then cont (fromWord8 (B.unsafeIndex mapping escape))
-              else do
-                   a <- hexQuad
-                   if a < 0xd800 || a > 0xdfff
-                     then cont (fromChar (chr a))
-                     else do
-                       b <- Z.string "\\u" *> hexQuad
-                       if a <= 0xdbff && b >= 0xdc00 && b <= 0xdfff
-                         then let !c = ((a - 0xd800) `shiftL` 10) +
-                                       (b - 0xdc00) + 0x10000
-                              in cont (fromChar (chr c))
-                         else fail "invalid UTF-16 surrogates"
-    done <- Z.atEnd
-    if done
-      then return (acc `mappend` fromByteString h)
-      else rest
-  mapping = "\"\\/\n\t\b\r\f"
-
-hexQuad :: Z.Parser Int
-hexQuad = do
-  s <- Z.take 4
-  let hex n | w >= 48 && w <= 57  = w - 48
-            | w >= 97 && w <= 122 = w - 87
-            | w >= 65 && w <= 90  = w - 55
-            | otherwise           = 255
-        where w = fromIntegral $ B.unsafeIndex s n
-      a = hex 0; b = hex 1; c = hex 2; d = hex 3
-  if (a .|. b .|. c .|. d) /= 255
-    then return $! d .|. (c `shiftL` 4) .|. (b `shiftL` 8) .|. (a `shiftL` 12)
-    else fail "invalid hex escape"
+-- The 'json'' and 'value'' parsers combine identification with
+-- conversion.  They consume more CPU cycles up front, but have a
+-- smaller memory footprint.

Data/Aeson/Parser/Internal.hs

+{-# LANGUAGE BangPatterns, OverloadedStrings #-}
+
+-- |
+-- Module:      Data.Aeson.Parser.Internal
+-- Copyright:   (c) 2011 MailRank, Inc.
+-- License:     Apache
+-- Maintainer:  Bryan O'Sullivan <bos@serpentine.com>
+-- Stability:   experimental
+-- Portability: portable
+--
+-- Efficiently and correctly parse a JSON string.  The string must be
+-- encoded as UTF-8.
+
+module Data.Aeson.Parser.Internal
+    (
+    -- * Lazy parsers
+      json
+    , value
+    , jstring
+    -- * Strict parsers
+    , json'
+    , value'
+    -- * Helpers
+    , decodeWith
+    ) where
+
+import Blaze.ByteString.Builder (fromByteString, toByteString)
+import Blaze.ByteString.Builder.Char.Utf8 (fromChar)
+import Blaze.ByteString.Builder.Word (fromWord8)
+import Control.Applicative as A
+import Data.Aeson.Types (Result(..), Value(..))
+import Data.Attoparsec.Char8 hiding (Result)
+import Data.Bits ((.|.), shiftL)
+import Data.ByteString as B
+import Data.Char (chr)
+import Data.Monoid (mappend, mempty)
+import Data.Text as T
+import Data.Text.Encoding (decodeUtf8)
+import Data.Vector as Vector hiding ((++))
+import Data.Word (Word8)
+import qualified Data.Attoparsec as A
+import qualified Data.Attoparsec.Lazy as L
+import qualified Data.Attoparsec.Zepto as Z
+import qualified Data.ByteString.Char8 as B8
+import qualified Data.ByteString.Lazy as L
+import qualified Data.ByteString.Unsafe as B
+import qualified Data.HashMap.Strict as H
+
+-- | Parse a top-level JSON value.  This must be either an object or
+-- an array.
+--
+-- The conversion of parsed values to Haskell values is deferred.
+-- This may improve performance if not all of the results of
+-- conversions are needed, but at a cost in thunk allocation.
+json :: Parser Value
+json = json_ object_ array_
+
+-- | Parse a top-level JSON value.  This must be either an object or
+-- an array.
+--
+-- This is a strict version of 'json' which avoids building up thunks
+-- during parsing; it performs all conversions immediately.  Prefer
+-- this version if most of the JSON data needs to be accessed.
+json' :: Parser Value
+json' = json_ object_' array_'
+
+json_ :: Parser Value -> Parser Value -> Parser Value
+json_ obj ary = do
+  w <- skipSpace *> A.satisfy (\w -> w == 123 || w == 91)
+  if w == 123
+    then obj
+    else ary
+{-# INLINE json_ #-}
+
+object_ :: Parser Value
+object_ = {-# SCC "object_" #-} Object <$> objectValues jstring value
+
+object_' :: Parser Value
+object_' = {-# SCC "object_'" #-} do
+  !vals <- objectValues jstring' value'
+  return (Object vals)
+ where
+  jstring' = do
+    !s <- jstring
+    return s
+
+objectValues :: Parser Text -> Parser Value -> Parser (H.HashMap Text Value)
+objectValues str val = do
+  skipSpace
+  let pair = do
+        a <- str <* skipSpace
+        b <- char ':' *> skipSpace *> val
+        return (a,b)
+  vals <- ((pair <* skipSpace) `sepBy` (char ',' *> skipSpace)) <* char '}'
+  return (H.fromList vals)
+{-# INLINE objectValues #-}
+
+array_ :: Parser Value
+array_ = {-# SCC "array_" #-} Array <$> arrayValues value
+
+array_' :: Parser Value
+array_' = {-# SCC "array_'" #-} do
+  !vals <- arrayValues value'
+  return (Array vals)
+
+arrayValues :: Parser Value -> Parser (Vector Value)
+arrayValues val = do
+  skipSpace
+  vals <- ((val <* skipSpace) `sepBy` (char ',' *> skipSpace)) <* char ']'
+  return (Vector.fromList vals)
+{-# INLINE arrayValues #-}
+
+-- | Parse any JSON value.  You should usually 'json' in preference to
+-- this function.  This is only safe to use if you are parsing data
+-- from an untrusted source.
+value :: Parser Value
+value = most <|> (Number <$> number)
+ where
+  most = do
+    c <- satisfy (`B8.elem` "{[\"ftn")
+    case c of
+      '{' -> object_
+      '[' -> array_
+      '"' -> String <$> jstring_
+      'f' -> string "alse" *> pure (Bool False)
+      't' -> string "rue" *> pure (Bool True)
+      'n' -> string "ull" *> pure Null
+      _   -> error "attoparsec panic! the impossible happened!"
+
+-- | Strict version of 'value'. See also 'json''.
+value' :: Parser Value
+value' = most <|> num
+ where
+  most = do
+    c <- satisfy (`B8.elem` "{[\"ftn")
+    case c of
+      '{' -> object_'
+      '[' -> array_'
+      '"' -> do
+          !s <- jstring_
+          return (String s)
+      'f' -> string "alse" *> pure (Bool False)
+      't' -> string "rue" *> pure (Bool True)
+      'n' -> string "ull" *> pure Null
+      _   -> error "attoparsec panic! the impossible happened!"
+  num = do
+    !n <- number
+    return (Number n)
+
+doubleQuote, backslash :: Word8
+doubleQuote = 34
+backslash = 92
+{-# INLINE backslash #-}
+{-# INLINE doubleQuote #-}
+
+-- | Parse a quoted JSON string.
+jstring :: Parser Text
+jstring = A.word8 doubleQuote *> jstring_
+
+-- | Parse a string without a leading quote.
+jstring_ :: Parser Text
+jstring_ = {-# SCC "jstring_" #-} do
+  s <- A.scan False $ \s c -> if s then Just False
+                                   else if c == doubleQuote
+                                        then Nothing
+                                        else Just (c == backslash)
+  _ <- A.word8 doubleQuote
+  if backslash `B.elem` s
+    then case Z.parse unescape s of
+           Right r  -> return (decodeUtf8 r)
+           Left err -> fail err
+    else return (decodeUtf8 s)
+{-# INLINE jstring_ #-}
+
+unescape :: Z.Parser ByteString
+unescape = toByteString <$> go mempty where
+  go acc = do
+    h <- Z.takeWhile (/=backslash)
+    let rest = do
+          start <- Z.take 2
+          let !slash = B.unsafeHead start
+              !t = B.unsafeIndex start 1
+              escape = case B.findIndex (==t) "\"\\/ntbrfu" of
+                         Just i -> i
+                         _      -> 255
+          if slash /= backslash || escape == 255
+            then fail "invalid JSON escape sequence"
+            else do
+            let cont m = go (acc `mappend` fromByteString h `mappend` m)
+                {-# INLINE cont #-}
+            if t /= 117 -- 'u'
+              then cont (fromWord8 (B.unsafeIndex mapping escape))
+              else do
+                   a <- hexQuad
+                   if a < 0xd800 || a > 0xdfff
+                     then cont (fromChar (chr a))
+                     else do
+                       b <- Z.string "\\u" *> hexQuad
+                       if a <= 0xdbff && b >= 0xdc00 && b <= 0xdfff
+                         then let !c = ((a - 0xd800) `shiftL` 10) +
+                                       (b - 0xdc00) + 0x10000
+                              in cont (fromChar (chr c))
+                         else fail "invalid UTF-16 surrogates"
+    done <- Z.atEnd
+    if done
+      then return (acc `mappend` fromByteString h)
+      else rest
+  mapping = "\"\\/\n\t\b\r\f"
+
+hexQuad :: Z.Parser Int
+hexQuad = do
+  s <- Z.take 4
+  let hex n | w >= 48 && w <= 57  = w - 48
+            | w >= 97 && w <= 122 = w - 87
+            | w >= 65 && w <= 90  = w - 55
+            | otherwise           = 255
+        where w = fromIntegral $ B.unsafeIndex s n
+      a = hex 0; b = hex 1; c = hex 2; d = hex 3
+  if (a .|. b .|. c .|. d) /= 255
+    then return $! d .|. (c `shiftL` 4) .|. (b `shiftL` 8) .|. (a `shiftL` 12)
+    else fail "invalid hex escape"
+
+decodeWith :: Parser Value -> (Value -> Result a) -> L.ByteString -> Maybe a
+decodeWith p to s =
+    case L.parse p s of
+      L.Done _ v -> case to v of
+                      Success a -> Just a
+                      _         -> Nothing
+      _          -> Nothing
+{-# INLINE decodeWith #-}
+
+-- $lazy
+--
+-- The 'json' and 'value' parsers decouple identification from
+-- conversion.  Identification occurs immediately (so that an invalid
+-- JSON document can be rejected as early as possible), but conversion
+-- to a Haskell value is deferred until that value is needed.
+--
+-- This decoupling can be time-efficient if only a smallish subset of
+-- elements in a JSON value need to be inspected, since the cost of
+-- conversion is zero for uninspected elements.  The trade off is an
+-- increase in memory usage, due to allocation of thunks for values
+-- that have not yet been converted.
+
+-- $strict
+--
+-- The 'json'' and 'value'' parsers combine identification with
+-- conversion.  They consume more CPU cycles up front, but have a
+-- smaller memory footprint.
 
   other-modules:
     Data.Aeson.Functions
+    Data.Aeson.Parser.Internal
     Data.Aeson.Types.Class
     Data.Aeson.Types.Internal
 

benchmarks/bench-parse.py

 
 result_re = re.compile(r'^\s*(\d+) good, (\d+\.\d+)s$', re.M)
 
+if len(sys.argv) > 1:
+    parser_exe = sys.argv[1]
+else:
+    parser_exe = './AesonParse'
+
 def run(count, filename):
     print '    %s :: %s times' % (filename, count)
-    p = subprocess.Popen(['./AesonParse', str(count), filename],
+    p = subprocess.Popen([parser_exe, str(count), filename],
                          stdout=subprocess.PIPE)
     output = p.stdout.read()
     p.wait()

release-notes.markdown

 
 * Instances of `ToJSON` and `FromJSON` for tuples are between 45% and
   70% faster than in 0.3.
+
+
+## Evaluation control
+
+This version of aeson makes explicit the decoupling between
+*identifying* an element of a JSON document and *converting* it to
+Haskell.  See the
+[`Data.Aeson.Parser`](http://hackage.haskell.org/packages/archive/aeson/latest/doc/html/Data-Aeson-Parser.html)
+documentation for details.
+
+The normal aeson `decode` function performs identification strictly,
+but defers conversion until needed.  This can result in improved
+performance (e.g. if the results of some conversions are never
+needed), but at a cost in increased memory consumption.
+
+The new `decode'` function performs identification and conversion
+immediately.  This incurs an up-front cost in CPU cycles, but reduces
+reduce memory consumption.
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.