Commits

Bryan O'Sullivan committed 75d833d

Improve docs.

  • Participants
  • Parent commits e6a18a9

Comments (0)

Files changed (3)

File Data/Attoparsec.hs

     , module Data.Attoparsec.Combinator
 
     -- * Parsing individual bytes
+    , I.word8
     , I.anyWord8
     , I.notWord8
-    , I.word8
     , I.satisfy
+    , I.satisfyWith
 
     -- ** Byte classes
     , I.inClass
     -- * Efficient string handling
     , I.string
     , I.skipWhile
-    , I.stringTransform
     , I.take
-    , I.takeTill
     , I.takeWhile
     , I.takeWhile1
+    , I.takeTill
 
     -- * State observation and manipulation functions
     , I.endOfInput

File Data/Attoparsec/Char8.hs

 -- 
 -- /Note/: This module is intended for parsing text that is
 -- represented using an 8-bit character set, e.g. ASCII or
--- ISO-8859-15.  It /does not/ deal with character encodings,
--- multibyte characters, or wide characters.  Any attempts to use
--- characters above code point 255 will give wrong answers.
+-- ISO-8859-15.  It /does not/ make any attempt to deal with character
+-- encodings, multibyte characters, or wide characters.  In
+-- particular, all attempts to use characters above code point U+00FF
+-- will give wrong answers.  Characters at or below U+00FF are simply
+-- translated to and from the byte values of their Unicode code
+-- points.
 module Data.Attoparsec.Char8
     (
     -- * Parser types
     , module Data.Attoparsec.Combinator
 
     -- * Parsing individual characters
+    , satisfy
+    , char
     , anyChar
-    , char
     , char8
+    , notChar
+
+    -- ** Special character parsers
     , digit
-    , letter
-    , notChar
+    , letter_iso8859_15
+    , letter_ascii
     , space
-    , satisfy
 
-    -- ** Character classes
+    -- ** Fast predicates
+    , isDigit
+    , isAlpha_iso8859_15
+    , isAlpha_ascii
+
+    -- *** Character classes
     , inClass
     , notInClass
 
     , stringCI
     , skipSpace
     , skipWhile
-    , take
+    , I.take
     , takeTill
     , takeWhile
     , takeWhile1
     , isHorizontalSpace
 
     -- * Numeric parsers
-    , hexNumber
-    --, int
-    --, integer
+    , decimal
+    , hexadecimal
+    , signed
     --, double
 
     -- * State observation and manipulation functions
     , I.ensure
     ) where
 
+import Control.Applicative ((*>), (<$>), (<|>))
 import Data.Attoparsec.Combinator
 import Data.Attoparsec.FastSet (charClass, memberChar)
 import Data.Attoparsec.Internal (Parser, (<?>))
 import Data.ByteString.Internal (c2w, w2c)
--- import Data.ByteString.Lex.Double (readDouble)
 import Data.Word (Word8)
 import Prelude hiding (takeWhile)
 import qualified Data.Attoparsec as A
 import qualified Data.Attoparsec.Internal as I
+import qualified Data.ByteString as B8
 import qualified Data.ByteString.Char8 as B
-import qualified Data.ByteString as B8
 
+-- ASCII-specific but fast, oh yes.
 toLower :: Word8 -> Word8
 toLower w | w >= 65 && w <= 90 = w + 32
           | otherwise          = w
 stringCI = I.stringTransform (B8.map toLower)
 {-# INLINE stringCI #-}
 
+-- | Consume input as long as the predicate returns 'True', and return
+-- the consumed input.
+--
+-- This parser requires the predicate to succeed on at least one byte
+-- of input: it will fail if the predicate never returns 'True' or if
+-- there is no input left.
 takeWhile1 :: (Char -> Bool) -> Parser B.ByteString
 takeWhile1 p = I.takeWhile1 (p . w2c)
 {-# INLINE takeWhile1 #-}
 
--- | Character parser.
+-- | The parser @satisfy p@ succeeds for any byte for which the
+-- predicate @p@ returns 'True'. Returns the character that is
+-- actually parsed.
+--
+-- >digit = satisfy isDigit
+-- >    where isDigit c = c >= '0' && c <= '9'
 satisfy :: (Char -> Bool) -> Parser Char
 satisfy = I.satisfyWith w2c
 {-# INLINE satisfy #-}
 
-letter :: Parser Char
-letter = satisfy isLetter <?> "letter"
-  where isLetter c = (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')
-{-# INLINE letter #-}
+-- | Match a letter, in the ISO-8859-15 encoding.
+letter_iso8859_15 :: Parser Char
+letter_iso8859_15 = satisfy isAlpha_iso8859_15 <?> "letter_iso8859_15"
+{-# INLINE letter_iso8859_15 #-}
 
+-- | Match a letter, in the ASCII encoding.
+letter_ascii :: Parser Char
+letter_ascii = satisfy isAlpha_ascii <?> "letter_ascii"
+{-# INLINE letter_ascii #-}
+
+-- | A fast alphabetic predicate for the ISO-8859-15 encoding
+--
+-- /Note/: For all character encodings other than ISO-8859-15, and
+-- almost all Unicode code points above U+00A3, this predicate gives
+-- /wrong answers/.
+isAlpha_iso8859_15 :: Char -> Bool
+isAlpha_iso8859_15 c = (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') ||
+                       (c >= '\166' && moby c)
+  where moby = notInClass "\167\169\171-\179\182\183\185\187\191\215\247"
+        {-# NOINLINE moby #-}
+{-# INLINE isAlpha_iso8859_15 #-}
+
+-- | A fast alphabetic predicate for the ASCII encoding
+--
+-- /Note/: For all character encodings other than ASCII, and
+-- almost all Unicode code points above U+007F, this predicate gives
+-- /wrong answers/.
+isAlpha_ascii :: Char -> Bool
+isAlpha_ascii c = (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')
+{-# INLINE isAlpha_ascii #-}
+
+-- | Parse a single digit.
 digit :: Parser Char
 digit = satisfy isDigit <?> "digit"
-  where isDigit c = c >= '0' && c <= '9'
 {-# INLINE digit #-}
 
+-- | A fast digit predicate.
+isDigit :: Char -> Bool
+isDigit c = c >= '0' && c <= '9'
+{-# INLINE isDigit #-}
+
+-- | Match any character.
 anyChar :: Parser Char
 anyChar = satisfy $ const True
 {-# INLINE anyChar #-}
 
+-- | Fast predicate for matching a space character.
+--
+-- /Note/: This predicate only gives correct answers for the ASCII
+-- encoding.  For instance, it does not recognise U+00A0 (non-breaking
+-- space) as a space character, even though it is a valid ISO-8859-15
+-- byte.
 isSpace :: Char -> Bool
 isSpace c = c `B.elem` spaces
     where spaces = B.pack " \n\r\t\v\f"
+          {-# NOINLINE spaces #-}
+{-# INLINE isSpace #-}
 
+-- | Parse a space character.
+--
+-- /Note/: This parser only gives correct answers for the ASCII
+-- encoding.  For instance, it does not recognise U+00A0 (non-breaking
+-- space) as a space character, even though it is a valid ISO-8859-15
+-- byte.
 space :: Parser Char
 space = satisfy isSpace <?> "space"
 {-# INLINE space #-}
 char c = satisfy (== c) <?> [c]
 {-# INLINE char #-}
 
--- | Match a specific character.
+-- | Match a specific character, but return its 'Word8' value.
 char8 :: Char -> Parser Word8
 char8 c = I.satisfy (== c2w c) <?> [c]
 {-# INLINE char8 #-}
 
 -- | Match any character in a set.
 --
--- > vowel = inClass "aeiou"
+-- >vowel = inClass "aeiou"
 --
 -- Range notation is supported.
 --
--- > halfAlphabet = inClass "a-nA-N"
+-- >halfAlphabet = inClass "a-nA-N"
 --
 -- To add a literal \'-\' to a set, place it at the beginning or end
 -- of the string.
 notInClass s = not . inClass s
 {-# INLINE notInClass #-}
 
--- | Consume characters while the predicate succeeds.
+-- | Consume input as long as the predicate returns 'True', and return
+-- the consumed input.
+--
+-- This parser does not fail.  It will return an empty string if the
+-- predicate returns 'False' on the first byte of input.
+--
+-- /Note/: Because this parser does not fail, do not use it with
+-- combinators such as 'many', because such parsers loop until a
+-- failure occurs.  Careless use will thus result in an infinite loop.
 takeWhile :: (Char -> Bool) -> Parser B.ByteString
 takeWhile p = I.takeWhile (p . w2c)
 {-# INLINE takeWhile #-}
 
--- | Consume characters while the predicate fails.
+-- | Consume input as long as the predicate returns 'False'
+-- (i.e. until it returns 'True'), and return the consumed input.
+--
+-- This parser does not fail.  It will return an empty string if the
+-- predicate returns 'True' on the first byte of input.
+--
+-- /Note/: Because this parser does not fail, do not use it with
+-- combinators such as 'many', because such parsers loop until a
+-- failure occurs.  Careless use will thus result in an infinite loop.
 takeTill :: (Char -> Bool) -> Parser B.ByteString
 takeTill p = I.takeTill (p . w2c)
 {-# INLINE takeTill #-}
 
--- | Skip over characters while the predicate succeeds.
+-- | Skip past input for as long as the predicate returns 'True'.
 skipWhile :: (Char -> Bool) -> Parser ()
 skipWhile p = I.skipWhile (p . w2c)
 {-# INLINE skipWhile #-}
 skipSpace = skipWhile isSpace >> return ()
 {-# INLINE skipSpace #-}
 
+-- | A predicate that matches either a carriage return @\'\\r\'@ or
+-- newline @\'\\n\'@ character.
 isEndOfLine :: Word8 -> Bool
 isEndOfLine w = w == 13 || w == 10
 {-# INLINE isEndOfLine #-}
 
+-- | A predicate that matches either a space @\' \'@ or horizontal tab
+-- @\'\\t\'@ character.
 isHorizontalSpace :: Word8 -> Bool
 isHorizontalSpace w = w == 32 || w == 9
 {-# INLINE isHorizontalSpace #-}
 
 {-
-numeric :: String -> (B.ByteString -> Maybe (a,B.ByteString)) -> Parser a
-numeric desc f = do
-  s <- getInput
-  case f s of
-    Nothing -> fail desc
-    Just (i,s') -> setInput s' >> return i
-                   
--- | Parse an integer.  The position counter is not updated.
-int :: Parser Int
-int = numeric "Int" B.readInt
-
--- | Parse an integer.  The position counter is not updated.
-integer :: Parser Integer
-integer = numeric "Integer" B.readInteger
-
 -- | Parse a Double.  The position counter is not updated.
 double :: Parser Double
 double = numeric "Double" readDouble
 -}
 
-hexNumber :: Integral a => Parser a
-{-# SPECIALISE hexNumber :: Parser Int #-}
-hexNumber = fromHex `fmap` I.takeWhile1 isHexDigit
+-- | Parse and decode an unsigned hexadecimal number.  The hex digits
+-- @\'a\'@ through @\'f\'@ may be upper or lower case.
+--
+-- This parser does not accept a leading @\"0x\"@ string.
+hexadecimal :: Integral a => Parser a
+{-# SPECIALISE hexadecimal :: Parser Int #-}
+hexadecimal = B8.foldl' step 0 `fmap` I.takeWhile1 isHexDigit
   where isHexDigit w = (w >= 48 && w <= 57) || (x >= 97 && x <= 102)
             where x = toLower w
-        fromHex = B8.foldl' step 0
         step a w | w >= 48 && w <= 57  = a * 16 + fromIntegral (w - 48)
-                 | x >= 97 && x <= 102 = a * 16 + fromIntegral (x - 87)
-                 | otherwise           = error "impossible"
+                 | otherwise           = a * 16 + fromIntegral (x - 87)
             where x = toLower w
+
+-- | Parse and decode an unsigned decimal number.
+decimal :: Integral a => Parser a
+{-# SPECIALISE decimal :: Parser Int #-}
+decimal = B8.foldl' step 0 `fmap` I.takeWhile1 isDig
+  where isDig w  = w >= 48 && w <= 57
+        step a w = a * 10 + fromIntegral (w - 48)
+
+-- | Parse a number with an optional leading @\'+\'@ or @\'-\'@ sign
+-- character.
+--
+-- The sign, if present, must be followed directly by input that the
+-- parser @p@ accepts.  A leading @\'-\'@ negates the result of @p@; a
+-- leading @\'+\'@ (or no sign at all) returns the result unchanged.
+signed :: Num a => Parser a -> Parser a
+{-# SPECIALISE signed :: Parser Int -> Parser Int #-}
+-- The parentheses around @char8 '-' *> p@ are required: '<$>' and '*>'
+-- are both @infixl 4@, so without them 'negate' would be applied to
+-- the sign byte (and then discarded) instead of to the result of @p@.
+signed p = (negate <$> (char8 '-' *> p))
+       <|> (char8 '+' *> p)
+       <|> p

File Data/Attoparsec/Internal.hs

 
     -- * Running parsers
     , parse
-    , parseAll
-    , feed
 
     -- * Combinators
     , (<?>)
     , string
     , stringTransform
     , take
-    , takeTill
     , takeWhile
     , takeWhile1
+    , takeTill
 
     -- * State observation and manipulation functions
     , endOfInput
     where msg = "Failed reading: " ++ err
 {-# INLINE failDesc #-}
 
+-- | Succeed only if at least @n@ bytes of input are available.
 ensure :: Int -> Parser ()
 ensure n = Parser $ \st0@(S s0 _a0 _c0) kf ks ->
     if B.length s0 >= n
     then ks st0 ()
     else runParser (demandInput >> ensure n) st0 kf ks
 
+-- | Immediately demand more input via a 'Partial' continuation
+-- result.
 demandInput :: Parser ()
 demandInput = Parser $ \st0@(S s0 a0 c0) kf ks ->
     if c0 == Complete
          else let st1 = S (s0 +++ s) (a0 +++ s) Incomplete
               in  ks st1 ()
 
+-- | This parser always succeeds.  It returns 'True' if any input is
+-- available either immediately or on demand, and 'False' if the end
+-- of all input has been reached.
 wantInput :: Parser Bool
 wantInput = Parser $ \st0@(S s0 a0 c0) _kf ks ->
   case undefined of
 put :: B.ByteString -> Parser ()
 put s = Parser (\(S _s0 a0 c0) _kf ks -> ks (S s a0 c0) ())
 
-take :: Int -> Parser B.ByteString
-take n = takeWith n (const True)
-{-# INLINE take #-}
-
 (+++) :: B.ByteString -> B.ByteString -> B.ByteString
 (+++) = B.append
 {-# INLINE (+++) #-}
 
+-- | Attempt a parse, and if it fails, rewind the input so that no
+-- input appears to have been consumed.
+--
+-- This combinator is useful in cases where a parser might consume
+-- some input before failing, i.e. the parser needs arbitrary
+-- lookahead.  The downside to using this combinator is that it can
+-- retain input for longer than is desirable.
 try :: Parser a -> Parser a
 try p = Parser $ \st0 kf ks ->
         runParser p (noAdds st0) (kf . mappend st0) ks
 
+-- | The parser @satisfy p@ succeeds for any byte for which the
+-- predicate @p@ returns 'True'. Returns the byte that is actually
+-- parsed.
+--
+-- >digit = satisfy isDigit
+-- >    where isDigit w = w >= 48 && w <= 57
 satisfy :: (Word8 -> Bool) -> Parser Word8
 satisfy p = do
   ensure 1
     then put (B.unsafeTail s) >> return w
     else fail "satisfy"
 
--- | Character parser.
+-- | The parser @satisfyWith f p@ transforms a byte, and succeeds if
+-- the predicate @p@ returns 'True' on the transformed value. The
+-- parser returns the transformed byte that was parsed.
 satisfyWith :: (Word8 -> a) -> (a -> Bool) -> Parser a
 satisfyWith f p = do
   ensure 1
     (fp,o,_) <- B.toForeignPtr `fmapP` take (sizeOf dummy)
     return . B.inlinePerformIO . withForeignPtr fp $ \p -> peek (castPtr $ p `plusPtr` o)
 
+-- | Consume @n@ bytes of input, but succeed only if the predicate
+-- returns 'True'.
 takeWith :: Int -> (B.ByteString -> Bool) -> Parser B.ByteString
 takeWith n p = do
   ensure n
     then put t >> return h
     else failDesc "takeWith"
 
+-- | Consume exactly @n@ bytes of input.
+take :: Int -> Parser B.ByteString
+take n = takeWith n (const True)
+{-# INLINE take #-}
+
+-- | @string s@ parses a sequence of bytes that identically match
+-- @s@. Returns the parsed string (i.e. @s@).  This parser consumes no
+-- input if it fails (even if a partial match).
+--
+-- /Note/: The behaviour of this parser is different to that of the
+-- similarly-named parser in Parsec, as this one is all-or-nothing.
+-- To illustrate the difference, the following parser will fail under
+-- Parsec given an input of @"for"@:
+--
+-- >string "foo" <|> string "for"
+--
+-- The reason for its failure is that the first branch is a
+-- partial match, and will consume the letters @\'f\'@ and @\'o\'@
+-- before failing.  In Attoparsec, the above parser will /succeed/ on
+-- that input, because the failed first branch will consume nothing.
 string :: B.ByteString -> Parser B.ByteString
 string s = takeWith (B.length s) (==s)
 {-# INLINE string #-}
 stringTransform f s = takeWith (B.length s) ((==s) . f)
 {-# INLINE stringTransform #-}
 
+-- | Skip past input for as long as the predicate returns 'True'.
 skipWhile :: (Word8 -> Bool) -> Parser ()
 skipWhile p = go
  where
       put t
       when (B.null t) go
 
+-- | Consume input as long as the predicate returns 'False'
+-- (i.e. until it returns 'True'), and return the consumed input.
+--
+-- This parser does not fail.  It will return an empty string if the
+-- predicate returns 'True' on the first byte of input.
+--
+-- /Note/: Because this parser does not fail, do not use it with
+-- combinators such as 'many', because such parsers loop until a
+-- failure occurs.  Careless use will thus result in an infinite loop.
 takeTill :: (Word8 -> Bool) -> Parser B.ByteString
 takeTill p = takeWhile (not . p)
 {-# INLINE takeTill #-}
 
+-- | Consume input as long as the predicate returns 'True', and return
+-- the consumed input.
+--
+-- This parser does not fail.  It will return an empty string if the
+-- predicate returns 'False' on the first byte of input.
+--
+-- /Note/: Because this parser does not fail, do not use it with
+-- combinators such as 'many', because such parsers loop until a
+-- failure occurs.  Careless use will thus result in an infinite loop.
 takeWhile :: (Word8 -> Bool) -> Parser B.ByteString
 takeWhile p = go
  where
           else return h
       else return B.empty
 
+-- | Consume input as long as the predicate returns 'True', and return
+-- the consumed input.
+--
+-- This parser requires the predicate to succeed on at least one byte
+-- of input: it will fail if the predicate never returns 'True' or if
+-- there is no input left.
 takeWhile1 :: (Word8 -> Bool) -> Parser B.ByteString
 takeWhile1 p = do
   (`when` demandInput) =<< B.null <$> get
     then (h+++) `fmapP` takeWhile p
     else return h
 
--- | Match any character in a set.
+-- | Match any byte in a set.
 --
--- > vowel = inClass "aeiou"
+-- >vowel = inClass "aeiou"
 --
 -- Range notation is supported.
 --
--- > halfAlphabet = inClass "a-nA-N"
+-- >halfAlphabet = inClass "a-nA-N"
 --
--- To add a literal \'-\' to a set, place it at the beginning or end
+-- To add a literal @\'-\'@ to a set, place it at the beginning or end
 -- of the string.
 inClass :: String -> Word8 -> Bool
 inClass s = (`memberWord8` mySet)
     where mySet = charClass s
 {-# INLINE inClass #-}
 
--- | Match any character not in a set.
+-- | Match any byte not in a set.
 notInClass :: String -> Word8 -> Bool
 notInClass s = not . inClass s
 {-# INLINE notInClass #-}
 notWord8 c = satisfy (/= c) <?> "not " ++ show c
 {-# INLINE notWord8 #-}
 
+-- | Match only if all input has been consumed.
 endOfInput :: Parser ()
 endOfInput = Parser $ \st0@S{..} kf ks ->
              if B.null input
                        in  runParser demandInput st0 kf' ks'
              else kf st0 [] "endOfInput"
                                                
+-- | Match either a single newline character @\'\\n\'@, or a carriage
+-- return followed by a newline character @\"\\r\\n\"@.
 endOfLine :: Parser ()
 endOfLine = (word8 10 >> return ()) <|> (string (B.pack "\r\n") >> return ())
 
 {-# INLINE (<?>) #-}
 infix 0 <?>
 
+-- | Terminal failure continuation.
 failK :: Failure a
 failK st0 stack msg = Fail st0 stack msg
 
+-- | Terminal success continuation.
 successK :: Success a a
 successK state a = Done state a
 
+-- | Run a parser.
 parse :: Parser a -> B.ByteString -> Result a
 parse m s = runParser m (S s B.empty Incomplete) failK successK
 {-# INLINE parse #-}
-              
-feed :: Result r -> B.ByteString -> Result r
-feed f@(Fail _ _ _) _ = f
-feed (Partial k) d = k d
-feed (Done (S s a c) r) d = Done (S (s +++ d) a c) r
-
-parseAll :: Parser a -> [B.ByteString] -> Result a
-parseAll p ss = case ss of
-                  []     -> go (parse p B.empty) []
-                  (c:cs) -> go (parse p c) cs
-  where go (Partial k) (c:cs) = go (k c) cs
-        go (Partial k) []     = k B.empty
-        go r           _      = r