Bryan O'Sullivan avatar Bryan O'Sullivan committed 4a127e8

Add controllable error handling and recovery code.

Comments (0)

Files changed (6)

Data/Text/Encoding.hs

     , decodeUtf16BE
     , decodeUtf32LE
     , decodeUtf32BE
+    -- ** Controllable error handling
+    , decodeUtf8With
+    , decodeUtf16LEWith
+    , decodeUtf16BEWith
+    , decodeUtf32LEWith
+    , decodeUtf32BEWith
 
     -- * Encoding Text to ByteStrings
     , encodeUtf8
     
 import Data.ByteString (ByteString)
 import qualified Data.Text.Fusion as F
+import Data.Text.Encoding.Error (OnDecodeError, strictDecode)
 import qualified Data.Text.Encoding.Fusion as E
 import Data.Text.Internal (Text)
 
 {-# INLINE decodeASCII #-}
 
 -- | Decode a 'ByteString' containing UTF-8 encoded text.
+decodeUtf8With :: OnDecodeError -> ByteString -> Text
+decodeUtf8With onErr bs = F.unstream (E.streamUtf8 onErr bs)
+{-# INLINE decodeUtf8With #-}
+
+-- | Decode a 'ByteString' containing UTF-8 encoded text.
 decodeUtf8 :: ByteString -> Text
-decodeUtf8 bs = F.unstream (E.streamUtf8 bs)
+decodeUtf8 = decodeUtf8With strictDecode
 {-# INLINE decodeUtf8 #-}
 
 -- | Encode text using UTF-8 encoding.
 {-# INLINE encodeUtf8 #-}
 
 -- | Decode text from little endian UTF-16 encoding.
+decodeUtf16LEWith :: OnDecodeError -> ByteString -> Text
+decodeUtf16LEWith onErr bs = F.unstream (E.streamUtf16LE onErr bs)
+{-# INLINE decodeUtf16LEWith #-}
+
+-- | Decode text from little endian UTF-16 encoding.
 decodeUtf16LE :: ByteString -> Text
-decodeUtf16LE bs = F.unstream (E.streamUtf16LE bs)
+decodeUtf16LE = decodeUtf16LEWith strictDecode
 {-# INLINE decodeUtf16LE #-}
 
 -- | Decode text from big endian UTF-16 encoding.
+decodeUtf16BEWith :: OnDecodeError -> ByteString -> Text
+decodeUtf16BEWith onErr bs = F.unstream (E.streamUtf16BE onErr bs)
+{-# INLINE decodeUtf16BEWith #-}
+
+-- | Decode text from big endian UTF-16 encoding.
 decodeUtf16BE :: ByteString -> Text
-decodeUtf16BE bs = F.unstream (E.streamUtf16BE bs)
+decodeUtf16BE = decodeUtf16BEWith strictDecode
 {-# INLINE decodeUtf16BE #-}
 
 -- | Encode text using little endian UTF-16 encoding.
 {-# INLINE encodeUtf16BE #-}
 
 -- | Decode text from little endian UTF-32 encoding.
+decodeUtf32LEWith :: OnDecodeError -> ByteString -> Text
+decodeUtf32LEWith onErr bs = F.unstream (E.streamUtf32LE onErr bs)
+{-# INLINE decodeUtf32LEWith #-}
+
+-- | Decode text from little endian UTF-32 encoding.
 decodeUtf32LE :: ByteString -> Text
-decodeUtf32LE bs = F.unstream (E.streamUtf32LE bs)
+decodeUtf32LE = decodeUtf32LEWith strictDecode
 {-# INLINE decodeUtf32LE #-}
 
 -- | Decode text from big endian UTF-32 encoding.
+decodeUtf32BEWith :: OnDecodeError -> ByteString -> Text
+decodeUtf32BEWith onErr bs = F.unstream (E.streamUtf32BE onErr bs)
+{-# INLINE decodeUtf32BEWith #-}
+
+-- | Decode text from big endian UTF-32 encoding.
 decodeUtf32BE :: ByteString -> Text
-decodeUtf32BE bs = F.unstream (E.streamUtf32BE bs)
+decodeUtf32BE = decodeUtf32BEWith strictDecode
 {-# INLINE decodeUtf32BE #-}
 
 -- | Encode text using little endian UTF-32 encoding.

Data/Text/Encoding/Error.hs

+{-# LANGUAGE DeriveDataTypeable #-}
+-- |
+-- Module      : Data.Text.Encoding.Error
+-- Copyright   : (c) Bryan O'Sullivan 2009
+--
+-- License     : BSD-style
+-- Maintainer  : bos@serpentine.com, rtharper@aftereternity.co.uk,
+--               duncan@haskell.org
+-- Stability   : experimental
+-- Portability : GHC
+--
+-- Types and functions for dealing with encoding and decoding errors
+-- in Unicode text.
+--
+-- The standard functions for encoding and decoding text are strict,
+-- which is to say that they throw exceptions on invalid input.  This
+-- is often unhelpful on real world input, so alternative functions
+-- exist that accept custom handlers for dealing with invalid inputs.
+-- These 'OnError' handlers are normal Haskell functions.  You can use
+-- one of the presupplied functions in this module, or you can write a
+-- custom handler of your own.
+
+module Data.Text.Encoding.Error
+    (
+    -- * Error handling types
+      UnicodeException(..)
+    , OnError
+    , OnDecodeError
+    , OnEncodeError
+    -- * Useful error handling functions
+    , lenientDecode
+    , strictDecode
+    , strictEncode
+    , ignore
+    , replace
+    ) where
+
+import Control.Exception (Exception, throw)
+import Data.Typeable (Typeable)
+import Data.Word (Word8)
+import Numeric (showHex)
+
+-- | Function type for handling a coding error.  It is supplied with
+-- two inputs:
+--
+-- * A 'String' that describes the error.
+--
+-- * The input value that caused the error.  If the error arose
+--   because the end of input was reached or could not be identified
+--   precisely, this value will be 'Nothing'.
+--
+-- If the handler returns a value wrapped with 'Just', that value will
+-- be used in the output as the replacement for the invalid input.  If
+-- it returns 'Nothing', no value will be used in the output.
+--
+-- Should the handler need to abort processing, it should use 'error'
+-- or 'throw' an exception (preferably a 'UnicodeException').  It may
+-- use the description provided to construct a more helpful error
+-- report.
+type OnError a b = String -> Maybe a -> Maybe b
+type OnDecodeError = OnError Word8 Char
+type OnEncodeError = OnError Char Word8
+
+-- | An exception type for representing Unicode encoding errors.
+data UnicodeException =
+    DecodeError String (Maybe Word8)
+    -- ^ Could not decode a byte sequence because it was invalid under
+    -- the given encoding, or ran out of input in mid-decode.
+  | EncodeError String (Maybe Char)
+    -- ^ Tried to encode a character that could not be represented
+    -- under the given encoding, or ran out of input in mid-encode.
+    deriving (Typeable)
+
+showUnicodeException :: UnicodeException -> String
+showUnicodeException (DecodeError desc (Just w))
+    = "Cannot decode byte '\\x" ++ showHex w ("': " ++ desc)
+showUnicodeException (DecodeError desc Nothing)
+    = "Cannot decode input: " ++ desc
+showUnicodeException (EncodeError desc (Just c))
+    = "Cannot encode character '\\x" ++ showHex (fromEnum c) ("': " ++ desc)
+showUnicodeException (EncodeError desc Nothing)
+    = "Cannot encode input: " ++ desc
+                     
+instance Show UnicodeException where
+    show = showUnicodeException
+
+instance Exception UnicodeException
+
+-- | Throw a 'UnicodeException' if decoding fails.
+strictDecode :: OnError Word8 Char
+strictDecode desc c = throw (DecodeError desc c)
+
+-- | Replace an invalid input byte with the Unicode replacement
+-- character U+FFFD.
+lenientDecode :: OnError Word8 Char
+lenientDecode _ _ = Just '\xfffd'
+
+-- | Throw a 'UnicodeException' if encoding fails.
+strictEncode :: OnError Char Word8
+strictEncode desc c = throw (EncodeError desc c)
+
+-- | Ignore an invalid input, substituting nothing in the output.
+ignore :: OnError a b
+ignore _ _ = Nothing
+
+-- | Replace an invalid input with a valid output.
+replace :: b -> OnError a b
+replace c _ _ = Just c

Data/Text/Encoding/Fusion.hs

-{-# LANGUAGE BangPatterns #-}
+{-# LANGUAGE BangPatterns, Rank2Types #-}
 
 -- |
 -- Module      : Data.Text.Encoding.Fusion
 import Data.ByteString as B
 import Data.ByteString.Internal (ByteString(..), mallocByteString, memcpy)
 import Data.Text.Fusion (Step(..), Stream(..))
+import Data.Text.Encoding.Error
 import Data.Text.Encoding.Fusion.Common
 import Data.Text.UnsafeChar (unsafeChr, unsafeChr8, unsafeChr32)
 import Data.Text.UnsafeShift (shiftL)
 
 -- | /O(n)/ Convert a 'ByteString' into a 'Stream Char', using UTF-8
 -- encoding.
-streamUtf8 :: ByteString -> Stream Char
-streamUtf8 bs = Stream next 0 l
+streamUtf8 :: OnDecodeError -> ByteString -> Stream Char
+streamUtf8 onErr bs = Stream next 0 l
     where
       l = B.length bs
       {-# INLINE next #-}
           | i+1 < l && U8.validate2 x1 x2 = Yield (U8.chr2 x1 x2) (i+2)
           | i+2 < l && U8.validate3 x1 x2 x3 = Yield (U8.chr3 x1 x2 x3) (i+3)
           | i+3 < l && U8.validate4 x1 x2 x3 x4 = Yield (U8.chr4 x1 x2 x3 x4) (i+4)
-          | otherwise = encodingError "UTF-8"
+          | otherwise = decodeError "streamUtf8" "UTF-8" onErr mx (i+1)
           where
+            mx = if i >= l then Nothing else Just x1
             x1 = idx i
             x2 = idx (i + 1)
             x3 = idx (i + 2)
 
 -- | /O(n)/ Convert a 'ByteString' into a 'Stream Char', using little
 -- endian UTF-16 encoding.
-streamUtf16LE :: ByteString -> Stream Char
-streamUtf16LE bs = Stream next 0 l
+streamUtf16LE :: OnDecodeError -> ByteString -> Stream Char
+streamUtf16LE onErr bs = Stream next 0 l
     where
       l = B.length bs
       {-# INLINE next #-}
           | i >= l                         = Done
           | i+1 < l && U16.validate1 x1    = Yield (unsafeChr x1) (i+2)
           | i+3 < l && U16.validate2 x1 x2 = Yield (U16.chr2 x1 x2) (i+4)
-          | otherwise = encodingError "UTF-16LE"
+          | otherwise = decodeError "streamUtf16LE" "UTF-16LE" onErr Nothing (i+1)
           where
             x1    = idx i       + (idx (i + 1) `shiftL` 8)
             x2    = idx (i + 2) + (idx (i + 3) `shiftL` 8)
 
 -- | /O(n)/ Convert a 'ByteString' into a 'Stream Char', using big
 -- endian UTF-16 encoding.
-streamUtf16BE :: ByteString -> Stream Char
-streamUtf16BE bs = Stream next 0 l
+streamUtf16BE :: OnDecodeError -> ByteString -> Stream Char
+streamUtf16BE onErr bs = Stream next 0 l
     where
       l = B.length bs
       {-# INLINE next #-}
           | i >= l                         = Done
           | i+1 < l && U16.validate1 x1    = Yield (unsafeChr x1) (i+2)
           | i+3 < l && U16.validate2 x1 x2 = Yield (U16.chr2 x1 x2) (i+4)
-          | otherwise = encodingError "UTF16-BE"
+          | otherwise = decodeError "streamUtf16BE" "UTF-16BE" onErr Nothing (i+1)
           where
             x1    = (idx i `shiftL` 8)       + idx (i + 1)
             x2    = (idx (i + 2) `shiftL` 8) + idx (i + 3)
 
 -- | /O(n)/ Convert a 'ByteString' into a 'Stream Char', using big
 -- endian UTF-32 encoding.
-streamUtf32BE :: ByteString -> Stream Char
-streamUtf32BE bs = Stream next 0 l
+streamUtf32BE :: OnDecodeError -> ByteString -> Stream Char
+streamUtf32BE onErr bs = Stream next 0 l
     where
       l = B.length bs
       {-# INLINE next #-}
       next i
           | i >= l                    = Done
           | i+3 < l && U32.validate x = Yield (unsafeChr32 x) (i+4)
-          | otherwise                 = encodingError "UTF-32BE"
+          | otherwise = decodeError "streamUtf32BE" "UTF-32BE" onErr Nothing (i+1)
           where
             x     = shiftL x1 24 + shiftL x2 16 + shiftL x3 8 + x4
             x1    = idx i
 
 -- | /O(n)/ Convert a 'ByteString' into a 'Stream Char', using little
 -- endian UTF-32 encoding.
-streamUtf32LE :: ByteString -> Stream Char
-streamUtf32LE bs = Stream next 0 l
+streamUtf32LE :: OnDecodeError -> ByteString -> Stream Char
+streamUtf32LE onErr bs = Stream next 0 l
     where
       l = B.length bs
       {-# INLINE next #-}
       next i
           | i >= l                    = Done
           | i+3 < l && U32.validate x = Yield (unsafeChr32 x) (i+4)
-          | otherwise                 = encodingError "UTF-32LE"
+          | otherwise = decodeError "streamUtf32LE" "UTF-32LE" onErr Nothing (i+1)
           where
             x     = shiftL x4 24 + shiftL x3 16 + shiftL x2 8 + x1
             x1    = idx i
                   memcpy dest' src' (fromIntegral srcLen)
           return dest
 
-encodingError :: String -> a
-encodingError encoding =
-    error $ "Data.Text.Encoding.Fusion: Bad " ++ encoding ++ " stream"
+decodeError :: forall s. String -> String -> OnDecodeError -> Maybe Word8
+            -> s -> Step s Char
+decodeError func kind onErr mb i =
+    case onErr desc mb of
+      Nothing -> Skip i
+      Just c  -> Yield c i
+    where desc = "Data.Text.Encoding.Fusion." ++ func ++ ": Invalid " ++
+                 kind ++ " stream"

Data/Text/Lazy/Encoding.hs

     -- * Decoding ByteStrings to Text
     --  decodeASCII
       decodeUtf8
+    , decodeUtf8With
     --, decodeUtf16LE
     --, decodeUtf16BE
     --, decodeUtf32LE
     ) where
 
 import Data.ByteString.Lazy (ByteString)
+import Data.Text.Encoding.Error (OnDecodeError, strictDecode)
 import Data.Text.Lazy (Text)
 import qualified Data.Text.Lazy.Fusion as F
 import qualified Data.Text.Lazy.Encoding.Fusion as E
 
 -- | Decode a 'ByteString' containing UTF-8 encoded text.
+decodeUtf8With :: OnDecodeError -> ByteString -> Text
+decodeUtf8With onErr bs = F.unstream (E.streamUtf8 onErr bs)
+{-# INLINE decodeUtf8With #-}
+
+-- | Decode a 'ByteString' containing UTF-8 encoded text.
 decodeUtf8 :: ByteString -> Text
-decodeUtf8 bs = F.unstream (E.streamUtf8 bs)
+decodeUtf8 = decodeUtf8With strictDecode
 {-# INLINE decodeUtf8 #-}
 
 -- | Encode text using UTF-8 encoding.

Data/Text/Lazy/Encoding/Fusion.hs

-{-# LANGUAGE BangPatterns #-}
+{-# LANGUAGE BangPatterns, Rank2Types #-}
 
 -- |
 -- Module      : Data.Text.Lazy.Encoding.Fusion
 import qualified Data.ByteString as B
 import qualified Data.ByteString.Unsafe as B
 import Data.Text.Encoding.Fusion.Common
+import Data.Text.Encoding.Error
 import Data.Text.Fusion (Step(..), Stream(..))
 import Data.Text.Fusion.Internal (M(..), PairS(..), S(..))
 import Data.Text.UnsafeChar (unsafeChr8)
 
 -- | /O(n)/ Convert a lazy 'ByteString' into a 'Stream Char', using
 -- UTF-8 encoding.
-streamUtf8 :: ByteString -> Stream Char
-streamUtf8 bs0 = Stream next (bs0 :!: empty :!: 0) unknownLength
+streamUtf8 :: OnDecodeError -> ByteString -> Stream Char
+streamUtf8 onErr bs0 = Stream next (bs0 :!: empty :!: 0) unknownLength
     where
       empty = S N N N N
       {-# INLINE next #-}
          where es = bs :!: empty :!: i
       {-# INLINE consume #-}
       consume (bs@(Chunk ps rest) :!: s :!: i)
-          | i >= len    = consume (rest :!: s  :!: 0)
-          | otherwise   = next    (bs   :!: s' :!: i+1)
-          where s' = case s of
-                       S N _ _ _ -> S x N N N
-                       S a N _ _ -> S a x N N
-                       S a b N _ -> S a b x N
-                       S a b c N -> S a b c x
-                       _         -> encodingError "streamUtf8" "UTF-8"
-                x   = J (B.unsafeIndex ps i)
-                len = B.length ps
+          | i >= B.length ps = consume (rest :!: s  :!: 0)
+          | otherwise =
+        case s of
+          S N _ _ _ -> next (bs :!: S x N N N :!: i+1)
+          S a N _ _ -> next (bs :!: S a x N N :!: i+1)
+          S a b N _ -> next (bs :!: S a b x N :!: i+1)
+          S a b c N -> next (bs :!: S a b c x :!: i+1)
+          S (J a) b c d -> decodeError "streamUtf8" "UTF-8" onErr (Just a)
+                           (bs :!: S b c d N :!: i+1)
+          where x = J (B.unsafeIndex ps i)
       consume (Empty :!: S N _ _ _ :!: _) = Done
-      consume _ = encodingError "streamUtf8" "UTF-8"
+      consume st = decodeError "streamUtf8" "UTF-8" onErr Nothing st
 {-# INLINE [0] streamUtf8 #-}
 
 -- | /O(n)/ Convert a 'Stream' 'Word8' to a lazy 'ByteString'.
 unstream :: Stream Word8 -> ByteString
 unstream = unstreamChunks defaultChunkSize
 
-encodingError :: String -> String -> a
-encodingError func encoding =
-    error $ "Data.Text.Lazy.Encoding.Fusion." ++ func ++ ": Bad " ++
-            encoding ++ " stream"
+decodeError :: forall s. String -> String -> OnDecodeError -> Maybe Word8
+            -> s -> Step s Char
+decodeError func kind onErr mb i =
+    case onErr desc mb of
+      Nothing -> Skip i
+      Just c  -> Yield c i
+    where desc = "Data.Text.Lazy.Encoding.Fusion." ++ func ++ ": Invalid " ++
+                 kind ++ " stream"
   exposed-modules:
     Data.Text
     Data.Text.Encoding
+    Data.Text.Encoding.Error
     Data.Text.Encoding.Fusion
     Data.Text.Foreign
     Data.Text.Fusion
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.