Commits

Bryan O'Sullivan committed 5153e29

Decode lazy UTF-8 ByteStrings

Comments (0)

Files changed (2)

Data/Text/Lazy/Encoding.hs

     (
     -- * Decoding ByteStrings to Text
     --  decodeASCII
-    --, decodeUtf8
+      decodeUtf8
     --, decodeUtf16LE
     --, decodeUtf16BE
     --, decodeUtf32LE
     ) where
 
 import Data.ByteString.Lazy (ByteString)
+import Data.Text.Lazy (Text)
+import qualified Data.Text.Lazy.Fusion as F
+import qualified Data.Text.Lazy.Encoding.Fusion as E
+
+-- | Decode a 'ByteString' containing UTF-8 encoded text.
+decodeUtf8 :: ByteString -> Text
+decodeUtf8 bs = F.unstream (E.streamUtf8 bs)
+{-# INLINE decodeUtf8 #-}

Data/Text/Lazy/Encoding/Fusion.hs

     (
     -- * Streaming
     --  streamASCII
-    --, streamUtf8
+     streamUtf8
     --, streamUtf16LE
     --, streamUtf16BE
     --, streamUtf32LE
     -- * Unstreaming
     --, unstream
 
-      module Data.Text.Encoding.Fusion.Common
+    , module Data.Text.Encoding.Fusion.Common
     ) where
 
+import Data.ByteString.Lazy.Internal (ByteString(..))
+import qualified Data.ByteString as B
+import qualified Data.ByteString.Unsafe as B
 import Data.Text.Encoding.Fusion.Common
+import Data.Text.Fusion (Step(..), Stream(..))
+import Data.Text.Fusion.Internal (PairS(..))
+import Data.Text.UnsafeChar (unsafeChr, unsafeChr8, unsafeChr32)
+import qualified Data.Text.Encoding.Utf8 as U8
+
+unknownLength :: Int
+unknownLength = 4
+
+-- | /O(n)/ Convert a 'ByteString' into a 'Stream Char', using UTF-8
+-- encoding.
+streamUtf8 :: ByteString -> Stream Char
+streamUtf8 bs0 = Stream next (bs0 :!: 0) unknownLength
+    where
+      {-# INLINE next #-}
+      next (c@(Chunk bs rest) :!: i)
+          | i >= l = next (rest :!: 0)
+          | U8.validate1 x1 = Yield (unsafeChr8 x1) (c :!: i+1)
+          | i+1 < l && U8.validate2 x1 x2 = Yield (U8.chr2 x1 x2) (c :!: i+2)
+          | i+2 < l && U8.validate3 x1 x2 x3 = Yield (U8.chr3 x1 x2 x3) (c :!: i+3)
+          | i+3 < l && U8.validate4 x1 x2 x3 x4 = Yield (U8.chr4 x1 x2 x3 x4) (c :!: i+4)
+          | otherwise = encodingError "UTF-8"
+          where
+            x1 = idx i
+            x2 = idx (i + 1)
+            x3 = idx (i + 2)
+            x4 = idx (i + 3)
+            idx = B.unsafeIndex bs
+            l = B.length bs
+      next (Empty :!: _) = Done
+{-# INLINE [0] streamUtf8 #-}
+
+encodingError :: String -> a
+encodingError encoding =
+    error $ "Data.Text.Lazy.Encoding.Fusion: Bad " ++ encoding ++ " stream"