Bryan O'Sullivan avatar Bryan O'Sullivan committed 54ee4c2

Add a faster lazy UTF-8 decode (roughly 2x)

Comments (0)

Files changed (2)

Data/Text/Lazy/Encoding.hs

+{-# LANGUAGE BangPatterns #-}
 -- |
 -- Module      : Data.Text.Lazy.Encoding
 -- Copyright   : (c) 2009, 2010 Bryan O'Sullivan
     -- * Decoding ByteStrings to Text
       decodeASCII
     , decodeUtf8
+    , decodeUtf8'
     , decodeUtf16LE
     , decodeUtf16BE
     , decodeUtf32LE
     , decodeUtf32BE
     -- ** Controllable error handling
     , decodeUtf8With
+    , decodeUtf8With'
     , decodeUtf16LEWith
     , decodeUtf16BEWith
     , decodeUtf32LEWith
     , encodeUtf32BE
     ) where
 
+import Data.Bits ((.&.))
 import Data.Text.Encoding.Error (OnDecodeError, strictDecode)
-import Data.Text.Lazy.Internal (Text(..), chunk, foldrChunks)
+import Data.Text.Lazy.Internal (Text(..), chunk, empty, foldrChunks)
+import qualified Data.ByteString as S
 import qualified Data.ByteString.Lazy as B
 import qualified Data.ByteString.Lazy.Internal as B
+import qualified Data.ByteString.Unsafe as S
+import qualified Data.Text as T
 import qualified Data.Text.Encoding as TE
 import qualified Data.Text.Lazy.Encoding.Fusion as E
 import qualified Data.Text.Lazy.Fusion as F
 
 -- | Decode a 'ByteString' containing 7-bit ASCII encoded text.
 decodeASCII :: B.ByteString -> Text
-decodeASCII bs = foldr (chunk . TE.decodeASCII) Empty (B.toChunks bs)
+decodeASCII bs = foldr (chunk . TE.decodeASCII) empty (B.toChunks bs)
 {-# INLINE decodeASCII #-}
 
 -- | Decode a 'ByteString' containing UTF-8 encoded text.
 decodeUtf8 = decodeUtf8With strictDecode
 {-# INLINE decodeUtf8 #-}
 
+-- | Decode a 'ByteString' containing UTF-8 encoded text.
+decodeUtf8With' :: OnDecodeError -> B.ByteString -> Text
+decodeUtf8With' onErr bs0 = fast bs0
+  where
+    decode = TE.decodeUtf8With onErr
+    fast (B.Chunk p ps) | isComplete p = chunk (decode p) (fast ps)
+                        | otherwise    = chunk (decode h) (slow t ps)
+      where (h,t) = S.splitAt pivot p
+            pivot | at 1      = len-1
+                  | at 2      = len-2
+                  | otherwise = len-3
+            len  = S.length p
+            at n = len >= n && S.unsafeIndex p (len-n) .&. 0xc0 == 0xc0
+    fast B.Empty = empty
+    slow i bs = {-# SCC "decodeUtf8With'/slow" #-}
+                case B.uncons bs of
+                  Just (w,bs') | isComplete i' -> chunk (decode i') (fast bs')
+                               | otherwise     -> slow i' bs'
+                    where i' = S.snoc i w
+                  Nothing -> case S.uncons i of
+                               Just (j,i') ->
+                                 case onErr desc (Just j) of
+                                   Nothing -> slow i' bs
+                                   Just c  -> Chunk (T.singleton c) (slow i' bs)
+                               Nothing ->
+                                 case onErr desc Nothing of
+                                   Nothing -> empty
+                                   Just c  -> Chunk (T.singleton c) empty
+    isComplete bs = {-# SCC "decodeUtf8With'/isComplete" #-}
+                    ix 1 .&. 0x80 == 0 ||
+                    (len >= 2 && ix 2 .&. 0xe0 == 0xc0) ||
+                    (len >= 3 && ix 3 .&. 0xf0 == 0xe0) ||
+                    (len >= 4 && ix 4 .&. 0xf8 == 0xf0)
+      where len = S.length bs
+            ix n = S.unsafeIndex bs (len-n)
+    desc = "Data.Text.Lazy.Encoding.decodeUtf8With: Invalid UTF-8 stream"
+{-# INLINE[0] decodeUtf8With' #-}
+
+-- | Decode a 'ByteString' containing UTF-8 encoded text.
+decodeUtf8' :: B.ByteString -> Text
+decodeUtf8' = decodeUtf8With' strictDecode
+{-# INLINE[0] decodeUtf8' #-}
+
+-- This rule seems to cause performance loss.
+{- RULES "LAZY STREAM stream/decodeUtf8' fusion" [1]
+   forall bs. F.stream (decodeUtf8' bs) = E.streamUtf8 strictDecode bs #-}
+
 encodeUtf8 :: Text -> B.ByteString
 encodeUtf8 (Chunk c cs) = B.Chunk (TE.encodeUtf8 c) (encodeUtf8 cs)
 encodeUtf8 Empty        = B.Empty

tests/benchmarks/DecodeUtf8.hs

   bs <- BL.hGetContents h
   rnf (TL.decodeUtf8 bs) `seq` return ()
 
+lazy_ h = do
+  bs <- BL.hGetContents h
+  rnf (TL.decodeUtf8' bs) `seq` return ()
+
 lazy_len h = do
   bs <- BL.hGetContents h
   print . TL.length . TL.decodeUtf8 $ bs
 
+lazy__len h = do
+  bs <- BL.hGetContents h
+  print . TL.length . TL.decodeUtf8' $ bs
+
+lazy_init_len h = do
+  bs <- BL.hGetContents h
+  print . TL.length . TL.init . TL.decodeUtf8 $ bs
+
+lazy__init_len h = do
+  bs <- BL.hGetContents h
+  print . TL.length . TL.init . TL.decodeUtf8' $ bs
+
 lazy_io h = do
   hSetEncoding h utf8
   t <- TL.hGetContents h
     "strict_len_io" -> strict_len_io h
     "lazy" -> lazy h
     "lazy_len" -> lazy_len h
+    "lazy_init_len" -> lazy_init_len h
+    "lazy__init_len" -> lazy__init_len h
+    "lazy_" -> lazy_ h
+    "lazy__len" -> lazy__len h
     "lazy_io" -> lazy_io h
     "lazy_len_io" -> lazy_len_io h
     "string" -> string h
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.