John Millikin avatar John Millikin committed 9a8885a

Add lazy decoders for UTF-16 and UTF-32.

Comments (0)

Files changed (3)

Data/Text/Lazy/Encoding.hs

     -- * Decoding ByteStrings to Text
       decodeASCII
     , decodeUtf8
+    , decodeUtf16LE
+    , decodeUtf16BE
+    , decodeUtf32LE
+    , decodeUtf32BE
+    -- ** Controllable error handling
     , decodeUtf8With
-    --, decodeUtf16LE
-    --, decodeUtf16BE
-    --, decodeUtf32LE
-    --, decodeUtf32BE
+    , decodeUtf16LEWith
+    , decodeUtf16BEWith
+    , decodeUtf32LEWith
+    , decodeUtf32BEWith
 
     -- * Encoding Text to ByteStrings
     , encodeUtf8
 encodeUtf8 txt = E.unstream (E.restreamUtf8 (F.stream txt))
 {-# INLINE encodeUtf8 #-}
 
+-- | Decode text from little endian UTF-16 encoding.
+decodeUtf16LEWith :: OnDecodeError -> B.ByteString -> Text
+decodeUtf16LEWith onErr bs = F.unstream (E.streamUtf16LE onErr bs)
+{-# INLINE decodeUtf16LEWith #-}
+
+-- | Decode text from little endian UTF-16 encoding.
+decodeUtf16LE :: B.ByteString -> Text
+decodeUtf16LE = decodeUtf16LEWith strictDecode
+{-# INLINE decodeUtf16LE #-}
+
+-- | Decode text from big endian UTF-16 encoding.
+decodeUtf16BEWith :: OnDecodeError -> B.ByteString -> Text
+decodeUtf16BEWith onErr bs = F.unstream (E.streamUtf16BE onErr bs)
+{-# INLINE decodeUtf16BEWith #-}
+
+-- | Decode text from big endian UTF-16 encoding.
+decodeUtf16BE :: B.ByteString -> Text
+decodeUtf16BE = decodeUtf16BEWith strictDecode
+{-# INLINE decodeUtf16BE #-}
+
 -- | Encode text using little endian UTF-16 encoding.
 encodeUtf16LE :: Text -> B.ByteString
 encodeUtf16LE txt = B.fromChunks (foldrChunks ((:) . TE.encodeUtf16LE) [] txt)
 encodeUtf16BE txt = B.fromChunks (foldrChunks ((:) . TE.encodeUtf16BE) [] txt)
 {-# INLINE encodeUtf16BE #-}
 
+-- | Decode text from little endian UTF-32 encoding.
+decodeUtf32LEWith :: OnDecodeError -> B.ByteString -> Text
+decodeUtf32LEWith onErr bs = F.unstream (E.streamUtf32LE onErr bs)
+{-# INLINE decodeUtf32LEWith #-}
+
+-- | Decode text from little endian UTF-32 encoding.
+decodeUtf32LE :: B.ByteString -> Text
+decodeUtf32LE = decodeUtf32LEWith strictDecode
+{-# INLINE decodeUtf32LE #-}
+
+-- | Decode text from big endian UTF-32 encoding.
+decodeUtf32BEWith :: OnDecodeError -> B.ByteString -> Text
+decodeUtf32BEWith onErr bs = F.unstream (E.streamUtf32BE onErr bs)
+{-# INLINE decodeUtf32BEWith #-}
+
+-- | Decode text from big endian UTF-32 encoding.
+decodeUtf32BE :: B.ByteString -> Text
+decodeUtf32BE = decodeUtf32BEWith strictDecode
+{-# INLINE decodeUtf32BE #-}
+
 -- | Encode text using little endian UTF-32 encoding.
 encodeUtf32LE :: Text -> B.ByteString
 encodeUtf32LE txt = B.fromChunks (foldrChunks ((:) . TE.encodeUtf32LE) [] txt)

Data/Text/Lazy/Encoding/Fusion.hs

     (
     -- * Streaming
     --  streamASCII
-     streamUtf8
-    --, streamUtf16LE
-    --, streamUtf16BE
-    --, streamUtf32LE
-    --, streamUtf32BE
+      streamUtf8
+    , streamUtf16LE
+    , streamUtf16BE
+    , streamUtf32LE
+    , streamUtf32BE
 
     -- * Unstreaming
     , unstream
 import Data.Text.Encoding.Error
 import Data.Text.Fusion (Step(..), Stream(..))
 import Data.Text.Fusion.Size
-import Data.Text.UnsafeChar (unsafeChr8)
-import Data.Word (Word8)
+import Data.Text.UnsafeChar (unsafeChr, unsafeChr8, unsafeChr32)
+import Data.Text.UnsafeShift (shiftL)
+import Data.Word (Word8, Word16, Word32)
 import qualified Data.Text.Encoding.Utf8 as U8
+import qualified Data.Text.Encoding.Utf16 as U16
+import qualified Data.Text.Encoding.Utf32 as U32
 import System.IO.Unsafe (unsafePerformIO)
 import Foreign.ForeignPtr (withForeignPtr, ForeignPtr)
 import Foreign.Storable (pokeByteOff)
     consume st             = decodeError "streamUtf8" "UTF-8" onErr Nothing st
 {-# INLINE [0] streamUtf8 #-}
 
+-- | /O(n)/ Convert a 'ByteString' into a 'Stream Char', using little
+-- endian UTF-16 encoding.
+streamUtf16LE :: OnDecodeError -> ByteString -> Stream Char
+streamUtf16LE onErr bs0 = Stream next (T bs0 S0 0) unknownSize
+  where
+    next (T bs@(Chunk ps _) S0 i)
+      | i + 1 < len && U16.validate1 x1 =
+          Yield (unsafeChr x1)         (T bs S0 (i+2))
+      | i + 3 < len && U16.validate2 x1 x2 =
+          Yield (U16.chr2 x1 x2)       (T bs S0 (i+4))
+      where len = B.length ps
+            x1   = c (idx  i)      (idx (i + 1))
+            x2   = c (idx (i + 2)) (idx (i + 3))
+            c w1 w2 = w1 + (w2 `shiftL` 8)
+            idx = fromIntegral . B.unsafeIndex ps :: Int -> Word16
+    next st@(T bs s i) =
+      case s of
+        S2 w1 w2       | U16.validate1 (c w1 w2)           ->
+          Yield (unsafeChr (c w1 w2))   es
+        S4 w1 w2 w3 w4 | U16.validate2 (c w1 w2) (c w3 w4) ->
+          Yield (U16.chr2 (c w1 w2) (c w3 w4)) es
+        _ -> consume st
+       where es = T bs S0 i
+             c :: Word8 -> Word8 -> Word16
+             c w1 w2 = fromIntegral w1 + (fromIntegral w2 `shiftL` 8)
+    consume (T bs@(Chunk ps rest) s i)
+        | i >= B.length ps = consume (T rest s 0)
+        | otherwise =
+      case s of
+        S0             -> next (T bs (S1 x)          (i+1))
+        S1 w1          -> next (T bs (S2 w1 x)       (i+1))
+        S2 w1 w2       -> next (T bs (S3 w1 w2 x)    (i+1))
+        S3 w1 w2 w3    -> next (T bs (S4 w1 w2 w3 x) (i+1))
+        S4 w1 w2 w3 w4 -> decodeError "streamUtf16LE" "UTF-16LE" onErr (Just w1)
+                           (T bs (S3 w2 w3 w4)       (i+1))
+        where x = B.unsafeIndex ps i
+    consume (T Empty S0 _) = Done
+    consume st             = decodeError "streamUtf16LE" "UTF-16LE" onErr Nothing st
+{-# INLINE [0] streamUtf16LE #-}
+
+-- | /O(n)/ Convert a 'ByteString' into a 'Stream Char', using big
+-- endian UTF-16 encoding.
+streamUtf16BE :: OnDecodeError -> ByteString -> Stream Char
+streamUtf16BE onErr bs0 = Stream next (T bs0 S0 0) unknownSize
+  where
+    next (T bs@(Chunk ps _) S0 i)
+      | i + 1 < len && U16.validate1 x1 =
+          Yield (unsafeChr x1)         (T bs S0 (i+2))
+      | i + 3 < len && U16.validate2 x1 x2 =
+          Yield (U16.chr2 x1 x2)       (T bs S0 (i+4))
+      where len = B.length ps
+            x1   = c (idx  i)      (idx (i + 1))
+            x2   = c (idx (i + 2)) (idx (i + 3))
+            c w1 w2 = (w1 `shiftL` 8) + w2
+            idx = fromIntegral . B.unsafeIndex ps :: Int -> Word16
+    next st@(T bs s i) =
+      case s of
+        S2 w1 w2       | U16.validate1 (c w1 w2)           ->
+          Yield (unsafeChr (c w1 w2))   es
+        S4 w1 w2 w3 w4 | U16.validate2 (c w1 w2) (c w3 w4) ->
+          Yield (U16.chr2 (c w1 w2) (c w3 w4)) es
+        _ -> consume st
+       where es = T bs S0 i
+             c :: Word8 -> Word8 -> Word16
+             c w1 w2 = (fromIntegral w1 `shiftL` 8) + fromIntegral w2
+    consume (T bs@(Chunk ps rest) s i)
+        | i >= B.length ps = consume (T rest s 0)
+        | otherwise =
+      case s of
+        S0             -> next (T bs (S1 x)          (i+1))
+        S1 w1          -> next (T bs (S2 w1 x)       (i+1))
+        S2 w1 w2       -> next (T bs (S3 w1 w2 x)    (i+1))
+        S3 w1 w2 w3    -> next (T bs (S4 w1 w2 w3 x) (i+1))
+        S4 w1 w2 w3 w4 -> decodeError "streamUtf16BE" "UTF-16BE" onErr (Just w1)
+                           (T bs (S3 w2 w3 w4)       (i+1))
+        where x = B.unsafeIndex ps i
+    consume (T Empty S0 _) = Done
+    consume st             = decodeError "streamUtf16BE" "UTF-16BE" onErr Nothing st
+{-# INLINE [0] streamUtf16BE #-}
+
+-- | /O(n)/ Convert a 'ByteString' into a 'Stream Char', using big
+-- endian UTF-32 encoding.
+streamUtf32BE :: OnDecodeError -> ByteString -> Stream Char
+streamUtf32BE onErr bs0 = Stream next (T bs0 S0 0) unknownSize
+  where
+    next (T bs@(Chunk ps _) S0 i)
+      | i + 3 < len && U32.validate x =
+          Yield (unsafeChr32 x)       (T bs S0 (i+4))
+      where len = B.length ps
+            x = shiftL x1 24 + shiftL x2 16 + shiftL x3 8 + x4
+            x1    = idx i
+            x2    = idx (i+1)
+            x3    = idx (i+2)
+            x4    = idx (i+3)
+            idx = fromIntegral . B.unsafeIndex ps :: Int -> Word32
+    next st@(T bs s i) =
+      case s of
+        S4 w1 w2 w3 w4 | U32.validate (c w1 w2 w3 w4) ->
+          Yield (unsafeChr32 (c w1 w2 w3 w4)) es
+        _ -> consume st
+       where es = T bs S0 i
+             c :: Word8 -> Word8 -> Word8 -> Word8 -> Word32
+             c w1 w2 w3 w4 = shifted
+              where
+               shifted = shiftL x1 24 + shiftL x2 16 + shiftL x3 8 + x4
+               x1 = fromIntegral w1
+               x2 = fromIntegral w2
+               x3 = fromIntegral w3
+               x4 = fromIntegral w4
+    consume (T bs@(Chunk ps rest) s i)
+        | i >= B.length ps = consume (T rest s 0)
+        | otherwise =
+      case s of
+        S0             -> next (T bs (S1 x)          (i+1))
+        S1 w1          -> next (T bs (S2 w1 x)       (i+1))
+        S2 w1 w2       -> next (T bs (S3 w1 w2 x)    (i+1))
+        S3 w1 w2 w3    -> next (T bs (S4 w1 w2 w3 x) (i+1))
+        S4 w1 w2 w3 w4 -> decodeError "streamUtf32BE" "UTF-32BE" onErr (Just w1)
+                           (T bs (S3 w2 w3 w4)       (i+1))
+        where x = B.unsafeIndex ps i
+    consume (T Empty S0 _) = Done
+    consume st             = decodeError "streamUtf32BE" "UTF-32BE" onErr Nothing st
+{-# INLINE [0] streamUtf32BE #-}
+
+-- | /O(n)/ Convert a 'ByteString' into a 'Stream Char', using little
+-- endian UTF-32 encoding.
+streamUtf32LE :: OnDecodeError -> ByteString -> Stream Char
+streamUtf32LE onErr bs0 = Stream next (T bs0 S0 0) unknownSize
+  where
+    next (T bs@(Chunk ps _) S0 i)
+      | i + 3 < len && U32.validate x =
+          Yield (unsafeChr32 x)       (T bs S0 (i+4))
+      where len = B.length ps
+            x = shiftL x4 24 + shiftL x3 16 + shiftL x2 8 + x1
+            x1    = idx i
+            x2    = idx (i+1)
+            x3    = idx (i+2)
+            x4    = idx (i+3)
+            idx = fromIntegral . B.unsafeIndex ps :: Int -> Word32
+    next st@(T bs s i) =
+      case s of
+        S4 w1 w2 w3 w4 | U32.validate (c w1 w2 w3 w4) ->
+          Yield (unsafeChr32 (c w1 w2 w3 w4)) es
+        _ -> consume st
+       where es = T bs S0 i
+             c :: Word8 -> Word8 -> Word8 -> Word8 -> Word32
+             c w1 w2 w3 w4 = shifted
+              where
+               shifted = shiftL x4 24 + shiftL x3 16 + shiftL x2 8 + x1
+               x1 = fromIntegral w1
+               x2 = fromIntegral w2
+               x3 = fromIntegral w3
+               x4 = fromIntegral w4
+    consume (T bs@(Chunk ps rest) s i)
+        | i >= B.length ps = consume (T rest s 0)
+        | otherwise =
+      case s of
+        S0             -> next (T bs (S1 x)          (i+1))
+        S1 w1          -> next (T bs (S2 w1 x)       (i+1))
+        S2 w1 w2       -> next (T bs (S3 w1 w2 x)    (i+1))
+        S3 w1 w2 w3    -> next (T bs (S4 w1 w2 w3 x) (i+1))
+        S4 w1 w2 w3 w4 -> decodeError "streamUtf32LE" "UTF-32LE" onErr (Just w1)
+                           (T bs (S3 w2 w3 w4)       (i+1))
+        where x = B.unsafeIndex ps i
+    consume (T Empty S0 _) = Done
+    consume st             = decodeError "streamUtf32LE" "UTF-32LE" onErr Nothing st
+{-# INLINE [0] streamUtf32LE #-}
+
 -- | /O(n)/ Convert a 'Stream' 'Word8' to a lazy 'ByteString'.
 unstreamChunks :: Int -> Stream Word8 -> ByteString
 unstreamChunks chunkSize (Stream next s0 len0) = chunk s0 (upperBound 4 len0)

tests/Properties.hs

 t_utf8              = forAll genUnicode $ (E.decodeUtf8 . E.encodeUtf8) `eq` id
 tl_utf8             = forAll genUnicode $ (EL.decodeUtf8 . EL.encodeUtf8) `eq` id
 t_utf16LE           = forAll genUnicode $ (E.decodeUtf16LE . E.encodeUtf16LE) `eq` id
+tl_utf16LE          = forAll genUnicode $ (EL.decodeUtf16LE . EL.encodeUtf16LE) `eq` id
 t_utf16BE           = forAll genUnicode $ (E.decodeUtf16BE . E.encodeUtf16BE) `eq` id
+tl_utf16BE          = forAll genUnicode $ (EL.decodeUtf16BE . EL.encodeUtf16BE) `eq` id
 t_utf32LE           = forAll genUnicode $ (E.decodeUtf32LE . E.encodeUtf32LE) `eq` id
+tl_utf32LE          = forAll genUnicode $ (EL.decodeUtf32LE . EL.encodeUtf32LE) `eq` id
 t_utf32BE           = forAll genUnicode $ (E.decodeUtf32BE . E.encodeUtf32BE) `eq` id
+tl_utf32BE          = forAll genUnicode $ (EL.decodeUtf32BE . EL.encodeUtf32BE) `eq` id
 
 class Stringy s where
     packS    :: String -> s
     testProperty "t_utf8" t_utf8,
     testProperty "tl_utf8" tl_utf8,
     testProperty "t_utf16LE" t_utf16LE,
+    testProperty "tl_utf16LE" tl_utf16LE,
     testProperty "t_utf16BE" t_utf16BE,
+    testProperty "tl_utf16BE" tl_utf16BE,
     testProperty "t_utf32LE" t_utf32LE,
-    testProperty "t_utf32BE" t_utf32BE
+    testProperty "tl_utf32LE" tl_utf32LE,
+    testProperty "t_utf32BE" t_utf32BE,
+    testProperty "tl_utf32BE" tl_utf32BE
   ],
 
   testGroup "instances" [
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.