Commits

Bryan O'Sullivan committed 78ee8fe Merge

Merge pull request #55 from bgamari/incremental

Introduce incremental UTF8 decoding interface

  • Participants
  • Parent commits 2e00b23, 62f78b4

Comments (0)

Files changed (3)

File Data/Text/Encoding.hs

     , decodeUtf32LEWith
     , decodeUtf32BEWith
 
+    -- ** Streaming decoding with controllable error handling
+    , decodeUtf8With'
+    , Decoder(..)
+
     -- * Encoding Text to ByteStrings
     , encodeUtf8
     , encodeUtf16LE
 #else
 import Control.Monad.ST (unsafeIOToST, unsafeSTToIO)
 #endif
+import Control.Monad.ST (runST)
 import Data.Bits ((.&.))
 import Data.ByteString as B
 import Data.ByteString.Internal as B
 import Data.Text.Private (runText)
 import Data.Text.UnsafeChar (ord, unsafeWrite)
 import Data.Text.UnsafeShift (shiftL, shiftR)
-import Data.Word (Word8)
+import Data.Word (Word8, Word32)
 import Foreign.C.Types (CSize)
 import Foreign.ForeignPtr (withForeignPtr)
 import Foreign.Marshal.Utils (with)
   desc = "Data.Text.Encoding.decodeUtf8: Invalid UTF-8 stream"
 {- INLINE[0] decodeUtf8With #-}
 
+data Decoder = Some !Text (ByteString -> Decoder)
+
+-- | (codepoint, state)
+type DecoderState = (Word32, Word32)
+
+decodeUtf8With' :: OnDecodeError -> ByteString -> Decoder
+decodeUtf8With' onErr = decodeChunk (0,0)
+ where
+  -- We create a slightly larger than necessary buffer to accomodate a
+  -- potential surrogate pair started in the last buffer
+  decodeChunk :: DecoderState -> ByteString -> Decoder
+  decodeChunk (codepoint0,state0) (PS fp off len) =
+    runST $ (unsafeIOToST . decodeChunkToBuffer) =<< A.new (len+1)
+   where
+    decodeChunkToBuffer :: A.MArray s -> IO Decoder
+    decodeChunkToBuffer dest = withForeignPtr fp $ \ptr ->
+      with (0::CSize) $ \destOffPtr ->
+      with codepoint0 $ \codepointPtr ->
+      with state0 $ \statePtr ->
+        let end = ptr `plusPtr` (off + len)
+            loop curPtr = do
+              curPtr' <- c_decode_utf8_with_state (A.maBA dest) destOffPtr curPtr end
+                                                  codepointPtr statePtr
+              state <- peek statePtr
+              case state of
+                12 -> do
+                  -- We encountered an encoding error
+                  x <- peek curPtr'
+                  case onErr desc (Just x) of
+                    Nothing -> loop $ curPtr' `plusPtr` 1
+                    Just c -> do
+                      destOff <- peek destOffPtr
+                      w <- unsafeSTToIO $
+                           unsafeWrite dest (fromIntegral destOff) (safe c)
+                      poke destOffPtr (destOff + fromIntegral w)
+                      poke statePtr 0
+                      loop $ curPtr' `plusPtr` 1
+ 
+                _ -> do
+                  -- We encountered the end of the buffer while decoding
+                  n <- peek destOffPtr
+                  codepoint <- peek codepointPtr
+                  chunkText <- unsafeSTToIO $ do
+                      arr <- A.unsafeFreeze dest
+                      return $! textP arr 0 (fromIntegral n)
+                  return $ Some chunkText $ decodeChunk (codepoint, state)
+        in loop (ptr `plusPtr` off)
+  desc = "Data.Text.Encoding.decodeUtf8With': Invalid UTF-8 stream"
+{- INLINE[0] decodeUtf8With' #-}
+
 -- | Decode a 'ByteString' containing UTF-8 encoded text that is known
 -- to be valid.
 --
     :: MutableByteArray# s -> Ptr CSize
     -> Ptr Word8 -> Ptr Word8 -> IO (Ptr Word8)
 
+foreign import ccall unsafe "_hs_text_decode_utf8_state" c_decode_utf8_with_state
+    :: MutableByteArray# s -> Ptr CSize
+    -> Ptr Word8 -> Ptr Word8
+    -> Ptr Word32 -> Ptr Word32 -> IO (Ptr Word8)
+
 foreign import ccall unsafe "_hs_text_decode_latin1" c_decode_latin1
     :: MutableByteArray# s -> Ptr Word8 -> Ptr Word8 -> IO ()

File cbits/cbits.c

  * the start of an invalid byte sequence.
  *
  * At exit, updates *destoff with the next offset to write to, and
- * returns the next source offset to read from.
+ * returns the next source offset to read from. Moreover, this function
+ * exposes the internal decoder state (state0 and codepoint0), allowing one
+ * to restart the decoder after it terminates (say, due to a partial codepoint).
+ *
+ * In particular, there are a few possible outcomes,
+ *
+ *   1) We decoded the buffer entirely:
+ *      In this case we return srcend
+ *      state0 == UTF8_ACCEPT
+ *
+ *   2) We met an invalid encoding
+ *      In this case we return the address of the first invalid byte
+ *      state0 == UTF8_REJECT
+ *
+ *   3) We reached the end of the buffer while decoding a codepoint
+ *      In this case we return a pointer to the first byte of the partial codepoint
+ *      state0 != UTF8_ACCEPT, UTF8_REJECT
+ *
  */
-uint8_t const *
-_hs_text_decode_utf8(uint16_t *dest, size_t *destoff,
-		     const uint8_t const *src, const uint8_t const *srcend)
+const uint8_t *
+_hs_text_decode_utf8_state(uint16_t *const dest, size_t *destoff,
+                           const uint8_t *const src, const uint8_t *const srcend,
+                           uint32_t *codepoint0, uint32_t *state0)
 {
   uint16_t *d = dest + *destoff;
-  const uint8_t const *s = src;
-  uint32_t state = UTF8_ACCEPT;
+  const uint8_t *s = src;
+  uint32_t state = *state0;
+  uint32_t codepoint = *codepoint0;
 
   while (s < srcend) {
-    uint32_t codepoint;
-
 #if defined(__i386__) || defined(__x86_64__)
     /*
      * This code will only work on a little-endian system that
     }
   }
 
-  /* Error recovery - if we're not in a valid finishing state, back up. */
-  if (state != UTF8_ACCEPT)
+  /* Invalid encoding, back up to the errant character */
+  if (state == UTF8_REJECT)
     s -= 1;
 
   *destoff = d - dest;
+  *codepoint0 = codepoint;
+  *state0 = state;
 
   return s;
 }
+
+/*
+ * Helper to decode buffer and discard final decoder state
+ */
+const uint8_t *
+_hs_text_decode_utf8(uint16_t *const dest, size_t *destoff,
+                     const uint8_t *const src, const uint8_t *const srcend)
+{
+  uint32_t codepoint;
+  uint32_t state = UTF8_ACCEPT;
+  return _hs_text_decode_utf8_state(dest, destoff, src, srcend, &codepoint, &state);
+}

File tests/Tests/Properties.hs

 t_utf32BE    = forAll genUnicode $ (E.decodeUtf32BE . E.encodeUtf32BE) `eq` id
 tl_utf32BE   = forAll genUnicode $ (EL.decodeUtf32BE . EL.encodeUtf32BE) `eq` id
 
+t_utf8_incr  = do
+        Positive n <- arbitrary
+        forAll genUnicode $ recode n `eq` id
+    where recode n = T.concat . feedChunksOf n (E.decodeUtf8With' strictDecode) . E.encodeUtf8
+          feedChunksOf :: Int -> (B.ByteString -> E.Decoder) -> B.ByteString -> [T.Text]
+          feedChunksOf n f bs
+            | B.null bs  = []
+            | otherwise  = let (a,b) = B.splitAt n bs
+                               E.Some t f' = f a
+                           in t : feedChunksOf n f' b
+
 -- This is a poor attempt to ensure that the error handling paths on
 -- decode are exercised in some way.  Proper testing would be rather
 -- more involved.
       testProperty "tl_latin1" tl_latin1,
       testProperty "t_utf8" t_utf8,
       testProperty "t_utf8'" t_utf8',
+      testProperty "t_utf8_incr" t_utf8_incr,
       testProperty "tl_utf8" tl_utf8,
       testProperty "tl_utf8'" tl_utf8',
       testProperty "t_utf16LE" t_utf16LE,