Bryan O'Sullivan committed 39d1c4f

Present undecoded bytestring when streaming

Comments (0)

Files changed (3)

Data/Text/Encoding.hs

 import Foreign.C.Types (CSize)
 import Foreign.ForeignPtr (withForeignPtr)
 import Foreign.Marshal.Utils (with)
-import Foreign.Ptr (Ptr, minusPtr, plusPtr)
+import Foreign.Ptr (Ptr, minusPtr, nullPtr, plusPtr)
 import Foreign.Storable (Storable, peek, poke)
 import GHC.Base (MutableByteArray#)
 import qualified Data.Text.Array as A
 -- or continuation where it is encountered.
 
 -- | A stream oriented decoding result.
-data Decoding = Some Text (ByteString -> Decoding)
+data Decoding = Some Text ByteString (ByteString -> Decoding)
 
 instance Show Decoding where
-    showsPrec d (Some t _) = showParen (d > prec) $
-                             showString "Some " . showsPrec (prec+1) t
-      where prec = 10
+    showsPrec d (Some t bs _) = showParen (d > prec) $
+                                showString "Some " . showsPrec prec' t .
+                                showChar ' ' . showsPrec prec' bs .
+                                showString " _"
+      where prec = 10; prec' = prec + 1
 
 newtype CodePoint = CodePoint Word32 deriving (Eq, Show, Num, Storable)
 newtype DecoderState = DecoderState Word32 deriving (Eq, Show, Num, Storable)
   -- We create a slightly larger than necessary buffer to accommodate a
   -- potential surrogate pair started in the last buffer
   decodeChunk :: CodePoint -> DecoderState -> ByteString -> Decoding
-  decodeChunk codepoint0 state0 (PS fp off len) =
+  decodeChunk codepoint0 state0 bs@(PS fp off len) =
     runST $ (unsafeIOToST . decodeChunkToBuffer) =<< A.new (len+1)
    where
     decodeChunkToBuffer :: A.MArray s -> IO Decoding
       with (0::CSize) $ \destOffPtr ->
       with codepoint0 $ \codepointPtr ->
       with state0 $ \statePtr ->
+      with nullPtr $ \curPtrPtr ->
         let end = ptr `plusPtr` (off + len)
             loop curPtr = do
-              curPtr' <- c_decode_utf8_with_state (A.maBA dest) destOffPtr curPtr end
-                                                  codepointPtr statePtr
+              poke curPtrPtr curPtr
+              curPtr' <- c_decode_utf8_with_state (A.maBA dest) destOffPtr
+                         curPtrPtr end codepointPtr statePtr
               state <- peek statePtr
               case state of
                 UTF8_REJECT -> do
                   chunkText <- unsafeSTToIO $ do
                       arr <- A.unsafeFreeze dest
                       return $! textP arr 0 (fromIntegral n)
-                  return $ Some chunkText (decodeChunk codepoint state)
+                  lastPtr <- peek curPtrPtr
+                  let left = lastPtr `minusPtr` curPtr
+                  return $ Some chunkText (B.drop left bs)
+                           (decodeChunk codepoint state)
         in loop (ptr `plusPtr` off)
   desc = "Data.Text.Encoding.streamDecodeUtf8With: Invalid UTF-8 stream"
 
 
 foreign import ccall unsafe "_hs_text_decode_utf8_state" c_decode_utf8_with_state
     :: MutableByteArray# s -> Ptr CSize
-    -> Ptr Word8 -> Ptr Word8
+    -> Ptr (Ptr Word8) -> Ptr Word8
     -> Ptr CodePoint -> Ptr DecoderState -> IO (Ptr Word8)
 
 foreign import ccall unsafe "_hs_text_decode_latin1" c_decode_latin1
  * A best-effort decoder. Runs until it hits either end of input or
  * the start of an invalid byte sequence.
  *
- * At exit, updates *destoff with the next offset to write to, and
- * returns the next source offset to read from. Moreover, this function
- * exposes the internal decoder state (state0 and codepoint0), allowing one
- * to restart the decoder after it terminates (say, due to a partial codepoint).
+ * At exit, we update *destoff with the next offset to write to, *src
+ * with the next source location past the last one successfully
+ * decoded, and return the next source location to read from.
+ *
+ * Moreover, we expose the internal decoder state (state0 and
+ * codepoint0), allowing one to restart the decoder after it
+ * terminates (say, due to a partial codepoint).
  *
  * In particular, there are a few possible outcomes,
  *
  */
 const uint8_t *
 _hs_text_decode_utf8_state(uint16_t *const dest, size_t *destoff,
-                           const uint8_t *const src, const uint8_t *const srcend,
+                           const uint8_t **const src,
+                           const uint8_t *const srcend,
                            uint32_t *codepoint0, uint32_t *state0)
 {
   uint16_t *d = dest + *destoff;
-  const uint8_t *s = src;
+  const uint8_t *s = *src, *last = *src;
   uint32_t state = *state0;
   uint32_t codepoint = *codepoint0;
 
 	*d++ = (uint16_t) ((codepoint >> 16) & 0xff);
 	*d++ = (uint16_t) ((codepoint >> 24) & 0xff);
       }
+      last = s;
     }
 #endif
 
       *d++ = (uint16_t) (0xD7C0 + (codepoint >> 10));
       *d++ = (uint16_t) (0xDC00 + (codepoint & 0x3FF));
     }
+    last = s;
   }
 
   /* Invalid encoding, back up to the errant character */
   *destoff = d - dest;
   *codepoint0 = codepoint;
   *state0 = state;
+  *src = last;
 
   return s;
 }
  */
 const uint8_t *
 _hs_text_decode_utf8(uint16_t *const dest, size_t *destoff,
-                     const uint8_t *const src, const uint8_t *const srcend)
+                     const uint8_t *src, const uint8_t *const srcend)
 {
   uint32_t codepoint;
   uint32_t state = UTF8_ACCEPT;
-  return _hs_text_decode_utf8_state(dest, destoff, src, srcend, &codepoint, &state);
+  return _hs_text_decode_utf8_state(dest, destoff, &src, srcend, &codepoint, &state);
 }

tests/Tests/Properties.hs

           feedChunksOf n f bs
             | B.null bs  = []
             | otherwise  = let (a,b) = B.splitAt n bs
-                               E.Some t f' = f a
+                               E.Some t _ f' = f a
                            in t : feedChunksOf n f' b
 
 -- This is a poor attempt to ensure that the error handling paths on
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.