Bryan O'Sullivan avatar Bryan O'Sullivan committed 4f69839 Merge

Merge from 1.0 branch

Comments (0)

Files changed (5)

  *      state0 != UTF8_ACCEPT, UTF8_REJECT
  *
  */
-const uint8_t *
-_hs_text_decode_utf8_state(uint16_t *const dest, size_t *destoff,
-                           const uint8_t **const src,
-                           const uint8_t *const srcend,
-                           uint32_t *codepoint0, uint32_t *state0)
+/*
+ * Forward declaration that attaches always_inline to
+ * _hs_text_decode_utf8_int on GCC and Clang.  The parameter types must
+ * stay compatible with the definition that follows.
+ */
+#if defined(__GNUC__) || defined(__clang__)
+static inline uint8_t const *
+_hs_text_decode_utf8_int(uint16_t *const dest, size_t *destoff,
+			 const uint8_t **src, const uint8_t *srcend,
+			 uint32_t *codepoint0, uint32_t *state0)
+  __attribute__((always_inline));
+#endif
+
+static inline uint8_t const *
+_hs_text_decode_utf8_int(uint16_t *const dest, size_t *destoff,
+			 const uint8_t const **src, const uint8_t const *srcend,
+			 uint32_t *codepoint0, uint32_t *state0)
 {
   uint16_t *d = dest + *destoff;
   const uint8_t *s = *src, *last = *src;
     last = s;
   }
 
-  /* Invalid encoding, back up to the errant character */
-  if (state == UTF8_REJECT)
-    s -= 1;
-
   *destoff = d - dest;
   *codepoint0 = codepoint;
   *state0 = state;
   return s;
 }
 
+/*
+ * Public entry point: runs _hs_text_decode_utf8_int with the
+ * caller-supplied codepoint/state and returns its result pointer.
+ *
+ * When the decoder state left in *state0 is UTF8_REJECT, the returned
+ * pointer is moved back by one byte before being handed to the caller.
+ */
+uint8_t const *
+_hs_text_decode_utf8_state(uint16_t *const dest, size_t *destoff,
+                           const uint8_t **src,
+                           const uint8_t *srcend,
+                           uint32_t *codepoint0, uint32_t *state0)
+{
+  uint8_t const *ret = _hs_text_decode_utf8_int(dest, destoff, src, srcend,
+                                                codepoint0, state0);
+  /* Invalid encoding, back up to the errant character */
+  if (*state0 == UTF8_REJECT)
+    ret -= 1;
+  return ret;
+}
+
 /*
  * Helper to decode buffer and discard final decoder state
  */
 {
   uint32_t codepoint;
   uint32_t state = UTF8_ACCEPT;
-  return _hs_text_decode_utf8_state(dest, destoff, &src, srcend, &codepoint, &state);
+  uint8_t const *ret = _hs_text_decode_utf8_int(dest, destoff, &src, srcend,
+						&codepoint, &state);
+  /* Back up if we have an incomplete or invalid encoding */
+  if (state != UTF8_ACCEPT)
+    ret -= 1;
+  return ret;
 }
+1.0.0.1
+
+	* decodeUtf8: Fixed a regression that caused us to incorrectly
+	  identify truncated UTF-8 as valid (gh-61)
+
 1.0.0.0
 
 	* Added support for Unicode 6.3.0 to case conversion functions

tests/Tests/Properties.hs

 import Test.QuickCheck.Monadic
 import Text.Show.Functions ()
 
+import Control.Applicative ((<$>), (<*>))
 import Control.Arrow ((***), second)
 import Control.Exception (catch)
 import Data.Char (chr, isDigit, isHexDigit, isLower, isSpace, isUpper, ord)
                                E.Some t _ f' = f a
                            in t : feedChunksOf n f' b
 
--- This is a poor attempt to ensure that the error handling paths on
--- decode are exercised in some way.  Proper testing would be rather
--- more involved.
-t_utf8_err :: DecodeErr -> B.ByteString -> Property
-t_utf8_err (DE _ de) bs = monadicIO $ do
-  l <- run $ let len = T.length (E.decodeUtf8With de bs)
-             in (len `seq` return (Right len)) `catch`
-                (\(e::UnicodeException) -> return (Left e))
-  case l of
-    Left err -> assert $ length (show err) >= 0
-    Right n  -> assert $ n >= 0
+-- | Where the invalid bytes sit in a generated test input: on their own,
+-- in front of valid UTF-8, or after valid UTF-8.
+data Badness
+    = Solo
+    | Leading
+    | Trailing
+    deriving (Eq, Show)
+
+instance Arbitrary Badness where
+    -- Choose one of the three placements.
+    arbitrary = elements [Solo, Leading, Trailing]
+
+-- | Decode input that contains invalid UTF-8 and check the outcome.
+--
+-- The input is built according to the 'Badness' placement, and the
+-- handler is the one 'genDecodeErr' yields for the given 'DecodeErr'.
+-- A thrown 'UnicodeException' is always accepted (its 'show' is forced);
+-- a successful decode is accepted only when the handler is not 'Strict'.
+t_utf8_err :: Badness -> DecodeErr -> Property
+t_utf8_err bad de =
+  forAll gen $ \bs -> do
+    onErr <- genDecodeErr de
+    -- The nested block is indented past its parent so the layout does not
+    -- depend on the NondecreasingIndentation extension.
+    monadicIO $ do
+      l <- run $ let len = T.length (E.decodeUtf8With onErr bs)
+                 in (len `seq` return (Right len)) `catch`
+                    (\(e::UnicodeException) -> return (Left e))
+      assert $ case l of
+        Left err -> length (show err) >= 0
+        Right _  -> de /= Strict
+  where
+    genUTF8 = E.encodeUtf8 <$> genUnicode
+    gen = case bad of
+      Solo     -> genInvalidUTF8
+      Leading  -> B.append <$> genInvalidUTF8 <*> genUTF8
+      Trailing -> B.append <$> genUTF8 <*> genInvalidUTF8
 
 -- | Run @E.decodeUtf8'@ on arbitrary bytes and force whichever result
 -- comes back; both the error and the decoded text are accepted.
 t_utf8_err' :: B.ByteString -> Property
 t_utf8_err' bs = monadicIO (assert forced)
   where
     forced = either (\err -> length (show err) >= 0)
                     (\t -> T.length t >= 0)
                     (E.decodeUtf8' bs)
 
+-- | Generate bytes from one of several malformed UTF-8 shapes: a bad
+-- leading byte, stray continuation bytes, or a sequence cut short.
+genInvalidUTF8 :: Gen B.ByteString
+genInvalidUTF8 = fmap B.pack (oneof malformed)
+  where
+    malformed =
+      [ -- invalid leading byte of a 2-byte sequence
+        leadThen (0xC0, 0xC1) 1
+        -- invalid leading byte of a 4-byte sequence
+      , leadThen (0xF5, 0xFF) 3
+        -- continuation bytes without a start byte
+      , listOf1 contByte
+        -- short 2-byte sequence
+      , fmap (: []) (choose (0xC2, 0xDF))
+        -- short 3-byte sequence
+      , leadThen (0xE0, 0xEF) 1
+        -- short 4-byte sequence
+      , leadThen (0xF0, 0xF4) 2
+      ]
+    -- A leading byte from the given range followed by at most 'maxCont'
+    -- continuation bytes.
+    leadThen range maxCont = (:) <$> choose range <*> upTo maxCont contByte
+    -- A byte in the range 0x80..0xBF.
+    contByte = fmap (0x80 +) (choose (0, 0x3f))
+    -- Between 0 and n values drawn from g.
+    upTo n g = choose (0, n) >>= \k -> vectorOf k g
+
 s_Eq s            = (s==)    `eq` ((S.streamList s==) . S.streamList)
     where _types = s :: String
 sf_Eq p s =

tests/Tests/QuickCheckUtils.hs

     , integralRandomR
 
     , DecodeErr (..)
+    , genDecodeErr
 
     , Stringy (..)
     , eq
                                          fromIntegral b :: Integer) g of
                             (x,h) -> (fromIntegral x, h)
 
-data DecodeErr = DE String T.OnDecodeError
+-- | Tag naming the decode error handler a property should use; turned
+-- into an actual handler by 'genDecodeErr'.
+data DecodeErr
+    = Lenient
+    | Ignore
+    | Strict
+    | Replace
+    deriving (Show, Eq)
 
-instance Show DecodeErr where
-    show (DE d _) = "DE " ++ d
+-- | Produce the 'T.OnDecodeError' handler named by a 'DecodeErr' tag.
+-- 'Replace' has no fixed handler, so one is drawn from 'arbitrary'.
+genDecodeErr :: DecodeErr -> Gen T.OnDecodeError
+genDecodeErr mode = case mode of
+    Lenient -> return T.lenientDecode
+    Ignore  -> return T.ignore
+    Strict  -> return T.strictDecode
+    Replace -> arbitrary
 
 instance Arbitrary DecodeErr where
-    arbitrary = oneof [ return $ DE "lenient" T.lenientDecode
-                      , return $ DE "ignore" T.ignore
-                      , return $ DE "strict" T.strictDecode
-                      , DE "replace" `fmap` arbitrary ]
+    arbitrary = elements [Lenient, Ignore, Strict, Replace]
 
 class Stringy s where
     packS    :: String -> s

File contents unchanged.

Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.