Bryan O'Sullivan avatar Bryan O'Sullivan committed e4c003b

Test ASCII and UTF-8 encoding roundtripping

Comments (0)

Files changed (4)

Data/Text/Encoding/Fusion.hs

 import qualified Data.Text.Utf32 as U32
 import qualified Data.Text.Utf8 as U8
 
-data T4 a b c d = T4 !a !b !c !d
+data T4 a b c d = T4 {-# UNPACK #-} !a {-# UNPACK #-} !b {-# UNPACK #-} !c {-# UNPACK #-} !d
 
 streamASCII :: ByteString -> Stream Char
 streamASCII bs = Stream next 0 l
                       where x' = fromIntegral (ord x) :: Word8
 {-# INLINE restreamASCII #-}
 
+data M = N | J {-# UNPACK #-} !Word8
+       deriving (Eq, Ord, Show)
+
 -- | /O(n)/ Convert a Stream Char into a UTF-8 encoded Stream Word8.
 restreamUtf8 :: Stream Char -> Stream Word8
 restreamUtf8 (Stream next0 s0 len) =
-    Stream next (T4 (Just s0) Nothing Nothing Nothing) (len*2)
+    Stream next (T4 s0 N N N) (len*2)
     where
       {-# INLINE next #-}
-      next (T4 (Just s) Nothing Nothing Nothing) = case next0 s of
+      next (T4 s N N N) = case next0 s of
                   Done              -> Done
-                  Skip s'           -> Skip (T4 (Just s') Nothing Nothing Nothing)
+                  Skip s'           -> Skip (T4 s' N N N)
                   Yield x xs
-                      | n <= 0x7F   -> Yield c         (T4 (Just xs) Nothing Nothing Nothing)
-                      | n <= 0x07FF -> Yield (fst c2)  (T4 (Just xs) (Just $ snd c2) Nothing Nothing)
-                      | n <= 0xFFFF -> Yield (fst3 c3) (T4 (Just xs) (Just $ snd3 c3) (Just $ trd3 c3) Nothing)
-                      | otherwise   -> Yield (fst4 c4) (T4 (Just xs) (Just $ snd4 c4) (Just $ trd4 c4) (Just $ fth4 c4))
+                      | n <= 0x7F   -> Yield c  (T4 xs N N N)
+                      | n <= 0x07FF -> Yield a2 (T4 xs (J b2) N N)
+                      | n <= 0xFFFF -> Yield a3 (T4 xs (J b3) (J c3) N)
+                      | otherwise   -> Yield a4 (T4 xs (J b4) (J c4) (J d4))
                       where
                         n  = ord x
                         c  = fromIntegral n
-                        c2 = U8.ord2 x
-                        c3 = U8.ord3 x
-                        c4 = U8.ord4 x
-                        fst3 (x1,_,_)   = x1
-                        snd3 (_,x2,_)   = x2
-                        trd3 (_,_,x3)   = x3
-                        fst4 (x1,_,_,_) = x1
-                        snd4 (_,x2,_,_) = x2
-                        trd4 (_,_,x3,_) = x3
-                        fth4 (_,_,_,x4) = x4
-      next (T4 (Just s) (Just x2) Nothing Nothing) = Yield x2 (T4 (Just s) Nothing Nothing Nothing)
-      next (T4 (Just s) (Just x2) x3 Nothing)      = Yield x2 (T4 (Just s) x3 Nothing Nothing)
-      next (T4 (Just s) (Just x2) x3 x4)           = Yield x2 (T4 (Just s) x3 x4 Nothing)
+                        (a2,b2) = U8.ord2 x
+                        (a3,b3,c3) = U8.ord3 x
+                        (a4,b4,c4,d4) = U8.ord4 x
+      next (T4 s (J x2) N N)   = Yield x2 (T4 s N N N)
+      next (T4 s (J x2) x3 N)  = Yield x2 (T4 s x3 N N)
+      next (T4 s (J x2) x3 x4) = Yield x2 (T4 s x3 x4 N)
       next _ = internalError "restreamUtf8"
 {-# INLINE restreamUtf8 #-}
 

Data/Text/Utf8.hs

     , validate4
     ) where
 
+import Control.Exception (assert)
 import Data.Char (ord)
 import Data.Bits (shiftR, (.&.))
 import GHC.Exts
 import GHC.Word (Word8(..))
 
+default(Int)
+
 between :: Word8                -- ^ byte to check
         -> Word8                -- ^ lower bound
         -> Word8                -- ^ upper bound
 {-# INLINE between #-}
 
 ord2   :: Char -> (Word8,Word8)
-ord2 c = (x1,x2)
+ord2 c = assert (n >= 0x80 && n <= 0x07ff) (x1,x2)
     where
       n  = ord c
-      x1 = fromIntegral $ (shiftR n 6) + (0xC0 :: Int) :: Word8
-      x2 = fromIntegral $ (n .&. 0x3F) + (0x80 :: Int) :: Word8
+      x1 = fromIntegral $ (n `shiftR` 6) + 0xC0
+      x2 = fromIntegral $ (n .&. 0x3F)   + 0x80
 
 ord3   :: Char -> (Word8,Word8,Word8)
-ord3 c = (x1,x2,x3)
+ord3 c = assert (n >= 0x0800 && n <= 0xffff) (x1,x2,x3)
     where
       n  = ord c
-      x1 = fromIntegral $ (shiftR n 12) + (0xE0::Int) :: Word8
-      x2 = fromIntegral $ ((shiftR n 6) .&. (0x3F::Int)) + (0x80::Int) :: Word8
-      x3 = fromIntegral $ (n .&. (0x3F::Int)) + (0x80::Int) :: Word8
+      x1 = fromIntegral $ (n `shiftR` 12) + 0xE0
+      x2 = fromIntegral $ ((n `shiftR` 6) .&. 0x3F) + 0x80
+      x3 = fromIntegral $ (n .&. 0x3F) + 0x80
 
 ord4   :: Char -> (Word8,Word8,Word8,Word8)
-ord4 c = (x1,x2,x3,x4)
+ord4 c = assert (n >= 0x10000) (x1,x2,x3,x4)
     where
       n  = ord c
-      x1 = fromIntegral $ (shiftR n 18) + (0xF0::Int) :: Word8
-      x2 = fromIntegral $ ((shiftR n 12) .&. (0x3F::Int)) + (0x80::Int) :: Word8
-      x3 = fromIntegral $ ((shiftR n 6) .&. (0x3F::Int)) + (0x80::Int) :: Word8
-      x4 = fromIntegral $ (n .&. (0x3F::Int)) + (0x80::Int) :: Word8
+      x1 = fromIntegral $ (n `shiftR` 18) + 0xF0
+      x2 = fromIntegral $ ((n `shiftR` 12) .&. 0x3F) + 0x80
+      x3 = fromIntegral $ ((n `shiftR` 6) .&. 0x3F) + 0x80
+      x4 = fromIntegral $ (n .&. 0x3F) + 0x80
 
 chr2       :: Word8 -> Word8 -> Char
 chr2 (W8# x1#) (W8# x2#) = C# (chr# (z1# +# z2#))
       y1# = word2Int# x1#
       y2# = word2Int# x2#
       z1# = uncheckedIShiftL# (y1# -# 0xC0#) 6#
-      z2# = y2# -# 0x8F#
+      z2# = y2# -# 0x80#
 {-# INLINE chr2 #-}
 
 chr3          :: Word8 -> Word8 -> Word8 -> Char

tests/Properties.hs

 import Test.QuickCheck
 import Text.Show.Functions
 
+import Data.Char
 import Debug.Trace
 import Text.Printf
 import System.Environment
 import Control.Arrow
 import Control.Monad
 import Data.Word
+import qualified Data.ByteString as B
 import qualified Data.Text as T
 import Data.Text (pack,unpack)
+import qualified Data.Text.Encoding as E
 import qualified Data.Text.Fusion as S
 import Data.Text.Fusion (stream, unstream)
 import qualified Data.List as L
 
-
 import QuickCheckUtils
 
-prop_pack_unpack s   = (unpack . pack) s == s
+prop_pack_unpack s     = (unpack . pack) s == s
 prop_stream_unstream t = (unstream . stream) t == t
-prop_reverse_stream t = (S.reverse . S.reverseStream) t == t
-prop_singleton c     = [c] == (unpack . T.singleton) c
+prop_reverse_stream t  = (S.reverse . S.reverseStream) t == t
+prop_singleton c       = [c] == (unpack . T.singleton) c
+
+prop_ascii t           = E.decodeASCII (E.encodeASCII a) == a
+    where a            = T.map (\c -> chr (ord c `mod` 128)) t
+prop_utf8              = (E.decodeUtf8 . E.encodeUtf8) `eq` id
 
 -- Do two functions give the same answer?
 eq :: (Eq a) => (t -> a) -> (t -> a) -> t -> Bool
   ("prop_reverse_stream", mytest prop_reverse_stream),
   ("prop_singleton", mytest prop_singleton),
 
+  ("prop_ascii", mytest prop_ascii),
+  ("prop_utf8", mytest prop_utf8),
+
   ("prop_cons", mytest prop_cons),
   ("prop_snoc", mytest prop_snoc),
   ("prop_append", mytest prop_append),

tests/QuickCheckUtils.hs

 import System.IO
 import System.Random
 import Test.QuickCheck
+import qualified Data.ByteString as B
 
 integralRandomR :: (Integral a, RandomGen g) => (a,a) -> g -> (a,g)
 integralRandomR  (a,b) g = case randomR (fromIntegral a :: Integer,
                             (x,g) -> (fromIntegral x, g)
 
 instance Random Word8 where
-  randomR = integralRandomR
-  random  = randomR (minBound,maxBound)
+    randomR = integralRandomR
+    random  = randomR (minBound,maxBound)
 
 instance Arbitrary Word8 where
     arbitrary     = choose (minBound,maxBound)
     coarbitrary c = variant (fromEnum c `rem` 4)
 
+instance Arbitrary B.ByteString where
+  arbitrary = B.pack `fmap` arbitrary
+  coarbitrary s = coarbitrary (B.unpack s)
+
 instance Random Word16 where
   randomR = integralRandomR
   random  = randomR (minBound,maxBound)
     arbitrary   = (fmap T.pack) `fmap` arbitrary
     coarbitrary = coarbitrary . notEmpty
 
+instance Arbitrary (NotEmpty B.ByteString) where
+    arbitrary   = (fmap B.pack) `fmap` arbitrary
+    coarbitrary = coarbitrary . notEmpty
+
 debug = False
 
 mytest :: Testable a => a -> Int -> IO (Bool, Int)
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.