Commits

Bryan O'Sullivan  committed dcc1207

A giant orgy of hacking that's impossible to split apart

* Implemented group and groupBy
* Added Data.Text.Unsafe module
* Refactored code to be safer and less redundant
* Fixed numerous fencepost errors
* Improved test coverage
* Added lots of assertions to document/enforce basic invariants
* A pwnie for everyone

  • Participants
  • Parent commits 491340d

Comments (0)

Files changed (8)

File Data/Text.hs

     , splitAt
     , span
     , break
-    -- , group
-    -- , groupBy
+    , group
+    , groupBy
     , inits
     , tails
 
 
 import Prelude (Char, Bool, Functor(..), Int, Maybe(..), String,
                 Eq, (==), (++), error,
-                Show, showsPrec,
-                Read, readsPrec,
-                (&&), (||), (+), (-), (<), (>), (<=), (>=), (.),
+                Read(..), Show(..),
+                (&&), (||), (+), (-), (<), (>), (<=), (>=), (.), ($),
                 not, return, otherwise)
+import Control.Exception (assert)
 import Data.Char (isSpace)
 import Control.Monad.ST (ST)
 import qualified Data.Text.Array as A
 
 import qualified Data.Text.Fusion as S
 import Data.Text.Fusion (Stream(..), Step(..), stream, unstream)
-import Data.Text.Internal (Text(..), empty)
+import Data.Text.Internal (Text(..), empty, text)
 import qualified Prelude as P
+import Data.Text.Unsafe (iter, iter_)
 import Data.Text.UnsafeChar (unsafeChr)
 import qualified Data.Text.Utf16 as U16
 
 head t = S.head (stream t)
 {-# INLINE head #-}
 
--- | Iterate one step through a UTF-16 array, returning the current
--- character and the step to add to give the next offset to iterate
--- at.
-iter :: A.Array Word16 -> Int -> (Char,Int)
-iter arr i | m < 0xD800 || m > 0xDBFF = (unsafeChr m,  1)
-           | otherwise                = (U16.chr2 m n, 2)
-  where m = A.unsafeIndex arr i
-        n = A.unsafeIndex arr j
-        j = i + 1
-{-# INLINE iter #-}
-
--- | Iterate one step through a UTF-16 array, returning the next
--- offset to iterate at.
-iter_ :: A.Array Word16 -> Int -> Int
-iter_ arr i | m < 0xD800 || m > 0xDBFF = 1
-            | otherwise                = 2
-  where m = A.unsafeIndex arr i
-{-# INLINE iter_ #-}
-
 -- | /O(1)/ Returns the first character and rest of a 'Text', or
 -- 'Nothing' if empty. Subject to array fusion.
 uncons :: Text -> Maybe (Char, Text)
-uncons (Text arr off len)
+uncons t@(Text arr off len)
     | len <= 0  = Nothing
     | otherwise = Just (c, Text arr (off+d) (len-d))
-    where (c,d) = iter arr off
+    where (c,d) = iter t 0
 {-# INLINE uncons #-}
 
 second :: (b -> c) -> (a,b) -> (a,c)
     S.last (stream t) = last t
   #-}
 
+-- | Construct a 'Text' without invisibly pinning its byte array in
+-- memory if its length has dwindled to zero.
+textP :: A.Array Word16 -> Int -> Int -> Text
+textP arr off len | len == 0  = empty
+                  | otherwise = text arr off len
+{-# INLINE textP #-}
 
 -- | /O(1)/ Returns all characters after the head of a 'Text', which
 -- must be non-empty.  Subject to array fusion.
 tail :: Text -> Text
-tail (Text arr off len)
+tail t@(Text arr off len)
     | len <= 0  = errorEmptyList "tail"
-    | otherwise = Text arr (off+d) (len-d)
-    where d = iter_ arr off
+    | otherwise = textP arr (off+d) (len-d)
+    where d = iter_ t 0
 {-# INLINE [1] tail #-}
 
-
+{-# RULES
+"TEXT tail -> fused" [~1] forall t.
+    tail t = unstream (S.tail (stream t))
+"TEXT tail -> unfused" [1] forall t.
+    unstream (S.tail (stream t)) = tail t
+ #-}
 
 -- | /O(1)/ Returns all but the last character of a 'Text', which must
 -- be non-empty.  Subject to array fusion.
 init :: Text -> Text
 init (Text arr off len) | len <= 0                   = errorEmptyList "init"
-                        | n >= 0xDC00 && n <= 0xDFFF = Text arr off (len-2)
-                        | otherwise                  = Text arr off (len-1)
+                        | n >= 0xDC00 && n <= 0xDFFF = textP arr off (len-2)
+                        | otherwise                  = textP arr off (len-1)
     where
       n = A.unsafeIndex arr (off+len-1)
 {-# INLINE [1] init #-}
 -- | /O(1)/ Tests whether a 'Text' is empty or not.  Subject to array
 -- fusion.
 null :: Text -> Bool
-null t = S.null (stream t)
-{-# INLINE null #-}
+null (Text _arr _off len) = assert (len >= 0) $ len <= 0
+{-# INLINE [1] null #-}
+
+{-# RULES
+"TEXT null -> fused" [~1] forall t.
+    null t = S.null (stream t)
+"TEXT null -> unfused" [1] forall t.
+    S.null (stream t) = null t
+ #-}
 
 -- | /O(n)/ Returns the number of characters in a 'Text'.
 -- Subject to array fusion.
 take n t@(Text arr off len)
     | n <= 0    = empty
     | n >= len  = t
-    | otherwise = Text arr off (loop off 0)
+    | otherwise = Text arr off (loop 0 0)
   where
-      end = off + len
       loop !i !count
-           | i >= end || count >= n   = i - off
-           | otherwise                = loop (i+d) (count+1)
-           where d = iter_ arr i
+           | i >= len || count >= n = i
+           | otherwise              = loop (i+d) (count+1)
+           where d = iter_ t i
 {-# INLINE [1] take #-}
 
 {-# RULES
 drop n t@(Text arr off len)
     | n <= 0    = t
     | n >= len  = empty
-    | otherwise = loop off 0 len
+    | otherwise = loop 0 0
   where end = off + len
-        loop !i !count !l
-            | i >= end || count >= n   = Text arr i l
-            | otherwise                = loop (i+d) (count+1) (l-d)
-            where d = iter_ arr i
+        loop !i !count
+            | i >= end || count >= n   = Text arr (off+i) (len-i)
+            | otherwise                = loop (i+d) (count+1)
+            where d = iter_ t i
 {-# INLINE [1] drop #-}
 
 {-# RULES
 -- the longest prefix (possibly empty) of elements that satisfy @p@.
 -- This function is subject to array fusion.
 takeWhile :: (Char -> Bool) -> Text -> Text
-takeWhile p t@(Text arr off len) = loop off 0
-  where loop !i !l | l >= len    = t
-                   | p c         = loop (i+d) (l+d)
-                   | otherwise   = Text arr off l
-            where (c,d)          = iter arr i
+takeWhile p t@(Text arr off len) = loop 0
+  where loop !i | i >= len    = t
+                | p c         = loop (i+d)
+                | otherwise   = textP arr off i
+            where (c,d)       = iter t i
 {-# INLINE [1] takeWhile #-}
 
 {-# RULES
 -- | /O(n)/ 'dropWhile' @p@ @xs@ returns the suffix remaining after
 -- 'takeWhile' @p@ @xs@. This function is subject to array fusion.
 dropWhile :: (Char -> Bool) -> Text -> Text
-dropWhile p (Text arr off len) = loop off 0
+dropWhile p t@(Text arr off len) = loop 0 0
   where loop !i !l | l >= len  = empty
                    | p c       = loop (i+d) (l+d)
-                   | otherwise = Text arr i (len-l)
-            where (c,d)        = iter arr i
+                   | otherwise = Text arr (off+i) (len-l)
+            where (c,d)        = iter t i
 {-# INLINE [1] dropWhile #-}
 
 {-# RULES
     | n <= 0    = (empty, t)
     | n >= len  = (t, empty)
     | otherwise = (Text arr off k, Text arr (off+k) (len-k))
-  where k = loop off 0
-        end = off + len
+  where k = loop 0 0
         loop !i !count
-            | i >= end || count >= n = i - off
+            | i >= len || count >= n = i
             | otherwise              = loop (i+d) (count+1)
-            where d                  = iter_ arr i
+            where d                  = iter_ t i
 {-# INLINE splitAt #-}
 
 -- | /O(n)/ 'span', applied to a predicate @p@ and text @t@, returns a
 -- @t@ of elements that satisfy @p@, and whose second is the remainder
 -- of the list.
 span :: (Char -> Bool) -> Text -> (Text, Text)
-span p t@(Text arr off len)
-    | k == 0    = (empty, t)
-    | k == len  = (t, empty)
-    | otherwise = (Text arr off k, Text arr (off+k) (len-k))
-  where k = loop off 0
-        loop !i !l | l >= len || not (p c) = l
-                   | otherwise             = loop (i+d) (l+d)
-            where (c,d)                    = iter arr i
+span p t@(Text arr off len) = (textP arr off k, textP arr (off+k) (len-k))
+  where k = loop 0
+        loop !i | i >= len || not (p c) = i
+                | otherwise             = loop (i+d)
+            where (c,d)                 = iter t i
 {-# INLINE span #-}
 
 -- | /O(n)/ 'break' is like 'span', but the prefix returned is over
 break p = span (not . p)
 {-# INLINE break #-}
 
+-- | /O(n)/ Group characters in a string according to a predicate.
+groupBy :: (Char -> Char -> Bool) -> Text -> [Text]
+groupBy p = loop
+  where
+    loop t@(Text arr off len)
+        | null t    = []
+        | otherwise = text arr off n : loop (text arr (off+n) (len-n))
+        where (c,d) = iter t 0
+              n     = d + findAIndexOrEnd (not . p c) (Text arr (off+d) (len-d))
+
+-- | Returns the /array/ index (in units of 'Word16') at which a
+-- character may be found.  This is /not/ the same as the logical
+-- index returned by e.g. 'findIndex'.
+findAIndexOrEnd :: (Char -> Bool) -> Text -> Int
+findAIndexOrEnd q t@(Text _arr _off len) = go 0
+    where go !i | i >= len || q c       = i
+                | otherwise             = go (i+d)
+                where (c,d)             = iter t i
+    
+-- | /O(n)/ Group characters in a string by equality.
+group :: Text -> [Text]
+group = groupBy (==)
+
 -- | /O(n)/ Return all initial segments of the given 'Text', shortest
 -- first.
 inits :: Text -> [Text]
-inits t@(Text arr off len) = loop off
+inits t@(Text arr off len) = loop 0
     where loop i | i >= len = [t]
-                 | otherwise = Text arr off i : loop (i + iter_ arr i)
+                 | otherwise = Text arr off i : loop (i + iter_ t i)
 
 -- | /O(n)/ Return all final segments of the given 'Text', longest
 -- first.
 -- | /O(n)/ Breaks a 'Text' up into a list of words, delimited by 'Char's
 -- representing white space.
 words :: Text -> [Text]
-words (Text arr off len) = loop0 off off
-    where
-      loop0 start n
-          | n >= len = if start == n
-                       then []
-                       else [Text arr (start+off) (n-start)]
-          | isSpace (unsafeChr c) =
-              if start == n
-              then loop0 (start+1) (start+1)
-              else Text arr (start+off) (n-start) : loop0 (n+1) (n+1)
-          | otherwise = if c < 0xD800 || c > 0xDBFF
-                        then loop0 start (n+1)
-                        else loop0 start (n+2)
-          where c = arr `A.unsafeIndex` n
+words t@(Text arr off len) = loop 0 0
+  where
+    loop !start !n
+        | n >= len = if start == n
+                     then []
+                     else [Text arr (start+off) (n-start)]
+        | isSpace c =
+            if start == n
+            then loop (start+1) (start+1)
+            else Text arr (start+off) (n-start) : loop (n+d) (n+d)
+        | otherwise = loop start (n+d)
+        where (c,d) = iter t n
 {-# INLINE words #-}
 
 -- | /O(n)/ Breaks a 'Text' up into a list of 'Text's at

File Data/Text/Array.hs

     , unsafeNew
     , unsafeFreeze
     , run
+    , toList
     ) where
 
 #if 0
     length (Array len _ba) = len
     {-# INLINE length #-}
 
+instance (Elt e, Show e) => Show (Array e) where
+    show = show . toList
+
 instance IArray (MArray s e) where
     length (MArray len _ba) = len
     {-# INLINE length #-}
 
 #endif
 
+-- | Convert an immutable array to a list.
+toList :: Elt e => Array e -> [e]
+toList a = loop 0
+    where loop i | i < len   = unsafeIndex a i : loop (i+1)
+                 | otherwise = []
+          len = length a
+
 -- | An empty immutable array.
 empty :: Elt e => Array e
 empty = runST (unsafeNew 0 >>= unsafeFreeze)

File Data/Text/Fusion.hs

     , find
     , index
     , findIndex
+    , findIndexOrEnd
     , elemIndex
 
     -- * Zipping and unzipping
                  | otherwise -> loop_findIndex (i+1) s'
 {-# INLINE [0] findIndex #-}
 
+-- | The 'findIndexOrEnd' function takes a predicate and a stream and
+-- returns the index of the first element in the stream
+-- satisfying the predicate.
+findIndexOrEnd :: (Char -> Bool) -> Stream Char -> Int
+findIndexOrEnd p (Stream next s0 _len) = loop_findIndex 0 s0
+  where
+    loop_findIndex !i !s = case next s of
+      Done                   -> i
+      Skip    s'             -> loop_findIndex i     s' -- hmm. not caught by QC
+      Yield x s' | p x       -> i
+                 | otherwise -> loop_findIndex (i+1) s'
+{-# INLINE [0] findIndexOrEnd #-}
+
 -- | /O(n)/ The 'elemIndex' function returns the index of the first
 -- element in the given stream which is equal to the query
 -- element, or 'Nothing' if there is no such element.

File Data/Text/Internal.hs

     (
     -- * Types
       Text(..)
+    -- * Construction
+    , text
     -- * Code that must be here for accessibility
     , empty
+    -- * Debugging
+    , showText
     ) where
 
+import Control.Exception (assert)
 import qualified Data.Text.Array as A
 import Data.Typeable (Typeable)
 import Data.Word (Word16)
 
 -- | A space efficient, packed, unboxed Unicode text type.
-data Text = Text {-# UNPACK #-} !(A.Array Word16) -- payload
-                 {-# UNPACK #-} !Int              -- offset
-                 {-# UNPACK #-} !Int              -- length
-            deriving (Typeable)
+data Text = Text {
+      textArray :: {-# UNPACK #-} !(A.Array Word16) -- payload
+    , textOffset :: {-# UNPACK #-} !Int              -- offset
+    , textLength :: {-# UNPACK #-} !Int              -- length
+    } deriving (Typeable)
+
+text :: A.Array Word16 -> Int -> Int -> Text
+text arr off len =
+    assert (len >= 0) .
+    assert (off >= 0) .
+    assert (alen == 0 || off < alen) .
+    assert (len == 0 || c < 0xDC00 || c > 0xDFFF) $
+    Text arr off len
+  where c    = A.unsafeIndex arr off
+        alen = A.length arr
+{-# INLINE text #-}
 
 -- | /O(1)/ The empty 'Text'.
 empty :: Text
 empty = Text A.empty 0 0
 {-# INLINE [1] empty #-}
+
+showText :: Text -> String
+showText (Text arr off len) =
+    "Text " ++ (show . take (off+len) . A.toList) arr ++ ' ' :
+            show off ++ ' ' : show len

File Data/Text/Unsafe.hs

+-- |
+-- Module      : Data.Text.Unsafe
+-- License     : BSD-style
+-- Copyright   : (c) Bryan O'Sullivan 2009
+-- Maintainer  : bos@serpentine.com
+-- Stability   : experimental
+-- Portability : portable
+-- 
+-- A module containing unsafe 'Text' operations, for very very careful
+-- use in heavily tested code.
+module Data.Text.Unsafe
+    (
+      iter
+    , iter_
+    , unsafeHead
+    , unsafeTail
+    ) where
+     
+import Control.Exception (assert)
+import Data.Text.Internal (Text(..))
+import Data.Text.UnsafeChar (unsafeChr)
+import Data.Text.Utf16 (chr2)
+import qualified Data.Text.Array as A
+
+-- | /O(1)/ A variant of 'head' for non-empty 'Text'. 'unsafeHead'
+-- omits the check for the empty case, so there is an obligation on
+-- the programmer to provide a proof that the 'Text' is non-empty.
+unsafeHead :: Text -> Char
+unsafeHead (Text arr off len)
+    | m < 0xD800 || m > 0xDBFF = unsafeChr m
+    | otherwise                = chr2 m n
+    where m = assert (len > 0) $ A.unsafeIndex arr off
+          n = assert (len > 1) $ A.unsafeIndex arr (off+1)
+{-# INLINE unsafeHead #-}
+
+-- | /O(1)/ A variant of 'tail' for non-empty 'Text'. 'unsafeHead'
+-- omits the check for the empty case, so there is an obligation on
+-- the programmer to provide a proof that the 'Text' is non-empty.
+unsafeTail :: Text -> Text
+unsafeTail t@(Text arr off len) =
+    assert (d <= len) $ Text arr (off+d) (len-d)
+  where d = iter_ t 0
+{-# INLINE unsafeTail #-}
+
+-- | /O(1)/ Iterate one step through a UTF-16 array, returning the
+-- current character and the delta to add to give the next offset to
+-- iterate at.
+iter :: Text -> Int -> (Char,Int)
+iter (Text arr off len) i
+    | m < 0xD800 || m > 0xDBFF = (unsafeChr m, 1)
+    | otherwise                = (chr2 m n,    2)
+  where m = assert (i < len) $ A.unsafeIndex arr j
+        n = assert (j < len) $ A.unsafeIndex arr k
+        j = off + i
+        k = j + 1
+{-# INLINE iter #-}
+
+-- | /O(1)/ Iterate one step through a UTF-16 array, returning the
+-- delta to add to give the next offset to iterate at.
+iter_ :: Text -> Int -> Int
+iter_ (Text arr off len) i | m < 0xD800 || m > 0xDBFF = 1
+                           | otherwise                = 2
+  where m = assert (i >= 0 && i < len) $ A.unsafeIndex arr (off+i)
+{-# INLINE iter_ #-}

File tests/Properties.hs

 import Control.Applicative
 import Control.Arrow
 import Control.Monad
+import Data.Word
 import qualified Data.Text as T
 import Data.Text (pack,unpack)
 import qualified Data.Text.Fusion as S
 prop_singleton c     = [c] == (unpack . T.singleton) c
 
 -- Do two functions give the same answer?
+eq :: (Eq a) => (t -> a) -> (t -> a) -> t -> Bool
 eq a b s  = a s == b s
 -- What about with the RHS packed?
-eqP a b s  = a s == b (pack s)
+eqP :: (Eq a) => (String -> a) -> (T.Text -> a) -> String -> Word8 -> Bool
+eqP a b s w  = a s == b t &&
+               a sa == b ta &&
+               a sb == b tb
+    where t             = pack s
+          (sa,sb)       = splitAt m s
+          (ta,tb)       = T.splitAt m t
+          l             = length s
+          m | l == 0    = n
+            | otherwise = n `mod` l
+          n             = fromIntegral w
 -- Or with the string non-empty, and the RHS packed?
-eqEP a b s = let e = notEmpty s
-             in a e == b (pack e)
+eqEP :: (Eq a) =>
+        (String -> a) -> (T.Text -> a) -> NotEmpty String -> Word8 -> Bool
+eqEP a b e w  = a s == b t &&
+                (null sa || a sa == b ta) &&
+                (null sb || a sb == b tb)
+    where (sa,sb)       = splitAt m s
+          (ta,tb)       = T.splitAt m t
+          t             = pack s
+          l             = length s
+          m | l == 0    = n
+            | otherwise = n `mod` l
+          n             = fromIntegral w
+          s             = notEmpty e
 
 prop_cons x          = (x:)     `eqP` (unpack . T.cons x)
 prop_snoc x          = (++ [x]) `eqP` (unpack . (flip T.snoc) x)
 prop_splitAt n       = L.splitAt n   `eqP` ((unpack *** unpack) . T.splitAt n)
 prop_span p          = L.span p      `eqP` ((unpack *** unpack) . T.span p)
 prop_break p         = L.break p     `eqP` ((unpack *** unpack) . T.break p)
+prop_group           = L.group       `eqP` (map unpack . T.group)
+prop_groupBy p       = L.groupBy p   `eqP` (map unpack . T.groupBy p)
 prop_inits           = L.inits       `eqP` (map unpack . T.inits)
 prop_tails           = L.tails       `eqP` (map unpack . T.tails)
 
   ("prop_splitAt", mytest prop_splitAt),
   ("prop_span", mytest prop_span),
   ("prop_break", mytest prop_break),
+  ("prop_group", mytest prop_group),
+  ("prop_groupBy", mytest prop_groupBy),
   ("prop_inits", mytest prop_inits),
   ("prop_tails", mytest prop_tails),
 

File tests/QuickCheckUtils.hs

                                          fromIntegral b :: Integer) g of
                             (x,g) -> (fromIntegral x, g)
 
+instance Random Word8 where
+  randomR = integralRandomR
+  random  = randomR (minBound,maxBound)
+
+instance Arbitrary Word8 where
+    arbitrary     = choose (minBound,maxBound)
+    coarbitrary c = variant (fromEnum c `rem` 4)
+
 instance Random Word16 where
   randomR = integralRandomR
   random  = randomR (minBound,maxBound)
     Data.Text.Array
     Data.Text.Encoding
     Data.Text.Encoding.Fusion
+    Data.Text.Unsafe
     Data.Text.UnsafeChar
     Data.Text.Internal
     Data.Text.Fusion