Commits

Bryan O'Sullivan  committed 960b50b

Implement split using the new indices function.

  • Participants
  • Parent commits d8d379b

Comments (0)

Files changed (5)

File Data/Text.hs

 import Data.Monoid (Monoid(..))
 import Data.Word (Word16)
 import Data.String (IsString(..))
-
 import qualified Data.Text.Fusion as S
 import qualified Data.Text.Fusion.Common as S
 import Data.Text.Fusion (stream, reverseStream, unstream)
-
 import Data.Text.Internal (Text(..), empty, text, textP)
 import qualified Prelude as P
 import Data.Text.Unsafe (iter, iter_, reverseIter, unsafeHead, unsafeTail)
 import Data.Text.UnsafeChar (unsafeChr)
 import qualified Data.Text.Encoding.Utf16 as U16
+import Data.Text.Search (indices)
 
 -- $fusion
 --
 -- copies to create substrings; they just construct new 'Text's that
 -- are slices of the original.
 
--- | /O(m*n)/ Break a 'Text' into pieces separated by the first
+-- | /O(m+n)/ Break a 'Text' into pieces separated by the first
 -- 'Text' argument, consuming the delimiter. An empty delimiter is
 -- invalid, and will cause an error to be raised.
 --
 --
 -- > intercalate s . split s         == id
 -- > split (singleton c)             == splitWith (==c)
-split :: Text                   -- ^ Text to split on
-      -> Text                   -- ^ Input text
-      -> [Text]
-split pat src0
+--
+-- In (unlikely) bad cases, this function's time complexity
+-- degenerates towards /O(n*m)/.
+split :: Text -> Text -> [Text]
+split pat src@(Text arr _off len)
     | null pat  = emptyError "split"
-    | l == 1    = splitWith (== (unsafeHead pat)) src0
-    | otherwise = go src0
+    | l == 1    = splitWith (== unsafeHead pat) src
+    | otherwise = chomp 0 (indices pat src)
   where
-    l      = length pat
-    go src = search 0 src
-      where
-        search !n !s
-            | null s             = [src]      -- not found
-            | pat `isPrefixOf` s = take n src : go (drop l s)
-            | otherwise          = search (n+1) (unsafeTail s)
+    l              =  length pat
+    chomp s (x:xs) =  textP arr s (x-s) : chomp (x+l) xs
+    chomp s []     = [textP arr s (len-s)]
 {-# INLINE [1] split #-}
 
 {-# RULES

File Data/Text/Search.hs

 --
 -- * F. Lundh: The Fast Search Algorithm.
 --   <http://effbot.org/zone/stringlib.htm> (2006)
+
 module Data.Text.Search
     (
       indices
-    , slowIndices
     ) where
 
-import qualified Data.Text as T
 import qualified Data.Text.Array as A
 import Data.Word (Word64)
 import Data.Text.Internal (Text(..))
 import Data.Bits ((.|.), (.&.))
 import Data.Text.UnsafeShift (shiftL)
-import Data.Text.Unsafe (iter_)
 
 -- | /O(n+m)/ Find the offsets of all non-overlapping indices of
 -- @needle@ within @haystack@. In (unlikely) bad cases, this
                       | hindex i == c = i : loop (i+1)
                       | otherwise     = loop (i+1)
 {-# INLINE indices #-}
-
--- | /O(n*m)/ Find the offsets of all non-overlapping indices of
--- @needle@ within @haystack@. Provided purely as a slower, reliable
--- counterpart to 'indices'.
-slowIndices :: Text             -- ^ Substring to search for (@needle@)
-            -> Text             -- ^ Text to search in (@haystack@)
-            -> [Int]
-slowIndices needle@(Text _narr _noff nlen) haystack@(Text harr hoff hlen)
-    | T.null needle = []
-    | otherwise     = scan 0
-  where
-    scan i | i >= hlen = []
-           | needle `T.isPrefixOf` t = i : scan (i+nlen)
-           | otherwise = scan (i+d)
-           where t = Text harr (hoff+i) (hlen-i)
-                 d = iter_ haystack i

File tests/Makefile

 	cd .. && cabal build
 endif
 
-Properties.o: QuickCheckUtils.o
+Properties.o: QuickCheckUtils.o SlowFunctions.o
 
 QuickCheckUtils.o: $(lib)
 
-qc: Properties.o QuickCheckUtils.o
+qc: Properties.o QuickCheckUtils.o SlowFunctions.o
 	$(ghc) $(ghc-flags) -threaded -o $@ $^ $(lib)
 
 sb: SearchBench.o

File tests/Properties.hs

 import System.IO.Unsafe (unsafePerformIO)
 import Test.Framework (defaultMain, testGroup)
 import Test.Framework.Providers.QuickCheck (testProperty)
-import Data.Text.Search
+import Data.Text.Search (indices)
+import qualified SlowFunctions as Slow
 
 import QuickCheckUtils (NotEmpty(..), small)
 
 t_tails           = L.tails       `eqP` (map unpackS . T.tails)
 tl_tails          = L.tails       `eqP` (map unpackS . TL.tails)
 
+t_split_split p     = T.split p `eq` Slow.split p
 t_split_i (NotEmpty t)  = id `eq` (T.intercalate t . T.split t)
 tl_split_i (NotEmpty t) = id `eq` (TL.intercalate t . TL.split t)
 t_splitTimes_i k (NotEmpty t) = id `eq` (T.intercalate t . T.splitTimes k t)
 t_zipWith c s     = L.zipWith c s `eqP` (unpackS . T.zipWith c (packS s))
 tl_zipWith c s    = L.zipWith c s `eqP` (unpackS . TL.zipWith c (packS s))
 
-t_indices = unsquare (\s -> slowIndices s `eq` indices s)
+t_indices = unsquare (\s -> Slow.indices s `eq` indices s)
 t_indices_occurs t = unsquare (\ts -> let s = T.intercalate t ts
-                                      in slowIndices t s == indices t s)
+                                      in Slow.indices t s == indices t s)
 
 -- Bit shifts.
 shiftL w = forAll (choose (0,width-1)) $ \k -> Bits.shiftL w k == U.shiftL w k
     ],
 
     testGroup "breaking many" [
+      testProperty "t_split_split" t_split_split,
       testProperty "t_split_i" t_split_i,
       testProperty "t_splitTimes_i" t_splitTimes_i,
       testProperty "tl_splitTimes_i" tl_splitTimes_i,

File tests/SlowFunctions.hs

+{-# LANGUAGE BangPatterns #-}
+
+module SlowFunctions
+    (
+      indices
+    , split
+    ) where
+
+import qualified Data.Text as T
+import Data.Text.Internal (Text(..))
+import Data.Text.Unsafe (iter_, unsafeHead, unsafeTail)
+
+indices :: Text                -- ^ Substring to search for (@needle@)
+        -> Text                -- ^ Text to search in (@haystack@)
+        -> [Int]
+indices needle@(Text _narr _noff nlen) haystack@(Text harr hoff hlen)
+    | T.null needle = []
+    | otherwise     = scan 0
+  where
+    scan i | i >= hlen = []
+           | needle `T.isPrefixOf` t = i : scan (i+nlen)
+           | otherwise = scan (i+d)
+           where t = Text harr (hoff+i) (hlen-i)
+                 d = iter_ haystack i
+
+split :: Text                   -- ^ Text to split on
+      -> Text                   -- ^ Input text
+      -> [Text]
+split pat src0
+    | T.null pat  = error "split: empty"
+    | l == 1      = T.splitWith (== (unsafeHead pat)) src0
+    | otherwise   = go src0
+  where
+    l      = T.length pat
+    go src = search 0 src
+      where
+        search !n !s
+            | T.null s             = [src]      -- not found
+            | pat `T.isPrefixOf` s = T.take n src : go (T.drop l s)
+            | otherwise            = search (n+1) (unsafeTail s)