Commits

Bryan O'Sullivan committed 7010438

Flesh out the implementation of lazy indices, with tests.
Alas, it has a bug.

  • Participants
  • Parent commits f70328e

Comments (0)

Files changed (3)

Data/Text/Lazy/Search.hs

 import Data.Int (Int64)
 import Data.Word (Word16, Word64)
 import qualified Data.Text.Internal as T
+import qualified Data.Text as T
 import Data.Text.Fusion.Internal (PairS(..))
 import Data.Text.Lazy.Internal (Text(..))
 import Data.Bits ((.|.), (.&.))
 indices :: Text              -- ^ Substring to search for (@needle@)
         -> Text              -- ^ Text to search in (@haystack@)
         -> [Int64]
-indices needle Empty = []
-indices needle@(Chunk n ns) haystack@(Chunk c cs)
+indices needle@(Chunk n ns) haystack@(Chunk k ks)
     | nlen <= 0 || ldiff < 0 = []
-    | otherwise              = scan 0 0 c cs
+    | nlen == 1              = scanOne (nindex 0) 0 k ks
+    | otherwise              = scan 0 0 k ks
   where
-    scan !g !i x@(T.Text harr hoff l) xs
+    scan !g !i x@(T.Text _ _ l) xs
          | g > ldiff                  = []
          | i >= m                     = case xs of
                                           Empty      -> []
     nlast     = nlen - 1
     nindex    = index n ns
     z         = foldChunks fin 0 needle
-        where fin _ (T.Text narr noff nlen) = A.unsafeIndex narr (noff+nlen-1)
+        where fin _ (T.Text farr foff flen) = A.unsafeIndex farr (foff+flen-1)
     mask :*: skip = buildTable needle
+    scanOne c i (T.Text oarr ooff olen) os = go 0
+      where
+        go h | h >= olen = case os of
+                             Empty      -> []
+                             Chunk y ys -> scanOne c (i+fromIntegral olen) y ys
+             | on == c = i + fromIntegral h : go (h+1)
+             | otherwise = go (h+1)
+             where on = A.unsafeIndex oarr (ooff+h)
+indices _ _ = []
 
 index :: T.Text -> Text -> Int64 -> Word16
 index (T.Text arr off len) xs i
-    | j <= len = A.unsafeIndex arr (off+j)
+    | j < len   = A.unsafeIndex arr (off+j)
     | otherwise = case xs of
-                    Empty      -> error "empty"
+                    Empty | j == len  -> 0
+                          | otherwise -> error "empty"
                     Chunk c cs -> index c cs (i-fromIntegral len)
     where j = fromIntegral i
 
+swizzle :: Word16 -> Word64
 swizzle k = 1 `shiftL` (fromIntegral k .&. 0x3f)
 
 foldChunks :: (a -> T.Text -> a) -> a -> Text -> a
 foldChunks f z (Chunk c cs) = let z' = f z c
                               in z' `seq` foldChunks f z' cs
 
+wordLength :: Text -> Int64
 wordLength = foldChunks sumLength 0
     where sumLength i (T.Text _ _ l) = i + fromIntegral l
 
 buildTable :: Text -> PairS Word64 Int64
 buildTable Empty = 0 :*: 0
-buildTable needle@(Chunk c cs) = outer c cs 0 0 (nlen-2)
+buildTable needle@(Chunk k ks) = outer k ks 0 0 0 (nlen-2)
   where
-    outer x@(T.Text xarr xoff xlen) xs = go
+    outer (T.Text xarr xoff xlen) xs = go
       where
-        go !i !mask !skip
-            | i >= xlen = case xs of
-                            Empty      -> (mask .|. swizzle z) :*: skip
-                            Chunk y ys -> outer y ys 0 mask skip
-            | otherwise = go (i+1) (mask .|. swizzle c) skip'
+        go !(g::Int64) !i !mask !skip
+            | i >= xlast = case xs of
+                             Empty      -> (mask .|. swizzle z) :*: skip
+                             Chunk y ys -> outer y ys g 0 mask skip
+            | otherwise = go (g+1) (i+1) (mask .|. swizzle c) skip'
             where c                 = A.unsafeIndex xarr (xoff+i)
-                  skip' | c == z    = nlen - fromIntegral i - 2
+                  skip' | c == z    = nlen - fromIntegral g - 2
                         | otherwise = skip
+                  xlast = xlen - 1
     nlen      = wordLength needle
     z         = foldChunks fin 0 needle
-        where fin _ (T.Text narr noff nlen) = A.unsafeIndex narr (noff+nlen-1)
+        where fin _ (T.Text farr foff flen) = A.unsafeIndex farr (foff+flen-1)

tests/Properties.hs

 import Test.Framework (defaultMain, testGroup)
 import Test.Framework.Providers.QuickCheck (testProperty)
 import Data.Text.Search (indices)
+import qualified Data.Text.Lazy.Search as S (indices)
 import qualified SlowFunctions as Slow
 
 import QuickCheckUtils (NotEmpty(..), small)
 t_zipWith c s     = L.zipWith c s `eqP` (unpackS . T.zipWith c (packS s))
 tl_zipWith c s    = L.zipWith c s `eqP` (unpackS . TL.zipWith c (packS s))
 
+slow_indices = unsquare (\s -> (map fromIntegral . Slow.indices (T.pack s)) `eq` (Slow.lazyIndices (TL.pack s) . TL.fromChunks . (:[])))
 t_indices = unsquare (\s -> Slow.indices s `eq` indices s)
+tl_indices = unsquare (\s -> Slow.lazyIndices s `eq` S.indices s)
 t_indices_occurs t = unsquare (\ts -> let s = T.intercalate t ts
                                       in Slow.indices t s == indices t s)
 
     testProperty "t_findIndex" t_findIndex,
     testProperty "t_count" t_count,
     testProperty "tl_count" tl_count,
+    testProperty "slow_indices" slow_indices,
     testProperty "t_indices" t_indices,
+    testProperty "tl_indices" tl_indices,
     testProperty "t_indices_occurs" t_indices_occurs
   ],
 

tests/SlowFunctions.hs

 module SlowFunctions
     (
       indices
+    , lazyIndices
     , split
     ) where
 
+import Data.Int (Int64)
 import qualified Data.Text as T
+import qualified Data.Text.Lazy as L
 import Data.Text.Internal (Text(..))
 import Data.Text.Unsafe (iter_, unsafeHead, unsafeTail)
 
            where t = Text harr (hoff+i) (hlen-i)
                  d = iter_ haystack i
 
+lazyIndices :: L.Text -> L.Text -> [Int64]
+lazyIndices needle haystack
+    | L.null needle = []
+    | otherwise = scan 0 haystack
+  where
+    scan !i hay
+        | L.null hay                = []
+        | needle `L.isPrefixOf` hay = i : scan (i+n) (L.drop n hay)
+        | otherwise                 = scan (i+1) (L.tail hay)
+    n = L.length needle
+
 split :: Text                   -- ^ Text to split on
       -> Text                   -- ^ Input text
       -> [Text]