Commits

Bryan O'Sullivan committed 6f6b73e

Finish off the search code, and hide it.

Comments (0)

Files changed (3)

Data/Text/Search.hs

 {-# LANGUAGE BangPatterns, ScopedTypeVariables #-}
 
-module Data.Text.Search where
+-- |
+-- Module      : Data.Text.Search
+-- Copyright   : (c) Bryan O'Sullivan 2009
+--
+-- License     : BSD-style
+-- Maintainer  : bos@serpentine.com, rtharper@aftereternity.co.uk,
+--               duncan@haskell.org
+-- Stability   : experimental
+-- Portability : GHC
+--
+-- Fast substring search for 'Text', based on work by Boyer, Moore,
+-- Horspool, Sunday, and Lundh.
+--
+-- References:
+-- 
+-- * R. S. Boyer, J. S. Moore: A Fast String Searching Algorithm.
+--   Communications of the ACM, 20, 10, 762-772 (1977)
+--
+-- * R. N. Horspool: Practical Fast Searching in Strings.  Software -
+--   Practice and Experience 10, 501-506 (1980)
+--
+-- * D. M. Sunday: A Very Fast Substring Search Algorithm.
+--   Communications of the ACM, 33, 8, 132-142 (1990)
+--
+-- * F. Lundh: The Fast Search Algorithm.
+--   <http://effbot.org/zone/stringlib.htm> (2006)
+module Data.Text.Search
+    (
+      indices
+    , slowIndices
+    ) where
 
-import qualified Data.Text as T ()
+import qualified Data.Text as T
 import qualified Data.Text.Array as A
 import Data.Word (Word64)
 import Data.Text.Internal (Text(..))
 import Data.Bits ((.|.), (.&.))
 import Data.Text.UnsafeShift (shiftL)
+import Data.Text.Unsafe (iter_)
 
-indices :: Text -> Text -> [Int]
+-- | /O(n+m)/ Find the offsets of all non-overlapping indices of
+-- @needle@ within @haystack@. In (unlikely) bad cases, this
+-- algorithm's complexity degenerates towards /O(n*m)/.
+indices :: Text                -- ^ Substring to search for (@needle@)
+        -> Text                -- ^ Text to search in (@haystack@)
+        -> [Int]
 indices _needle@(Text narr noff nlen) _haystack@(Text harr hoff hlen)
-  | ldiff < 0 = []
-  | otherwise = outer 0
+  | nlen == 1              = scanOne (nindex 0)
+  | nlen <= 0 || ldiff < 0 = []
+  | otherwise              = scan 0
   where
-    ldiff   = hlen - nlen
-    nlast   = nlen - 1
-    z       = nindex nlast
+    ldiff    = hlen - nlen
+    nlast    = nlen - 1
+    z        = nindex nlast
     nindex k = A.unsafeIndex narr (noff+k)
     hindex k = A.unsafeIndex harr (hoff+k)
-    (mask :: Word64, skip :: Int) = buildTable 0 0 (nlen-2)
+    (mask :: Word64, skip) = buildTable 0 0 (nlen-2)
     buildTable !i !msk !skp
         | i >= nlast           = (msk .|. swizzle z, skp)
         | otherwise            = buildTable (i+1) (msk .|. swizzle c) skp'
               skp' | c == z    = nlen - i - 2
                    | otherwise = skp
     swizzle k = 1 `shiftL` (fromEnum k .&. 0x3f)
-    outer !i
-        | i > ldiff = []
-        | c == z    = if candidateMatch 0
-                      then i : outer (i + nlast)
-                      else if nextInPattern
-                           then outer (i + nlen)
-                           else outer (i + skip)
-        | nextInPattern = outer (i + nlen)
-        | otherwise     = outer (i + 1)
-        where c = hindex (i+nlast)
+    scan !i
+        | i > ldiff                  = []
+        | c == z && candidateMatch 0 = i : scan (i + nlast)
+        | otherwise                  = scan (i + delta)
+        where c                      = hindex (i + nlast)
               candidateMatch j
                   | j >= nlast               = True
                   | hindex (i+j) /= nindex j = False
                   | otherwise                = candidateMatch (j+1)
-              nextInPattern = (mask .&. swizzle (hindex (i+nlen))) == 0
+              delta
+                  | nextInPattern = nlen + 1
+                  | c == z        = skip + 1
+                  | otherwise     = 1
+              nextInPattern       = (mask .&. swizzle (hindex (i+nlen))) == 0
+    scanOne c = loop 0
+        where loop !i | i >= hlen     = []
+                      | hindex i == c = i : loop (i+1)
+                      | otherwise     = loop (i+1)
+{-# INLINE indices #-}
+
+-- | /O(n*m)/ Find the offsets of all non-overlapping indices of
+-- @needle@ within @haystack@. Provided purely as a slower, reliable
+-- counterpart to 'indices'.
+slowIndices :: Text             -- ^ Substring to search for (@needle@)
+            -> Text             -- ^ Text to search in (@haystack@)
+            -> [Int]
+slowIndices needle@(Text _narr _noff nlen) haystack@(Text harr hoff hlen)
+    | T.null needle = []
+    | otherwise     = scan 0
+  where
+    scan i | i >= hlen = []
+           | needle `T.isPrefixOf` t = i : scan (i+nlen)
+           | otherwise = scan (i+d)
+           where t = Text harr (hoff+i) (hlen-i)
+                 d = iter_ haystack i

tests/Properties.hs

 import System.IO.Unsafe (unsafePerformIO)
 import Test.Framework (defaultMain, testGroup)
 import Test.Framework.Providers.QuickCheck (testProperty)
+import Data.Text.Search
 
 import QuickCheckUtils (NotEmpty(..))
 
 t_zipWith c s     = L.zipWith c s `eqP` (unpackS . T.zipWith c (packS s))
 tl_zipWith c s    = L.zipWith c s `eqP` (unpackS . TL.zipWith c (packS s))
 
+t_indices = unsquare (\s -> slowIndices s `eq` indices s)
+t_indices_occurs t = unsquare (\ts -> let s = T.intercalate t ts
+                                      in slowIndices t s == indices t s)
+
 -- Bit shifts.
 shiftL w = forAll (choose (0,width-1)) $ \k -> Bits.shiftL w k == U.shiftL w k
     where width = round (log (fromIntegral m) / log 2 :: Double)
     testProperty "t_elemIndices" t_elemIndices,
     testProperty "tl_elemIndices" tl_elemIndices,
     testProperty "t_count" t_count,
-    testProperty "tl_count" tl_count
+    testProperty "tl_count" tl_count,
+    testProperty "t_indices" t_indices,
+    testProperty "t_indices_occurs" t_indices_occurs
   ],
 
   testGroup "zips" [
     Data.Text.Foreign
     Data.Text.Lazy
     Data.Text.Lazy.Encoding
-    Data.Text.Search
   other-modules:
     Data.Text.Array
     Data.Text.Encoding.Fusion
     Data.Text.Lazy.Encoding.Fusion
     Data.Text.Lazy.Fusion
     Data.Text.Lazy.Internal
+    Data.Text.Search
     Data.Text.Unsafe
     Data.Text.UnsafeChar
     Data.Text.UnsafeShift
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.