Source

text / Data / Text / Lazy / Search.hs

The default branch has multiple heads

{-# LANGUAGE BangPatterns, ScopedTypeVariables #-}

-- |
-- Module      : Data.Text.Lazy.Search
-- Copyright   : (c) 2009, 2010 Bryan O'Sullivan
--
-- License     : BSD-style
-- Maintainer  : bos@serpentine.com, rtomharper@googlemail.com,
--               duncan@haskell.org
-- Stability   : experimental
-- Portability : GHC
--
-- Fast substring search for lazy 'Text', based on work by Boyer,
-- Moore, Horspool, Sunday, and Lundh.  Adapted from the strict
-- implementation.

module Data.Text.Lazy.Search
    (
      indices
    ) where

import qualified Data.Text.Array as A
import Data.Int (Int64)
import Data.Word (Word16, Word64)
import qualified Data.Text.Internal as T
import Data.Text.Fusion.Internal (PairS(..))
import Data.Text.Lazy.Internal (Text(..), foldlChunks)
import Data.Bits ((.|.), (.&.))
import Data.Text.UnsafeShift (shiftL)

-- | /O(n+m)/ Find the offsets of all non-overlapping indices of
-- @needle@ within @haystack@.
--
-- This function is strict in @needle@, and lazy (as far as possible)
-- in the chunks of @haystack@.
--
-- In (unlikely) bad cases, this algorithm's complexity degrades
-- towards /O(n*m)/.
indices :: Text              -- ^ Substring to search for (@needle@)
        -> Text              -- ^ Text to search in (@haystack@)
        -> [Int64]
indices needle@(Chunk n ns) _haystack@(Chunk k ks)
    | nlen <= 0  = []
    | nlen == 1  = indicesOne (nindex 0) 0 k ks
    | otherwise  = advance k ks 0 0
  where
    advance x@(T.Text _ _ l) xs = scan
     where
      scan !g !i
         | i >= m = case xs of
                      Empty           -> []
                      Chunk y ys      -> advance y ys g (i-m)
         | lackingHay (i + nlen) x xs  = []
         | c == z && candidateMatch 0  = g : scan (g+nlen) (i+nlen)
         | otherwise                   = scan (g+delta) (i+delta)
       where
         m = fromIntegral l
         c = hindex (i + nlast)
         delta | nextInPattern = nlen + 1
               | c == z        = skip + 1
               | otherwise     = 1
         nextInPattern         = mask .&. swizzle (hindex (i+nlen)) == 0
         candidateMatch !j
             | j >= nlast               = True
             | hindex (i+j) /= nindex j = False
             | otherwise                = candidateMatch (j+1)
         hindex                         = index x xs
    nlen      = wordLength needle
    nlast     = nlen - 1
    nindex    = index n ns
    z         = foldlChunks fin 0 needle
        where fin _ (T.Text farr foff flen) = A.unsafeIndex farr (foff+flen-1)
    (mask :: Word64) :*: skip = buildTable n ns 0 0 0 (nlen-2)
    swizzle w = 1 `shiftL` (fromIntegral w .&. 0x3f)
    buildTable (T.Text xarr xoff xlen) xs = go
      where
        go !(g::Int64) !i !msk !skp
            | i >= xlast = case xs of
                             Empty      -> (msk .|. swizzle z) :*: skp
                             Chunk y ys -> buildTable y ys g 0 msk' skp'
            | otherwise = go (g+1) (i+1) msk' skp'
            where c                = A.unsafeIndex xarr (xoff+i)
                  msk'             = msk .|. swizzle c
                  skp' | c == z    = nlen - g - 2
                       | otherwise = skp
                  xlast = xlen - 1
    -- | Check whether an attempt to index into the haystack at the
    -- given offset would fail.
    lackingHay q = go 0
      where
        go p (T.Text _ _ l) ps = p' < q && case ps of
                                             Empty      -> True
                                             Chunk r rs -> go p' r rs
            where p' = p + fromIntegral l
indices _ _ = []

-- | Fast index into a partly unpacked 'Text'.  We take into account
-- the possibility that the caller might try to access one element
-- past the end.
index :: T.Text -> Text -> Int64 -> Word16
index (T.Text arr off len) xs !i
    | j < len   = A.unsafeIndex arr (off+j)
    | otherwise = case xs of
                    Empty
                        -- out of bounds, but legal
                        | j == len  -> 0
                        -- should never happen, due to lackingHay above
                        | otherwise -> emptyError "index"
                    Chunk c cs -> index c cs (i-fromIntegral len)
    where j = fromIntegral i

-- | A variant of 'indices' that scans linearly for a single 'Word16'.
indicesOne :: Word16 -> Int64 -> T.Text -> Text -> [Int64]
indicesOne c = chunk
  where
    chunk !i (T.Text oarr ooff olen) os = go 0
      where
        go h | h >= olen = case os of
                             Empty      -> []
                             Chunk y ys -> chunk (i+fromIntegral olen) y ys
             | on == c = i + fromIntegral h : go (h+1)
             | otherwise = go (h+1)
             where on = A.unsafeIndex oarr (ooff+h)

-- | The number of 'Word16' values in a 'Text'.
wordLength :: Text -> Int64
wordLength = foldlChunks sumLength 0
    where sumLength i (T.Text _ _ l) = i + fromIntegral l

emptyError :: String -> a
emptyError fun = error ("Data.Text.Lazy.Search." ++ fun ++ ": empty input")
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.