Source

stringsearch / Data / ByteString / Lazy / Search / DFA.hs

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
{-# LANGUAGE BangPatterns #-}
-- |
-- Module         : Data.ByteString.Lazy.Search.DFA
-- Copyright      : Daniel Fischer
-- Licence        : BSD3
-- Maintainer     : Daniel Fischer <daniel.is.fischer@googlemail.com>
-- Stability      : Provisional
-- Portability    : non-portable (BangPatterns)
--
-- Fast search of lazy 'L.ByteString' values. Breaking,
-- splitting and replacing using a deterministic finite automaton.

module Data.ByteString.Lazy.Search.DFA ( -- * Overview
                                         -- $overview

                                         -- ** Complexity and performance
                                         -- $complexity

                                         -- ** Partial application
                                         -- $partial

                                         -- * Finding substrings
                                         indices
                                       , nonOverlappingIndices
                                         -- * Breaking on substrings
                                       , breakOn
                                       , breakAfter
                                       , breakFindAfter
                                         -- * Replacing
                                       , replace
                                         -- * Splitting
                                       , split
                                       , splitKeepEnd
                                       , splitKeepFront
                                       ) where

import Data.ByteString.Search.Internal.Utils (automaton, keep, ldrop, lsplit)
import Data.ByteString.Search.Substitution

import qualified Data.ByteString as S
import qualified Data.ByteString.Lazy as L
import qualified Data.ByteString.Lazy.Internal as LI
import Data.ByteString.Unsafe (unsafeIndex)

import Data.Array.Base (unsafeAt)
--import Data.Array.Unboxed (UArray)

import Data.Bits
import Data.Int (Int64)

-- $overview
--
-- This module provides functions related to searching a substring within
-- a string. The searching algorithm uses a deterministic finite automaton
-- based on the Knuth-Morris-Pratt algorithm.
-- The automaton is implemented as an array of @(patternLength + 1) * &#963;@
-- state transitions, where &#963; is the alphabet size (256), so it is only
-- suitable for short enough patterns, therefore the patterns in this module
-- are required to be strict 'S.ByteString's.
--
-- When searching a pattern in a UTF-8-encoded 'L.ByteString', be aware that
-- these functions work on bytes, not characters, so the indices are
-- byte-offsets, not character offsets.

-- $complexity
--
-- The time and space complexity of the preprocessing phase is
-- /O/(@patternLength * &#963;@).
-- The searching phase is /O/(@targetLength@), each target character is
-- inspected only once.
--
-- In general the functions in this module have about the same performance as
-- the corresponding functions using the Knuth-Morris-Pratt algorithm but
-- are considerably slower than the Boyer-Moore functions. For very short
-- patterns or, in the case of 'indices', patterns with a short period
-- which occur often, however, times are close to or even below the
-- Boyer-Moore times.

-- $partial
--
-- All functions can usefully be partially applied. Given only a pattern,
-- the automaton is constructed only once, allowing efficient re-use.

------------------------------------------------------------------------------
--                            Exported Functions                            --
------------------------------------------------------------------------------

-- | @'indices'@ finds the starting indices of all possibly overlapping
--   occurrences of the pattern in the target string.
--   If the pattern is empty, the result is @[0 .. 'length' target]@.
{-# INLINE indices #-}
indices :: S.ByteString     -- ^ Strict pattern to find
        -> L.ByteString     -- ^ Lazy string to search
        -> [Int64]          -- ^ Offsets of matches
indices !pat = lazySearcher True pat . L.toChunks

-- | @'nonOverlappingIndices'@ finds the starting indices of all
--   non-overlapping occurrences of the pattern in the target string.
--   It is more efficient than removing indices from the list produced
--   by 'indices'.
{-# INLINE nonOverlappingIndices #-}
nonOverlappingIndices :: S.ByteString   -- ^ Strict pattern to find
                      -> L.ByteString   -- ^ Lazy string to search
                      -> [Int64]        -- ^ Offsets of matches
nonOverlappingIndices !pat = lazySearcher False pat . L.toChunks

-- | @'breakOn' pattern target@ splits @target@ at the first occurrence
--   of @pattern@. If the pattern does not occur in the target, the
--   second component of the result is empty, otherwise it starts with
--   @pattern@. If the pattern is empty, the first component is empty.
--   For a non-empty pattern, the first component is generated lazily,
--   thus the first parts of it can be available before the pattern has
--   been found or determined to be absent.
--
-- @
--   'uncurry' 'L.append' . 'breakOn' pattern = 'id'
-- @
breakOn :: S.ByteString  -- ^ Strict pattern to search for
        -> L.ByteString  -- ^ Lazy string to search in
        -> (L.ByteString, L.ByteString)
                         -- ^ Head and tail of string broken at substring
breakOn pat = breaker . L.toChunks
  where
    lbrk = lazyBreaker True pat
    breaker strs = let (f, b) = lbrk strs
                   in (L.fromChunks f, L.fromChunks b)

-- | @'breakAfter' pattern target@ splits @target@ behind the first occurrence
--   of @pattern@. An empty second component means that either the pattern
--   does not occur in the target or the first occurrence of pattern is at
--   the very end of target. If you need to discriminate between those cases,
--   use breakFindAfter.
--   If the pattern is empty, the first component is empty.
--   For a non-empty pattern, the first component is generated lazily,
--   thus the first parts of it can be available before the pattern has
--   been found or determined to be absent.
-- @
--   'uncurry' 'L.append' . 'breakAfter' pattern = 'id'
-- @
breakAfter :: S.ByteString  -- ^ Strict pattern to search for
           -> L.ByteString  -- ^ Lazy string to search in
           -> (L.ByteString, L.ByteString)
                            -- ^ Head and tail of string broken after substring
breakAfter pat = breaker . L.toChunks
  where
    lbrk = lazyBreaker False pat
    breaker strs = let (f, b) = lbrk strs
                   in (L.fromChunks f, L.fromChunks b)

-- | @'breakFindAfter'@ does the same as 'breakAfter' but additionally indicates
--   whether the pattern is present in the target.
--
-- @
--   'fst' . 'breakFindAfter' pat = 'breakAfter' pat
-- @
breakFindAfter :: S.ByteString  -- ^ Strict pattern to search for
               -> L.ByteString  -- ^ Lazy string to search in
               -> ((L.ByteString, L.ByteString), Bool)
                            -- ^ Head and tail of string broken after substring
                            --   and presence of pattern
breakFindAfter pat
  | S.null pat  = \str -> ((L.empty, str), True)
breakFindAfter pat = breaker . L.toChunks
  where
    !patLen = S.length pat
    lbrk = lazyBreaker True pat
    breaker strs = let (f, b) = lbrk strs
                       (f1, b1) = lsplit patLen b
                       mbpat = L.fromChunks f1
                   in ((foldr LI.chunk mbpat f, L.fromChunks b1), not (null b))

-- | @'replace' pat sub text@ replaces all (non-overlapping) occurrences of
--   @pat@ in @text@ with @sub@. If occurrences of @pat@ overlap, the first
--   occurrence that does not overlap with a replaced previous occurrence
--   is substituted. Occurrences of @pat@ arising from a substitution
--   will not be substituted. For example:
--
-- @
--   'replace' \"ana\" \"olog\" \"banana\" = \"bologna\"
--   'replace' \"ana\" \"o\" \"bananana\" = \"bono\"
--   'replace' \"aab\" \"abaa\" \"aaabb\" = \"aabaab\"
-- @
--
--   The result is a lazy 'L.ByteString',
--   which is lazily produced, without copying.
--   Equality of pattern and substitution is not checked, but
--
-- @
--   'replace' pat pat text == text
-- @
--
--   holds (the internal structure is generally different).
--   If the pattern is empty but not the substitution, the result
--   is equivalent to (were they 'String's) @cycle sub@.
--
--   For non-empty @pat@ and @sub@ a lazy 'L.ByteString',
--
-- @
--   'L.concat' . 'Data.List.intersperse' sub . 'split' pat = 'replace' pat sub
-- @
--
--   and analogous relations hold for other types of @sub@.
replace :: Substitution rep
        => S.ByteString     -- ^ Strict pattern to replace
        -> rep              -- ^ Replacement string
        -> L.ByteString     -- ^ Lazy string to modify
        -> L.ByteString     -- ^ Lazy result
replace pat
  | S.null pat = \sub -> prependCycle sub
  | otherwise =
    let !patLen = S.length pat
        breaker = lazyBreaker True pat
        repl subst strs
          | null strs   = []
          | otherwise   =
            let (pre, mtch) = breaker strs
            in pre ++ case mtch of
                        [] -> []
                        _  -> subst (repl subst (ldrop patLen mtch))
    in \sub -> let {-# NOINLINE subst #-}
                   !subst = substitution sub
                   repl1 = repl subst
               in L.fromChunks . repl1 . L.toChunks


-- | @'split' pattern target@ splits @target@ at each (non-overlapping)
--   occurrence of @pattern@, removing @pattern@. If @pattern@ is empty,
--   the result is an infinite list of empty 'L.ByteString's, if @target@
--   is empty but not @pattern@, the result is an empty list, otherwise
--   the following relations hold (where @patL@ is the lazy 'L.ByteString'
--   corresponding to @pat@):
--
-- @
--   'L.concat' . 'Data.List.intersperse' patL . 'split' pat = 'id',
--   'length' ('split' pattern target) ==
--               'length' ('nonOverlappingIndices' pattern target) + 1,
-- @
--
--   no fragment in the result contains an occurrence of @pattern@.
split :: S.ByteString   -- ^ Strict pattern to split on
      -> L.ByteString   -- ^ Lazy string to split
      -> [L.ByteString] -- ^ Fragments of string
split pat
  | S.null pat  = const (repeat L.empty)
split pat = map L.fromChunks . splitter . L.toChunks
  where
    !patLen = S.length pat
    breaker = lazyBreaker True pat
    splitter strs
      | null strs  = []
      | otherwise  = splitter' strs
    splitter' strs
      | null strs  = [[]]
      | otherwise  =
        case breaker strs of
          (pre, mtch) ->
            pre : case mtch of
                    [] -> []
                    _  -> splitter' (ldrop patLen mtch)

-- | @'splitKeepEnd' pattern target@ splits @target@ after each (non-overlapping)
--   occurrence of @pattern@. If @pattern@ is empty, the result is an
--   infinite list of empty 'L.ByteString's, otherwise the following
--   relations hold:
--
-- @
--   'L.concat' . 'splitKeepEnd' pattern = 'id,'
-- @
--
--   all fragments in the result except possibly the last end with
--   @pattern@, no fragment contains more than one occurrence of @pattern@.
splitKeepEnd :: S.ByteString    -- ^ Strict pattern to split on
             -> L.ByteString    -- ^ Lazy string to split
             -> [L.ByteString]  -- ^ Fragments of string
splitKeepEnd pat
  | S.null pat = const (repeat L.empty)
splitKeepEnd pat = map L.fromChunks . splitter . L.toChunks
  where
    breaker = lazyBreaker False pat
    splitter [] = []
    splitter strs =
      case breaker strs of
        (pre, mtch) -> pre : splitter mtch

-- | @'splitKeepFront'@ is like 'splitKeepEnd', except that @target@ is split
--   before each occurrence of @pattern@ and hence all fragments
--   with the possible exception of the first begin with @pattern@.
--   No fragment contains more than one non-overlapping occurrence
--   of @pattern@.
splitKeepFront :: S.ByteString    -- ^ Strict pattern to split on
               -> L.ByteString    -- ^ Lazy string to split
               -> [L.ByteString]  -- ^ Fragments of string
splitKeepFront pat
  | S.null pat  = const (repeat L.empty)
splitKeepFront pat = map L.fromChunks . splitter . L.toChunks
  where
    !patLen = S.length pat
    breaker = lazyBreaker True pat
    splitter strs = case splitter' strs of
                      ([] : rst) -> rst
                      other -> other
    splitter' []    = []
    splitter' strs  =
      case breaker strs of
        (pre, mtch) ->
          pre : case mtch of
                  [] -> []
                  _  -> case lsplit patLen mtch of
                          (pt, rst) ->
                            if null rst
                              then [pt]
                              else let (h : t) = splitter' rst
                                   in (pt ++ h) : t

------------------------------------------------------------------------------
--                            Searching Function                            --
------------------------------------------------------------------------------

lazySearcher :: Bool -> S.ByteString -> [S.ByteString] -> [Int64]
lazySearcher _ !pat
    | S.null pat        =
      let zgo _ [] = []
          zgo !prior (!str : rest) =
              let !l = S.length str
                  !prior' = prior + fromIntegral l
              in [prior + fromIntegral i | i <- [1 .. l]] ++ zgo prior' rest
      in (0:) . zgo 0
    | S.length pat == 1 =
      let !w = S.head pat
          ixes = S.elemIndices w
          go _ [] = []
          go !prior (!str : rest)
            = let !prior' = prior + fromIntegral (S.length str)
              in map ((+ prior) . fromIntegral) (ixes str) ++ go prior' rest
      in go 0
lazySearcher !overlap pat = search 0 0
  where
    !patLen = S.length pat
    !auto   = automaton pat
    !p0     = unsafeIndex pat 0
    !ams    = if overlap then patLen else 0
    search _ _ [] = []
    search !prior st (!str:rest) = match st 0
      where
        !strLen = S.length str
        {-# INLINE strAt #-}
        strAt :: Int -> Int
        strAt i = fromIntegral (str `unsafeIndex` i)
        match 0 !idx
          | idx == strLen = search (prior + fromIntegral strLen) 0 rest
          | unsafeIndex str idx == p0   = match 1 (idx + 1)
          | otherwise     = match 0 (idx + 1)
        match state idx
          | idx == strLen = search (prior + fromIntegral strLen) state rest
          | otherwise     =
            let nstate = unsafeAt auto ((state `shiftL` 8) + strAt idx)
                !nxtIdx = idx + 1
            in if nstate == patLen
                then (prior + fromIntegral (nxtIdx - patLen)) :
                            match ams nxtIdx
                else match nstate nxtIdx

------------------------------------------------------------------------------
--                                 Breaking                                 --
------------------------------------------------------------------------------

-- Code duplication :(
-- Needed for reasonable performance.
lazyBreaker :: Bool -> S.ByteString -> [S.ByteString]
                    -> ([S.ByteString], [S.ByteString])
lazyBreaker before pat
  | S.null pat  = \strs -> ([], strs)
  | S.length pat == 1 =
    let !w = S.head pat
        !a = if before then 0 else 1
        ixes = S.elemIndices w
        scan [] = ([], [])
        scan (!str:rest) =
            let !strLen = S.length str
            in case ixes str of
                []  -> let (fr, bk) = scan rest in (str : fr, bk)
                (i:_) -> let !j = i + a
                         in if j == strLen
                              then ([str],rest)
                              else ([S.take j str], S.drop j str : rest)
    in scan
lazyBreaker !before pat = bscan [] 0
  where
    !patLen = S.length pat
    !auto   = automaton pat
    !p0     = unsafeIndex pat 0
    bscan _ _ [] = ([], [])
    bscan !past !sta (!str:rest) = match sta 0
      where
        !strLen = S.length str
        {-# INLINE strAt #-}
        strAt :: Int -> Int
        strAt i = fromIntegral (str `unsafeIndex` i)
        match 0 idx
          | idx == strLen =
            let (fr, bk) = bscan [] 0 rest
            in (foldr (flip (.) . (:)) id past (str:fr), bk)
          | unsafeIndex str idx == p0 = match 1 (idx + 1)
          | otherwise = match 0 (idx + 1)
        match state idx
          | idx == strLen =
            let (kp, !rl) = if before
                                then keep state (str:past)
                                else ([], str:past)
                (fr, bk) = bscan kp state rest
            in (foldr (flip (.) . (:)) id rl fr, bk)
          | otherwise =
            let !nstate = unsafeAt auto ((state `shiftL` 8) + strAt idx)
                !nxtIdx = idx + 1
            in if nstate == patLen
                then case if before then nxtIdx - patLen else nxtIdx of
                       0 -> (foldr (flip (.) . (:)) id past [], str:rest)
                       stIx | stIx < 0 -> rgo (-stIx) (str:rest) past
                            | stIx == strLen ->
                              (foldr (flip (.) . (:)) id past [str],rest)
                            | otherwise ->
                              (foldr (flip (.) . (:)) id past
                                    [S.take stIx str], S.drop stIx str : rest)
                else match nstate nxtIdx


-- Did I already mention that I suck at finding names?
{-# INLINE rgo #-}
rgo :: Int -> [S.ByteString] -> [S.ByteString]
    -> ([S.ByteString], [S.ByteString])
rgo !kp acc (!str:more)
  | sl == kp    = (reverse more, str:acc)
  | sl < kp     = rgo (kp - sl) (str:acc) more
  | otherwise   = case S.splitAt (sl - kp) str of
                    (fr, bk) ->
                      (foldr (flip (.) . (:)) id more [fr], bk:acc)
    where
      !sl = S.length str
rgo _ _ [] = error "Not enough past!"
-- If that error is ever encountered, I screwed up badly.