Commits

Bryan O'Sullivan  committed ca46082

Docs and such.

  • Participants
  • Parent commits 3b5ded3

Comments (0)

Files changed (6)

File Data/SuffixTree.hs

 -- >  import Data.SuffixTree (STree)
 -- >  import qualified Data.SuffixTree as T
 --
--- The implementation is based on the first one described in the
+-- The implementation is based on the first of those described in the
 -- following paper:
 -- 
 --   * Robert Giegerich and Stefan Kurtz, \"/A comparison of
 --     imperative and purely functional suffix tree constructions/\",
 --     Science of Computer Programming 25(2-3):187-218, 1995,
 --     <http://citeseer.ist.psu.edu/giegerich95comparison.html>
+-- 
+-- This implementation constructs the suffix tree lazily, so subtrees
+-- are not created until they are traversed.  Two construction
+-- functions are provided, 'constructWith' for sequences composed of
+-- small alphabets, and 'construct' for larger alphabets.
 
 module Data.SuffixTree
-    ( Edge
-    , Length
+    (
+    -- * Types
+      Alphabet
+    , Prefix
     , STree(..)
+
+    -- * Construction
     , constructWith
     , construct
+
+    -- * Querying
     , elem
+    , find
+
+    -- * Traversal
     , fold
-    , length
+    , fold'
+
+    -- * Other useful functions
+    , prefix
     , suffixes
-    , take
     ) where
                    
-import Prelude hiding (elem, length, take)
+import Prelude hiding (elem)
 import qualified Data.Map as M
 import Data.List (foldl')
 import Control.Arrow (second)
 import qualified Data.ByteString as SB
 import qualified Data.ByteString.Lazy as LB
-import qualified Data.List as L
+import Data.Maybe (listToMaybe, mapMaybe)
 
+-- | The length of a prefix list.  This type is formulated to do cheap
+-- work eagerly (to avoid constructing a pile of deferred thunks),
+-- while deferring potentially expensive work.
 data Length a = Exactly {-# UNPACK #-} !Int
               | Sum {-# UNPACK #-} !Int [a]
               deriving (Show)
 
-length :: Length a -> Int
-length (Exactly n) = n
-length (Sum n xs) = n + L.length xs
+-- | The list of symbols that 'constructWith' can possibly see in its
+-- input.
+type Alphabet a = [a]
 
+-- | The prefix string associated with an 'Edge'.
+newtype Prefix a = Prefix ([a], Length a)
+    deriving (Show)
+
+instance (Eq a) => Eq (Prefix a) where
+    a == b = prefix a == prefix b
+
+type EdgeFunction a = [[a]] -> (Length a, [[a]])
+
+-- | A suffix tree.  The implementation is exposed to ease the
+-- development of custom traversal functions.  Note that @('Prefix' a,
+-- 'STree' a)@ pairs are not stored in any order.
+data STree a = Node [(Prefix a, STree a)]
+             | Leaf
+               deriving (Show)
+
+-- | Obtain the list stored in a 'Prefix'.
+prefix :: Prefix a -> [a]
+prefix (Prefix (ys, Exactly n)) = take n ys
+prefix (Prefix (ys, Sum n xs)) = tk n ys
+    where tk 0 ys = zipWith (const id) xs ys
+          tk n (y:ys) = y : tk (n-1) ys
+
+-- | /O(n)/. Fold the edges in a tree, from bottom to top.  Suitable
+-- for lazy use.
+fold :: (Prefix a -> b -> b) -> b -> STree a -> b
+fold _ z Leaf = z
+fold f z (Node es) = foldr (\(e, t) v -> f e (fold f v t)) z es
+
+-- | /O(n)/. Fold the edges in a tree, from bottom to top.  Suitable
+-- for strict use.
+fold' :: (a -> Prefix b -> a) -> a -> STree b -> a
+fold' _ z Leaf = z
+fold' f z (Node es) = foldl' (\v (e, t) -> f (fold' f v t) e) z es
+
+-- | Increment the length of a prefix.
 inc :: Length a -> Length a
 inc (Exactly n) = Exactly (n+1)
 inc (Sum n xs)  = Sum (n+1) xs
 
-take :: Length a -> [a] -> [a]
-take (Exactly n) = L.take n
-take (Sum n xs) = tk n
-    where tk 0 ys = zipWith (const id) xs ys
-          tk n (y:ys) = y : tk (n-1) ys
-
-type Edge a = ([a], Length a)
-
-type EdgeFunction a = [[a]] -> (Length a, [[a]])
-
-data STree a = Node [(Edge a, STree a)]
-             | Leaf
-               deriving (Show)
-
-fold :: ([a] -> b -> b) -> b -> STree a -> b
-fold _ z Leaf = z
-fold f z (Node es) = foldr (\ ((l, n), t) v -> f (take n l) (fold f v t)) z es
-
-lazyTreeWith :: (Eq a) => EdgeFunction a -> [a] -> [a] -> STree a
+lazyTreeWith :: (Eq a) => EdgeFunction a -> Alphabet a -> [a] -> STree a
 lazyTreeWith edge alphabet = suf . suffixes
     where suf [[]] = Leaf
-          suf ss = Node [((a:sa, inc cpl), suf ssr)
+          suf ss = Node [(Prefix (a:sa, inc cpl), suf ssr)
                          | a <- alphabet,
                            n@(sa:_) <- [ss `clusterBy` a],
                            (cpl,ssr) <- [edge n]]
           clusterBy ss a = [cs | c:cs <- ss, c == a]
 
+-- | Return all non-empty suffixes of the argument, longest first.
+-- Behaves as follows:
+--
+-- >suffixes xs == init (tails xs)
 suffixes :: [a] -> [[a]]
 suffixes xs@(_:xs') = xs : suffixes xs'
 suffixes _ = []
 lazyTree :: (Ord a) => EdgeFunction a -> [a] -> STree a
 lazyTree edge = suf . suffixes
     where suf [[]] = Leaf
-          suf ss = Node [((a:sa, inc cpl), suf ssr)
+          suf ss = Node [(Prefix (a:sa, inc cpl), suf ssr)
                          | (a, n@(sa:_)) <- suffixMap ss,
                            (cpl,ssr) <- [edge n]]
 
 cst [s] = (Sum 0 s, [[]])
 cst awss@((a:w):ss)
     | null [c | c:_ <- ss, a /= c] = let cpl' = inc cpl
-                                     in cpl' `seq` (cpl', rss)
+                                     in seq cpl' (cpl', rss)
     | otherwise = (Exactly 0, awss)
     where (cpl, rss) = cst (w:[u | _:u <- ss])
 
 {-# SPECIALISE constructWith :: [LB.ByteString] -> [LB.ByteString]
                              -> STree LB.ByteString #-}
 {-# SPECIALISE constructWith :: (Eq a) => [[a]] -> [[a]] -> STree [a] #-}
-constructWith :: (Eq a) => [a] -> [a] -> STree a
+
+-- | /O(k n log n)/.  Construct a suffix tree using the given
+-- alphabet.  The performance of this function is linear in the size
+-- /k/ of the alphabet.  That makes this function suitable for small
+-- alphabets, such as DNA nucleotides.  For an alphabet containing
+-- more than a few symbols, 'construct' is usually several orders of
+-- magnitude faster.
+constructWith :: (Eq a) => Alphabet a -> [a] -> STree a
 constructWith = lazyTreeWith cst
 
 {-# SPECIALISE construct :: [Char] -> STree Char #-}
 {-# SPECIALISE construct :: [SB.ByteString] -> STree SB.ByteString #-}
 {-# SPECIALISE construct :: [LB.ByteString] -> STree LB.ByteString #-}
 {-# SPECIALISE construct :: (Ord a) => [[a]] -> STree [a] #-}
+
+-- | /O(n log n)/.  Construct a suffix tree.
 construct :: (Ord a) => [a] -> STree a
 construct = lazyTree cst
 
+suffix :: (Eq a) => [a] -> [a] -> Maybe [a]
+suffix (l:ls) (x:xs) | l == x = suffix ls xs
+                     | otherwise = Nothing
+suffix _ xs = Just xs
+
 {-# SPECIALISE elem :: [Char] -> STree Char -> Bool #-}
 {-# SPECIALISE elem :: [[Char]] -> STree [Char] -> Bool #-}
 {-# SPECIALISE elem :: [SB.ByteString] -> STree SB.ByteString -> Bool #-}
 {-# SPECIALISE elem :: [LB.ByteString] -> STree LB.ByteString -> Bool #-}
 {-# SPECIALISE elem :: (Eq a) => [[a]] -> STree [a] -> Bool #-}
+
+-- | /O(n)/.  Indicate the suffix tree contains the given subsequence.
+-- Performance is linear in the length of the subsequence.
 elem :: (Eq a) => [a] -> STree a -> Bool
 elem [] _ = True
 elem _ Leaf = False
-elem xs (Node es) = any prefix es
-    where prefix ((l, n), t) = maybe False (`elem` t) (rsuf (take n l) xs)
-          rsuf (l:ls) (x:xs) | l == x = rsuf ls xs
-                             | otherwise = Nothing
-          rsuf _ xs = Just xs
+elem xs (Node es) = any pfx es
+    where pfx (e, t) = maybe False (`elem` t) (suffix (prefix e) xs)
+
+{-# SPECIALISE find :: [Char] -> STree Char
+                    -> Maybe (Prefix Char, STree Char) #-}
+{-# SPECIALISE find :: [[Char]] -> STree [Char]
+                    -> Maybe (Prefix [Char], STree [Char]) #-}
+{-# SPECIALISE find :: [SB.ByteString] -> STree SB.ByteString
+                    -> Maybe (Prefix SB.ByteString, STree SB.ByteString) #-}
+{-# SPECIALISE find :: [LB.ByteString] -> STree LB.ByteString
+                    -> Maybe (Prefix LB.ByteString, STree LB.ByteString) #-}
+{-# SPECIALISE find :: (Eq a) => [[a]] -> STree [a]
+                    -> Maybe (Prefix [a], STree [a]) #-}
+
+-- | /O(n)/.  Return the portion of the suffix tree at which the given
+-- subsequence is located.  If the subsequence is not found, return
+-- 'Nothing'.
+find :: (Eq a) => [a] -> STree a -> Maybe (Prefix a, STree a)
+find _ Leaf = Nothing
+find xs (Node es) = listToMaybe (mapMaybe pfx es)
+    where pfx p@(e, t) = suffix (prefix e) xs >>= \suf ->
+            case suf of
+              [] -> return p
+              s -> find s t
+Copyright (c) Bryan O'Sullivan 2007.
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+1. Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+3. Neither the name of the author nor the names of his contributors
+   may be used to endorse or promote products derived from this software
+   without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+SUCH DAMAGE.

File examples/Tiny.hs

+module Main where
+
+import qualified Data.SuffixTree as S
+import System.Environment (getArgs)
+
+main = do
+  [fileName, cons] <- getArgs
+  let ctor = case cons of
+               "1" -> S.constructWith [minBound..maxBound]
+               "2" -> S.construct
+  tree <- ctor `fmap` readFile fileName
+  putStrLn (show (S.fold (const (1+)) 0 tree) ++ " edges")
+  (lines `fmap` getContents) >>= mapM_ (print . (`S.elem` tree))

File suffixtree.cabal

 Name:                suffixtree
 Version:             0.1
-Description:         A suffix tree implementation
+Description:         An efficient, lazy suffix tree implementation
 License:             BSD3
 License-File:        LICENSE
 Author:              Bryan O'Sullivan <bos@serpentine.com>
 Build-Depends:       base, QuickCheck
 -- add fps for ghc 6.4.2
 Exposed-modules:     Data.SuffixTree
-ghc-options:         -W -O2 -optc-O3 -funbox-strict-fields
+ghc-options:         -W -O2 -optc-O3 -funbox-strict-fields -fno-warn-incomplete-patterns
 

File tests/Makefile

+ghc := ghc
+ghcflags := $(shell awk -F: '/ghc-options/{print $$2}' ../suffixtree.cabal)
+
+tests:
+	runhaskell SuffixCheck
+
+inplace:
+	cd .. && $(ghc) $(ghcflags) --make Data/*.hs

File tests/SuffixCheck.hs

+module Main where
+import qualified Data.SuffixTree as T
+import Data.Char (ord)
+import Data.Maybe (isJust)
+import Test.QuickCheck
+
+instance Arbitrary Char where
+  arbitrary     = choose ('a', 'z')
+  coarbitrary c = variant (ord c `rem` 4)
+
+deepCheck p = check (defaultConfig {configMaxTest = 1000}) p
+
+prop_allsufs :: [Char] -> Bool
+prop_allsufs s = let t = T.construct s
+                 in all (`T.elem` t) (T.suffixes s)
+
+prop_find_eq_elem :: [Char] -> Bool
+prop_find_eq_elem s = all find_eq_elem (T.suffixes s)
+    where t = T.construct s
+          find_eq_elem s = isJust (T.find s t) == T.elem s t
+
+main = do
+  deepCheck prop_allsufs
+  deepCheck prop_find_eq_elem