Commits

Bryan O'Sullivan committed d2ce203

More docs, API improvements.

Comments (0)

Files changed (2)

 -- are not created until they are traversed.  Two construction
 -- functions are provided, 'constructWith' for sequences composed of
 -- small alphabets, and 'construct' for larger alphabets.
+--
+-- Estimates are given for performance.  The value /k/ is a constant;
+-- /n/ is the length of a query string; and /t/ is the number of
+-- elements (nodes and leaves) in a suffix tree.
 
 module Data.SuffixTree
     (
     -- * Types
       Alphabet
+    , Edge
     , Prefix
     , STree(..)
 
 
     -- * Querying
     , elem
-    , find
+    , findEdge
+    , findTree
+    , findPath
+    , countLeaves
+    , countRepeats
 
     -- * Traversal
+    , foldr
+    , foldl
     , fold
-    , fold'
 
     -- * Other useful functions
+    , mkPrefix
     , prefix
     , suffixes
     ) where
                    
-import Prelude hiding (elem)
+import Prelude hiding (elem, foldl, foldr)
 import qualified Data.Map as M
-import Data.List (foldl')
 import Control.Arrow (second)
 import qualified Data.ByteString as SB
 import qualified Data.ByteString.Lazy as LB
+import qualified Data.List as L
 import Data.Maybe (listToMaybe, mapMaybe)
 
 -- | The length of a prefix list.  This type is formulated to do cheap
 -- work eagerly (to avoid constructing a pile of deferred thunks),
--- while deferring potentially expensive work.
+-- while deferring potentially expensive work (computing the length of
+-- a list).
 data Length a = Exactly {-# UNPACK #-} !Int
               | Sum {-# UNPACK #-} !Int [a]
               deriving (Show)
 -- input.
 type Alphabet a = [a]
 
--- | The prefix string associated with an 'Edge'.
+-- | The prefix string associated with an 'Edge'.  Use 'mkPrefix' to
+-- create a value of this type, and 'prefix' to deconstruct one.
 newtype Prefix a = Prefix ([a], Length a)
-    deriving (Show)
 
 instance (Eq a) => Eq (Prefix a) where
     a == b = prefix a == prefix b
 
+instance (Ord a) => Ord (Prefix a) where
+    compare a b = compare (prefix a) (prefix b)
+
+instance (Show a) => Show (Prefix a) where
+    show a = "mkPrefix " ++ show (prefix a)
+
 type EdgeFunction a = [[a]] -> (Length a, [[a]])
 
--- | A suffix tree.  The implementation is exposed to ease the
+-- | An edge in the suffix tree.
+type Edge a = (Prefix a, STree a)
+
+-- | /O(1)/. Construct a 'Prefix' value.
+mkPrefix :: [a] -> Prefix a
+mkPrefix xs = Prefix (xs, Sum 0 xs)
+
+pmap :: (a -> b) -> Prefix a -> Prefix b
+pmap f = mkPrefix . map f . prefix
+
+instance Functor Prefix where
+    fmap = pmap
+
+-- | The suffix tree type.  The implementation is exposed to ease the
 -- development of custom traversal functions.  Note that @('Prefix' a,
 -- 'STree' a)@ pairs are not stored in any order.
-data STree a = Node [(Prefix a, STree a)]
+data STree a = Node [Edge a]
              | Leaf
                deriving (Show)
 
--- | Obtain the list stored in a 'Prefix'.
+smap :: (a -> b) -> STree a -> STree b
+smap _ Leaf = Leaf
+smap f (Node es) = Node (map (\(p, t) -> (fmap f p, smap f t)) es)
+
+instance Functor STree where
+    fmap = smap
+
+-- | /O(n)/. Obtain the list stored in a 'Prefix'.
 prefix :: Prefix a -> [a]
 prefix (Prefix (ys, Exactly n)) = take n ys
 prefix (Prefix (ys, Sum n xs)) = tk n ys
     where tk 0 ys = zipWith (const id) xs ys
           tk n (y:ys) = y : tk (n-1) ys
 
--- | /O(n)/. Fold the edges in a tree, from bottom to top.  Suitable
--- for lazy use.
-fold :: (Prefix a -> b -> b) -> b -> STree a -> b
-fold _ z Leaf = z
-fold f z (Node es) = foldr (\(e, t) v -> f e (fold f v t)) z es
+-- | /O(t)/. Folds the edges in a tree, using post-order traversal.
+-- Suitable for lazy use.
+foldr :: (Prefix a -> b -> b) -> b -> STree a -> b
+foldr _ z Leaf = z
+foldr f z (Node es) = L.foldr (\(p,t) v -> f p (foldr f v t)) z es
 
--- | /O(n)/. Fold the edges in a tree, from bottom to top.  Suitable
--- for strict use.
-fold' :: (a -> Prefix b -> a) -> a -> STree b -> a
-fold' _ z Leaf = z
-fold' f z (Node es) = foldl' (\v (e, t) -> f (fold' f v t) e) z es
+-- | /O(t)/. Folds the edges in a tree, using pre-order traversal.  The
+-- step function is evaluated strictly.
+foldl :: (a -> Prefix b -> a)   -- ^ step function (evaluated strictly)
+      -> a                      -- ^ initial state
+      -> STree b
+      -> a
+foldl _ z Leaf = z
+foldl f z (Node es) = L.foldl' (\v (p,t) -> f (foldl f v t) p) z es
 
--- | Increment the length of a prefix.
+-- | /O(t)/. Generic fold over a tree.
+--
+-- A few simple examples.
+--
+-- >countLeaves == fold id id (const const) (1+) 0
+--
+-- >countEdges = fold id id (\_ a _ -> a+1) id 0
+--
+-- This more complicated example generates a tree of the same shape,
+-- but new type, with annotated leaves.
+--
+-- @
+--data GenTree a b = GenNode [('Prefix' a, GenTree a b)]
+--                 | GenLeaf b
+--                   deriving ('Show')
+-- @
+-- 
+-- @
+--gentree :: 'STree' a -> GenTree a Int
+--gentree = 'fold' reset id fprefix reset leaf
+--    where leaf = GenLeaf 1
+--          reset = 'const' leaf
+--          fprefix p t (GenLeaf _) = GenNode [(p, t)]
+--          fprefix p t (GenNode es) = GenNode ((p, t):es)
+-- @
+fold :: (a -> a)                -- ^ downwards state transformer
+     -> (a -> a)                -- ^ upwards state transformer
+     -> (Prefix b -> a -> a -> a) -- ^ edge state transformer
+     -> (a -> a)                -- ^ leaf state transformer
+     -> a                       -- ^ initial state
+     -> STree b                 -- ^ tree
+     -> a
+fold fdown fup fprefix fleaf = go
+    where go v Leaf = fleaf v
+          go v (Node es) = fup (L.foldr edge v es)
+          edge (p, t) v = fprefix p (go (fdown v) t) v
+
+-- | Increments the length of a prefix.
 inc :: Length a -> Length a
 inc (Exactly n) = Exactly (n+1)
 inc (Sum n xs)  = Sum (n+1) xs
                            (cpl,ssr) <- [edge n]]
           clusterBy ss a = [cs | c:cs <- ss, c == a]
 
--- | Return all non-empty suffixes of the argument, longest first.
--- Behaves as follows:
+-- | /O(n)/. Returns all non-empty suffixes of the argument, longest
+-- first.  Behaves as follows:
 --
 -- >suffixes xs == init (tails xs)
 suffixes :: [a] -> [[a]]
                            (cpl,ssr) <- [edge n]]
 
 suffixMap :: Ord a => [[a]] -> [(a, [[a]])]
-suffixMap = map (second reverse) . M.toList . foldl' step M.empty
+suffixMap = map (second reverse) . M.toList . L.foldl' step M.empty
     where step m (x:xs) = M.alter (f xs) x m
           step m _ = m
           f x Nothing = Just [x]
                              -> STree LB.ByteString #-}
 {-# SPECIALISE constructWith :: (Eq a) => [[a]] -> [[a]] -> STree [a] #-}
 
--- | /O(k n log n)/.  Construct a suffix tree using the given
+-- | /O(k n log n)/.  Constructs a suffix tree using the given
 -- alphabet.  The performance of this function is linear in the size
 -- /k/ of the alphabet.  That makes this function suitable for small
 -- alphabets, such as DNA nucleotides.  For an alphabet containing
 {-# SPECIALISE construct :: [LB.ByteString] -> STree LB.ByteString #-}
 {-# SPECIALISE construct :: (Ord a) => [[a]] -> STree [a] #-}
 
--- | /O(n log n)/.  Construct a suffix tree.
+-- | /O(n log n)/.  Constructs a suffix tree.
 construct :: (Ord a) => [a] -> STree a
 construct = lazyTree cst
 
 suffix :: (Eq a) => [a] -> [a] -> Maybe [a]
-suffix (l:ls) (x:xs) | l == x = suffix ls xs
+suffix (p:ps) (x:xs) | p == x = suffix ps xs
                      | otherwise = Nothing
 suffix _ xs = Just xs
 
 {-# SPECIALISE elem :: [LB.ByteString] -> STree LB.ByteString -> Bool #-}
 {-# SPECIALISE elem :: (Eq a) => [[a]] -> STree [a] -> Bool #-}
 
--- | /O(n)/.  Indicate the suffix tree contains the given subsequence.
--- Performance is linear in the length of the subsequence.
+-- | /O(n)/.  Indicates whether the suffix tree contains the given
+-- sublist.  Performance is linear in the length /n/ of the
+-- sublist.
 elem :: (Eq a) => [a] -> STree a -> Bool
 elem [] _ = True
 elem _ Leaf = False
 elem xs (Node es) = any pfx es
     where pfx (e, t) = maybe False (`elem` t) (suffix (prefix e) xs)
 
-{-# SPECIALISE find :: [Char] -> STree Char
-                    -> Maybe (Prefix Char, STree Char) #-}
-{-# SPECIALISE find :: [[Char]] -> STree [Char]
-                    -> Maybe (Prefix [Char], STree [Char]) #-}
-{-# SPECIALISE find :: [SB.ByteString] -> STree SB.ByteString
-                    -> Maybe (Prefix SB.ByteString, STree SB.ByteString) #-}
-{-# SPECIALISE find :: [LB.ByteString] -> STree LB.ByteString
-                    -> Maybe (Prefix LB.ByteString, STree LB.ByteString) #-}
-{-# SPECIALISE find :: (Eq a) => [[a]] -> STree [a]
-                    -> Maybe (Prefix [a], STree [a]) #-}
+{-# SPECIALISE findEdge :: [Char] -> STree Char
+                        -> Maybe (Edge Char, Int) #-}
+{-# SPECIALISE findEdge :: [String] -> STree String
+                        -> Maybe (Edge String, Int) #-}
+{-# SPECIALISE findEdge :: [SB.ByteString] -> STree SB.ByteString
+                        -> Maybe (Edge SB.ByteString, Int) #-}
+{-# SPECIALISE findEdge :: [ LB.ByteString] -> STree LB.ByteString
+                        -> Maybe (Edge LB.ByteString, Int) #-}
+{-# SPECIALISE findEdge :: (Eq a) => [[a]] -> STree [a]
+                        -> Maybe (Edge [a], Int) #-}
 
--- | /O(n)/.  Return the portion of the suffix tree at which the given
--- subsequence is located.  If the subsequence is not found, return
--- 'Nothing'.
-find :: (Eq a) => [a] -> STree a -> Maybe (Prefix a, STree a)
-find _ Leaf = Nothing
-find xs (Node es) = listToMaybe (mapMaybe pfx es)
-    where pfx p@(e, t) = suffix (prefix e) xs >>= \suf ->
+-- | /O(n)/.  Finds the given subsequence in the suffix tree.  On
+-- failure, returns 'Nothing'.  On success, returns the 'Edge' in the
+-- suffix tree at which the subsequence ends, along with the number of
+-- elements to drop from the prefix of the 'Edge' to get the \"real\"
+-- remaining prefix.
+--
+-- Here is an example:
+--
+-- >> find "ssip" (construct "mississippi")
+-- >Just ((mkPrefix "ppi",Leaf),1)
+--
+-- This indicates that the edge @('mkPrefix' \"ppi\",'Leaf')@ matches,
+-- and that we must strip 1 character from the string @\"ppi\"@ to get
+-- the remaining prefix string @\"pi\"@.
+--
+-- Performance is linear in the length /n/ of the query list.
+findEdge :: (Eq a) => [a] -> STree a -> Maybe (Edge a, Int)
+findEdge _ Leaf = Nothing
+findEdge xs (Node es) = listToMaybe (mapMaybe pfx es)
+    where pfx e@(p, t) = let p' = prefix p
+                         in suffix p' xs >>= \suf ->
             case suf of
-              [] -> return p
-              s -> find s t
+              [] -> return (e, length (zipWith const xs p'))
+              s -> findEdge s t
+
+-- | /O(n)/. Finds the subtree rooted at the end of the given query
+-- sequence.  On failure, returns 'Nothing'.
+--
+-- Performance is linear in the length /n/ of the query list.
+findTree :: (Eq a) => [a] -> STree a -> Maybe (STree a)
+findTree s t = (snd . fst) `fmap` findEdge s t
+
+-- | /O(n)/. Returns the path from the 'Edge' in the suffix tree at
+-- which the given subsequence ends, up to the root of the tree.  If
+-- the subsequence is not found, returns the empty list.
+--
+-- Performance is linear in the length of the query list.
+findPath :: (Eq a) => [a] -> STree a -> [Edge a]
+findPath = go []
+    where go _ _ Leaf = []
+          go me xs (Node es) = pfx me es
+              where pfx _ [] = []
+                    pfx me (e@(p, t):es) =
+                        case suffix (prefix p) xs of
+                          Nothing -> pfx me es
+                          Just [] -> e:me
+                          Just s -> go (e:me) s t
+
+-- | /O(t)/. Count the number of leaves in a tree.
+--
+-- Performance is linear in the number /t/ of elements in the tree.
+countLeaves :: STree a -> Int
+countLeaves Leaf = 1
+countLeaves (Node es) = L.foldl' (\v (_, t) -> countLeaves t + v) 0 es
+
+-- | /O(n + r)/. Count the number of times a sequence is repeated
+-- in the input sequence that was used to construct the suffix tree.
+--
+-- Performance is linear in the length /n/ of the input sequence, plus
+-- the number of times /r/ the sequence is repeated.
+countRepeats :: (Eq a) => [a] -> STree a -> Int
+countRepeats s t = maybe 0 countLeaves (findTree s t)
 module Main where
 import qualified Data.SuffixTree as T
 import Data.Char (ord)
+import Data.List (inits)
 import Data.Maybe (isJust)
 import Test.QuickCheck
 
 
 deepCheck p = check (defaultConfig {configMaxTest = 1000}) p
 
-prop_allsufs :: [Char] -> Bool
-prop_allsufs s = let t = T.construct s
-                 in all (`T.elem` t) (T.suffixes s)
+prop_allsuffixes :: [Char] -> Bool
+prop_allsuffixes s = let t = T.construct s
+                     in all (`T.elem` t) (T.suffixes s)
 
-prop_find_eq_elem :: [Char] -> Bool
-prop_find_eq_elem s = all find_eq_elem (T.suffixes s)
+prefixes = tail . inits
+
+prop_allinfixes :: [Char] -> Bool
+prop_allinfixes s = let t = T.construct s
+                    in all (\suf -> all (`T.elem` t) (prefixes suf)) (T.suffixes s)
+
+prop_findEdge_eqv_elem :: [Char] -> Bool
+prop_findEdge_eqv_elem s = all check (T.suffixes s)
     where t = T.construct s
-          find_eq_elem s = isJust (T.find s t) == T.elem s t
+          check s = isJust (T.findEdge s t) == T.elem s t
+
+prop_findEdge_eqv_findTree :: [Char] -> Bool
+prop_findEdge_eqv_findTree s = all check (T.suffixes s)
+    where t = T.construct s
+          check s = isJust (T.findEdge s t) == isJust (T.findTree s t)
+
+prop_findEdge_eqv_findPath :: [Char] -> Bool
+prop_findEdge_eqv_findPath s = all check (T.suffixes s)
+    where t = T.construct s
+          check s = isJust (T.findEdge s t) == (not . null) (T.findPath s t)
 
 main = do
-  deepCheck prop_allsufs
-  deepCheck prop_find_eq_elem
+  deepCheck prop_allsuffixes
+  quickCheck prop_allinfixes
+  deepCheck prop_findEdge_eqv_elem
+  deepCheck prop_findEdge_eqv_findTree
+  deepCheck prop_findEdge_eqv_findPath