Commits

Grzegorz Chrupała committed caea2a5

Added aline

Comments (0)

Files changed (3)

aline/EditTree.hs

+{-# LANGUAGE NoMonomorphismRestriction #-}
+module EditTree 
+       ( make
+       , apply
+       , check
+       , split3
+       , EditTree(..)
+       )
+where
+
+--import GramLab.Data.CommonSubstrings 
+import Data.List
+import qualified GramLab.Data.StringLike as S
+import Data.Maybe (fromMaybe)
+import Debug.Trace
+import Data.Binary
+import Control.Monad (liftM2,liftM4)
+import Data.Function
+
+make = editTree
+
+data EditTree s a = Split !Int !Int (EditTree s a) (EditTree s a)
+                  | Replace s s
+                    deriving (Eq,Ord,Show,Read)
+editTree w w' = case lcsi w w' of
+                  Nothing -> Replace w w'
+                  Just (i_w,i_w_end,i_w',i_w'_end) -> 
+                      let (w_prefix, w_root, w_suffix)  = split3 w   i_w  i_w_end
+                          (w'_prefix,w'_root, w'_suffix) = split3 w' i_w' i_w'_end
+                      in  Split i_w i_w_end
+                                (editTree w_prefix  w'_prefix)
+                                (editTree w_suffix  w'_suffix)
+
+lcsi w w' = fmap f (lcString w w') 
+    where f (str,(i_w,i_w')) = (i_w,i_w_end,i_w',i_w'_end)
+              where i_w_end  = S.length w  - i_w  - len
+                    i_w'_end = S.length w' - i_w' - len
+                    len      = S.length str
+
+apply (Replace s s') w = s'
+apply (Split i i_end lt rt) w = (apply lt pre) `S.append` root `S.append` (apply rt suf)
+    where (pre,root,suf) = split3 w i i_end
+                                              
+
+split3 w i i_end = let (prefix, rest)  = S.splitAt i w 
+                       (suffix_r, root_r)  = S.splitAt i_end (S.reverse rest)
+                   in (prefix,(S.reverse root_r),(S.reverse suffix_r))
+
+
+check (Replace s s') w       = s == w
+check (Split i j lt rt) w  =      len >= i 
+                               && len >= j 
+                               && check lt w_pre 
+                               && check rt w_suf
+    where len = S.length w 
+          (w_pre,w_root,w_suf) = split3 w i j
+
+
+instance Binary s => Binary (EditTree s a) where
+    put (Replace xs ys)  = put (0::Word8) >> put xs >> put ys
+    put (Split i j lt rt) = put (1::Word8) >> put i >> put j 
+                            >> put lt >> put rt
+    get = do
+      tag <- get
+      case tag::Word8 of
+        0 -> liftM2 Replace get get
+        1 -> liftM4 Split   get get get get
+
+
+lcString xs ys = 
+    case unzip . lcstr xs $ ys of
+    ([],_) -> fail "No common substring"
+    (i:is,j:_) -> return $ (map (xs!!) (i:is),(i,j))
+lcstr xs ys = maximumBy (compare `on` length) . concat . reverse
+              $  [f xs' ysi | xs' <- tails xsi ] 
+              ++ [f xsi ys' | ys' <- drop 1 . tails $ ysi ]
+  where f xs ys = scanl g [] $ zip xs ys
+        g z ((x,i), (y,j)) = if x == y then z ++ [(i,j)] else []
+        xsi = zip xs [0..]
+        ysi = zip ys [0..]
+-----------------------------------------------------------------------------
+-- Memoized version of Simple.hs [1] but still simple stupid and slow. 
+-- foldlcs lcs and ses adapted from Gauche's util.lcs module [2]
+-- [1] http://urchin.earth.li/darcs/ian/lcs/Data/List/LCS/Simple.hs
+-- [2] http://www.shiro.dreamhost.com/scheme/gauche/memo.html
+
+module LCS 
+       ( Edit (Ins,Del)
+       , EditScript
+       , lcs
+       , ses 
+       , apply
+       ) 
+where
+
+import Control.Monad.State
+import qualified Data.Map as Map
+import Prelude hiding (lookup)
+import Data.Maybe (catMaybes)
+import Data.Binary hiding (put,get)
+import qualified Data.Binary as B
+
+data Edit a       = Ins a Int
+                  | Del a Int deriving (Eq,Ord,Show,Read)
+type EditScript a = [Edit a]
+
+instance (Binary a) => Binary (Edit a) where
+    put (Ins a i) = B.put (0::Word8) >> B.put a >> B.put i
+    put (Del a i) = B.put (1::Word8) >> B.put a >> B.put i
+    get = do tag <- getWord8
+             case tag of
+               0 -> liftM2 Ins B.get B.get
+               1 -> liftM2 Del B.get B.get
+
+-- |The 'lcs' function takes two lists and returns a list with a longest
+-- common subsequence of the two.
+lcs :: (Ord a, Eq a) => [a] -> [a] -> [a]
+lcs xs ys = map (\(x,_,_) -> x)  (snd (lcsWithPositions xs ys))
+
+-- |The 'ses' function takes two lists and returns the shortest edit script
+-- which would change the first list into the second
+ses :: (Ord a, Eq a) => [a] -> [a] -> EditScript a
+ses as bs =
+    reverse changes
+        where aOnly a ((aPos,bPos),zs) = ((aPos+1,bPos),(Del a aPos) : zs)
+              bOnly b ((aPos,bPos),zs) = ((aPos,bPos+1),(Ins b aPos) : zs)
+              both  c ((aPos,bPos),zs) = ((aPos+1,bPos+1),zs)
+              (_,changes) = foldlcs aOnly bOnly both ((0,0),[]) as bs
+
+-- |The 'apply' function takes an edit script and a sequence and applies the
+-- changes encoded in the script to the sequence.
+-- The following holds: apply (ses xs ys) xs == ys
+apply :: EditScript a -> [a] -> [a]
+apply s xs = concat $ applyIns s $ applyDel s xs
+
+foldlcs:: Ord b => 
+             (b -> a -> a)
+          -> (b -> a -> a)  
+          -> (b -> a -> a) 
+          -> a 
+          -> [b] 
+          -> [b] 
+          -> a
+foldlcs aOnly bOnly both seed a b =
+    loop common seed a 0 b 0
+    where fold f z xs = foldl (\z x -> f x z) z xs
+          (len,common) = lcsWithPositions a b
+          loop common seed a aPos b bPos 
+              | null common  = fold bOnly (fold aOnly seed a) b
+              | otherwise    = 
+                  let ((e,aOff,bOff):commonTail) = common
+                      aSkip = aOff - aPos
+                      bSkip = bOff - bPos
+                      (aHead,aTail) = splitAt aSkip a
+                      (bHead,bTail) = splitAt bSkip b
+                  in loop commonTail
+                          (both e (fold bOnly (fold aOnly seed aHead) bHead))
+                          (tail aTail)
+                          (aOff + 1)
+                          (tail bTail)
+                          (bOff + 1)
+
+type PosSpec a = (a,Int,Int)
+type LCSMemoTable a =  Map.Map ((Int,[a]),(Int,[a])) (Int,[PosSpec a])               
+
+lcsWithPositions :: (Ord a) => [a] -> [a] -> (Int,[PosSpec a])
+lcsWithPositions xs ys = evalState (lcs_memo (0,xs) (0,ys)) Map.empty 
+
+lcs_memo :: (Ord a) => (Int,[a]) -> (Int,[a]) -> State (LCSMemoTable a) (Int,[PosSpec a])
+lcs_memo xs ys = do
+    d <- get
+    case Map.lookup (xs,ys) d of
+        Nothing -> do
+            r <- lcs' lcs_memo xs ys
+            modify (Map.insert (xs,ys) r)
+            return r
+        Just r -> return r
+ 
+lcs' rec (px,x:xs) (py,y:ys)
+ | x == y = do 
+    (len,zs) <- rec (px+1,xs) (py+1,ys)
+    return (len + 1, (x,px,py):zs)
+ | otherwise = do 
+    r1@(l1, _) <- rec (px,  x:xs) (py+1,ys)
+    r2@(l2, _) <- rec (px+1,xs)   (py,  y:ys)
+    if l1 >= l2 then return r1 else return r2
+lcs' _ (_,[]) _     = return (0, [])
+lcs' _  _    (_,[]) = return (0, [])
+
+
+pos (Del _ i) = i
+pos (Ins _ i) = i
+isIns (Ins _ _) = True
+isIns (Del _ _) = False
+
+applyDel s xs = 
+    applyDel' 0 s' xs
+    where s' = filter (not . isIns) s
+applyIns s xs =
+    applyIns' 0 s' xs
+    where s' = filter isIns s
+
+applyDel' i (e@(Del _ j):es) (x:xs)
+    | i == j    =  [] : applyDel' (i+1) es xs
+    | otherwise =  [x]  : applyDel' (i+1) (e:es) xs
+applyDel' i [] xs = map (:[]) xs
+applyDel' i es [] = []
+
+applyIns' i es (x:xs) = (insertions ++ x) : applyIns' (i+1) es xs
+    where insertions = insertionsAt i es
+applyIns' i es [] = [insertionsAt i es]
+
+insertionsAt i es = concatMap (\(Ins c j) -> if j == i then [c] else []) es
+
+testset = [  ("carbon","cabron"),("otreum","rirom"),("evita","pabita")
+           , ("abcabba","cbabac"),("caballo","lacayo"),("dijeran","decir")
+           , ("joroba","jodieron"),("alzheimer","heimat"),("argentina","argentaria")
+           , ("morir","murieron"),("ordenadores","computadoras"),("toalla","oatmeal")
+          ]
+runtest = mapM_ print $ map (\(a,b) -> ((a,b),"=>",(apply (ses a b) a) == b)) testset
+

aline/StringLike.hs

+{-# LANGUAGE MultiParamTypeClasses , FunctionalDependencies 
+, FlexibleInstances #-}
+module StringLike (StringLike(..))
+where
+import qualified Data.ByteString.Lazy as L
+import qualified Data.ByteString      as B
+import qualified Data.List as List
+import Data.Word
+import Prelude hiding (length,null,tail,init,splitAt,reverse,map,foldl,foldr)
+import qualified Prelude as P
+import Codec.Binary.UTF8.String
+class StringLike seq a | seq -> a where
+    toString   :: seq -> [a]
+    fromString :: [a] -> seq
+    length     :: seq -> Int
+    length     =  P.length . toString
+    null       :: seq -> Bool
+    null       =  P.null .  toString
+    tail       :: seq -> seq 
+    tail       =  fromString . P.tail . toString
+    init       :: seq -> seq
+    init       =  fromString . P.init . toString
+    tails      :: seq -> [seq]
+    tails      =  P.map fromString . List.tails . toString
+    inits      :: seq -> [seq]
+    inits      =  P.map fromString . List.inits . toString
+    splitAt    :: Int -> seq -> (seq,seq)
+    splitAt i w = let (w',w'') = P.splitAt i (toString w) in (fromString w', fromString w'')
+    reverse    :: seq -> seq
+    reverse    =  fromString . P.reverse . toString
+    append     :: seq -> seq -> seq
+    append w w' = fromString (toString w ++ toString w')
+    cons       :: a -> seq -> seq
+    cons x xs  = fromString $ x: toString xs
+    uncons     :: seq -> Maybe (a,seq)
+    uncons xs  = case toString xs of
+                   (x:xs) -> Just (x,fromString xs)
+                   _      -> Nothing
+    map        :: (a -> a) -> seq -> seq
+    map f      = fromString . map f . toString
+
+instance StringLike [a] a where
+    toString = id
+    fromString = id
+
+instance StringLike B.ByteString Char where
+    toString   = decode . B.unpack
+    fromString = B.pack . encode
+    null    = B.null
+    append  = B.append
+
+
+instance StringLike L.ByteString Char where
+    toString   = decode . L.unpack
+    fromString = L.pack . encode
+    null    = L.null
+    append  = L.append