Source

delta-H / src / Main.hs

Full commit
{-# LANGUAGE BangPatterns , OverloadedStrings  #-}
import NLP.Scores (recipRank,mean)
import Entropy.Algorithm
import Text.Printf (printf)
import System.IO
import Reader (Token,readcorpus,format)
import Debug.Trace
import System.Environment 
import Data.Binary (encode,decode,put,get,Binary)
import qualified Data.ByteString.Lazy as B (readFile,writeFile)
import Control.Monad (when)
import qualified Data.Map as Map
import SparseVector (plus)
import qualified Control.Monad.Atom as Atom
import qualified Data.IntMap as IntMap
import Data.Foldable (foldlM)
import Utils (groupsOf,splitWith)
import Data.List (sortBy,foldl')
import Data.Ord (comparing)
import Counts (counts,vi,ari)
import EvalSem (evalSem)
import qualified Data.Text.Lazy as Text

type Txt = Text.Text

main = do
  (command:args) <- getArgs
  case command of
    "learn"   -> learn cluster args
    "learn-beam" -> do 
                 let (k:args') = args 
                 learn (clusterBeam (read k)) args'
    "learn-seeded" -> 
                 do let (n:m:seedf:trainf:_) = args
                    seed'  <- fmap decode $ B.readFile seedf
                              :: IO (ClusterSet (Int,String))
                    let seed = makeClusterSet . prune (read m) . countXY $ seed'
                    hPutStrLn stderr . show . Map.size . countXY $ seed
                    train <- fmap readcorpus $ readFile trainf
                    let xss = groupsOf (read n) train
                        fids = featIDs seed
                        step z (i,xs) = do
                            let cs = cluster False z
                                   . concat
                                   . examples fids 
                                   $ xs
                                tf = (trainf ++ ".learn-seeded." ++ show i 
                                                ++ ".model")
                            hPutStrLn stderr $ "Writing model " ++ tf
                            hFlush stdout
                            B.writeFile tf . encode $ cs
                            return cs
                    cs <- foldlM step seed . zip [1..] $ xss
                    hPutStrLn stderr . show . Map.size . countXY $ cs
                    B.writeFile (trainf ++ ".learn-seeded.model")
                      . encode
                      $ cs
    "learn-intermed" ->
                 do let (n:fids:trainf:_) = args
                    train <- fmap readcorpus $ readFile trainf
                    let xss =   groupsOf (read n) 
                              $ train
                        fs = read fids
                             
                    let step :: ClusterSet (Int,String) -> (Int,[[Token]]) 
                             -> IO (ClusterSet (Int,String))
                        step z (i,xs) = do
                          let cs = cluster False z
                                   . concat
                                   . examples fs 
                                   $ xs
                          printf "%.6f %.6f\n" (weightedhXY cs) (hY cs)
                          hFlush stdout
                          B.writeFile (trainf ++ "." ++ fids ++ ".learn."
                                              ++ show i ++ ".model")
                                          . encode
                                          $ cs
                          let ys = [ clusterWords fs cs . map fst $ x
                                   | x <- xs ]
                              xs' = zipWith (zipWith (\(w,_) y -> (w,y))) xs ys
                          writeFile (trainf ++ "." ++ fids ++ ".learn." 
                                            ++ show i ++ ".labeled")
                                    . format
                                    $ xs'
                          return cs
                    cs <- foldlM step empty . zip [1..] $ xss
                    B.writeFile (trainf ++ "." ++ fids ++ ".learn.model") 
                         . encode 
                         $ cs
                        
    "teach"   -> do let (fids:labelf:_) = args
                    train <-   fmap readcorpus $ readFile labelf
                    let (cs,as) =  teach (read fids) train
                    hPutStrLn stderr . show . Map.size . countXY $ cs
                    B.writeFile (labelf ++ "." ++ fids ++ ".teach.model") 
                         . encode 
                         $ cs
                         {-
                    writeFile (labelf ++ "." ++ fids ++ ".teach.mapping")
                         . unlines
                         . map (\(i,s) -> unwords [show i,s])
                         . IntMap.toList
                         . Atom.from
                         $ as
                        -}

    "display" -> do let (modelf:_) = args
                    cs <- fmap decode $ B.readFile modelf
                    putStr . unlines
                           . map display
                           . Map.toList
                           . countXY 
                           $ cs
    "distribution" -> do let (modelf:_) = args
                         cs <- fmap decode $ B.readFile modelf 
                            :: IO (ClusterSet (Int,String))
                         putStr
                               . unlines
                               . map (\(k,v) -> unwords [show k,show v])
                               . sortBy (comparing snd)
                               . Map.toList
                               . Map.fromListWith (+)
                               . map (\n -> (n,1))
                               . Map.elems
                               . Map.map (Map.fold (+) 0)
                               . countXY
                               $ cs
    "label" ->   do let (foc:backoff:modelf:_) = args
                    cs <- fmap decode $ B.readFile modelf
                    ws <- fmap readcorpus $ getContents
                    let xs =  map (if read foc then  id else map defocus) 
                              . examples (featIDs cs) 
                              $ ws
                        label = if read backoff 
                                then labelToken cs 
                                else fst . head . clusterToken True cs
                        ys = map (map (show . label)) 
                             $ xs
                        xyss = zipWith zip (map (map fst) ws) ys::[[Token]]
                    putStr . format $ xyss
    "eval-mrr"-> do let (full:details:modelf:_) = args
                    cs <- fmap decode $ B.readFile modelf 
                               :: IO (ClusterSet (Int,String))
                    xs <- fmap (concat 
                                . examples (featIDs cs) 
                                . readcorpus) 
                           $ getContents
                    let yss = map ((if read full 
                                    then predictX0Full 
                                    else predictX0) cs) 
                              xs
                        yys = zip (map getX0 xs) $ yss
                        rrs = map (uncurry recipRank) yys
                    when (read details) $
                         do putStr 
                                . unlines 
                                . map (take 120)
                                . map (\(r,(x,xs)) 
                                      -> printf "%-4.5f %-10s %s" r x 
                                         (unwords xs))
                                . zip rrs 
                                $ yys
                    printf "MRR: %.4f\n" . avg $ rrs
    "eval-mrr-gold" -> do let [trainf] = args
                          train <- fmap readcorpus $ readFile trainf
                          let (cs,as) = teach [0] train
                          xys <- fmap (concat . readcorpus) $ getContents
                          let yys = zip (map fst xys)
                                        . map (\k -> 
                                                case fst . Atom.runAtom (Atom.maybeToAtom (snd k)) $ as
                                                of Just y -> 
                                                     clusterLabelToX0 cs y
                                                   Nothing -> [])

                                        $ xys
                          let rrs = recipRanks yys
                          printf "MMR: %.4f\n" . avg $ rrs
    "eval-goldpos" -> do
                 let (testf:goldf:_) = args
                 test <- fmap (map snd . concat . readcorpus) 
                             $ readFile testf
                 gold <- fmap (map snd . concat . readcorpus) 
                             $ readFile goldf
                 let cs = counts . zip gold $ test
                 printf "VI:  %.4f\n" . vi  $ cs
                 printf "ARI: %.4f\n" . ari $ cs
    "eval-sem" -> evalSem args
    
learn f args = do 
  let (fids:trainf:_) = args
  train <-   fmap readcorpus $ readFile trainf
  let xss = concat . examples (read fids) $ train
      cs = f False empty xss
  hPutStrLn stderr . show . Map.size . countXY $ cs
  B.writeFile (trainf ++ "." ++ fids ++ ".learn.model") 
            . encode $ cs

teach :: [Int] -> [[Token]] -> (ClusterSet (Int,String),Atom.AtomTable)
teach fids train = flip Atom.runAtom Atom.empty  $ do
    fmap (makeClusterSet 
          . foldl' (\ z (!k,!x) -> Map.insertWith' plus k x z) Map.empty
          . concat) 
  . flip mapM train $ \s -> 
        do ys' <- mapM (\(x,y) -> Atom.toAtom y) s
           let xs' = ys' == ys' `seq` concat $ examples fids [s]
           return $ zipWith (\y x -> (x == x `seq` y,x))  ys' xs'

                     
prune :: Int 
      -> Map.Map Y (Map.Map (Int,String) Count) 
      -> Map.Map Y (Map.Map (Int,String) Count)
prune m =    Map.fromList 
           . take m 
           . sortBy (flip $ comparing (foldl' (+) 0 . Map.elems . snd))
           . Map.toList

mrr :: (Eq y) => [(y,[y])] -> Double
mrr = mean . recipRanks

recipRanks :: (Eq y) => [(y,[y])] -> [Double]
recipRanks = map (uncurry recipRank)

avg :: [Double] -> Double
avg = mean