Source

delta-H / src / EvalSem.hs

Full commit
module EvalSem 
       (evalSem)
where
import qualified Data.Map as Map
import qualified Data.Set as Set

import Reader (readcorpus,Token)
import Data.List (foldl',inits,isPrefixOf,sortBy)
import Data.Ord (comparing)
import SparseVector (plus,scale)
import Utils (splitOn)
import Data.Char (toLower)
import System.Environment 
import System.IO (stderr,hPutStr)
import Control.Exception (assert)
import Text.Printf
import NLP.Scores (avgPrecision, mean)

import Debug.Trace


type Word = String
type POS = String
type ClustID = String
type Feat = String
type Count = Double
type SemLex = Map.Map (Word,POS) (Map.Map Feat Count)
type SemClust = Map.Map ClustID [(Feat,Count)]

parseEntry :: String -> ((Word,POS),Map.Map Feat Count)
parseEntry ln = case words ln of 
                  (wp:fs) -> 
                      let [w,p] = splitOn ':' wp
                      in ((w,map toLower p),Map.fromList . map (\f -> (f,1)) 
                                         . splitOn ','                   
                                         . unwords
                                         $ fs)

parseLexicon :: String -> SemLex
parseLexicon =   foldl' f Map.empty
               . map parseEntry
               . filter (not . null)
               . lines
    where f z (k,v) = Map.insertWith' (Map.unionWith (+)) (v == v `seq` k) v z

semClusters :: SemLex -> [((Word,ClustID,POS),Count)] -> SemClust
semClusters dict =   
    Map.map (sortBy (flip $ comparing snd)
             . Map.toList)
    . Map.fromListWith (plus) 
    . map (\((w,cid,p),c) -> 
               (cid,Map.findWithDefault Map.empty (w,p) dict `scale` c))

  
evalSem args = do
  let [details   -- be verbose
        ,lexf      -- lexicon file
        ,trainposf -- POS tagged train file
        ,trainf    -- Cluster labeled train file
        ,posf      -- POS tagged test file
        ,clustf    -- Cluster labeled test file
        ] = args
  lex <- fmap parseLexicon $ readFile lexf
  css  <- fmap readcorpus $ readFile trainf
  cpos <- fmap readcorpus $ readFile trainposf
  pss <- fmap readcorpus $ readFile posf
  xss <- fmap readcorpus $ readFile clustf
  let toks yss zss = Map.toList 
             . Map.fromListWith (+)
             . map (\k -> (k,1))
             . zipWith (\(w,p) (w',cid) -> assert (w == w') (w,cid,p)) 
                     (concat yss)
             . concat 
             $ zss
      its = filter (\((w,_,p),_) -> 
                        (take 1 p `elem` ["n","v"] && Map.member (w,p) lex))
            . toks pss
            $ xss
      cs = semClusters lex . toks cpos $ css 
      ap ((w,cid,p),c) | read details && 
                         trace (show $ Map.findWithDefault [] cid $ cs) False =
                             undefined
      ap ((w,cid,p),c) = c * (avgPrecision (Map.keysSet 
                                           . Map.findWithDefault Map.empty (w,p)
                                          $ lex)
                              . map fst
                              . Map.findWithDefault [] cid 
                             $ cs)
      aps = map ap $ its :: [Double]
  hPutStr stderr . unlines . map (\(t,a) -> printf "%-40s %2.3f" (show t) a)
              . zip its $ aps
  printf "%2.3f\n" . (/ sum (map snd its)) . sum $ aps