Commits

Grzegorz Chrupała committed 1a768d2

Added NLP.Scores.histogram plus some docu.

Comments (0)

Files changed (2)

nlp-scores/NLP/Scores.hs

-{-# LANGUAGE BangPatterns #-}
+{-# LANGUAGE 
+    BangPatterns 
+  , NoMonomorphismRestriction
+ #-}
 -- | Scoring functions commonly used for evaluation of NLP
 -- systems. Most functions in this module work on sequences which are
 -- instances of 'Data.Foldable', but some take a precomputed table of
     , mean
     , jaccard
     , entropy
+    , histogram
       -- * Extracting joint and marginal counts from 'Counts'
     , countJoint
     , countFst
   fromIntegral (Set.size (Set.union a b))
 {-# SPECIALIZE jaccard :: (Ord a) => Set.Set a -> Set.Set a -> Double #-}  
 
--- | Entropy: H(X) = -SUM_i P(X=i) log_2(P(X=i))
+-- | Entropy: H(X) = -SUM_i P(X=i) log_2(P(X=i)). @entropy xs@ is the
+-- entropy of the random variable represented by the sequence @xs@,
+-- where each element of @xs@ is the count of one particular
+-- value the random variable can take. If you need to compute the
+-- entropy from a sequence of outcomes, the following will work:
+--
+-- > entropy . elems . histogram
+--
 entropy :: (Floating c, F.Foldable t) => t c -> c
 entropy cx = negate . getSum . F.foldMap  (Sum . f)  $ cx
     where n    = sum cx
           logn = logBase 2 n
           f nx = nx / n * (logBase 2 nx - logn)
 
+-- | @histogram xs@ returns the map of the frequency counts of the
+-- elements in sequence @xs@.
+histogram :: (Num a, Ord k, F.Foldable t) => t k -> Map.Map k a
+histogram = F.foldl' (\ z k -> Map.insertWith' (+) k 1 z) Map.empty
+
 -- | Creates count table 'Counts'
 counts :: (Ord a, Ord b, F.Foldable t) => t (a, b) -> Counts a b
 counts xys = F.foldl' f empty xys

nlp-scores/nlp-scores.cabal

 -- The package version. See the Haskell package versioning policy
 -- (http://www.haskell.org/haskellwiki/Package_versioning_policy) for
 -- standards guiding when and how versions should be incremented.
-Version:             0.4.3
+Version:             0.4.4
 
 -- A short (one-line) description of the package.
 Synopsis:            Scoring functions commonly used for evaluation in NLP and IR