Commits

creswick  committed 20a5370

added text version of tokenize, moved str version to a new module, added benchmarking suite to compare performance

  • Parent commits 233a768

Files changed (5)

File tokenize/src/NLP/Tokenize.hs

+{-# LANGUAGE OverloadedStrings #-}
+-- | NLP Tokenizer
+module NLP.Tokenize
+    ( module NLP.TokenizeStr
+    )
+where
+
+import NLP.TokenizeStr

File tokenize/src/NLP/TokenizeStr.hs

+module NLP.TokenizeStr 
+    ( EitherList(..)
+    , Tokenizer
+    , tokenize
+    , run
+    , defaultTokenizer
+    , whitespace
+    , uris
+    , punctuation
+    , finalPunctuation
+    , initialPunctuation
+    , contractions
+    , negatives
+    )
+where
+
+import qualified Data.Char as Char
+import Data.List
+import Data.Maybe
+import Control.Monad.Instances
+import Data.List.Split
+import Control.Monad
+
+-- | A Tokenizer is a function which takes a String and returns a list of
+--  Eithers (wrapped in a newtype). Right Strings are passed on for further
+--  processing by tokenizers down the pipeline, while Left Strings pass
+--  through the pipeline unchanged. Use a Left String in a tokenizer to
+--  protect certain tokens from further processing (e.g. see the 'uris'
+--  tokenizer).
+type Tokenizer = String -> EitherList String String
+
+-- | The EitherList is a newtype-wrapped list of Eithers.
+newtype EitherList a b =  E { unE :: [Either a b] }
+
+-- | Split string into words using the default tokenizer pipeline 
+tokenize :: String -> [String]
+tokenize  = run defaultTokenizer
+
+-- | Run a tokenizer
+run :: Tokenizer -> (String -> [String])
+run f = map unwrap . unE . f
+
+defaultTokenizer :: Tokenizer
+defaultTokenizer =     whitespace 
+                   >=> uris 
+                   >=> punctuation 
+                   >=> contractions 
+                   >=> negatives 
+
+-- | Detect common uris and freeze them
+uris :: Tokenizer
+uris x | isUri x   = E [Left x]
+       | otherwise = E [Right x]
+    where isUri u = any (`isPrefixOf` u) ["http://","ftp://","mailto:"]
+
+-- | Split off initial and final punctuation
+punctuation :: Tokenizer 
+punctuation = finalPunctuation >=> initialPunctuation
+
+-- | Split off word-final punctuation
+finalPunctuation :: Tokenizer
+finalPunctuation x = E . filter (not . null . unwrap) $
+    case span Char.isPunctuation . reverse $ x of
+      ([],w) -> [Right . reverse $ w]
+      (ps,w) -> [Right . reverse $ w, Right . reverse $ ps]
+
+-- | Split off word-initial punctuation
+initialPunctuation :: Tokenizer
+initialPunctuation x = E . filter (not . null . unwrap) $
+    case span Char.isPunctuation $ x of
+      ([],w) -> [Right w]
+      (ps,w) -> [Right ps, Right w]
+
+-- | Split words ending in n't, and freeze n't 
+negatives :: Tokenizer
+negatives x | "n't" `isSuffixOf` x = E [ Right . reverse . drop 3 . reverse $ x
+                                       , Left "n't" ]
+            | otherwise            = E [Right x]
+
+-- | Split common contractions off and freeze them.
+--  Currently deals with: 'm, 's, 'd, 've, 'll
+contractions :: Tokenizer
+contractions x = case catMaybes . map (splitSuffix x) $ cts of
+                   [] -> return x
+                   ((w,s):_) -> E [ Right w,Left s]
+    where cts = ["'m","'s","'d","'ve","'ll"]
+          splitSuffix w sfx = 
+              let w' = reverse w
+                  len = length sfx
+              in if sfx `isSuffixOf` w 
+                 then Just (take (length w - len) w, reverse . take len $ w')
+                 else Nothing
+
+
+-- | Split string on whitespace. This is just a wrapper for Data.List.words
+whitespace :: Tokenizer
+whitespace xs = E [Right w | w <- words xs ]
+
+instance Monad (EitherList a) where
+    return x = E [Right x]
+    E xs >>= f = E $ concatMap (either (return . Left) (unE . f)) xs
+
+unwrap :: Either a a -> a
+unwrap (Left x) = x
+unwrap (Right x) = x
+
+examples :: [String]
+examples =
+    ["This shouldn't happen."
+    ,"Some 'quoted' stuff"
+    ,"This is a URL: http://example.org."
+    ,"How about an email@example.com"
+    ,"ReferenceError #1065 broke my debugger!"
+    ,"I would've gone."
+    ,"They've been there."
+    ]
+
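The EitherList monad is what makes "freezing" work: its bind applies the next tokenizer only to Right values, so anything a stage wraps in Left survives the rest of the pipeline untouched. A small sketch (GHCi, not part of the commit) using a custom pipeline:

    ghci> import NLP.TokenizeStr
    ghci> import Control.Monad ((>=>))
    ghci> run (whitespace >=> uris >=> punctuation) "See http://example.org."
    ["See","http://example.org."]

Because 'uris' returns the URL as a Left, 'finalPunctuation' never strips its trailing period.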

File tokenize/src/NLP/TokenizeText.hs

+{-# LANGUAGE OverloadedStrings #-}
+-- | NLP Tokenizer, adapted to use Text instead of Strings from the
+-- `tokenize` package:
+--  * http://hackage.haskell.org/package/tokenize-0.1.3
+module NLP.TokenizeText
+    ( EitherList(..)
+    , Tokenizer
+    , tokenize
+    , run
+    , defaultTokenizer
+    , whitespace
+    , uris
+    , punctuation
+    , finalPunctuation
+    , initialPunctuation
+    , contractions
+    , negatives
+    )
+where
+
+import qualified Data.Char as Char
+import Data.Maybe
+import Control.Monad.Instances ()
+import Control.Monad
+
+import Data.Text (Text)
+import qualified Data.Text as T
+
+-- | A Tokenizer is a function which takes a Text and returns a list of
+--  Eithers (wrapped in a newtype). Right Texts are passed on for further
+--  processing by tokenizers down the pipeline, while Left Texts pass
+--  through the pipeline unchanged. Use a Left Text in a tokenizer to
+--  protect certain tokens from further processing (e.g. see the 'uris'
+--  tokenizer).
+type Tokenizer = Text -> EitherList Text Text
+
+-- | The EitherList is a newtype-wrapped list of Eithers.
+newtype EitherList a b =  E { unE :: [Either a b] }
+
+-- | Split string into words using the default tokenizer pipeline 
+tokenize :: Text -> [Text]
+tokenize = run defaultTokenizer
+
+-- | Run a tokenizer. 'T.copy' is applied to each token so that the
+-- results do not retain the whole source 'Text' through shared slices.
+run :: Tokenizer -> (Text -> [Text])
+run f = map T.copy . map unwrap . unE . f
+
+defaultTokenizer :: Tokenizer
+defaultTokenizer =     whitespace 
+                   >=> uris 
+                   >=> punctuation 
+                   >=> contractions 
+                   >=> negatives 
+
+-- | Detect common uris and freeze them
+uris :: Tokenizer
+uris x | isUri x   = E [Left x]
+       | otherwise = E [Right x]
+    where isUri u = any (`T.isPrefixOf` u) ["http://","ftp://","mailto:"]
+
+-- | Split off initial and final punctuation
+punctuation :: Tokenizer 
+punctuation = finalPunctuation >=> initialPunctuation
+
+-- | Split on hyphens. Not exported, and not part of 'defaultTokenizer'.
+hyphens :: Tokenizer
+hyphens xs = E [Right w | w <- T.split (=='-') xs ]
+
+-- | Split off word-final punctuation
+finalPunctuation :: Tokenizer
+finalPunctuation x = E $ filter (not . T.null . unwrap) res
+  where
+    res :: [Either Text Text]
+    res = case T.span Char.isPunctuation (T.reverse x) of
+      (ps, w) | T.null ps -> [ Right $ T.reverse w ]
+              | otherwise -> [ Right $ T.reverse w
+                             , Right $ T.reverse ps]
+
+-- | Split off word-initial punctuation
+initialPunctuation :: Tokenizer
+initialPunctuation x = E $ filter (not . T.null . unwrap) $
+    case T.span Char.isPunctuation x of
+      (ps,w) | T.null ps -> [ Right w ]
+             | otherwise -> [ Right ps
+                            , Right w ]
+
+-- | Split words ending in n't, and freeze n't 
+negatives :: Tokenizer
+negatives x | "n't" `T.isSuffixOf` x = E [ Right . T.reverse . T.drop 3 . T.reverse $ x
+                                         , Left "n't" ]
+            | otherwise              = E [ Right x ]
+
+-- | Split common contractions off and freeze them.
+--  Currently deals with: 'm, 's, 'd, 've, 'll
+contractions :: Tokenizer
+contractions x = case catMaybes . map (splitSuffix x) $ cts of
+                   [] -> return x
+                   ((w,s):_) -> E [ Right w,Left s]
+    where cts = ["'m","'s","'d","'ve","'ll"]
+          splitSuffix w sfx = 
+              let w' = T.reverse w
+                  len = T.length sfx
+              in if sfx `T.isSuffixOf` w 
+                 then Just (T.take (T.length w - len) w, T.reverse . T.take len $ w')
+                 else Nothing
+
+
+-- | Split string on whitespace. This is just a wrapper around 'T.words'.
+whitespace :: Tokenizer
+whitespace xs = E [Right w | w <- T.words xs ]
+
+instance Monad (EitherList a) where
+    return x = E [Right x]
+    E xs >>= f = E $ concatMap (either (return . Left) (unE . f)) xs
+
+unwrap :: Either a a -> a
+unwrap (Left x) = x
+unwrap (Right x) = x
+
+examples :: [Text]
+examples = 
+    ["This shouldn't happen."
+    ,"Some 'quoted' stuff"
+    ,"This is a URL: http://example.org."
+    ,"How about an email@example.com"
+    ,"ReferenceError #1065 broke my debugger!"
+    ,"I would've gone."
+    ,"They've been there."
+    ]
+
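The Text pipeline mirrors the String one; with OverloadedStrings enabled the same calls work on Text values (again a GHCi sketch, not part of the commit):

    ghci> :set -XOverloadedStrings
    ghci> import NLP.TokenizeText
    ghci> tokenize "They've been there."
    ["They","'ve","been","there","."]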

File tokenize/tests/src/Bench.hs

+{-# LANGUAGE OverloadedStrings #-}
+{-# LANGUAGE PackageImports #-}
+module Bench where
+
+import qualified Data.ByteString as BS
+import Data.Text (Text)
+import qualified Data.Text as T
+import qualified Data.Text.IO as T
+import qualified Data.Text.Encoding as TE
+
+import Control.DeepSeq
+import Criterion.Main
+import Criterion.Config (defaultConfig, Config(..), ljust)
+import Criterion (bench, bgroup, Benchmark)
+
+import System.Environment (getArgs, withArgs)
+
+import qualified NLP.TokenizeStr as StrTok
+import qualified NLP.TokenizeText as TextTok
+
+
+myConfig :: Config
+myConfig = defaultConfig {
+              -- Always GC between runs.
+              cfgPerformGC = ljust True
+            }
+
+-- | Read a corpus file, decoding it as Latin-1 so that arbitrary bytes
+-- cannot cause a Unicode decoding failure.
+readF :: FilePath -> IO Text
+readF file = do
+  bs <- BS.readFile file
+  return $ TE.decodeLatin1 bs
+
+main :: IO ()
+main = do
+  args <- getArgs
+  case args of
+    [] -> putStrLn "Usage: bench <input_text_file>"
+    (f:rest) -> do
+      plugTxt <- mapM readF [f]
+      let plugStr = map T.unpack plugTxt
+      deepseq plugStr $ withArgs rest $ defaultMainWith myConfig (return ())
+            [ bgroup "tokenizing"
+              [ bench "Native String Tokenizer" $ nf (map StrTok.tokenize) plugStr
+              , bench "Native Text Tokenizer" $ nf (map TextTok.tokenize) plugTxt
+              , bench "Text->Text based on String Tokenizer" $ nf (map strTokenizer) plugTxt
+              , bench "String->String based on Text Tokenizer" $ nf (map txtTokenizer) plugStr
+              ]
+            ]
+
+strTokenizer :: Text -> [Text]
+strTokenizer txt = map T.pack (StrTok.tokenize $ T.unpack txt)
+
+txtTokenizer :: String -> [String]
+txtTokenizer str = map T.unpack (TextTok.tokenize $ T.pack str)

File tokenize/tokenize.cabal

 -- Extra-source-files:  
 
 -- Constraint on the version of Cabal needed to build this package.
-Cabal-version:       >=1.2
+Cabal-version:       >=1.10
 
 
 Library
+ hs-source-dirs:      src
+ default-language:    Haskell2010
   -- Modules exported by the library.
  Exposed-modules:     NLP.Tokenize
-  
+                      NLP.TokenizeStr
+                      NLP.TokenizeText
+
   -- Packages needed in order to build this package.
- Build-depends: base >= 3 && < 5, split >= 0.1      
-  
- 
+ Build-depends: base >= 3 && < 5,
+                split >= 0.1,
+                text
+
+Executable bench
+   default-language: Haskell2010
+   Main-Is:          Bench.hs
+   hs-source-dirs:   tests/src
+
+   Build-depends:    tokenize,
+                     criterion,
+                     filepath >= 1.3.0.1,
+                     text >= 0.11.3.0,
+                     base       >= 4 && <= 6,
+                     deepseq,
+                     split >= 0.1.2.3,
+                     bytestring
+
+   ghc-options:      -Wall -main-is Bench