Commits

Eric Rochester committed 6dad9b2

Added a minimal token filter that removes WS and separator characters.

  • Participants
  • Parent commits 1c8ce03
  • Branches monad-stack

Comments (0)

Files changed (5)

   exposed-modules:
       System.Bakers12.Enumerators
     , Text.Bakers12.Tokenizer
+    , Text.Bakers12.Tokenizer.Minimal
     , Text.Bakers12.Tokenizer.PennTreebank
     , Text.Bakers12.Tokenizer.Types
 

lib/Text/Bakers12/Tokenizer/Minimal.hs

+
+-- |
+-- Module: Text.Bakers12.Tokenizer.Minimal
+-- License: Apache 2.0
+-- Maintainer: erochest@gmail.com
+-- Portability: GHC
+--
+-- This is a minimal tokenizer. It just removes whitespace and separators from
+-- the token stream.
+
+module Text.Bakers12.Tokenizer.Minimal
+    ( Token(..)
+    , TokenType(..)
+    , tokenize
+    , tokenizeFile
+    , tokenizeFileStream
+    , minimalFilter
+    ) where
+
+import           Control.Exception (SomeException)
+import qualified Data.Enumerator as E
+import qualified Data.Enumerator.List as EL
+import qualified Data.Text as T
+import qualified Text.Bakers12.Tokenizer as B12
+import           Text.Bakers12.Tokenizer.Types hiding (append, concat)
+import qualified Text.Bakers12.Tokenizer.Types as Tkn
+
+-- | This reads text from an instance of Data.Text.Text and returns a list of
+-- Token instances.
+tokenize :: FilePath -> T.Text -> Either SomeException [Token]
+tokenize source input =
+    E.runLists [[input]] process
+    where process = B12.tokenizeStream source 0 E.=$ minimalFilter E.=$ EL.consume
+
+-- | This reads the input from a file and returns a list of Token instances.
+tokenizeFile :: FilePath -> IO (Either SomeException [Token])
+tokenizeFile inputFile =
+    E.run (B12.tokenizeFileStream inputFile E.$= minimalFilter E.$$ EL.consume)
+
+-- | This creates an Enumerator that reads from a file and produces Tokens.
+--
+-- This assumes the files are UTF-8.
+tokenizeFileStream :: FilePath -> E.Enumerator Token IO b
+tokenizeFileStream inputFile =
+    B12.tokenizeFileStream inputFile E.$= minimalFilter
+
+-- | This is an enumeratee that filters a token stream created by
+-- Text.Bakers12.Tokenizer and basically just removes the whitespace and
+-- separator characters from it.
+minimalFilter :: Monad m => E.Enumeratee Token Token m b
+minimalFilter = EL.filter ((SeparatorToken /=) . tokenType)
+

tests/Test/Bakers12/System/Enumerators.hs

     where input    = ["tests"]
           expected = [ "tests/TestBakers12.hs"
                      , "tests/Test/Bakers12/Tokenizer.hs"
+                     , "tests/Test/Bakers12/System/Enumerators.hs"
+                     , "tests/Test/Bakers12/Tokenizer/Minimal.hs"
                      , "tests/Test/Bakers12/Tokenizer/PennTreebank.hs"
-                     , "tests/Test/Bakers12/System/Enumerators.hs"
                      ]
 
 assertExpDirExpandMixed :: Assertion
     where input    = ["tests"]
           expected = [ "tests/TestBakers12.hs"
                      , "tests/Test/Bakers12/Tokenizer.hs"
+                     , "tests/Test/Bakers12/System/Enumerators.hs"
+                     , "tests/Test/Bakers12/Tokenizer/Minimal.hs"
                      , "tests/Test/Bakers12/Tokenizer/PennTreebank.hs"
-                     , "tests/Test/Bakers12/System/Enumerators.hs"
                      ]
 
 

tests/Test/Bakers12/Tokenizer/Minimal.hs

+
+module Test.Bakers12.Tokenizer.Minimal
+    ( minimalFilterTests
+    ) where
+
+import qualified Data.List as L
+import qualified Data.Text as T
+import           Test.Framework (Test, testGroup)
+import           Test.Framework.Providers.HUnit (testCase)
+import           Test.HUnit (Assertion, assertBool)
+import           Text.Bakers12.Tokenizer.Minimal (Token(..), TokenType(..), tokenize)
+
+tokenize' :: FilePath -> T.Text -> [Token]
+tokenize' source input =
+    case tokenize source input of
+        Right tokens -> tokens
+        Left  err    -> []
+
+assertTokenizes :: String -> String -> [String] -> Assertion
+assertTokenizes msg input expected = assertBool msg' $ expected == output
+    where
+        msg'   = msg ++ ": " ++ show output
+        output = map (T.unpack . tokenText) . tokenize' msg $ T.pack input
+
+assertAll :: String -> String -> (Token -> Bool) -> Assertion
+assertAll msg input p = assertBool msg' all
+    where
+        msg'   = msg ++ ": " ++ show output
+        output = tokenize' msg $ T.pack input
+        all    = L.all p output
+
+
+assertRemovesWhitespace :: Assertion
+assertRemovesWhitespace = assertAll "assertRemovesWhitespace" input pred
+    where
+        input = "some text here. i really don't care what."
+        pred  = (SeparatorToken /=) . tokenType
+
+
+minimalFilterTests :: [Test]
+minimalFilterTests =
+    [ testGroup "whitespace" [ testCase "missing" assertRemovesWhitespace
+                             ]
+    ]
+

tests/TestBakers12.hs

 
 import Test.Bakers12.System.Enumerators
 import Test.Bakers12.Tokenizer
+import Test.Bakers12.Tokenizer.Minimal
 import Test.Bakers12.Tokenizer.PennTreebank
 import Test.Framework (Test, defaultMain, testGroup)
 
 tests :: [Test]
 tests =
     [ testGroup "tokenizer" tokenizerTests
+    , testGroup "minimal filter" minimalFilterTests
     , testGroup "penn treebank" pennTreebankTests
     , testGroup "system.enumerators" systemEnumeratorTests
     ]