Commits

Eric Rochester committed 8785cea

Added command-line option to specify the tokenizer filter.

  • Parent commits 6dad9b2
  • Branches monad-stack

Files changed (3)

File src/Bakers12/Cli.hs

     , cmdArgs
     ) where
 
+import           Bakers12.Modes.Tokenizer (TokenFilter(..))
 import qualified Data.List as L
 import           Data.Version (Version(..))
 import           Paths_bakers12 (version)
+import           Prelude hiding (filter)
 import           System.Console.CmdArgs
 
 -- | The main type that defines the command-line options.
 data Modes
     = Tokenize
-        { files :: [FilePath]
+        { filter :: Maybe TokenFilter
+        , files  :: [FilePath]
         }
     deriving (Show, Data, Typeable)
 
 bakers12Modes :: Modes
 bakers12Modes = modes
     [ Tokenize
-        { files = def &= args &= typ "FILES/DIRS"
+        { filter = def &= name "f" &= typ "TOKEN FILTER"
+                 &= help "The filter to use on the output tokens. This can\
+                          \ be one of 'null', 'minimal', 'penn'. The default\
+                          \ is 'minimal'."
+        , files = def &= args &= typ "FILES/DIRS"
         } &= details ["This takes one or more files and tokenizes them."]
     ] &= summary ( "bakers12 v" ++ versionStr ++ (tagStrs $ versionTags version) ++
                    ", (c) Eric Rochester 2011, 2012" )

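With this change the tokenize mode gains a filter flag. Assuming the usual cmdArgs behavior of deriving flag names from the record field and the name "f" annotation (and that the executable is invoked as bakers12), the option would be used roughly like this; the exact spelling is an assumption, not part of the commit:

    bakers12 tokenize --filter=penn corpus/
    bakers12 tokenize -f minimal essays/*.txt

Omitting the flag leaves the field as Nothing, which the dispatcher in Modes.hs defaults to the minimal filter.
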
File src/Bakers12/Modes.hs

     ) where
 
 import Bakers12.Cli hiding (files)
-import Bakers12.Modes.Tokenizer (tokenize)
+import Bakers12.Modes.Tokenizer (TokenFilter(..), tokenize)
+import Data.Maybe (fromMaybe)
 
 -- | This is the dispatching function.
 execBakers12 :: Modes -> IO ()
-execBakers12 (Tokenize files) = tokenize files
+execBakers12 (Tokenize filter files) = tokenize filter' files
+    where filter' = fromMaybe Minimal filter
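
A quick sketch of the defaulting behavior with hypothetical arguments (GHCi-style, not part of the commit):

    execBakers12 (Tokenize Nothing ["essays/"])      -- runs tokenize Minimal ["essays/"]
    execBakers12 (Tokenize (Just Penn) ["essays/"])  -- runs tokenize Penn ["essays/"]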
 

File src/Bakers12/Modes/Tokenizer.hs

+{-# LANGUAGE DeriveDataTypeable #-}
 
 -- | This is the controller for the `tokenizer` mode. It runs the tokenizer and
 -- prints out the output.
 
 module Bakers12.Modes.Tokenizer
     ( tokenize
+    , TokenFilter(..)
     ) where
 
 import           Control.Monad.Trans (lift)
 import qualified Data.Text.Lazy.Builder as TB
 import qualified Data.Text.Lazy.Builder.Int as TBI
 import           System.Bakers12.Enumerators (removeMissingFiles, expandDirectories)
+import           System.Console.CmdArgs (Data, Typeable)
 import           System.IO (stdout)
 import           Text.Bakers12.Tokenizer (Token(..), tokenizeE)
+import           Text.Bakers12.Tokenizer.Minimal (minimalFilter)
 import           Text.Bakers12.Tokenizer.PennTreebank (pennFilter)
 
+
+-- | This is an enumeration of the types of token filters provided by other
+-- modules.
+data TokenFilter
+    = Null
+    | Minimal
+    | Penn
+    deriving (Data, Enum, Eq, Show, Typeable)
+
 -- | This takes a list of possible file paths and tokenizes each one. It prints
 -- the tokens out as CSV. Missing files are silently skipped and directories
 -- are expanded into all the files in that directory and subdirectories. All of
 -- this is handled with Enumerators, so its memory consumption should be
 -- decent.
-tokenize :: [FilePath] -> IO ()
-tokenize files =
-    run_ (enumLists [files] $= removeMissingFiles $= expandDirectories $=
-          tokenizeE $= pennFilter $$
-          tokenToCsv =$
-          ET.encode ET.utf8 =$
-          EB.iterHandle stdout)
+tokenize :: TokenFilter -> [FilePath] -> IO ()
+tokenize tokenFilter files =
+    run_ (tokenEnum' $$ outputIter)
+    where
+        fileEnum   = enumLists [files] $= removeMissingFiles $= expandDirectories
+        tokenEnum  = fileEnum $= tokenizeE
+        tokenEnum' = case tokenFilter of
+                        Null    -> tokenEnum
+                        Minimal -> tokenEnum $= minimalFilter
+                        Penn    -> tokenEnum $= pennFilter
+        outputIter = tokenToCsv =$
+                     ET.encode ET.utf8 =$
+                     EB.iterHandle stdout
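
As a rough usage sketch of the new signature (the file paths are hypothetical, not part of the commit):

    tokenize Minimal ["essays/ch01.txt"]  -- minimal filtering, CSV written to stdout
    tokenize Penn    ["corpus/"]          -- directory expanded, Penn Treebank filtering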
 
 tokenToCsv :: Monad m => Enumeratee Token T.Text m b
 tokenToCsv cont@(Continue k) = do