Commits

Eric Rochester committed 68a99b2

Added JSON output for the tokenizer.

Files changed (6)

bakers12.cabal

     , Text.Bakers12.Csv.Enumerators
     , Text.Bakers12.Csv.Types
     , Text.Bakers12.Csv.Utils
+    , Text.Bakers12.JSON.Enumerators
     , Text.Bakers12.Tokenizer
     , Text.Bakers12.Tokenizer.Csv
+    , Text.Bakers12.Tokenizer.JSON
     , Text.Bakers12.Tokenizer.Minimal
     , Text.Bakers12.Tokenizer.PennTreebank
     , Text.Bakers12.Tokenizer.Types
     , directory >=1.1 && <2
     , text >=0.10 && <1
     , enumerator >=0.4 && <0.5
+    , aeson >=0.5 && <0.6
 
   extensions: TypeSynonymInstances
 

lib/Text/Bakers12/JSON/Enumerators.hs

+{-# LANGUAGE OverloadedStrings #-}
+
+-- | This provides an enumerator interface over the aeson JSON library.
+
+module Text.Bakers12.JSON.Enumerators
+    ( toJSONE
+    ) where
+
+import           Control.Monad.Trans (lift)
+import           Data.Aeson (ToJSON(..))
+import           Data.Aeson.Encode (fromValue)
+import           Data.Enumerator hiding (map)
+import qualified Data.Enumerator.List as EL
+import qualified Data.Text as T
+import qualified Data.Text.Lazy as TL
+import qualified Data.Text.Lazy.Builder as TLB
+
+-- | This takes a stream of ToJSON values and renders them as a JSON array.
+-- To do this, it bends the usual enumeratee conventions by outputting a '['
+-- before starting to process the stream and a ']' when the stream is done.
+toJSONE :: (ToJSON a, Monad m) => Enumeratee a T.Text m b
+toJSONE = loop 0
+    where
+        loop n cont@(Continue k) = do
+            maybeItem <- EL.head
+
+            -- Ugly, ugly, ugly. There's probably something monadic I can do to
+            -- pretty it up.
+            case maybeItem of
+                Just item -> do
+                    let chunks = Chunks [prefix n, jsonize item]
+                    next <- lift $ runIteratee $ k chunks
+                    loop (n + 1) next
+                Nothing   ->
+                    case suffix n of
+                        Just close -> do
+                            next <- lift $ runIteratee $ k $ Chunks [close]
+                            loop 0 next
+                        Nothing    -> return cont
+
+        loop _ step = return step
+
+        prefix :: Int -> T.Text
+        prefix 0 = "["
+        prefix _ = ","
+
+        suffix :: Int -> Maybe T.Text
+        suffix 0 = Nothing
+        suffix _ = Just "]"
+
+        jsonize = TL.toStrict . TLB.toLazyText . fromValue . toJSON
+
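
A minimal sketch of the bracketing behavior, using only standard combinators from the enumerator package (enumList, EL.consume, run_):

    import           Data.Enumerator (($$), ($=), enumList, run_)
    import qualified Data.Enumerator.List as EL
    import qualified Data.Text as T
    import           Text.Bakers12.JSON.Enumerators (toJSONE)

    -- Three Ints go in; the Text chunks that come out concatenate to a
    -- well-formed JSON array. With no input at all, nothing is emitted.
    main :: IO ()
    main = do
        chunks <- run_ (enumList 1 [1, 2, 3 :: Int] $= toJSONE $$ EL.consume)
        putStrLn . T.unpack . T.concat $ chunks  -- prints [1,2,3]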

lib/Text/Bakers12/Tokenizer/JSON.hs

+{-# LANGUAGE OverloadedStrings #-}
+
+-- | This just defines Token and TokenType as instances of ToJSON.
+
+module Text.Bakers12.Tokenizer.JSON
+    () where
+
+import Data.Aeson (ToJSON(..), object, (.=))
+import Text.Bakers12.Tokenizer.Types (Token(..), TokenType(..))
+
+instance ToJSON Token where
+    toJSON (Token text raw len typ src offs) =
+        object [ "text"   .= text
+               , "raw"    .= raw
+               , "length" .= len
+               , "type"   .= typ
+               , "source" .= src
+               , "offset" .= offs
+               ]
+
+instance ToJSON TokenType where
+    toJSON = toJSON . show
+
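
Because these are orphan instances, downstream code imports the module with an empty import list (as the tokenizer mode does below) and aeson's encode does the rest. A minimal sketch:

    import           Data.Aeson (encode)
    import qualified Data.ByteString.Lazy.Char8 as BL
    import           Text.Bakers12.Tokenizer.JSON ()   -- instances only
    import           Text.Bakers12.Tokenizer.Types (Token)

    -- encode picks up the ToJSON Token instance above, yielding an object
    -- with text/raw/length/type/source/offset keys.
    printToken :: Token -> IO ()
    printToken = BL.putStrLn . encode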

src/Bakers12/Cli.hs

     , cmdArgs
     ) where
 
-import           Bakers12.Modes.Tokenizer (TokenFilter(..))
+import           Bakers12.Modes.Tokenizer (TokenFilter(..), OutputFormat(..))
 import qualified Data.List as L
 import           Data.Version (Version(..))
 import           Paths_bakers12 (version)
 data Modes
     = Tokenize
         { filter :: Maybe TokenFilter
+        , format :: Maybe OutputFormat
         , files  :: [FilePath]
         }
     deriving (Show, Data, Typeable)
                  &= help "The filter to use on the output tokens. This can\
                           \ be one of 'null', 'minimal', 'penn'. The default\
                           \ is 'minimal'."
+        , format = def &= name "F" &= typ "OUTPUT FORMAT"
+                 &= help "The output format to use. This can be either\
+                         \ 'csv' or 'json'. If there is no input, currently \
+                         \ the JSON formatter outputs nothing. This should \
+                         \ probably be an empty list."
         , files = def &= args &= typ "FILES/DIRS"
         } &= details ["This takes one or more files and tokenizes them."]
     ] &= summary ( "bakers12 v" ++ versionStr ++ (tagStrs $ versionTags version) ++

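Assuming cmdArgs derives its usual names from the field (format, with short name F), selecting the new output would look something like:

    bakers12 tokenize --format=json essays/
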
src/Bakers12/Modes.hs

     ) where
 
 import Bakers12.Cli hiding (files, filter)
-import Bakers12.Modes.Tokenizer (TokenFilter(..), tokenize)
+import Bakers12.Modes.Tokenizer (TokenFilter(..), OutputFormat(..), tokenize)
 import Data.Maybe (fromMaybe)
 import Prelude hiding (filter)
 
 -- | This is the dispatching function.
 execBakers12 :: Modes -> IO ()
-execBakers12 (Tokenize filter files) = tokenize filter' files
+execBakers12 (Tokenize filter format files) = tokenize filter' format' files
     where filter' = fromMaybe Minimal filter
+          format' = fromMaybe CSV format
 
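With the format field threaded through, the same dispatch can be driven from code. A sketch, assuming Bakers12.Cli exports Modes(..) and with a hypothetical input file:

    import Bakers12.Cli (Modes(..))
    import Bakers12.Modes (execBakers12)
    import Bakers12.Modes.Tokenizer (OutputFormat(..), TokenFilter(..))

    -- Runs the tokenizer with the minimal filter and JSON output; passing
    -- Nothing instead would fall back to Minimal/CSV via fromMaybe above.
    main :: IO ()
    main = execBakers12 (Tokenize (Just Minimal) (Just JSON) ["notes.txt"])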

src/Bakers12/Modes/Tokenizer.hs

 module Bakers12.Modes.Tokenizer
     ( tokenize
     , TokenFilter(..)
+    , OutputFormat(..)
     ) where
 
 import           Control.Monad.Trans (lift)
 import           System.Console.CmdArgs (Data, Typeable)
 import           System.IO (stdout)
 import           Text.Bakers12.Csv (toCSVE)
+import           Text.Bakers12.JSON.Enumerators (toJSONE)
 import           Text.Bakers12.Tokenizer (Token(..), tokenizeE)
 import           Text.Bakers12.Tokenizer.Csv ()
+import           Text.Bakers12.Tokenizer.JSON ()
 import           Text.Bakers12.Tokenizer.Minimal (minimalFilter)
 import           Text.Bakers12.Tokenizer.PennTreebank (pennFilter)
 
     | Penn
     deriving (Data, Enum, Eq, Show, Typeable)
 
+-- | These are the available output formats.
+data OutputFormat
+    = CSV
+    | JSON
+    deriving (Data, Enum, Eq, Show, Typeable)
+
 -- | This takes a list of possible file paths and tokenizes each one. It
 -- prints the tokens in the requested output format. Missing files are
 -- silently skipped and directories are expanded into all the files in that
 -- directory and subdirectories. All of this is handled with enumerators, so
 -- its memory consumption should be decent.
-tokenize :: TokenFilter -> [FilePath] -> IO ()
-tokenize tokenFilter files =
-    run_ (tokenEnum' $$ outputIter)
+tokenize :: TokenFilter -> OutputFormat -> [FilePath] -> IO ()
+tokenize tokenFilter format files =
+    run_ (input $= tokenFilter' $$ formatter =$ output)
     where
-        fileEnum   = enumLists [files] $= removeMissingFiles $= expandDirectories
-        tokenEnum  = fileEnum $= tokenizeE
-        tokenEnum' = case tokenFilter of
-                        Null    -> tokenEnum
-                        Minimal -> tokenEnum $= minimalFilter
-                        Penn    -> tokenEnum $= pennFilter
-        outputIter = toCSVE =$
-                     ET.encode ET.utf8 =$
-                     EB.iterHandle stdout
+        fileEnum     = enumLists [files] $= removeMissingFiles $= expandDirectories
+        input        = fileEnum $= tokenizeE
 
+        tokenFilter' = case tokenFilter of
+                        Null    -> EL.map id
+                        Minimal -> minimalFilter
+                        Penn    -> pennFilter
+        formatter    = case format of
+                        CSV  -> toCSVE
+                        JSON -> toJSONE
+        output       = ET.encode ET.utf8 =$ EB.iterHandle stdout
+
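
Reading the rewritten pipeline: $= pipes the token enumerator through the chosen filter, =$ fuses the formatter and UTF-8 encoder onto the stdout iteratee, and $$ connects the two halves. A call sketch with hypothetical paths:

    import Bakers12.Modes.Tokenizer (OutputFormat(..), TokenFilter(..), tokenize)

    -- Tokenize a directory tree and a single file, streaming JSON to stdout.
    main :: IO ()
    main = tokenize Minimal JSON ["essays/", "notes.txt"]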