Commits

Eric Rochester committed 904a3f8

Moved CSV output functions to Text.Bakers12.Csv.

  • Parent commits e976050
  • Branches monad-stack


Files changed (7)

File bakers12.cabal

 
   exposed-modules:
       System.Bakers12.Enumerators
+    , Text.Bakers12.Csv
+    , Text.Bakers12.Csv.Enumerators
+    , Text.Bakers12.Csv.Types
+    , Text.Bakers12.Csv.Utils
     , Text.Bakers12.Tokenizer
+    , Text.Bakers12.Tokenizer.Csv
     , Text.Bakers12.Tokenizer.Minimal
     , Text.Bakers12.Tokenizer.PennTreebank
     , Text.Bakers12.Tokenizer.Types
   if flag(development)
    cpp-options: -DDEVELOPMENT
     -- build-depends: hint >= 0.3.2 && < 0.4
-    ghc-options: -DDEVELOPMENT -threaded -w
+    ghc-options: -DDEVELOPMENT -threaded -w -Wall
   else
     if flag(profiling)
      cpp-options: -DDEVELOPMENT

File lib/Text/Bakers12/Csv.hs

+
+-- | This allows you to generate CSV output from a number of data types. The
+-- conversion functions build the output with Data.Text.Lazy.Builder and
+-- return strict Data.Text values.
+
+module Text.Bakers12.Csv
+    ( CSVRow
+    , ToCSV(..)
+    , toCSVE
+    , escape
+    ) where
+
+import           Text.Bakers12.Csv.Enumerators
+import           Text.Bakers12.Csv.Types (CSVRow, ToCSV(..))
+import           Text.Bakers12.Csv.Utils
+
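
A quick sketch of how the new facade module can be used (the Main below is
hypothetical and not part of this commit): importing Text.Bakers12.Csv alone
brings in the ToCSV class, the CSVRow instance, and the escaping helpers.

    {-# LANGUAGE OverloadedStrings #-}

    import qualified Data.Text.IO as TIO
    import           Text.Bakers12.Csv

    main :: IO ()
    main = TIO.putStr $ toCSVText (["id", "text, with a comma"] :: CSVRow)
    -- prints: id,"text, with a comma"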

File lib/Text/Bakers12/Csv/Enumerators.hs

+
+-- | This defines enumerators that take a stream of ToCSV instances and
+-- output a stream of Text containing the CSV.
+
+module Text.Bakers12.Csv.Enumerators
+    ( toCSVE
+    ) where
+
+import           Control.Monad.Trans (lift)
+import           Data.Enumerator hiding (map)
+import qualified Data.Enumerator.List as EL
+import qualified Data.Text as T
+import           Text.Bakers12.Csv.Types
+
+-- | This takes a stream of ToCSV instances and outputs the CSV, one Text row
+-- at a time.
+toCSVE :: (ToCSV a, Monad m) => Enumeratee a T.Text m b
+toCSVE cont@(Continue k) = do
+    maybeItem <- EL.head
+    case maybeItem of
+        Just item -> do
+            next <- lift $ runIteratee $ k $ Chunks [toCSVText item]
+            toCSVE next
+        Nothing -> return cont
+toCSVE step = return step
+
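
A minimal sketch of driving toCSVE outside the bakers12 pipeline, assuming the
enumerator package's enumList, run_, ($$), ($=), and Data.Enumerator.List.consume
(the Main below is only for illustration):

    {-# LANGUAGE OverloadedStrings #-}

    import           Data.Enumerator (enumList, run_, ($$), ($=))
    import qualified Data.Enumerator.List as EL
    import qualified Data.Text.IO as TIO
    import           Text.Bakers12.Csv (CSVRow, toCSVE)

    main :: IO ()
    main = do
        let rows = [ ["a", "b,c"], ["d\"e", "f"] ] :: [CSVRow]
        -- feed the rows through toCSVE and collect one Text chunk per row
        csv <- run_ ((enumList 1 rows $= toCSVE) $$ EL.consume)
        mapM_ TIO.putStr csv
    -- prints: a,"b,c"
    --         "d""e",f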

File lib/Text/Bakers12/Csv/Types.hs

+{-# LANGUAGE TypeSynonymInstances, FlexibleInstances #-}
+
+-- | This handles the types required for Text.Bakers12.Csv conversion.
+
+module Text.Bakers12.Csv.Types
+    ( CSVRow
+    , ToCSV(..)
+    ) where
+
+import qualified Data.Text as T
+import           Text.Bakers12.Csv.Utils
+
+-- | This is the type of a row of CSV data.
+type CSVRow = [T.Text]
+
+-- | A type that can be converted to CSV.
+class ToCSV a where
+
+    -- | This converts the value into a row of CSV fields.
+    toCSV :: a -> CSVRow
+
+    -- | This converts the value to a Text row. It has a default
+    -- implementation, but you can speed things up by defining it yourself
+    -- (and not escaping every field, for instance).
+    toCSVText :: a -> T.Text
+    toCSVText item = buildRow . map escape $ toCSV item
+
+instance ToCSV CSVRow where
+    toCSV = id
+
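
To add CSV output for a new type, only toCSV needs to be defined; toCSVText
falls back to the escaping default. A sketch with a hypothetical record (not
part of this commit):

    import qualified Data.Text as T
    import           Text.Bakers12.Csv.Types

    -- a hypothetical record type
    data Person = Person
        { personName :: T.Text
        , personAge  :: Int
        }

    instance ToCSV Person where
        -- toCSVText is inherited from the default and escapes both fields
        toCSV (Person name age) = [ name, T.pack (show age) ]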

File lib/Text/Bakers12/Csv/Utils.hs

+
+-- | Utilities for working with CSV data.
+
+module Text.Bakers12.Csv.Utils
+    ( escape
+    , buildRow
+    ) where
+
+import qualified Data.Char as C
+import           Data.Monoid (mappend)
+import qualified Data.List as L
+import qualified Data.Text as T
+import qualified Data.Text.Lazy as TL
+import qualified Data.Text.Lazy.Builder as TB
+
+-- | This escapes a single field: any value containing a non-alphanumeric
+-- character is wrapped in double quotes, and embedded quotes are doubled.
+escape :: T.Text -> TB.Builder
+escape input = if T.any (not . C.isAlphaNum) input
+               then quote `mappend` (TB.fromText escaped `mappend` quote)
+               else TB.fromText input
+    where
+        quote   = TB.singleton '"'
+        escaped = T.replace (T.singleton '"') (T.pack "\"\"") input
+
+-- | This takes a list of Builders, one for each field, and assembles them into
+-- a CSV row by interspersing commas and appending a newline.
+buildRow :: [TB.Builder] -> T.Text
+buildRow row = TL.toStrict $ TB.toLazyText line
+    where 
+        comma = TB.singleton ','
+        nl    = TB.singleton '\n'
+        line  = foldr mappend nl . L.intersperse comma $ row
+
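
For reference, a small check of what escape and buildRow produce on a sample
row (the Main below is hypothetical):

    {-# LANGUAGE OverloadedStrings #-}

    import qualified Data.Text.IO as TIO
    import           Text.Bakers12.Csv.Utils (buildRow, escape)

    main :: IO ()
    main = TIO.putStr $ buildRow (map escape ["plain", "has,comma", "has\"quote"])
    -- prints: plain,"has,comma","has""quote"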

File lib/Text/Bakers12/Tokenizer/Csv.hs

+
+-- | This provides the ToCSV instance that turns a Token into a CSV row.
+
+module Text.Bakers12.Tokenizer.Csv
+    (
+    ) where
+
+import qualified Data.Text as T
+import qualified Data.Text.Lazy.Builder as TB
+import qualified Data.Text.Lazy.Builder.Int as TBI
+import           Text.Bakers12.Csv.Types
+import           Text.Bakers12.Csv.Utils
+import           Text.Bakers12.Tokenizer.Types
+
+instance ToCSV Token where
+    toCSV (Token text raw len typ src offs) =
+        [ text
+        , raw
+        , showText len
+        , showText typ
+        , T.pack src
+        , showText offs
+        ]
+        where
+            showText :: Show a => a -> T.Text
+            showText = T.pack . show
+
+    toCSVText (Token text raw len typ src offs) = buildRow row
+        where
+            row = [ escape text
+                  , escape raw
+                  , TBI.decimal len
+                  , fromShow typ
+                  , TB.fromString src
+                  , TBI.decimal offs
+                  ] 
+            fromShow = TB.fromString . show

File src/Bakers12/Modes/Tokenizer.hs

 import           System.Bakers12.Enumerators (removeMissingFiles, expandDirectories)
 import           System.Console.CmdArgs (Data, Typeable)
 import           System.IO (stdout)
+import           Text.Bakers12.Csv (toCSVE)
 import           Text.Bakers12.Tokenizer (Token(..), tokenizeE)
+import           Text.Bakers12.Tokenizer.Csv ()
 import           Text.Bakers12.Tokenizer.Minimal (minimalFilter)
 import           Text.Bakers12.Tokenizer.PennTreebank (pennFilter)
 
                         Null    -> tokenEnum
                         Minimal -> tokenEnum $= minimalFilter
                         Penn    -> tokenEnum $= pennFilter
-        outputIter = tokenToCsv =$
+        outputIter = toCSVE =$
                      ET.encode ET.utf8 =$
                      EB.iterHandle stdout
 
-tokenToCsv :: Monad m => Enumeratee Token T.Text m b
-tokenToCsv cont@(Continue k) = do
-    maybeT <- EL.head
-    case maybeT of
-        Just token -> do
-            next <- lift $ runIteratee $ k $ Chunks [showToken token]
-            tokenToCsv next
-        Nothing -> return cont
-tokenToCsv step = return step
-
-showToken :: Token -> T.Text
-showToken (Token tText tRaw tLen tType tSource tOffset) =
-    TL.toStrict $ TB.toLazyText line
-    where
-        comma  = TB.singleton ','
-        nl     = TB.singleton '\n'
-
-        text   = escape tText
-        raw    = escape tRaw
-        len    = TBI.decimal tLen
-        typ    = TB.fromString . show $ tType
-        src    = TB.fromString tSource
-        offs   = TBI.decimal tOffset
-
-        fields = [ text, raw, len, typ, src ]
-
-        push field builder = field `mappend` (comma `mappend` builder)
-        line = foldr push (offs `mappend` nl) fields
-
-escape :: T.Text -> TB.Builder
-escape input = if T.any (not . C.isAlphaNum) input
-               then quote `mappend` (TB.fromText escaped `mappend` quote)
-               else TB.fromText input
-    where
-        quote   = TB.singleton '"'
-        escaped = T.replace (T.singleton '"') (T.pack "\"\"") input
-