{-# LANGUAGE OverloadedStrings #-}

{-
Usage:

  tokenize json endpoint mincount num
  tokenize endpoint mincount num

Tokenize the tweet text. The output is written to the screen in
either JSON or a simple text format.

  mincount is used to filter the analysis of all tweets
  num is used to filter the analysis of the original tweets and retweets

We filter all the tweets by a minimum count, but for the other two
categories we just take the top n values.

-}
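
{-
For example (the endpoint URL is illustrative, not a real service):

  tokenize json http://localhost:8000/sparql/ 5 20

queries the endpoint and writes out, as JSON, the words appearing at
least 5 times across all tweets, together with the top 20 words for
the original tweets and for the retweets.
-}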

module Main (main) where

import qualified Data.ByteString.Lazy.Char8 as LB
import qualified Data.HashMap.Strict as HM
import qualified Data.Text as T
import qualified Data.Text.IO as T

import Control.Applicative ((<$>), (<*>))
import Control.Arrow (second)

import Data.Aeson
import Data.List (sortBy, foldl')
import Data.Time (UTCTime)

import Database.HaSparqlClient (Query, NamedGraph, BindingValue(..))

import System.Environment
import System.Exit (exitFailure)
import System.IO (hPutStrLn, stderr)

-- import NLP.Tokenize (tokenize)

import SPARQL (FromBinding(..), makeQuery, queryStore, fromStores, getTimeRange)
import StopWords (stopWords)
import TwitterTokens (tokenize)
import Utils (maybeRead)

doNotWant :: [T.Text]
doNotWant = 
    ["#aas221", "#hackaas", "rt", "mt", "via", "aas", "#a", "...", ".\"", "--"
    , "http", "re"] ++
    stopWords ++
    ["de"]

-- TODO: revisit this now that the Twitter tokenizer is used
wantedWord :: T.Text -> Bool
wantedWord w = 
    let lc = T.toLower w
        isLongEnough t = T.compareLength t 1 == GT
    in isLongEnough w && 
       lc `notElem` doNotWant
       {-
       -- lc /= "aas219" && 
       -- lc /= "n't" &&
       lc `notElem` stopWords && 
       (not . isPunctuation . head) lc &&
       any (not . isPunctuation) lc &&
       any (not . isDigit) lc
       -}
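
{-
Illustrative behaviour (the last case assumes the word is not in
StopWords.stopWords):

  wantedWord "a"        == False  -- too short: length must exceed 1
  wantedWord "RT"       == False  -- lower-cases to "rt", which is in doNotWant
  wantedWord "galaxies" == True
-}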

{-
We remove stop words and some punctuation. Earlier versions (kept
below) also converted to upper case, and were a bit wasteful in using
two tokenization systems, NLP.Tokenize.tokenize and
TwitterTokens.separateLine; it was not clear that both were needed.
The current version uses only TwitterTokens.tokenize.

-}

{-
Original version; uses NLP

twTokens :: T.Text -> [String]
twTokens t = 
    let toks = tokenize . T.unpack $ t
    in map (map toUpper) $ filter wantedWord toks

as above but also adds in the tokenization from TwitterTokens

twTokens :: T.Text -> [String]
twTokens t = 
    let toks = tokenize . T.unpack . separateLine $ t
    in map (map toUpper) $ filter wantedWord toks

now without the case-insensitive comparison

twTokens :: T.Text -> [String]
twTokens t = 
    let toks = tokenize . T.unpack . separateLine $ t
    in filter wantedWord toks
-}

twTokens :: T.Text -> [T.Text]
twTokens = filter wantedWord . tokenize
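
{-
A rough example, assuming TwitterTokens.tokenize yields
whitespace-separated tokens (its exact behaviour lives in
TwitterTokens.hs): applied to "RT cool talk via @foo", twTokens would
drop "RT" and "via" (both in doNotWant, compared case-insensitively)
and keep the remaining tokens, preserving their original case.
-}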

type WordCount = HM.HashMap T.Text Int

countToken :: WordCount -> T.Text -> WordCount
countToken m t = HM.insertWith (+) t 1 m

extractTokens :: [T.Text] -> WordCount
extractTokens = foldl' countToken HM.empty
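
-- A quick sanity check (shown as a plain equality, since the order of
-- HashMap's Show output is unspecified):
--
--   extractTokens ["aas", "demo", "aas"]
--     == HM.fromList [("aas", 2), ("demo", 1)]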

-- | Extract the tokens into an ordered (descending) list;
--   that is, the first element is the most-used word.
getOrderedTokens :: WordCount -> [(T.Text, Int)]
getOrderedTokens = 
    let f (t1,c1) (t2,c2) = compare (c1,t1) (c2,t2)
    in reverse . sortBy f . HM.toList
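
-- Ties on the count are broken by the word itself, also descending:
--
--   getOrderedTokens (HM.fromList [("a",2), ("b",5), ("c",2)])
--     == [("b",5), ("c",2), ("a",2)]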

-- | Return all tokens with a count >= n, in descending
--   order.
minCountTokens :: Int -> WordCount -> [(T.Text, Int)]
minCountTokens minCount = takeWhile ((>=minCount) . snd) . getOrderedTokens
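
-- e.g. minCountTokens 2 (HM.fromList [("a",3), ("b",2), ("c",1)])
--        == [("a",3), ("b",2)]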

-- | Return the most-popular tokens. The return value is not
--   guaranteed to contain all tokens with the "minimum" count,
--   i.e. the count of the last token returned.
mostPopularTokens :: 
    Int           -- ^ Return this number of matches
    -> WordCount 
    -> [(T.Text, Int)]
mostPopularTokens n = take n . getOrderedTokens
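
-- The caveat above in action: "a" has the same count as "b" but is
-- dropped, since exactly n entries are taken:
--
--   mostPopularTokens 1 (HM.fromList [("a",3), ("b",3)]) == [("b",3)]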

toN :: Int -> T.Text
toN = T.pack . show

displayWordCount :: (Int, (T.Text, Int)) -> IO ()
displayWordCount (ctr, (w, c)) = T.putStrLn $ T.concat ["#", toN ctr, " ", w, " ", toN c]
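
-- e.g. displayWordCount (1, ("galaxies", 42)) prints "#1 galaxies 42".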

displaySimple :: Int -> Int -> Int -> UTCTime -> UTCTime -> WordCount -> WordCount -> WordCount -> IO ()    
displaySimple minCount num _ sTime eTime allTmap origTmap rtTmap = do
  putStrLn $ "*** Time range: " ++ show sTime ++ " to " ++ show eTime
  putStrLn "*** All tweets"
  mapM_ displayWordCount $ zip [1..] $ minCountTokens minCount allTmap
  putStrLn "*** Original tweets"
  mapM_ displayWordCount $ zip [1..] $ mostPopularTokens num origTmap
  putStrLn "*** Retweets"
  mapM_ displayWordCount $ zip [1..] $ mostPopularTokens num rtTmap
                                       
displayJSON :: Int -> Int -> Int -> UTCTime -> UTCTime -> WordCount -> WordCount -> WordCount -> IO ()    
displayJSON minCount num nTweets fTweet lTweet allTmap origTmap rtTmap = 
  let toVal :: (T.Text, Int) -> Value
      toVal (w,c) = object [ "word" .= w, "count" .= c ]                                     

      js = object 
           [ "total"     .= nTweets
           , "mincount"  .= minCount
           , "numfilter" .= num
           , "allwords"  .= map toVal (minCountTokens minCount allTmap)
           , "origwords" .= map toVal (mostPopularTokens num origTmap)
           , "rtwords"   .= map toVal (mostPopularTokens num rtTmap)
           , "firstTweet" .= fTweet
           , "lastTweet" .= lTweet
           ]
  in LB.putStrLn . encode $ js
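
{-
The emitted JSON has this shape (the values here are illustrative;
aeson renders the UTCTime fields in ISO 8601 form):

  { "total": 1200, "mincount": 5, "numfilter": 20
  , "allwords":  [ {"word": "galaxies", "count": 42}, ... ]
  , "origwords": [ ... ]
  , "rtwords":   [ ... ]
  , "firstTweet": "2013-01-06T00:00:00Z"
  , "lastTweet":  "2013-01-12T00:00:00Z"
  }
-}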

queryTweets :: [NamedGraph] -> Query
queryTweets ngs = 
    unwords
    [ "prefix sioc: <http://rdfs.org/sioc/ns#>"
    , "prefix sioct: <http://rdfs.org/sioc/types#>"
    , "prefix tw: <http://purl.org/net/djburke/demo/twitter#>"
    , "SELECT ?f ?text "
    , fromStores ngs
    , " WHERE {"
    , "  ?tw a sioct:MicroblogPost ; sioc:content ?text ."
    , "  OPTIONAL { ?tw tw:isRetweet ?f }"
    , "}"
    ]
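
{-
Assuming fromStores renders one "from <graph>" clause per named
graph, the generated query looks roughly like this (re-wrapped here
for readability; unwords actually produces a single line):

  prefix sioc: <http://rdfs.org/sioc/ns#>
  prefix sioct: <http://rdfs.org/sioc/types#>
  prefix tw: <http://purl.org/net/djburke/demo/twitter#>
  SELECT ?f ?text
  from <...>
  WHERE {
    ?tw a sioct:MicroblogPost ; sioc:content ?text .
    OPTIONAL { ?tw tw:isRetweet ?f }
  }
-}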

-- An unbound ?f means the OPTIONAL tw:isRetweet pattern in the query
-- did not match, so the tweet is treated as not being a retweet.
handleFlag :: BindingValue -> Maybe Bool
handleFlag Unbound = Just False
handleFlag bv      = fromBinding bv

getTweets :: [BindingValue] -> Maybe (Bool, T.Text)
getTweets [fbv, tbv] = (,) <$> handleFlag fbv <*> fromBinding tbv
getTweets _ = Nothing
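
-- Each result row should contain exactly two bindings (the ?f and
-- ?text of the SELECT clause in queryTweets); any other shape maps
-- to Nothing.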

processTweets :: 
    (Int -> UTCTime -> UTCTime -> WordCount -> WordCount -> WordCount -> IO ()) 
              -- ^ Given the number of tweets and the counts, output the values
    -> String  -- ^ The SPARQL endpoint
    -> IO ()
processTweets display endpoint = do
  stores <- queryStore endpoint
  tweets <- makeQuery getTweets endpoint (queryTweets stores)
  (firstTime, lastTime) <- getTimeRange endpoint stores
  let -- The flag before each list of tokens identifies if it is
      -- from a retweet (True) or was an "original" Tweet (False)
      tokenInfo :: [(Bool, [T.Text])]
      tokenInfo = map (second twTokens) tweets

      toTokens   = concatMap snd
      allTokens  = toTokens tokenInfo
      origTokens = toTokens $ filter (not . fst) tokenInfo
      rtTokens   = toTokens $ filter fst tokenInfo

      allTmap  = extractTokens allTokens
      origTmap = extractTokens origTokens
      rtTmap   = extractTokens rtTokens

  display (length tweets) firstTime lastTime allTmap origTmap rtTmap
  
usage :: IO ()
usage = do
  progName <- getProgName
  hPutStrLn stderr $ "Usage: " ++ progName ++ " [json] <endpoint> <mincount> <num>"
  exitFailure

getInts :: String -> String -> Maybe (Int, Int)
getInts a b = (,) <$> maybeRead a <*> maybeRead b
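
-- Assuming Utils.maybeRead behaves like Text.Read.readMaybe:
--
--   getInts "5" "20" == Just (5, 20)
--   getInts "5" "x"  == Nothing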
  
main :: IO ()
main = do
  args <- getArgs
  case args of
    
    [a1, a2, a3, a4] ->
        if a1 == "json"
        then case getInts a3 a4 of 
               Just (x1, x2) -> processTweets (displayJSON x1 x2) a2
               _ -> usage
        else usage

    [b1, b2, b3] ->
        case getInts b2 b3 of 
          Just (y1, y2) -> processTweets (displaySimple y1 y2) b1
          _ -> usage

    _ -> usage