Source

astrosearch / CountUserTweets.hs

Full commit
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE RecordWildCards #-}

{-

Usage:

  ./countusertweets <endpoint> [mincounts]
  ./countusertweets json <endpoint> [mincounts]

Aim:

Count up all the tweets by each user and write to the screen
(when first argument isn't json)

  count handle name (tab separated)

otherwise write the results out in JSON format. If mincounts is given, only
those users with at least mincounts tweets will be included.

The text (non-JSON) version should probably be piped through

  sort -n -r -k1

to ensure the users are listed by number of tweets.

-}

module Main where

import qualified Data.ByteString.Lazy.Char8 as LB
import qualified Data.Map as M
import qualified Data.Text as T

import Data.Aeson
import Data.Time (UTCTime)

import System.Environment (getArgs, getProgName)
import System.Exit (exitFailure)
import System.IO (stderr, hPutStrLn)

import Control.Monad (liftM)

import SPARQL ( UserId
              , BasicUserInfo(..), BasicTweetCount(..)
              , getUserName, getUserHandle
	      , queryStore
              , getTimeRange, getBasicUserInfo, getUserTweetCount)

import Utils (maybeRead)

-- | Per-user summary that is written out, either as a tab-separated line
-- or as an element of the JSON @users@ array.
data UserInfo = UI
  { uiHandle :: T.Text
    -- ^ Handle of the user (serialised under the JSON key @handle@)
  , uiName :: T.Text
    -- ^ Name of the user (serialised under the JSON key @label@)
  , uiCount :: Int
    -- ^ Number of tweets (includes retweets)
  , uiRetweets :: Int
    -- ^ Number of retweets
  }
  deriving (Eq, Show)
                         
-- | A user is serialised as a JSON object with the keys
-- @count@, @retweets@, @handle@ and @label@.
instance ToJSON UserInfo where
  toJSON user = object fields
    where
      -- Note that the user's name is written under the key "label",
      -- not "name".
      fields =
        [ "count" .= uiCount user
        , "retweets" .= uiRetweets user
        , "handle" .= uiHandle user
        , "label" .= uiName user
        ]
    
{-
I used to use a single query to combine user data and ntweets, but this
assumed that there was a unique mapping between user and foaf:name; however
users can have multiple foaf:name fields (and sioc:name). So I now use
separate queries (including a canned one from SPARQL)

 a) get number of tweets for each user
 b) get user details

We do them in this order so that if the database gets updated between the
two queries we don't end up with the possibility of having tweets without
users (instead we may get users without tweets).
-}

-- | Combine the per-user tweet counts with the per-user details.
--
-- Every entry of the count map whose 'btcId' is also a key of the user map
-- produces one 'UserInfo'; entries without matching user details are
-- silently dropped. Users that only appear in the user map (i.e. have no
-- count entry) never appear in the output.
toUserInfo :: M.Map UserId BasicTweetCount -> M.Map UserId BasicUserInfo -> [UserInfo]
toUserInfo cMap uMap = M.foldl' addUser [] cMap
  where
    -- Prepend the summary for this count entry, or leave the accumulator
    -- untouched when the user details are missing.
    addUser acc btc = maybe acc (: acc) (summarise btc)

    -- Look the user up by the identifier stored in the count record.
    summarise btc = fmap (mkInfo btc) (M.lookup (btcId btc) uMap)

    mkInfo btc bui =
      UI (getUserHandle bui) (getUserName bui) (btcNTweets btc) (btcNRetweets btc)
 
-- | Run the queries against the given endpoint and collect the results:
-- the per-user summaries together with the pair of times returned by
-- 'getTimeRange'. The order of the users in the output is not guaranteed.
query :: String -> IO ([UserInfo], (UTCTime, UTCTime))
query endpoint = do
  store <- queryStore endpoint
  timeRange <- getTimeRange endpoint store
  -- The tweet counts are requested before the user details on purpose: if
  -- the database changes between the two queries we may end up with users
  -- that have no tweets, but not with tweets that have no user.
  tweetCounts <- getUserTweetCount endpoint store
  userDetails <- getBasicUserInfo endpoint store
  return (toUserInfo tweetCounts userDetails, timeRange)

-- TODO: display the retweet count

-- | Write a single user to stdout as one line of the form
--
-- > count<TAB>handle<TAB>name
--
-- Tabs are used as the separator because a handle can contain spaces
-- (this happens when a user has multiple handles).
displayCount :: UserInfo -> IO ()
displayCount user = putStrLn (concat columns)
  where
    columns =
      [ show (uiCount user)
      , "\t"
      , T.unpack (uiHandle user)
      , "\t"
      , T.unpack (uiName user)
      ]

-- | Restrict the user list to those users whose tweet count ('uiCount')
-- is at least the given minimum. The relative order of the remaining
-- users is unchanged.
filterUsers :: Int -> [UserInfo] -> [UserInfo]
filterUsers minCount users = [u | u <- users, uiCount u >= minCount]

-- | Write the query results to stdout as a single JSON object with the
-- keys @total@, @mincount@, @firstTweet@, @lastTweet@ and @users@.
--
-- The first argument is the optional minimum tweet count: when given, only
-- users passing 'filterUsers' are listed and the value is reported as
-- @mincount@; when absent every user is listed and @mincount@ is 1.
displayAsJSON :: Maybe Int -> ([UserInfo], (UTCTime, UTCTime)) -> IO ()
displayAsJSON minCount (users, timeRange) = LB.putStrLn (encode summary)
  where
    (firstTweet, lastTweet) = timeRange

    -- The total is taken over all users, i.e. before any filtering.
    total = sum (map uiCount users)

    (threshold, kept) =
      maybe (1, users) (\n -> (n, filterUsers n users)) minCount

    summary =
      object
        [ "total" .= total
        , "mincount" .= threshold
        , "firstTweet" .= firstTweet
        , "lastTweet" .= lastTweet
        , "users" .= kept
        ]

-- | Print a one-line usage message (prefixed by the program name) to
-- stderr and terminate the program with a failure exit status.
usage :: IO ()
usage =
  getProgName >>= \prog -> do
    hPutStrLn stderr ("Usage: " ++ prog ++ " [json] <endpoint> [mincount]")
    exitFailure

-- | Entry point. The accepted command lines are
--
-- > <endpoint>
-- > <endpoint> <mincount>
-- > json <endpoint>
-- > json <endpoint> <mincount>
--
-- Without the leading @json@ the users are written as tab-separated lines
-- (see 'displayCount'); with it a single JSON object is written (see
-- 'displayAsJSON'). Anything else - including a @mincount@ that can not be
-- read as a number - prints the usage message and exits with a failure.
main :: IO ()
main = do
  args <- getArgs
  case args of
    -- "json" is the output-format keyword, so on its own it means the
    -- endpoint is missing; do not try to query an endpoint called "json".
    ["json"] -> usage

    [url] -> query url >>= mapM_ displayCount . fst

    ["json", url] -> query url >>= displayAsJSON Nothing

    [url, minStr] -> case maybeRead minStr of
      Just i -> query url >>= mapM_ displayCount . filterUsers i . fst
      Nothing -> usage

    ["json", url, minStr] -> case maybeRead minStr of
      Just i -> query url >>= displayAsJSON (Just i)
      Nothing -> usage

    _ -> usage