astrosearch / GetUserConnections.hs

{-# LANGUAGE CPP #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE RecordWildCards #-}

{-

Usage:

  ./getuserconnections <endpoint>
  ./getuserconnections csv <endpoint>
  ./getuserconnections json <endpoint>
  ./getuserconnections sif <endpoint>
  ./getuserconnections communities <endpoint>

Aim:

Calculate the "connected" users in the Tweets by
seeing who mentions and responds to other users.

The output - to stdout - when csv is selected is

  tweeter,references;count

where tweeter and references are the Twitter user names (using the
canonical case where possible) and count is an integer giving the
number of times that tweeter mentions references via the @username
syntax. Any commas in tweeter or references are converted to \,
for this format.

The JSON format is that required by the aas219-html javascript,
and consists of

  nodes: [{ name: name, handle: handle: followers: count }, ..]
  links: [{ source: sindex, target: tindex, value: count }, ..]

and the sindex/tindex values index into nodes.

The sif argument creates
   <node1>\tab<count>\tab<node2>

since that is the format required by http://www.BioFabric.org/ (I am
assukming the count value is one of a small number of values so it can be
considered to be a small set of labels for v 1.0.0 of biofabric).

The communities argument displays debugging output related to the
community detection.

When no argument is given an "unspecified" screen output is used
(for debugging).

Notes:

The name and handle fields of a user can contain multiple values,
in which case they are separated by " AKA ".

-}

module Main where

import qualified Data.ByteString.Lazy.Char8 as LB
import qualified Data.HashMap.Strict as HM
import qualified Data.Map as M
import qualified Data.Set as S
import qualified Data.Text as T

-- import Control.Monad (forM_)

import Data.Hashable (Hashable(..))
import Data.List (foldl')
import Data.Maybe (fromJust, fromMaybe, mapMaybe)
import Data.Time (UTCTime)
-- import Data.Tuple (swap)

import Database.HaSparqlClient (Query, NamedGraph, BindingValue(..))

import System.Environment (getArgs, getProgName)
import System.Exit (exitFailure)
import System.IO (stderr, hPutStrLn)

import Data.Aeson

import CommDetect ( NodeLabel, NodePair
                  -- , fromNodePair
                  , toNodePair, toNodeLabel, toEdges
                  , findCommunities)
import SPARQL ( BasicUserInfo(..), BasicTweetCount(..)
              , FollowerInfo(..)
              , UserId
              , to4
              , fromUserId
              , getUserName, getUserHandle
              , makeQuery, queryStore, fromStores
              , getTimeRange, getBasicUserInfo
              , getNumberFollowInfo
	      , getUserTweetCount)

-- perhaps should use Data.Map rather than Data.HashMap?
instance Hashable UserId where
#if !MIN_VERSION_hashable(1,2,0)
  hash = hash . fromUserId
#endif
  hashWithSalt s = hashWithSalt s . fromUserId

type Community = Int

{-
Find out who references who.
-}

queryConnections :: [NamedGraph] -> Query
queryConnections ngs = 
  unwords
  [ "prefix sioc: <http://rdfs.org/sioc/ns#>"
  , "prefix sioct: <http://rdfs.org/sioc/types#>"
  , "prefix dcterms: <http://purl.org/dc/terms/>"
  , "prefix tw: <http://purl.org/net/djburke/demo/twitter#>"
  , "SELECT ?cid ?pid (count(?tweet) as ?ntweets) (count(?rt) as ?nretweets)"
  , fromStores ngs
  , " WHERE {"
  , "  ?tweet a sioct:MicroblogPost ;"
  , "     sioc:has_creator [ sioc:id ?cid ] ;"
  , "     dcterms:references [ sioc:id ?pid ] ."
  , "  OPTIONAL { ?tweet tw:isRetweet ?rt }"
  , "} GROUP BY ?cid ?pid"
  ]

-- | Find what community an edge is in
getCommunityMap :: M.Map Community (S.Set NodePair) -> M.Map NodePair Community
getCommunityMap =
    let addSet orig comm pairs = S.foldl' (\o np -> M.insert np comm o) orig pairs
    in M.foldlWithKey' addSet M.empty

{-
Run the query against the given endpoint, collecting
up the results.
The user information is collected after the connection
information so that if the store is updated during the
run time then the user information will reflect this
new info (i.e. we will not get connection information
for users we do not know about).
-}
queries :: String -> IO ([UserConn], Int, (UTCTime, UTCTime))
queries endpoint = do
  stores <- queryStore endpoint
  (firstTime, lastTime) <- getTimeRange endpoint stores
  conns <- makeQuery getConn endpoint (queryConnections stores)
  buinfo <- getBasicUserInfo endpoint stores
  ntweets <- getUserTweetCount endpoint stores
  followInfo <- getNumberFollowInfo endpoint stores
  
  let um = makeUserMap buinfo followInfo ntweets
      times = (firstTime, lastTime)

      -- identify communities
      getU (u1,u2,_,_) = [u1,u2]
      userPairs = zip (S.toAscList $ S.fromList $ concatMap getU conns) $ map toNodeLabel [1..]
      users = M.fromDistinctAscList userPairs
      -- rusers = M.fromList $ map swap userPairs
      userIndex u = fromMaybe (error ("*Internal error* unknown user " ++ show u)) $ M.lookup u users
      -- getName i = fromMaybe (error ("*Internal error* unknown user label " ++ show i)) $ M.lookup i rusers

      toNP (u1,u2,_,_) = toNodePair (userIndex u1) (userIndex u2)

      e = toEdges $ S.fromList $ mapMaybe toNP conns
      (ncomms, commMap) = case findCommunities e of
                            Nothing -> (0, M.empty)
                            Just (_,_,cmap) -> (M.size cmap, getCommunityMap cmap)

  return (map (toUserConn um commMap userIndex) conns, ncomms, times)
  
data UserInfo = 
  UI 
  { uiId        :: UserId        -- ^ the twitter id 
  , uiHandle    :: T.Text        -- ^ the Twitter name
  , uiName      :: T.Text        -- ^ the full name of the user
  , uiFollowers :: Int           -- ^ the number of followers
  , uiNTweets   :: Int           -- ^ the number of tweets (including retweets)
  , uiNRetweets :: Int           -- ^ the number of retweets
  , uiNFollowsInGroup :: Int     -- ^ the number of people this user follows that are also tweeted
  , uiNFollowersInGroup :: Int   -- ^ the number of people that follow this user and that also tweeted
  } deriving (Show, Eq, Ord)
             
instance ToJSON UserInfo where
  toJSON UI {..} = 
    object [ "name" .= uiName
           , "handle" .= uiHandle
           , "followers" .= uiFollowers
           , "ntweets" .= uiNTweets
           , "nretweets" .= uiNRetweets
	   , "nfriendInGroup" .= uiNFollowsInGroup
	   , "nfollowersInGroup" .= uiNFollowersInGroup
           ]

type UserMap = HM.HashMap UserId UserInfo

data UserConn = 
    UC
    { ucFrom      :: UserInfo  -- ^ Account that wrote the tweets
    , ucTo        :: UserInfo  -- ^ Account that was mentioned
    , ucCommunity :: Int       -- ^ community number (or 0 if no community; e.g. ucFrom==ucTo)
    , ucNTweets   :: Int       -- ^ Number of all tweets
    , ucNRetweets :: Int       -- ^ Number of retweets
    } deriving (Eq, Show)

makeUserMap :: 
    M.Map UserId BasicUserInfo 
    -> M.Map UserId FollowerInfo
    -> M.Map UserId BasicTweetCount  -- ^ number of all tweets, number of retweets
    -> UserMap
makeUserMap bumap followInfo numtweets =
  let f omap bui = 
        let uid = buiId bui
	    FollowerInfo {..} = fromJust $ M.lookup uid followInfo
            BasicTweetCount {..} = fromMaybe (error ("No tweet count for user: " ++ show uid))
                                   $ M.lookup uid numtweets
            val = UI uid (getUserHandle bui) (getUserName bui) fiNumFollowers btcNTweets btcNRetweets fiNumFollowsInGroup fiNumFollowersInGroup
        in HM.insert uid val omap
  in M.foldl' f HM.empty bumap

-- could easily make this return Maybe UserConn instead but for now
-- just let it error out
toUserConn :: UserMap -> M.Map NodePair Community -> (UserId -> NodeLabel) -> (UserId, UserId, Int, Int) -> UserConn
toUserConn um cmap nlbl (cid, pid, nt, nrt) = 
    let u1 = toUserInfo um cid 
        u2 = toUserInfo um pid
        commid = case toNodePair (nlbl cid) (nlbl pid) of
                   Nothing -> 0
                   Just np -> fromMaybe 0 $ M.lookup np cmap
    in UC u1 u2 commid nt nrt

toUserInfo :: UserMap -> UserId -> UserInfo
toUserInfo um uid =
    fromMaybe (error ("Unknown user: " ++ show uid)) (HM.lookup uid um)

{-
toUserInfo :: UserMap -> UserId -> UserInfo
toUserInfo um uid = fromMaybe (unknownUser uid) (HM.lookup uid um)
-- toUserInfo = fromJust . flip HM.lookup

unknownUser :: UserId -> UserInfo
unknownUser uid = 
  let hdl = T.pack $ "<unknown>:" ++ show (fromUserId uid)
  in UI uid hdl hdl Nothing 0 0
-}

{-             
Return text identifying the user (the value depends on
what information we know about the user). Since this is for
the CSV output we replace , by \,

Actually, we now just use the user handle instead to avoid
this need for protection (since \, didn't seem to work in 
Gephi v0.8.2 beta).
-}
getCSVLabel :: UserInfo -> T.Text
-- getCSVLabel UI {..} = T.replace "," "\\," uiName
getCSVLabel = uiHandle

getConn :: [BindingValue] -> Maybe (UserId, UserId, Int, Int)
getConn = to4

{-
Need separate maps for users and connections. We could
do the loop in one, but separate out for simplicity.

The ordering for users is currently unspecified.
-}
displayJSON :: ([UserConn], Int, (UTCTime, UTCTime)) -> IO ()
displayJSON (us, ncomm, ts) = 
  let uSet = foldl' (\s1 UC {..} -> S.insert ucTo (S.insert ucFrom s1)) S.empty us
      users = S.toList uSet
      uMap = zip users [0::Int ..]
  
      getUserIndex = fromJust . flip lookup uMap
      mkLink UC {..} = 
        object [ "source"  .= getUserIndex ucFrom
               , "target"  .= getUserIndex ucTo
               , "ntweets" .= ucNTweets
               , "nretweets" .= ucNRetweets
               , "community" .= ucCommunity
               ]
        
      j = encode $ object 
          [ "firstTweet" .= fst ts
          , "lastTweet" .= snd ts
          , "numCommunities" .= ncomm
          , "nodes" .= users
          , "links" .= map mkLink us ]
      
  in LB.putStrLn j

fst3 :: (a,b,c) -> a
fst3 (a,_,_) = a

{-
The CSV-like format used by Gephi, namely

   user1,user2;count

We need to protect any commas that appear in user1/2.

For now we do not display the retweet count
-}
displayCSV :: ([UserConn], a, b) -> IO ()
displayCSV = 
  let asCSV UC {..} = T.unpack (getCSVLabel ucFrom) ++ "," ++ T.unpack (getCSVLabel ucTo) ++ ";" ++ show ucNTweets
  in mapM_ (putStrLn . asCSV) . fst3
  
displaySIF :: ([UserConn], a, b) -> IO ()
displaySIF = 
  let asSIF UC {..} = T.unpack (getCSVLabel ucFrom) ++ "\t" ++ show ucNTweets ++ "\t" ++ T.unpack (getCSVLabel ucTo)
  in mapM_ (putStrLn . asSIF) . fst3

-- Ideally would like to store the community information somewhere
displayCommunities :: ([UserConn], Int, a) -> IO ()
displayCommunities (ucs, ncomm, _) = 
    if ncomm == 0
    then putStrLn "No communities found!"
    else do
      putStrLn $ "*** Found " ++ show ncomm ++ " communities"
      mapM_ (putStrLn . show) ucs 

{-
         Just (dlist, dmax, cmap) -> do
                                putStrLn $ "*** Found " ++ show (M.size cmap) ++ " communities"
                                putStrLn $ "*** Best Similarity,Density = " ++ show dmax
                                putStrLn $ "*** (S,D) pairs = " ++ show dlist
                                forM_ (M.assocs cmap) $ \(comm, pairs) ->
                                    let getNames np = let (a,b) = fromNodePair np
                                                      in (uiHandle (getName a), uiHandle (getName b))
                                        ns = map getNames $ S.toAscList pairs
                                    in putStrLn $ " " ++ show comm ++ " : " ++ show ns
-}
                                
{-
Just dump the output.
-}
displayDebug :: ([UserConn], a, b) -> IO ()
displayDebug = mapM_ print . fst3

usage :: IO ()
usage = do
  pName <- getProgName
  hPutStrLn stderr $ "Usage: " ++ pName ++ " [csv|json|sif] <endpoint>"
  exitFailure
  
main :: IO ()
main = do
  args <- getArgs
  case args of
    [endpoint] -> queries endpoint >>= displayDebug
      
    (a:endpoint:[]) | a == "json" -> queries endpoint >>= displayJSON
                    | a == "csv"  -> queries endpoint >>= displayCSV
                    | a == "sif"  -> queries endpoint >>= displaySIF
                    | a == "communities"  -> queries endpoint >>= displayCommunities
    _ -> usage
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.