Commits

Doug Burke committed 0183517

Consolidate calculation of num followers/follows

Comments (0)

Files changed (3)

GetUserConnections.hs

 import Data.Aeson
 
 import SPARQL ( BasicUserInfo(..), BasicTweetCount(..)
+              , FollowerInfo(..)
               , UserId
               , to4
               , fromUserId
               , getUserName, getUserHandle
               , makeQuery, queryStore, fromStores
               , getTimeRange, getBasicUserInfo
-              , getNumberFollowers, getUserTweetCount)
+              , getNumberFollowInfo
+	      , getUserTweetCount)
 
 -- perhaps should use Data.Map rather than Data.HashMap?
 instance Hashable UserId where
   conns <- makeQuery getConn endpoint (queryConnections stores)
   buinfo <- getBasicUserInfo endpoint stores
   ntweets <- getUserTweetCount endpoint stores
-  followers <- getNumberFollowers endpoint stores
+  followInfo <- getNumberFollowInfo endpoint stores
   
-  let um = makeUserMap buinfo followers ntweets
+  let um = makeUserMap buinfo followInfo ntweets
   return (map (toUserConn um) conns, (firstTime, lastTime))
   
 data UserInfo = 
 
 makeUserMap :: 
     M.Map UserId BasicUserInfo 
-    -> M.Map UserId Int -- ^ number of followers
+    -> M.Map UserId FollowerInfo
     -> M.Map UserId BasicTweetCount  -- ^ number of all tweets, number of retweets
     -> UserMap
-makeUserMap bumap followers numtweets =
+makeUserMap bumap followInfo numtweets =
   let f omap bui = 
         let uid = buiId bui
-            mnf = M.lookup uid followers
+            mnf = fiNumFollowers `fmap` M.lookup uid followInfo
             BasicTweetCount {..} = fromMaybe (error ("No tweet count for user: " ++ show uid))
                                    $ M.lookup uid numtweets
             val = UI uid (getUserHandle bui) (getUserName bui) mnf btcNTweets btcNRetweets
          , UserId
          , BasicUserInfo(..)
          , BasicTweetCount(..)
+	 , FollowerInfo(..)
 	 , Counter(..)
 
          , to1, to2, to3, to4, toCounter
 	 , getUserName
 	 , getUserHandle
 	 , getUserTweetCount
-         , getNumberFollowers
+         , getNumberFollowInfo
        ) where
 
 import qualified Data.Map as M
 import Control.Applicative ((<$>), (<*>))
 
 import Data.List (foldl')
-import Data.Maybe (mapMaybe)
+import Data.Maybe (fromMaybe, mapMaybe)
 import Data.Time (UTCTime(..), ParseTime, getCurrentTime, parseTime)
 import Data.Typeable (Typeable)
 import Database.HaSparqlClient (Service(Sparql), Query, NamedGraph, BindingValue(..), Method(HGET), runSelectQuery)
   , "GROUP BY ?userid"
   ]
 
-queryNumFollowers :: [NamedGraph] -> Query
-queryNumFollowers ngs = 
+-- The total number of follows and followers of each user, as recorded by Twitter
+-- at the time of each of that user's tweets. We use the maximum value stored in
+-- the graph for each quantity.
+queryNumFollowerInfo :: [NamedGraph] -> Query
+queryNumFollowerInfo ngs = 
   unwords
   [ "prefix sioc: <http://rdfs.org/sioc/ns#>"
   , "prefix tw: <http://purl.org/net/djburke/demo/twitter#>"
-  , "SELECT ?userid (max(?followers) as ?num) "
+  , "SELECT ?userid (MAX(?follows) AS ?nfollows) (MAX(?followers) AS ?nfollowers) "
   , fromStores ngs
   , " WHERE {"
-  , "  [] a sioc:UserAccount ; sioc:id ?userid ; tw:numFollowers ?followers ."
+  , "  [] a sioc:UserAccount ; sioc:id ?userid ; "
+  , "     tw:numFriends ?follows ; tw:numFollowers ?followers ."
+  , "} GROUP BY ?userid"
+  ]
+
+-- Count, for each user, the followers (or follows) who are themselves part of
+-- the "group". Note that this data was accessed after the conference, so it
+-- does not necessarily reflect the links between users during the conference;
+-- it is only a "best guess".
+--
+queryNumFollowersInGroup ::[NamedGraph] -> Query
+queryNumFollowersInGroup ngs = 
+  unwords
+  [ "prefix sioc: <http://rdfs.org/sioc/ns#>"
+  , "prefix tw: <http://purl.org/net/djburke/demo/twitter#>"
+  , "SELECT ?userid (COUNT(?follower) as ?nfollowers) "
+  , fromStores ngs
+  , " WHERE {"
+  , "  ?user a sioc:UserAccount ; sioc:id ?userid ."
+  , "  OPTIONAL { ?follower tw:follows ?user . }"
+  , "} GROUP BY ?userid"
+  ]
+
+queryNumFollowsInGroup ::[NamedGraph] -> Query
+queryNumFollowsInGroup ngs = 
+  unwords
+  [ "prefix sioc: <http://rdfs.org/sioc/ns#>"
+  , "prefix tw: <http://purl.org/net/djburke/demo/twitter#>"
+  , "SELECT ?userid (COUNT(?follow) as ?nfollows) "
+  , fromStores ngs
+  , " WHERE {"
+  , "  ?user a sioc:UserAccount ; sioc:id ?userid ."
+  , "  OPTIONAL { ?user tw:follows ?follow . }"
   , "} GROUP BY ?userid"
   ]
 
      , buiName      :: S.Set T.Text   -- ^ The "full name" (foaf:name in store)
      } deriving (Show, Eq)
 
+-- | How many users follow and are followed by this user. This
+--   data is only approximate (since the relationships change over time
+--   and were only accessed at discrete times).
+data FollowerInfo =
+     FollowerInfo
+     { fiNumFollows     :: Int -- ^ the number of people this user follows (i.e. they are "friends" with)
+     , fiNumFollowers   :: Int -- ^ the number of people that follow this user
+     , fiNumFollowsInGroup    :: Int -- ^ the number of people this user follows that have
+                                     --   contributed to the conference
+     , fiNumFollowersInGroup  :: Int -- ^ the number of people that follow this user and have
+                                     --   contributed to the conference
+     } deriving (Show, Eq)
+
 -- | Returns the user name (if there are multiple, they are
 --   separated by AKA).     
 getUserName :: BasicUserInfo -> T.Text
   return $ M.fromList $ map (\(uid,nt,nrt) -> (uid, BasicTweetCount uid nt (nt-nrt) nrt)) tinfo
 
 {-|
-Return the number of followers (the maximum value).
+Return, for each user, the number of follows and followers (the maximum values
+stored in the graph) and how many of these are also in the "group". Note that
+these two sets of values are calculated from different sets of data, so they
+are not quite comparable.
 -}
-getNumberFollowers :: String -> [NamedGraph] -> IO (M.Map UserId Int)
-getNumberFollowers endpoint stores = 
-    M.fromList `fmap` makeQuery to2 endpoint (queryNumFollowers stores)
+getNumberFollowInfo :: 
+    String 
+    -> [NamedGraph]
+    -> IO (M.Map UserId FollowerInfo)
+getNumberFollowInfo endpoint stores = do
+    followInfo <- makeQuery to3 endpoint (queryNumFollowerInfo stores)
+    grFollows <- makeQuery to2 endpoint (queryNumFollowsInGroup stores)
+    grFollowers <- makeQuery to2 endpoint (queryNumFollowersInGroup stores)
 
+    let mFollows   = M.fromList grFollows
+        mFollowers = M.fromList grFollowers
+
+        out m (uid,n1,n2) =
+          let nf1 = fromMaybe 0 $ M.lookup uid mFollows
+              nf2 = fromMaybe 0 $ M.lookup uid mFollowers
+              v = FollowerInfo n1 n2 nf1 nf2
+          in M.insertWith' const uid v m
+
+    return $ foldl' out M.empty followInfo 
+
                  , AddURL(..), GetURL(..)
                  , emptySimpleStatsStore)
 import SPARQL ( FromBinding(..), UserId, BasicUserInfo, BasicTweetCount(..)
+              , FollowerInfo(..)
        	      , Counter(..)
               , makeQuery, queryStore, fromStores
               , to1, to2, toCounter
               , getUserName, getUserHandle
               , getUserTweetCount
-              , getTimeRange, getBasicUserInfo)
+              , getTimeRange, getBasicUserInfo, getNumberFollowInfo
+              )
 import Utils (maybeRead)
 
 instance ToJSON N.URI where
   , "}"
   ]
 
-queryFollowerCount :: [NamedGraph] -> Query
-queryFollowerCount ngs = 
-  unwords
-  [ "prefix sioc: <http://rdfs.org/sioc/ns#>"
-  , "prefix sioct: <http://rdfs.org/sioc/types#>"
-  , "prefix tw: <http://purl.org/net/djburke/demo/twitter#>"
-  , "prefix dcterms: <http://purl.org/dc/terms/>"
-  , "SELECT ?userid (MAX(?nfriends) as ?nfriend) (MAX(?nfollowers) as ?nfollower) "
-  , fromStores ngs
-  , " WHERE {"
-  , "  [] a sioc:UserAccount ; sioc:id ?userid ;"
-  , "     tw:numFollowers ?nfollowers ; tw:numFriends ?nfriends ."
-  , "} GROUP BY ?userid"
-  ]
-
 queryPublisherCount :: [NamedGraph] -> Query
 queryPublisherCount ngs = 
   unwords
   mentionedHashTag <- makeQuery toCounter endpoint (queryMostMentionedHashTag stores)
   mentionedURL     <- makeQuery toCounter endpoint (queryMostMentionedURL stores)
   reTweetTimes     <- makeQuery toDelta   endpoint (queryTimeToRetweet stores)
-  followerCount    <- makeQuery (toFollowerCount countMap) endpoint (queryFollowerCount stores)
 
   publisherCount   <- makeQuery toCounter endpoint (queryPublisherCount stores)
   publisherInfo    <- makeQuery to2       endpoint (queryPublisherInfo stores)
 
+  followerInfo     <- getNumberFollowInfo endpoint stores
   uinfo            <- getBasicUserInfo endpoint stores
 
+  -- for now assume countMap and followerInfo have the same set of users in them
+  let toFC :: BasicTweetCount -> FollowerInfo -> (Int, Int, Int, Int)
+      toFC BasicTweetCount {..} FollowerInfo {..} = (fiNumFollows, fiNumFollowers, btcNTweets, btcNRetweets)
+      followerCount = M.elems $ M.intersectionWith toFC countMap followerInfo
+
   let f :: (Ord a) => [Counter a] -> [Counter a]
       f = take nmax . rsort