Commits

Doug Burke committed ca46253

Include a histogram of the number of different tools used to write tweets by a user

Comments (0)

Files changed (1)

 
 import Data.Acid hiding (Query)
 import Data.Aeson
-import Data.List (foldl')
+import Data.List (foldl', groupBy, sort)
 import Data.Maybe (fromMaybe, fromJust)
 import Data.Time (UTCTime, NominalDiffTime, diffUTCTime)
 import Database.HaSparqlClient (Query, NamedGraph, BindingValue(..))
     , Hist -- ^ retweet histogram
     , Hist -- ^ user tweet histogram
     , Hist -- ^ user retweet histogram
+    , Hist -- ^ number of different publishers used by a user 
     , V.Vector KDE  -- ^ KDE for the time-to-retweet values (10 minutes)
     , V.Vector KDE  -- ^ KDE for the time-to-retweet values (6 hour)
     , Sorted Object  -- ^ JSON from Twitter containing the contents of the
   , "} GROUP BY ?publisher"
   ]
 
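+-- Look at how many different "publishers" (the tools used to write
+-- tweets) a user used. The query returns one row per (user, publisher)
+-- pair, so the number of rows for a user is the number of different
+-- publishers that user has used.
+--
+-- For the moment we do not include the actual publishers used.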
+queryUserPublisherCount :: [NamedGraph] -> Query
+queryUserPublisherCount ngs = 
+  unwords
+  [ "prefix sioc: <http://rdfs.org/sioc/ns#>"
+  , "prefix sioct: <http://rdfs.org/sioc/types#>"
+  , "prefix dcterms: <http://purl.org/dc/terms/>"
+  -- , "SELECT ?userid ?publisher (COUNT(?publisher) as ?ntw) "
+  , "SELECT ?userid (COUNT(?publisher) as ?ntw) "
+  , fromStores ngs
+  , " WHERE {"
+  , "  [] a sioct:MicroblogPost ; dcterms:publisher ?publisher ; sioc:has_creator [ sioc:id ?userid ] ."
+  , "} GROUP BY ?userid ?publisher"
+  ]
 
 getCount :: [BindingValue] -> Maybe Int
 getCount (_:bv:[]) = fromBinding bv
   reTweetTimes     <- makeQuery toDelta   endpoint (queryTimeToRetweet stores)
 
   publisherCount   <- makeQuery toCounter endpoint (queryPublisherCount stores)
+  userPubCount     <- makeQuery toCounter endpoint (queryUserPublisherCount stores)
   publisherInfo    <- makeQuery to2       endpoint (queryPublisherInfo stores)
 
   followerInfo     <- getNumberFollowInfo endpoint stores
       
       nPub = foldl' (\c ctr -> c + _count ctr) 0 publisherCount
 
+      -- fix the value type produced by toCounter when creating userPubCount:
+      -- comparing with uidEq forces the values to be UserId
+      uidEq :: UserId -> UserId -> Bool
+      uidEq = (==)
+
+      userPub = map length $ groupBy uidEq $ sort $ map _value userPubCount
+
   return ( M.size countMap
          , M.size onlyRetweetMap
          , ntws, nrtws, nPub
          , toHistogram retweetCount
          , toHistogram userTweetCount
          , toHistogram userRetweetCount
+         , toHistogram userPub
          , rtwDelta10MinHist
          , rtwDelta6HourHist
          , tws
 {-
 Instead of a KDE, just create a histogram.
 -}
-calcHist ::
-  Double  -- bin size in seconds
-  -> [NominalDiffTime]
+calcSimpleHist ::
+  Double
+  -> [Double]
   -> V.Vector KDE
-calcHist binsize dts =
-  let v = UV.fromList $ map (fromRational . toRational) dts
+calcSimpleHist binsize vs =
+  let v = UV.fromList vs
       vlo = 0.0 -- UV.minimum v
       vhi = UV.maximum v
 
 
   in V.map K $ UV.convert $ UV.zip xs ys
 
+calcHist ::
+  Double  -- bin size in seconds
+  -> [NominalDiffTime]
+  -> V.Vector KDE
+calcHist binsize = calcSimpleHist binsize . map (fromRational . toRational)
+
 {-
 Extract a URI from a request. I am assuming that if
 the URL was in the request then it's valid, which assumes I am
 displayCount :: 
     QueryResponse
     -> IO ()
-displayCount (n, nrtonly, ntws, nrtws, nPub, rtwHist, utwHist, urtwHist, _, _, rtStatus, repliedUser, garrulousUser, rtUser, mUser, mHT, mURL, mPub, _, uMap, ftime, ltime) = do
+displayCount (n, nrtonly, ntws, nrtws, nPub, rtwHist, utwHist, urtwHist, npubHist, _, _, rtStatus, repliedUser, garrulousUser, rtUser, mUser, mHT, mURL, mPub, _, uMap, ftime, ltime) = do
   putStrLn $ ">> Number of tweeters: " ++ show n
   putStrLn $ ">> Number of tweeters that *only* retweeted: " ++ show nrtonly
   putStrLn $ ">> First tweet: " ++ show ftime
   forM_ utwHist $ \(k,v) -> when (v > 0) (putStrLn $ "Num tweets = " ++ show k ++ " " ++ show v ++ " times")
   putStrLn "*** Histogram of number of retweets by a user"
   forM_ urtwHist $ \(k,v) -> when (v > 0) (putStrLn $ "Num tweets = " ++ show k ++ " " ++ show v ++ " times")
+  putStrLn "*** Histogram of number of publishers by a user"
+  forM_ npubHist $ \(k,v) -> when (v > 0) (putStrLn $ "Num publishers = " ++ show k ++ " " ++ show v ++ " times")
   putStrLn "*** Most mentioned URL"
   forM_ (fromCL mURL) $ \(Counter u cnt) -> putStrLn $ show cnt ++ " <" ++ show u ++ ">"
   putStrLn "*** Most used publisher"
     QueryResponse
     -> IO ()
 displayAsJSON ( n, nrtonly, ntws, nrtws, nWithPub
-              , rtwHist, utwHist, urtwHist, rtwDelta10MinHist, rtwDelta6HourHist, rtStatus, repliedUser, garrulousUser, rtUser, mUser, mHT, mURL, mPub, friendCount, uMap, fTime, lTime) = do
+              , rtwHist, utwHist, urtwHist, nPubHist
+              , rtwDelta10MinHist, rtwDelta6HourHist, rtStatus, repliedUser, garrulousUser, rtUser, mUser, mHT, mURL, mPub, friendCount, uMap, fTime, lTime) = do
   let fixUser = map (replaceUserId uMap) . fromCL
 
       toF (nfr,nfoll,nt,nrt) = object [ "follow" .= nfr
                    , "retweetHist" .= rtwHist
                    , "userTweetHist" .= utwHist
                    , "userRetweetHist" .= urtwHist
+                   , "numPubHist" .= nPubHist
                    , "retweetDeltaHist10Min" .= rtwDelta10MinHist
                    , "retweetDeltaHist6Hour" .= rtwDelta6HourHist
                    , "uris" .= fromCL mURL