Commits

Doug Burke  committed 6a46392

Use a phantom type to ensure lists are sorted (a bit OTT here)

  • Participants
  • Parent commits 1982d25

Comments (0)

Files changed (2)

 {-# LANGUAGE DeriveDataTypeable #-}
+{-# LANGUAGE EmptyDataDecls #-}
 {-# LANGUAGE FlexibleInstances #-}
 {-# LANGUAGE OverloadedStrings #-}
 
 	 , FollowerInfo(..)
 	 , Counter(..)
 
+	 , CounterList, SortedList
+         , toCL, fromCL, toSortedCL, takeCL, mapMCL
+
          , to1, to2, to3, to4, toCounter
          , makeQuery
          , makeQuery'
 import qualified Network.URI as N
 
 import Control.Applicative ((<$>), (<*>))
+import Control.Monad (liftM)
 
-import Data.List (foldl')
-import Data.Maybe (fromMaybe, mapMaybe)
+import Data.List (foldl', sort)
+import Data.Maybe (catMaybes, fromMaybe, mapMaybe)
 import Data.Time (UTCTime(..), ParseTime, getCurrentTime, parseTime)
 import Data.Typeable (Typeable)
 import Database.HaSparqlClient (Service(Sparql), Query, NamedGraph, BindingValue(..), Method(HGET), runSelectQuery)
     -> Maybe (Counter a)
 toCounter = (uncurry Counter `fmap`) . to2
 
+-- This is more because I want to try out phantom types than
+-- because it really improves the code.
+
+data SortedList
+
+newtype CounterList a b = CounterList { unCL :: [Counter a] }
+
+toCL :: [Counter a] -> CounterList a b
+toCL = CounterList
+
+fromCL :: CounterList a b -> [Counter a]
+fromCL = unCL
+
+toSortedCL :: (Ord a) => [Counter a] -> CounterList a SortedList
+toSortedCL = CounterList . reverse . sort
+
+takeCL :: Int -> CounterList a b -> CounterList a b
+takeCL n = CounterList . take n . unCL
+
+-- | A specialised version of `mapM` where the conversion function can
+--   drop elements.
+mapMCL :: Monad m => (a -> m (Maybe b)) -> CounterList a c -> m (CounterList b c)
+mapMCL f xs = do
+  let g (Counter a n) = fmap (`Counter` n) `liftM` f a
+  ys <- mapM g (unCL xs)
+  return $ toCL (catMaybes ys)
+
 {-|
 Run the query against the given endpoint, collecting
 up the results, applying the necessary transform.

File SimpleStats.hs

 import qualified Network.URI as N
 import qualified Statistics.Sample.Histogram as H
 
-import Control.Applicative ((<$>))
 import Control.Exception (bracket)
-import Control.Monad (forM_, when, void)
+import Control.Monad (forM_, void, when)
 
 import Data.Acid hiding (Query)
 import Data.Aeson
-import Data.List (foldl', sort)
-import Data.Maybe (fromMaybe, fromJust, mapMaybe, catMaybes)
+import Data.List (foldl')
+import Data.Maybe (fromMaybe, fromJust)
 import Data.Time (UTCTime, NominalDiffTime, diffUTCTime)
 import Database.HaSparqlClient (Query, NamedGraph, BindingValue(..))
 
 import SPARQL ( FromBinding(..), UserId, BasicUserInfo, BasicTweetCount(..)
               , FollowerInfo(..)
        	      , Counter(..)
+	      , CounterList, SortedList, toSortedCL, takeCL, fromCL, mapMCL
               , makeQuery, queryStore, fromStores
               , to1, to2, toCounter
               , getUserName, getUserHandle
   
 type Hist = [(Int, Int)]
 
+type Sorted a = CounterList a SortedList
+type SortedUsers = Sorted UserId
+
 type QueryResponse =
     ( Int  -- ^ number of accounts that tweeted
     , Int  -- ^ number of accounts that *only* retweeted
     , Hist -- ^ user retweet histogram
     , V.Vector KDE  -- ^ KDE for the time-to-retweet values (10 minutes)
     , V.Vector KDE  -- ^ KDE for the time-to-retweet values (6 hour)
-    , [Counter Object]   -- ^ JSON from Twitter containing the contents of the
-                         --   most-tweeted statuses
-    , [Counter UserId]   -- ^ the accounts that were replied to the most (excl RTs)
-    , [Counter UserId]   -- ^ the accounts that made the most replies (excl RTs)
-    , [Counter UserId]   -- ^ the most re-tweeted accounts
-    , [Counter UserId]   -- ^ the most-mentioned accounts
-    , [Counter String]   -- ^ the most-mentioned hash tags
-    , [Counter URIInfo]  -- ^ the most-mentioned URLs
-    , [Counter T.Text]   -- ^ the most-used publishers
+    , Sorted Object  -- ^ JSON from Twitter containing the contents of the
+                     --   most-tweeted statuses
+    , SortedUsers    -- ^ the accounts that were replied to the most (excl RTs)
+    , SortedUsers    -- ^ the accounts that made the most replies (excl RTs)
+    , SortedUsers    -- ^ the most re-tweeted accounts
+    , SortedUsers    -- ^ the most-mentioned accounts
+    , Sorted String  -- ^ the most-mentioned hash tags
+    , Sorted URIInfo -- ^ the most-mentioned URLs
+    , Sorted T.Text  -- ^ the most-used publishers
     , [(Int, Int, Int, Int)] -- ^ num friends, num followers, total num of tweets, num of retweets
     , M.Map UserId BasicUserInfo -- ^ map from user id to user labels
     , UTCTime -- ^ approximate time of first tweet (or when the search was made)
 
 toFollowerCount _ _ = Nothing
 
-rsort :: (Ord a) => [Counter a] -> [Counter a]
-rsort = reverse . sort
-
 logIt :: String -> IO ()
 logIt = hPutStrLn stderr
 
     in fmap (Just . TE.decodeUtf8 . B.concat . LB.toChunks) (simpleHttp url)
        `CE.catch` (\e -> logException e >> return Nothing)
 
-getTweetInfo :: AcidState SimpleStatsStore -> Counter Integer -> IO (Maybe (Counter T.Text))
-getTweetInfo acid (Counter twId c) = do
+getTweetInfo :: AcidState SimpleStatsStore -> Integer -> IO (Maybe T.Text)
+getTweetInfo acid twId = do
   mtxt <- query acid (GetOEmbed twId)
   case mtxt of
-    Just txt -> return (Just (Counter txt c))
+    Just _ -> return mtxt
     _ -> do
       logIt $ "Looking for oEmbed data for tweet " ++ show twId
       mrsp <- getFromTwitter twId
       case mrsp of 
-        Just rsp -> update acid (AddOEmbed twId rsp) >> return (Just (Counter rsp c))
+        Just rsp -> update acid (AddOEmbed twId rsp) >> return mrsp
         _ -> logIt " -- NONE FOUND!" >> return Nothing
 
 -- | Get the JSON representation from the AcidState store or
 --   Twitter OEmbed service. If we can not get a value then
 --   it is excluded.
-requestTweetInfo :: [Counter Integer] -> IO [Maybe (Counter T.Text)]
+--
+--   The restriction to a `SortedList` is not required by the
+--   routine.
+--
+requestTweetInfo :: CounterList Integer SortedList -> IO (CounterList Object SortedList)
 requestTweetInfo cs = 
     bracket
     (openLocalStateFrom "simplestats" emptySimpleStatsStore)
     closeAcidState
-    (\acid -> mapM (getTweetInfo acid) cs)
+    (\acid -> 
+     let toObj :: Integer -> IO (Maybe Object)
+         toObj ci = do
+           mtxt <- getTweetInfo acid ci
+           case mtxt of
+             Nothing  -> return Nothing
+             Just txt -> return $ decode' (LB.fromChunks [TE.encodeUtf8 txt])
+                      
+     in mapMCL toObj cs
+    )
 
 {-| Convert a set of counts into a histogram; the bin
 size is fixed at 1 and the input array is assumed to be
 
 For now we do nothing
 -}
-processURLs :: [Counter N.URI] -> IO [Counter URIInfo]
+processURLs :: [Counter N.URI] -> IO (Sorted URIInfo)
 processURLs = consolidateUrls . map (fmap show)
 
 {-
       toFC BasicTweetCount {..} FollowerInfo {..} = (fiNumFollows, fiNumFollowers, btcNTweets, btcNRetweets)
       followerCount = M.elems $ M.intersectionWith toFC countMap followerInfo
 
-  let f :: (Ord a) => [Counter a] -> [Counter a]
-      f = take nmax . rsort
+  let f :: (Ord a) => [Counter a] -> CounterList a SortedList
+      f = takeCL nmax . toSortedCL
 
   tws <- requestTweetInfo $ f retweetStatus
 
   urls <- processURLs mentionedURL
 
-  -- Conversion back and forth between text and bytestring is a bit pointless
-  let toObj :: Counter T.Text -> Maybe (Counter Object)
-      toObj (Counter t c) = (`Counter` c) <$> decode' (LB.fromChunks [TE.encodeUtf8 t])
-      ftws = mapMaybe toObj . catMaybes $ tws
-
-      rtwDelta10MinHist = calcHist 600.0 reTweetTimes
+  let rtwDelta10MinHist = calcHist 600.0 reTweetTimes
       rtwDelta6HourHist = calcHist 21600.0 reTweetTimes
 
       -- replace URI by label for publishers; we assume that there is
          , toHistogram userRetweetCount
          , rtwDelta10MinHist
          , rtwDelta6HourHist
-         , ftws, f repliedUser, f garrulousUser, f retweetUser
-         , f mentionedUser, f mentionedHashTag, f urls
+         , tws
+	 , f repliedUser
+	 , f garrulousUser
+	 , f retweetUser
+         , f mentionedUser
+	 , f mentionedHashTag
+         , takeCL nmax urls
          , f pCount
 	 , followerCount
          , uinfo, firstTime, lastTime)
 
 We use the cache first to find URL redirects.
 -}
-consolidateUrls :: [Counter String] -> IO [Counter URIInfo]
+consolidateUrls :: [Counter String] -> IO (Sorted URIInfo)
 consolidateUrls orig = 
     bracket
     (openLocalStateFrom "simplestats" emptySimpleStatsStore)
 doUrls ::
     [Counter String]
     -> AcidState SimpleStatsStore
-    -> IO [Counter URIInfo]
+    -> IO (Sorted  URIInfo)
 doUrls orig acid = do
     let ourls = map _value orig 
     new <- mapM (findRedirect acid) ourls
     let ncnts = zipWith (curry replaceUrl) new orig
         ans = foldl' (\omap Counter {..} -> M.insertWith' (+) _value _count omap) M.empty ncnts
-    return $ rsort $ map (uncurry Counter) $ M.toList ans
+    return $ toSortedCL $ map (uncurry Counter) $ M.toList ans
     
 {-
 Replace a redirected URL. We use a status value of 0 to
   putStrLn $ ">> Last tweet:  " ++ show ltime
   putStrLn $ ">> Number of tweets: " ++ show ntws ++ " Number of retweets: " ++ show nrtws
   putStrLn "*** Most retweeted statuses"
-  forM_ rtStatus $ \(Counter obj cnt) -> putStrLn $ show cnt ++ " status=" ++ show obj
+  forM_ (fromCL rtStatus) $ \(Counter obj cnt) -> putStrLn $ show cnt ++ " status=" ++ show obj
   putStrLn "*** Most replied-to user"
-  forM_ repliedUser $ \(Counter uid cnt) -> putStrLn $ show cnt ++ " user=" ++ show (M.lookup uid uMap)
+  forM_ (fromCL repliedUser) $ \(Counter uid cnt) -> putStrLn $ show cnt ++ " user=" ++ show (M.lookup uid uMap)
   putStrLn "*** Most garrulous user (makes most replies that are not retweets)"
-  forM_ garrulousUser $ \(Counter uid cnt) -> putStrLn $ show cnt ++ " user=" ++ show (M.lookup uid uMap)
+  forM_ (fromCL garrulousUser) $ \(Counter uid cnt) -> putStrLn $ show cnt ++ " user=" ++ show (M.lookup uid uMap)
   putStrLn "*** Most retweeted user"
-  forM_ rtUser $ \(Counter uid cnt) -> putStrLn $ show cnt ++ " user=" ++ show (M.lookup uid uMap)
+  forM_ (fromCL rtUser) $ \(Counter uid cnt) -> putStrLn $ show cnt ++ " user=" ++ show (M.lookup uid uMap)
   putStrLn "*** Most mentioned user"
-  forM_ mUser $ \(Counter uid cnt) -> putStrLn $ show cnt ++ " user=" ++ show (M.lookup uid uMap)
+  forM_ (fromCL mUser) $ \(Counter uid cnt) -> putStrLn $ show cnt ++ " user=" ++ show (M.lookup uid uMap)
   putStrLn "*** Most mentioned hashtag"
-  forM_ mHT $ \(Counter ht cnt) -> putStrLn $ show cnt ++ " #" ++ ht
+  forM_ (fromCL mHT) $ \(Counter ht cnt) -> putStrLn $ show cnt ++ " #" ++ ht
   putStrLn "*** Histogram of number of retweets of a tweet"
   forM_ rtwHist $ \(k,v) -> when (v > 0) (putStrLn $ "Num retweets = " ++ show k ++ " " ++ show v ++ " times")
   putStrLn "*** Histogram of number of tweets"
   putStrLn "*** Histogram of number of retweets by a user"
   forM_ urtwHist $ \(k,v) -> when (v > 0) (putStrLn $ "Num tweets = " ++ show k ++ " " ++ show v ++ " times")
   putStrLn "*** Most mentioned URL"
-  forM_ mURL $ \(Counter u cnt) -> putStrLn $ show cnt ++ " <" ++ show u ++ ">"
+  forM_ (fromCL mURL) $ \(Counter u cnt) -> putStrLn $ show cnt ++ " <" ++ show u ++ ">"
   putStrLn "*** Most used publisher"
-  forM_ mPub $ \(Counter p cnt) -> putStrLn $ show cnt ++ " " ++ T.unpack p
+  forM_ (fromCL mPub) $ \(Counter p cnt) -> putStrLn $ show cnt ++ " " ++ T.unpack p
 
 -- | Replace the id of a user with a human-readable label
 replaceUserId :: M.Map UserId BasicUserInfo -> Counter UserId -> Counter String
     QueryResponse
     -> IO ()
 displayAsJSON (n, nrtonly, ntws, nrtws, rtwHist, utwHist, urtwHist, rtwDelta10MinHist, rtwDelta6HourHist, rtStatus, repliedUser, garrulousUser, rtUser, mUser, mHT, mURL, mPub, friendCount, uMap, fTime, lTime) = do
-  let fixUser = map (replaceUserId uMap)
+  let fixUser = map (replaceUserId uMap) . fromCL
 
       toF (nfr,nfoll,nt,nrt) = object [ "follow" .= nfr
                                       , "followers" .= nfoll
                    , "garrulousUsers" .= fixUser garrulousUser
                    , "retweetedUsers" .= fixUser rtUser
                    , "mentionedUsers" .= fixUser mUser
-                   , "mostRetweeted" .= rtStatus
+                   , "mostRetweeted" .= fromCL rtStatus
                    , "retweetHist" .= rtwHist
                    , "userTweetHist" .= utwHist
                    , "userRetweetHist" .= urtwHist
                    , "retweetDeltaHist10Min" .= rtwDelta10MinHist
                    , "retweetDeltaHist6Hour" .= rtwDelta6HourHist
-                   , "uris" .= mURL
-                   , "hashtag" .= mHT
-                   , "publisher" .= mPub
+                   , "uris" .= fromCL mURL
+                   , "hashtag" .= fromCL mHT
+                   , "publisher" .= fromCL mPub
                    , "followerCount" .= map toF friendCount
                    ]