grabtweets / IdentifyUsers.hs

{-# LANGUAGE OverloadedStrings, ScopedTypeVariables, RecordWildCards #-}

{-

Usage:

  ./identifyusers <endpoint>

Aim:

Try and identify "unknown" users, and then query Twitter to
try and find the missing data.

  - identify referents to a Tweet
  - a general check on users

We use the default store to identify the named graphs which 
contain Tweet data (are of type tw:TweetStore).

Try and "fix up" all the user links in the graph so that 
they match:

  ?u a sioc:UserAccount ;
     sioc:id ?id ;
     sioc:name ?handle ;
     rdfs:label ?lbl ;
     foaf:name ?name .

The Twitter API has a search term which accepts either 

curl 'https://api.twitter.com/1/users/lookup.json?user_id=101775511&screen_name=doug_burke'

to return a list of fields which gives something like:

[{"follow_request_sent":null,"profile_sidebar_border_color":"C0DEED","protected":false,"is_translator":false,"geo_enabled":false,"name":"Douglas Burke","profile_use_background_image":true,"profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","favourites_count":0,"followers_count":101,"profile_image_url":"http:\/\/a1.twimg.com\/profile_images\/609576508\/me_normal.png","location":"Boston-ish","id_str":"101775511","utc_offset":-18000,"profile_text_color":"333333","description":"Astronomer. Apparently not-so reluctant tweeter.\r\n","following":null,"verified":false,"profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","url":"http:\/\/hea-www.harvard.edu\/~dburke\/","default_profile":true,"profile_link_color":"0084B4","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/609576508\/me_normal.png","status":{"contributors":null,"place":null,"id_str":"136126605050388480","favorited":false,"in_reply_to_user_id":null,"possibly_sensitive":false,"geo":null,"retweet_count":0,"in_reply_to_screen_name":null,"coordinates":null,"in_reply_to_status_id":null,"retweeted":false,"in_reply_to_status_id_str":null,"truncated":false,"source":"\u003Ca href=\"http:\/\/twitter.com\" rel=\"nofollow\"\u003ETweetie for Mac\u003C\/a\u003E","created_at":"Mon Nov 14 17:01:26 +0000 2011","id":136126605050388480,"in_reply_to_user_id_str":null,"text":"How to start a week: listening to A doughnut in my hand by Ivor Cutler http:\/\/t.co\/DwWRJjz2"},"listed_count":17,"contributors_enabled":false,"profile_background_color":"C0DEED","screen_name":"doug_burke","show_all_inline_media":false,"statuses_count":1325,"profile_background_tile":false,"created_at":"Mon Jan 04 14:41:44 +0000 2010","profile_sidebar_fill_color":"DDEEF6","id":101775511,"default_profile_image":false,"lang":"en","notifications":null,"time_zone":"Eastern Time (US & Canada)","friends_count":93}]

We could use a lot of this information to enrich all the users, but let's just add the
minimum info to match the RDF given above.

-}

module Main where

import qualified Data.Text as T
import qualified Data.Text.IO as T
import qualified Data.ByteString.Lazy as BL
import qualified Data.Set as S

import Database.HaSparqlClient (Query, NamedGraph, BindingValue(..))

import Network.HTTP.Conduit (simpleHttp)

import Data.Aeson

import Network.URI (parseURI)

import Swish.RDF

import Swish.RDF.Vocabulary (rdfType, rdfsLabel)
import Swish.RDF.Vocabulary.FOAF (foafname)
import Swish.RDF.Vocabulary.SIOC (siocUserAccount, siocid, siocname)

import Swish.RDF.Formatter.Turtle (formatGraphAsText)

import Data.List ((\\), intercalate)
import Data.Maybe (fromJust)

import System.Environment (getArgs, getProgName)
import System.Exit (exitFailure)
import System.IO (stderr, hPutStrLn)

import Control.Applicative ((<$>), (<*>))
import Control.Monad (unless, mzero)

import SPARQL (makeQuery, queryStore, fromStores)
import RDFUtils (triple)  

{-
Find those referents of a Tweet for which we are missing data.

It appears that 4store and StarDog don't support SPARQL 1.1 negation, so
use 1.0 OPTIONAL/!bound check.

I am going to assume that if there is a missing sioc:id
then the other elements are missing too.

-}

queryReferents :: [NamedGraph] -> Query
queryReferents ngs =
  unwords
  [ "prefix sioc: <http://rdfs.org/sioc/ns#>"
  , "prefix sioct: <http://rdfs.org/sioc/types#>"
  , "prefix dcterms: <http://purl.org/dc/terms/>"
  , "prefix foaf: <http://xmlns.com/foaf/0.1/>"
  , "SELECT DISTINCT ?account "
  , fromStores ngs
  , " WHERE { "
  , "  [] a sioct:MicroblogPost ; dcterms:references ?account ."
  , "  OPTIONAL { ?account sioc:id ?id }"
  , "  FILTER (!bound(?id))"
  , "}"
  ]

queryUsers :: [NamedGraph] -> Query
queryUsers ngs =
  unwords
  [ "prefix sioc: <http://rdfs.org/sioc/ns#>"
  , "prefix sioct: <http://rdfs.org/sioc/types#>"
  , "prefix dcterms: <http://purl.org/dc/terms/>"
  , "prefix foaf: <http://xmlns.com/foaf/0.1/>"
  , "SELECT ?user ?id ?handle ?foaf " 
  , fromStores ngs
  , " WHERE { "
  , "  ?user a sioc:UserAccount ."
  , "  OPTIONAL { ?user sioc:id ?id }"
  , "  OPTIONAL { ?user sioc:name ?handle }"
  , "  OPTIONAL { ?user foaf:name ?foaf }"
  , "  FILTER (!bound(?id) || !bound(?handle) || !bound(?foaf))"
  , "}"
  ]
  
{-
Run the query against the given endpoint, collecting
up the results.
-}
queryRef :: String -> [NamedGraph] -> IO [String]
queryRef endpoint = makeQuery getURIFrag endpoint . queryReferents

-- handle from the Twitter URI
-- and do we already know sioc:id, sioc:name, foaf:name?
type UserInfo = (String, Bool, Bool, Bool)

queryUsr :: String -> [NamedGraph] -> IO [UserInfo]
queryUsr endpoint = makeQuery getUI endpoint . queryUsers

twURI :: T.Text
twURI = "http://twitter.com/"

-- extract the "fragment" of the twitter URI
getTWFrag :: T.Text -> Maybe String
getTWFrag u | twURI `T.isPrefixOf` u = Just $ T.unpack $ T.drop (T.length twURI) u
            | otherwise              = Nothing

getURIFrag :: [BindingValue] -> Maybe String
getURIFrag [URI u] = getTWFrag u
getURIFrag _       = Nothing

{-
this seems to be failing
getUI :: [BindingValue] -> Maybe UserInfo
getUI (URI u : idbv : hbv : nbv : []) = do
  hdl <- if twURI `isPrefixOf` u then Just (drop (length twURI) u) else Nothing
  hasId <- isJust `fmap` (fromBinding idbv :: Maybe (Maybe Int))
  hasHdl <- isJust `fmap` (fromBinding hbv :: Maybe (Maybe T.Text))
  hasName <- isJust `fmap` (fromBinding nbv :: Maybe (Maybe T.Text))
  return (hdl, hasId, hasHdl, hasName)
getUI _ = Nothing
-}
           
-- ignoring type conversion issues (e.g. expect Int but sent URI)                 
isBound :: BindingValue -> Bool
isBound Unbound = False
isBound _ = True

getUI :: [BindingValue] -> Maybe UserInfo
getUI (URI u : idbv : hbv : nbv : []) = do
  hdl <- getTWFrag u
  let hasId = isBound idbv
      hasHdl = isBound hbv
      hasName = isBound nbv
  return (hdl, hasId, hasHdl, hasName)
getUI _ = Nothing


data User = 
  User
  {
    usId :: Integer  -- id
  , usFullName :: T.Text -- name
  , usName :: T.Text -- screen_name
  } deriving (Eq, Show)
  
instance FromJSON User where
  parseJSON (Object o) =
    User
    <$> o .: "id"
    <*> o .: "name"
    <*> o .: "screen_name"
  parseJSON _ = mzero

convJSON :: BL.ByteString -> [User]
convJSON inp = case decode inp of
  Just r -> r
  _      -> error "Unable to convert to JSON"
  

{-                 
Given a list of twitter user names/handles, return information
about those users that are known.
-}
getTwitterUsers :: [String] -> IO [User]                 
getTwitterUsers us = do                 
  let qry = "screen_name=" ++ intercalate "," us
      uri = "https://api.twitter.com/1/users/lookup.json?" ++ qry
      
  hPutStrLn stderr $ "Twitter access: " ++ uri
  search <- simpleHttp uri
  return $ convJSON search
      
userToTriples :: User -> [RDFTriple]    
userToTriples User {..} =
  case parseURI ("http://twitter.com/" ++ T.unpack usName) of
    Just uri -> let t :: (ToRDFLabel p, ToRDFLabel o) => p -> o -> RDFTriple
                    t = triple uri
                in [ t rdfType siocUserAccount
                   , t siocid usId
                   , t siocname usName
                   , t rdfsLabel (T.toLower usName)
                   , t foafname usFullName ]
    _ -> []
    

usersToTriples :: [User] -> [RDFTriple]
usersToTriples = concatMap userToTriples
    
identifyUsers :: [String] -> IO [RDFTriple]
identifyUsers [] = return []
identifyUsers us = do
  usInfo <- getTwitterUsers us
  let fus = map (T.unpack . usName) usInfo
      missing = us \\ fus
      
  unless (null missing) $ do
    hPutStrLn stderr "The following users (referents) are unknown:"
    mapM_ (hPutStrLn stderr) missing
    
  return $ usersToTriples usInfo
  
userInfoToTriples :: [UserInfo] -> User -> [RDFTriple]  
userInfoToTriples uis u@(User {..}) = 
  let x = filter (\(a,_,_,_) -> T.toLower (T.pack a) == T.toLower usName) uis 
  in case x of
    [] -> error $ "Twitter returned: " ++ show u ++ "\nwhich is not in:\n" ++ show uis
    
    [(n,mid,mhdl,mname)] ->
      let uri = fromJust $ parseURI ("http://twitter.com/" ++ n)
          t :: (ToRDFLabel p, ToRDFLabel o) => p -> o -> RDFTriple
          t = triple uri
      
          t1 = if mid   then [] else [t siocid usId]
          t2 = if mhdl  then [] else [t siocname usName]
          t3 = if mname then [] else [t foafname usFullName]
          
      in t1 ++ t2 ++ t3
      
    _ -> error $ "Found multiple matches:\n" ++ show x

addUserInfo :: [UserInfo] -> IO [RDFTriple]
addUserInfo [] = return []
addUserInfo us = do
  let usNames = map (\(a,_,_,_) -> a) us
  usInfo <- getTwitterUsers usNames
  let fus = map (T.unpack . usName) usInfo
      missing = usNames \\ fus
  
  unless (null missing) $ do
    hPutStrLn stderr "The following users (general) are unknown:"
    mapM_ (hPutStrLn stderr) missing
    
  return $ concatMap (userInfoToTriples us) usInfo
  
usage :: IO ()
usage = do
  pName <- getProgName
  hPutStrLn stderr $ "Usage: " ++ pName ++ " <endpoint>"
  exitFailure

main :: IO ()
main = do
  args <- getArgs
  case args of
    [endpoint] -> do
      stores <- queryStore endpoint
      us1 <- queryRef endpoint stores
      t1 <- identifyUsers us1
      
      us2 <- queryUsr endpoint stores
      t2 <- addUserInfo us2
      
      T.putStrLn $ formatGraphAsText $ toRDFGraph $ S.fromList $ t1 ++ t2
      
    _ -> usage
  
    
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.