Source

astrosearch / ToRDF.hs

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE RecordWildCards #-}
{-# LANGUAGE ViewPatterns #-}

{-

Usage:

  ./tordf <port> [<n>] <... how to specify sparql endpoint ...>

Convert the database to RDF and add it to an RDF store via SPARQL.

TODO:
  how best to record tw:numFollowers since may want to track
    changes over time

  how best to handle "mangled" data

-}

module Main where

import qualified Data.ByteString.Char8 as B
import qualified Data.Text as T
import qualified Data.Text.IO as T
import qualified Data.Text.Lazy as LT
import qualified Data.Set as S

import Control.Applicative (Applicative(..), (<$>))
import Control.Monad (mplus, join)

import Data.Acid
import Data.Char (isDigit, isHexDigit, chr, digitToInt)
import Data.Either (rights)
import Data.List (foldl')
import Data.Maybe (fromMaybe, catMaybes, mapMaybe, maybeToList)
import Data.Time (UTCTime, readTime)

import Network (PortNumber)
import Network.URI (URI(..), parseURI)

import Swish.RDF.Graph ( RDFArcSet
                       , RDFGraph
                       , NSGraph(..)
                       , RDFLabel(..)
                       , RDFTriple
                       , ToRDFLabel(..)
                       , toRDFTriple
                       )

-- From a simple test, it appears that the output graph type (NTriples
-- vs Turtle) does not change the run time.
import Swish.RDF.Formatter.Turtle (formatGraphAsText)
-- import Swish.RDF.Formatter.NTriples (formatGraphAsText)
import Swish.RDF.Parser.Turtle (parseTurtlefromText)
import Swish.RDF.Vocabulary.DublinCore
import Swish.RDF.Vocabulary.FOAF
import Swish.RDF.Vocabulary.RDF
import Swish.RDF.Vocabulary.SIOC

import System.Environment
import System.Exit (exitFailure)
import System.Locale (defaultTimeLocale)

import Web.Twitter.Types

import AcidState
import RDFUtils (blogPost, illustrate, fromStreaming, isRetweet, numFollowers, numFriends, twLangCode)

-- Extract just the tweets or re-tweets from the streaming API
-- and encode them in a type that reflects this. For now I am
-- going to assume that we have no delete messages since I have
-- not seen one; once a delete is seen then this will have to change.

-- | A message taken from the streaming API that we care about:
-- either a plain status ('TDS') or a re-tweet ('TDR'). Delete
-- messages are not represented here (none have been observed yet).
data TweetData = TDS Status | TDR RetweetedStatus

-- | Keep only status and re-tweet messages from the stream;
-- any other streaming event is discarded.
toTweetData :: StreamingAPI -> Maybe TweetData
toTweetData sapi = case sapi of
  SStatus s          -> Just (TDS s)
  SRetweetedStatus r -> Just (TDR r)
  _                  -> Nothing

-- | Interpret a string of digit characters as a non-negative number
-- in base @b@. Every character must be a digit that 'digitToInt'
-- accepts (decimal or hexadecimal); no sign or prefix handling.
strToInt :: Int -> String -> Int
strToInt b str = go 0 str
  where
    -- Strict accumulator: same evaluation behaviour as foldl'.
    go acc []     = acc
    go acc (d:ds) = let acc' = acc * b + digitToInt d
                    in acc' `seq` go acc' ds

{-
Replace \ddd or \uxxxx with the actual character in the input.

We do not look for \U00XXXXXX.

The input string is assumed to be short
-}
-- | Replace @\\uXXXX@ (exactly four hex digits) and @\\ddd@ (a run of
-- decimal digits) escape sequences with the character they name.
-- Any other backslash sequence is passed through unchanged.
-- \U00XXXXXX escapes are deliberately not handled (see comment above).
uDecode :: T.Text -> T.Text
uDecode = go1
  where
    -- Copy characters through until a backslash is seen, then hand
    -- the remainder to go2 to interpret the escape.
    go1 i = case T.uncons i of
      Nothing -> T.empty
      Just (c, is) -> if c == '\\' then go2 is else T.cons c (go1 is)

    -- We have just consumed a backslash; decide what follows it.
    go2 i = case T.uncons i of
      -- A trailing backslash at end of input is kept literally.
      Nothing -> T.singleton '\\'
      -- \uXXXX: only decode when all four characters are hex digits;
      -- otherwise re-emit "\u" and keep scanning from just after 'u'.
      Just (c, is) | c == 'u' -> let (n4, rs) = T.splitAt 4 is
                                     s = T.unpack n4
                                     ch = strToInt 16 s
                                 in if length s == 4 && all isHexDigit s
                                    then T.cons (chr ch) (go1 rs)
                                    else T.append (T.pack "\\u") (go1 is)
                   -- \ddd: consume the whole digit run as one decimal
                   -- code point. NOTE(review): 'chr' errors on values
                   -- above the Unicode maximum — assumed not to occur
                   -- in this data; confirm if inputs change.
                   | isDigit c -> let (l, r) = T.span isDigit is
                                      lc = strToInt 10 $ T.unpack $ T.cons c l
                                  in T.cons (chr lc) (go1 r)
                   -- Unknown escape: keep the backslash and character.
                   | otherwise -> T.append (T.pack ('\\' : [c])) (go1 is)

{-
Not bothered about efficiency at this moment.
-}
    
-- | Substitution pairs applied, in list order, by 'replaceEntities':
-- HTML entities are decoded, tabs/newlines become spaces, and "smart"
-- double quotes are normalised to plain ones.
--
-- NOTE: the order of this list is significant — 'replaceEntities'
-- folds over it left to right, so earlier replacements can produce
-- text that later pairs then rewrite (e.g. "&amp;lt;" decodes in
-- two steps to "<").
entities :: [(T.Text, T.Text)]
entities = [ ("&quot;", "\"")
           , ("&amp;", "&")
           , ("&lt;", "<")
           , ("&gt;", ">")
           , ("\t", " ")
           , ("\n", " ")
             -- TODO: do we really want to clean these quote characters?
           , (T.singleton '\8220', "\"") -- it might be nice to convert the other way round but would need to do more parsing 
           , (T.singleton '\8221', "\"")
           ]
           
-- | Apply every substitution pair from 'entities' to the input text,
-- left to right. Later pairs see the output of earlier ones, so the
-- order of 'entities' matters.
replaceEntities :: T.Text -> T.Text
replaceEntities txt0 = foldl' applyPair txt0 entities
  where
    -- Rewrite every occurrence of the first element with the second.
    applyPair acc (needle, replacement) = T.replace needle replacement acc

-- | Unwrap a 'Just', aborting with an error message containing the
-- supplied label when handed 'Nothing'. Deliberately partial: used
-- where a missing value indicates unusable input data.
getJust :: String -> Maybe a -> a
getJust _   (Just x) = x
getJust lbl Nothing  = error ("getJust sent Nothing: " ++ lbl)
  
{-
At least one URI has "fancy" unicode in it; in this case nbsp characters
presumably to avoid harvesting; in this case we just treat it as a
missing URI.

tordf: getJust sent Nothing: parseURI sent 'http://
                                                   www.gabrielfcoramos.m
                                                                        ex.tl/'

-}

-- | Parse an absolute URI, aborting (via 'getJust') with a message
-- quoting the offending string when it is not a valid URI.
-- See the comment above for an example of input that fails here.
pURI :: String -> URI
pURI u = getJust msg (parseURI u)
  where
    msg = "parseURI sent '" ++ u ++ "'"

-- | The canonical URI for a single tweet, as an RDF label.
mkTweetURI :: 
  T.Text          -- ^ user name
  -> StatusId     -- ^ id of message
  -> RDFLabel
mkTweetURI user tid = toRDFLabel (pURI uriStr)
  where
    uriStr = "http://twitter.com/" ++ T.unpack user ++ "/status/" ++ show tid
  
-- | The canonical URI for a Twitter user page
-- (http://twitter.com/<name>), as an RDF label.
mkTweeterURI :: T.Text -> RDFLabel
mkTweeterURI name = toRDFLabel (pURI ("http://twitter.com/" ++ T.unpack name))

{-
Extract the source URI and label from a
Tweet source element.

Expect 

 '<a href="..." ...>text</a>'

We perform simple substitution on the text to replace
\uxxxx with the actual code point. This is actually fairly
pointless as immediately converted back to the same encoding
scheme as writing out as Turtle, but we leave this step in
(in part to avoid the \ characters from being protected
when written out as Turtle).
-}
-- | Pull the href URI and the link text out of a tweet's source
-- element (expected shape: '<a href="..." ...>text</a>'); 'Nothing'
-- when the input does not start with the anchor prefix.
sourceToLabels :: T.Text -> Maybe (URI, T.Text)
sourceToLabels (T.stripPrefix "<a href=\"" -> Just rmd) = 
  let (uri, rmd2) = T.breakOn "\"" rmd       -- uri = text up to closing quote
      rmd3 = T.dropWhile (/='>') rmd2        -- skip remaining attributes
      rmd4 = T.breakOn "</a>" rmd3           -- fst is ">text"
      -- T.tail drops the leading '>'. NOTE(review): this is partial —
      -- if the element has no '>' at all, rmd3 is empty and T.tail
      -- errors; assumed not to occur for Twitter-supplied sources.
  in Just ((pURI . T.unpack) uri, (uDecode . T.tail . fst) rmd4)
sourceToLabels _ = Nothing

-- | Build an RDF triple; just a shorter local name for 'toRDFTriple'.
triple :: (ToRDFLabel s, ToRDFLabel p, ToRDFLabel o) => s -> p -> o -> RDFTriple
triple s p o = toRDFTriple s p o

{-
Example tweet:

Status {statusCreatedAt = "Tue Jan 10 19:51:38 +0000 2012", statusId = 156825546397982720, statusText = "@Adrain_CC #aas219 \937\937 \50883\50883\1148\1148 \50883\50883 \26085\20985\22238\21475\30000 \937\937\1134\1134 \21482\21482 \49660\49660\937\937\12676\9742\9743\9834 \1120\1120\30000\8745__\8745\8745__\8745\8594_:\9678)\8801(:\12539\12408\12539)(\12539\12408\12539)\30000\20985\22238 (\8231\8231)(\8231\8231)(\8231\8231) \1134\1134 \9639\9638\20171\20171 \167\167\167 \8362\1578\1578 \49660\49660\49660 \22239\22239 \50724\50724\50724 \12676\12676\12676 \22239\30446 \26085 \1578\1578 \20171\9834\9834", statusSource = "<a href=\"http://levelupstudio.com\" rel=\"nofollow\">Plume\160\160</a>", statusTruncated = False, statusEntities = Entities {enHashTags = [HashTagEntity "aas219"], enUserMentions = [UserEntity (User {userId = 31473939, userName = " Adrain B", userScreenName = "Adrain_CC", userDescription = Nothing, userLocation = Nothing, userProfileImageURL = Nothing, userURL = Nothing, userProtected = Nothing, userFollowers = Nothing, userFriends = Nothing, userTweets = Nothing, userLangCode = Nothing, userCreatedAt = Nothing})], enURLs = []}, statusInReplyTo = Nothing, statusInReplyToUser = Just 31473939, statusFavorite = Just False, statusRetweetCount = Just 0, statusUser = User {userId = 188300993, userName = "Man\243n Farkas", userScreenName = "manonfarkas", userDescription = Just "Psicoanalista. Me place pintar telas,  pintar palabras y por sobre todo  intelecto.", userLocation = Just "Argentina        Chile", userProfileImageURL = Just "http://a3.twimg.com/profile_images/1706267932/1324095489788_normal.jpg", userURL = Nothing, userProtected = Just False, userFollowers = Just 2165, userFriends = Just 2378, userTweets = Just 4523, userLangCode = Just "es", userCreatedAt = Just "Wed Sep 08 12:08:46 +0000 2010"}}

-}

-- have decided to force lower case on hash-tags
-- | The text of a hash-tag entity, forced to lower case so that tags
-- differing only in case map to the same resource.
fromHTE :: Entity HashTagEntity -> T.Text
fromHTE (Entity hte _) = T.toLower (hashTagText hte)

-- | The 'User' wrapped inside a user-mention entity.
fromUE :: Entity UserEntity -> User
fromUE (Entity ue _) = case ue of
  UserEntity u -> u

-- Tue Jan 10 19:51:38 +0000 2012
-- | Parse a Twitter date string (e.g. "Tue Jan 10 19:51:38 +0000 2012");
-- 'drop 4' skips the leading day name so the format string need not
-- match it. NOTE(review): 'readTime' is partial — a malformed date
-- aborts the program — and is deprecated in newer time libraries;
-- kept as-is to preserve behaviour.
getTime :: DateString -> UTCTime
getTime = readTime defaultTimeLocale "%b %d %T %z %Y" . drop 4 

-- | Treat the empty text as a missing value.
isNotEmpty :: T.Text -> Maybe T.Text
isNotEmpty t
  | T.null t  = Nothing
  | otherwise = Just t

-- | Convert a Twitter user into (the user's RDF label, the statements
-- describing the user). Mandatory arcs cover name, screen name,
-- lower-cased label, id and type; optional arcs (avatar, home page,
-- description, follower/friend counts, language) are emitted only
-- when the corresponding field is present.
userToRDF :: User -> (RDFLabel, [RDFTriple])
userToRDF User {..} =
  let uR = mkTweeterURI userScreenName
      
      -- All arcs below share uR as their subject.
      t :: (ToRDFLabel p, ToRDFLabel o) => p -> o -> RDFTriple
      t = triple uR
      
      logoArc = t siocavatar <$> fmap (pURI . B.unpack) userProfileImageURL
      -- homeArc = t foafhomepage <$> fmap pURI userURL
      -- have at least one user with unicode characters in their home page description which
      -- does not parse cleanly; in that case we treat it as no home page for simplicity
      -- and because - presumably - the user does not want their home page URI
      -- harvested. Similarly, exclude empty descriptions
      homeArc = t foafhomepage <$> join (fmap (parseURI . B.unpack) userURL)
      statusArc = t foafstatus <$> join (fmap (isNotEmpty . uDecode) userDescription)
      followerArc = t numFollowers <$> userFollowers
      friendArc = t numFriends <$> userFriends
      langArc = t twLangCode <$> userLangCode

  in (uR, 
      [ t foafname $ uDecode userName
      , t siocname userScreenName
      , t rdfsLabel (T.toLower userScreenName)
      , t siocid userId
      , t rdfType siocUserAccount
      ] ++ catMaybes [logoArc, homeArc, statusArc, followerArc, friendArc, langArc])

{-
Create statements for URLs referenced by the tweet.

Have decided to avoid adding a lot of owl:sameAs statements
and just add a single link depending on whether ueExpanded is
different to ueURL and is valid.

Also trying to add a text label to describe the URL given the
"expanded" element in the URLEntity but it's complicated (if you
try and be clever), so just add in whatever.

-}
-- | Statements connecting a tweet to a URL it references, together
-- with a human-readable label for that URL (taken from the entity's
-- display text). Yields no statements when neither the URL nor its
-- expanded form parses (see 'getLink').
linkTos :: RDFLabel -> Entity URLEntity -> [RDFTriple]
linkTos twLbl (Entity ue _) =
  maybe [] mkArcs (getLink (ueURL ue) (ueExpanded ue))
  where
    mkArcs u = [ triple twLbl sioclinks_to u
               , triple u rdfsLabel (ueDisplay ue)
               ]

-- | Parse a URI string into an RDF label; 'Nothing' when it is not
-- a valid URI.
uriToLabel :: URIString -> Maybe RDFLabel
uriToLabel ustr = fmap toRDFLabel (parseURI (B.unpack ustr))

-- | Choose a single RDF label for a URL entity: when the expanded
-- form differs from the raw URL, prefer the expanded form, falling
-- back to the raw URL; when they agree (or the preferred one fails
-- to parse) use the raw URL. 'Nothing' when nothing parses.
getLink :: 
  URIString     -- ueURL
  -> URIString  -- ueExpanded
  -> Maybe RDFLabel
getLink url expanded
  | expanded == url = uriToLabel url
  | otherwise       = uriToLabel expanded `mplus` uriToLabel url

{-
Too lazy to create the graph directly; note that any statements made
in initGraph will be lost when used in createGraph.
-}
-- | Turtle text declaring the namespace prefixes used by the output
-- graph. Parsed once by 'startGraph'; see the comment above — any
-- statements added here would be discarded by 'createGraph', which
-- replaces the statement set wholesale.
initGraph :: LT.Text
initGraph = 
  LT.concat
  [ "@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> . "
  , "@prefix xsd: <http://www.w3.org/2001/XMLSchema#> . "
  -- , "@prefix owl: <http://www.w3.org/2002/07/owl#> . "
  , "@prefix dcterms: <http://purl.org/dc/terms/> . "
  , "@prefix sioc: <http://rdfs.org/sioc/ns#> ."
  , "@prefix sioc_t: <http://rdfs.org/sioc/types#> ."
  , "@prefix foaf: <http://xmlns.com/foaf/0.1/> ."
  , "@prefix geo: <http://www.w3.org/2003/01/geo/wgs84_pos#> ."
  , "@prefix lode: <http://linkedevents.org/ontology/> ."
  , "@prefix tw: <http://purl.org/net/djburke/demo/twitter#> ."  
  ]
  
-- | The (statement-free) graph carrying the namespace prefixes from
-- 'initGraph'. A parse failure here is a programmer error in
-- 'initGraph' and aborts the program.
startGraph :: RDFGraph
startGraph = case parseTurtlefromText initGraph of
  Left emsg -> error emsg
  Right gr  -> gr

{-
The id of the re-tweet (ie the original tweet that this
tweet is re-tweeting).
-}
-- | The id of the original tweet that a re-tweet re-tweets;
-- 'Nothing' for a plain status.
getRetweetId :: TweetData -> Maybe Integer
getRetweetId td = case td of
  TDR rs -> Just (statusId (rsRetweetedStatus rs))
  TDS _  -> Nothing

{-
If the "original" tweet (ie the one that was being retweeted)
is a member of the input set, extract its details.
-}
-- | If this is a re-tweet whose original tweet's id is in the given
-- set (of ids we have no status for), return that original tweet as
-- a plain 'TDS' so it can be added to the store; otherwise 'Nothing'.
getMissingTweet :: S.Set Integer -> TweetData -> Maybe TweetData
getMissingTweet ts (TDR rs)
  | statusId original `S.member` ts = Just (TDS original)
  | otherwise                       = Nothing
  where
    original = rsRetweetedStatus rs
getMissingTweet _ _ = Nothing
    
{-|
Create a graph based on the tweets.

For re-tweets, the original tweet is added to the store if not
known about; this has been seen previously for a few tweets that
were probably not missed but instead just weren't being considered
as "interesting" by the Twitter search machinery, and so were never
included in the search results.
-}
createGraph :: 
  [TweetData] -- ^ data to add
  -> RDFGraph -- ^ the graph
createGraph tws = 
    -- First pass: statements and ids for every tweet/re-tweet seen.
    let (stmts, tweetIds) = foldl' tweetToRDF (S.empty,S.empty) tws

        -- Ids of re-tweeted originals that never appeared as tweets
        -- in their own right.
        retweetIds = S.fromList $ mapMaybe getRetweetId tws      
        missingIds = S.difference retweetIds tweetIds
        missingTweets = mapMaybe (getMissingTweet missingIds) tws

        -- Second pass: fold those recovered originals into the
        -- statement set; the id set is no longer needed.
        (nstmts, _) = foldl' tweetToRDF (stmts, tweetIds) missingTweets

    -- Keep the prefixes of startGraph, replacing its statements.
    in startGraph { statements = nstmts }
  
-- | Run an extractor over a tweet's entities (when present) and
-- convert each extracted item; a missing 'Entities' record yields
-- the empty list.
processEntities :: 
    (Entities -> [a])
    -> (a -> b)
    -> Maybe Entities
    -> [b]
processEntities extract convert ment =
  case ment of
    Nothing -> []
    Just es -> map convert (extract es)

-- | Convert a tweet or retweet into a bunch of statements and
-- updates the set of known ids. It does NOT check that the
-- original tweet of a re-tweet is known about since we can not
-- guarantee the order of processing, and have decided to split
-- out this processing into a separate step (although it could be
-- done here).
--
-- Since we have no blank nodes, we re-add tweets that are already
-- known about, in case there is any difference in information content.
-- This may be a bad idea (e.g. if differences in the status text or
-- fields like the number of followers, as seen in previous versions).
--
tweetToRDF :: 
    (RDFArcSet, S.Set Integer) 
    -- ^ Existing statements and the ids of the tweets/retweets
    -> TweetData -- ^ tweet to add
    -> (RDFArcSet, S.Set Integer)
    -- ^ Output statements and map of seen tweets.
-- Plain status: arcs for the author, the publishing client, hash
-- tags, mentions, embedded URLs, the reply target (if any) and the
-- tweet's own content/metadata.
tweetToRDF (ostmts,omap) (TDS Status {..}) = 
  let (uR, userArcs) = userToRDF statusUser
      
      usHandle = userScreenName statusUser
      twR = mkTweetURI usHandle statusId
      
      -- All arcs below share the tweet's URI as subject.
      t :: (ToRDFLabel p, ToRDFLabel o) => p -> o -> RDFTriple
      t = triple twR
      
      -- Publisher URI and its label, when the source element parses.
      srcInfo = sourceToLabels . replaceEntities $ statusSource
      srcArcs = catMaybes
                [ t dctpublisher <$> fmap fst srcInfo
                , triple <$> fmap fst srcInfo <*> pure rdfsLabel <*> fmap snd srcInfo ]
      
      -- TODO: what processing do we need to do on the text?
      txt = Lit . uDecode . replaceEntities $ statusText
      
      -- TODO: could add in place information but that would require a
      -- blank node and I don't want to do that just yet.
      
      htArcs = processEntities enHashTags (t illustrate . fromHTE) statusEntities
      mArcs  = processEntities enUserMentions (t dctreferences . mkTweeterURI . userScreenName . fromUE) statusEntities
      uArcs  = maybe [] (concatMap (linkTos twR) . enURLs) statusEntities

      -- Reply arc only when both screen name and status id are known.
      origR = mkTweetURI <$> statusInReplyToScreenName <*> statusInReplyTo
      replyArcs = maybeToList (t siocreply_of <$> origR)
      
      stmts = userArcs ++ srcArcs ++ htArcs ++ mArcs ++ uArcs ++ replyArcs ++
              [ t sioccontent txt
              , t siocid statusId
              , t siochas_creator uR
              , t dctcreated (getTime statusCreatedAt)
              , t rdfType blogPost
              , t fromStreaming True
              ]

  in (foldl' (flip S.insert) ostmts stmts,
      S.insert statusId omap)

-- Re-tweet: as for a plain status, plus an unconditional
-- sioc:reply_of arc to the original tweet and a tw:isRetweet flag.
tweetToRDF (ostmts,omap) (TDR RetweetedStatus {..}) = 
  let (uR, userArcs) = userToRDF rsUser
      
      usHandle = userScreenName rsUser
      twR = mkTweetURI usHandle rsId
      
      -- All arcs below share the re-tweet's URI as subject.
      t :: (ToRDFLabel p, ToRDFLabel o) => p -> o -> RDFTriple
      t = triple twR
      
      -- Publisher URI and its label, when the source element parses.
      srcInfo = sourceToLabels . replaceEntities $ rsSource
      srcArcs = catMaybes
                [ t dctpublisher <$> fmap fst srcInfo
                , triple <$> fmap fst srcInfo <*> pure rdfsLabel <*> fmap snd srcInfo ]
      
      -- TODO: what processing do we need to do on the text?
      txt = Lit . uDecode . replaceEntities $ rsText
      
      -- TODO: could add in place information but that would require a
      -- blank node and I don't want to do that just yet.
      
      -- TODO: need map of user id to userlabel to be able to create replyArc
      -- (or a bnode which then has to be cleaned up)
      -- replyArc = t siocaddressed_to <$> fmap ? statusInReplyToUser

      htArcs = processEntities enHashTags (t illustrate . fromHTE) rsEntities
      mArcs  = processEntities enUserMentions (t dctreferences . mkTweeterURI . userScreenName . fromUE) rsEntities
      uArcs  = maybe [] (concatMap (linkTos twR) . enURLs) rsEntities
      
      rt = rsRetweetedStatus
      
      -- in case the original tweet is unknown
      origR = mkTweetURI (userScreenName (statusUser rt)) (statusId rt)
      -- (origArcs, origR) = sToRDF rt
      
      stmts = userArcs ++ srcArcs ++ htArcs ++ mArcs ++ uArcs ++ -- origArcs ++
              [ t sioccontent txt
              , t siocid rsId
              , t siochas_creator uR
              , t dctcreated (getTime rsCreatedAt)
              , t rdfType blogPost 
              , t siocreply_of origR
              , t fromStreaming True
              , t isRetweet True
              ]

  in (foldl' (flip S.insert) ostmts stmts,
      S.insert rsId omap)

{-
TODO:

We probably need to view the existing graph to see if tweets
are already known about (if we assume that there is a possibility
for overlap). This is not so much a problem if the graph contains
no blank nodes.

-}
-- | Convert the tweets to an RDF graph and write it, formatted as
-- Turtle, to standard output.
processTweets :: [TweetData] -> IO ()
processTweets tds = T.putStrLn (formatGraphAsText (createGraph tds))
        
-- | Read tweets from the acid-state store on the given port: the last
-- n events when a count is supplied, otherwise all of them. The store
-- is closed before returning. Events that fail to convert (via
-- toTweet') or are not tweets/re-tweets are silently dropped.
getTweets :: PortNumber -> Maybe Int -> IO [TweetData]
getTweets p mn = do
  acid <- openStore p
  twts <- case mn of
            Just n -> query acid (GetTweetEvents n)
            _ -> query acid GetAllTweetEvents
  closeAcidState acid
  return $ (mapMaybe toTweetData . rights . map toTweet') twts

-- | Print the command-line usage string and terminate with a
-- failure exit code.
usage :: IO ()
usage =
  getProgName >>= \prog ->
    putStrLn ("Usage: " ++ prog ++ " <astrosearch port num> [<last n tweets>]")
      >> exitFailure

-- | Entry point: expects a port number and an optional tweet count
-- (see 'usage'). Any argument that fails to parse falls through to
-- 'usage', which exits with failure.
-- NOTE(review): 'getPort' and 'maybeRead' are not defined in this
-- file — presumably imported from AcidState; confirm.
main :: IO ()
main = do
  args <- getArgs
  case args of
    (portStr:xs) -> case getPort portStr of
                      Just port -> case xs of
                                     -- no count: process every stored tweet
                                     [] -> getTweets port Nothing >>= processTweets
                                     -- a count: process only the last n
                                     [x] -> case maybeRead x of
                                              mntwts@(Just _) -> getTweets port mntwts >>= processTweets
                                              _ -> usage
                                     _ -> usage
                                     
                      _ -> usage
    _ -> usage