Commits

Doug Burke committed 3942774

TweetsToRDF: Minor clean up

  • Participants
  • Parent commits 97a68a7

Comments (0)

Files changed (1)

 repeats. This may lead to faster execution but is not ideal;
 switching to addArc should help fix this (still relies on our
 creation of valid/distinct bnodes), but will be slower (2.5 s vs 6 s).
+The above is less valid as of Swish 0.8.0.0, since RDF graphs are
+now stored as sets rather than lists, but the code is left as is since
+a specialized search is likely to be significantly quicker than
+the general RDF search facilities provided by Swish.
 
 Should we add rdfs:label of the lower-case version of the twitter's
 screen name now that we have some access to the full name of the user?
 import qualified Data.Text.Lazy as LT
 
 import qualified Data.Vector as V
--- import qualified Data.Map as M
 import qualified Data.HashMap.Strict as HM
--- import qualified Data.Set as S
+import qualified Data.Set as S
 
 -- import Data.Monoid (mappend)
 
       ms = (map (\EUM {..} -> UI eumId (Just eumName) (Just eumGivenName) Nothing) . eMentions) <$> twEntities
       users = u1 : catMaybes [mu2] ++ fromMaybe [] ms
                                    
-  -- in foldl' (\m ui -> M.insertWith' combineUserInfo (uiId ui) ui m) uMap users
   in foldl' (\m ui -> HM.insertWith combineUserInfo (uiId ui) ui m) uMap users
 
 {-
   case fromJSON v of
     Error emsg -> liftIO $ putStrLn emsg
     Success tw@(Tweet {..}) -> 
-      -- let nTweetMap = M.insertWith' mergeTweets twId tw tweetMap
       let nTweetMap = HM.insertWith mergeTweets twId tw tweetMap
           nUserMap  = addUsersFromTweet userMap tw
       in put (nTweetMap, nUserMap)
        
      , twSource     :: Maybe T.Text -- source
        
-     -- , twMeta       :: Maybe (M.Map T.Text T.Text) -- metadata
      , twMeta       :: Maybe (HM.HashMap T.Text T.Text) -- metadata
      , twPlace      :: Maybe Place -- place
      
   | id1 == id2  = UI id1 (un1 <|> un2) (sn1 <|> sn2) (lc1 <|> lc2)
   | otherwise   = error $ "User Ids do not match: " ++ show id1 ++ " vs " ++ show id2
 
-{-
-type TwMap = M.Map TweetId Tweet -- map from tweet id to tweet
-type TwUser = M.Map UserId UserInfo -- map from user id to user info
-
-type Sources      = M.Map T.Text RDFLabel -- map from source (as returned by Twitter) to a RDFLabel for the source
-type UserLabels   = M.Map UserId RDFLabel -- map from user id to the label for that user, which can be blank
-type UnknownUsers = M.Map UserId RDFLabel -- map from user id to blank node DEPTRECATES
--}
-
 type TwMap = HM.HashMap TweetId Tweet -- map from tweet id to tweet
 type TwUser = HM.HashMap UserId UserInfo -- map from user id to user info
 
 type Sources      = HM.HashMap T.Text RDFLabel -- map from source (as returned by Twitter) to a RDFLabel for the source
 type UserLabels   = HM.HashMap UserId RDFLabel -- map from user id to the label for that user, which can be blank
-type UnknownUsers = HM.HashMap UserId RDFLabel -- map from user id to blank node DEPTRECATES
 
 type MapStateIO = StateT (TwMap, TwUser) IO
 
 handleUserMentions :: UserLabels -> RDFLabel -> EntityUserMentions -> [RDFTriple]
 handleUserMentions ulabels twLbl EUM {..} = 
   catMaybes [triple twLbl dctreferences <$> HM.lookup eumId ulabels]
-  -- catMaybes [triple twLbl dctreferences <$> M.lookup eumId ulabels]
 
 {-
 Create statements for URLs referenced by the tweet.
   -> ([RDFTriple], Int)
 tweetToRDF srcs ulabels ctr tw@(Tweet {..}) = 
   let twR = mkTweetURI twName twId
-      -- uR  = fromMaybe (error ("UNEXPECTED: unable to find user for " ++ show tw ++ " in " ++ show ulabels)) $ M.lookup twUserId ulabels 
       uR  = fromMaybe (error ("UNEXPECTED: unable to find user for " ++ show tw ++ " in " ++ show ulabels)) $ HM.lookup twUserId ulabels 
-      
+
+      -- TODO: check whether we have any twISO fields,
+      --       although, as noted at the start, the inclusion of this
+      --       information is problematic.
       -- txt = Lit ((uDecode . replaceEntities) twMessage) (fmap langName twISO)
       txt = Lit $ (uDecode . replaceEntities) twMessage
       
       srcArc = t dctpublisher <$> mLookup twSource srcs
       
       replyLabel = case twReplyTo of
-        -- Just ruid -> M.lookup ruid ulabels 
         Just ruid -> HM.lookup ruid ulabels 
         _ -> Nothing
       replyArc = t siocaddressed_to <$> replyLabel
   case twSource of
     Nothing -> old
     Just k  -> 
-      -- if M.member k osrcs
       if isJust (HM.lookup k osrcs)
       then old
       else case sourceToLabels (replaceEntities k) of
         Just (uri, lbl) -> 
           let sNode = toRDFLabel uri
-          -- in (triple sNode rdfsLabel lbl : ts, M.insert k sNode osrcs)
           in (triple sNode rdfsLabel lbl : ts, HM.insert k sNode osrcs)
         _ -> error $ "Unable to parse source: " ++ T.unpack k
 
   -> UserInfo
   -> ([RDFTriple], UserLabels, Int)
 processUser orig@(ots, usmap, bctr) UI {..} = 
-  -- case M.lookup uiId usmap of
   case HM.lookup uiId usmap of
     Just (Blank _) -> case uiName of
       Just (UN x) -> error ("blank node found for user: " ++ T.unpack x)
                  ++ catMaybes [ t foafname <$> uiGivenName 
                               , t twLangCode <$> uiLangCode ]
            
-        -- in (ts ++ ots, M.insertWith' const uiId userR usmap, bctr)
         in (ts ++ ots, HM.insertWith const uiId userR usmap, bctr)
           
       Nothing -> 
             ts = [ t rdfType siocUserAccount 
                  , t siocid uiId ]
           
-        -- in (ts ++ ots, M.insertWith' const uiId bNode usmap, nctr)
         in (ts ++ ots, HM.insertWith const uiId bNode usmap, nctr)
          
 processFile :: Int -> MapStateIO ()
   --    the source of tweets
   --    tweets
   --
-  -- (tweetMap, userMap) <- execStateT (processFile 1) (M.empty, M.empty)
   (tweetMap, userMap) <- execStateT (processFile 1) (HM.empty, HM.empty)
-  let -- tweets = M.elems tweetMap
-      -- (userTriples, userLabels, userCtr) = foldl' processUser ([], M.empty, 1) $ M.elems userMap
-      -- (sourceTriples, sourceMap) = foldl' processSource ([], M.empty) tweets
-      tweets = HM.elems tweetMap
+  let tweets = HM.elems tweetMap
       (userTriples, userLabels, userCtr) = foldl' processUser ([], HM.empty, 1) $ HM.elems userMap
       (sourceTriples, sourceMap) = foldl' processSource ([], HM.empty) tweets
       (tweetTriples, _) = foldl' (processTweet sourceMap userLabels) ([], userCtr) tweets
 
-      -- NOTE: the setArcs version does not ignore repeated triples
-      -- twGr = setArcs (sourceTriples ++ userTriples ++ replyToTriples ++ tweetTriples) startGraph
-      --
-      twGr = foldl' (flip addArc) startGraph (sourceTriples ++ userTriples ++ tweetTriples)
+      -- QUS: what happened to replyToTriples?
+      twGr = setArcs startGraph $ 
+             S.fromList $ sourceTriples ++ userTriples ++ tweetTriples
   
   fh <- openFile outFile WriteMode
   T.hPutStrLn fh $ formatGraphAsText twGr