Commits

Doug Burke  committed 2018d40

FindRetweets: added edit distance calculation and improvements to simple version

  • Participants
  • Parent commits 8db5e77

Comments (0)

Files changed (2)

File FindRetweets.hs

 
 Usage:
 
-  ./findretweets <endpoint> [simple]
+  ./findretweets <endpoint> [simple|distance]
 
 Aim:
 
 all matches but is a start (e.g. of the 317 tweets beginning RT and
 not marked as being a retweet in aas219 it finds 117 matches).
 
+The distance version uses the edit distance between tweets to
+identify potential matches.
+
 -}
 
 module Main where
 
 import Control.Applicative ((<$>), (<*>))
 -- import qualified Control.Exception as CE
-import Data.Maybe (fromJust, catMaybes)
 -- import Control.Monad (forM_, unless)
-import Data.Char (isAlphaNum)
-import Data.List (foldl')
+import Data.Ord (comparing)
+import Data.Char (ord, isDigit)
+import Data.List (foldl', sortBy)
+import Data.Maybe (fromJust, fromMaybe, catMaybes, isJust)
 import Data.Time (UTCTime)
 import Network.URI (URI, parseURI)
 -- import Network.HTTP.Conduit (HttpException(..))
 
 import SPARQL (FromBinding(..), makeQuery)
 
+import Text.EditDistance
+
 isRetweet :: URI
 isRetweet = fromJust $ parseURI "http://purl.org/net/djburke/demo/twitter#isRetweet"
 
 Can we identify the original tweet?
 -}
 
+{-
+We assume we have
+
+RT @usern .. @origuser: ...
+
+and want to search for @origuser
+
+or
+
+RT @user [^@]...
+
+Updating to include some other forms we see:
+
+  RT of @...
+  RT from @...
+  RT ~ @...
+  RT "@...
+
+-}
+
+splitUpUsers :: T.Text -> (Maybe T.Text, T.Text)
+splitUpUsers txt0 =
+  let h = T.stripStart $ T.drop 2 txt0
+      txt = foldl' (flip stripLeading) h ["from", "of", "~", "\""]
+      
+      go m t = let (u,r) = grabUser t 
+               in if isJust u then go u r else (m, t)
+    
+      (muser, ans) = go Nothing txt
+      
+  in case T.uncons ans of
+    Just (':', ans') -> (muser, T.stripStart ans')
+    _ -> (muser, ans)
+
+stripLeading :: T.Text -> T.Text -> T.Text
+stripLeading l t = fromMaybe t $ T.stripStart `fmap` T.stripPrefix l t 
+
+{-
+Given @foo ... return Just foo, '...'
+otherwise Nothing, input.
+-}
+grabUser :: T.Text -> (Maybe T.Text, T.Text)
+grabUser t = case T.uncons t of
+  Just ('@', rest) -> let (l, r) = T.span isTwitterChar rest
+                      in if T.null l then (Nothing, t) else (Just l, T.stripStart r)
+  _ -> (Nothing, t)
+
+
 queryOriginalTweet :: URI -> UTCTime -> T.Text -> Query
-queryOriginalTweet uri0 time0 txt0 = 
-  -- we hard code the assumption that the re-tweet has the form
-  --     RT @username: ...
-  -- and we need to protect it for the lucene search
-  --
-  -- Add in support to strip leading/trailing "
-  --
-  let (users, txt1) = T.break (==':') txt0
-      tclean = T.strip $ T.dropWhileEnd (=='.') $ T.dropAround (=='"') $ T.drop 1 txt1
+queryOriginalTweet _ time0 txt0 = 
+  let (muser, txt1) = splitUpUsers txt0
+      tclean = T.strip $ T.dropWhileEnd (=='.') $ T.dropAround (=='"') $ T.strip txt1
       quoted = quoteT True tclean
       
       tstr = show $ toRDFLabel time0
       
-      -- look to split up "RT @foo ... @bar:" to get just @bar
-      -- 
-      muser = case T.breakOnAll "@" users of
-        [] -> Nothing
-        us -> let (_, un) = last us
-                  un' = T.drop 1 $ T.stripEnd un
-              in if T.null un' then Nothing else Just $ T.takeWhile isAlphaNum un' 
-      
       userQuery = case muser of
         Nothing -> ""
-        -- should we make this optional
         Just uname -> " ?tw sioc:has_creator [ rdfs:label \"" ++ T.unpack (T.toLower uname) ++ "\" ] ."
 
       -- characters that need protecting for Lucene searches
       --     + - && || ! ( ) { } [ ] ^ " ~ * ? : \
       -- word = T.replace "'" "\'" $ word0
       -- but it's not clear what causes problems here, so simplify
+      --
+      -- There are at least 2 AAS219 quotes which we should match but do not; it is not
+      -- clear whether quoting helps since attempts to do so lead to the query
+      -- not being parsed by Stardog.
+      --  
       quoted' = foldl' (\s c -> T.replace (T.singleton c) (T.snoc "\\" c) s)
              quoted "'\""
                            
      , "ORDER BY ASC(?time)"
      ]
      
+     
+{-     
+Valid characters for a Twitter user name. Note that we do not
+enforce the character-length limitation (which I believe is 15
+characters).
+-}
+isTwitterChar :: Char -> Bool     
+isTwitterChar c = 
+  let o = ord c
+      ischar = (o >= ord 'a' && o <= ord 'z')
+               ||
+               (o >= ord 'A' && o <= ord 'Z')
+  in ischar || c == '_' || isDigit c
+     
+{-
+These are for those tweets which are not identified by 
+queryOriginalTweet
+-}
+
+queryDistanceTweet :: URI -> UTCTime -> T.Text -> (T.Text, Query)
+queryDistanceTweet _ time0 txt0 = 
+  let (muser, txt1) = splitUpUsers txt0
+      tclean = T.strip $ T.dropWhileEnd (=='.') $ T.dropAround (=='"') $ T.strip txt1
+      tstr = show $ toRDFLabel time0
+      
+      userQuery = case muser of
+        Nothing -> ""
+        Just uname -> " ?tw sioc:has_creator [ rdfs:label \"" ++ T.unpack (T.toLower uname) ++ "\" ] ."
+
+  in (tclean,
+      unwords 
+      [ "prefix sioc: <http://rdfs.org/sioc/ns#>"
+      , "prefix sioct: <http://rdfs.org/sioc/types#>"
+      , "prefix dcterms: <http://purl.org/dc/terms/>"
+      , "prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>"
+      , "prefix xsd: <http://www.w3.org/2001/XMLSchema#>"
+      , "prefix tw: <http://purl.org/net/djburke/demo/twitter#>"
+      , "SELECT ?tw ?time ?text WHERE {"
+      , "  ?tw a sioct:MicroblogPost ; sioc:content ?text ; dcterms:created ?time ."
+      , "  FILTER (?time < " ++ tstr ++ " )"
+      , userQuery
+        
+        -- attempt to remove problematic matches by removing those marked
+        -- as retweets; there's no reason a user can't be retweeting a
+        -- retweet but with the matching we do this may help remove
+        -- some edge cases
+      , "  OPTIONAL { ?tw tw:isRetweet ?rt } FILTER (!bound(?rt))"
+        
+      , "}"
+      , "ORDER BY ASC(?time)"
+      ]
+     )
 
 -- do not move to SPARQL yet as don't want to bother cascading up the
 -- network dependency.
     [(u, _, _)] -> Just $ [toRDFTriple uri0 siocreply_of u, toRDFTriple uri0 isRetweet True]
     xs -> error $ "Found multiple matches for " ++ show uri0 ++ "\n" ++ T.unpack txt0 ++ "\n" ++ show xs ++ "\n"
   
+{-  
+Use the Levenshtein distance to match up tweets. This
+is done by using SPARQL to query for potential tweets and then
+Haskell code to do the matching.
+
+-}
+
+calcDiff :: String -> String -> Int
+calcDiff = 
+  restrictedDamerauLevenshteinDistance defaultEditCosts
+
+findMatchDistance :: String -> (URI, UTCTime, T.Text) -> IO (Maybe [RDFTriple])            
+findMatchDistance url (uri0, time0, txt0) = do
+  
+  let (mtxt, qry) = queryDistanceTweet uri0 time0 txt0
+  res <- makeQuery getVals url qry
+  
+  let getDiff = calcDiff (T.unpack mtxt)
+  
+  -- We restrict attention to those whose edit distance is less than
+  -- the length of the matching string; this is a heuristic but looks
+  -- sensible given the AAS219 data.
+  --
+  -- The limit of 80 for the edit difference has been chosen after
+  -- inspection of the results.
+      
+  let (uris, _, txts) = unzip3 res
+      middle (_,a,_) = a
+      rsps = filter (\(_,t,s) -> s < 80 && s < T.length t) $ sortBy (comparing middle) $ zip3 uris txts $ map (getDiff . T.unpack) txts
+  
+  return $ case rsps of
+     ((u, _, _):_) -> Just $ [toRDFTriple uri0 siocreply_of u, toRDFTriple uri0 isRetweet True]
+     [] -> Nothing
+  
 findRetweetsSimple :: String -> IO RDFGraph
 findRetweetsSimple epoint = do
   res <- makeQuery getVals epoint queryForRetweets
   mms <- mapM (findMatchSimple epoint) res
   return $ toRDFGraph $ concat $ catMaybes mms
 
+findRetweetsDistance :: String -> IO RDFGraph
+findRetweetsDistance epoint = do
+  res <- makeQuery getVals epoint queryForRetweets
+  hPutStrLn stderr $ "Found " ++ show (length res) ++ " tweets"
+  -- forM_ res $ putStrLn . show
+  mms <- mapM (findMatchDistance epoint) res
+  return $ toRDFGraph $ concat $ catMaybes mms
+
 usage :: IO ()
 usage = do
   pName <- getProgName
-  hPutStrLn stderr $ "Usage: " ++ pName ++ " <endpoint> [simple]"
+  hPutStrLn stderr $ "Usage: " ++ pName ++ " <endpoint> [simple|distance]"
   exitFailure
   
 main :: IO ()
       gr <- findRetweetsSimple x
       T.putStrLn $ formatGraphAsText gr
       
-    (x:"simple":[]) -> do
-      gr <- findRetweetsSimple x
-      T.putStrLn $ formatGraphAsText gr
-
+    (x:method:[]) -> case method of
+      "simple"   -> findRetweetsSimple x >>= T.putStrLn . formatGraphAsText
+      "distance" -> findRetweetsDistance x >>= T.putStrLn . formatGraphAsText
+      _ -> usage
+      
     _ -> usage
 

File grabtweets.cabal

   ghc-options: -Wall
   Build-Depends:
     base >= 3 && < 5,
+    edit-distance == 0.2.*,
     hasparql-client == 0.2.*,
     -- http-conduit == 1.2.*,
     network >= 2.2 && < 2.4,