Commits

Doug Burke committed 754e1ee

TokenizeText: exclude a few more terms based on data

Comments (0)

Files changed (1)

 
 doNotWant :: [T.Text]
 doNotWant = 
-    [ "#aas221", "#hackaas"
+    [ "#aas221", "#aas223", "#hackaas"
     , "rt", "mt", "via", "aas", "#a"
     , "st." -- found in tokenizing bios from JWST@SXSW; not sure what parsing leads to this
     , "ve"
-    , ".", "..", "...", ".\"", "--", "!", "!!", "!!!"
-    , "://"
+    , ".", "..", "...", "....", "....."
+    , ".\"", "--", "!", "!!", "!!!"
+    , "||"
+    , "://", "//"
     , "ll", "em", "la", "lo", "en", "es", "mi", "por"
-    , "www", "http", "re", "com", "com/"] ++
+    , "www", "http", "https", "re", "com", "com/", "co"] ++
     stopWords ++
     ["de"]