Christopher Johnson committed 453f0b1

Remove newlines during tokenization; fix bugs

Comments (0)

Files changed (3)

src/main/scala/updown/preproc/PreprocObamaTweets.scala

   sentiment: SentimentLabel.Type,
   features: Iterable[String]) extends TweetParse
 
-case class FailedObamaRParse(reason: String) extends TweetParse
+case class FailedObamaParse(reason: String) extends TweetParse
 
 object PreprocObamaTweets {
 
 //    for (i <- ITERATE_START until ITERATE_END by 2) {
 //      val sentiment = if (fields.length > i) fields(i).trim else ""
 //      if (!(sentiment == OBAMA_POS || sentiment == OBAMA_NEG || sentiment == OBAMA_NEU))
-//        return FailedHCRParse(PARSE_FAIL_INVAL_SENT)
+//        return FailedObamaParse(PARSE_FAIL_INVAL_SENT)
 //
 //      sentimentList = (sentiment match {
 //        case `OBAMA_POS` => SentimentLabel.Positive
 //
 //      targetList = (if (fields.length > i + 1) fields(i + 1).trim else "") :: targetList
 //      if (targetList(0) == "")
-//        return FailedHCRParse(PARSE_FAIL_NO_TARGET)
+//        return FailedObamaParse(PARSE_FAIL_NO_TARGET)
 //    }
 //    val sentTargList = sentimentList zip targetList
 
     if (tweetid == "")
-      return FailedHCRParse(PARSE_FAIL_NO_TWEET_ID)
+      return FailedObamaParse(PARSE_FAIL_NO_TWEET_ID)
     if (userid == "")
-      return FailedHCRParse(PARSE_FAIL_NO_USERNAME)
+      return FailedObamaParse(PARSE_FAIL_NO_USERNAME)
     if (tweet == "")
-      return FailedHCRParse(PARSE_FAIL_NO_TWEET)
+      return FailedObamaParse(PARSE_FAIL_NO_TWEET)
 
-    val tokens = BasicTokenizer(tweet)
-    val features = tokens.filterNot(stoplist(_)) ::: StringUtil.generateBigrams(tokens)
+      val tokens = BasicTokenizer(tweet)
+      //val tokens = Twokenize(tweet)
+      val features = tokens.filterNot(stoplist(_)) ::: StringUtil.generateBigrams(tokens)
 
     //SuccessfulObamaParse(tweetid, userid, sentTargList, features)
     SuccessfulObamaParse(tweetid, userid, date, sentiment, features)
       (if (targetFile.value != None)
         new FileOutputStream(new File(targetFile.value.get))
       else
-        System.err), "UTF-8")
+       new FileOutputStream(new File("/dev/null"))), "UTF-8")
 
     val featureWriter = new OutputStreamWriter(
       (if (featureFile.value != None)
             //}
           }
           writeOutput(featureWriter, tweetid, userid, features, sentiment, targetWriter)
-        case FailedHCRParse(PARSE_FAIL_NO_SENT) =>
+        case FailedObamaParse(PARSE_FAIL_NO_SENT) =>
           noSentiment += 1
-        case FailedHCRParse(PARSE_FAIL_INVAL_SENT) =>
+        case FailedObamaParse(PARSE_FAIL_INVAL_SENT) =>
           invalSentiment += 1
-        case FailedHCRParse(PARSE_FAIL_NO_TWEET) =>
+        case FailedObamaParse(PARSE_FAIL_NO_TWEET) =>
           noTweet += 1
-        case FailedHCRParse(PARSE_FAIL_NO_TWEET_ID) =>
+        case FailedObamaParse(PARSE_FAIL_NO_TWEET_ID) =>
           noTweetID += 1
-        case FailedHCRParse(PARSE_FAIL_NO_USERNAME) =>
+        case FailedObamaParse(PARSE_FAIL_NO_USERNAME) =>
           noUserName += 1
-        case FailedHCRParse(PARSE_FAIL_NO_TARGET) =>
+        case FailedObamaParse(PARSE_FAIL_NO_TARGET) =>
           noTarget += 1
       }
       line = reader.readLine()

src/main/scala/updown/preproc/impl/PreprocStanfordTweets.scala

   val STAN_NEG = "0"
 
   override val defaultPipeline = "basicTokenize|addBigrams|removeStopwords"
+  
   def getInstanceIterator(fileName: String, polarity: String): Iterator[(String, String, Either[SentimentLabel.Type, Map[String, SentimentLabel.Type]], String)] = {
     for (line <- scala.io.Source.fromFile(fileName, "UTF-8").getLines) yield {
       val lineRE(sentimentRaw, id, username, tweet) = line

src/main/scala/updown/util/BasicTokenizer.scala

   def apply(s: String): List[String] = tokenize(s)
 
   def tokenize(s: String): List[String] = {
-    s.split("[\\s+]").map(StringUtil.preprocessKeepHash(_)).filter(_.length > 0).toList
+    s.split(" ").map(StringUtil.preprocessKeepHash(_)).filter(_.length > 0).
+    map(x => x.replaceAll("\n", " ").replaceAll("\r", " ")).toList
   }
 }
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.