Commits

Anonymous committed 9346296

Added test for PreprocHCRTest. NOTE: I refactored PreprocHCRTest.scala significantly, and deleted a lot of Matt's scratch work. We can discuss how to implement those features and then grab the scratch work out of CS 38. Also, I slightly changed the debugging output. We can talk about that tomorrow too. Sorry if I broke something.

  • Participants
  • Parent commits 82d62fb

Comments (0)

Files changed (3)

File src/main/scala/updown/preproc/PreprocHCRTweets.scala

 package updown.preproc
 
-import model.{FailedParse, TweetParse}
+import model.{TweetParse}
 import updown.util._
 import java.io._
 import au.com.bytecode.opencsv.CSVReader
 
 case class SuccessfulHCRParse(tweetid: String, username: String, label: SentimentLabel.Type, target: String, features: Iterable[String]) extends TweetParse
 
+case class FailedHCRParse(reason: String) extends TweetParse
 
 object PreprocHCRTweets {
 
-  import ArgotConverters._
+    import ArgotConverters._
 
   val parser = new ArgotParser("updown run updown.preproc.PreprocHCRTweets", preUsage = Some("Updown"))
 
-  val inputFile = parser.option[String](List("in", "input"), "input", "csv input")
+  val inputFile = parser.option[String](List("i", "input"), "input", "csv input")
   val stopListFile = parser.option[String](List("s", "stoplist"), "stoplist", "stoplist words")
   val targetFile = parser.option[String](List("t", "target"), "target", "target file")
   val featureFile = parser.option[String](List("f", "feature"), "feature", "feature file")
   val HCR_NEG = "negative"
   val HCR_NEU = "neutral"
 
-  def processOneLine(fields: Array[String], stoplist: Set[String],
-                     targetWriter: OutputStreamWriter, featureWriter: OutputStreamWriter) {
-    if (fields.length < 4) {
-      return FailedParse
+  val PARSE_FAIL_NO_SENT = "NO_SENT"
+  val PARSE_FAIL_INVAL_SENT = "INVAL_SENT"
+  val PARSE_FAIL_NO_TWEET_ID = "NO_TWEET_ID"
+  val PARSE_FAIL_NO_USERNAME = "NO_USERNAME"
+  val PARSE_FAIL_NO_TWEET = "NO_TWEET"
+  val PARSE_FAIL_NO_TARGET = "NO_TARGET"
+
+  def processOneLine(fields: Array[String], stoplist: Set[String]): TweetParse = {
+    if (fields.length < 5) {
+      return FailedHCRParse(PARSE_FAIL_NO_SENT)
     }
 
-    val tweetid = StringUtil.getLongNoExcept(fields(0).trim)
+    val tweetid = if (fields(0).trim.matches("\\d+")) fields(0).trim else "" // why are we doing this? why not just take whatever is there as the id?
     val username = fields(2).trim
     val tweet = fields(3).trim
     val sentiment = fields(4).trim
-    val target = fields(5).trim
+    val target = if (fields.length > 5) fields(5).trim else ""
 
-    if (!(sentiment.contains(HCR_POS) || sentiment.contains(HCR_NEG) || sentiment.contains(HCR_NEU))
-      || tweetid == "" || username == "" || tweet == "") {
-      return FailedParse
-    }
+    if (!(sentiment.contains(HCR_POS) || sentiment.contains(HCR_NEG) || sentiment.contains(HCR_NEU)))
+      return FailedHCRParse(PARSE_FAIL_INVAL_SENT)
+    if (tweetid == "")
+      return FailedHCRParse(PARSE_FAIL_NO_TWEET_ID)
+    if (username == "")
+      return FailedHCRParse(PARSE_FAIL_NO_USERNAME)
+    if (tweet == "")
+      return FailedHCRParse(PARSE_FAIL_NO_TWEET)
+    if (target == "")
+      return FailedHCRParse(PARSE_FAIL_NO_TARGET)
+
     val tokens = BasicTokenizer(tweet)
     val features = tokens.filterNot(stoplist(_)) ::: StringUtil.generateBigrams(tokens)
     val label = sentiment match {
 
 
     val reader = new CSVReader(new InputStreamReader(new FileInputStream(new File(inputFile.value.get)), "UTF-8"))
-    val stoplist = scala.io.Source.fromFile(stopListFile.value.get, "utf-8").getLines.toSet
+    val stoplist = scala.io.Source.fromFile(stopListFile.value.get, "utf-8").getLines().toSet
     val targetWriter = if (targetFile.value != None) new OutputStreamWriter(new FileOutputStream(new File(targetFile.value.get)), "UTF-8") else null
     val featureWriter = if (featureFile.value != None) new OutputStreamWriter(new FileOutputStream(new File(featureFile.value.get)), "UTF-8") else null
 
     var numPos = 0 //takes on a new meaning with multiple target labels
     var numNeg = 0 //same deal here
     var numNeu = 0
-    var aboveTry = 0
-    var noTweetID = 0;
-    var noUserName = 0;
+    var noTweetID = 0
+    var noUserName = 0
     var noTweet = 0
-    var noSentiment = 0;
+    var noSentiment = 0
     var noTarget = 0
 
     var fields = reader.readNext
-    var someCount = 0
-    var numPassing = 0 //should be same as numTweets. used for debugging.
     while (fields != null) {
-      someCount += 1
-      processOneLine(fields, stoplist, targetWriter, featureWriter) match {
+      numTweets += 1
+      processOneLine(fields, stoplist) match {
         case SuccessfulHCRParse(tweetid, username, label, target, features) =>
-          ()
-        case _ => ()
+          label match {
+            case SentimentLabel.Positive => numPos += 1
+            case SentimentLabel.Neutral => numNeu += 1
+            case SentimentLabel.Negative => numNeg += 1
+          }
+          if (featureWriter != null)
+            featureWriter.write("%s|%s|%s,%s\n".format(tweetid, username, features.mkString(","), label.toString))
+          else
+            printf("%s|%s|%s,%s\n", tweetid, username, features.mkString(","), label.toString)
+
+          if (targetWriter != null)
+            targetWriter.write("%s|%s\n".format(tweetid, target))
+          else
+            System.err.print("target: %s|%s\n".format(tweetid, target))
+        case FailedHCRParse(PARSE_FAIL_NO_SENT) =>
+          numNotCounted += 1
+          noSentiment += 1
+        case FailedHCRParse(PARSE_FAIL_INVAL_SENT) =>
+          numNotCounted += 1
+          noSentiment += 1
+        case FailedHCRParse(PARSE_FAIL_NO_TWEET) =>
+          numNotCounted += 1
+          noTweet += 1
+        case FailedHCRParse(PARSE_FAIL_NO_TWEET_ID) =>
+          numNotCounted += 1
+          noTweetID += 1
+        case FailedHCRParse(PARSE_FAIL_NO_USERNAME) =>
+          numNotCounted += 1
+          noUserName += 1
+        case FailedHCRParse(PARSE_FAIL_NO_TARGET) =>
+          numNotCounted += 1
+          noTarget += 1
+        case _ =>
+          numNotCounted += 1
       }
 
       fields = reader.readNext
-
     }
 
-    reader.close
-    if (targetWriter != null) targetWriter.close
-    if (featureWriter != null) featureWriter.close
+    // The convention is that methods with side effects should be called with an empty argument list rather than with no parentheses.
+    // Such methods are called mutators, apparently.
+    reader.close()
+    if (targetWriter != null) targetWriter.close()
+    if (featureWriter != null) featureWriter.close()
 
     System.err.println("Preprocessed " + numTweets + " tweets. Fraction positive: " + (numPos.toFloat / numTweets) + "\tFraction Negative: " + (numNeg.toFloat / numTweets)
       + "\tFraction Neutral: " + (1 - ((numPos.toFloat / numTweets) + (numNeg.toFloat / numTweets))).toFloat)
     System.err.println("Num pos tweets: " + numPos + ".\t Num neg tweets: " + numNeg + ".\t Num neutral tweets: " + numNeu)
-    System.err.println(numNotCounted + "is numNotCounted" + " and aboveTry is: " + aboveTry + "and num of noSentiment: " + noSentiment + " and num of noTarget " + noTarget)
+    System.err.println(numNotCounted + "is numNotCounted" + "and num of noSentiment: " + noSentiment + " and num of noTarget " + noTarget)
     System.err.println("noTweet: " + noTweet + " noUserName: " + noUserName + " noTweetID: " + noTweetID)
   }
 }

File src/main/scala/updown/util/StringUtil.scala

     val innerBigrams = if(unigrams.length >= 2) unigrams.sliding(2).map(bi => bi(0)+" "+bi(1)).toList else Nil
     ("$ "+unigrams(0) :: innerBigrams) ::: (unigrams(unigrams.length-1)+" $" :: Nil)
   }
-
-  // john hates this function, but doesn't know what to do about them just yet
-  def getLongNoExcept(s: String): String = {
-    try {
-      s.toLong
-    } catch {
-      case _ => null
-    }
-  }
 }

File src/test/scala/PreprocHCRTest.scala

 import updown.preproc.{SuccessfulHCRParse, PreprocHCRTweets}
 
 class PreprocHCRTest extends FlatSpec {
-  val HCR_INPUT_LINE = "4;;3;;Mon May 11 03:17:40 UTC 2009;;kindle2;;tpryan;;@stellargirl I loooooooovvvvvveee my Kindle2. Not that the DX is cool, but the 2 is fantastic in its own right."
-  val HCR_SENTIMENT_RAW = "4"
+  val HCR_INPUT_FIELDS = Array("9932982701", "29136568", "Hexham67", "Bully for you Mr. President. Bully for you. #hcr",
+    "positive", "obama", "mteisberg", "Could be a compliment, or sarcasm")
   val HCR_SENTIMENT_GOLD = SentimentLabel.Positive
-  val HCR_TWEET_ID = "3"
-  val HCR_USERNAME = "tpryan"
-  val HCR_TWEET = "@stellargirl I loooooooovvvvvveee my Kindle2. Not that the DX is cool, but the 2 is fantastic in its own right."
+  val HCR_TARGET = "obama"
+  val HCR_TWEET_ID = "9932982701"
+  val HCR_USERNAME = "Hexham67"
+  val HCR_TWEET = "Bully for you Mr. President. Bully for you. #hcr"
   val HCR_TOKENS = List("stellargirl", "i", "loooooooovvvvvveee", "my", "kindle2", "not", "that", "the", "dx", "is", "cool", "but", "the", "2", "is", "fantastic", "in", "its", "own", "right")
-  val HCR_FEATURES = List("stellargirl", "i", "loooooooovvvvvveee", "my", "kindle2", "not", "that", "the", "dx", "is", "cool", "but", "the", "2", "is", "fantastic", "in", "its", "own", "right", "$ stellargirl", "stellargirl i", "i loooooooovvvvvveee", "loooooooovvvvvveee my", "my kindle2", "kindle2 not", "not that", "that the", "the dx", "dx is", "is cool", "cool but", "but the", "the 2", "2 is", "is fantastic", "fantastic in", "in its", "its own", "own right", "right $")
+  val HCR_FEATURES = List("bully", "president", "bully", "#hcr", "$ bully", "bully for", "for you", "you mr", "mr president", "president bully", "bully for", "for you", "you #hcr", "#hcr $")
 
   val pst = PreprocHCRTweets
 
-  "lineRE" should "parse a test line correctly" in {
-    val pst.lineRE(sentimentRaw, tweetid, username, tweet) = HCR_INPUT_LINE
-    assert(sentimentRaw === HCR_SENTIMENT_RAW)
-    assert(tweetid === HCR_TWEET_ID)
-    assert(username === HCR_USERNAME)
-    assert(tweet === HCR_TWEET)
-  }
-
   "processOneLine" should "produce expected output" in {
     assert(
-      pst.processOneLine(HCR_INPUT_LINE, Set())
+      pst.processOneLine(HCR_INPUT_FIELDS, Set("for", "you", "mr"))
         ===
         SuccessfulHCRParse(
           HCR_TWEET_ID,
           HCR_USERNAME,
           HCR_SENTIMENT_GOLD,
+          HCR_TARGET,
           HCR_FEATURES))
   }
+
+  //TODO test failure modes
 }