1. Christopher Johnson
  2. polify

Commits

vvcephei  committed 3e1bcf6

refactored preprocHCR somewhat

  • Participants
  • Parent commits 8b137f6
  • Branches default

Comments (0)

Files changed (3)

File src/main/scala/updown/data/SentimentLabel.scala

View file
   val Positive = Value("1")
   val Neutral = Value("0")
   val Negative = Value("-1")
+  val Invalid = Value("-2")
   // this is the end of the enum definition. the rest of this object just demonstrates other
   //  stuff you can do.
 

File src/main/scala/updown/preproc/PreprocHCRTweets.scala

View file
 import org.clapper.argot._
 import updown.data.SentimentLabel
 
-case class SuccessfulHCRParse(tweetid: String, username: String, label: SentimentLabel.Type, target: String, features: Iterable[String]) extends TweetParse
+case class SuccessfulHCRParse(tweetid: String, username: String,
+                              sentTargList: List[(SentimentLabel.Type, String)],
+                              features: Iterable[String]) extends TweetParse
 
 case class FailedHCRParse(reason: String) extends TweetParse
 
 object PreprocHCRTweets {
 
-    import ArgotConverters._
+  import ArgotConverters._
 
   val parser = new ArgotParser("updown run updown.preproc.PreprocHCRTweets", preUsage = Some("Updown"))
 
   val stopListFile = parser.option[String](List("s", "stoplist"), "stoplist", "stoplist words")
   val targetFile = parser.option[String](List("t", "target"), "target", "target file")
   val featureFile = parser.option[String](List("f", "feature"), "feature", "feature file")
+  val ignoreNeutral = parser.flag[Boolean](List("ignoreNeutral"), "set this flag if you want to ignore neutral annotations")
 
   val HCR_POS = "positive"
   val HCR_NEG = "negative"
   val PARSE_FAIL_NO_TWEET = "NO_TWEET"
   val PARSE_FAIL_NO_TARGET = "NO_TARGET"
 
-  def processOneLine(fields: Array[String], stoplist: Set[String]): TweetParse = {
+  def processOneLine(numFields: Int, fields: Array[String], stoplist: Set[String]): TweetParse = {
+    // tweet id,user id,username,content,sentiment,target,annotator id,comment,dispute
+    val INDEX_TWEET_ID = 0
+    //val INDEX_USER_ID = 1 // unused, leaving here for documentation
+    val INDEX_USER_NAME = 2
+    val INDEX_CONTENT = 3
+
+    val ITERATE_START = 4
+    val ITERATE_END = numFields - 3
+
+    //    val INDEX_ANNOTATOR_ID = numFields - 3 // unused, leaving here for documentation
+    //    val INDEX_COMMENT = numFields - 2 // unused, leaving here for documentation
+    //    val INDEX_DISPUTE = numFields - 1 // unused, leaving here for documentation
+
     if (fields.length < 5) {
       return FailedHCRParse(PARSE_FAIL_NO_SENT)
     }
 
-    val tweetid = if (fields(0).trim.matches("\\d+")) fields(0).trim else "" // why are we doing this? why not just take whatever is there as the id?
-    val username = fields(2).trim
-    val tweet = fields(3).trim
-    val sentiment = fields(4).trim
-    val target = if (fields.length > 5) fields(5).trim else ""
+    val tweetid = if (fields(INDEX_TWEET_ID).trim.matches("\\d+")) fields(INDEX_TWEET_ID).trim else "" // why are we doing this? why not just take whatever is there as the id?
+    val username = if (fields.length > INDEX_USER_NAME) fields(INDEX_USER_NAME).trim else ""
+    val tweet = if (fields.length > INDEX_CONTENT) fields(INDEX_CONTENT).trim else ""
+    var sentimentList = List[SentimentLabel.Type]()
+    var targetList = List[String]()
+    for (i <- ITERATE_START until ITERATE_END by 2) {
+      val sentiment = if (fields.length > i) fields(i).trim else ""
+      if (!(sentiment == HCR_POS || sentiment == HCR_NEG || sentiment == HCR_NEU))
+        return FailedHCRParse(PARSE_FAIL_INVAL_SENT)
 
-    if (!(sentiment.contains(HCR_POS) || sentiment.contains(HCR_NEG) || sentiment.contains(HCR_NEU)))
-      return FailedHCRParse(PARSE_FAIL_INVAL_SENT)
+      sentimentList = (sentiment match {
+        case `HCR_POS` => SentimentLabel.Positive
+        case `HCR_NEU` => SentimentLabel.Neutral
+        case `HCR_NEG` => SentimentLabel.Negative
+      }) :: sentimentList
+
+      targetList = (if (fields.length > i + 1) fields(i + 1).trim else "") :: targetList
+      if (targetList(0) == "")
+        return FailedHCRParse(PARSE_FAIL_NO_TARGET)
+    }
+    val sentTargList = sentimentList zip targetList
+
     if (tweetid == "")
       return FailedHCRParse(PARSE_FAIL_NO_TWEET_ID)
     if (username == "")
       return FailedHCRParse(PARSE_FAIL_NO_USERNAME)
     if (tweet == "")
       return FailedHCRParse(PARSE_FAIL_NO_TWEET)
-    if (target == "")
-      return FailedHCRParse(PARSE_FAIL_NO_TARGET)
 
     val tokens = BasicTokenizer(tweet)
     val features = tokens.filterNot(stoplist(_)) ::: StringUtil.generateBigrams(tokens)
-    val label = sentiment match {
-      case `HCR_POS` => SentimentLabel.Positive
-      case `HCR_NEU` => SentimentLabel.Neutral
-      case `HCR_NEG` => SentimentLabel.Negative
+
+    SuccessfulHCRParse(tweetid, username, sentTargList, features)
+  }
+
+  def writeOutput(featureWriter: OutputStreamWriter, tweetid: String,
+                  username: String, features: Iterable[String],
+                  sentTargList: List[(SentimentLabel.Type, String)], targetWriter: OutputStreamWriter) {
+    var label = ""
+    var target = ""
+    for ((sentiment, targetString) <- sentTargList) {
+      label += (if (label != "") "," else "") + sentiment
+      target += (if (target != "") "," else "") + targetString
     }
-    SuccessfulHCRParse(tweetid, username, label, target, features)
+    featureWriter.write("%s|%s|%s,%s\n".format(tweetid, username, features.mkString(","), label))
+    targetWriter.write("%s|%s\n".format(tweetid, target))
   }
 
   def main(args: Array[String]) {
       sys.exit(0)
     }
 
+    // dumb, I know, but a boolean flag turns out to be an Option, which is even dumber
+    val ignoreNeut = if (ignoreNeutral.value == None) false else true
+
     if (inputFile.value == None) {
       println("You must specify a input data file via --in or --input ")
       sys.exit(0)
 
     val reader = new CSVReader(new InputStreamReader(new FileInputStream(new File(inputFile.value.get)), "UTF-8"))
     val stoplist = scala.io.Source.fromFile(stopListFile.value.get, "utf-8").getLines().toSet
-    val targetWriter = if (targetFile.value != None) new OutputStreamWriter(new FileOutputStream(new File(targetFile.value.get)), "UTF-8") else null
-    val featureWriter = if (featureFile.value != None) new OutputStreamWriter(new FileOutputStream(new File(featureFile.value.get)), "UTF-8") else null
+    val targetWriter = new OutputStreamWriter(
+      (if (targetFile.value != None)
+        new FileOutputStream(new File(targetFile.value.get))
+      else
+        System.err), "UTF-8")
+
+    val featureWriter = new OutputStreamWriter(
+      (if (featureFile.value != None)
+        new FileOutputStream(new File(featureFile.value.get))
+      else
+        System.out), "UTF-8")
 
 
     var numTweets = 0
-    var numNotCounted = 0
+    var numCounted = 0
     var numPos = 0 //takes on a new meaning with multiple target labels
     var numNeg = 0 //same deal here
     var numNeu = 0
     var noUserName = 0
     var noTweet = 0
     var noSentiment = 0
+    var invalSentiment = 0
     var noTarget = 0
 
+
     var fields = reader.readNext
+    // Assumes there is a header!!!
+    val numFields = fields.length
+    fields = reader.readNext
     while (fields != null) {
       numTweets += 1
-      processOneLine(fields, stoplist) match {
-        case SuccessfulHCRParse(tweetid, username, label, target, features) =>
-          label match {
-            case SentimentLabel.Positive => numPos += 1
-            case SentimentLabel.Neutral => numNeu += 1
-            case SentimentLabel.Negative => numNeg += 1
+      processOneLine(numFields, fields, stoplist) match {
+        case SuccessfulHCRParse(tweetid, username, sentTargList, features) =>
+          for ((sentiment, target) <- sentTargList) {
+            numCounted += 1
+            sentiment match {
+              case SentimentLabel.Positive => numPos += 1
+              case SentimentLabel.Negative => numNeg += 1
+              case SentimentLabel.Neutral =>
+                if (!ignoreNeut)
+                  numNeu += 1
+                else
+                  numCounted -= 1
+            }
           }
-          if (featureWriter != null)
-            featureWriter.write("%s|%s|%s,%s\n".format(tweetid, username, features.mkString(","), label.toString))
-          else
-            printf("%s|%s|%s,%s\n", tweetid, username, features.mkString(","), label.toString)
-
-          if (targetWriter != null)
-            targetWriter.write("%s|%s\n".format(tweetid, target))
-          else
-            System.err.print("target: %s|%s\n".format(tweetid, target))
+          writeOutput(featureWriter, tweetid, username, features, sentTargList, targetWriter)
         case FailedHCRParse(PARSE_FAIL_NO_SENT) =>
-          numNotCounted += 1
           noSentiment += 1
         case FailedHCRParse(PARSE_FAIL_INVAL_SENT) =>
-          numNotCounted += 1
-          noSentiment += 1
+          invalSentiment += 1
         case FailedHCRParse(PARSE_FAIL_NO_TWEET) =>
-          numNotCounted += 1
           noTweet += 1
         case FailedHCRParse(PARSE_FAIL_NO_TWEET_ID) =>
-          numNotCounted += 1
           noTweetID += 1
         case FailedHCRParse(PARSE_FAIL_NO_USERNAME) =>
-          numNotCounted += 1
           noUserName += 1
         case FailedHCRParse(PARSE_FAIL_NO_TARGET) =>
-          numNotCounted += 1
           noTarget += 1
-        case _ =>
-          numNotCounted += 1
       }
-
       fields = reader.readNext
     }
 
-    // the convention is that method that have side effects should be called with an empty arg list instead of no parens.
-    // these methods are called mutators, apparently
+    targetWriter.flush()
+    featureWriter.flush()
+
+    val log = System.err
+
+    log.println("Preprocessed " + numCounted +
+      " tweets. Fraction positive: " + (numPos.toFloat / numCounted) +
+      "\tFraction Negative: " + (numNeg.toFloat / numCounted)
+      + "\tFraction Neutral: " + (numNeu.toFloat / numCounted))
+    log.println("Num pos tweets: " + numPos + ".\t Num neg tweets: " + numNeg + ".\t Num neutral tweets: " + numNeu)
+    log.println((numTweets - numCounted) + " were numNotCounted" +
+      "\nand num of noSentiment: " + noSentiment +
+      "\nand num of invalSentiment: " + invalSentiment +
+      "\nand num of noTarget " + noTarget)
+    log.println("noTweet: " + noTweet + " noUserName: " + noUserName + " noTweetID: " + noTweetID)
+
     reader.close()
-    if (targetWriter != null) targetWriter.close()
-    if (featureWriter != null) featureWriter.close()
+    targetWriter.close()
+    featureWriter.close()
 
-    System.err.println("Preprocessed " + numTweets + " tweets. Fraction positive: " + (numPos.toFloat / numTweets) + "\tFraction Negative: " + (numNeg.toFloat / numTweets)
-      + "\tFraction Neutral: " + (1 - ((numPos.toFloat / numTweets) + (numNeg.toFloat / numTweets))).toFloat)
-    System.err.println("Num pos tweets: " + numPos + ".\t Num neg tweets: " + numNeg + ".\t Num neutral tweets: " + numNeu)
-    System.err.println(numNotCounted + "is numNotCounted" + "and num of noSentiment: " + noSentiment + " and num of noTarget " + noTarget)
-    System.err.println("noTweet: " + noTweet + " noUserName: " + noUserName + " noTweetID: " + noTweetID)
   }
 }

File src/test/scala/StringUtilTest.scala

View file
 
 class StringUtilTest extends FlatSpec {
   "stripPunc" should "turn /.,@$#asdf';.@#$% into asdf" in {
-    assert(StringUtil.stripPunc("/.,@$#asdf';.@#$%") === "asdf")
+    assert(StringUtil.stripPunc("/.,@$#ASdf';.@#$%") === "ASdf")
   }
   "preprocess" should "turn /.,@$#ASdf';.@#$% into asdf" in {
     assert(StringUtil.preprocess("/.,@$#ASdf';.@#$%") === "asdf")