Commits

Christopher Johnson committed 877352f

Added preproc-obama class

  • Participants
  • Parent commits f25daea

Comments (0)

Files changed (2)

 
      $SCALA_COMMAND updown.preproc.PreprocHCRTweets $*
 
+elif [ $CMD = 'preproc-obama' ]; then
+
+     $SCALA_COMMAND updown.preproc.PreprocObamaTweets $*
+
 elif [ $CMD = 'per-tweet-eval' ]; then
 
      $SCALA_COMMAND updown.app.PerTweetEvaluator $*

File src/main/scala/updown/preproc/PreprocObamaTweets.scala

+package updown.preproc
+
+import model.{ TweetParse }
+import updown.util._
+import java.io._
+import au.com.bytecode.opencsv.CSVReader
+import scala.collection.immutable._
+import scala.Console
+
+import org.clapper.argot._
+import updown.data.SentimentLabel
+import scala.util.parsing.json._
+
+/**
+ * Parse result for a tweet record that was read successfully.
+ *
+ * @param tweetid   numeric identifier of the tweet
+ * @param userid    numeric identifier of the tweet's author
+ * @param date      creation date, kept as the raw string found in the record
+ * @param sentiment sentiment label attached to the tweet
+ * @param features  feature strings extracted from the tweet text
+ */
+case class SuccessfulObamaParse(
+  tweetid: Long,
+  userid: Long,
+  date: String,
+  sentiment: SentimentLabel.Type,
+  features: Iterable[String]
+) extends TweetParse
+
+/**
+ * Parse result for a record that was rejected.
+ *
+ * @param reason short code describing why the record was rejected
+ */
+case class FailedObamaRParse(
+  reason: String
+) extends TweetParse
+
+/**
+ * Command-line preprocessor for the Obama tweet data set.
+ *
+ * Reads an input file holding one JSON object per line, converts every usable
+ * tweet into a feature list (stoplist-filtered tokens followed by the output of
+ * `StringUtil.generateBigrams`) and writes pipe-delimited feature and target
+ * records. Summary statistics are printed to stderr.
+ */
+object PreprocObamaTweets {
+
+  import ArgotConverters._
+
+  val parser = new ArgotParser("updown run updown.preproc.PreprocObamaTweets", preUsage = Some("Updown"))
+
+  val inputFile = parser.option[String](List("i", "input"), "input", "input file with one JSON tweet per line")
+  val stopListFile = parser.option[String](List("s", "stoplist"), "stoplist", "stoplist words")
+  val targetFile = parser.option[String](List("t", "target"), "target", "target file")
+  val featureFile = parser.option[String](List("f", "feature"), "feature", "feature file")
+  val ignoreNeutral = parser.flag[Boolean](List("ignoreNeutral"), "set this flag if you want to ignore neutral annotations")
+
+  // Sentiment strings expected in the "sentiment" field of a record.
+  val OBAMA_POS = "positive"
+  val OBAMA_NEG = "negative"
+  val OBAMA_NEU = "neutral"
+
+  // Reason codes carried by failed parses; main() keeps one counter per code.
+  val PARSE_FAIL_NO_SENT = "NO_SENT"
+  val PARSE_FAIL_INVAL_SENT = "INVAL_SENT"
+  val PARSE_FAIL_NO_TWEET_ID = "NO_TWEET_ID"
+  val PARSE_FAIL_NO_USERNAME = "NO_USERNAME"
+  val PARSE_FAIL_NO_TWEET = "NO_TWEET"
+  val PARSE_FAIL_NO_TARGET = "NO_TARGET"
+  val PARSE_FAIL_BAD_JSON = "BAD_JSON"
+
+  /**
+   * Parses one line of the input file.
+   *
+   * @param numFields unused by the JSON parser; kept so existing callers still compile
+   * @param line      one JSON object serialised on a single line
+   * @param stoplist  tokens to drop from the unigram features
+   * @return a [[SuccessfulObamaParse]] when the record has a numeric id, a numeric
+   *         user, non-empty text and a recognised sentiment; otherwise a
+   *         `FailedHCRParse` carrying one of the `PARSE_FAIL_*` codes
+   */
+  def processOneLine(numFields: Int, line: String, stoplist: Set[String]): TweetParse = {
+    // Fields
+    val TWEET_ID = "id"
+    val USER_ID = "user"
+    val TEXT = "text"
+    val DATE = "created_at"
+    val SENT = "sentiment"
+
+    JSON.parseFull(line) match {
+      case Some(parsed: Map[_, _]) =>
+        val map = parsed.asInstanceOf[Map[String, Any]]
+
+        // The parser hands numbers back as Double; anything else counts as missing (-1).
+        // NOTE(review): ids above 2^53 cannot be represented exactly as Double - confirm
+        // the ids in this data set are small enough.
+        def longField(name: String): Long = map.get(name) match {
+          case Some(number: Double) => number.toLong
+          case _ => -1L
+        }
+
+        // Absent or non-string values are treated as the empty string.
+        def stringField(name: String): String = map.get(name) match {
+          case Some(text: String) => text
+          case _ => ""
+        }
+
+        val tweetid = longField(TWEET_ID)
+        val userid = longField(USER_ID)
+        val tweet = stringField(TEXT)
+        val sent = stringField(SENT)
+        val date = stringField(DATE)
+
+        def success(sentiment: SentimentLabel.Type): TweetParse = {
+          val tokens = BasicTokenizer(tweet)
+          val features = tokens.filterNot(stoplist(_)) ::: StringUtil.generateBigrams(tokens)
+          SuccessfulObamaParse(tweetid, userid, date, sentiment, features)
+        }
+
+        if (tweetid == -1L)
+          FailedHCRParse(PARSE_FAIL_NO_TWEET_ID)
+        else if (userid == -1L)
+          FailedHCRParse(PARSE_FAIL_NO_USERNAME)
+        else if (tweet == "")
+          FailedHCRParse(PARSE_FAIL_NO_TWEET)
+        else sent match {
+          case "" => FailedHCRParse(PARSE_FAIL_NO_SENT)
+          case `OBAMA_POS` => success(SentimentLabel.Positive)
+          case `OBAMA_NEG` => success(SentimentLabel.Negative)
+          case `OBAMA_NEU` => success(SentimentLabel.Neutral)
+          case _ => FailedHCRParse(PARSE_FAIL_INVAL_SENT)
+        }
+
+      case _ =>
+        // Not a JSON object: echo the offending line and report it instead of crashing.
+        System.err.println(line)
+        FailedHCRParse(PARSE_FAIL_BAD_JSON)
+    }
+  }
+
+  /**
+   * Writes one tweet to the feature and target outputs.
+   *
+   * The feature record is `tweetid|userid|comma-separated features|sentiment`, with
+   * any `|` removed from the features so the delimiter stays unambiguous. The target
+   * record is `tweetid|` followed by an empty target column.
+   */
+  def writeOutput(featureWriter: OutputStreamWriter, tweetid: Long,
+    userid: Long, features: Iterable[String],
+    sentiment: SentimentLabel.Type, targetWriter: OutputStreamWriter): Unit = {
+    // No target is extracted from these records, so the column is left empty.
+    val target = ""
+    featureWriter.write("%s|%s|%s|%s\n".format(tweetid, userid, features.mkString(",").replace("|", ""), sentiment))
+    targetWriter.write("%s|%s\n".format(tweetid, target))
+  }
+
+  /** Opens a UTF-8 writer on `path` when given, otherwise on `fallback`. */
+  private def openWriter(path: Option[String], fallback: OutputStream): OutputStreamWriter = {
+    val stream = path match {
+      case Some(p) => new FileOutputStream(new File(p))
+      case None => fallback
+    }
+    new OutputStreamWriter(stream, "UTF-8")
+  }
+
+  /** Flushes `writer`, and closes it only when it is backed by a file this program opened. */
+  private def finish(writer: OutputStreamWriter, ownsStream: Boolean): Unit = {
+    writer.flush()
+    if (ownsStream)
+      writer.close()
+  }
+
+  /**
+   * Entry point: parses the command line, preprocesses every input line and
+   * prints summary counts to stderr.
+   */
+  def main(args: Array[String]): Unit = {
+    try {
+      parser.parse(args)
+    } catch {
+      case e: ArgotUsageException =>
+        println(e.message)
+        sys.exit(0)
+    }
+
+    // The flag's value is an Option: it is only defined when the flag was given.
+    val ignoreNeut = ignoreNeutral.value.isDefined
+
+    val inputPath = inputFile.value.getOrElse {
+      println("You must specify an input data file via -i or --input ")
+      sys.exit(1)
+    }
+    val stopPath = stopListFile.value.getOrElse {
+      println("You must specify a stoplist file via -s ")
+      sys.exit(1)
+    }
+
+    val stopSource = scala.io.Source.fromFile(stopPath, "utf-8")
+    val stoplist: Set[String] =
+      try stopSource.getLines().toSet
+      finally stopSource.close()
+
+    var numTweets = 0
+    var numCounted = 0
+    var numPos = 0
+    var numNeg = 0
+    var numNeu = 0
+    var noTweetID = 0
+    var noUserName = 0
+    var noTweet = 0
+    var noSentiment = 0
+    var invalSentiment = 0
+    var noTarget = 0
+    var badJson = 0
+
+    // Read as UTF-8 explicitly so the input matches the encoding used for the outputs.
+    val reader = new BufferedReader(new InputStreamReader(new FileInputStream(inputPath), "UTF-8"))
+    try {
+      val targetWriter = openWriter(targetFile.value, System.err)
+      val featureWriter = openWriter(featureFile.value, System.out)
+      try {
+        val lines = Iterator.continually(reader.readLine()).takeWhile(_ != null).buffered
+        // Length of the first line, or 0 for an empty file; processOneLine does not use it.
+        val numFields = if (lines.hasNext) lines.head.length else 0
+
+        for (line <- lines) {
+          numTweets += 1
+          processOneLine(numFields, line, stoplist) match {
+            case SuccessfulObamaParse(tweetid, userid, date, sentiment, features) =>
+              // With --ignoreNeutral, neutral tweets are neither counted nor written.
+              val ignored = ignoreNeut && sentiment == SentimentLabel.Neutral
+              if (!ignored) {
+                numCounted += 1
+                sentiment match {
+                  case SentimentLabel.Positive => numPos += 1
+                  case SentimentLabel.Negative => numNeg += 1
+                  case SentimentLabel.Neutral => numNeu += 1
+                }
+                writeOutput(featureWriter, tweetid, userid, features, sentiment, targetWriter)
+              }
+            case FailedHCRParse(PARSE_FAIL_NO_SENT) =>
+              noSentiment += 1
+            case FailedHCRParse(PARSE_FAIL_INVAL_SENT) =>
+              invalSentiment += 1
+            case FailedHCRParse(PARSE_FAIL_NO_TWEET) =>
+              noTweet += 1
+            case FailedHCRParse(PARSE_FAIL_NO_TWEET_ID) =>
+              noTweetID += 1
+            case FailedHCRParse(PARSE_FAIL_NO_USERNAME) =>
+              noUserName += 1
+            case FailedHCRParse(PARSE_FAIL_NO_TARGET) =>
+              noTarget += 1
+            case FailedHCRParse(PARSE_FAIL_BAD_JSON) =>
+              badJson += 1
+          }
+        }
+
+        targetWriter.flush()
+        featureWriter.flush()
+
+        // Report 0 instead of NaN when no tweet was counted.
+        def fraction(count: Int): Float =
+          if (numCounted == 0) 0f else count.toFloat / numCounted
+
+        val log = System.err
+
+        log.println("Preprocessed " + numCounted +
+          " tweets. Fraction positive: " + fraction(numPos) +
+          "\tFraction Negative: " + fraction(numNeg)
+          + "\tFraction Neutral: " + fraction(numNeu))
+        log.println("Num pos tweets: " + numPos + ".\t Num neg tweets: " + numNeg + ".\t Num neutral tweets: " + numNeu)
+        log.println((numTweets - numCounted) + " were numNotCounted" +
+          "\nand num of noSentiment: " + noSentiment +
+          "\nand num of invalSentiment: " + invalSentiment +
+          "\nand num of noTarget " + noTarget)
+        log.println("noTweet: " + noTweet + " noUserName: " + noUserName + " noTweetID: " + noTweetID +
+          " badJson: " + badJson)
+      } finally {
+        // Console-backed writers are only flushed: closing them would close
+        // System.out / System.err and hide any error raised above.
+        finish(targetWriter, targetFile.value.isDefined)
+        finish(featureWriter, featureFile.value.isDefined)
+      }
+    } finally {
+      reader.close()
+    }
+  }
+}