Commits

vvcephei committed 5ec9586

Another large-scale change: converted Tweet to immutable case classes. Instead of assigning to tweet.systemLabel, we now construct a SystemLabeledTweet.

Comments (0)

Files changed (12)

src/main/scala/updown/app/JuntoClassifier.scala

 
     val edgeSeedSet = edgeSeedSetOption.value.getOrElse(DEFAULT_EDGE_SEED_SET)
 
-    val tweets = TweetFeatureReader(goldInputFile.value.get)
-    //tweets.foreach(println)
+    val goldLabeledTweets: List[GoldLabeledTweet] = TweetFeatureReader(goldInputFile.value.get)
 
     if (refCorpusProbsFile.value != None) {
       refCorpusNgramProbs = loadRefCorpusNgramProbs(refCorpusProbsFile.value.get)
-      thisCorpusNgramProbs = computeNgramProbs(tweets)
+      thisCorpusNgramProbs = computeNgramProbs(goldLabeledTweets)
     }
 
     val lexicon = MPQALexicon(mpqaInputFile.value.get)
 
-    val graph = createGraph(tweets, followerGraphFile.value.get, modelInputFile.value.get, lexicon, edgeSeedSet)
-
-    //graph.SaveEstimatedScores("input-graph")
+    val graph = createGraph(goldLabeledTweets, followerGraphFile.value.get, modelInputFile.value.get, lexicon, edgeSeedSet)
 
     JuntoRunner(graph, mu1.value.getOrElse(DEFAULT_MU1), .01, .01, iterations.value.getOrElse(DEFAULT_ITERATIONS), false)
 
-    //graph.SaveEstimatedScores("output-graph")
-
     val tweetIdsToPredictedLabels = new scala.collection.mutable.HashMap[String, SentimentLabel.Type]
 
     val ngramsToPositivity = new scala.collection.mutable.HashMap[String, Double]
       }
     }
 
-    for (tweet <- tweets) {
-      if (tweetIdsToPredictedLabels.containsKey(tweet.id)) {
-        tweet.systemLabel = tweetIdsToPredictedLabels(tweet.id)
-        //println(TWEET_ + tweet.id + "\t" + tweet.systemLabel)
+    val systemLabeledTweets =
+      for (GoldLabeledTweet(id, userid, features, goldLabel) <- goldLabeledTweets) yield {
+        SystemLabeledTweet(id, userid, features, goldLabel,
+          if (tweetIdsToPredictedLabels.containsKey(id)) {
+            tweetIdsToPredictedLabels(id)
+          } else {
+            null
+          })
       }
-    }
 
-    PerTweetEvaluator(tweets)
-    PerUserEvaluator(tweets)
+    PerTweetEvaluator(systemLabeledTweets)
+    PerUserEvaluator(systemLabeledTweets)
     if (targetsInputFile.value != None) {
       val targets = new scala.collection.mutable.HashMap[String, String]
 
       scala.io.Source.fromFile(targetsInputFile.value.get, "utf-8").getLines.foreach(p => targets.put(p.split("\t")(0).trim, p.split("\t")(1).trim))
-      PerTargetEvaluator(tweets, targets)
+      PerTargetEvaluator(systemLabeledTweets, targets)
     }
 
     if (topNOutputFile.value != None) {
     }
   }
 
-  def createGraph(tweets: List[Tweet], followerGraphFile: String, modelInputFile: String, lexicon: MPQALexicon, edgeSeedSet: String) = {
+  def createGraph(tweets: List[GoldLabeledTweet], followerGraphFile: String, modelInputFile: String, lexicon: MPQALexicon, edgeSeedSet: String) = {
     val edges = (if (edgeSeedSet.contains("n")) getTweetNgramEdges(tweets) else Nil) :::
       (if (edgeSeedSet.contains("f")) (getFollowerEdges(followerGraphFile) ::: getUserTweetEdges(tweets)) else Nil)
     val seeds = (if (edgeSeedSet.contains("m")) getMaxentSeeds(tweets, modelInputFile) else Nil) :::
     GraphBuilder(edges, seeds)
   }
 
-  def getTweetNgramEdges(tweets: List[Tweet]): List[Edge] = {
+  def getTweetNgramEdges(tweets: List[GoldLabeledTweet]): List[Edge] = {
     (for (tweet <- tweets) yield {
       for (ngram <- tweet.features) yield {
         val weight = getNgramWeight(ngram)
     }).flatten.flatten
   }
 
-  def getUserTweetEdges(tweets: List[Tweet]): List[Edge] = {
+  def getUserTweetEdges(tweets: List[GoldLabeledTweet]): List[Edge] = {
     for (tweet <- tweets) yield {
       //println(USER_ + tweet.userid + "   " + TWEET_ + tweet.id)
       new Edge(USER_ + tweet.userid, TWEET_ + tweet.id, 1.0)
     }).flatten.toList
   }
 
-  def getMaxentSeeds(tweets: List[Tweet], modelInputFile: String): List[Label] = {
+  def getMaxentSeeds(tweets: List[GoldLabeledTweet], modelInputFile: String): List[Label] = {
     val dataInputStream = new DataInputStream(new FileInputStream(modelInputFile));
     val reader = new BinaryGISModelReader(dataInputStream)
     val model = reader.getModel
     }
   }
 
-  def computeNgramProbs(tweets: List[Tweet]): scala.collection.mutable.HashMap[String, Double] = {
+  def computeNgramProbs(tweets: List[GoldLabeledTweet]): scala.collection.mutable.HashMap[String, Double] = {
     val probs = new scala.collection.mutable.HashMap[String, Double] {
       override def default(s: String) = 0.0
     }
     val edgeSeedSet = edgeSeedSetOption.value.getOrElse(DEFAULT_EDGE_SEED_SET)
 
     val trainTweets = TweetFeatureReader(goldInputFile.value.get)
-    val testTweets = TweetFeatureReader(testInputFile.value.get)
-    val totalTweets = trainTweets ::: testTweets
+    val goldLabeledTestTweets = TweetFeatureReader(testInputFile.value.get)
+    val totalTweets = trainTweets ::: goldLabeledTestTweets
 
     if (refCorpusProbsFile.value != None) {
       refCorpusNgramProbs = loadRefCorpusNgramProbs(refCorpusProbsFile.value.get)
       thisCorpusNgramProbs = computeNgramProbs(totalTweets)
     }
 
-    val graph = createTransductiveGraph(trainTweets, followerGraphFile.value.get, testTweets, followerGraphFileTest.value.get, edgeSeedSet)
+    val graph = createTransductiveGraph(trainTweets, followerGraphFile.value.get, goldLabeledTestTweets, followerGraphFileTest.value.get, edgeSeedSet)
 
     JuntoRunner(graph, mu1.value.getOrElse(DEFAULT_MU1), .01, .01, iterations.value.getOrElse(DEFAULT_ITERATIONS), false)
 
       }
     }
 
-    for (tweet <- testTweets) {
-      if (tweetIdsToPredictedLabels.containsKey(tweet.id)) {
-        tweet.systemLabel = tweetIdsToPredictedLabels(tweet.id)
-        //println(TWEET_ + tweet.id + "\t" + tweet.systemLabel)
+    val systemLabeledTestTweets =
+      for (GoldLabeledTweet(id, userid, features, goldLabel) <- goldLabeledTestTweets) yield {
+        SystemLabeledTweet(id, userid, features, goldLabel,
+          if (tweetIdsToPredictedLabels.containsKey(id)) {
+            tweetIdsToPredictedLabels(id)
+          } else {
+            null
+          })
       }
-    }
 
-    PerTweetEvaluator.apply(testTweets)
-    PerUserEvaluator.evaluate(testTweets)
+    PerTweetEvaluator.apply(systemLabeledTestTweets)
+    PerUserEvaluator.evaluate(systemLabeledTestTweets)
     if (targetsInputFile.value != None) {
       val targets = new scala.collection.mutable.HashMap[String, String]
 
       scala.io.Source.fromFile(targetsInputFile.value.get, "utf-8").getLines.foreach(p => targets.put(p.split("\t")(0).trim, p.split("\t")(1).trim))
-      PerTargetEvaluator(testTweets, targets)
+      PerTargetEvaluator(systemLabeledTestTweets, targets)
     }
   }
 
-  def createTransductiveGraph(trainTweets: List[Tweet], followerGraphFile: String, testTweets: List[Tweet], followerGraphFileTest: String, edgeSeedSet: String) = {
+  def createTransductiveGraph(trainTweets: List[GoldLabeledTweet], followerGraphFile: String, testTweets: List[GoldLabeledTweet], followerGraphFileTest: String, edgeSeedSet: String) = {
     val totalTweets = trainTweets ::: testTweets
     val edges = (if (edgeSeedSet.contains("n")) getTweetNgramEdges(totalTweets) else Nil) :::
       (if (edgeSeedSet.contains("f")) (getFollowerEdges(followerGraphFile) ::: getUserTweetEdges(totalTweets) :::
     GraphBuilder(edges, seeds)
   }
 
-  def getGoldSeeds(tweets: List[Tweet]): List[Label] = {
+  /**
+   * Builds one seed [[Label]] per gold-labeled tweet, keyed by `TWEET_ + id`, with weight 1.0.
+   *
+   * Positive tweets are seeded as POS. Every other gold label (Negative, Neutral, or anything
+   * unrecognized) is seeded as NEG, which is the mapping the previous if/else chain used.
+   *
+   * @param tweets the gold-labeled tweets to turn into seeds
+   * @return one seed label per input tweet, in input order
+   */
+  def getGoldSeeds(tweets: List[GoldLabeledTweet]): List[Label] = {
     for (tweet <- tweets) yield {
-      if (tweet.goldLabel == POS)
-        new Label(TWEET_ + tweet.id, POS, 1.0)
-      else if (tweet.goldLabel == NEG)
-        new Label(TWEET_ + tweet.id, NEG, 1.0)
-      else
-        new Label(TWEET_ + tweet.id, NEG, 1.0)
+      tweet match {
+        case GoldLabeledTweet(id, _, _, SentimentLabel.Positive) => new Label(TWEET_ + id, POS, 1.0)
+        // Fallback arm: keeps the old "else => NEG" behaviour and avoids a MatchError
+        // on a null or otherwise unexpected gold label.
+        case GoldLabeledTweet(id, _, _, _) => new Label(TWEET_ + id, NEG, 1.0)
+      }
     }
   }
 }

src/main/scala/updown/app/LexicalRatioClassifier.scala

       else null
   }
 
-  def classifyTweets(tweets: scala.List[Tweet], lexicon: MPQALexicon) {
-    for (tweet <- tweets) {
+  def classifyTweets(tweets: scala.List[Tweet], lexicon: MPQALexicon):List[SystemLabeledTweet] = {
+    (for (GoldLabeledTweet(id, userid, features, goldLabel) <- tweets) yield {
 
       var numPosWords = 0
       var numNegWords = 0
       var numNeuWords = 0
 
-      for (feature <- tweet.features) {
+      for (feature <- features) {
         if (lexicon.contains(feature)) {
           val entry = lexicon(feature)
           if (entry.isPositive) numPosWords += 1
           if (entry.isNeutral) numNeuWords += 1
         }
       }
-
-      tweet.systemLabel = classifyTweet(numPosWords, numNegWords, numNeuWords)
-    }
+      SystemLabeledTweet(id, userid, features, goldLabel, classifyTweet(numPosWords, numNegWords, numNeuWords))
+    }).toList
   }
 
   def main(args: Array[String]) {
       sys.exit(0)
     }
 
-    val tweets = TweetFeatureReader(goldInputFile.value.get)
+
     val lexicon = MPQALexicon(mpqaInputFile.value.get)
 
     //println("mpqa lex val for word 'good': " + lexicon.peek("good"))    
     var totTweets = 0
     var numAbstained = 0
 
-    classifyTweets(tweets, lexicon)
+    val tweets = classifyTweets(TweetFeatureReader(goldInputFile.value.get), lexicon)
     
     PerTweetEvaluator(tweets)
     PerUserEvaluator(tweets)

src/main/scala/updown/app/NFoldMaxentExperiement.scala

+package updown.app
+
+import org.clapper.argot.ArgotParser._
+import org.clapper.argot.{ArgotUsageException, ArgotParser}
+import java.io.{FileInputStream, DataInputStream}
+import opennlp.maxent.io.BinaryGISModelReader
+import updown.data.io.TweetFeatureReader._
+import updown.data.SentimentLabel
+import org.clapper.argot.ArgotConverters._
+import updown.data.io.TweetFeatureReader
+
+/**
+ * Command-line entry point for an n-fold maxent experiment.
+ *
+ * Currently this only parses the arguments and reads the gold-labeled tweets;
+ * the fold logic itself is not implemented here yet.
+ */
+object NFoldMaxentExperiement {
+  // this exists purely to make the ArgotConverters appear used to IDEA
+  convertByte _
+
+  /**
+   * Parses `-g/--gold` (required) and `-n/--folds` (optional, default 10) and loads the tweets.
+   *
+   * Exits with status 0 after printing the usage message when argument parsing fails, and with
+   * status 1 when no gold input file was given.
+   *
+   * @param args raw command-line arguments
+   */
+  def main(args: Array[String]): Unit = {
+    // The usage banner must name this app, not PerTweetEvaluator.
+    val parser = new ArgotParser("updown run updown.app.NFoldMaxentExperiement", preUsage = Some("Updown"))
+    val goldInputFile = parser.option[String](List("g", "gold"), "gold", "gold labeled input")
+    val n = parser.option[Int](List("n", "folds"), "FOLDS", "the number of folds for the experiment (default 10)")
+
+    try {
+      parser.parse(args)
+    }
+    catch {
+      case e: ArgotUsageException => println(e.message); sys.exit(0)
+    }
+
+    // TODO: nFolds and tweets are not consumed yet; the experiment body is still to be written.
+    val nFolds: Int = n.value.getOrElse(10)
+
+    // sys.exit has type Nothing, so the fallback block type-checks as a String.
+    val goldFile: String = goldInputFile.value.getOrElse {
+      println("You must specify a gold labeled input file via -g.")
+      sys.exit(1)
+    }
+
+    val tweets = TweetFeatureReader(goldFile)
+
+  }
+}

src/main/scala/updown/app/PerTargetEvaluator.scala

   val NEG = "NEG"
   val NEU = "NEU"
 
-  //val DEFAULT_MIN_TPU = 1
-
-  def apply(tweets: List[Tweet], targets: scala.collection.mutable.HashMap[String, String]) = evaluate(tweets, targets)
-
-  def computeEvaluation(tweets: scala.List[Tweet], targets: HashMap[String, String]): (List[(String, Double)], Int, HashMap[String, List[Tweet]]) = {
-    var totalError = 0.0
-    var totalNumAbstained = 0
-    //val usersToTweets = new scala.collection.mutable.HashMap[String, List[Tweet]] { override def default(s: String) = List() }
-    val targetsToTweets = new scala.collection.mutable.HashMap[String, List[Tweet]] {
+  def computeEvaluation(tweets: scala.List[SystemLabeledTweet], targets: HashMap[String, String]):
+  (List[(String, Double)], Int, HashMap[String, List[SystemLabeledTweet]]) = {
+    val targetsToTweets = new scala.collection.mutable.HashMap[String, List[SystemLabeledTweet]] {
       override def default(s: String) = List()
     }
     var targetsToAccuracies = List[(String, Double)]()
 
-    //val minTPU = DEFAULT_MIN_TPU
-    //println(tweets.length)
-
     for (tweet <- tweets) {
-      //val prevList = targetsToTweets(tweet.userid)
       if (targets.contains(tweet.id)) {
         val curTarget = targets(tweet.id)
         targetsToTweets.put(curTarget, tweet :: targetsToTweets(curTarget))
       }
     }
 
-    //targetsToTweets.foreach(p => println(p._1+"   "+p._2.length))
-
     var numAbstained = 0
     for (target <- targetsToTweets.keys) {
       val curTweets = targetsToTweets(target)
       val correct = curTweets.count(tweet => tweet.goldLabel == tweet.systemLabel) + abstained.toFloat / 2
 
       targetsToAccuracies = targetsToAccuracies ::: ((target, correct.toDouble / curTweets.length) :: Nil)
-      /*for(tweet <- curTweets) {
-        if(tweet.goldLabel == tweet.systemLabel)
-      }*/
     }
 
     targetsToAccuracies.sortWith((x, y) => targetsToTweets(x._1).length >= targetsToTweets(y._1).length)
     (targetsToAccuracies, numAbstained, targetsToTweets)
   }
 
-  def evaluate(tweets: List[Tweet], targets: scala.collection.mutable.HashMap[String, String]) = {
+  def apply(tweets: List[SystemLabeledTweet], targets: scala.collection.mutable.HashMap[String, String]) = {
     val (targetsToAccuracies, numAbstained, targetsToTweets) = computeEvaluation(tweets, targets)
 
     println("\n***** PER TARGET EVAL *****")
 
     val goldLines = scala.io.Source.fromFile(goldInputFile.value.get, "utf-8").getLines.toList
 
-    val tweets = TweetFeatureReader(goldInputFile.value.get)
-    for (tweet <- tweets) {
-      tweet.systemLabel = SentimentLabel.figureItOut(model.getBestOutcome(model.eval(tweet.features.toArray)))
-    }
-
+    val tweets = (for (tweet <- TweetFeatureReader(goldInputFile.value.get)) yield {
+      tweet match {
+        case GoldLabeledTweet(id, userid, features, goldLabel) =>
+          SystemLabeledTweet(id, userid, features, goldLabel,
+            SentimentLabel.figureItOut(model.getBestOutcome(model.eval(features.toArray))))
+      }
+    })
 
     val targets = new scala.collection.mutable.HashMap[String, String]
 
 
     //targets.foreach(p => println(p._1+" "+p._2))
 
-    evaluate(tweets, targets)
+    apply(tweets, targets)
   }
 }

src/main/scala/updown/app/PerTweetEvaluator.scala

 
 import java.io._
 
-import opennlp.maxent._
 import opennlp.maxent.io._
-import opennlp.model._
-
 import org.clapper.argot._
 import ArgotConverters._
 
   // this exists purely to make the ArgotConverters appear used to IDEA
   convertByte _
 
-  def tabulate(tweets: scala.List[Tweet]): (Double, Int, Int, String) = {
+  def tabulate(tweets: scala.List[SystemLabeledTweet]): (Double, Int, Int, String) = {
     var correct = 0.0
     var total = 0
     var numAbstained = tweets.count(_.systemLabel == null)
       *  val normedNormedTweet = normedTweet.normalize("int")
       *  println(normedTweet.systemLabel + "|" + normedTweet.goldLabel + "\t" + normedNormedTweet.systemLabel + "|" + normedNormedTweet.goldLabel)
       */
-//      val normedTweet = tweet.normalize("alpha")
+      //      val normedTweet = tweet.normalize("alpha")
       if (tweet.systemLabel == tweet.goldLabel) {
         correct += 1
       }
         "a third of these are actually POS or NEG (empirically), we randomy assign a label to them.")
   }
 
-  def apply(tweets: List[Tweet]) = {
+  def apply(tweets: List[SystemLabeledTweet]) = {
 
     val (correct, total, abstained, message) = tabulate(tweets)
 
 
     val model = reader.getModel
     val tweets = TweetFeatureReader(goldInputFile.value.get)
-    for (tweet <- tweets) {
-      tweet.systemLabel = SentimentLabel.figureItOut(model.getBestOutcome(model.eval(tweet.features.toArray)))
-    }
-    apply(tweets)
+
+    apply(for (tweet <- tweets) yield {
+      tweet match {
+        case GoldLabeledTweet(id, userid, features, goldLabel) =>
+          SystemLabeledTweet(id, userid, features, goldLabel,
+            SentimentLabel.figureItOut(model.getBestOutcome(model.eval(features.toArray))))
+      }
+    })
   }
 }

src/main/scala/updown/app/PerUserEvaluator.scala

 
   val DEFAULT_MIN_TPU = 3
 
-  def apply(tweets: List[Tweet]) = evaluate(tweets)
+  def apply(tweets: List[SystemLabeledTweet]) = evaluate(tweets)
 
-  def computeEvaluation(tweets: scala.List[Tweet]): (Int, Int, Double, String) = {
+  def computeEvaluation(tweets: scala.List[SystemLabeledTweet]): (Int, Int, Double, String) = {
     var totalError = 0.0;
     var totalErrorAlt = 0.0
     var totalNumAbstained = 0
         var numSysNeu = 0.0
 
         for (tweet <- curTweets) {
-          if (tweet.goldLabel == POS) numGoldPos += 1
-          else if (tweet.goldLabel == NEG) numGoldNeg += 1
-          else if (tweet.goldLabel == NEU) numGoldNeu += 1
-          if (tweet.systemLabel == POS && doRandom.value == None) numSysPos += 1
-          else if (tweet.systemLabel == NEG && doRandom.value == None) numSysNeg += 1
-          else if (tweet.systemLabel == NEU && doRandom.value == None) numSysNeu += 1
-          else if (tweet.systemLabel == null || doRandom.value != None) numAbstained += 1
+          // Tally the gold label. Unrecognized or missing gold labels are ignored,
+          // as in the previous if/else chain, rather than raising a MatchError.
+          tweet.goldLabel match {
+            case SentimentLabel.Positive => numGoldPos += 1
+            case SentimentLabel.Negative => numGoldNeg += 1
+            case SentimentLabel.Neutral => numGoldNeu += 1
+            case _ => ()
+          }
+          // Test the parsed option's value, not the option holder itself:
+          // `doRandom == None` is never true, which made every tweet count as abstained.
+          if (doRandom.value == None) {
+            tweet.systemLabel match {
+              case SentimentLabel.Positive => numSysPos += 1
+              case SentimentLabel.Negative => numSysNeg += 1
+              case SentimentLabel.Neutral => numSysNeu += 1
+              case null => numAbstained += 1
+              case _ => () // unrecognized system label: not counted, as before
+            }
+          } else {
+            // Random mode: every tweet is treated as an abstention.
+            numAbstained += 1
+          }
         }
 
         numSysPos += numAbstained.toFloat / 3
       "(min of " + minTPU + " tweets per user)")
   }
 
-  def evaluate(tweets: List[Tweet]) = {
+  def evaluate(tweets: List[SystemLabeledTweet]) = {
     val (total, abstained, error, message) = computeEvaluation(tweets)
 
     println("\n***** PER USER EVAL *****")
     val goldLines = scala.io.Source.fromFile(goldInputFile.value.get, "utf-8").getLines.toList
 
     val tweets = TweetFeatureReader(goldInputFile.value.get)
-    for (tweet <- tweets) {
-      tweet.systemLabel = SentimentLabel.figureItOut(model.getBestOutcome(model.eval(tweet.features.toArray)))
-    }
-
-    evaluate(tweets)
+    evaluate(for (tweet <- tweets) yield {
+      tweet match {
+        case GoldLabeledTweet(id, userid, features, goldLabel) =>
+          SystemLabeledTweet(id, userid, features, goldLabel,
+            SentimentLabel.figureItOut(model.getBestOutcome(model.eval(features.toArray))))
+      }
+    })
   }
 }

src/main/scala/updown/app/TrainMaxentModel.scala

   val DEFAULT_CUTOFF = 5
 
 
-  def apply(fileName: String, iterations: Int, cutoff: Int): AbstractModel = 
+  def apply(fileName: String, iterations: Int, cutoff: Int): AbstractModel =
     GIS.trainModel(MaxentEventStreamFactory(fileName), iterations, cutoff)
+  def apply(fileName: String): AbstractModel = apply(fileName, DEFAULT_ITERATIONS, DEFAULT_CUTOFF)
 
   def apply(iterator: Iterator[String], iterations:Int, cutoff:Int): AbstractModel =
     GIS.trainModel(MaxentEventStreamFactory(iterator), iterations, cutoff)
+  def apply(iterator: Iterator[String]): AbstractModel = apply(iterator, DEFAULT_ITERATIONS, DEFAULT_CUTOFF)
 
   def main(args: Array[String]) {
     val parser = new ArgotParser("updown run updown.app.TrainMaxentModel", preUsage = Some("Updown"))

src/main/scala/updown/app/model/MaxentEventStreamFactory.scala

 
 import opennlp.maxent.{DataStream, BasicEventStream}
 import updown.data.io.TweetFeatureReader
-import updown.data.{SentimentLabel, Tweet}
 import opennlp.model.EventStream
+import updown.data.{GoldLabeledTweet, SystemLabeledTweet, SentimentLabel, Tweet}
 
 object MaxentEventStreamFactory {
   val DEFAULT_DELIMITER = ","
     new BasicEventStream(new DataStream {
       def nextToken(): AnyRef = {
         TweetFeatureReader.parseLine(iterator.next()) match {
-          case Tweet(tweetid, userid, features, label, systemLabel) =>
+          case GoldLabeledTweet(tweetid, userid, features, label) =>
             (features ::: (label::Nil)).mkString(DEFAULT_DELIMITER)
           case _ =>
             throw new RuntimeException("bad line")

src/main/scala/updown/data/Tweet.scala

 package updown.data
 
-case class Tweet(val id: String,
-            val userid: String,
-            val features: List[String],
-            val goldLabel: SentimentLabel.Type,
-            var systemLabel: SentimentLabel.Type) {
-  /* val POS_ALPHA = "POS"; val POS_INT = "1"
-  val NEG_ALPHA = "NEG"; val NEG_INT = "-1"
-  val NEU_ALPHA = "NEU"; val NEU_INT = "0"
-  */
-  def this(id: String, userid: String, features: List[String], goldLabel: SentimentLabel.Type) {
-    this (id, userid, features, goldLabel, null)
-  }
+abstract class Tweet()
 
-  override def toString = "id: " + id + "\t" + "userid: " + userid + "\t" + "features: " + features + "\t" + "goldLabel: " + goldLabel + "\tsystemLabel: " + systemLabel
+case class GoldLabeledTweet(id: String,
+                            userid: String,
+                            features: List[String],
+                            goldLabel: SentimentLabel.Type) extends Tweet
 
-  override def equals(other: Any):Boolean = {
-    if (other != null && other.isInstanceOf[Tweet]) {
-      val otherTweet = other.asInstanceOf[Tweet]
-      (this.id == otherTweet.id) &&
-        (this.userid == otherTweet.userid) &&
-        (this.features == otherTweet.features) &&
-        (this.goldLabel == otherTweet.goldLabel) &&
-        (this.systemLabel == otherTweet.systemLabel)
-    }
-    else {
-      false
-    }
-
-  }
-
-  /*/*
-   * param must contain string "alpha" or "int" -- former to map into alphas, latter to map into ints.
-  */
-  def normalize(res: String): Tweet = {
-    var sl = ""
-    val intToAlpha = List("-1", "0", "1") zip List("NEG", "NEU", "POS")
-    if (this.systemLabel != null && res.contains("alpha")) {
-      if (this.systemLabel.contains("-1")) sl = "NEG"
-      else if (this.systemLabel.contains("0")) sl = "NEU"
-      else if (this.systemLabel.contains("1")) sl = "POS"
-      else sl = this.systemLabel //if systemLabel is already alpha
-    }
-    else if (this.systemLabel != null && res.contains("int")) {
-      if (this.systemLabel == "NEG") sl = "-1"
-      else if (this.systemLabel == "NEU") sl = "0"
-      else if (this.systemLabel == "POS") sl = "1"
-      else sl = this.systemLabel //if systemLabel is already alpha
-    }
-    new Tweet(this.id, this.userid, this.features, this.goldLabel.toString.trim, sl.toString.trim)
-  }
-*/
-}
+case class SystemLabeledTweet(id: String,
+                              userid: String,
+                              features: List[String],
+                              goldLabel: SentimentLabel.Type,
+                              systemLabel: SentimentLabel.Type) extends Tweet
 	   
 	
 

src/main/scala/updown/data/io/TweetFeatureReader.scala

 object TweetFeatureReader {
   val featureRowRE = """^([^|]*)\|([^|]*)\|([^|]*)\|(.*)$""".r //python verbose regexes are so much nicer :/
 
-  def apply(inputFile: String): List[Tweet] = {
+  def apply(inputFile: String): List[GoldLabeledTweet] = {
 
     val lines = scala.io.Source.fromFile(inputFile, "utf-8").getLines.toList
 
     }
   }
 
-  def parseLine(line: String): Tweet = {
+  def parseLine(line: String): GoldLabeledTweet = {
     val featureRowRE(tweetid, userid, featureString, label) = line
     val features = featureString.split(",").toList.map(_.trim).filter(_.length > 0) // filter out features that are all whitespace or the empty string
 
-    val t = new Tweet(tweetid, userid, features, SentimentLabel.figureItOut(label))
-    t
+    GoldLabeledTweet(tweetid, userid, features, SentimentLabel.figureItOut(label))
   }
-/*
-  def standardize(label: String): String = {
-    label match {
-      case "1" => "POS"
-      case "-1" => "NEG"
-      case "0" => "NEU"
-      case _ => label
-    }
-  }*/
 }

src/test/scala/updown/test/LexicalRatioClassifierTest.scala

 import collection.mutable.HashMap
 import org.scalatest.FlatSpec
 import updown.app.LexicalRatioClassifier
-import updown.data.Tweet
 import updown.lex.{MPQALexicon, MPQAEntry}
+import updown.data.{SystemLabeledTweet, GoldLabeledTweet, SentimentLabel, Tweet}
 
 class LexicalRatioClassifierTest extends FlatSpec {
   "classifyTweet" should "be null with (0,0,0)" in assert(LexicalRatioClassifier.classifyTweet(0, 0, 0) === null)
   it should "be null with (10,10,0)" in assert(LexicalRatioClassifier.classifyTweet(10, 10, 0) === null)
-  it should "be 0 with (10,10,10)" in assert(LexicalRatioClassifier.classifyTweet(10, 10, 10) === "0")
-  it should "be 0 with (11,11,10)" in assert(LexicalRatioClassifier.classifyTweet(11, 11, 10) === "0")
-  it should "be 1 with (12,11,10)" in assert(LexicalRatioClassifier.classifyTweet(12, 11, 10) === "1")
-  it should "be -1 with (11,12,10)" in assert(LexicalRatioClassifier.classifyTweet(11, 12, 10) === "-1")
-  it should "be 0 with (12,11,13)" in assert(LexicalRatioClassifier.classifyTweet(12, 11, 13) === "0")
-  it should "be 0 with (11,12,13)" in assert(LexicalRatioClassifier.classifyTweet(11, 12, 13) === "0")
+  it should "be 0 with (10,10,10)" in assert(LexicalRatioClassifier.classifyTweet(10, 10, 10) === SentimentLabel.Neutral)
+  it should "be 0 with (11,11,10)" in assert(LexicalRatioClassifier.classifyTweet(11, 11, 10) === SentimentLabel.Neutral)
+  it should "be 1 with (12,11,10)" in assert(LexicalRatioClassifier.classifyTweet(12, 11, 10) === SentimentLabel.Positive)
+  it should "be -1 with (11,12,10)" in assert(LexicalRatioClassifier.classifyTweet(11, 12, 10) === SentimentLabel.Negative)
+  it should "be 0 with (12,11,13)" in assert(LexicalRatioClassifier.classifyTweet(12, 11, 13) === SentimentLabel.Neutral)
+  it should "be 0 with (11,12,13)" in assert(LexicalRatioClassifier.classifyTweet(11, 12, 13) === SentimentLabel.Neutral)
 
 
   "classifyTweets" should "assign the correct label to a few tweets" in {
     map += "bad" -> new MPQAEntry("bad", "NEG", "strong")
     map += "fact" -> new MPQAEntry("fact", "NEU", "strong")
 
-    val tweets = List[Tweet](new Tweet("3", "tpryan", "good" :: features, "POS"),
-      new Tweet("4", "tpryan", "bad" :: features, "NEG"),
-      new Tweet("5", "tpryan", "fact" :: features, "NEU"))
+    val glTweets = List[Tweet](GoldLabeledTweet("3", "tpryan", "good" :: features, SentimentLabel.Positive),
+      GoldLabeledTweet("4", "tpryan", "bad" :: features, SentimentLabel.Negative),
+      GoldLabeledTweet("5", "tpryan", "fact" :: features, SentimentLabel.Neutral))
     val lexicon = new MPQALexicon(map)
 
-    LexicalRatioClassifier.classifyTweets(tweets, lexicon)
+    val slTweets = LexicalRatioClassifier.classifyTweets(glTweets, lexicon)
 
-    for (tweet: Tweet <- tweets) {
-      val gl = tweet.goldLabel match {
-        case "POS" => "1"
-        case "NEG" => "-1"
-        case "NEU" => "0"
-      }
-      assert(gl === tweet.systemLabel)
+    for (tweet: SystemLabeledTweet <- slTweets) {
+      assert(tweet.goldLabel === tweet.systemLabel)
     }
   }
 }

src/test/scala/updown/test/TweetFeatureReaderTest.scala

 
 import org.scalatest.FlatSpec
 import updown.data.io.TweetFeatureReader
-import updown.data.Tweet
+import updown.data.{GoldLabeledTweet, SentimentLabel, Tweet}
 
 class TweetFeatureReaderTest extends FlatSpec {
   val line = "3|tpryan|stellargirl,loooooooovvvvvveee,kindle2,dx,cool,2,fantastic,$ stellargirl,stellargirl i,i loooooooovvvvvveee,loooooooovvvvvveee my,my kindle2,kindle2 not,not that,that the,the dx,dx is,is cool,cool but,but the,the 2,2 is,is fantastic,fantastic in,in its,its own,own right,right $|1"
   "parseLine" should "produce the right Tweet obj" in {
     assert(TweetFeatureReader.parseLine(line)
       ===
-      new Tweet("3", "tpryan", features, "POS"))
+      GoldLabeledTweet("3", "tpryan", features, SentimentLabel.Positive))
   }
 
   // not testing standardize because it will be obviated when I refactor the label to use the SentimentLabel enum