Commits

vvcephei committed eb521b8

added support for n-fold experiments, and added a maxent experiment for it.

Comments (0)

Files changed (7)

src/main/scala/updown/app/NFoldExperiment.scala

+package updown.app
+
+import updown.data.io.TweetFeatureReader
+import updown.data.{SentimentLabel, GoldLabeledTweet}
+
+/**
+ * Shared scaffolding for n-fold experiments: splits the tweets read from a file
+ * into folds and renders evaluation results as a printable table.
+ */
+abstract class NFoldExperiment {
+  /**
+   * Reads the tweets in `inputFile` and assigns the tweet at position i to fold
+   * (i % nFolds).
+   *
+   * @param inputFile path handed to TweetFeatureReader
+   * @param nFolds    number of folds to distribute the tweets over
+   * @return an iterator with one (held-out tweets, tweets of all other folds) pair
+   *         per fold that received at least one tweet
+   */
+  def generateTrials(inputFile: String, nFolds: Int): Iterator[(List[GoldLabeledTweet], List[GoldLabeledTweet])] = {
+    // Pair every tweet with its position so the position can pick the fold.
+    val indexedTweets = TweetFeatureReader(inputFile).zipWithIndex
+    val foldsToTweets = indexedTweets
+      .groupBy { case (_, index) => index % nFolds }
+      .map { case (fold, members) => (fold, members.map { case (tweet, _) => tweet }) }
+      .toList
+
+    val trials = foldsToTweets.map { case (heldOutFold, heldOutData) =>
+      // Everything that is not in the held-out fold, concatenated fold by fold.
+      val remainingData = foldsToTweets
+        .filter { case (listFold, _) => listFold != heldOutFold }
+        .map { case (_, tweets) => tweets }
+        .flatten
+      (heldOutData, remainingData)
+    }
+    trials.iterator
+  }
+
+  /**
+   * Formats an (accuracy, per-label statistics) result as a fixed-width text table.
+   *
+   * @param resultTuple overall accuracy plus one (label, precision, recall, f-score)
+   *                    entry per label
+   * @return the report text, terminated by a newline
+   */
+  def reportResults(resultTuple: (Double, scala.List[(SentimentLabel.Type, Double, Double, Double)])): String = {
+    val (accuracy, labelResultsList) = resultTuple
+    val accuracyLine = "%12s%6.2f\n".format("Accuracy", accuracy)
+    val headerLine = "%12s%11s%8s%9s\n".format("Label", "Precision", "Recall", "F-Score")
+    val labelLines = labelResultsList.map { case (label, precision, recall, fScore) =>
+      "%12s%11.2f%8.2f%9.2f".format(SentimentLabel.toEnglishName(label), precision, recall, fScore)
+    }
+    "Results:\n" + accuracyLine + headerLine + labelLines.mkString("\n") + "\n"
+  }
+}

src/main/scala/updown/app/NFoldMaxentExperiement.scala

-package updown.app
-
-import org.clapper.argot.ArgotParser._
-import org.clapper.argot.{ArgotUsageException, ArgotParser}
-import java.io.{FileInputStream, DataInputStream}
-import opennlp.maxent.io.BinaryGISModelReader
-import updown.data.io.TweetFeatureReader._
-import updown.data.SentimentLabel
-import org.clapper.argot.ArgotConverters._
-import updown.data.io.TweetFeatureReader
-
-object NFoldMaxentExperiement {
-  // this exists purely to make the ArgotConverters appear used to IDEA
-  convertByte _
-
-  def main(args: Array[String]) {
-    val parser = new ArgotParser("updown run updown.app.PerTweetEvaluator", preUsage = Some("Updown"))
-    val goldInputFile = parser.option[String](List("g", "gold"), "gold", "gold labeled input")
-    val n = parser.option[Int](List("n", "folds"), "FOLDS", "the number of folds for the experiment (default 10)")
-
-    try {
-      parser.parse(args)
-    }
-    catch {
-      case e: ArgotUsageException => println(e.message); sys.exit(0)
-    }
-
-    val nFolds: Int = n.value.getOrElse(10)
-
-    if (goldInputFile.value == None) {
-      println("You must specify a gold labeled input file via -g.")
-      sys.exit(1)
-    }
-
-    val tweets = TweetFeatureReader(goldInputFile.value.get)
-
-  }
-}

src/main/scala/updown/app/NFoldMaxentExperiment.scala

+package updown.app
+
+import org.clapper.argot.{ArgotUsageException, ArgotParser}
+import org.clapper.argot.ArgotConverters._
+import updown.data.{SystemLabeledTweet, GoldLabeledTweet, SentimentLabel}
+
+/**
+ * Command-line n-fold experiment for the maxent classifier: for every fold a model
+ * is trained on the remaining folds, evaluated on the held-out fold, and the
+ * per-fold statistics are averaged and printed.
+ */
+object NFoldMaxentExperiment extends NFoldExperiment {
+  // this exists purely to make the ArgotConverters appear used to IDEA
+  convertByte _
+
+  /**
+   * Runs one train/evaluate cycle per trial produced by `generateTrials`.
+   *
+   * @param inputFile gold labeled input file
+   * @param nFolds    number of folds
+   * @return the `PerTweetEvaluator.getEvalStats` result of every trial, in trial order
+   */
+  def doExperiment(inputFile: String, nFolds: Int) = {
+    generateTrials(inputFile, nFolds).map { case (testSet, trainSet) =>
+      val model = TrainMaxentModel.trainWithGoldLabeledTweetIterator(trainSet.iterator)
+
+      // Label every held-out tweet with the model's best outcome, keeping its gold label.
+      PerTweetEvaluator.getEvalStats(testSet.map {
+        case GoldLabeledTweet(id, userid, features, goldLabel) =>
+          SystemLabeledTweet(id, userid, features, goldLabel,
+            SentimentLabel.figureItOut(model.getBestOutcome(model.eval(features.toArray))))
+      })
+    }.toList
+  }
+
+  /**
+   * Builds an all-zero accumulator with the same labels, in the same order, as `list`.
+   *
+   * @param list per-label statistics whose labels should be copied
+   * @return one (label, 0.0, 0.0, 0.0) entry per entry of `list`
+   */
+  def initializeAverageList(list: List[(updown.data.SentimentLabel.Type, Double, Double, Double)]): List[(updown.data.SentimentLabel.Type, Double, Double, Double)] =
+    list.map { case (label, _, _, _) => (label, 0.0, 0.0, 0.0) }
+
+  /**
+   * Adds the statistics in `list` position-wise onto the running totals in `to`.
+   * Entries of `to` beyond the length of `list` are dropped from the result.
+   *
+   * @param list per-label statistics to add
+   * @param to   running totals; must hold at least as many entries as `list`,
+   *             with matching labels at matching positions
+   * @return the position-wise sums, one entry per entry of `list`
+   * @throws IllegalArgumentException if `to` is shorter than `list`
+   */
+  def addAll(list: List[(updown.data.SentimentLabel.Type, Double, Double, Double)], to: List[(updown.data.SentimentLabel.Type, Double, Double, Double)]): List[(updown.data.SentimentLabel.Type, Double, Double, Double)] = {
+    // zip would silently truncate; fail loudly instead when totals are missing.
+    require(to.length >= list.length,
+      "cannot add %d label results onto only %d totals".format(list.length, to.length))
+    list.zip(to).map { case ((lLabel, lPrecision, lRecall, lFScore), (tLabel, tPrecision, tRecall, tFScore)) =>
+      assert(lLabel == tLabel)
+      (lLabel, lPrecision + tPrecision, lRecall + tRecall, lFScore + tFScore)
+    }
+  }
+
+  /**
+   * Divides every precision, recall and f-score in `list` by `by`.
+   *
+   * @param list per-label statistics
+   * @param by   divisor
+   * @return the scaled statistics, labels unchanged
+   */
+  def divideBy(list: List[(updown.data.SentimentLabel.Type, Double, Double, Double)], by: Double): List[(updown.data.SentimentLabel.Type, Double, Double, Double)] =
+    list.map { case (label, precision, recall, fScore) => (label, precision / by, recall / by, fScore / by) }
+
+  /**
+   * Averages accuracy and per-label statistics over all results.
+   *
+   * @param results one (accuracy, per-label statistics) entry per trial; the labels
+   *                of the first entry define the labels of the average
+   * @return the mean accuracy and the position-wise mean of the per-label statistics
+   * @throws IllegalArgumentException if `results` is empty
+   */
+  def averageResults(results: scala.List[(Double, scala.List[(SentimentLabel.Type, Double, Double, Double)])]): (Double, scala.List[(SentimentLabel.Type, Double, Double, Double)]) = {
+    require(results.nonEmpty, "cannot average an empty list of results")
+    val numResults = results.length
+    val accuracyTotal = results.map(_._1).sum
+    val labelTotals = results.foldLeft(initializeAverageList(results.head._2)) {
+      (totals, result) => addAll(result._2, totals)
+    }
+    (accuracyTotal / numResults, divideBy(labelTotals, numResults))
+  }
+
+  /**
+   * Entry point: parses `-g`/`--gold` (required input file) and `-n`/`--folds`
+   * (default 10), runs the experiment and prints the averaged report.
+   */
+  def main(args: Array[String]): Unit = {
+    // Program name shown in the usage message; it must name this application.
+    val parser = new ArgotParser("updown run updown.app.NFoldMaxentExperiment", preUsage = Some("Updown"))
+    val goldInputFile = parser.option[String](List("g", "gold"), "gold", "gold labeled input")
+    val n = parser.option[Int](List("n", "folds"), "FOLDS", "the number of folds for the experiment (default 10)")
+
+    try {
+      parser.parse(args)
+    }
+    catch {
+      case e: ArgotUsageException => println(e.message); sys.exit(0)
+    }
+
+    val nFolds: Int = n.value.getOrElse(10)
+
+    val inputFile = goldInputFile.value match {
+      case Some(path) => path
+      case None =>
+        println("You must specify a gold labeled input file via -g.")
+        sys.exit(1)
+    }
+
+    val results = doExperiment(inputFile, nFolds)
+    val averages = averageResults(results)
+    println("\n" + reportResults(averages))
+  }
+}

src/main/scala/updown/app/PerTweetEvaluator.scala

   // this exists purely to make the ArgotConverters appear used to IDEA
   convertByte _
 
+  // Ratio metrics over counts that callers pass as Doubles. accurracy, precision and
+  // recall are plain divisions, so a zero denominator is not guarded here.
+
+  /** Fraction of correct items: correct / total. */
+  val accurracy: (Double, Double) => Double =
+    (correct, total) => correct / total
+  /** Correctly labeled items divided by all items that were given the label. */
+  val precision: (Double, Double) => Double =
+    (numCorrectlyLabeled, totalNumLabeled) => numCorrectlyLabeled / totalNumLabeled
+  /** Correctly labeled items divided by all items that should have been given the label. */
+  val recall: (Double, Double) => Double =
+    (numCorrectlyLabeled, numberThatShouldHaveBeenLabeled) => numCorrectlyLabeled / numberThatShouldHaveBeenLabeled
+  /**
+   * Harmonic mean of precision and recall: 2 * p * r / (p + r).
+   * When both are 0.0 the quotient would be 0/0 (NaN), so 0.0 is returned instead;
+   * NaN inputs still produce NaN.
+   */
+  val fScore: (Double, Double) => Double =
+    (precision, recall) =>
+      if (precision + recall == 0.0) 0.0
+      else 2.0 * precision * recall / (precision + recall)
+
+  /**
+   * Computes overall accuracy plus precision, recall and f-score for every value
+   * of SentimentLabel.
+   *
+   * @param tweets tweets carrying both a gold label and a system label
+   * @return (accuracy, one (label, precision, recall, f-score) entry per label)
+   */
+  def getEvalStats(tweets: scala.List[SystemLabeledTweet]): (Double, List[(SentimentLabel.Type, Double, Double, Double)]) = {
+    val (correct, total, _, _) = tabulate(tweets)
+    val overallAccuracy = accurracy(correct, total.toDouble)
+
+    val perLabelStats = for (label <- SentimentLabel.values) yield {
+      val numLabeledBySystem = tweets.count((tweet) => tweet.systemLabel == label)
+      val numLabeledByGold = tweets.count((tweet) => tweet.goldLabel == label)
+      // Tweets where system and gold agree on this label: numerator of both ratios.
+      val numAgreeing = tweets.count((tweet) => tweet.systemLabel == label && tweet.goldLabel == label)
+      val labelPrecision = precision(numAgreeing, numLabeledBySystem)
+      val labelRecall = recall(numAgreeing, numLabeledByGold)
+      (label, labelPrecision, labelRecall, fScore(labelPrecision, labelRecall))
+    }
+
+    (overallAccuracy, perLabelStats.toList)
+  }
+
   def tabulate(tweets: scala.List[SystemLabeledTweet]): (Double, Int, Int, String) = {
     var correct = 0.0
     var total = 0

src/main/scala/updown/app/TrainMaxentModel.scala

 
 import org.clapper.argot._
 import ArgotConverters._
+import updown.data.GoldLabeledTweet
+import updown.data.io.TweetFeatureReader
 
 /**
  * Train a maxent model from labeled tweet input where each line has the format:
 
   def apply(fileName: String, iterations: Int, cutoff: Int): AbstractModel =
     GIS.trainModel(MaxentEventStreamFactory(fileName), iterations, cutoff)
+
   def apply(fileName: String): AbstractModel = apply(fileName, DEFAULT_ITERATIONS, DEFAULT_CUTOFF)
 
-  def apply(iterator: Iterator[String], iterations:Int, cutoff:Int): AbstractModel =
-    GIS.trainModel(MaxentEventStreamFactory(iterator), iterations, cutoff)
-  def apply(iterator: Iterator[String]): AbstractModel = apply(iterator, DEFAULT_ITERATIONS, DEFAULT_CUTOFF)
+  /**
+   * Trains a GIS model from an iterator of strings, which are turned into an event
+   * stream by MaxentEventStreamFactory.getWithStringIterator.
+   *
+   * @param iterator   source strings, one event per element
+   * @param iterations passed through to GIS.trainModel
+   * @param cutoff     passed through to GIS.trainModel
+   */
+  def trainWithStringIterator(iterator: Iterator[String], iterations: Int, cutoff: Int): AbstractModel = {
+    val events = MaxentEventStreamFactory.getWithStringIterator(iterator)
+    GIS.trainModel(events, iterations, cutoff)
+  }
+
+  //  def apply[String](iterator: Iterator[String]): AbstractModel = apply(iterator, DEFAULT_ITERATIONS, DEFAULT_CUTOFF)
+
+  /**
+   * Trains a GIS model from already-parsed gold labeled tweets.
+   *
+   * @param iterator   tweets to train on
+   * @param iterations passed through to GIS.trainModel
+   * @param cutoff     passed through to GIS.trainModel
+   */
+  def trainWithGoldLabeledTweetIterator(iterator: Iterator[GoldLabeledTweet], iterations: Int, cutoff: Int): AbstractModel = {
+    val events = MaxentEventStreamFactory.getWithGoldLabeledTweetIterator(iterator)
+    GIS.trainModel(events, iterations, cutoff)
+  }
+
+  /** Same as the three-argument overload, using DEFAULT_ITERATIONS and DEFAULT_CUTOFF. */
+  def trainWithGoldLabeledTweetIterator(iterator: Iterator[GoldLabeledTweet]): AbstractModel = {
+    trainWithGoldLabeledTweetIterator(iterator, DEFAULT_ITERATIONS, DEFAULT_CUTOFF)
+  }
 
   def main(args: Array[String]) {
     val parser = new ArgotParser("updown run updown.app.TrainMaxentModel", preUsage = Some("Updown"))

src/main/scala/updown/app/model/MaxentEventStreamFactory.scala

 import opennlp.maxent.{DataStream, BasicEventStream}
 import updown.data.io.TweetFeatureReader
 import opennlp.model.EventStream
-import updown.data.{GoldLabeledTweet, SystemLabeledTweet, SentimentLabel, Tweet}
+import updown.data._
 
 object MaxentEventStreamFactory {
   val DEFAULT_DELIMITER = ","
+
   def apply(fileName: String): EventStream = {
-    apply(scala.io.Source.fromFile(fileName).getLines)
+    getWithStringIterator(scala.io.Source.fromFile(fileName).getLines)
   }
 
-  def apply(iterator: Iterator[String]): EventStream = {
+  def getWithStringIterator(iterator: Iterator[String]): EventStream = {
     new BasicEventStream(new DataStream {
       def nextToken(): AnyRef = {
-        TweetFeatureReader.parseLine(iterator.next()) match {
-          case GoldLabeledTweet(tweetid, userid, features, label) =>
-            (features ::: (label::Nil)).mkString(DEFAULT_DELIMITER)
-          case _ =>
-            throw new RuntimeException("bad line")
-        }
+        val GoldLabeledTweet(tweetid, userid, features, label) = TweetFeatureReader.parseLine(iterator.next())
+        (features ::: (label :: Nil)).mkString(DEFAULT_DELIMITER)
       }
+
       def hasNext: Boolean = iterator.hasNext
     }, DEFAULT_DELIMITER)
   }
+
+  /**
+   * Wraps an iterator of gold labeled tweets in an EventStream. Each tweet becomes
+   * one token: its features followed by its label, joined with DEFAULT_DELIMITER.
+   *
+   * @param iterator tweets to expose; consumed lazily as tokens are requested
+   */
+  def getWithGoldLabeledTweetIterator(iterator: Iterator[GoldLabeledTweet]): EventStream = {
+    val tweetTokens: DataStream = new DataStream {
+      def hasNext: Boolean = iterator.hasNext
+
+      def nextToken(): AnyRef = {
+        // Only features and label go into the token; the ids are not used.
+        val GoldLabeledTweet(_, _, features, label) = iterator.next()
+        (features ::: (label :: Nil)).mkString(DEFAULT_DELIMITER)
+      }
+    }
+    new BasicEventStream(tweetTokens, DEFAULT_DELIMITER)
+  }
 }

src/main/scala/updown/data/SentimentLabel.scala

   val Positive = Value("1")
   val Neutral = Value("0")
   val Negative = Value("-1")
-  val Invalid = Value("-2")
+  val Abstained = Value("A")
   // this is the end of the enum definition. the rest of this object just demonstrates other
   //  stuff you can do.
 
   private val _POS_NAME = "positive"
   private val _NEG_NAME = "negative"
   private val _NEU_NAME = "neutral"
+  private val _ABS_NAME = "abstained"
 
-  def toEnglishName = {
-    this match {
+  def toEnglishName(label:SentimentLabel.Type) = {
+    label match {
       case Positive => _POS_NAME
       case Negative => _NEG_NAME
       case Neutral  => _NEU_NAME
+      case Abstained  => _ABS_NAME
     }
   }
 
       case `_POS_NAME` => Positive
       case `_NEG_NAME` => Negative
       case `_NEU_NAME` => Neutral
+      case `_ABS_NAME` => Abstained
     }
   }
 
   def figureItOut(name:String) = {
     name.toLowerCase match {
-      case "positive"|"pos"|"p"|"+"|"1" => Positive
-      case "negative"|"neg"|"-"|"-1" => Negative
-      case "neutral"|"neu"|"neut"|"0" => Neutral
+      case `_POS_NAME` |"pos"|"p"|"+"|"1" => Positive
+      case `_NEG_NAME`|"neg"|"-"|"-1" => Negative
+      case `_NEU_NAME`|"neu"|"neut"|"0" => Neutral
+      case `_ABS_NAME` => Abstained
     }
   }
 }
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.