Commits

vvcephei committed cc83346

working on topic experiments

Comments (0)

Files changed (22)

config/main/log4j.properties

-log4j.rootLogger=INFO, stderr
+log4j.rootLogger=TRACE, stderr
 log4j.appender.stderr=org.apache.log4j.ConsoleAppender
 log4j.appender.stderr.layout=org.apache.log4j.PatternLayout
 

lib/mallet.jar

Binary file modified.

src/main/scala/updown/app/TopicalChunker.scala

       summary.write("%s\n".format(model.getTopicPriors.zipWithIndex.map {
         case (a, b) => "Topic %s:%6.3f".format(b, a)
       }.mkString("\n")))
-      summary.write("%s\n".format(model.getTopicsPerTarget.toList.map {
+      summary.write("%s\n".format(model.getLabelsToTopicDist.toList.map {
         case (a, b) => "Label %9s:%s".format(SentimentLabel.toEnglishName(a), b.map {
           "%7.3f".format(_)
         }.mkString(""))
         }.mkString("\n")))
         index.write(("<div id=labelDistributions class=\"bordered table\">" +
           "<div class=\"labelDistribution row\"><span class=\"title cell\">topic</span><span class=\"values cell\"><span class=\"value\">  0</span><span class=\"value\">  1</span><span class=\"value\">  2</span></span></div>" +
-          "%s</div>\n").format(model.getTopicsPerTarget.toList.sortBy({
+          "%s</div>\n").format(model.getLabelsToTopicDist.toList.sortBy({
           case (a, b) => SentimentLabel.ordinality(a)
         }).map {
           case (a, b) => "<div class=\"labelDistribution row\"><span class=\"title cell\">Label %9s</span><span class=\"values cell\">%s</span></div>".format(SentimentLabel.toEnglishName(a), b.map {

src/main/scala/updown/app/experiment/ExperimentalResult.scala

 import updown.data.SentimentLabel
 
 case class ExperimentalResult(name: String, n: Int, accuracy: Double, classes: List[LabelResult]) {
-  def header: String = "\n%15s%5s%11s%8s%9s\n".format("Label", "N", "Precision", "Recall", "F-Score")
+  def header: String = "\n%15s%7s%9s%11s%8s%9s\n".format("Label", "NGold", "NSystem", "Precision", "Recall", "F-Score")
 
   override def toString(): String =
     "%s Results:\n".format(name) +
     val otherClassesMap = (other.classes.groupBy((labelResult) => labelResult.label).map((tup) => {
       val (k, (v: LabelResult) :: vs) = tup
       (k, v)
-    }).toMap).withDefaultValue(LabelResult(0, SentimentLabel.Abstained, 0.0, 0.0, 0.0))
+    }).toMap).withDefaultValue(LabelResult(0, 0, SentimentLabel.Abstained, 0.0, 0.0, 0.0))
     ExperimentalResult(name, n + other.n, accuracy + other.accuracy,
       (for ((label, classResult) <- classesMap.toList) yield classResult + otherClassesMap(label)).toList
     )
     val otherClassesMap = (other.classes.groupBy((labelResult) => labelResult.label).map((tup) => {
       val (k, (v: LabelResult) :: vs) = tup
       (k, v)
-    }).toMap).withDefaultValue(LabelResult(0, SentimentLabel.Abstained, 0.0, 0.0, 0.0))
+    }).toMap).withDefaultValue(LabelResult(0, 0, SentimentLabel.Abstained, 0.0, 0.0, 0.0))
     ExperimentalResult(name, n * other.n, accuracy * other.accuracy,
       (for ((label, classResult) <- classesMap.toList) yield classResult * otherClassesMap(label)).toList
     )
 }
 
 
-case class LabelResult(n: Int, label: SentimentLabel.Type, precision: Double, recall: Double, f: Double) {
-  override def toString(): String = "%15s%5d%11.2f%8.2f%9.2f".format(SentimentLabel.toEnglishName(label), n, precision, recall, f)
+case class LabelResult(nGold: Int, nSystem: Int, label: SentimentLabel.Type, precision: Double, recall: Double, f: Double) {
+  override def toString(): String = "%15s%7d%9d%11.2f%8.2f%9.2f".format(SentimentLabel.toEnglishName(label), nGold, nSystem, precision, recall, f)
 
   def +(other: LabelResult): LabelResult = {
     assert(label == other.label)
-    LabelResult(n + other.n, label, precision + other.precision, recall + other.recall, f + other.f)
+    LabelResult(nGold + other.nGold, nSystem + other.nSystem, label, precision + other.precision, recall + other.recall, f + other.f)
   }
 
   def *(other: LabelResult): LabelResult = {
     assert(label == other.label)
-    LabelResult(n * other.n, label, precision * other.precision, recall * other.recall, f * other.f)
+    LabelResult(nGold * other.nGold, nSystem * other.nSystem, label, precision * other.precision, recall * other.recall, f * other.f)
   }
 
-  def /(scalar: Double): LabelResult = LabelResult((n.toFloat / scalar).toInt, label, precision / scalar, recall / scalar, f / scalar)
+  def /(scalar: Double): LabelResult = LabelResult((nGold.toFloat / scalar).toInt, (nSystem.toFloat / scalar).toInt, label, precision / scalar, recall / scalar, f / scalar)
 
-  def *(scalar: Double): LabelResult = LabelResult((n.toFloat * scalar).toInt, label, precision * scalar, recall * scalar, f * scalar)
+  def *(scalar: Double): LabelResult = LabelResult((nGold.toFloat * scalar).toInt, (nSystem.toFloat * scalar).toInt, label, precision * scalar, recall * scalar, f * scalar)
 }

src/main/scala/updown/app/experiment/NFoldExperiment.scala

 package updown.app.experiment
 
 import updown.data.io.TweetFeatureReader
-import updown.data.{SentimentLabel, GoldLabeledTweet}
 import org.clapper.argot.ArgotParser._
 import org.clapper.argot.ArgotConverters._
 import com.weiglewilczek.slf4s.Logging
 import updown.util.Statistics
 import org.clapper.argot.{SingleValueOption, ArgotUsageException, ArgotParser}
+import updown.data.{SystemLabeledTweet, SentimentLabel, GoldLabeledTweet}
 
-abstract class NFoldExperiment extends Logging {
+abstract class NFoldExperiment extends Experiment {
   // this exists purely to make the ArgotConverters appear used to IDEA
   convertByte _
-  val parser = new ArgotParser(this.getClass.getName)
+//  val parser = new ArgotParser(this.getClass.getName)
     
   val goldInputFile = parser.option[String](List("g", "gold"), "gold", "gold labeled input")
   val n = parser.option[Int](List("n", "folds"), "FOLDS", "the number of folds for the experiment (default 10)")
   var experimentalRun = 0
 
-  def doExperiment(train: List[GoldLabeledTweet], test: List[GoldLabeledTweet]): ExperimentalResult
+  def doExperiment(train: List[GoldLabeledTweet], test: List[GoldLabeledTweet]): List[SystemLabeledTweet]
 
   def generateTrials(inputFile: String, nFolds: Int): Iterator[(List[GoldLabeledTweet], List[GoldLabeledTweet])] = {
     val polToTweetLists = TweetFeatureReader(inputFile).groupBy((tweet) => tweet.goldLabel)
 
     val minListLength = (for ((pol, tweetList) <- polToTweetLists) yield tweetList.length).min
-    logger.info("takining %d items from each polarity class. This was the minimum number in any class".format(minListLength))
+    logger.info("taking %d items from each polarity class. This was the minimum number in any class".format(minListLength))
     val allTweetsFolded =
       (for (index <- 0 until minListLength) yield {
         (for ((pol, tweetList) <- polToTweetLists) yield {
           logger.debug("starting run " + experimentalRun)
           val result = doExperiment(trainSet, testSet)
           logger.debug("ending run " + experimentalRun)
+          logger.info("Intermediate:")
+          report(result)
           result
         }).toList
 
-      logger.info("intermediate results:\n" + results.mkString("\n"))
-      println("\n" + Statistics.averageResults("%d-fold Average".format(nFolds), results).toString)
+      val result = results.flatten
+      logger.info("Final Result:")
+      report(result)
+//      println("\n" + Statistics.averageResults("%d-fold Average".format(nFolds), results).toString)
       logger.debug("running cleanup code")
     }
     catch {

src/main/scala/updown/app/experiment/maxent/NFoldMaxentExperiment.scala

     val model = TrainMaxentModel.trainWithGoldLabeledTweetIterator(trainSet.iterator)
 
     logger.debug("testing model")
-    val res = Statistics.getEvalStats("Maxent",for (tweet <- testSet) yield {
+    val res = (for (tweet <- testSet) yield {
       tweet match {
         case GoldLabeledTweet(id, userid, features, goldLabel) =>
           SystemLabeledTweet(id, userid, features, goldLabel,
             SentimentLabel.figureItOut(model.getBestOutcome(model.eval(features.toArray))))
       }
-    })
+    }).toList
     logger.info(res.toString)
     res
   }

src/main/scala/updown/app/experiment/topic/NFoldMajorityTopicExperiment.scala

-package updown.app.experiment.topic
-
-import updown.data.{SystemLabeledTweet, GoldLabeledTweet, SentimentLabel}
-import updown.util.{Statistics, LDATopicModel, TopicModel}
-import updown.app.experiment.ExperimentalResult
-
-object NFoldMajorityTopicExperiment extends NFoldTopicExperiment {
-
-  def label(model: TopicModel, tweet: GoldLabeledTweet, goodTopic: Int, badTopic: Int): SystemLabeledTweet = {
-    val GoldLabeledTweet(id, userid, features, goldLabel) = tweet
-    val topicDistribution = model.inferTopics(tweet)
-    val sortedDist = topicDistribution.zipWithIndex.sortBy((i) => 1.0 - i._1).map((i) => i._2)
-    val chosenTopic = topicDistribution.indexOf(topicDistribution.max)
-
-    SystemLabeledTweet(id, userid, features, goldLabel,
-      if (chosenTopic == goodTopic) SentimentLabel.Positive
-      else if (chosenTopic == badTopic) SentimentLabel.Negative
-      else SentimentLabel.Neutral
-    )
-  }
-
-  def evaluate(model: TopicModel, testSet: scala.List[GoldLabeledTweet]) = {
-    val labelToTopicDist = model.getTopicsPerTarget
-
-    //This approach will only work if there is a very clear sentiment-topic correlation.
-    val badTopic = labelToTopicDist(SentimentLabel.Negative).indexOf(labelToTopicDist(SentimentLabel.Negative).max)
-    val goodTopic = labelToTopicDist(SentimentLabel.Positive).indexOf(labelToTopicDist(SentimentLabel.Positive).max)
-    val neutralTopic = if (labelToTopicDist.contains(SentimentLabel.Neutral)) labelToTopicDist(SentimentLabel.Neutral).indexOf(labelToTopicDist(SentimentLabel.Neutral).max) else -1
-    logger.info("goodTopic:%d badTopic:%d neutralTopic:%d".format(goodTopic, badTopic, neutralTopic))
-
-    if (goodTopic == badTopic){
-      logger.error("Patholological distribution. No clear topics for bad/good labels. Exiting...")
-      System.exit(1)
-    } else if (neutralTopic != -1 && (badTopic == neutralTopic | goodTopic == neutralTopic)) {
-      logger.warn("No clear distribution for the neutral label. ")
-    }
-
-    val res = Statistics.getEvalStats("Majority Topic",for (tweet <- testSet) yield {
-      label(model, tweet, goodTopic, badTopic)
-    })
-    logger.info(res.toString)
-    res
-  }
-}

src/main/scala/updown/app/experiment/topic/NFoldSimilarityTopicExperiment.scala

-package updown.app.experiment.topic
-
-import updown.data.{SystemLabeledTweet, GoldLabeledTweet, SentimentLabel}
-import updown.util.{Statistics, TopicModel}
-
-object NFoldSimilarityTopicExperiment extends NFoldTopicExperiment {
-
-  def labelNoop(model: TopicModel, tweet: GoldLabeledTweet, labelToTopicDist: Map[SentimentLabel.Type,List[Double]]): SystemLabeledTweet = {
-    val topicDistribution = model.inferTopics(tweet)
-    val similarities = (for ((k,v) <- labelToTopicDist) yield (Statistics.cosineSimilarity(topicDistribution, v), k)).toList.sorted.reverse
-    SentimentLabel.unitSentiment(similarities(0)._2)
-    val GoldLabeledTweet(id, userid, features, goldLabel) = tweet
-    SystemLabeledTweet(id, userid, features, goldLabel,SentimentLabel.Abstained)
-  }
-
-  def label(model: TopicModel, tweet: GoldLabeledTweet, labelToTopicDist: Map[SentimentLabel.Type,List[Double]]): SystemLabeledTweet = {
-    val topicDistribution = model.inferTopics(tweet)
-    val similarities = (for ((k,v) <- labelToTopicDist) yield (Statistics.cosineSimilarity(topicDistribution, v), k)).toList.sorted.reverse
-    val GoldLabeledTweet(id, userid, features, goldLabel) = tweet
-    SystemLabeledTweet(id, userid, features, goldLabel,SentimentLabel.unitSentiment(similarities(0)._2))
-  }
-
-  def evaluate(model: TopicModel, testSet: scala.List[GoldLabeledTweet]) = {
-    logger.debug("entering evaluation with %d items in the test set".format(testSet.length))
-    val topicsPerTarget: Map[SentimentLabel.Type, List[Double]] = model.getTopicsPerTarget
-    val start = System.currentTimeMillis()
-    val res = Statistics.getEvalStats("Similarity Topic",for ((tweet,i) <- testSet.zipWithIndex) yield {
-      if (i%100 == 0) {
-        logger.debug("%.0f%% remaining; average label time = %fs".format((1.0-(i+1).toDouble/testSet.length.toDouble)*100, (System.currentTimeMillis()-start).toDouble/(i+1.0) /1000.0))
-      }
-      label(model, tweet, topicsPerTarget)
-    })
-    logger.info(res.toString)
-    res
-  }
-}

src/main/scala/updown/app/experiment/topic/NFoldTopicExperiment.scala

 import java.io.{FileWriter, BufferedWriter, File}
 import updown.util.{WordleUtils, Statistics, LDATopicModel, TopicModel}
 import updown.app.experiment.{ExperimentalResult, NFoldExperiment}
+import java.util.Arrays
 
 abstract class NFoldTopicExperiment extends NFoldExperiment {
   var iterations = 1000
   val wordleConfigOption = parser.option[String](List("wordleConfig"), "PATH", ("the path to the config file for IBM's " +
     "word cloud generator (default %s)").format(WordleUtils.defaultConfigurationPath))
 
-  def evaluate(model: TopicModel, testSet: scala.List[GoldLabeledTweet]): ExperimentalResult
+  def evaluate(model: TopicModel, testSet: scala.List[GoldLabeledTweet]): List[SystemLabeledTweet]
 
   def doOutput(model: TopicModel) {
     if (outputOption.value.isDefined) {
       summary.write("%s\n".format(model.getTopicPriors.zipWithIndex.map {
         case (a, b) => "Topic %s:%6.3f".format(b, a)
       }.mkString("\n")))
-      summary.write("%s\n".format(model.getTopicsPerTarget.toList.map {
+      summary.write("%s\n".format(model.getLabelsToTopicDist.toList.map {
         case (a, b) => "Label %9s:%s".format(SentimentLabel.toEnglishName(a), b.map {
           "%7.3f".format(_)
         }.mkString(""))
         }.mkString("\n")))
         index.write(("<div id=labelDistributions class=\"bordered table\">" +
           "<div class=\"labelDistribution row\"><span class=\"title cell\">topic</span><span class=\"values cell\"><span class=\"value\">  0</span><span class=\"value\">  1</span><span class=\"value\">  2</span></span></div>" +
-          "%s</div>\n").format(model.getTopicsPerTarget.toList.sortBy({case(a,b)=>SentimentLabel.ordinality(a)}).map {
+          "%s</div>\n").format(model.getLabelsToTopicDist.toList.sortBy({case(a,b)=>SentimentLabel.ordinality(a)}).map {
           case (a, b) => "<div class=\"labelDistribution row\"><span class=\"title cell\">Label %9s</span><span class=\"values cell\">%s</span></div>".format(SentimentLabel.toEnglishName(a), b.map {
             "<span class=value>%7.3f</span>".format(_)
           }.mkString(""))
 
     logger.debug("alphaSum: " + alphaSum)
     val model: TopicModel = new LDATopicModel(trainSet, numTopics, iterations, alphaSum, beta)
-    logger.debug("topic distribution:\n     :" + model.getTopicPriors)
+    logger.debug("topic distribution:\n     :" + Arrays.toString(model.getTopicPriors))
     logger.debug({
-      val labelToTopicDist = model.getTopicsPerTarget
-      "topic distribution over labels:\n" + (for ((k, v) <- labelToTopicDist) yield "%5s:%s".format(k, v)).mkString("\n")
+      val labelToTopicDist = model.getLabelsToTopicDist
+      "topic distribution over labels:\n" + (for ((k, v) <- labelToTopicDist) yield "%5s:%s".format(k, Arrays.toString(v))).mkString("\n")
     })
     logger.debug({
       val topics = model.getTopics

src/main/scala/updown/app/experiment/topic/lda/NFoldDiscriminantLDAExperiment.scala

+package updown.app.experiment.topic.lda
+
+import updown.data.{SystemLabeledTweet, GoldLabeledTweet, SentimentLabel}
+import updown.util.TopicModel
+import scala.Array
+import updown.app.experiment.topic.NFoldTopicExperiment
+import updown.app.experiment.topic.util.MaxentDiscriminant
+
+object NFoldDiscriminantLDAExperiment extends NFoldTopicExperiment with MaxentDiscriminant {
+
+  def label(model: TopicModel, tweet: GoldLabeledTweet, discriminantFn: (Array[Float]) => (String, String)): SystemLabeledTweet = {
+    val topicDist: Array[Float] = model.inferTopics(tweet).map((item) => item.asInstanceOf[Float])
+    val (label: String, outcomes: String) = discriminantFn(topicDist)
+
+    val GoldLabeledTweet(id, userid, features, goldLabel) = tweet
+    logger.trace("labeling id:%s with label:%s from outcomes:%s".format(id, label.toString, outcomes))
+    SystemLabeledTweet(id, userid, features, goldLabel, SentimentLabel.figureItOut(label))
+  }
+
+
+  def evaluate(model: TopicModel, testSet: scala.List[GoldLabeledTweet]) = {
+    logger.debug("entering evaluation with %d items in the test set".format(testSet.length))
+    val labelsToTopicDists: Map[SentimentLabel.Type, List[Array[Double]]] = model.getLabelsToTopicDists
+    val discriminantFn = getDiscriminantFn(labelsToTopicDists)
+    val start = System.currentTimeMillis()
+
+    val res = (for ((tweet, i) <- testSet.zipWithIndex) yield {
+      if (i % 100 == 0) {
+        logger.debug("%.0f%% remaining; average label time = %fs".format((1.0 - (i + 1).toDouble / testSet.length.toDouble) * 100, (System.currentTimeMillis() - start).toDouble / (i + 1.0) / 1000.0))
+      }
+      label(model, tweet, discriminantFn)
+    }).toList
+    res
+  }
+}

src/main/scala/updown/app/experiment/topic/lda/NFoldMajorityTopicExperiment.scala

+package updown.app.experiment.topic.lda
+
+import updown.data.{SystemLabeledTweet, GoldLabeledTweet, SentimentLabel}
+import updown.util.TopicModel
+import updown.app.experiment.topic.NFoldTopicExperiment
+
+object NFoldMajorityTopicExperiment extends NFoldTopicExperiment {
+
+  def label(model: TopicModel, tweet: GoldLabeledTweet, goodTopic: Int, badTopic: Int): SystemLabeledTweet = {
+    val GoldLabeledTweet(id, userid, features, goldLabel) = tweet
+    val topicDistribution = model.inferTopics(tweet)
+    val sortedDist = topicDistribution.zipWithIndex.sortBy((i) => 1.0 - i._1).map((i) => i._2)
+    val chosenTopic = topicDistribution.indexOf(topicDistribution.max)
+
+    SystemLabeledTweet(id, userid, features, goldLabel,
+      if (chosenTopic == goodTopic) SentimentLabel.Positive
+      else if (chosenTopic == badTopic) SentimentLabel.Negative
+      else SentimentLabel.Neutral
+    )
+  }
+
+  def evaluate(model: TopicModel, testSet: scala.List[GoldLabeledTweet]) = {
+    val labelToTopicDist = model.getLabelsToTopicDist
+
+    //This approach will only work if there is a very clear sentiment-topic correlation.
+    val badTopic = labelToTopicDist(SentimentLabel.Negative).indexOf(labelToTopicDist(SentimentLabel.Negative).max)
+    val goodTopic = labelToTopicDist(SentimentLabel.Positive).indexOf(labelToTopicDist(SentimentLabel.Positive).max)
+    val neutralTopic = if (labelToTopicDist.contains(SentimentLabel.Neutral)) labelToTopicDist(SentimentLabel.Neutral).indexOf(labelToTopicDist(SentimentLabel.Neutral).max) else -1
+    logger.info("goodTopic:%d badTopic:%d neutralTopic:%d".format(goodTopic, badTopic, neutralTopic))
+
+    if (goodTopic == badTopic){
+      logger.error("Pathological distribution. No clear topics for bad/good labels. Exiting...")
+      System.exit(1)
+    } else if (neutralTopic != -1 && (badTopic == neutralTopic | goodTopic == neutralTopic)) {
+      logger.warn("No clear distribution for the neutral label. ")
+    }
+
+    val res = (for (tweet <- testSet) yield {
+      label(model, tweet, goodTopic, badTopic)
+    }).toList
+//    logger.info(res.toString)
+    res
+  }
+}

src/main/scala/updown/app/experiment/topic/lda/NFoldSimilarityTopicExperiment.scala

+package updown.app.experiment.topic.lda
+
+import updown.data.{SystemLabeledTweet, GoldLabeledTweet, SentimentLabel}
+import updown.util.{Statistics, TopicModel}
+import updown.app.experiment.topic.NFoldTopicExperiment
+
+object NFoldSimilarityTopicExperiment extends NFoldTopicExperiment {
+  def label(model: TopicModel, tweet: GoldLabeledTweet, labelToTopicDist: Map[SentimentLabel.Type,Array[Double]]): SystemLabeledTweet = {
+    val topicDistribution = model.inferTopics(tweet)
+    val similarities = (for ((k,v) <- labelToTopicDist) yield (Statistics.arrayCosineSimilarity(topicDistribution, v), k)).toList.sorted.reverse
+    val GoldLabeledTweet(id, userid, features, goldLabel) = tweet
+    SystemLabeledTweet(id, userid, features, goldLabel,SentimentLabel.unitSentiment(similarities(0)._2))
+  }
+
+  def evaluate(model: TopicModel, testSet: scala.List[GoldLabeledTweet]) = {
+    logger.debug("entering evaluation with %d items in the test set".format(testSet.length))
+    val start = System.currentTimeMillis()
+    val labelsToTopicDist: Map[SentimentLabel.Type, Array[Double]] = model.getLabelsToTopicDist
+    val res = (for ((tweet,i) <- testSet.zipWithIndex) yield {
+      if (i%100 == 0) {
+        logger.debug("%.0f%% remaining; average label time = %fs".format((1.0-(i+1).toDouble/testSet.length.toDouble)*100, (System.currentTimeMillis()-start).toDouble/(i+1.0) /1000.0))
+      }
+      label(model, tweet, labelsToTopicDist)
+    }).toList
+//    logger.info(res.toString)
+    res
+  }
+}

src/main/scala/updown/app/experiment/topic/pam/NFoldDiscriminantPAMExperiment.scala

+package updown.app.experiment.topic.pam
+
+import updown.data.{SystemLabeledTweet, GoldLabeledTweet, SentimentLabel}
+import updown.util.TopicModel
+import updown.app.experiment.topic.util.MaxentDiscriminant
+import java.util.Arrays
+
+object NFoldDiscriminantPAMExperiment extends NFoldPAMExperiment with MaxentDiscriminant {
+  def label(model: TopicModel, tweet: GoldLabeledTweet, discriminantFn: (Array[Float]) => (String, String)): SystemLabeledTweet = {
+    val topicDist: Array[Float] = model.inferTopics(tweet).map((item) => item.asInstanceOf[Float])
+    val (label: String, outcomes: String) = discriminantFn(topicDist)
+
+    val GoldLabeledTweet(id, userid, features, goldLabel) = tweet
+    logger.trace("labeling id:%s gold:%s with label:%s from outcomes:%s".format(id, goldLabel.toString, label.toString, outcomes))
+    SystemLabeledTweet(id, userid, features, goldLabel, SentimentLabel.figureItOut(label))
+  }
+
+  def evaluate(model: TopicModel, testSet: scala.List[GoldLabeledTweet]) = {
+    logger.debug("entering evaluation with %d items in the test set".format(testSet.length))
+    val labelsToTopicDists: Map[SentimentLabel.Type, List[Array[Double]]] = model.getLabelsToTopicDists
+    logger.debug({
+      val tmp = model.getLabelsToTopicDist
+      "Average distributions:\n"+(for ((label,dist)<- tmp) yield {
+        "\t"+label.toString + ": "+Arrays.toString(dist)
+      }).mkString("\n")
+    })
+    val discriminantFn = getDiscriminantFn(labelsToTopicDists)
+
+    val start = System.currentTimeMillis()
+
+    (for ((tweet, i) <- testSet.zipWithIndex) yield {
+      if (i % 100 == 0) {
+        logger.debug("%.0f%% remaining; average label time = %fs".format((1.0 - (i + 1).toDouble / testSet.length.toDouble) * 100, (System.currentTimeMillis() - start).toDouble / (i + 1.0) / 1000.0))
+      }
+      label(model, tweet, discriminantFn)
+    }).toList
+  }
+}

src/main/scala/updown/app/experiment/topic/pam/NFoldPAMExperiment.scala

 package updown.app.experiment.topic.pam
 
-import updown.data.{GoldLabeledTweet, SentimentLabel}
 import org.clapper.argot.ArgotParser._
 import org.clapper.argot.ArgotConverters._
 import java.io.{FileWriter, BufferedWriter, File}
 import updown.util.{HPAMTopicModel, WordleUtils, LDATopicModel, TopicModel}
 import updown.app.experiment.{LabelResult, ExperimentalResult, NFoldExperiment}
+import updown.data.{SystemLabeledTweet, GoldLabeledTweet, SentimentLabel}
 
 abstract class NFoldPAMExperiment extends NFoldExperiment {
   var iterations = 1000
 //  var alpha = 30
 //  var beta = 0.1
-  var numTopics = 20
+  var numSuperTopics = 3
+  var numSubTopics = 20
   val fileSeparator = System.getProperty("file.separator")
 
   var childProcesses = List[Process]()
   val iterationOption = parser.option[Int](List("iterations"), "INT", "the number of iterations for the training the topicModel")
 //  val alphaOption = parser.option[Int](List("alpha"), "INT", "the symmetric alpha hyperparameter for LDA")
 //  val betaOption = parser.option[Double](List("beta"), "DOUBLE", "the symmetric beta hyperparameter for LDA")
-  val numTopicsOption = parser.option[Int](List("numTopics"), "INT", "the number of topics for LDA")
+  val numSuperTopicsOption = parser.option[Int](List("numSuperTopics"), "INT", "the number of supertopics for PAM")
+  val numSubTopicsOption = parser.option[Int](List("numSubTopics"), "INT", "the number of subtopics for PAM")
 
   val outputOption = parser.option[String](List("o", "output"), "DIR", "the directory to dump topics into")
   val wordleOption = parser.flag[Boolean](List("w", "wordle"), "generate wordles for the topics (requires -o DIR) " +
   val wordleConfigOption = parser.option[String](List("wordleConfig"), "PATH", ("the path to the config file for IBM's " +
     "word cloud generator (default %s)").format(WordleUtils.defaultConfigurationPath))
 
-  def evaluate(model: TopicModel, testSet: scala.List[GoldLabeledTweet]): ExperimentalResult
+  def evaluate(model: TopicModel, testSet: scala.List[GoldLabeledTweet]): List[SystemLabeledTweet]
 
   def doOutput(model: TopicModel) {
     if (outputOption.value.isDefined) {
       summary.write("%s\n".format(model.getTopicPriors.zipWithIndex.map {
         case (a, b) => "Topic %s:%6.3f".format(b, a)
       }.mkString("\n")))
-      summary.write("%s\n".format(model.getTopicsPerTarget.toList.map {
+      summary.write("%s\n".format(model.getLabelsToTopicDist.toList.map {
         case (a, b) => "Label %9s:%s".format(SentimentLabel.toEnglishName(a), b.map {
           "%7.3f".format(_)
         }.mkString(""))
         }.mkString("\n")))
         index.write(("<div id=labelDistributions class=\"bordered table\">" +
           "<div class=\"labelDistribution row\"><span class=\"title cell\">topic</span><span class=\"values cell\"><span class=\"value\">  0</span><span class=\"value\">  1</span><span class=\"value\">  2</span></span></div>" +
-          "%s</div>\n").format(model.getTopicsPerTarget.toList.sortBy({case(a,b)=>SentimentLabel.ordinality(a)}).map {
+          "%s</div>\n").format(model.getLabelsToTopicDist.toList.sortBy({case(a,b)=>SentimentLabel.ordinality(a)}).map {
           case (a, b) => "<div class=\"labelDistribution row\"><span class=\"title cell\">Label %9s</span><span class=\"values cell\">%s</span></div>".format(SentimentLabel.toEnglishName(a), b.map {
             "<span class=value>%7.3f</span>".format(_)
           }.mkString(""))
     if (iterationOption.value.isDefined) {
       iterations = iterationOption.value.get
     }
-    /*if (alphaOption.value.isDefined) {
-      alpha = alphaOption.value.get
+    if (numSuperTopicsOption.value.isDefined) {
+      numSuperTopics = numSuperTopicsOption.value.get
     }
-    if (betaOption.value.isDefined) {
-      beta = betaOption.value.get
-    }*/
-    if (numTopicsOption.value.isDefined) {
-      numTopics = numTopicsOption.value.get
+    if (numSubTopicsOption.value.isDefined) {
+      numSubTopics = numSubTopicsOption.value.get
     }
-
-    // Thanks to a bug in Mallet, we have to cap alphaSum
-//    val alphaSum = 300 min (alpha * numTopics)
-
-
-//    logger.debug("alphaSum: " + alphaSum)
-    val model: TopicModel = new HPAMTopicModel(trainSet, numTopics, iterations/*, alphaSum, beta*/)
+    val model: TopicModel = new HPAMTopicModel(trainSet, numSuperTopics, numSubTopics, iterations)
     logger.info("topicString:\n"+model.toString)
-    /*logger.debug("topic distribution:\n     :" + model.getTopicPriors)
-    logger.debug({
-      val labelToTopicDist = model.getTopicsPerTarget
-      "topic distribution over labels:\n" + (for ((k, v) <- labelToTopicDist) yield "%5s:%s".format(k, v)).mkString("\n")
-    })
-    logger.debug({
-      val topics = model.getTopics
-      "topic distributions\n" +
-        (for (i <- 0 until 3) yield "%5s: Topic(%s,%s)".format(i, topics(i).prior, topics(i).distribution.toList.sortBy((pair) => (1 - pair._2)).take(10))).mkString("\n")
-    })
-    doOutput(model)
-    evaluate(model, testSet)*/
-    ExperimentalResult("dummy",0,0.0,List[LabelResult]())
+    
+    evaluate(model, testSet)
   }
 
   def after(): Int = {

src/main/scala/updown/app/experiment/topic/pam/NFoldSimilarityPAMExperiment.scala

 import updown.app.experiment.topic.NFoldTopicExperiment
 
 object NFoldSimilarityPAMExperiment extends NFoldPAMExperiment {
-  def label(model: TopicModel, tweet: GoldLabeledTweet, labelToTopicDist: Map[SentimentLabel.Type,List[Double]]): SystemLabeledTweet = {
+  def label(model: TopicModel, tweet: GoldLabeledTweet, labelToTopicDist: Map[SentimentLabel.Type, Array[Double]]): SystemLabeledTweet = {
     val topicDistribution = model.inferTopics(tweet)
-    val similarities = (for ((k,v) <- labelToTopicDist) yield (Statistics.cosineSimilarity(topicDistribution, v), k)).toList.sorted.reverse
+    val similarities = (for ((k, v) <- labelToTopicDist) yield (Statistics.arrayCosineSimilarity(topicDistribution, v), k)).toList.sorted.reverse
     val GoldLabeledTweet(id, userid, features, goldLabel) = tweet
-    SystemLabeledTweet(id, userid, features, goldLabel,SentimentLabel.unitSentiment(similarities(0)._2))
+    val res =
+      similarities match {
+        case (sim, label) :: _ =>
+          SystemLabeledTweet(id, userid, features, goldLabel, SentimentLabel.unitSentiment(label))
+        case Nil =>
+          SystemLabeledTweet(id, userid, features, goldLabel, SentimentLabel.Abstained)
+      }
+    logger.trace(res.toString)
+    res
   }
 
   def evaluate(model: TopicModel, testSet: scala.List[GoldLabeledTweet]) = {
+    model.getTopicPriors
     logger.debug("entering evaluation with %d items in the test set".format(testSet.length))
-    val topicsPerTarget: Map[SentimentLabel.Type, List[Double]] = model.getTopicsPerTarget
+    val labelToTopicVector: Map[SentimentLabel.Type, Array[Double]] = model.getLabelsToTopicDist
+    logger.debug("labelToTopicVector:" + labelToTopicVector.toString)
     val start = System.currentTimeMillis()
-    val res = Statistics.getEvalStats("Similarity Topic",for ((tweet,i) <- testSet.zipWithIndex) yield {
-      if (i%100 == 0) {
-        logger.debug("%.0f%% remaining; average label time = %fs".format((1.0-(i+1).toDouble/testSet.length.toDouble)*100, (System.currentTimeMillis()-start).toDouble/(i+1.0) /1000.0))
+    var labeledTestSet = List[SystemLabeledTweet]()
+    for ((tweet, i) <- testSet.zipWithIndex) {
+      if (i % 100 == 0) {
+        logger.debug("%.0f%% remaining; average label time = %fs".format((1.0 - (i + 1).toDouble / testSet.length.toDouble) * 100, (System.currentTimeMillis() - start).toDouble / (i + 1.0) / 1000.0))
       }
-      label(model, tweet, topicsPerTarget)
-    })
+      if (i % 1000 == 0) {
+        logger.debug("results so far:\n" + Statistics.getEvalStats("intermediate results", labeledTestSet))
+      }
+      labeledTestSet = label(model, tweet, labelToTopicVector) :: labeledTestSet
+    }
+    /*val res = Statistics.getEvalStats("Similarity Topic", labeledTestSet)
     logger.info(res.toString)
-    res
+    res*/
+    labeledTestSet
   }
 }

src/main/scala/updown/app/experiment/topic/util/MaxentDiscriminant.scala

+package updown.app.experiment.topic.util
+
+import updown.data.SentimentLabel
+import opennlp.maxent.GIS
+import opennlp.model.DataIndexer
+
+/**
+ * Trains a maxent (GIS) classifier over per-instance topic distributions and
+ * exposes it as a discriminant function from a topic vector to a sentiment outcome.
+ */
+trait MaxentDiscriminant {
+
+  /**
+   * Builds a discriminant function from label -> topic-vector training data.
+   *
+   * @param labelsToTopicDists for each sentiment label, the topic vectors of the
+   *                           training instances carrying that label
+   * @return a function mapping a topic vector to
+   *         (best outcome name, formatted string of all outcomes with weights)
+   */
+  def getDiscriminantFn(labelsToTopicDists: Map[SentimentLabel.Type, scala.List[Array[Double]]]): (Array[Float]) => (String, String) = {
+    // Adapt the training data to OpenNLP's DataIndexer interface so GIS.trainModel
+    // can consume real-valued "features" (the topic proportions) directly.
+    val discriminantModel = GIS.trainModel(1000, new DataIndexer {
+      private var _total = 0
+      for ((_, list) <- labelsToTopicDists) {
+        _total += list.length
+      }
+      // Topic-vector length, taken from the first instance of the first label.
+      // NOTE(review): assumes the map and its first list are non-empty — confirm callers guarantee this.
+      private val _dimensions = labelsToTopicDists.toList(0)._2(0).size //yikes!
+
+      private val _labels: Array[String] = labelsToTopicDists.keys.toList.map(l => l.toString).toArray
+      // Every event mentions every predicate (topic index), so each context row is 0..dims-1.
+      private val _contexts: Array[Array[Int]] = {
+        val result = Array.ofDim[Int](_total, _dimensions)
+        for (i <- 0 until _total) {
+          for (j <- 0 until _dimensions) {
+            result(i)(j) = j
+          }
+        }
+        result
+      }
+      // Predicate labels are simply the stringified topic indices.
+      private val _predLabels: Array[String] = {
+        val result = Array.ofDim[String](_dimensions)
+        for (j <- 0 until _dimensions) {
+          result(j) = j.toString
+        }
+        result
+      }
+      private val _predCounts: Array[Int] = Array.fill[Int](_dimensions)(1)
+
+      // Flatten the label->instances map into the parallel per-event arrays
+      // (seen-count, outcome index, real feature values) that GIS expects.
+      private val (_eventCounts: Array[Int], _eventOutcomes: Array[Int], _eventValues: Array[Array[Float]]) = {
+        var eventCounts = List[Int]()
+        var eventOutcomes = List[Int]()
+        var eventValues = List[Array[Float]]()
+        for ((label, labelIndex) <- _labels.zipWithIndex) {
+          val events = labelsToTopicDists(SentimentLabel.figureItOut(label))
+          for (event <- events) {
+            eventCounts = 1 :: eventCounts
+            eventOutcomes = labelIndex :: eventOutcomes
+            eventValues = event.map(d => d.toFloat) :: eventValues
+          }
+        }
+        (eventCounts.toArray, eventOutcomes.toArray, eventValues.toArray)
+      }
+
+      def getContexts = _contexts
+
+      def getPredLabels = _predLabels
+
+      def getPredCounts = _predCounts
+
+      def getNumTimesEventsSeen = _eventCounts
+
+      def getOutcomeList = _eventOutcomes
+
+      def getOutcomeLabels = _labels
+
+      def getValues = _eventValues
+
+      def getNumEvents = _eventCounts.size
+    })
+
+    // The returned closure rebuilds the predicate-name array (topic indices as
+    // strings) for each query and evaluates the trained model on the topic vector.
+    (topicDist: Array[Float]) => {
+      val weights = discriminantModel.eval({
+        val result = Array.ofDim[String](topicDist.size)
+        for (j <- 0 until topicDist.size) {
+          result(j) = j.toString
+        }
+        result
+      }, topicDist)
+      (discriminantModel.getBestOutcome(weights), discriminantModel.getAllOutcomes(weights))
+    }
+  }
+}

src/main/scala/updown/preproc/impl/RePreprocToLDATopicVectors.scala

+package updown.preproc.impl
+
+import java.io.File
+import updown.preproc.GenericPreprocessor
+import updown.data.io.TweetFeatureReader
+import updown.data.{GoldLabeledTweet, SentimentLabel}
+import updown.util.{TopicModel, LDATopicModel}
+import org.clapper.argot.ArgotConverters._
+
+/**
+ * This preprocessor is suitable for any directory that contains files which should each be mapped to one instance
+ * whose polarity is signified by the label given to the directory in the inputOption
+ */
+object RePreprocToLDATopicVectors extends GenericPreprocessor {
+  // LDA hyperparameter defaults; overwritten from the command-line options below
+  // each time getInstanceIterator runs.
+  var iterations = 1000
+  var alpha = 30
+  var beta = 0.1
+  var numTopics = 3
+
+  val iterationOption = parser.option[Int](List("iterations"), "INT", "the number of iterations for the training the topicModel")
+  val alphaOption = parser.option[Int](List("alpha"), "INT", "the symmetric alpha hyperparameter for LDA")
+  val betaOption = parser.option[Double](List("beta"), "DOUBLE", "the symmetric beta hyperparameter for LDA")
+  val numTopicsOption = parser.option[Int](List("numTopics"), "INT", "the number of topics for LDA")
+
+  /**
+   * Reads the tweets in `fileName`, trains an LDA topic model on them, and yields
+   * one instance per tweet whose feature string is the tweet's inferred topic vector.
+   *
+   * NOTE(review): a fresh LDA model is trained on every call, i.e. once per input
+   * file — confirm that per-file (rather than corpus-wide) topic models are intended.
+   */
+  def getInstanceIterator(fileName: String, polarity: String): Iterator[(String, String, Either[SentimentLabel.Type, Map[String, SentimentLabel.Type]], String)] = {
+    // Pull any command-line overrides into the hyperparameter fields.
+    if (iterationOption.value.isDefined) {
+      iterations = iterationOption.value.get
+    }
+    if (alphaOption.value.isDefined) {
+      alpha = alphaOption.value.get
+    }
+    if (betaOption.value.isDefined) {
+      beta = betaOption.value.get
+    }
+    if (numTopicsOption.value.isDefined) {
+      numTopics = numTopicsOption.value.get
+    }
+
+    // Thanks to a bug in Mallet, we have to cap alphaSum
+    val alphaSum = 300 min (alpha * numTopics)
+    val tweets = TweetFeatureReader(fileName)
+    val model: TopicModel = new LDATopicModel(tweets, numTopics, iterations, alphaSum, beta)
+
+    try {
+      // One output tuple per tweet: the topic distribution is serialized as a
+      // space-separated string in the "features" slot.
+      (for (tweet <- tweets) yield
+        (tweet.id,
+          tweet.userid,
+          Left(tweet.goldLabel),
+          model.inferTopics(tweet).mkString(" ")
+          )
+        ).iterator
+    } catch {
+      case e: MatchError =>
+        logger.error("Couldn't figure out what sentiment '%s' is supposed to be." +
+          " Try using 'pos', 'neg', or 'neu'. Skipping %s...".format(polarity, fileName))
+        Iterator[(String, String, Either[SentimentLabel.Type, Map[String, SentimentLabel.Type]], String)]()
+    }
+  }
+}

src/main/scala/updown/util/HPAMTopicModel.scala

 import cc.mallet.util.Randoms
 import scala.Predef._
 import scala._
+import com.weiglewilczek.slf4s.Logging
 
-class HPAMTopicModel(tweets: List[GoldLabeledTweet], numTopics: Int, numIterations: Int/*, alphaSum: Double, beta: Double*/) extends TopicModel {
+class HPAMTopicModel(tweets: List[GoldLabeledTweet], numSuperTopics: Int, numSubTopics: Int, numIterations: Int
+                     /*, alphaSum: Double, beta: Double*/) extends TopicModel with Logging {
   private final val MAX_THREADS = 20
 
-  private val (alphabet, instanceList) = getInstanceList(tweets)
-  private var model = new HierarchicalPAM(3, numTopics, 1.0,1.0)
-  model.estimate(instanceList,instanceList,numIterations,50,10,100,"",new Randoms())
-//  ParallelTopicModel.logger.setLevel(Level.OFF)
+  private val (_alphabet, instanceList) = getInstanceList(tweets)
+  _alphabet.stopGrowth()
+  val alphabet = _alphabet
+  logger.debug("creating pam topic model with %d supers and %d subs".format(numSuperTopics, numSubTopics))
+  private var model = new HierarchicalPAM(numSuperTopics, numSubTopics, 1.0, 1.0)
+  model.estimate(instanceList, instanceList, numIterations, 50, 10, 100, "", new Randoms())
+
+  //  ParallelTopicModel.logger.setLevel(Level.OFF)
+  private val _labelToIndices = tweets.zipWithIndex.groupBy {
+    case (tweet, index) => tweet.goldLabel
+  }.map {
+    case (label, tweetList) => (label, tweetList.map {
+      case (tweet, index) => index
+    })
+  }
 
   override def toString(): String = {
-    model.printTopWords(20,true)
+    model.printTopWords(20, true)
   }
 
   def getTopics: List[Topic] = {
+    // WIP: the root Topic and per-super rows below are computed but never used,
+    // and the method still returns an empty list.
+    // TODO(review): finish wiring the super/sub topic tree into the return value.
-/*
-    val priors = getTopicPriors
-    val topicsToAlphaIds = scala.collection.mutable.Map[Int,List[(Int,Double)]]()
+    val supers = model.getSuperTopicPriorWeights
+    val subs = model.getSuperSubTopicPriorWeights
+    // NOTE(review): keys TOPIC_1..TOPIC_3 hard-code three super-topics regardless
+    // of numSuperTopics — confirm before enabling this path.
+    var result = Topic(1.0, Map(("TOPIC_1" -> supers(1)), ("TOPIC_2" -> supers(2)), ("TOPIC_3" -> supers(3)))) //root
+    for (i <- 0 until numSuperTopics) {
+      val sub = subs(i)
+    }
+    /*
+        val priors = getTopicPriors
+        val topicsToAlphaIds = scala.collection.mutable.Map[Int,List[(Int,Double)]]()
 
-    val wordsTopicsCounts = (for ((topicCounts, typeIndex) <- model.typeTopicCounts.zipWithIndex) yield {
-      val word = alphabet.lookupObject(typeIndex).toString
-      (for (topicCount <- topicCounts) yield {
-        val topic = topicCount & model.topicMask
-        val count = topicCount >> model.topicBits
-        (word,topic,count)
-      }).iterator
-    }).iterator.flatten.toList
+        val wordsTopicsCounts = (for ((topicCounts, typeIndex) <- model.typeTopicCounts.zipWithIndex) yield {
+          val word = alphabet.lookupObject(typeIndex).toString
+          (for (topicCount <- topicCounts) yield {
+            val topic = topicCount & model.topicMask
+            val count = topicCount >> model.topicBits
+            (word,topic,count)
+          }).iterator
+        }).iterator.flatten.toList
 
 
-    val res = (for (i <- 0 until numTopics) yield {
-      val wordCounts = wordsTopicsCounts.filter((triple)=>(triple._2==i && triple._3!=0))
-      val sum = wordCounts.map((triple)=>triple._3).reduce(_ + _)
-      Topic(priors(i), wordCounts.map((triple)=>(triple._1->(triple._3.toDouble/sum))).toMap)
-    }).toList
+        val res = (for (i <- 0 until numSubTopics) yield {
+          val wordCounts = wordsTopicsCounts.filter((triple)=>(triple._2==i && triple._3!=0))
+          val sum = wordCounts.map((triple)=>triple._3).reduce(_ + _)
+          Topic(priors(i), wordCounts.map((triple)=>(triple._1->(triple._3.toDouble/sum))).toMap)
+        }).toList
 
-    res
-*/
+        res
+    */
     List[Topic]()
   }
 
-  def getTopicPriors: List[Double] = {
-/*
-    val result: Array[Double] = new Array[Double](numTopics)
-    var sum = 0.0
-    for (topicAssignment <- model.getData) {
-      val temp: Array[Double] = model.getTopicProbabilities(topicAssignment.topicSequence)
-      for (i <- 0 until result.length) {
-        result(i) += temp(i)
-        sum += temp(i)
-      }
+  /**
+   * Since PAM makes a tree of topics, we just start with the supers and then append each of the children
+   */
+  def getTopicPriors = {
+    val supers: Array[Double] = model.getSuperTopicPriorWeights
+    val subs: Array[Array[Double]] = model.getSuperSubTopicPriorWeights
+    // Result layout: [super weights..., subs(0) weights..., subs(1) weights..., ...].
+    // NOTE(review): weights are returned as-is; it is not obvious here that they
+    // sum to 1 — confirm callers don't assume a normalized distribution.
+    var result = supers.toList
+    for (i <- 0 until numSuperTopics) {
+      result = result ::: subs(i).toList
+    }
     }
-    result.toList.map((double: Double) => double / sum)
-*/
-    List[Double]()
+    result.toArray
   }
 
-  def getTopicsPerInstance = {
-  /*  (for (topicAssignment <- model.getData) yield {
-      val source = topicAssignment.instance.getName.toString
-      val dist = model.getTopicProbabilities(topicAssignment.topicSequence)
-      (source, dist.toList)
-    }).toList
-  */
-    List[(String,List[Double])]()
+  def getIdsToTopicDist = {
+    /*  (for (topicAssignment <- model.getData) yield {
+        val source = topicAssignment.instance.getName.toString
+        val dist = model.getTopicProbabilities(topicAssignment.topicSequence)
+        (source, dist.toList)
+      }).toList
+    */
+    Map[String, Array[Double]]()
   }
 
-  def getTopicsPerTarget = {
-  /*  val result = scala.collection.mutable.Map[SentimentLabel.Type,List[Double]]()
-    for (topicAssignment <- model.getData) {
-      val target = topicAssignment.instance.getTarget.asInstanceOf[SentimentLabel.Type]
-      result(target) = result.getOrElse(target, (new Array[Double](numTopics)).toList).zip(model.getTopicProbabilities(topicAssignment.topicSequence).toList).map((pair) => pair._1+pair._2)
-    }
-    (for ((key, value) <- result) yield {
-      val sum = value.reduce( _ + _ )
-      (key->value.map(_ / sum))
+  def getLabelsToTopicDists = {
+    (for ((label, indexList: List[Int]) <- _labelToIndices) yield {
+      (label, indexList.map((i) => getTopicVector(model.getTopicsForDoc(i))))
     }).toMap
-  */
-    Map[SentimentLabel.Type,List[Double]]()
   }
 
-  def inferTopics(tweet: GoldLabeledTweet): List[Double] = {
-    /*val instance = tweet match {
-        case GoldLabeledTweet(id, userid, features, goldLabel) =>
-          val featureSequence = new FeatureSequence(alphabet, features.length)
-          for (feature <- features) {
-            featureSequence.add(feature)
-          }
-          new Instance(featureSequence, goldLabel, id, null)
-      }
-    model.getInferencer.getSampledDistribution(instance, numIterations, 1, 1).toList*/
-    List[Double]()
+  /**
+   * Turns a list of sampled topic assignments into a topic -> proportion map.
+   * Each topic id maps to (occurrences / total assignments); an empty input
+   * yields an empty map.
+   */
+  def computeDistribution(assignments: List[Int]): Map[Int, Double] = {
+    val total = assignments.length.toDouble
+    assignments.groupBy(identity).map {
+      case (topic, occurrences) => (topic, occurrences.length / total)
+    }
+  }
+
+  /**
+   * Infers a topic vector for a tweet by sampling its feature sequence against
+   * the trained PAM model. Uses half the training iteration count for inference.
+   *
+   * NOTE(review): `destructiveTopicInference` sounds like it mutates model state —
+   * confirm it is safe to call repeatedly and that training state is preserved.
+   */
+  def inferTopics(tweet: GoldLabeledTweet): Array[Double] = {
+    tweet match {
+      case GoldLabeledTweet(id, userid, features, goldLabel) =>
+        // Alphabet growth was stopped after training, so feature indices stay stable.
+        val featureSequence = new FeatureSequence(alphabet, features.length)
+        for (feature <- features) {
+          featureSequence.add(feature)
+        }
+        getTopicVector(model.destructiveTopicInference(featureSequence, numIterations/2))
+    }
+  }
+
+  /**
+   * Flattens super/sub topic assignment samples into a fixed-length vector.
+   *
+   * @param topics row 0 is fed to the super-topic distribution, row 1 to the
+   *               sub-topic distribution (as produced by the PAM sampler)
+   * @return proportions for indices 0..numSuperTopics (sub-topic slots disabled)
+   */
+  def getTopicVector(topics: Array[Array[Int]]): Array[Double] = {
+    val superCounts = computeDistribution(topics(0).toList).withDefaultValue(0.0)
+    // NOTE(review): subCounts only feeds the commented-out sub-topic section
+    // below — drop it or re-enable that section.
+    val subCounts = computeDistribution(topics(1).toList).withDefaultValue(0.0)
+    val result = Array.ofDim[Double](
+      1 + numSuperTopics
+//        + numSubTopics
+    )
+    for (i <- 0 until (1 + numSuperTopics)) {result(i) = superCounts(i)}
+//    for (i <- 0 until (numSubTopics)) {result(1+numSuperTopics+i) = subCounts(i)}
+    result
+  }
 
   def save(filename: String) {

src/main/scala/updown/util/LDATopicModel.scala

     res
   }
 
-  def getTopicPriors: List[Double] = {
+  def getTopicPriors = {
     val result: Array[Double] = new Array[Double](numTopics)
     var sum = 0.0
     for (topicAssignment <- model.getData) {
         sum += temp(i)
       }
     }
-    result.toList.map((double: Double) => double / sum)
+    result.toList.map((double: Double) => double / sum).toArray
   }
 
-  def getTopicsPerInstance = {
+  def getIdsToTopicDist = {
     (for (topicAssignment <- model.getData) yield {
       val source = topicAssignment.instance.getName.toString
       val dist = model.getTopicProbabilities(topicAssignment.topicSequence)
-      (source, dist.toList)
-    }).toList
-  }
-
-  def getTopicsPerTarget = {
-    val result = scala.collection.mutable.Map[SentimentLabel.Type,List[Double]]()
-    for (topicAssignment <- model.getData) {
-      val target = topicAssignment.instance.getTarget.asInstanceOf[SentimentLabel.Type]
-      result(target) = result.getOrElse(target, (new Array[Double](numTopics)).toList).zip(model.getTopicProbabilities(topicAssignment.topicSequence).toList).map((pair) => pair._1+pair._2)
-    }
-    (for ((key, value) <- result) yield {
-      val sum = value.reduce( _ + _ )
-      (key->value.map(_ / sum))
+      (source, dist)
     }).toMap
   }
 
-  def inferTopics(tweet: GoldLabeledTweet): List[Double] = {
+  /**
+   * Groups the training instances' topic distributions by gold sentiment label.
+   * Each label maps to the list of per-instance topic-probability arrays
+   * (prepend order, so lists are reverse document order).
+   */
+  def getLabelsToTopicDists = {
+    val result = scala.collection.mutable.Map[SentimentLabel.Type,List[Array[Double]]]().withDefaultValue(Nil)
+    for (topicAssignment <- model.getData) {
+      val label = topicAssignment.instance.getTarget.asInstanceOf[SentimentLabel.Type]
+      result(label) = model.getTopicProbabilities(topicAssignment.topicSequence) :: result(label)
+    }
+    result.toMap // immutize
+  }
+
+  def inferTopics(tweet: GoldLabeledTweet) = {
     val instance = tweet match {
         case GoldLabeledTweet(id, userid, features, goldLabel) =>
           val featureSequence = new FeatureSequence(alphabet, features.length)
           }
           new Instance(featureSequence, goldLabel, id, null)
       }
-    model.getInferencer.getSampledDistribution(instance, numIterations, 1, 1).toList
+    model.getInferencer.getSampledDistribution(instance, numIterations, 1, 1)
   }
 
   def save(filename: String) {

src/main/scala/updown/util/LDATopicModelFromFile.scala

     res
   }
 
-  def getTopicPriors: List[Double] = {
+  def getTopicPriors = {
     val result: Array[Double] = new Array[Double](numTopics)
     var sum = 0.0
     for (topicAssignment <- model.getData) {
         sum += temp(i)
       }
     }
-    result.toList.map((double: Double) => double / sum)
+    result.toList.map((double: Double) => double / sum).toArray
   }
 
-  def getTopicsPerInstance = {
+  def getIdsToTopicDist = {
     (for (topicAssignment <- model.getData) yield {
       val source = topicAssignment.instance.getName.toString
       val dist = model.getTopicProbabilities(topicAssignment.topicSequence)
-      (source, dist.toList)
-    }).toList
-  }
-
-  def getTopicsPerTarget = {
-    val result = scala.collection.mutable.Map[SentimentLabel.Type,List[Double]]()
-    for (topicAssignment <- model.getData) {
-      val target = topicAssignment.instance.getTarget.asInstanceOf[SentimentLabel.Type]
-      result(target) = result.getOrElse(target, (new Array[Double](numTopics)).toList).zip(model.getTopicProbabilities(topicAssignment.topicSequence).toList).map((pair) => pair._1+pair._2)
-    }
-    (for ((key, value) <- result) yield {
-      val sum = value.reduce( _ + _ )
-      (key->value.map(_ / sum))
+      (source, dist)
     }).toMap
   }
 
-  def inferTopics(tweet: GoldLabeledTweet): List[Double] = {
+  /**
+   * Groups the training instances' topic distributions by gold sentiment label.
+   * Each label maps to the list of per-instance topic-probability arrays.
+   */
+  def getLabelsToTopicDists = {
+    val result = scala.collection.mutable.Map[SentimentLabel.Type,List[Array[Double]]]().withDefaultValue(Nil)
+    for (topicAssignment <- model.getData) {
+      val label = topicAssignment.instance.getTarget.asInstanceOf[SentimentLabel.Type]
+      result(label) = model.getTopicProbabilities(topicAssignment.topicSequence) :: result(label)
+    }
+    result.toMap // immutize
+  }
+
+  def inferTopics(tweet: GoldLabeledTweet) = {
     val instance = tweet match {
         case GoldLabeledTweet(id, userid, features, goldLabel) =>
           val featureSequence = new FeatureSequence(alphabet, features.length)
           }
           new Instance(featureSequence, goldLabel, id, null)
       }
-    model.getInferencer.getSampledDistribution(instance, numIterations, 1, 1).toList
+    model.getInferencer.getSampledDistribution(instance, numIterations, 1, 1)
   }
 
   def save(filename: String) {

src/main/scala/updown/util/Statistics.scala

   val mag: (List[Double]) => Double =
     (A) => math.sqrt(A.map((i) => i * i).reduce(_ + _))
 
+  val arrayCosineSimilarity: (Array[Double], Array[Double]) => Double = (a,b)=>cosineSimilarity(a.toList, b.toList)
+  
   val cosineSimilarity: (List[Double], List[Double]) => Double =
     (A, B) => (dot(A, B) / (mag(A) * mag(B)))
 
   def averageResults(newName: String, results: scala.List[ExperimentalResult]): ExperimentalResult = {
     var avgAccuracy = 0.0
     var avgN = 0.0
-    var avgLabelResults = scala.collection.mutable.Map[SentimentLabel.Type, LabelResult]().withDefault((label) => LabelResult(0, label, 0.0, 0.0, 0.0))
+    var avgLabelResults = scala.collection.mutable.Map[SentimentLabel.Type, LabelResult]().withDefault((label) => LabelResult(0, 0, label, 0.0, 0.0, 0.0))
     // first, sum
     for (ExperimentalResult(name, n, accuracy, classes) <- results) {
       avgAccuracy += accuracy
       avgN += n
-      for (LabelResult(n, label, precision, recall, f) <- classes) {
-        val LabelResult(oN, oLabel, oPrecision, oRecall, oF) = avgLabelResults(label)
-        avgLabelResults(label) = LabelResult(n + oN, label, precision + oPrecision, recall + oRecall, f + oF)
+      for (LabelResult(nG, nS, label, precision, recall, f) <- classes) {
+        val LabelResult(oNG, oNS, oLabel, oPrecision, oRecall, oF) = avgLabelResults(label)
+        avgLabelResults(label) = LabelResult(nG + oNG, nS+oNS, label, precision + oPrecision, recall + oRecall, f + oF)
       }
     }
     // then, scale
     val N = results.length
     ExperimentalResult(newName, (avgN / N).toInt, avgAccuracy / N,
-      (for ((_, LabelResult(n, label, precision, recall, f)) <- avgLabelResults.toList.sortBy {
+      (for ((_, LabelResult(nG, nS, label, precision, recall, f)) <- avgLabelResults.toList.sortBy {
         case (k, v) => SentimentLabel.ordinality(k)
       }) yield {
-        LabelResult(n / N, label, precision / N, recall / N, f / N)
+        LabelResult(nG / N, nS/N, label, precision / N, recall / N, f / N)
       }).toList)
   }
 
   def getEvalStats(resultName: String, tweets: scala.List[SystemLabeledTweet]): ExperimentalResult = {
     val (correct, total) = tabulate(tweets)
     ExperimentalResult(resultName, total, accurracy(correct, total),
-      (for (label <- List(SentimentLabel.Positive, SentimentLabel.Negative, SentimentLabel.Neutral)) yield {
+      (for (label <- List(SentimentLabel.Positive, SentimentLabel.Negative, SentimentLabel.Neutral, SentimentLabel.Abstained)) yield {
         val goldList = tweets.filter((tweet) => tweet.goldLabel == label)
         logger.debug("%s gold tweets: %d".format(SentimentLabel.toEnglishName(label), goldList.length))
         val systemList = tweets.filter((tweet) => tweet.systemLabel == label)
           goldList.length
         )
 
-        LabelResult(goldList.length, label, labelPrecision, labelRecall, fScore(labelPrecision, labelRecall))
+        LabelResult(goldList.length, systemList.length, label, labelPrecision, labelRecall, fScore(labelPrecision, labelRecall))
       }).toList)
   }
 

src/main/scala/updown/util/TopicModel.scala

 package updown.util
 
+import updown.data.{SentimentLabel, GoldLabeledTweet}
 import cc.mallet.types._
-import updown.data.{SentimentLabel, GoldLabeledTweet}
 
-case class Topic(prior:Double, distribution: Map[String,Double])
+case class Topic(prior: Double, distribution: Map[String, Double])
 
 abstract class TopicModel {
   protected def getInstanceList(tweetList: List[GoldLabeledTweet]): (Alphabet, InstanceList) = {
     (alphabet, instanceList)
   }
 
+  /**
+   * Overload of getInstanceList that reuses an existing alphabet instead of
+   * creating a new one, so inference-time instances share training indices.
+   *
+   * @param tweetList tweets to convert into Mallet instances
+   * @param alphabet  the (typically already-trained) feature alphabet to index against
+   * @return an InstanceList over the given alphabet, one Instance per tweet,
+   *         targeted with the tweet's gold label and named with its id
+   */
+  protected def getInstanceList(tweetList: List[GoldLabeledTweet], alphabet: Alphabet) = {
+    val instances = (for (tweet <- tweetList) yield {
+      tweet match {
+        case GoldLabeledTweet(id, userid, features, goldLabel) =>
+          val featureSequence = new FeatureSequence(alphabet, features.length)
+          for (feature <- features) {
+            featureSequence.add(feature)
+          }
+          new Instance(featureSequence, goldLabel, id, null)
+      }
+    }).toList
+
+    val instanceList = new InstanceList(alphabet, null)
+    for (instance <- instances) {
+      instanceList.add(instance)
+    }
+    instanceList
+  }
+
   def getTopics: List[Topic]
-  def getTopicPriors: List[Double]
-  def getTopicsPerInstance: List[(String,List[Double])]
-  def getTopicsPerTarget: Map[SentimentLabel.Type,List[Double]]
-  def inferTopics(tweet: GoldLabeledTweet): List[Double]
+
+  def getTopicPriors: Array[Double]
+
+  def getIdsToTopicDist: Map[String, Array[Double]]
+
+  def getLabelsToTopicDists: Map[SentimentLabel.Type, List[Array[Double]]]
+
+  /**
+   * Collapses getLabelsToTopicDists to one mean topic vector per label
+   * (element-wise sum of the label's vectors, divided by their count).
+   *
+   * NOTE(review): `reduce` throws on an empty list — confirm
+   * getLabelsToTopicDists never maps a label to Nil.
+   */
+  def getLabelsToTopicDist: Map[SentimentLabel.Type, Array[Double]] = {
+    (for ((label, topicDist: List[Array[Double]]) <- getLabelsToTopicDists) yield {
+      val N = topicDist.length
+      (label,
+        topicDist
+          .reduce((a: Array[Double], b: Array[Double]) => (a zip b).map {
+          case (x, y) => x + y
+        })
+          .map(_ / N)
+        )
+    }).toMap
+  }
+
+  def inferTopics(tweet: GoldLabeledTweet): Array[Double]
 
   def save(filename: String)
 }