vvcephei committed b186eb9

did a bunch of work on PAM and started work on DMR

Files changed (18)

config/main/log4j.properties

-log4j.rootLogger=TRACE, stderr
+log4j.rootLogger=INFO, stderr
 log4j.appender.stderr=org.apache.log4j.ConsoleAppender
 log4j.appender.stderr.layout=org.apache.log4j.PatternLayout
 

Binary file modified.

src/main/scala/updown/app/experiment/topic/NFoldTopicExperiment.scala

-package updown.app.experiment.topic
-
-import updown.data.{SystemLabeledTweet, GoldLabeledTweet, SentimentLabel}
-import org.clapper.argot.{SingleValueOption, ArgotParser}
-import org.clapper.argot.ArgotParser._
-import org.clapper.argot.ArgotConverters._
-import java.io.{FileWriter, BufferedWriter, File}
-import updown.util.{WordleUtils, Statistics, LDATopicModel, TopicModel}
-import updown.app.experiment.{ExperimentalResult, NFoldExperiment}
-import java.util.Arrays
-
-abstract class NFoldTopicExperiment extends NFoldExperiment {
-  var iterations = 1000
-  var alpha = 30
-  var beta = 0.1
-  var numTopics = 3
-  val fileSeparator = System.getProperty("file.separator")
-
-  var childProcesses = List[Process]()
-
-  val iterationOption = parser.option[Int](List("iterations"), "INT", "the number of iterations for the training the topicModel")
-  val alphaOption = parser.option[Int](List("alpha"), "INT", "the symmetric alpha hyperparameter for LDA")
-  val betaOption = parser.option[Double](List("beta"), "DOUBLE", "the symmetric beta hyperparameter for LDA")
-  val numTopicsOption = parser.option[Int](List("numTopics"), "INT", "the number of topics for LDA")
-
-  val outputOption = parser.option[String](List("o", "output"), "DIR", "the directory to dump topics into")
-  val wordleOption = parser.flag[Boolean](List("w", "wordle"), "generate wordles for the topics (requires -o DIR) " +
-    "(requires that you have downloaded IBM's word cloud generator)")
-  val wordleJarOption = parser.option[String](List("wordleJar"), "PATH", ("the path to IBM's word cloud generator " +
-    "(default %s)").format(WordleUtils.defaultJarPath))
-  val wordleConfigOption = parser.option[String](List("wordleConfig"), "PATH", ("the path to the config file for IBM's " +
-    "word cloud generator (default %s)").format(WordleUtils.defaultConfigurationPath))
-
-  def evaluate(model: TopicModel, testSet: scala.List[GoldLabeledTweet]): List[SystemLabeledTweet]
-
-  def doOutput(model: TopicModel) {
-    if (outputOption.value.isDefined) {
-      val file = new File(outputOption.value.get + fileSeparator + "run" + experimentalRun)
-      file.mkdirs()
-      val outputDirForThisRun = file.getAbsolutePath
-      val summary = new BufferedWriter((new FileWriter((outputDirForThisRun + fileSeparator + "summary"))))
-      summary.write("%s\n".format(model.getTopicPriors.zipWithIndex.map {
-        case (a, b) => "Topic %s:%6.3f".format(b, a)
-      }.mkString("\n")))
-      summary.write("%s\n".format(model.getLabelsToTopicDist.toList.map {
-        case (a, b) => "Label %9s:%s".format(SentimentLabel.toEnglishName(a), b.map {
-          "%7.3f".format(_)
-        }.mkString(""))
-      }.mkString("\n")))
-      summary.close()
-      val outputFiles =
-        (for ((topic, i) <- model.getTopics.zipWithIndex) yield {
-          val outFile = new File(outputDirForThisRun + fileSeparator + "topic" + i)
-          val output = new BufferedWriter(new FileWriter(outFile))
-          output.write("%s\n".format(topic.distribution.toList.sortBy((pair) => (1 - pair._2)).map {
-            case (a, b) => "%s\t%s".format(a, b)
-          }.mkString("\n")))
-          output.close()
-          outFile.getAbsolutePath
-        })
-      if (wordleOption.value.isDefined) {
-        logger.debug("making wordles and report")
-        val index = new BufferedWriter((new FileWriter((outputDirForThisRun + fileSeparator + "index.html"))))
-        index.write("<head><style>\n%s\n</style></head>\n".format(List(
-        "div.bordered{border-style: solid none none none; padding: 5px; border-width: 1px; border-color: gray;}",
-        "div#wordles{display:block; clear:both; padding-top:20px;}",
-        "div.wordle{float:left;width:45%;border-style:solid; border-width:1px; border-color:gray; margin:2px;}",
-        "div.wordle img{width: 100%;}",
-        ".table{display:block; clear: both;}",
-        ".row{display:block;clear:both;}",
-        ".cell{display:block;float:left;}",
-        ".values{display:block;float:left;width:300px;}",
-        ".value{display:block;float:left;width:60px;}",
-        "div.topicFreq .title{width:100px;}",
-        "div.labelDistribution .title{width:150px;}"
-        ).mkString("\n")))
-        index.write("<body>")
-        index.write("<div id=topicDistribution class=\"bordered table\">%s</div>\n".format(model.getTopicPriors.zipWithIndex.map {
-          case (a, b) => "<div class=\"topicFreq row\"><span class=\"title cell\">Topic %s</span><span class=\"value cell\">%6.3f</span></div>".format(b, a)
-        }.mkString("\n")))
-        index.write(("<div id=labelDistributions class=\"bordered table\">" +
-          "<div class=\"labelDistribution row\"><span class=\"title cell\">topic</span><span class=\"values cell\"><span class=\"value\">  0</span><span class=\"value\">  1</span><span class=\"value\">  2</span></span></div>" +
-          "%s</div>\n").format(model.getLabelsToTopicDist.toList.sortBy({case(a,b)=>SentimentLabel.ordinality(a)}).map {
-          case (a, b) => "<div class=\"labelDistribution row\"><span class=\"title cell\">Label %9s</span><span class=\"values cell\">%s</span></div>".format(SentimentLabel.toEnglishName(a), b.map {
-            "<span class=value>%7.3f</span>".format(_)
-          }.mkString(""))
-        }.mkString("\n")))
-        val jarPath = if (wordleJarOption.value.isDefined) wordleJarOption.value.get else WordleUtils.defaultJarPath
-        val configPath = if (wordleConfigOption.value.isDefined) wordleConfigOption.value.get else WordleUtils.defaultConfigurationPath
-        index.write("<div id=wordles class=bordered>")
-        childProcesses = childProcesses ::: WordleUtils.makeWordles(jarPath, configPath, outputFiles, Some(index))
-        index.write("</div></body>")
-        index.close()
-        logger.debug("done making report and initializing wordles")
-      }
-    }
-  }
-
-  def doExperiment(testSet: List[GoldLabeledTweet], trainSet: List[GoldLabeledTweet]) = {
-    if (iterationOption.value.isDefined) {
-      iterations = iterationOption.value.get
-    }
-    if (alphaOption.value.isDefined) {
-      alpha = alphaOption.value.get
-    }
-    if (betaOption.value.isDefined) {
-      beta = betaOption.value.get
-    }
-    if (numTopicsOption.value.isDefined) {
-      numTopics = numTopicsOption.value.get
-    }
-
-    // Thanks to a bug in Mallet, we have to cap alphaSum
-    val alphaSum = 300 min (alpha * numTopics)
-
-
-    logger.debug("alphaSum: " + alphaSum)
-    val model: TopicModel = new LDATopicModel(trainSet, numTopics, iterations, alphaSum, beta)
-    logger.debug("topic distribution:\n     :" + Arrays.toString(model.getTopicPriors))
-    logger.debug({
-      val labelToTopicDist = model.getLabelsToTopicDist
-      "topic distribution over labels:\n" + (for ((k, v) <- labelToTopicDist) yield "%5s:%s".format(k, Arrays.toString(v))).mkString("\n")
-    })
-    logger.debug({
-      val topics = model.getTopics
-      "topic distributions\n" +
-        (for (i <- 0 until 3) yield "%5s: Topic(%s,%s)".format(i, topics(i).prior, topics(i).distribution.toList.sortBy((pair) => (1 - pair._2)).take(10))).mkString("\n")
-    })
-    doOutput(model)
-    evaluate(model, testSet)
-  }
-
-  def after(): Int = {
-    if (childProcesses.length > 0) {
-      logger.info("waiting for child processes...")
-      WordleUtils.waitForChildren(childProcesses)
-    } else {
-      0
-    }
-  }
-}

src/main/scala/updown/app/experiment/topic/dmr/NFoldDMRExperiment.scala

+package updown.app.experiment.topic.dmr
+
+import updown.data.{SystemLabeledTweet, GoldLabeledTweet, SentimentLabel}
+import org.clapper.argot.ArgotParser._
+import org.clapper.argot.ArgotConverters._
+import java.io.{FileWriter, BufferedWriter, File}
+import updown.app.experiment.NFoldExperiment
+import java.util.Arrays
+import updown.util.{DMRTopicModel, WordleUtils, LDATopicModel, TopicModel}
+
+abstract class NFoldDMRExperiment extends NFoldExperiment {
+  var iterations = 1000
+  var alpha = 30
+  var beta = 0.1
+  var numTopics = 3
+  val fileSeparator = System.getProperty("file.separator")
+
+  var childProcesses = List[Process]()
+
+  val iterationOption = parser.option[Int](List("iterations"), "INT", "the number of iterations for the training the topicModel")
+  val alphaOption = parser.option[Int](List("alpha"), "INT", "the symmetric alpha hyperparameter for LDA")
+  val betaOption = parser.option[Double](List("beta"), "DOUBLE", "the symmetric beta hyperparameter for LDA")
+  val numTopicsOption = parser.option[Int](List("numTopics"), "INT", "the number of topics for LDA")
+
+  val outputOption = parser.option[String](List("o", "output"), "DIR", "the directory to dump topics into")
+  val wordleOption = parser.flag[Boolean](List("w", "wordle"), "generate wordles for the topics (requires -o DIR) " +
+    "(requires that you have downloaded IBM's word cloud generator)")
+  val wordleJarOption = parser.option[String](List("wordleJar"), "PATH", ("the path to IBM's word cloud generator " +
+    "(default %s)").format(WordleUtils.defaultJarPath))
+  val wordleConfigOption = parser.option[String](List("wordleConfig"), "PATH", ("the path to the config file for IBM's " +
+    "word cloud generator (default %s)").format(WordleUtils.defaultConfigurationPath))
+
+  def evaluate(model: TopicModel, testSet: scala.List[GoldLabeledTweet]): List[SystemLabeledTweet]
+
+  def doOutput(model: TopicModel) {
+    if (outputOption.value.isDefined) {
+      val file = new File(outputOption.value.get + fileSeparator + "run" + experimentalRun)
+      file.mkdirs()
+      val outputDirForThisRun = file.getAbsolutePath
+      val summary = new BufferedWriter((new FileWriter((outputDirForThisRun + fileSeparator + "summary"))))
+      summary.write("%s\n".format(model.getTopicPriors.zipWithIndex.map {
+        case (a, b) => "Topic %s:%6.3f".format(b, a)
+      }.mkString("\n")))
+      summary.write("%s\n".format(model.getLabelsToTopicDist.toList.map {
+        case (a, b) => "Label %9s:%s".format(SentimentLabel.toEnglishName(a), b.map {
+          "%7.3f".format(_)
+        }.mkString(""))
+      }.mkString("\n")))
+      summary.close()
+      val outputFiles =
+        (for ((topic, i) <- model.getTopics.zipWithIndex) yield {
+          val outFile = new File(outputDirForThisRun + fileSeparator + "topic" + i)
+          val output = new BufferedWriter(new FileWriter(outFile))
+          output.write("%s\n".format(topic.distribution.toList.sortBy((pair) => (1 - pair._2)).map {
+            case (a, b) => "%s\t%s".format(a, b)
+          }.mkString("\n")))
+          output.close()
+          outFile.getAbsolutePath
+        })
+      if (wordleOption.value.isDefined) {
+        logger.debug("making wordles and report")
+        val index = new BufferedWriter((new FileWriter((outputDirForThisRun + fileSeparator + "index.html"))))
+        index.write("<head><style>\n%s\n</style></head>\n".format(List(
+        "div.bordered{border-style: solid none none none; padding: 5px; border-width: 1px; border-color: gray;}",
+        "div#wordles{display:block; clear:both; padding-top:20px;}",
+        "div.wordle{float:left;width:45%;border-style:solid; border-width:1px; border-color:gray; margin:2px;}",
+        "div.wordle img{width: 100%;}",
+        ".table{display:block; clear: both;}",
+        ".row{display:block;clear:both;}",
+        ".cell{display:block;float:left;}",
+        ".values{display:block;float:left;width:300px;}",
+        ".value{display:block;float:left;width:60px;}",
+        "div.topicFreq .title{width:100px;}",
+        "div.labelDistribution .title{width:150px;}"
+        ).mkString("\n")))
+        index.write("<body>")
+        index.write("<div id=topicDistribution class=\"bordered table\">%s</div>\n".format(model.getTopicPriors.zipWithIndex.map {
+          case (a, b) => "<div class=\"topicFreq row\"><span class=\"title cell\">Topic %s</span><span class=\"value cell\">%6.3f</span></div>".format(b, a)
+        }.mkString("\n")))
+        index.write(("<div id=labelDistributions class=\"bordered table\">" +
+          "<div class=\"labelDistribution row\"><span class=\"title cell\">topic</span><span class=\"values cell\"><span class=\"value\">  0</span><span class=\"value\">  1</span><span class=\"value\">  2</span></span></div>" +
+          "%s</div>\n").format(model.getLabelsToTopicDist.toList.sortBy({case(a,b)=>SentimentLabel.ordinality(a)}).map {
+          case (a, b) => "<div class=\"labelDistribution row\"><span class=\"title cell\">Label %9s</span><span class=\"values cell\">%s</span></div>".format(SentimentLabel.toEnglishName(a), b.map {
+            "<span class=value>%7.3f</span>".format(_)
+          }.mkString(""))
+        }.mkString("\n")))
+        val jarPath = if (wordleJarOption.value.isDefined) wordleJarOption.value.get else WordleUtils.defaultJarPath
+        val configPath = if (wordleConfigOption.value.isDefined) wordleConfigOption.value.get else WordleUtils.defaultConfigurationPath
+        index.write("<div id=wordles class=bordered>")
+        childProcesses = childProcesses ::: WordleUtils.makeWordles(jarPath, configPath, outputFiles, Some(index))
+        index.write("</div></body>")
+        index.close()
+        logger.debug("done making report and initializing wordles")
+      }
+    }
+  }
+
+  def doExperiment(testSet: List[GoldLabeledTweet], trainSet: List[GoldLabeledTweet]) = {
+    if (iterationOption.value.isDefined) {
+      iterations = iterationOption.value.get
+    }
+    if (alphaOption.value.isDefined) {
+      alpha = alphaOption.value.get
+    }
+    if (betaOption.value.isDefined) {
+      beta = betaOption.value.get
+    }
+    if (numTopicsOption.value.isDefined) {
+      numTopics = numTopicsOption.value.get
+    }
+
+    // Thanks to a bug in Mallet, we have to cap alphaSum
+    val alphaSum = 300 min (alpha * numTopics)
+
+
+    logger.debug("alphaSum: " + alphaSum)
+    val model: TopicModel = new DMRTopicModel(trainSet,numTopics,iterations,alphaSum,beta)
+    logger.debug("topic distribution:\n     :" + Arrays.toString(model.getTopicPriors))
+    model.asInstanceOf[DMRTopicModel].dumpToStdOut
+    logger.debug({
+      val labelToTopicDist = model.getLabelsToTopicDist
+      "topic distribution over labels:\n" + (for ((k, v) <- labelToTopicDist) yield "%5s:%s".format(k, Arrays.toString(v))).mkString("\n")
+    })
+    logger.debug({
+      val topics = model.getTopics
+      "topic distributions\n" +
+        (for ((topic,i) <- topics.zipWithIndex) yield "%5s: Topic(%s,%s)".format(i, topics(i).prior, topics(i).distribution.toList.sortBy((pair) => (1 - pair._2)).take(10))).mkString("\n")
+    })
+    doOutput(model)
+    evaluate(model, testSet)
+  }
+
+  def after(): Int = {
+    if (childProcesses.length > 0) {
+      logger.info("waiting for child processes...")
+      WordleUtils.waitForChildren(childProcesses)
+    } else {
+      0
+    }
+  }
+}

src/main/scala/updown/app/experiment/topic/dmr/NFoldMaxentDiscriminantDMRExperiment.scala

+package updown.app.experiment.topic.dmr
+
+import updown.data.{SystemLabeledTweet, GoldLabeledTweet, SentimentLabel}
+import updown.util.TopicModel
+import scala.Array
+import updown.app.experiment.topic.util.MaxentDiscriminant
+
+object NFoldMaxentDiscriminantDMRExperiment extends NFoldDMRExperiment with MaxentDiscriminant {
+
+  def label(model: TopicModel, tweet: GoldLabeledTweet, discriminantFn: (Array[Float]) => (String, String)): SystemLabeledTweet = {
+    val topicDist: Array[Float] = model.inferTopics(tweet).map((item) => item.asInstanceOf[Float])
+    val (label: String, outcomes: String) = discriminantFn(topicDist)
+
+    val GoldLabeledTweet(id, userid, features, goldLabel) = tweet
+    logger.trace("labeling id:%s gold:%2s with label:%2s from outcomes:%s".format(id, goldLabel.toString, label.toString, outcomes))
+    SystemLabeledTweet(id, userid, features, goldLabel, SentimentLabel.figureItOut(label))
+  }
+
+
+  def evaluate(model: TopicModel, testSet: scala.List[GoldLabeledTweet]) = {
+    logger.debug("entering evaluation with %d items in the test set".format(testSet.length))
+    val labelsToTopicDists: Map[SentimentLabel.Type, List[Array[Double]]] = model.getLabelsToTopicDists
+    val discriminantFn = getDiscriminantFn(labelsToTopicDists)
+    val start = System.currentTimeMillis()
+
+    val res = (for ((tweet, i) <- testSet.zipWithIndex) yield {
+      if (i % 100 == 0) {
+        logger.debug("%.0f%% remaining; average label time = %fs".format((1.0 - (i + 1).toDouble / testSet.length.toDouble) * 100, (System.currentTimeMillis() - start).toDouble / (i + 1.0) / 1000.0))
+      }
+      label(model, tweet, discriminantFn)
+    }).toList
+    res
+  }
+}

src/main/scala/updown/app/experiment/topic/lda/NFoldDiscriminantLDAExperiment.scala

 import updown.data.{SystemLabeledTweet, GoldLabeledTweet, SentimentLabel}
 import updown.util.TopicModel
 import scala.Array
-import updown.app.experiment.topic.NFoldTopicExperiment
 import updown.app.experiment.topic.util.MaxentDiscriminant
 
 object NFoldDiscriminantLDAExperiment extends NFoldTopicExperiment with MaxentDiscriminant {
     val (label: String, outcomes: String) = discriminantFn(topicDist)
 
     val GoldLabeledTweet(id, userid, features, goldLabel) = tweet
-    logger.trace("labeling id:%s with label:%s from outcomes:%s".format(id, label.toString, outcomes))
+    logger.trace("labeling id:%s gold:%2s with label:%2s from outcomes:%s".format(id, goldLabel.toString, label.toString, outcomes))
     SystemLabeledTweet(id, userid, features, goldLabel, SentimentLabel.figureItOut(label))
   }
 

src/main/scala/updown/app/experiment/topic/lda/NFoldKNNDiscriminantExperiment.scala

+package updown.app.experiment.topic.lda
+
+import updown.data.{SystemLabeledTweet, GoldLabeledTweet, SentimentLabel}
+import updown.util.TopicModel
+import scala.Array
+import updown.app.experiment.topic.util.KNNDiscriminant
+import org.clapper.argot.ArgotConverters._
+object NFoldKNNDiscriminantExperiment extends NFoldTopicExperiment with KNNDiscriminant {
+  val DEFAULT_K = 11
+  val kOption = parser.option[Int](List("k","numNearestNeighbors"), "INT", "the number of nearest neighbors to consider in choosing a label")
+
+  def label(model: TopicModel, tweet: GoldLabeledTweet, discriminantFn: (Array[Float]) => (String, String)): SystemLabeledTweet = {
+    val topicDist: Array[Float] = model.inferTopics(tweet).map((item) => item.asInstanceOf[Float])
+    val (label: String, outcomes: String) = discriminantFn(topicDist)
+
+    val GoldLabeledTweet(id, userid, features, goldLabel) = tweet
+    logger.trace("labeling id:%s gold:%2s with label:%2s from outcomes:%s".format(id, goldLabel.toString, label.toString, outcomes))
+    SystemLabeledTweet(id, userid, features, goldLabel, SentimentLabel.figureItOut(label))
+  }
+
+
+  def evaluate(model: TopicModel, testSet: scala.List[GoldLabeledTweet]) = {
+    logger.debug("entering evaluation with %d items in the test set".format(testSet.length))
+    val k = kOption.value match {
+      case Some(x:Int) => x
+      case None => DEFAULT_K
+    }
+
+    val labelsToTopicDists: Map[SentimentLabel.Type, List[Array[Double]]] = model.getLabelsToTopicDists
+    val discriminantFn = getDiscriminantFn(k,labelsToTopicDists)
+    val start = System.currentTimeMillis()
+
+    val res = (for ((tweet, i) <- testSet.zipWithIndex) yield {
+      if (i % 100 == 0) {
+        logger.debug("%.0f%% remaining; average label time = %fs".format((1.0 - (i + 1).toDouble / testSet.length.toDouble) * 100, (System.currentTimeMillis() - start).toDouble / (i + 1.0) / 1000.0))
+      }
+      label(model, tweet, discriminantFn)
+    }).toList
+    res
+  }
+}

src/main/scala/updown/app/experiment/topic/lda/NFoldMajorityTopicExperiment.scala

 
 import updown.data.{SystemLabeledTweet, GoldLabeledTweet, SentimentLabel}
 import updown.util.TopicModel
-import updown.app.experiment.topic.NFoldTopicExperiment
 
 object NFoldMajorityTopicExperiment extends NFoldTopicExperiment {
 

src/main/scala/updown/app/experiment/topic/lda/NFoldSimilarityTopicExperiment.scala

 
 import updown.data.{SystemLabeledTweet, GoldLabeledTweet, SentimentLabel}
 import updown.util.{Statistics, TopicModel}
-import updown.app.experiment.topic.NFoldTopicExperiment
 
 object NFoldSimilarityTopicExperiment extends NFoldTopicExperiment {
   def label(model: TopicModel, tweet: GoldLabeledTweet, labelToTopicDist: Map[SentimentLabel.Type,Array[Double]]): SystemLabeledTweet = {

src/main/scala/updown/app/experiment/topic/lda/NFoldTopicExperiment.scala

+package updown.app.experiment.topic.lda
+
+import updown.data.{SystemLabeledTweet, GoldLabeledTweet, SentimentLabel}
+import org.clapper.argot.{SingleValueOption, ArgotParser}
+import org.clapper.argot.ArgotParser._
+import org.clapper.argot.ArgotConverters._
+import java.io.{FileWriter, BufferedWriter, File}
+import updown.util.{WordleUtils, Statistics, LDATopicModel, TopicModel}
+import updown.app.experiment.{ExperimentalResult, NFoldExperiment}
+import java.util.Arrays
+
+abstract class NFoldTopicExperiment extends NFoldExperiment {
+  var iterations = 1000
+  var alpha = 30
+  var beta = 0.1
+  var numTopics = 3
+  val fileSeparator = System.getProperty("file.separator")
+
+  var childProcesses = List[Process]()
+
+  val iterationOption = parser.option[Int](List("iterations"), "INT", "the number of iterations for the training the topicModel")
+  val alphaOption = parser.option[Int](List("alpha"), "INT", "the symmetric alpha hyperparameter for LDA")
+  val betaOption = parser.option[Double](List("beta"), "DOUBLE", "the symmetric beta hyperparameter for LDA")
+  val numTopicsOption = parser.option[Int](List("numTopics"), "INT", "the number of topics for LDA")
+
+  val outputOption = parser.option[String](List("o", "output"), "DIR", "the directory to dump topics into")
+  val wordleOption = parser.flag[Boolean](List("w", "wordle"), "generate wordles for the topics (requires -o DIR) " +
+    "(requires that you have downloaded IBM's word cloud generator)")
+  val wordleJarOption = parser.option[String](List("wordleJar"), "PATH", ("the path to IBM's word cloud generator " +
+    "(default %s)").format(WordleUtils.defaultJarPath))
+  val wordleConfigOption = parser.option[String](List("wordleConfig"), "PATH", ("the path to the config file for IBM's " +
+    "word cloud generator (default %s)").format(WordleUtils.defaultConfigurationPath))
+
+  def evaluate(model: TopicModel, testSet: scala.List[GoldLabeledTweet]): List[SystemLabeledTweet]
+
+  def doOutput(model: TopicModel) {
+    if (outputOption.value.isDefined) {
+      val file = new File(outputOption.value.get + fileSeparator + "run" + experimentalRun)
+      file.mkdirs()
+      val outputDirForThisRun = file.getAbsolutePath
+      val summary = new BufferedWriter((new FileWriter((outputDirForThisRun + fileSeparator + "summary"))))
+      summary.write("%s\n".format(model.getTopicPriors.zipWithIndex.map {
+        case (a, b) => "Topic %s:%6.3f".format(b, a)
+      }.mkString("\n")))
+      summary.write("%s\n".format(model.getLabelsToTopicDist.toList.map {
+        case (a, b) => "Label %9s:%s".format(SentimentLabel.toEnglishName(a), b.map {
+          "%7.3f".format(_)
+        }.mkString(""))
+      }.mkString("\n")))
+      summary.close()
+      val outputFiles =
+        (for ((topic, i) <- model.getTopics.zipWithIndex) yield {
+          val outFile = new File(outputDirForThisRun + fileSeparator + "topic" + i)
+          val output = new BufferedWriter(new FileWriter(outFile))
+          output.write("%s\n".format(topic.distribution.toList.sortBy((pair) => (1 - pair._2)).map {
+            case (a, b) => "%s\t%s".format(a, b)
+          }.mkString("\n")))
+          output.close()
+          outFile.getAbsolutePath
+        })
+      if (wordleOption.value.isDefined) {
+        logger.debug("making wordles and report")
+        val index = new BufferedWriter((new FileWriter((outputDirForThisRun + fileSeparator + "index.html"))))
+        index.write("<head><style>\n%s\n</style></head>\n".format(List(
+        "div.bordered{border-style: solid none none none; padding: 5px; border-width: 1px; border-color: gray;}",
+        "div#wordles{display:block; clear:both; padding-top:20px;}",
+        "div.wordle{float:left;width:45%;border-style:solid; border-width:1px; border-color:gray; margin:2px;}",
+        "div.wordle img{width: 100%;}",
+        ".table{display:block; clear: both;}",
+        ".row{display:block;clear:both;}",
+        ".cell{display:block;float:left;}",
+        ".values{display:block;float:left;width:300px;}",
+        ".value{display:block;float:left;width:60px;}",
+        "div.topicFreq .title{width:100px;}",
+        "div.labelDistribution .title{width:150px;}"
+        ).mkString("\n")))
+        index.write("<body>")
+        index.write("<div id=topicDistribution class=\"bordered table\">%s</div>\n".format(model.getTopicPriors.zipWithIndex.map {
+          case (a, b) => "<div class=\"topicFreq row\"><span class=\"title cell\">Topic %s</span><span class=\"value cell\">%6.3f</span></div>".format(b, a)
+        }.mkString("\n")))
+        index.write(("<div id=labelDistributions class=\"bordered table\">" +
+          "<div class=\"labelDistribution row\"><span class=\"title cell\">topic</span><span class=\"values cell\"><span class=\"value\">  0</span><span class=\"value\">  1</span><span class=\"value\">  2</span></span></div>" +
+          "%s</div>\n").format(model.getLabelsToTopicDist.toList.sortBy({case(a,b)=>SentimentLabel.ordinality(a)}).map {
+          case (a, b) => "<div class=\"labelDistribution row\"><span class=\"title cell\">Label %9s</span><span class=\"values cell\">%s</span></div>".format(SentimentLabel.toEnglishName(a), b.map {
+            "<span class=value>%7.3f</span>".format(_)
+          }.mkString(""))
+        }.mkString("\n")))
+        val jarPath = if (wordleJarOption.value.isDefined) wordleJarOption.value.get else WordleUtils.defaultJarPath
+        val configPath = if (wordleConfigOption.value.isDefined) wordleConfigOption.value.get else WordleUtils.defaultConfigurationPath
+        index.write("<div id=wordles class=bordered>")
+        childProcesses = childProcesses ::: WordleUtils.makeWordles(jarPath, configPath, outputFiles, Some(index))
+        index.write("</div></body>")
+        index.close()
+        logger.debug("done making report and initializing wordles")
+      }
+    }
+  }
+
+  def doExperiment(testSet: List[GoldLabeledTweet], trainSet: List[GoldLabeledTweet]) = {
+    if (iterationOption.value.isDefined) {
+      iterations = iterationOption.value.get
+    }
+    if (alphaOption.value.isDefined) {
+      alpha = alphaOption.value.get
+    }
+    if (betaOption.value.isDefined) {
+      beta = betaOption.value.get
+    }
+    if (numTopicsOption.value.isDefined) {
+      numTopics = numTopicsOption.value.get
+    }
+
+    // Thanks to a bug in Mallet, we have to cap alphaSum
+    val alphaSum = 300 min (alpha * numTopics)
+
+
+    logger.debug("alphaSum: " + alphaSum)
+    val model: TopicModel = new LDATopicModel(trainSet, numTopics, iterations, alphaSum, beta)
+    logger.debug("topic distribution:\n     :" + Arrays.toString(model.getTopicPriors))
+    logger.debug({
+      val labelToTopicDist = model.getLabelsToTopicDist
+      "topic distribution over labels:\n" + (for ((k, v) <- labelToTopicDist) yield "%5s:%s".format(k, Arrays.toString(v))).mkString("\n")
+    })
+    logger.debug({
+      val topics = model.getTopics
+      "topic distributions\n" +
+        (for (i <- 0 until 3) yield "%5s: Topic(%s,%s)".format(i, topics(i).prior, topics(i).distribution.toList.sortBy((pair) => (1 - pair._2)).take(10))).mkString("\n")
+    })
+    doOutput(model)
+    evaluate(model, testSet)
+  }
+
+  def after(): Int = {
+    if (childProcesses.length > 0) {
+      logger.info("waiting for child processes...")
+      WordleUtils.waitForChildren(childProcesses)
+    } else {
+      0
+    }
+  }
+}

src/main/scala/updown/app/experiment/topic/pam/NFoldDiscriminantPAMExperiment.scala

 package updown.app.experiment.topic.pam
 
 import updown.data.{SystemLabeledTweet, GoldLabeledTweet, SentimentLabel}
-import updown.util.TopicModel
 import updown.app.experiment.topic.util.MaxentDiscriminant
 import java.util.Arrays
+import updown.util.{HPAMTopicModel, TopicModel}
 
 object NFoldDiscriminantPAMExperiment extends NFoldPAMExperiment with MaxentDiscriminant {
   def label(model: TopicModel, tweet: GoldLabeledTweet, discriminantFn: (Array[Float]) => (String, String)): SystemLabeledTweet = {
     val (label: String, outcomes: String) = discriminantFn(topicDist)
 
     val GoldLabeledTweet(id, userid, features, goldLabel) = tweet
-    logger.trace("labeling id:%s gold:%s with label:%s from outcomes:%s".format(id, goldLabel.toString, label.toString, outcomes))
+    logger.trace("labeling id:%s gold:%2s with label:%2s from outcomes:%s (has distribution %s)".format(id, goldLabel.toString, label.toString, outcomes, Arrays.toString(topicDist)))
     SystemLabeledTweet(id, userid, features, goldLabel, SentimentLabel.figureItOut(label))
   }
 

src/main/scala/updown/app/experiment/topic/pam/NFoldKNNDiscriminantExperiment.scala

+package updown.app.experiment.topic.pam
+
+import updown.data.{SystemLabeledTweet, GoldLabeledTweet, SentimentLabel}
+import java.util.Arrays
+import updown.util.TopicModel
+import updown.app.experiment.topic.util.KNNDiscriminant
+import org.clapper.argot.ArgotConverters._
+
+object NFoldKNNDiscriminantExperiment extends NFoldPAMExperiment with KNNDiscriminant {
+  val DEFAULT_K = 11
+  val kOption = parser.option[Int](List("k","numNearestNeighbors"), "INT", "the number of nearest neighbors to consider in choosing a label")
+
+  def label(model: TopicModel, tweet: GoldLabeledTweet, discriminantFn: (Array[Float]) => (String, String)): SystemLabeledTweet = {
+    val topicDist: Array[Float] = model.inferTopics(tweet).map((item) => item.asInstanceOf[Float])
+    val (label: String, outcomes: String) = discriminantFn(topicDist)
+
+    val GoldLabeledTweet(id, userid, features, goldLabel) = tweet
+    logger.trace("labeling id:%s gold:%2s with label:%2s from outcomes:%s (has distribution %s)".format(id, goldLabel.toString, label.toString, outcomes, Arrays.toString(topicDist)))
+    SystemLabeledTweet(id, userid, features, goldLabel, SentimentLabel.figureItOut(label))
+  }
+
+  def evaluate(model: TopicModel, testSet: scala.List[GoldLabeledTweet]) = {
+    logger.debug("entering evaluation with %d items in the test set".format(testSet.length))
+
+    val k = kOption.value match {
+      case Some(x:Int) => x
+      case None => DEFAULT_K
+    }
+
+    val labelsToTopicDists: Map[SentimentLabel.Type, List[Array[Double]]] = model.getLabelsToTopicDists
+    logger.debug({
+      val tmp = model.getLabelsToTopicDist
+      "Average distributions:\n"+(for ((label,dist)<- tmp) yield {
+        "\t"+label.toString + ": "+Arrays.toString(dist)
+      }).mkString("\n")
+    })
+    val discriminantFn = getDiscriminantFn(k, labelsToTopicDists)
+
+    val start = System.currentTimeMillis()
+
+    (for ((tweet, i) <- testSet.zipWithIndex) yield {
+      if (i % 100 == 0) {
+        logger.debug("%.0f%% remaining; average label time = %fs".format((1.0 - (i + 1).toDouble / testSet.length.toDouble) * 100, (System.currentTimeMillis() - start).toDouble / (i + 1.0) / 1000.0))
+      }
+      label(model, tweet, discriminantFn)
+    }).toList
+  }
+}

src/main/scala/updown/app/experiment/topic/pam/NFoldSimilarityPAMExperiment.scala

 
 import updown.data.{SystemLabeledTweet, GoldLabeledTweet, SentimentLabel}
 import updown.util.{Statistics, TopicModel}
-import updown.app.experiment.topic.NFoldTopicExperiment
 
 object NFoldSimilarityPAMExperiment extends NFoldPAMExperiment {
   def label(model: TopicModel, tweet: GoldLabeledTweet, labelToTopicDist: Map[SentimentLabel.Type, Array[Double]]): SystemLabeledTweet = {

src/main/scala/updown/app/experiment/topic/util/KNNDiscriminant.scala

+package updown.app.experiment.topic.util
+
+import updown.data.SentimentLabel
+import opennlp.maxent.GIS
+import opennlp.model.DataIndexer
+
+trait KNNDiscriminant {
+
+  private def euclideanDist(a: Array[Float], b: Array[Float]): Float = {
+    var result: Double = 0.0
+    for (i <- 0 until a.length) {
+      result += math.pow(a(i) - b(i), 2)
+    }
+    math.sqrt(result).toFloat
+  }
+
+  private def getNearestNeighbors(k: Int)(self: Array[Float], neighborhood: List[(SentimentLabel.Type, Array[Float])]): List[(SentimentLabel.Type, Float)] = {
+    neighborhood.map {
+      case (label, position) => (label, euclideanDist(self, position))
+    }.sortBy {
+      case (label, dist) => dist
+    }.take(k)
+  }
+
+  def getDiscriminantFn(k: Int, labelsToTopicDists: Map[SentimentLabel.Type, scala.List[Array[Double]]]): (Array[Float]) => (String, String) = {
+    val posLabelList: List[(SentimentLabel.Type, Array[Double])] =
+      (for ((label, posList) <- labelsToTopicDists.toList) yield (for (pos <- posList) yield (label, pos)).toList).toList.flatten
+    val posLabelListFloat: List[(SentimentLabel.Type, Array[Float])] =
+      posLabelList.map {
+        case (label, pos) => (label, pos.map(d => d.asInstanceOf[Float]))
+      }
+    val getKNearestNeighbors = getNearestNeighbors(k)_
+
+    (topicDist: Array[Float]) => {
+      val nearestNeighbors = getKNearestNeighbors(topicDist,posLabelListFloat)
+      val sizes = nearestNeighbors.groupBy{case(label,dist)=>label}.map{case(label,list)=>(label,list.size)}.toList.sortBy{case(label,size)=>size}
+      val (winningLabel,dist) = sizes.last
+      (winningLabel.toString, sizes.toString)
+    }
+  }
+}
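
A quick usage sketch of the new KNNDiscriminant trait (the object name and the label-to-distribution map are made up for illustration): getDiscriminantFn closes over the training-set topic vectors and labels an inferred distribution by majority vote among its k nearest neighbors, which is how the two NFoldKNNDiscriminantExperiment objects above use it.

    import updown.app.experiment.topic.util.KNNDiscriminant
    import updown.data.SentimentLabel

    object KNNDiscriminantSketch extends KNNDiscriminant {
      def main(args: Array[String]) {
        // hypothetical training-set topic vectors, keyed by gold label
        val labelsToTopicDists = Map(
          SentimentLabel.Positive -> List(Array(0.7, 0.2, 0.1), Array(0.6, 0.3, 0.1)),
          SentimentLabel.Negative -> List(Array(0.1, 0.2, 0.7), Array(0.2, 0.2, 0.6)),
          SentimentLabel.Neutral -> List(Array(0.3, 0.4, 0.3)))
        val discriminantFn = getDiscriminantFn(3, labelsToTopicDists)
        // label an inferred topic distribution by majority vote among the 3 nearest neighbors
        val (label, outcomes) = discriminantFn(Array(0.65f, 0.25f, 0.10f))
        println("label=%s outcomes=%s".format(label, outcomes))
      }
    }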

src/main/scala/updown/data/SentimentLabel.scala

 package updown.data
 
-object SentimentLabel extends Enumeration{
+object SentimentLabel extends Enumeration {
   type Type = Value
   val Positive2 = Value("2")
   val Positive = Value("1")
   //  stuff you can do.
 
   private val _POS_NAME = "positive"
+  private val _POS_DOUBLE = 1.0
   private val _POS_NAME2 = "superPositive"
+  private val _POS_DOUBLE2 = 2.0
   private val _NEG_NAME = "negative"
+  private val _NEG_DOUBLE = -1.0
   private val _NEG_NAME2 = "superNegative"
+  private val _NEG_DOUBLE2 = -2.0
   private val _NEU_NAME = "neutral"
+  private val _NEU_DOUBLE = 0.0
   private val _ABS_NAME = "abstained"
+  private val _ABS_DOUBLE = Double.NegativeInfinity // somewhat nonsense; we just need a flag value
   private val _UNK_NAME = "unknown"
+  private val _UNK_DOUBLE = Double.NaN
 
-  def unitSentiment(label:SentimentLabel.Type) = {
+  def unitSentiment(label: SentimentLabel.Type) = {
     label match {
       case Positive2 => Positive
       case Negative2 => Negative
-      case x  => x
+      case x => x
     }
   }
 
-  def ordinality(label:SentimentLabel.Type) = {
+  def ordinality(label: SentimentLabel.Type) = {
     label match {
       case Abstained => 0
       case Negative2 => 1
     }
   }
 
-  def toEnglishName(label:SentimentLabel.Type) = {
+  def toEnglishName(label: SentimentLabel.Type) = {
     label match {
       case Positive => _POS_NAME
       case Positive2 => _POS_NAME2
       case Negative => _NEG_NAME
       case Negative2 => _NEG_NAME2
-      case Neutral  => _NEU_NAME
-      case Abstained  => _ABS_NAME
-      case Unknown  => _UNK_NAME
+      case Neutral => _NEU_NAME
+      case Abstained => _ABS_NAME
+      case Unknown => _UNK_NAME
     }
   }
 
-  def fromEnglishName(name:String) = {
+  def fromEnglishName(name: String) = {
     name match {
       case `_POS_NAME` => Positive
       case `_POS_NAME2` => Positive2
     }
   }
 
-  def figureItOut(name:String) = {
+  def toDouble(label: SentimentLabel.Type) = {
+    label match {
+      case Positive => _POS_DOUBLE
+      case Positive2 => _POS_DOUBLE2
+      case Negative => _NEG_DOUBLE
+      case Negative2 => _NEG_DOUBLE2
+      case Neutral => _NEU_DOUBLE
+      case Abstained => _ABS_DOUBLE
+      case Unknown => _UNK_DOUBLE
+    }
+  }
+
+  def fromDouble(name: Double) = {
+    name match {
+      case `_POS_DOUBLE` => Positive
+      case `_POS_DOUBLE2` => Positive2
+      case `_NEG_DOUBLE` => Negative
+      case `_NEG_DOUBLE2` => Negative2
+      case `_NEU_DOUBLE` => Neutral
+      case `_ABS_DOUBLE` => Abstained
+      case `_UNK_DOUBLE` => Unknown
+    }
+  }
+
+  def figureItOut(name: String) = {
     try {
-    name.toLowerCase match {
-      case `_POS_NAME` |"pos"|"p"|"+"|"1" => Positive
-      case `_POS_NAME2` |"pos2"|"2" => Positive2
-      case `_NEG_NAME`|"neg"|"-"|"-1" => Negative
-      case `_NEG_NAME2`|"neg2"|"-2" => Negative2
-      case `_NEU_NAME`|"neu"|"neut"|"0" => Neutral
-      case `_ABS_NAME` => Abstained
-      case `_UNK_NAME` => Unknown
-    }
+      val posDouble = _POS_DOUBLE.toString
+      val negDouble = _NEG_DOUBLE.toString
+      val neuDouble = _NEU_DOUBLE.toString
+      val posDouble2 = _POS_DOUBLE2.toString
+      val negDouble2 = _NEG_DOUBLE2.toString
+      val absDouble = _ABS_DOUBLE.toString
+      val unkDouble = _UNK_DOUBLE.toString
+      name.toLowerCase match {
+        case `_POS_NAME` | `posDouble` | "pos" | "p" | "+" | "1" => Positive
+        case `_POS_NAME2` | `posDouble2` | "pos2" | "2" => Positive2
+        case `_NEG_NAME` | `negDouble` | "neg" | "-" | "-1" => Negative
+        case `_NEG_NAME2` | `negDouble2` | "neg2" | "-2" => Negative2
+        case `_NEU_NAME` | `neuDouble` | "neu" | "neut" | "0" => Neutral
+        case `_ABS_NAME` | `absDouble` => Abstained
+        case `_UNK_NAME` | `unkDouble` => Unknown
+      }
     } catch {
-      case e:scala.MatchError =>
+      case e: scala.MatchError =>
         System.err.println("couldn't figure out: \"%s\"".format(name))
         throw e
     }

src/main/scala/updown/util/DMRTopicModel.scala

+package updown.util
+
+import cc.mallet.types._
+
+import updown.data.GoldLabeledTweet
+import java.io.File
+import scala.Predef._
+import scala._
+import com.weiglewilczek.slf4s.Logging
+
+class DMRTopicModel(tweets: List[GoldLabeledTweet], numTopics: Int, numIterations: Int
+                    , alphaSum: Double, beta: Double) extends TopicModel with Logging {
+  private val (_alphabet, instanceList) = getInstanceList(tweets)
+  _alphabet.stopGrowth()
+  val alphabet = _alphabet
+  logger.debug("creating dmr topic model with %d topics".format(numTopics))
+  private var model = new cc.mallet.topics.DMRTopicModel(numTopics)
+  model.setOptimizeInterval(100)
+  model.setTopicDisplay(100, 10)
+  model.addInstances(instanceList)
+  model.setNumIterations(numIterations)
+  model.estimate()
+
+  //  ParallelTopicModel.logger.setLevel(Level.OFF)
+  private val _labelToIndices = tweets.zipWithIndex.groupBy {
+    case (tweet, index) => tweet.goldLabel
+  }.map {
+    case (label, tweetList) => (label, tweetList.map {
+      case (tweet, index) => index
+    })
+  }
+  /*
+  override def toString(): String = {
+    model.printTopWords()
+    model.printTopWords(20, true)
+  }*/
+
+  def dumpToStdOut = {
+    model.printTopWords(System.out,20,true)
+    System.out.flush()
+  }
+
+  def getTopics: List[Topic] = {
+    List[Topic]()
+  }
+
+  def getTopicPriors = {
+    Array[Double](numTopics)
+  }
+
+  def getIdsToTopicDist = {
+    Map[String, Array[Double]]()
+  }
+
+  def getLabelsToTopicDists = {
+    (for ((label, indexList: List[Int]) <- _labelToIndices) yield {
+      (label, indexList.map {
+        (i) => getTopicVector(model.getData.get(i).topicSequence.asInstanceOf[LabelSequence])
+      })
+    }).toMap
+  }
+
+  def inferTopics(tweet: GoldLabeledTweet): Array[Double] = {
+    /*tweet match {
+      case GoldLabeledTweet(id, userid, features, goldLabel) =>
+        val featureSequence = new FeatureSequence(alphabet, features.length)
+        for (feature <- features) {
+          featureSequence.add(feature)
+        }
+        getTopicVector(model.topicInferenceLast(featureSequence, numIterations))
+    }*/
+    Array[Double]()
+  }
+
+  def getTopicVector(topics: LabelSequence): Array[Double] = {
+    val topicVector: Array[Double] = Array.ofDim[Double](numTopics)
+    var total = 0.0
+    val topicsIterator = topics.iterator()
+    while (topicsIterator.hasNext) {
+      val label:Label= topicsIterator.next().asInstanceOf[Label]
+      topicVector(label.getIndex) += 1.0
+      total += 1.0
+    }
+    topicVector.map(d => d / total)
+  }
+
+  def save(filename: String) {
+    model.printState(new File(filename))
+  }
+}
+

src/main/scala/updown/util/HPAMTopicModel.scala

         for (feature <- features) {
           featureSequence.add(feature)
         }
-        getTopicVector(model.destructiveTopicInference(featureSequence, numIterations/2))
+        getTopicVector(model.topicInferenceLast(featureSequence, numIterations))
     }
   }
 
   def getTopicVector(topics: Array[Array[Int]]): Array[Double] = {
     val superCounts = computeDistribution(topics(0).toList).withDefaultValue(0.0)
     val subCounts = computeDistribution(topics(1).toList).withDefaultValue(0.0)
+    logger.trace("getting topic vector for super:%s sub:%s".format(superCounts, subCounts))
     val result = Array.ofDim[Double](
-      1 + numSuperTopics
-//        + numSubTopics
+      //      1 +
+      numSuperTopics
+      //        + numSubTopics
     )
-    for (i <- 0 until (1 + numSuperTopics)) {result(i) = superCounts(i)}
-//    for (i <- 0 until (numSubTopics)) {result(1+numSuperTopics+i) = subCounts(i)}
+    for (i <- 1 until (1 + numSuperTopics)) {
+      result(i - 1) = superCounts(i)
+    }
+    //    for (i <- 0 until (numSubTopics)) {result(1+numSuperTopics+i) = subCounts(i)}
     result
   }
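
In short, the reworked loop drops the root super-topic: the old vector had length 1 + numSuperTopics and copied superCounts(0 .. numSuperTopics) at the same indices, while the new one has length numSuperTopics and shifts superCounts(1 .. numSuperTopics) down by one. A toy illustration with made-up counts:

    // with numSuperTopics = 3 and some hypothetical normalized counts
    val superCounts = Map(0 -> 0.4, 1 -> 0.3, 2 -> 0.2, 3 -> 0.1).withDefaultValue(0.0)
    val numSuperTopics = 3

    val oldVec = Array.ofDim[Double](1 + numSuperTopics)
    for (i <- 0 until (1 + numSuperTopics)) { oldVec(i) = superCounts(i) }     // Array(0.4, 0.3, 0.2, 0.1)

    val newVec = Array.ofDim[Double](numSuperTopics)
    for (i <- 1 until (1 + numSuperTopics)) { newVec(i - 1) = superCounts(i) } // Array(0.3, 0.2, 0.1): root mass dropped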
 

src/main/scala/updown/util/TopicModel.scala

 abstract class TopicModel {
   protected def getInstanceList(tweetList: List[GoldLabeledTweet]): (Alphabet, InstanceList) = {
     val alphabet = new Alphabet()
+    val labelAlphabet = new Alphabet()
     val instances = (for (tweet <- tweetList) yield {
       tweet match {
         case GoldLabeledTweet(id, userid, features, goldLabel) =>
           for (feature <- features) {
             featureSequence.add(feature)
           }
-          new Instance(featureSequence, goldLabel, id, null)
+          val label = new FeatureVector(
+            labelAlphabet,
+            Array[Object]("label"), Array[Double](SentimentLabel.toDouble(goldLabel)))
+          new Instance(featureSequence, label, id, null)
       }
     }).toList
 