Commits

vvcephei committed 5a0ed46

adding PAM experiments

Comments (0)

Files changed (3)

src/main/scala/updown/app/experiment/topic/pam/NFoldPAMExperiment.scala

+package updown.app.experiment.topic.pam
+
+import updown.data.{GoldLabeledTweet, SentimentLabel}
+import org.clapper.argot.ArgotParser._
+import org.clapper.argot.ArgotConverters._
+import java.io.{FileWriter, BufferedWriter, File}
+import updown.util.{HPAMTopicModel, WordleUtils, LDATopicModel, TopicModel}
+import updown.app.experiment.{LabelResult, ExperimentalResult, NFoldExperiment}
+
+/**
+ * Base class for n-fold experiments that train an [[updown.util.HPAMTopicModel]] on each
+ * training fold.
+ *
+ * Concrete experiments supply `evaluate`. Command-line options allow overriding the number of
+ * training iterations and topics, and optionally dumping the topics (and word clouds) to disk.
+ */
+abstract class NFoldPAMExperiment extends NFoldExperiment {
+  var iterations = 1000
+//  var alpha = 30
+//  var beta = 0.1
+  var numTopics = 20
+  val fileSeparator = System.getProperty("file.separator")
+
+  /** Word-cloud generator processes started by `doOutput`; `after` waits on them. */
+  var childProcesses = List[Process]()
+
+  val iterationOption = parser.option[Int](List("iterations"), "INT", "the number of iterations for the training the topicModel")
+//  val alphaOption = parser.option[Int](List("alpha"), "INT", "the symmetric alpha hyperparameter for LDA")
+//  val betaOption = parser.option[Double](List("beta"), "DOUBLE", "the symmetric beta hyperparameter for LDA")
+  val numTopicsOption = parser.option[Int](List("numTopics"), "INT", "the number of topics for LDA")
+
+  val outputOption = parser.option[String](List("o", "output"), "DIR", "the directory to dump topics into")
+  val wordleOption = parser.flag[Boolean](List("w", "wordle"), "generate wordles for the topics (requires -o DIR) " +
+    "(requires that you have downloaded IBM's word cloud generator)")
+  val wordleJarOption = parser.option[String](List("wordleJar"), "PATH", ("the path to IBM's word cloud generator " +
+    "(default %s)").format(WordleUtils.defaultJarPath))
+  val wordleConfigOption = parser.option[String](List("wordleConfig"), "PATH", ("the path to the config file for IBM's " +
+    "word cloud generator (default %s)").format(WordleUtils.defaultConfigurationPath))
+
+  /** Scores `model` against `testSet`; implemented by concrete experiments. */
+  def evaluate(model: TopicModel, testSet: scala.List[GoldLabeledTweet]): ExperimentalResult
+
+  /**
+   * Opens a buffered writer on `target`, hands it to `body`, and always closes it afterwards,
+   * even when `body` throws, so a failed write does not leak the file handle.
+   */
+  private def withPamReportWriter[A](target: File)(body: BufferedWriter => A): A = {
+    val writer = new BufferedWriter(new FileWriter(target))
+    try body(writer) finally writer.close()
+  }
+
+  /** Writes the plain-text `summary` file (topic priors, then per-label topic weights) into `runDir`. */
+  private def writePamSummary(model: TopicModel, runDir: String): Unit = {
+    withPamReportWriter(new File(runDir + fileSeparator + "summary")) { summary =>
+      summary.write("%s\n".format(model.getTopicPriors.zipWithIndex.map {
+        case (prior, topicIndex) => "Topic %s:%6.3f".format(topicIndex, prior)
+      }.mkString("\n")))
+      summary.write("%s\n".format(model.getTopicsPerTarget.toList.map {
+        case (sentiment, weights) => "Label %9s:%s".format(SentimentLabel.toEnglishName(sentiment),
+          weights.map(w => "%7.3f".format(w)).mkString(""))
+      }.mkString("\n")))
+    }
+  }
+
+  /** Writes the stylesheet, the topic-prior table and the per-label table of the HTML report to `index`. */
+  private def writePamReportHeader(index: BufferedWriter, model: TopicModel): Unit = {
+    index.write("<head><style>\n%s\n</style></head>\n".format(List(
+    "div.bordered{border-style: solid none none none; padding: 5px; border-width: 1px; border-color: gray;}",
+    "div#wordles{display:block; clear:both; padding-top:20px;}",
+    "div.wordle{float:left;width:45%;border-style:solid; border-width:1px; border-color:gray; margin:2px;}",
+    "div.wordle img{width: 100%;}",
+    ".table{display:block; clear: both;}",
+    ".row{display:block;clear:both;}",
+    ".cell{display:block;float:left;}",
+    ".values{display:block;float:left;width:300px;}",
+    ".value{display:block;float:left;width:60px;}",
+    "div.topicFreq .title{width:100px;}",
+    "div.labelDistribution .title{width:150px;}"
+    ).mkString("\n")))
+    index.write("<body>")
+    index.write("<div id=topicDistribution class=\"bordered table\">%s</div>\n".format(model.getTopicPriors.zipWithIndex.map {
+      case (a, b) => "<div class=\"topicFreq row\"><span class=\"title cell\">Topic %s</span><span class=\"value cell\">%6.3f</span></div>".format(b, a)
+    }.mkString("\n")))
+    index.write(("<div id=labelDistributions class=\"bordered table\">" +
+      "<div class=\"labelDistribution row\"><span class=\"title cell\">topic</span><span class=\"values cell\"><span class=\"value\">  0</span><span class=\"value\">  1</span><span class=\"value\">  2</span></span></div>" +
+      "%s</div>\n").format(model.getTopicsPerTarget.toList.sortBy({case(a,b)=>SentimentLabel.ordinality(a)}).map {
+      case (a, b) => "<div class=\"labelDistribution row\"><span class=\"title cell\">Label %9s</span><span class=\"values cell\">%s</span></div>".format(SentimentLabel.toEnglishName(a), b.map {
+        "<span class=value>%7.3f</span>".format(_)
+      }.mkString(""))
+    }.mkString("\n")))
+  }
+
+  /**
+   * Dumps `model` into `<output>/run<experimentalRun>` when the output option is given:
+   * a `summary` file, one `topicN` file per topic (word/probability pairs, most probable first),
+   * and, when the wordle flag is set, an `index.html` report plus word-cloud child processes.
+   * Does nothing when no output directory was requested.
+   */
+  def doOutput(model: TopicModel): Unit = {
+    outputOption.value.foreach { outputRoot =>
+      val runDir = new File(outputRoot + fileSeparator + "run" + experimentalRun)
+      runDir.mkdirs()
+      val outputDirForThisRun = runDir.getAbsolutePath
+      writePamSummary(model, outputDirForThisRun)
+      val outputFiles =
+        (for ((topic, i) <- model.getTopics.zipWithIndex) yield {
+          val outFile = new File(outputDirForThisRun + fileSeparator + "topic" + i)
+          withPamReportWriter(outFile) { output =>
+            // Sorting by (1 - probability) puts the most probable words first.
+            output.write("%s\n".format(topic.distribution.toList.sortBy((pair) => (1 - pair._2)).map {
+              case (a, b) => "%s\t%s".format(a, b)
+            }.mkString("\n")))
+          }
+          outFile.getAbsolutePath
+        })
+      if (wordleOption.value.isDefined) {
+        logger.debug("making wordles and report")
+        withPamReportWriter(new File(outputDirForThisRun + fileSeparator + "index.html")) { index =>
+          writePamReportHeader(index, model)
+          val jarPath = wordleJarOption.value.getOrElse(WordleUtils.defaultJarPath)
+          val configPath = wordleConfigOption.value.getOrElse(WordleUtils.defaultConfigurationPath)
+          index.write("<div id=wordles class=bordered>")
+          // makeWordles is handed the report writer as well as the topic files.
+          childProcesses = childProcesses ::: WordleUtils.makeWordles(jarPath, configPath, outputFiles, Some(index))
+          index.write("</div></body>")
+        }
+        logger.debug("done making report and initializing wordles")
+      }
+    }
+  }
+
+  /**
+   * Trains an HPAM topic model on `trainSet` and logs its string form.
+   *
+   * Output and evaluation are currently disabled (see the commented-out block below), so
+   * `testSet` is unused and a placeholder result is returned.
+   */
+  def doExperiment(testSet: List[GoldLabeledTweet], trainSet: List[GoldLabeledTweet]): ExperimentalResult = {
+    iterationOption.value.foreach { requested => iterations = requested }
+    /*if (alphaOption.value.isDefined) {
+      alpha = alphaOption.value.get
+    }
+    if (betaOption.value.isDefined) {
+      beta = betaOption.value.get
+    }*/
+    numTopicsOption.value.foreach { requested => numTopics = requested }
+
+    // Thanks to a bug in Mallet, we have to cap alphaSum
+//    val alphaSum = 300 min (alpha * numTopics)
+
+
+//    logger.debug("alphaSum: " + alphaSum)
+    val model: TopicModel = new HPAMTopicModel(trainSet, numTopics, iterations/*, alphaSum, beta*/)
+    logger.info("topicString:\n"+model.toString)
+    /*logger.debug("topic distribution:\n     :" + model.getTopicPriors)
+    logger.debug({
+      val labelToTopicDist = model.getTopicsPerTarget
+      "topic distribution over labels:\n" + (for ((k, v) <- labelToTopicDist) yield "%5s:%s".format(k, v)).mkString("\n")
+    })
+    logger.debug({
+      val topics = model.getTopics
+      "topic distributions\n" +
+        (for (i <- 0 until 3) yield "%5s: Topic(%s,%s)".format(i, topics(i).prior, topics(i).distribution.toList.sortBy((pair) => (1 - pair._2)).take(10))).mkString("\n")
+    })
+    doOutput(model)
+    evaluate(model, testSet)*/
+    ExperimentalResult("dummy",0,0.0,List[LabelResult]())
+  }
+
+  /**
+   * Waits for any word-cloud child processes that were started.
+   *
+   * @return the value of `WordleUtils.waitForChildren`, or 0 when no child process was started
+   */
+  def after(): Int = {
+    if (childProcesses.nonEmpty) {
+      logger.info("waiting for child processes...")
+      WordleUtils.waitForChildren(childProcesses)
+    } else {
+      0
+    }
+  }
+}

src/main/scala/updown/app/experiment/topic/pam/NFoldSimilarityPAMExperiment.scala

+package updown.app.experiment.topic.pam
+
+import updown.data.{SystemLabeledTweet, GoldLabeledTweet, SentimentLabel}
+import updown.util.{Statistics, TopicModel}
+import updown.app.experiment.topic.NFoldTopicExperiment
+
+/**
+ * PAM experiment that labels a tweet with the sentiment label whose per-label topic
+ * distribution has the highest cosine similarity to the tweet's inferred topic distribution.
+ */
+object NFoldSimilarityPAMExperiment extends NFoldPAMExperiment {
+  /**
+   * Labels a single tweet.
+   *
+   * @param model            topic model used to infer the tweet's topic distribution
+   * @param tweet            the gold-labeled tweet to label
+   * @param labelToTopicDist topic distribution for each candidate sentiment label
+   * @return the tweet with its gold label plus the system-assigned label
+   */
+  def label(model: TopicModel, tweet: GoldLabeledTweet, labelToTopicDist: Map[SentimentLabel.Type,List[Double]]): SystemLabeledTweet = {
+    val inferred = model.inferTopics(tweet)
+    // Mapping a Map to pairs builds another Map, here keyed by similarity, so labels that
+    // share the exact same similarity collapse to a single entry before ranking.
+    val ranked = labelToTopicDist.map {
+      case (candidate, dist) => (Statistics.cosineSimilarity(inferred, dist), candidate)
+    }.toList.sorted.reverse
+    tweet match {
+      case GoldLabeledTweet(id, userid, features, goldLabel) =>
+        // The ranking is descending, so index 0 holds the most similar label.
+        SystemLabeledTweet(id, userid, features, goldLabel, SentimentLabel.unitSentiment(ranked(0)._2))
+    }
+  }
+
+  /**
+   * Labels every tweet in `testSet` and collects evaluation statistics, logging progress
+   * on every 100th tweet.
+   */
+  def evaluate(model: TopicModel, testSet: scala.List[GoldLabeledTweet]) = {
+    val total = testSet.length
+    logger.debug("entering evaluation with %d items in the test set".format(total))
+    val perLabelTopics: Map[SentimentLabel.Type, List[Double]] = model.getTopicsPerTarget
+    val startMillis = System.currentTimeMillis()
+    val stats = Statistics.getEvalStats("Similarity Topic", testSet.zipWithIndex.map {
+      case (tweet, index) =>
+        if (index % 100 == 0) {
+          logger.debug("%.0f%% remaining; average label time = %fs".format(
+            (1.0 - (index + 1).toDouble / total.toDouble) * 100,
+            (System.currentTimeMillis() - startMillis).toDouble / (index + 1.0) / 1000.0))
+        }
+        label(model, tweet, perLabelTopics)
+    })
+    logger.info(stats.toString)
+    stats
+  }
+}

src/main/scala/updown/util/HPAMTopicModel.scala

+package updown.util
+
+import cc.mallet.types._
+import scala.collection.JavaConversions._
+
+import updown.data.{SentimentLabel, GoldLabeledTweet}
+import java.io.File
+import cc.mallet.topics.HierarchicalPAM
+import cc.mallet.util.Randoms
+import scala.Predef._
+import scala._
+
+/**
+ * [[TopicModel]] backed by Mallet's `HierarchicalPAM`, trained in the constructor on `tweets`.
+ *
+ * Work in progress: only `toString` and `save` consult the trained model. The topic accessors
+ * and `inferTopics` return empty collections; the commented-out bodies use members of a
+ * different Mallet model and are kept as a porting reference.
+ *
+ * @param tweets        training tweets, converted to Mallet instances via `getInstanceList`
+ * @param numTopics     second argument to the `HierarchicalPAM` constructor
+ *                      (NOTE(review): presumably the sub-topic count; confirm against Mallet)
+ * @param numIterations number of estimation iterations passed to `estimate`
+ */
+class HPAMTopicModel(tweets: List[GoldLabeledTweet], numTopics: Int, numIterations: Int/*, alphaSum: Double, beta: Double*/) extends TopicModel {
+  // Not referenced anywhere in this class.
+  private final val MAX_THREADS = 20
+
+  private val (alphabet, instanceList) = getInstanceList(tweets)
+  // NOTE(review): the literal arguments are presumably (superTopics = 3, subTopics = numTopics,
+  // superTopicBalance = 1.0, subTopicBalance = 1.0); confirm against Mallet's constructor.
+  // The reference is never reassigned, so it is a val.
+  private val model = new HierarchicalPAM(3, numTopics, 1.0,1.0)
+  // The training instances are passed twice: as the documents and as the second instance list.
+  // NOTE(review): 50, 10, 100 and "" are presumably the show-topics interval, output-model
+  // interval, optimize interval and output model filename; confirm against Mallet's signature.
+  model.estimate(instanceList,instanceList,numIterations,50,10,100,"",new Randoms())
+//  ParallelTopicModel.logger.setLevel(Level.OFF)
+
+  /** Returns Mallet's `printTopWords(20, true)` rendering of the trained model. */
+  override def toString(): String = {
+    model.printTopWords(20,true)
+  }
+
+  /** Not yet implemented for HPAM: always returns an empty list. */
+  def getTopics: List[Topic] = {
+/*
+    val priors = getTopicPriors
+    val topicsToAlphaIds = scala.collection.mutable.Map[Int,List[(Int,Double)]]()
+
+    val wordsTopicsCounts = (for ((topicCounts, typeIndex) <- model.typeTopicCounts.zipWithIndex) yield {
+      val word = alphabet.lookupObject(typeIndex).toString
+      (for (topicCount <- topicCounts) yield {
+        val topic = topicCount & model.topicMask
+        val count = topicCount >> model.topicBits
+        (word,topic,count)
+      }).iterator
+    }).iterator.flatten.toList
+
+
+    val res = (for (i <- 0 until numTopics) yield {
+      val wordCounts = wordsTopicsCounts.filter((triple)=>(triple._2==i && triple._3!=0))
+      val sum = wordCounts.map((triple)=>triple._3).reduce(_ + _)
+      Topic(priors(i), wordCounts.map((triple)=>(triple._1->(triple._3.toDouble/sum))).toMap)
+    }).toList
+
+    res
+*/
+    List[Topic]()
+  }
+
+  /** Not yet implemented for HPAM: always returns an empty list. */
+  def getTopicPriors: List[Double] = {
+/*
+    val result: Array[Double] = new Array[Double](numTopics)
+    var sum = 0.0
+    for (topicAssignment <- model.getData) {
+      val temp: Array[Double] = model.getTopicProbabilities(topicAssignment.topicSequence)
+      for (i <- 0 until result.length) {
+        result(i) += temp(i)
+        sum += temp(i)
+      }
+    }
+    result.toList.map((double: Double) => double / sum)
+*/
+    List[Double]()
+  }
+
+  /** Not yet implemented for HPAM: always returns an empty list. */
+  def getTopicsPerInstance: List[(String, List[Double])] = {
+  /*  (for (topicAssignment <- model.getData) yield {
+      val source = topicAssignment.instance.getName.toString
+      val dist = model.getTopicProbabilities(topicAssignment.topicSequence)
+      (source, dist.toList)
+    }).toList
+  */
+    List[(String,List[Double])]()
+  }
+
+  /** Not yet implemented for HPAM: always returns an empty map. */
+  def getTopicsPerTarget: Map[SentimentLabel.Type, List[Double]] = {
+  /*  val result = scala.collection.mutable.Map[SentimentLabel.Type,List[Double]]()
+    for (topicAssignment <- model.getData) {
+      val target = topicAssignment.instance.getTarget.asInstanceOf[SentimentLabel.Type]
+      result(target) = result.getOrElse(target, (new Array[Double](numTopics)).toList).zip(model.getTopicProbabilities(topicAssignment.topicSequence).toList).map((pair) => pair._1+pair._2)
+    }
+    (for ((key, value) <- result) yield {
+      val sum = value.reduce( _ + _ )
+      (key->value.map(_ / sum))
+    }).toMap
+  */
+    Map[SentimentLabel.Type,List[Double]]()
+  }
+
+  /** Not yet implemented for HPAM: ignores `tweet` and always returns an empty list. */
+  def inferTopics(tweet: GoldLabeledTweet): List[Double] = {
+    /*val instance = tweet match {
+        case GoldLabeledTweet(id, userid, features, goldLabel) =>
+          val featureSequence = new FeatureSequence(alphabet, features.length)
+          for (feature <- features) {
+            featureSequence.add(feature)
+          }
+          new Instance(featureSequence, goldLabel, id, null)
+      }
+    model.getInferencer.getSampledDistribution(instance, numIterations, 1, 1).toList*/
+    List[Double]()
+  }
+
+  /** Writes the model state to the file at `filename` via Mallet's `printState`. */
+  def save(filename: String): Unit = {
+    model.printState(new File(filename))
+  }
+}
+
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.