Commits

vvcephei  committed 5de4834

implemented experiments for the DP corpus

  • Parent commits bf1c88b

Files changed (3)

File bin/run-dp.sh

+#!/bin/bash
+
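+# Usage: run-dp.sh {preproc|eval}
+#   preproc - run PreprocDPArticles over each fold's train and test split with
+#             both the "twokenize" and "basicTokenize" text pipelines
+#   eval    - run SplitLDAMaxentExperiment (k = 25, 50, 75, 100) and
+#             SplitMaxentExperiment on each preprocessed fold and pipeline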
+path=/data/deathpenalty/umd_deathpenalty_corpus/folds
+out=out/data
+
+CMD=$1
+shift
+
+case $CMD in 
+  preproc)
+    for fold in $path/*; do
+      fold=$(basename $fold)
+      for mode in "test" "train"; do
+        echo "generating $out/dp.twok.$fold.$mode.updown"
+        updown run updown.preproc.impl.PreprocDPArticles --textPipeline "twokenize" -f $out/dp.twok.$fold.$mode.updown $path/$fold/$mode/*/*
+        echo "generating $out/dp.basic.$fold.$mode.updown"
+        updown run updown.preproc.impl.PreprocDPArticles --textPipeline "basicTokenize" -f $out/dp.basic.$fold.$mode.updown $path/$fold/$mode/*/*
+      done
+    done
+    ;;
+  eval)
+    for fold in $path/*; do
+      fold=$(basename $fold)
+      for pipe in "twok" "basic" ; do
+        for k in 25 50 75 100; do
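+          # symmetric alpha scaled with k so that alpha * numTopics stays at 50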
+          alpha=$(awk "BEGIN { print 50 / $k }")
+          updown run updown.app.experiment.topic.lda.SplitLDAMaxentExperiment --numTopics $k --alpha $alpha --beta 0.01 --iterations 100 --name Dp_"$fold"_"$pipe"Lda$k -G $out/dp.$pipe.$fold.train.updown -g $out/dp.$pipe.$fold.test.updown
+        done
+        # SplitMaxentExperiment does not depend on k, so run it once per fold and pipeline
+        updown run updown.app.experiment.topic.maxent.SplitMaxentExperiment --name Dp_"$fold"_"$pipe"Maxent -G $out/dp.$pipe.$fold.train.updown -g $out/dp.$pipe.$fold.test.updown
+      done
+    done
+    ;;
+esac

File src/main/scala/updown/app/experiment/topic/lda/SplitLDAExperiment.scala

+package updown.app.experiment.topic.lda
+
+import updown.data.{SystemLabeledTweet, GoldLabeledTweet, SentimentLabel}
+import org.clapper.argot.ArgotParser._
+import org.clapper.argot.ArgotConverters._
+import java.io.{FileWriter, BufferedWriter, File}
+import updown.util.{WordleUtils, LDATopicModel, TopicModel}
+import updown.app.experiment.SplitExperiment
+import java.util.Arrays
+
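+/**
+ * Base class for train/test-split experiments backed by an LDA topic model.
+ * Parses the LDA hyperparameter options, trains an LDATopicModel on the
+ * training split, and optionally writes per-topic output and wordle reports.
+ */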
+abstract class SplitLDAExperiment extends SplitExperiment {
+  var iterations = 1000
+  var alpha = 30.0
+  var beta = 0.1
+  var numTopics = 3
+  val fileSeparator = System.getProperty("file.separator")
+
+  var childProcesses = List[Process]()
+
+  val iterationOption = parser.option[Int](List("iterations"), "INT", "the number of iterations for training the topic model")
+  val alphaOption = parser.option[Double](List("alpha"), "DOUBLE", "the symmetric alpha hyperparameter for LDA")
+  val betaOption = parser.option[Double](List("beta"), "DOUBLE", "the symmetric beta hyperparameter for LDA")
+  val numTopicsOption = parser.option[Int](List("numTopics"), "INT", "the number of topics for LDA")
+
+  val outputOption = parser.option[String](List("o", "output"), "DIR", "the directory to dump topics into")
+  val wordleOption = parser.flag[Boolean](List("w", "wordle"), "generate wordles for the topics (requires -o DIR) " +
+    "(requires that you have downloaded IBM's word cloud generator)")
+  val wordleJarOption = parser.option[String](List("wordleJar"), "PATH", ("the path to IBM's word cloud generator " +
+    "(default %s)").format(WordleUtils.defaultJarPath))
+  val wordleConfigOption = parser.option[String](List("wordleConfig"), "PATH", ("the path to the config file for IBM's " +
+    "word cloud generator (default %s)").format(WordleUtils.defaultConfigurationPath))
+
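+  /** Labels the test split using the trained topic model; implemented by concrete experiments. */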
+  def evaluate(model: TopicModel, testSet: scala.List[GoldLabeledTweet]): List[SystemLabeledTweet]
+
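+  /**
+   * Writes a summary of the topic priors and per-label topic distributions,
+   * one file of word weights per topic, and, when -w is given, an HTML report
+   * with wordles rendered by IBM's word cloud generator.
+   */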
+  def doOutput(model: TopicModel) {
+    if (outputOption.value.isDefined) {
+      val file = new File(outputOption.value.get)
+      file.mkdirs()
+      val outputDirForThisRun = file.getAbsolutePath
+      val summary = new BufferedWriter(new FileWriter(outputDirForThisRun + fileSeparator + "summary"))
+      summary.write("%s\n".format(model.getTopicPriors.zipWithIndex.map {
+        case (a, b) => "Topic %s:%6.3f".format(b, a)
+      }.mkString("\n")))
+      summary.write("%s\n".format(model.getLabelsToTopicDist.toList.map {
+        case (a, b) => "Label %9s:%s".format(SentimentLabel.toEnglishName(a), b.map {
+          "%7.3f".format(_)
+        }.mkString(""))
+      }.mkString("\n")))
+      summary.close()
+      val outputFiles =
+        (for ((topic, i) <- model.getTopics.zipWithIndex) yield {
+          val outFile = new File(outputDirForThisRun + fileSeparator + "topic" + i)
+          val output = new BufferedWriter(new FileWriter(outFile))
+          output.write("%s\n".format(topic.distribution.toList.sortBy((pair) => (1 - pair._2)).map {
+            case (a, b) => "%s\t%s".format(a, b)
+          }.mkString("\n")))
+          output.close()
+          outFile.getAbsolutePath
+        })
+      if (wordleOption.value.isDefined) {
+        logger.debug("making wordles and report")
+        val index = new BufferedWriter(new FileWriter(outputDirForThisRun + fileSeparator + "index.html"))
+        index.write("<head><style>\n%s\n</style></head>\n".format(List(
+          "div.bordered{border-style: solid none none none; padding: 5px; border-width: 1px; border-color: gray;}",
+          "div#wordles{display:block; clear:both; padding-top:20px;}",
+          "div.wordle{float:left;width:45%;border-style:solid; border-width:1px; border-color:gray; margin:2px;}",
+          "div.wordle img{width: 100%;}",
+          ".table{display:block; clear: both;}",
+          ".row{display:block;clear:both;}",
+          ".cell{display:block;float:left;}",
+          ".values{display:block;float:left;width:300px;}",
+          ".value{display:block;float:left;width:60px;}",
+          "div.topicFreq .title{width:100px;}",
+          "div.labelDistribution .title{width:150px;}"
+        ).mkString("\n")))
+        index.write("<body>")
+        index.write("<div id=topicDistribution class=\"bordered table\">%s</div>\n".format(model.getTopicPriors.zipWithIndex.map {
+          case (a, b) => "<div class=\"topicFreq row\"><span class=\"title cell\">Topic %s</span><span class=\"value cell\">%6.3f</span></div>".format(b, a)
+        }.mkString("\n")))
+        index.write(("<div id=labelDistributions class=\"bordered table\">" +
+          "<div class=\"labelDistribution row\"><span class=\"title cell\">topic</span><span class=\"values cell\"><span class=\"value\">  0</span><span class=\"value\">  1</span><span class=\"value\">  2</span></span></div>" +
+          "%s</div>\n").format(model.getLabelsToTopicDist.toList.sortBy({
+          case (a, b) => SentimentLabel.ordinality(a)
+        }).map {
+          case (a, b) => "<div class=\"labelDistribution row\"><span class=\"title cell\">Label %9s</span><span class=\"values cell\">%s</span></div>".format(SentimentLabel.toEnglishName(a), b.map {
+            "<span class=value>%7.3f</span>".format(_)
+          }.mkString(""))
+        }.mkString("\n")))
+        val jarPath = if (wordleJarOption.value.isDefined) wordleJarOption.value.get else WordleUtils.defaultJarPath
+        val configPath = if (wordleConfigOption.value.isDefined) wordleConfigOption.value.get else WordleUtils.defaultConfigurationPath
+        index.write("<div id=wordles class=bordered>")
+        childProcesses = childProcesses ::: WordleUtils.makeWordles(jarPath, configPath, outputFiles, Some(index))
+        index.write("</div></body>")
+        index.close()
+        logger.debug("done making report and initializing wordles")
+      }
+    }
+  }
+
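+  /**
+   * Reads the hyperparameter options, trains an LDATopicModel on the training
+   * split (capping alphaSum to work around the Mallet bug noted below), writes
+   * any requested output, and delegates labeling of the test split to evaluate().
+   */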
+  def doExperiment(testSet: List[GoldLabeledTweet], trainSet: List[GoldLabeledTweet]) = {
+    if (iterationOption.value.isDefined) {
+      iterations = iterationOption.value.get
+    }
+    if (alphaOption.value.isDefined) {
+      alpha = alphaOption.value.get
+    }
+    if (betaOption.value.isDefined) {
+      beta = betaOption.value.get
+    }
+    if (numTopicsOption.value.isDefined) {
+      numTopics = numTopicsOption.value.get
+    }
+
+    // Thanks to a bug in Mallet, we have to cap alphaSum
+    val alphaSum = 300.0 min (alpha * numTopics)
+
+
+    logger.debug("alphaSum: " + alphaSum)
+    val model: TopicModel = new LDATopicModel(trainSet, numTopics, iterations, alphaSum, beta)
+    logger.debug("topic distribution:\n     :" + Arrays.toString(model.getTopicPriors))
+    logger.debug({
+      val labelToTopicDist = model.getLabelsToTopicDist
+      "topic distribution over labels:\n" + (for ((k, v) <- labelToTopicDist) yield "%5s:%s".format(k, Arrays.toString(v))).mkString("\n")
+    })
+    logger.debug({
+      val topics = model.getTopics
+      "topic distributions\n" +
+        (for (i <- 0 until (3 min numTopics)) yield "%5s: Topic(%s,%s)".format(i, topics(i).prior, topics(i).distribution.toList.sortBy((pair) => (1 - pair._2)).take(10))).mkString("\n")
+    })
+    doOutput(model)
+    evaluate(model, testSet)
+  }
+
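+  /** Blocks until any wordle-generation child processes have finished. */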
+  def after(): Int = {
+    if (childProcesses.length > 0) {
+      logger.info("waiting for child processes...")
+      WordleUtils.waitForChildren(childProcesses)
+    } else {
+      0
+    }
+  }
+}

File src/main/scala/updown/app/experiment/topic/lda/SplitLDAMaxentExperiment.scala

+package updown.app.experiment.topic.lda
+
+import updown.data.{SystemLabeledTweet, GoldLabeledTweet, SentimentLabel}
+import updown.util.TopicModel
+import scala.Array
+import updown.app.experiment.topic.util.MaxentDiscriminant
+
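+/**
+ * Split LDA experiment that labels each test tweet by feeding its inferred
+ * topic distribution to a maxent discriminant built from the per-label topic
+ * distributions of the training split.
+ */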
+object SplitLDAMaxentExperiment extends SplitLDAExperiment with MaxentDiscriminant {
+
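+  /**
+   * Infers the topic distribution for a single tweet and applies the
+   * discriminant function to pick a sentiment label.
+   */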
+  def label(model: TopicModel, tweet: GoldLabeledTweet, discriminantFn: (Array[Float]) => (String, String)): SystemLabeledTweet = {
+    val topicDist: Array[Float] = model.inferTopics(tweet).map((item) => item.asInstanceOf[Float])
+    val (label: String, outcomes: String) = discriminantFn(topicDist)
+
+    val GoldLabeledTweet(id, userid, features, goldLabel) = tweet
+    logger.trace("labeling id:%s gold:%2s with label:%2s from outcomes:%s".format(id, goldLabel.toString, label.toString, outcomes))
+    SystemLabeledTweet(id, userid, features, goldLabel, SentimentLabel.figureItOut(label))
+  }
+
+
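+  /**
+   * Builds the discriminant function from the model's per-label topic
+   * distributions and labels every tweet in the test set, logging progress
+   * every 100 tweets.
+   */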
+  def evaluate(model: TopicModel, testSet: scala.List[GoldLabeledTweet]) = {
+    logger.debug("entering evaluation with %d items in the test set".format(testSet.length))
+    val labelsToTopicDists: Map[SentimentLabel.Type, List[Array[Double]]] = model.getLabelsToTopicDists
+    val discriminantFn = getDiscriminantFn(labelsToTopicDists)
+    val start = System.currentTimeMillis()
+
+    val res = (for ((tweet, i) <- testSet.zipWithIndex) yield {
+      if (i % 100 == 0) {
+        logger.debug("%.0f%% remaining; average label time = %fs".format((1.0 - (i + 1).toDouble / testSet.length.toDouble) * 100, (System.currentTimeMillis() - start).toDouble / (i + 1.0) / 1000.0))
+      }
+      label(model, tweet, discriminantFn)
+    }).toList
+    res
+  }
+}