vvcephei avatar vvcephei committed de88105

thesis work.

Comments (0)

Files changed (11)

config/log4j.properties

-log4j.rootLogger=INFO, stderr
+log4j.rootLogger=DEBUG, stderr
 log4j.appender.stderr=org.apache.log4j.ConsoleAppender
 log4j.appender.stderr.layout=org.apache.log4j.PatternLayout
 

src/main/scala/updown/app/experiment/NFoldExperiment.scala

 import updown.data.io.TweetFeatureReader
 import updown.data.{SentimentLabel, GoldLabeledTweet}
 import org.clapper.argot.ArgotParser._
-import org.clapper.argot.{ArgotUsageException, ArgotParser}
 import org.clapper.argot.ArgotConverters._
 import com.weiglewilczek.slf4s.Logging
 import updown.util.Statistics
+import org.clapper.argot.{SingleValueOption, ArgotUsageException, ArgotParser}
 
 abstract class NFoldExperiment extends Logging {
   // this exists purely to make the ArgotConverters appear used to IDEA
   convertByte _
+  val parser = new ArgotParser(this.getClass.getName, preUsage = Some("Updown"))
+  val goldInputFile = parser.option[String](List("g", "gold"), "gold", "gold labeled input")
+  val n = parser.option[Int](List("n", "folds"), "FOLDS", "the number of folds for the experiment (default 10)")
+  var experimentalRun = 0
 
   def doExperiment(testSet: List[GoldLabeledTweet], trainSet: List[GoldLabeledTweet]):
-    (Double, List[(updown.data.SentimentLabel.Type, Double, Double, Double)])
+  (Double, List[(updown.data.SentimentLabel.Type, Double, Double, Double)])
+  def after():Int
 
   def generateTrials(inputFile: String, nFolds: Int): Iterator[(List[GoldLabeledTweet], List[GoldLabeledTweet])] = {
     val foldsToTweets = (for ((fold, list) <- TweetFeatureReader(inputFile).zipWithIndex.groupBy((pair) => {
   }
 
   def main(args: Array[String]) {
-    val parser = new ArgotParser(this.getClass.getName, preUsage = Some("Updown"))
-    val goldInputFile = parser.option[String](List("g", "gold"), "gold", "gold labeled input")
-    val n = parser.option[Int](List("n", "folds"), "FOLDS", "the number of folds for the experiment (default 10)")
-
     try {
       parser.parse(args)
 
       val inputFile = goldInputFile.value.get
       val results =
         (for ((testSet, trainSet) <- generateTrials(inputFile, nFolds)) yield {
-          doExperiment(testSet, trainSet)
+          experimentalRun += 1
+          logger.debug("starting run "+experimentalRun)
+          val result = doExperiment(testSet, trainSet)
+          logger.debug("ending run "+experimentalRun)
+          result
         }).toList
 
-      logger.info("intermediate results:\n"+results.mkString("\n"))
+      logger.info("intermediate results:\n" + results.mkString("\n"))
       println("\n" + Statistics.reportResults(Statistics.averageResults(results)))
+      logger.debug("running cleanup code")
+      System.exit(after())
     }
     catch {
-      case e: ArgotUsageException => println(e.message); sys.exit(0)
+      case e: ArgotUsageException => println(e.message); sys.exit(1)
     }
   }
 }

src/main/scala/updown/app/experiment/maxent/NFoldMaxentExperiment.scala

     logger.info(Statistics.reportResults(res))
     res
   }
+  def after():Int=0
 }

src/main/scala/updown/app/experiment/topic/NFoldMajorityTopicExperiment.scala

       val neutralTopic = neutralDist(0)
         if (goodDist(0) != neutralTopic) {
           if (goodDist(0) != badDist(0)) {
-            (goodDist(0), neutralDist, badDist(0))
+            (goodDist(0), neutralTopic, badDist(0))
           } else {
             //then we have a pathological case
             logger.warn("pathological topic distribution: %s".format(labelToTopicDist.toString))

src/main/scala/updown/app/experiment/topic/NFoldSimilarityTopicExperiment.scala

 
 object NFoldSimilarityTopicExperiment extends NFoldTopicExperiment {
 
+  def labelNoop(model: TopicModel, tweet: GoldLabeledTweet, labelToTopicDist: Map[SentimentLabel.Type,List[Double]]): SystemLabeledTweet = {
+    val topicDistribution = model.inferTopics(tweet)
+    val similarities = (for ((k,v) <- labelToTopicDist) yield (Statistics.cosineSimilarity(topicDistribution, v), k)).toList.sorted.reverse
+    SentimentLabel.unitSentiment(similarities(0)._2)
+    val GoldLabeledTweet(id, userid, features, goldLabel) = tweet
+    SystemLabeledTweet(id, userid, features, goldLabel,SentimentLabel.Abstained)
+  }
+
   def label(model: TopicModel, tweet: GoldLabeledTweet, labelToTopicDist: Map[SentimentLabel.Type,List[Double]]): SystemLabeledTweet = {
     val topicDistribution = model.inferTopics(tweet)
-    logger.debug("inferred topicDist: "+topicDistribution.toString)
     val similarities = (for ((k,v) <- labelToTopicDist) yield (Statistics.cosineSimilarity(topicDistribution, v), k)).toList.sorted.reverse
-    logger.debug("similarities: "+similarities.toString)
     val GoldLabeledTweet(id, userid, features, goldLabel) = tweet
-
-    SystemLabeledTweet(id, userid, features, goldLabel,similarities(0)._2)
+    SystemLabeledTweet(id, userid, features, goldLabel,SentimentLabel.unitSentiment(similarities(0)._2))
   }
 
   def evaluate(model: TopicModel, testSet: scala.List[GoldLabeledTweet]): (Double, scala.List[(SentimentLabel.Type, Double, Double, Double)]) = {
-    val res = Statistics.getEvalStats(for (tweet <- testSet) yield {
-      label(model, tweet, model.getTopicsPerTarget)
+    logger.debug("entering evaluation with %d items in the test set".format(testSet.length))
+    val topicsPerTarget: Map[SentimentLabel.Type, List[Double]] = model.getTopicsPerTarget
+    val start = System.currentTimeMillis()
+    val res = Statistics.getEvalStats(for ((tweet,i) <- testSet.zipWithIndex) yield {
+      if (i%100 == 0) {
+        logger.debug("%.0f%% remaining; average label time = %fs".format((1.0-(i+1).toDouble/testSet.length.toDouble)*100, (System.currentTimeMillis()-start).toDouble/(i+1.0) /1000.0))
+      }
+      label(model, tweet, topicsPerTarget)
     })
     logger.debug(res.toString)
     logger.info(Statistics.reportResults(res))

src/main/scala/updown/app/experiment/topic/NFoldTopicExperiment.scala

 package updown.app.experiment.topic
 
 import updown.data.{SystemLabeledTweet, GoldLabeledTweet, SentimentLabel}
-import updown.util.{Statistics, LDATopicModel, TopicModel}
 import updown.app.experiment.NFoldExperiment
+import org.clapper.argot.{SingleValueOption, ArgotParser}
+import org.clapper.argot.ArgotParser._
+import org.clapper.argot.ArgotConverters._
+import java.io.{FileWriter, BufferedWriter, File}
+import updown.util.{WordleUtils, Statistics, LDATopicModel, TopicModel}
 
 abstract class NFoldTopicExperiment extends NFoldExperiment {
+  var iterations = 1000
+  var alpha = 30
+  var beta = 0.1
+  var numTopics = 3
+  val fileSeparator = System.getProperty("file.separator")
+
+  var childProcesses = List[Process]()
+
+  val iterationOption = parser.option[Int](List("iterations"), "INT", "the number of iterations for the training the topicModel")
+  val alphaOption = parser.option[Int](List("alpha"), "INT", "the symmetric alpha hyperparameter for LDA")
+  val betaOption = parser.option[Double](List("beta"), "DOUBLE", "the symmetric beta hyperparameter for LDA")
+  val numTopicsOption = parser.option[Int](List("numTopics"), "INT", "the number of topics for LDA")
+
+  val outputOption = parser.option[String](List("o", "output"), "DIR", "the directory to dump topics into")
+  val wordleOption = parser.flag[Boolean](List("w", "wordle"), "generate wordles for the topics (requires -o DIR) " +
+    "(requires that you have downloaded IBM's word cloud generator)")
+  val wordleJarOption = parser.option[String](List("wordleJar"), "PATH", ("the path to IBM's word cloud generator " +
+    "(default %s)").format(WordleUtils.defaultJarPath))
+  val wordleConfigOption = parser.option[String](List("wordleConfig"), "PATH", ("the path to the config file for IBM's " +
+    "word cloud generator (default %s)").format(WordleUtils.defaultConfigurationPath))
 
   def evaluate(model: TopicModel, testSet: scala.List[GoldLabeledTweet]):
   (Double, scala.List[(SentimentLabel.Type, Double, Double, Double)])
 
+  def doOutput(model: TopicModel) {
+    if (outputOption.value.isDefined) {
+      val file = new File(outputOption.value.get + fileSeparator + "run" + experimentalRun)
+      file.mkdirs()
+      val outputDirForThisRun = file.getAbsolutePath
+      val summary = new BufferedWriter((new FileWriter((outputDirForThisRun + fileSeparator + "summary"))))
+      summary.write("%s\n".format(model.getTopicPriors.zipWithIndex.map {
+        case (a, b) => "Topic %s:%6.3f".format(b, a)
+      }.mkString("\n")))
+      summary.write("%s\n".format(model.getTopicsPerTarget.toList.map {
+        case (a, b) => "Label %9s:%s".format(SentimentLabel.toEnglishName(a), b.map {
+          "%7.3f".format(_)
+        }.mkString(""))
+      }.mkString("\n")))
+      summary.close()
+      val outputFiles =
+        (for ((topic, i) <- model.getTopics.zipWithIndex) yield {
+          val outFile = new File(outputDirForThisRun + fileSeparator + "topic" + i)
+          val output = new BufferedWriter(new FileWriter(outFile))
+          output.write("%s\n".format(topic.distribution.toList.sortBy((pair) => (1 - pair._2)).map {
+            case (a, b) => "%s\t%s".format(a, b)
+          }.mkString("\n")))
+          output.close()
+          outFile.getAbsolutePath
+        })
+      if (wordleOption.value.isDefined) {
+        logger.debug("making wordles and report")
+        val index = new BufferedWriter((new FileWriter((outputDirForThisRun + fileSeparator + "index.html"))))
+        index.write("<head><style>\n%s\n</style></head>\n".format(List(
+        "div.bordered{border-style: solid none none none; padding: 5px; border-width: 1px; border-color: gray;}",
+        "div#wordles{display:block; clear:both; padding-top:20px;}",
+        "div.wordle{float:left;width:45%;border-style:solid; border-width:1px; border-color:gray; margin:2px;}",
+        "div.wordle img{width: 100%;}",
+        ".table{display:block; clear: both;}",
+        ".row{display:block;clear:both;}",
+        ".cell{display:block;float:left;}",
+        ".values{display:block;float:left;width:300px;}",
+        ".value{display:block;float:left;width:60px;}",
+        "div.topicFreq .title{width:100px;}",
+        "div.labelDistribution .title{width:150px;}"
+        ).mkString("\n")))
+        index.write("<body>")
+        index.write("<div id=topicDistribution class=\"bordered table\">%s</div>\n".format(model.getTopicPriors.zipWithIndex.map {
+          case (a, b) => "<div class=\"topicFreq row\"><span class=\"title cell\">Topic %s</span><span class=\"value cell\">%6.3f</span></div>".format(b, a)
+        }.mkString("\n")))
+        index.write(("<div id=labelDistributions class=\"bordered table\">" +
+          "<div class=\"labelDistribution row\"><span class=\"title cell\">topic</span><span class=\"values cell\"><span class=\"value\">  0</span><span class=\"value\">  1</span><span class=\"value\">  2</span></span></div>" +
+          "%s</div>\n").format(model.getTopicsPerTarget.toList.sortBy({case(a,b)=>SentimentLabel.ordinality(a)}).map {
+          case (a, b) => "<div class=\"labelDistribution row\"><span class=\"title cell\">Label %9s</span><span class=\"values cell\">%s</span></div>".format(SentimentLabel.toEnglishName(a), b.map {
+            "<span class=value>%7.3f</span>".format(_)
+          }.mkString(""))
+        }.mkString("\n")))
+        val jarPath = if (wordleJarOption.value.isDefined) wordleJarOption.value.get else WordleUtils.defaultJarPath
+        val configPath = if (wordleConfigOption.value.isDefined) wordleConfigOption.value.get else WordleUtils.defaultConfigurationPath
+        index.write("<div id=wordles class=bordered>")
+        childProcesses = childProcesses ::: WordleUtils.makeWordles(jarPath, configPath, outputFiles, Some(index))
+        index.write("</div></body>")
+        index.close()
+        logger.debug("done making report and initializing wordles")
+      }
+    }
+  }
+
   def doExperiment(testSet: List[GoldLabeledTweet], trainSet: List[GoldLabeledTweet]) = {
-    val model: TopicModel = new LDATopicModel(trainSet, 3, 1000, 100, 0.1)
+    if (iterationOption.value.isDefined) {
+      iterations = iterationOption.value.get
+    }
+    if (alphaOption.value.isDefined) {
+      alpha = alphaOption.value.get
+    }
+    if (betaOption.value.isDefined) {
+      beta = betaOption.value.get
+    }
+    if (numTopicsOption.value.isDefined) {
+      numTopics = numTopicsOption.value.get
+    }
 
-    logger.info("topic distribution:\n     :" + model.getTopicPriors)
-    logger.info({
+
+    val model: TopicModel = new LDATopicModel(trainSet, numTopics, iterations, alpha * numTopics, beta)
+    logger.debug("topic distribution:\n     :" + model.getTopicPriors)
+    logger.debug({
       val labelToTopicDist = model.getTopicsPerTarget
       "topic distribution over labels:\n" + (for ((k, v) <- labelToTopicDist) yield "%5s:%s".format(k, v)).mkString("\n")
     })
-    logger.info({
+    logger.debug({
       val topics = model.getTopics
       "topic distributions\n" +
-        (for (i <- 0 until 3) yield "%5s: Topic(%s,%s)".format(i, topics(i).prior, topics(i).distribution.toList.sortBy((pair) => (1 - pair._2)))).mkString("\n")
+        (for (i <- 0 until 3) yield "%5s: Topic(%s,%s)".format(i, topics(i).prior, topics(i).distribution.toList.sortBy((pair) => (1 - pair._2)).take(10))).mkString("\n")
     })
+    doOutput(model)
     evaluate(model, testSet)
   }
+
+  def after(): Int = {
+    if (childProcesses.length > 0) {
+      logger.info("waiting for child processes...")
+      WordleUtils.waitForChildren(childProcesses)
+    } else {
+      0
+    }
+  }
 }

src/main/scala/updown/data/SentimentLabel.scala

 
 object SentimentLabel extends Enumeration{
   type Type = Value
+  val Positive2 = Value("2")
   val Positive = Value("1")
   val Neutral = Value("0")
   val Negative = Value("-1")
+  val Negative2 = Value("-2")
   val Abstained = Value("A")
   // this is the end of the enum definition. the rest of this object just demonstrates other
   //  stuff you can do.
 
   private val _POS_NAME = "positive"
+  private val _POS_NAME2 = "superPositive"
   private val _NEG_NAME = "negative"
+  private val _NEG_NAME2 = "superNegative"
   private val _NEU_NAME = "neutral"
   private val _ABS_NAME = "abstained"
 
+  def unitSentiment(label:SentimentLabel.Type) = {
+    label match {
+      case Positive2 => Positive
+      case Negative2 => Negative
+      case x  => x
+    }
+  }
+
+  def ordinality(label:SentimentLabel.Type) = {
+    label match {
+      case Abstained => 0
+      case Negative2 => 1
+      case Negative => 2
+      case Neutral => 3
+      case Positive => 4
+      case Positive2 => 5
+    }
+  }
+
   def toEnglishName(label:SentimentLabel.Type) = {
     label match {
       case Positive => _POS_NAME
+      case Positive2 => _POS_NAME2
       case Negative => _NEG_NAME
+      case Negative2 => _NEG_NAME2
       case Neutral  => _NEU_NAME
       case Abstained  => _ABS_NAME
     }
   def fromEnglishName(name:String) = {
     name match {
       case `_POS_NAME` => Positive
+      case `_POS_NAME2` => Positive2
       case `_NEG_NAME` => Negative
+      case `_NEG_NAME2` => Negative2
       case `_NEU_NAME` => Neutral
       case `_ABS_NAME` => Abstained
     }
   def figureItOut(name:String) = {
     name.toLowerCase match {
       case `_POS_NAME` |"pos"|"p"|"+"|"1" => Positive
+      case `_POS_NAME2` |"pos2"|"2" => Positive2
       case `_NEG_NAME`|"neg"|"-"|"-1" => Negative
+      case `_NEG_NAME2`|"neg2"|"-2" => Negative2
       case `_NEU_NAME`|"neu"|"neut"|"0" => Neutral
       case `_ABS_NAME` => Abstained
     }

src/main/scala/updown/preproc/GenericPreprocessor.scala

   def main(args: Array[String]) {
     logger.debug(args.toList.toString)
     // don't forget that this is linked to the pipeStages dict below
-    val availablePipes = Set("addBiGrams", "twokenize", "twokenizeSkipGtOneGrams", "removeStopwords", "splitSpace", "filterAlpha")
+    val availablePipes = Set("lowerCase", "addBiGrams", "twokenize", "twokenizeSkipGtOneGrams", "removeStopwords", "splitSpace", "filterAlpha", "filterAlphaQuote")
 
     // PARSE ARGS
     val parser = new ArgotParser("updown run updown.preproc.PreprocStanfordTweets", preUsage = Some("Updown"))
 
       val pipeStages: Map[String, (List[String]) => List[String]] =
         Map[String, (List[String]) => List[String]](
+          ("lowerCase" -> TokenizationPipes.toLowercase),
           ("addBiGrams" -> TokenizationPipes.addNGrams(2)),
           ("twokenize" -> TokenizationPipes.twokenize),
           ("twokenizeSkipGtOneGrams" -> TokenizationPipes.twokenizeSkipGtOneGrams),
           ("removeStopwords" -> TokenizationPipes.filterOnStopset(stopSet)),
           ("filterAlpha") -> TokenizationPipes.filterOnRegex("\\p{Alpha}+"),
+          ("filterAlphaQuote") -> TokenizationPipes.filterOnRegex("(\\p{Alpha}|')+"),
           ("splitSpace" -> TokenizationPipes.splitOnDelimiter(" "))
         )
       // had to predefine the available pipes so they could be printed in the usage string, before the stopset can be parsed.

src/main/scala/updown/preproc/PreprocTSVFiles.scala

+package updown.preproc
+
+import updown.data.SentimentLabel
+
+/**
+ * This preprocessor is suitable for any file that contains one instance per line with no labels or ids
+ */
+object PreprocTSVFiles extends GenericPreprocessor {
+
+  val lineRegex = "(\\S*)\\s*(\\S*)\\s*(.*)".r
+
+  def getInstanceIterator(fileName: String, polarity: String): Iterator[(String, String, SentimentLabel.Type, String)] = {
+    try {
+      for (line <- scala.io.Source.fromFile(fileName, "UTF-8").getLines) yield {
+        val lineRegex(id, label, text) = line
+        logger.debug("id:%s label:%s opol:%s pol:%s".format(id, label, polarity, SentimentLabel.figureItOut(polarity)))
+        (id, "reviewer", SentimentLabel.figureItOut(polarity), text.replace("|", ""))
+      }
+    } catch {
+      case e: MatchError =>
+        logger.error("Couldn't figure out what sentiment '%s' is supposed to be." +
+          " Try using 'pos', 'neg', or 'neu'. Skipping %s...".format(polarity, fileName))
+        Iterator[(String, String, SentimentLabel.Type, String)]()
+    }
+  }
+}

src/main/scala/updown/util/Statistics.scala

 
   def averageResults(results: scala.List[(Double, scala.List[(SentimentLabel.Type, Double, Double, Double)])]): (Double, scala.List[(SentimentLabel.Type, Double, Double, Double)]) = {
     var avgAccuracy = 0.0
-    var avgLabelResultsList = initializeAverageList(results(0)._2)
+    var avgLabelResultsList = initializeAverageList(results(0)._2).sortBy({case(x,_,_,_)=>SentimentLabel.ordinality(x)})
     for ((accuracy, labelResults) <- results) {
       avgAccuracy += accuracy
-      avgLabelResultsList = addAll(labelResults, avgLabelResultsList)
+      avgLabelResultsList = addAll(labelResults.sortBy({case(x,_,_,_)=>SentimentLabel.ordinality(x)}), avgLabelResultsList)
     }
     avgAccuracy /= results.length
     avgLabelResultsList = divideBy(avgLabelResultsList, results.length)
 
   def getEvalStats(tweets: scala.List[SystemLabeledTweet]): (Double, List[(SentimentLabel.Type, Double, Double, Double)]) = {
     val (correct, total) = tabulate(tweets)
-    logger.debug("goldLabels: %s".format((tweets.map((tweet) => tweet.goldLabel))))
-    logger.debug("systemLabels: %s".format((tweets.map((tweet) => tweet.systemLabel))))
     (accurracy(correct, total.toDouble),
-      (for (label <- SentimentLabel.values) yield {
+      (for (label <- List(SentimentLabel.Negative, SentimentLabel.Neutral, SentimentLabel.Positive)) yield {
         val goldList = tweets.filter((tweet) => tweet.goldLabel == label)
         val systemList = tweets.filter((tweet) => tweet.systemLabel == label)
         val labelPrecision = precision(

src/main/scala/updown/util/Twokenize.scala

      ).flatten
 
     // Split based on special patterns (like contractions) and check all tokens are non empty
-    zippedStr.map(splitToken(_)).flatten.filter(_.length > 0)
+    zippedStr.map(splitToken(_)).flatten.map(replaceApostrophes(_)).filter(_.length > 0)
   }  
 
+  def replaceApostrophes(str:String): String = {
+    str.replaceAll("'+","'") match {
+      case "'s"=>"is"
+      case "'m"=>"am"
+      case "n't"=>"not"
+      case "'ve"=>"have"
+      case "'re"=>"are"
+      case "'"=>""
+      case "'ll"=>"will"
+      case "'d"=>"would"
+      case x=>x
+    }
+  }
+
   // 'foo' => ' foo '
   def splitEdgePunct (input: String) = {
     val splitLeft  = EdgePunctLeft.replaceAllIn(input,"$1$2 $3")
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.