Commits

vvcephei committed 5375559

working on replicating Bishal's results

Files changed (5)

 
 path=/data/deathpenalty/umd_deathpenalty_corpus/folds
 out=out/data
+stoplist=src/main/resources/eng/dictionary/stoplist.txt
 
 CMD=$1
 shift
         mode=$(basename $mode)
         outfile=$out/dp.$fold.$mode.updown
         echo "generating $outfile"
-        echo "\ttwok"
-        updown run updown.preproc.impl.PreprocDPArticles --textPipeline "twokenize" -f $out/dp.twok.$fold.$mode.updown $path/$fold/$mode/*/*
-        echo "\tbasic"
-        updown run updown.preproc.impl.PreprocDPArticles --textPipeline "basicTokenize" -f $out/dp.basic.$fold.$mode.updown $path/$fold/$mode/*/*
+        echo "STAT: $fold $mode twok_1600V"
+        updown run updown.preproc.impl.PreprocDPArticles --textPipeline "twokenize" --vocabSize 1600 -f $out/dp.twok_1600V.$fold.$mode.updown $path/$fold/$mode/*/*
+        echo "STAT: $fold $mode basic_1600V"
+        updown run updown.preproc.impl.PreprocDPArticles --textPipeline "basicTokenize" --vocabSize 1600 -f $out/dp.basic_1600V.$fold.$mode.updown $path/$fold/$mode/*/*
       done
     done
     ;;
   eval)
     for fold in $path/*; do
       fold=$(basename $fold)
-      for pipe in "twok" "basic" ; do
+      for pipe in "twok" "twok_stop" "twok_1600V" "basic" "basic_stop" "basic_1600V"; do
         for k in 25 50 75 100; do
           alpha=$( echo - | awk "{ print 50/$k }" )
-          updown run updown.app.experiment.topic.lda.SplitLDAMaxentExperiment --numTopics $k --alpha $alpha --beta 0.01 --iterations 100 --name Dp_"$fold"_"$pipe"Lda$k -G $out/dp.$pipe.$fold.train.updown -g $out/dp.$pipe.$fold.test.updown
-          updown run updown.app.experiment.topic.maxent.SplitMaxentExperiment --name Dp_"$fold"_"$pipe"Maxent -G $out/dp.$pipe.$fold.train.updown -g $out/dp.$pipe.$fold.test.updown
+          echo "STAT: $fold $pipe lda $k"
+          updown run updown.app.experiment.topic.lda.SplitLDAMaxentExperiment --numTopics $k --alpha $alpha --beta 0.01 --iterations 1000 --name Dp_"$fold"_"$pipe"Lda$k -G $out/dp.$pipe.$fold.train.updown -g $out/dp.$pipe.$fold.test.updown
         done
+        echo "STAT: $fold $pipe maxent"
+        updown run updown.app.experiment.maxent.SplitMaxentExperiment --name Dp_"$fold"_"$pipe"Maxent -G $out/dp.$pipe.$fold.train.updown -g $out/dp.$pipe.$fold.test.updown
       done
     done
     ;;
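
The awk one-liner in the eval loop encodes the common alpha = 50/k heuristic for the LDA document-topic prior. A minimal Scala sketch of the same arithmetic (illustrative only, not part of this commit):

object AlphaHeuristic {
  // alpha = 50 / numTopics: the heuristic the eval loop computes with awk
  def main(args: Array[String]) {
    for (k <- List(25, 50, 75, 100))
      println("k=%3d -> alpha=%.2f".format(k, 50.0 / k))
  }
}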

lib/mallet.jar

Binary file modified.

src/main/scala/updown/app/experiment/SplitExperiment.scala

 import org.clapper.argot.ArgotConverters._
 import com.weiglewilczek.slf4s.Logging
 import updown.util.Statistics
-import org.clapper.argot.{ArgotUsageException, ArgotParser}
 import updown.data.{SystemLabeledTweet, GoldLabeledTweet}
 import java.util.Arrays
+import org.clapper.argot.{MultiValueOption, ArgotUsageException, ArgotParser}
 
 abstract class SplitExperiment extends Experiment {
   // this exists purely to make the ArgotConverters appear used to IDEA
   convertByte _
-  val goldTrainSet = parser.option[String](List("G", "train"), "FILE", "gold labeled training data")
-  val goldTestSet = parser.option[String](List("g", "test"), "FILE", "gold labeled test data")
+  val goldTrainSet: MultiValueOption[String] = parser.multiOption[String](List("G", "train"), "FILE", "gold labeled training data")
+  val goldTestSet: MultiValueOption[String] = parser.multiOption[String](List("g", "test"), "FILE", "gold labeled test data")
   val targetsInputFileTest = parser.option[String](List("s", "targetsTest"), "targetsTestFile", "targets (TEST)")
 
   def doExperiment(testSet: List[GoldLabeledTweet], trainSet: List[GoldLabeledTweet]): List[SystemLabeledTweet]
   def main(args: Array[String]) {
     try {
       parser.parse(args)
-      val trainFileName =
-        goldTrainSet.value match {
-          case Some(filename) => filename
-          case None => parser.usage("You must specify a gold labeled training file")
-        }
-      val testFileName =
-        goldTestSet.value match {
-          case Some(filename) => filename
-          case None => parser.usage("You must specify a gold labeled test file via")
-        }
-      val result =
-      {
-          logger.debug("starting run")
-          val result = doExperiment(TweetFeatureReader(testFileName), TweetFeatureReader(trainFileName))
-          logger.debug("ending run")
-          result
+      val trainSet: List[GoldLabeledTweet] = goldTrainSet.value.toList.flatMap((s) => TweetFeatureReader(s))
+      val testSet: List[GoldLabeledTweet] = goldTestSet.value.toList.flatMap((s) => TweetFeatureReader(s))
+      if (trainSet.isEmpty) {
+        parser.usage("no training instances specified")
       }
-      
-      report(trainFileName.toString+"->"+testFileName.toString ,result)
+      if (testSet.isEmpty) {
+        parser.usage("no testing instances specified")
+      }
+      logger.debug("starting run")
+      val result = doExperiment(testSet, trainSet)
+      logger.debug("ending run")
+
+      report(goldTrainSet.value.toString + "->" + goldTestSet.value.toString, result)
       logger.debug("running cleanup code")
       System.exit(after())
     }
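
With -G/--train and -g/--test now declared as Argot multiOptions, either flag can be repeated to concatenate several gold-labeled files. A standalone sketch of that accumulation behavior (object and file names are hypothetical):

import org.clapper.argot.ArgotConverters._
import org.clapper.argot.{ArgotParser, MultiValueOption}

object MultiOptionDemo {
  val parser = new ArgotParser("demo")
  val train: MultiValueOption[String] =
    parser.multiOption[String](List("G", "train"), "FILE", "gold labeled training data")

  def main(args: Array[String]) {
    parser.parse(args) // e.g. -G fold1.train.updown -G fold2.train.updown
    println(train.value.mkString(", ")) // every occurrence, in order
  }
}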

src/main/scala/updown/preproc/GenericPreprocessor.scala

       ("twokenizeSkipGtOneGrams" -> TokenizationPipes.twokenizeSkipGtOneGrams),
       ("filterAlpha") -> TokenizationPipes.filterOnRegex("\\p{Alpha}+"),
       ("filterAlphaQuote") -> TokenizationPipes.filterOnRegex("(\\p{Alpha}|')+"),
-      ("splitSpace" -> TokenizationPipes.splitOnDelimiter(" "))
+      ("splitSpace" -> TokenizationPipes.splitOnDelimiter(" ")),
+      ("removeStopwords") -> {
+        (s: List[String]) =>
+          throw new Error("Not implemented. This should have been replaced in the main method.")
+      }
     )
   val defaultPipeline = "twokenize|removeStopwords"
   val parser = new ArgotParser("updown run updown.preproc.PreprocStanfordTweets", preUsage = Some("Updown"))
   val targetFile = parser.option[String](List("t", "target"), "target", "target file")
   val featureFile = parser.option[String](List("f", "feature"), "feature", "feature file")
 
+  val vocabSize = parser.option[Int]("vocabSize", "SIZE", "The number of words to allow in the vocabulary.")
 
   def getInstanceIterator(file: File): Iterator[(String, String, Either[SentimentLabel.Type, Map[String, SentimentLabel.Type]], String)]
 
     writer.write("%s|%s\n".format(id, target))
   }
 
+  def getVocabulary(size: Int, pipeline: List[(List[String]) => List[String]], inputs: Iterator[(String, String, Either[SentimentLabel.Type, Map[String, SentimentLabel.Type]], String)]): Set[String] = {
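+    // One full pass over the input: run each text through the same pipeline
+    // and escaping as the main loop, count tokens, and keep the top size.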
+    val counts = scala.collection.mutable.Map[String, Int]().withDefaultValue(0)
+    for ((_, _, _, text) <- inputs) {
+      val outputText = runThroughPipeLine(text, pipeline).map((s) => s.replaceAll(",", "-COMMA-").replaceAll("\\|", "-PIPE-"))
+      for (token <- outputText) {
+        counts(token) = counts(token) + 1
+      }
+    }
+    counts.toList.sortBy(_._2).reverse.map {
+      case (s, c) => s
+    }.take(size).toSet
+  }
+
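
The new --vocabSize path makes two passes over the input: getVocabulary above builds the top-N token set on the first pass, and the main loop below drops out-of-vocabulary tokens on the second. A self-contained sketch of the same top-N truncation on toy data (not this commit's code):

object VocabSketch {
  // Keep the n most frequent token types, mirroring getVocabulary above.
  def topN(n: Int, docs: Iterator[List[String]]): Set[String] = {
    val counts = scala.collection.mutable.Map[String, Int]().withDefaultValue(0)
    for (doc <- docs; token <- doc)
      counts(token) += 1
    counts.toList.sortBy(_._2).reverse.map(_._1).take(n).toSet
  }

  def main(args: Array[String]) {
    val docs = List(List("the", "cat", "sat"), List("the", "dog"), List("the"))
    println(topN(2, docs.iterator)) // ties break arbitrarily, e.g. Set(the, cat)
  }
}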
   def before() {}
 
   def main(args: Array[String]) {
       before()
       // SET UP IO
 
+
       val inputLines = getInputIterator(inputFiles.value)
       val targetWriter = new OutputStreamWriter(
         targetFile.value match {
       var numSkipped = 0
       var numClasses = scala.collection.mutable.Map[SentimentLabel.Type, Int]().withDefaultValue(0)
       var numLabels = 0
+      var numTokens = 0
       // RUN
+      val vocab = vocabSize.value match {
+        case Some(i: Int) => getVocabulary(i, pipeline, getInputIterator(inputFiles.value))
+        case _ => Set[String]()
+      }
       for ((id, reviewer, polarityChoice, text) <- inputLines) {
         val outputID = if (id == "") (idNumStart + numLines).toString else id
-        val outputText = runThroughPipeLine(text, pipeline).map((s) => s.replaceAll(",", "-COMMA-").replaceAll("\\|", "-PIPE-")).mkString(",")
+        val outputTextList = runThroughPipeLine(text, pipeline)
+          .map((s) => s.replaceAll(",", "-COMMA-").replaceAll("\\|", "-PIPE-"))
+          .filter(s=>vocab.size==0 || vocab.contains(s))
+        numTokens += outputTextList.length
+        val outputText = outputTextList.mkString(",")
         polarityChoice match {
           case Left(polarity) =>
             // no targets
       targetWriter.flush()
       logger.info("Stats:\n" +
         "Preprocessed " + numLines + " tweets. " +
+        "Used " + numTokens + " tokens. " +
         "Assigned %d labels.\n".format(numLabels) +
         (for ((label, count) <- numClasses if label != SentimentLabel.Abstained) yield
           "%20s: %10d instances (%2.2f%%)"

src/main/scala/updown/preproc/impl/PreprocMDSDReviews.scala

   }
 
   def getInstanceIterator(file:File): Iterator[(String, String, Either[SentimentLabel.Type, Map[String, SentimentLabel.Type]], String)] = {
+    println(file.getAbsolutePath) // trace which input file is being read
     (for ((line, index) <- scala.io.Source.fromFile(file, "UTF-8").getLines().zipWithIndex) yield {
       val (tokens, label) = getTokensFromLine(line)
       val purgedTokens = tokens.filter((s) => true)
       ("%s#%d".format(file.getName, index), "unk", Left(label), purgedTokens.mkString(" "))
     })
   }
-}
+}