Mike Speriosu / updown

Commits

vvcephei committed 347767d

implementing plda

  • Parent commit: bbaae60
  • Branch: default


Files changed (21)

File .idea/libraries/slf4j_jdk14_1_6_4.xml

+<component name="libraryTable">
+  <library name="slf4j-jdk14-1.6.4">
+    <CLASSES>
+      <root url="jar://$PROJECT_DIR$/lib/slf4j-jdk14-1.6.4.jar!/" />
+    </CLASSES>
+    <JAVADOC />
+    <SOURCES>
+      <root url="jar://$PROJECT_DIR$/lib/slf4j-jdk14-1.6.4-sources.jar!/" />
+    </SOURCES>
+  </library>
+</component>

File .idea/libraries/tmt_0_4_0.xml

+<component name="libraryTable">
+  <library name="tmt-0.4.0">
+    <CLASSES>
+      <root url="jar://$PROJECT_DIR$/lib/tmt-0.4.0.jar!/" />
+    </CLASSES>
+    <JAVADOC />
+    <SOURCES>
+      <root url="file://$PROJECT_DIR$/../tmt-0.4.0-src/src" />
+    </SOURCES>
+  </library>
+</component>

File bin/parse_updoutput.py

 #!/usr/bin/python
+import time
 import re, sys
 
 DEBUG=False
 NaN = float('nan')
 statRE=re.compile(r"^STAT: (\S+) (\S+) (.*)")
-(fold, prep, exp, n, acc, fpos, fneg) = (None,)*7
 skip=True
 folds = set()
 tables = {}
+(fold, prep, exp, n, acc, fpos, fneg) = (None,)*7
+
+def reset():
+  # rebind the module-level state; without the global declaration these
+  # assignments would only create locals and the reset would be a no-op
+  global fold, prep, exp, n, acc, fpos, fneg
+  (fold, prep, exp, n, acc, fpos, fneg) = (None,)*7
+
 
 def debug(s):
   if DEBUG:
     print('>'+str(s))
 
-for line in sys.stdin.readlines():
+lastline = ""
+instr = open(sys.stdin.fileno(),'r',encoding='utf8')
+for line in instr.readlines():
+  #print(lastline)
+  #print((fold,prep,exp,n,acc,fpos,fneg))
+  #print()
+  #time.sleep(1)
   line = line.rstrip()
+  lastline = line
   m=statRE.match(line)
   if (m):
+    #print("setting experiment")
+    (fold, prep, exp, n, acc, fpos, fneg) = (None,)*7
     (fold, prep, exp) = m.groups()
     folds.add(fold)
     if not prep in tables:
         tables[prep][exp][i][fold] = NaN
     skip=False
     continue
-  if re.match(r"^Per",line):
+  if re.match(r"^Per-",line):
+    #print("setting skip")
     skip=True
     continue
-  if (skip): continue
+  if (skip): 
+    #print("skipping")
+    continue
   m= re.match(r"^\s+N\s+(\d+)",line)
   if (m):
+    #print("setting n")
     (n,)=m.groups()
     tables[prep][exp]['n'][fold] = float(n)
     debug(tables[prep][exp]['n'])
     continue
   m= re.match(r"^\s+Accuracy\s+(\d+.\d+)",line)
   if (m):
+    #print("setting acc")
     (acc,)=m.groups()
     tables[prep][exp]['acc'][fold] = float(acc)
     debug(tables[prep][exp]['acc'])
     continue
   m= re.match(r"^\s+positive .* (\d+\.\d+)$",line)
   if (m):
+    #print("setting fpos")
     (fpos,)=m.groups()
     tables[prep][exp]['fpos'][fold] = float(fpos)
     debug(tables[prep][exp]['fpos'])
     continue
   m= re.match(r"^\s+negative .* (\d+\.\d+)$",line)
   if (m):
+    #print("setting fneg")
     (fneg,)=m.groups()
     tables[prep][exp]['fneg'][fold] = float(fneg)
     debug(tables[prep][exp]['fneg'])
     continue
-  #m= re.match(r"^Exception",line)
-  #if (m):
-    #tables[prep][exp]['n'] = NaN
-    #tables[prep][exp]['acc'] = NaN
-    #tables[prep][exp]['fpos'] = NaN
-    #tables[prep][exp]['fneg'] = NaN
-    #debug("exception")
-    #continue
+  #print("ignored")
+
+#sys.exit(1)
 
 prep_set = set(tables.keys())
 exp_set = set()
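
For reference, the STAT records this parser keys on are emitted by bin/run-dp.sh (next file). The block below is an illustrative input, with made-up numbers, shaped to match the regexes above:

    STAT: fold0 bigram lda 25
         N          1492
         Accuracy   0.7253
         positive f-score     0.7401
         negative f-score     0.7102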

File bin/run-dp.sh

         for k in 25 50 75 100; do
           alpha=$( echo - | awk "{ print 50/$k }" )
           echo "STAT: $fold $pipe lda $k"
-          updown 3 run updown.app.experiment.topic.lda.SplitLDAMaxentExperiment --numTopics $k --alpha $alpha --beta 0.01 --iterations 1000 --name Dp_"$fold"_"$pipe"Lda$k -G $out/dp.$pipe.$fold.train.updown -g $out/dp.$pipe.$fold.test.updown
+          java -Xmx3g -jar ~/repos/updown/target/updown-0.1.0-one-jar.jar experiment split lda-maxent --numTopics $k --alpha $alpha --beta 0.01 --iterations 100 --name Dp_"$fold"_"$pipe"Lda$k -G $out/dp.$pipe.$fold.train.updown -g $out/dp.$pipe.$fold.test.updown
         done
-        echo "STAT: $fold $pipe maxent"
-        updown run updown.app.experiment.maxent.SplitMaxentExperiment --name Dp_"$fold"_"$pipe"Maxent -G $out/dp.$pipe.$fold.train.updown -g $out/dp.$pipe.$fold.test.updown
+        sigma=0.0
+        echo "STAT: $fold $pipe maxent-sig$sigma"
+        java -jar ~/repos/updown/target/updown-0.1.0-one-jar.jar experiment split maxent --iterations 100 --sigma $sigma -G $out/dp.$pipe.$fold.train.updown -g $out/dp.$pipe.$fold.test.updown
       done
     done
     ;;

File bin/updown

 
      $SCALA_COMMAND updown.app.JuntoClassifier $*
 
+elif [ $CMD = 'classpath' ]; then
+
+     echo $CP
+
 else  
 
     CLASS=

File build.sbt

 name := "Updown"
 
-version := "0.1.0"
+version := "0.1.2"
 
 organization := "OpenNLP"
 
 
 // append -deprecation to the options passed to the Scala compiler
 scalacOptions ++= Seq("-deprecation", "-Xlint")
+
+seq(com.github.retronym.SbtOneJar.oneJarSettings: _*)
+
+libraryDependencies += "commons-lang" % "commons-lang" % "2.6"
+
+mainClass in oneJar := Some("updown.Run")

File src/main/scala/updown/Run.scala

+package updown
+
+import util.Commandable
+
+object Run extends Commandable{
+  val usageString = "Usage:\n" +
+    "Run COMMAND args...\n" +
+    "\tWhere COMMAND is one of:\n" +
+    "\t- preprocess\n" +
+    "\t- experiment"
+
+  def main(args: Array[String]) {
+    if (args.length < 1) {
+      usage()
+    }
+    val command = args(0)
+    val rest = args.slice(1,args.length)
+    command match {
+      case "preprocess" => updown.preproc.Run(rest.toArray)
+      case "experiment" => updown.app.experiment.Run(rest.toArray)
+      case _ => unrecognized(command)
+    }
+  }
+}
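
With build.sbt setting mainClass in oneJar to updown.Run, the dispatch above is exercised from the shell roughly as follows (a sketch: the jar name assumes the version bumped in build.sbt, and the option values mirror bin/run-dp.sh):

    # hypothetical invocation: updown.Run hands "experiment" to updown.app.experiment.Run,
    # which hands "split maxent" to SplitMaxentExperiment
    java -jar target/updown-0.1.2-one-jar.jar experiment split maxent \
        --iterations 100 --sigma 0.0 -G train.updown -g test.updown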

File src/main/scala/updown/app/TrainMaxentModel.scala

 
   val DEFAULT_ITERATIONS = 10
   val DEFAULT_CUTOFF = 5
+  val DEFAULT_GAUSSIAN = 1.0
 
 
   def apply(fileName: String, iterations: Int, cutoff: Int): AbstractModel = {
-    GIS.PRINT_MESSAGES = false
     GIS.trainModel(MaxentEventStreamFactory(fileName), iterations, cutoff)
   }
 
     val dataStream = new PlainTextByLineDataStream(reader)
     val eventStream = new BasicEventStream(dataStream, ",")
 
-    GIS.PRINT_MESSAGES = false
     GIS.trainModel(eventStream, iterations, cutoff)
   }
 
   def trainWithStringIterator(iterator: Iterator[String], iterations: Int, cutoff: Int): AbstractModel = {
-    GIS.PRINT_MESSAGES = false
-
     GIS.trainModel(MaxentEventStreamFactory.getWithStringIterator(iterator), iterations, cutoff)
   }
 
-  //  def apply[String](iterator: Iterator[String]): AbstractModel = apply(iterator, DEFAULT_ITERATIONS, DEFAULT_CUTOFF)
+  def trainWithGoldLabeledTweetIterator(iterator: Iterator[GoldLabeledTweet],
+                                        iterations: Int,
+                                        cutoff: Int,
+                                        gaussianSigma: Double): AbstractModel = {
+    System.out.println("training with sigma: "+gaussianSigma.toString)
+    GIS.trainModel(MaxentEventStreamFactory.getWithGoldLabeledTweetIterator(iterator),
+      iterations,
+      cutoff,
+      gaussianSigma)
+  }
 
   def trainWithGoldLabeledTweetIterator(iterator: Iterator[GoldLabeledTweet], iterations: Int, cutoff: Int): AbstractModel = {
-    GIS.PRINT_MESSAGES = false
     GIS.trainModel(MaxentEventStreamFactory.getWithGoldLabeledTweetIterator(iterator), iterations, cutoff)
   }
 
-  def trainWithGoldLabeledTweetIterator(iterator: Iterator[GoldLabeledTweet]): AbstractModel = trainWithGoldLabeledTweetIterator(iterator, DEFAULT_ITERATIONS, DEFAULT_CUTOFF)
+  def trainWithGoldLabeledTweetIterator(iterator: Iterator[GoldLabeledTweet]): AbstractModel =
+    trainWithGoldLabeledTweetIterator(iterator, DEFAULT_ITERATIONS, DEFAULT_CUTOFF)
 
   def main(args: Array[String]) {
     val parser = new ArgotParser("updown run updown.app.TrainMaxentModel", preUsage = Some("Updown"))

File src/main/scala/updown/app/experiment/NFoldExperiment.scala

     }).iterator
   }
 
-  def main(args: Array[String]) {
+  def apply(args: Array[String]) {
     try {
       parser.parse(args)
       val nFolds: Int = n.value.getOrElse(10)

File src/main/scala/updown/app/experiment/Run.scala

+package updown.app.experiment
+
+import updown.util.Commandable
+
+object Run extends Commandable{
+  val usageString = "Usage:\n" +
+    "experiment TYPE EXPERIMENT args...\n" +
+    "\tWhere TYPE is one of:\n" +
+    "\t- static\n" +
+    "\t- split\n" +
+    "\t- nfold\n\n" +
+    "\tWhere EXPERIMENT is one of:\n" +
+    "\t- junto\n" +
+    "\t- lexical\n" +
+    "\t- maxent\n" +
+    "\t- nbayes\n" +
+    "\t- lda-maxent\n\n" +
+    "\t- plda-maxent\n\n" +
+    "\t Note that some combinations may not be implemented."
+
+  def apply(args: Array[String]) {
+    if (args.length < 2) {
+      usage()
+    }
+    val exptype = args(0)
+    val command = args(1)
+    val rest = args.slice(2,args.length)
+    command match {
+      case "junto" =>
+        exptype match {
+          case "split" => updown.app.experiment.labelprop.SplitJuntoExperiment(rest)
+          case "static" => updown.app.experiment.labelprop.StaticJuntoExperiment(rest)
+          case "nfold" => notImplemented(command+" "+exptype)
+          case _ => unrecognized(command)
+        }
+      case "lexical" =>
+        exptype match {
+          case "split" => notImplemented(command+" "+exptype)
+          case "static" => updown.app.experiment.lexical.LexicalRatioExperiment(rest)
+          case "nfold" => notImplemented(command+" "+exptype)
+          case _ => unrecognized(command)
+        }
+      case "maxent" =>
+        exptype match {
+          case "split" => updown.app.experiment.maxent.SplitMaxentExperiment(rest)
+          case "static" => updown.app.experiment.maxent.StaticMaxentExperiment(rest)
+          case "nfold" => updown.app.experiment.maxent.NFoldMaxentExperiment(rest)
+          case _ => unrecognized(command)
+        }
+      case "nbayes" =>
+        exptype match {
+          case "split" => notImplemented(command+" "+exptype)
+          case "static" => notImplemented(command+" "+exptype)
+          case "nfold" => updown.app.experiment.nbayes.NFoldNBayesExperiment(rest)
+          case _ => unrecognized(command)
+        }
+      case "lda-maxent" =>
+        exptype match {
+          case "split" => updown.app.experiment.topic.lda.SplitLDAMaxentExperiment(rest)
+          case "static" => notImplemented(command+" "+exptype)
+          case "nfold" => updown.app.experiment.topic.lda.NFoldDiscriminantLDAExperiment(rest)
+          case _ => unrecognized(command)
+        }
+      case "plda-maxent" =>
+        exptype match {
+          case "split" => updown.app.experiment.topic.plda.SplitPLADMaxentExperiment(rest)
+          case "static" => notImplemented(command+" "+exptype)
+          case "nfold" => notImplemented(command+" "+exptype)
+          case _ => unrecognized(command)
+        }
+      case _ =>
+        unrecognized(command)
+    }
+  }
+}

File src/main/scala/updown/app/experiment/SplitExperiment.scala

 
   def after(): Int
 
-  def main(args: Array[String]) {
+  def main(args: Array[String]) = apply(args)
+  
+  def apply(args: Array[String]) {
     try {
       parser.parse(args)
       val trainSet: List[GoldLabeledTweet] = goldTrainSet.value.toList.flatMap((s)=>TweetFeatureReader(s))
-      val testSet: List[GoldLabeledTweet] = goldTrainSet.value.toList.flatMap((s)=>TweetFeatureReader(s))
+      val testSet: List[GoldLabeledTweet] = goldTestSet.value.toList.flatMap((s)=>TweetFeatureReader(s))
       if (trainSet.length == 0) {
         parser.usage("no training instances specified")
       }

File src/main/scala/updown/app/experiment/StaticExperiment.scala

 
   def after(): Int
 
-  def main(args: Array[String]) {
+  def apply(args: Array[String]) {
     try {
       parser.parse(args)
 

File src/main/scala/updown/app/experiment/maxent/MaxentModel.scala

+package updown.app.experiment.maxent
+
+import org.clapper.argot.ArgotParser
+import org.clapper.argot.ArgotConverters.{convertDouble,convertInt}
+
+trait MaxentModel {
+  val parser: ArgotParser
+  val sigmaOption = parser.option[Double]("sigma","DOUBLE",
+    "the value for gaussian smoothing in training the maxent model")
+  val iterationsOption = parser.option[Int]("iterations","INT",
+    "the number of iterations in training the maxent model")
+}

File src/main/scala/updown/app/experiment/maxent/SplitMaxentExperiment.scala

 import updown.app.TrainMaxentModel
 import updown.app.experiment.{SplitExperiment, NFoldExperiment}
 
-object SplitMaxentExperiment extends SplitExperiment {
+object SplitMaxentExperiment extends SplitExperiment with MaxentModel {
   def doExperiment(testSet: List[GoldLabeledTweet], trainSet: List[GoldLabeledTweet]) = {
     logger.info("performing Maxent experiment")
     logger.debug("training model")
-    val model = TrainMaxentModel.trainWithGoldLabeledTweetIterator(trainSet.iterator)
+    val sigma = sigmaOption.value match {
+      case Some(sigma: Double) => sigma
+      case _ => 0.0
+    }
+    val iterations = iterationsOption.value match {
+      case Some(iterations: Int) => iterations
+      case _ => TrainMaxentModel.DEFAULT_ITERATIONS
+    }
+    val model = TrainMaxentModel.trainWithGoldLabeledTweetIterator(
+      trainSet.iterator,
+      iterations,
+      TrainMaxentModel.DEFAULT_CUTOFF,
+      sigma)
 
-    logger.debug("testing model")
+    print("testing model")
+    var n=0
     val res = for (tweet <- testSet) yield {
+      n+=1
       tweet match {
         case GoldLabeledTweet(id, userid, features, goldLabel) =>
           SystemLabeledTweet(id, userid, features, goldLabel,
             SentimentLabel.figureItOut(model.getBestOutcome(model.eval(features.toArray))))
       }
     }
+    print(n)
     res
   }
-  def after():Int=0
+
+  def after(): Int = 0
 }

File src/main/scala/updown/app/experiment/topic/Constants.scala

+package updown.app.experiment.topic
+
+object Constants {
+  final val IGNORE_INSTANCE = "IGNORE_INSTANCE"
+
+}

File src/main/scala/updown/app/experiment/topic/lda/SplitLDAExperiment.scala

 import updown.util.{WordleUtils, LDATopicModel, TopicModel}
 import updown.app.experiment.SplitExperiment
 import java.util.Arrays
+import updown.data.io.TweetFeatureReader._
+import updown.data.io.TweetFeatureReader
 
 abstract class SplitLDAExperiment extends SplitExperiment {
   var iterations = 1000
   val alphaOption = parser.option[Double](List("alpha"), "INT", "the symmetric alpha hyperparameter for LDA")
   val betaOption = parser.option[Double](List("beta"), "DOUBLE", "the symmetric beta hyperparameter for LDA")
   val numTopicsOption = parser.option[Int](List("numTopics"), "INT", "the number of topics for LDA")
+  val extraTopicTrainingSetOption = parser.multiOption[String]("T","FILE",
+    "extra inputs to be used to train the topic model, not the classifier.")
 
   val outputOption = parser.option[String](List("o", "output"), "DIR", "the directory to dump topics into")
   val wordleOption = parser.flag[Boolean](List("w", "wordle"), "generate wordles for the topics (requires -o DIR) " +
 
 
     logger.debug("alphaSum: " + alphaSum)
-    val model: TopicModel = new LDATopicModel(trainSet, numTopics, iterations, alphaSum, beta)
+    val extraTopicTrainSet: List[GoldLabeledTweet] =
+      extraTopicTrainingSetOption.value.toList.flatMap((s)=>TweetFeatureReader(s)).map{
+        case GoldLabeledTweet(id,uid,feat,label) => GoldLabeledTweet(updown.app.experiment.topic.Constants.IGNORE_INSTANCE,uid,feat,label)
+      }
+
+    val model: TopicModel = new LDATopicModel(trainSet++extraTopicTrainSet, numTopics, iterations, alphaSum, beta)
     logger.debug("topic distribution:\n     :" + Arrays.toString(model.getTopicPriors))
     logger.debug({
       val labelToTopicDist = model.getLabelsToTopicDist

File src/main/scala/updown/app/experiment/topic/plda/SplitPLDAMaxentExperiment.scala

+package updown.app.experiment.topic.plda
+
+import updown.app.experiment.SplitExperiment
+import updown.app.experiment.topic.util.MaxentDiscriminant
+import scalanlp.io._
+import java.io.{OutputStreamWriter, FileOutputStream, File}
+import scalanlp.pipes._
+import scalanlp.stage._
+import scalanlp.stage.text._
+import scalanlp.text.tokenize._
+import edu.stanford.nlp.tmt.stage._
+import edu.stanford.nlp.tmt.model.DirichletParams._
+import edu.stanford.nlp.tmt.model.llda.LabeledLDADataset
+import edu.stanford.nlp.tmt.model.plda.{SharedKTopicsPerLabel, PLDAModelParams}
+import scala.Predef._
+import updown.data.SystemLabeledTweet._
+import updown.data.{SystemLabeledTweet, SentimentLabel, GoldLabeledTweet}
+import org.clapper.argot.ArgotParser
+import org.clapper.argot.ArgotConverters._
+
+trait CLOptions {
+  val parser: ArgotParser
+  val tmpDirOption = parser.option[File](List("tmp", "tmpDir"), "DIR", "The location to store temp files in") {
+    (s, opt) =>
+      val file = new File(s)
+      if (file.isDirectory) {
+        file
+      } else if (!file.exists) {
+        parser.usage("tmp dir does not exist")
+      } else {
+        parser.usage("Invaid tmp dir")
+      }
+  }
+  val numIterationOption = parser.option[Int](List("iterations"), "INT", "the number of iterations for training the topic model")
+  //  val alphaOption = parser.option[Double](List("alpha"), "INT", "the symmetric alpha hyperparameter for LDA")
+  //  val betaOption = parser.option[Double](List("beta"), "DOUBLE", "the symmetric beta hyperparameter for LDA")
+  val numTopicsPerLabelOption = parser.option[Int](List("numLabelTopics"), "INT", "the number of topics per label for PLDA")
+  val numTopicsInBackgroundOption = parser.option[Int](List("numBgroundTopics"), "INT", "the number of background topics for PLDA")
+  val extraTopicTrainingSetOption = parser.multiOption[String]("T", "FILE",
+    "extra inputs to be used to train the topic model, not the classifier.")
+
+  def getNumBackgroundTopics = numTopicsInBackgroundOption.value match {
+    case Some(value: Int) => value
+    case _ => 1
+  }
+
+  def getNumTopicsPerLabel = numTopicsPerLabelOption.value match {
+    case Some(value: Int) => SharedKTopicsPerLabel(value)
+    // or could specify the number of topics per label based on the values
+    // in a two-column CSV file containing label name and number of topics:
+    // CustomKTopicsPerLabel(
+    //   CSVFile("topics-per-label.csv").read[Iterator[(String,Int)]].toMap)
+    case _ => SharedKTopicsPerLabel(4)
+  }
+
+  def getNumIterations = numIterationOption.value match {
+    case Some(value: Int) => value
+    case _ => 1000
+  }
+
+  def getTmpDir = tmpDirOption.value match {
+    case Some(file: File) => file
+    case _ =>
+      val file = new File("tmp")
+      if (!file.exists()) {
+        file.mkdir()
+      } else if (file.exists() && !file.isDirectory) {
+        System.err.println("Could not create tmp dir in working dir. Try specifying the --tmp option.")
+        System.exit(1)
+      }
+      file
+  }
+}
+
+object SplitPLDAMaxentExperiment extends SplitExperiment with MaxentDiscriminant with CLOptions {
+  val rand = new java.util.Random()
+
+  def createCSVIntermediate(set: List[GoldLabeledTweet], name: String): File = {
+    val tmpFile = new File(getTmpDir.getAbsolutePath + java.io.File.separator + "splitPLDAMeExp_" + name + "_" + rand.nextLong())
+    val os = new FileOutputStream(tmpFile)
+    val out = new OutputStreamWriter(os, "utf8")
+    for (GoldLabeledTweet(id, userid, features, goldLabel) <- set) {
+      out.write("\"%s\",\"%s\",\"%s\",\"%s\"\n".format(id.toString, userid.toString, features.mkString(" "), goldLabel.toString))
+    }
+    out.flush()
+    out.close()
+    tmpFile
+  }
+
+  def getLLDADataset(infile: File): LabeledLDADataset[(String, scala.Iterable[String], scala.Iterable[String])] = {
+    val trainSource = CSVFile(infile) ~> IDColumn(1)
+
+    val tokenizer = {
+      SimpleEnglishTokenizer() ~> // tokenize on space and punctuation
+        MinimumLengthFilter(3) // take terms with >=3 characters
+    }
+
+    val text = {
+      trainSource ~> // read from the source file
+        Column(3) ~> // select column containing text
+        TokenizeWith(tokenizer) ~> // tokenize with tokenizer above
+        TermCounter() ~> // collect counts (needed below)
+        TermMinimumDocumentCountFilter(4) ~> // filter terms in <4 docs
+        TermDynamicStopListFilter(30) ~> // filter out 30 most common terms
+        DocumentMinimumLengthFilter(5) // take only docs with >=5 terms
+    }
+
+    // define fields from the dataset we are going to slice against
+    val labels = {
+      trainSource ~> // read from the source file
+        Column(4) ~> // select the column containing the label
+        TokenizeWith(WhitespaceTokenizer()) ~> // turns label field into an array
+        TermCounter() ~> // collect label counts
+        TermMinimumDocumentCountFilter(10) // filter labels in < 10 docs
+    }
+
+    val dataset = LabeledLDADataset(text, labels);
+    dataset
+  }
+
+  def doExperiment(testSet: List[GoldLabeledTweet], trainSet: List[GoldLabeledTweet]) = {
+    val trainFile = createCSVIntermediate(trainSet, "train")
+    val trainDataset = getLLDADataset(trainFile)
+
+    // define the model parameters
+    val modelParams = PLDAModelParams(trainDataset,
+      getNumBackgroundTopics, getNumTopicsPerLabel,
+      termSmoothing = 0.01, topicSmoothing = 0.01);
+
+    // Name of the output model folder to generate
+    val modelPath = new File(getTmpDir.getAbsolutePath+File.separator+"plda-updown-" + trainDataset.signature + "-" + modelParams.signature);
+
+    // Trains the model, writing to the given output path
+    val model = TrainCVB0PLDA(modelParams, trainDataset, output = modelPath, maxIterations = getNumIterations);
+    val trainMap = trainSet.map(tweet => (tweet.id, tweet)).toMap
+    val trainDistributions = InferCVB0PLDADocumentTopicDistributions(model, trainDataset).toList
+    val labelsToTopicDists = scala.collection.mutable.Map[SentimentLabel.Type, List[Array[Double]]]().withDefaultValue(Nil)
+    for ((outId, dist) <- trainDistributions) yield {
+      val GoldLabeledTweet(inId, inUID, _, inLabel) = trainMap(outId)
+      assert(outId.equals(inId))
+      labelsToTopicDists(inLabel) = dist :: labelsToTopicDists(inLabel)
+    }
+    val discriminantFn = getDiscriminantFn(labelsToTopicDists.toMap)
+
+    val testFile = createCSVIntermediate(testSet, "test")
+    val testDataset = getLLDADataset(testFile)
+    val testMap = testSet.map(tweet => (tweet.id, tweet)).toMap
+    val testDistributions = InferCVB0PLDADocumentTopicDistributions(model, testDataset).toList
+
+    (for ((outId, topicDist) <- testDistributions) yield {
+      val GoldLabeledTweet(inId, inUID, inFeatures, inLabel) = testMap(outId)
+      assert(outId.equals(inId))
+      val (outLabel: String, outcomes: String) = discriminantFn(topicDist.map(d => d.toFloat))
+      logger.trace("labeling id:%s gold:%2s with label:%2s from outcomes:%s".format(
+        inId,
+        inLabel.toString,
+        outLabel.toString,
+        outcomes))
+      SystemLabeledTweet(inId, inUID, inFeatures, inLabel, SentimentLabel.figureItOut(outLabel))
+    }).toList
+  }
+
+  def after() = 0
+}
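
Given the "plda-maxent" case added in updown.app.experiment.Run, this experiment would be reached roughly like so (a sketch; the flag names come from CLOptions above and SplitExperiment's -G/-g, and the values are illustrative):

    # hypothetical invocation of the new PLDA-then-maxent split experiment
    java -jar target/updown-0.1.2-one-jar.jar experiment split plda-maxent \
        --iterations 1000 --numLabelTopics 4 --numBgroundTopics 1 \
        --tmp tmp -G train.updown -g test.updown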

File src/main/scala/updown/preproc/GenericPreprocessor.scala

 
   def before() {}
 
-  def main(args: Array[String]) {
+  def apply(args: Array[String]) {
     logger.debug(args.toList.toString)
     try {
       parser.parse(args)

File src/main/scala/updown/preproc/Run.scala

+package updown.preproc
+
+import updown.util.Commandable
+
+object Run extends Commandable {
+  val usageString = "Usage:\n" +
+    "preprocess DATASET args...\n" +
+    "\tWhere DATASET is one of:\n" +
+    "\t- dp\n" +
+    "\t- hcr\n" +
+    "\t- mdsd\n" +
+    "\t- polarity\n" +
+    "\t- shamma\n" +
+    "\t- stanford\n"
+
+  def apply(args: Array[String]) {
+    if (args.length < 1) {
+      usage()
+    }
+    val command = args(0)
+    val rest = args.slice(1,args.length)
+    command match {
+      case "dp" => updown.preproc.impl.PreprocDPArticles(rest)
+      case "hcr" => updown.preproc.impl.PreprocHCRTweets(rest)
+      case "mdsd" => updown.preproc.impl.PreprocMDSDReviews(rest)
+      case "polarity" => updown.preproc.impl.PreprocPangLeePolarityCorpus(rest)
+      case "shamma" => updown.preproc.impl.PreprocShammaTweets(rest)
+      case "stanford" => updown.preproc.impl.PreprocStanfordTweets(rest)
+      case _ =>
+        unrecognized(command)
+    }
+  }
+}
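
The preprocessing side follows the same pattern; a sketch (each Preproc* implementation defines its own arguments, which are not shown here):

    # hypothetical invocation: updown.Run hands "preprocess stanford" to PreprocStanfordTweets
    java -jar target/updown-0.1.2-one-jar.jar preprocess stanford args...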

File src/main/scala/updown/util/Commandable.scala

+package updown.util
+
+trait Commandable {
+  val usageString:String
+
+  def usage() {
+    System.err.println(usageString)
+    sys.exit(1)
+  }
+
+  def unrecognized(command: String) {
+    System.err.println("unrecognized command: %s".format(command))
+    usage()
+  }
+
+  def notImplemented(command: String) {
+    System.err.println("%s is not implemented".format(command))
+    usage()
+  }
+}

File src/main/scala/updown/util/LDATopicModel.scala

   model.addInstances(instanceList)
   model.setNumThreads(numTopics max MAX_THREADS)
   model.setNumIterations(numIterations)
-//  ParallelTopicModel.logger.setLevel(Level.OFF)
+  //  ParallelTopicModel.logger.setLevel(Level.OFF)
   model.estimate()
 
   def getTopics: List[Topic] = {
     val priors: Array[Double] = getTopicPriors
-    val topicsToAlphaIds = scala.collection.mutable.Map[Int,List[(Int,Double)]]()
+    val topicsToAlphaIds = scala.collection.mutable.Map[Int, List[(Int, Double)]]()
 
     val wordsTopicsCounts = (for ((topicCounts, typeIndex) <- model.typeTopicCounts.zipWithIndex) yield {
       val word = alphabet.lookupObject(typeIndex).toString
       (for (topicCount <- topicCounts) yield {
         val topic = topicCount & model.topicMask
         val count = topicCount >> model.topicBits
-        (word,topic,count)
+        (word, topic, count)
       }).iterator
     }).iterator.flatten.toList
 
 
     val res = (for (i <- 0 until numTopics) yield {
-      val wordCounts = wordsTopicsCounts.filter((triple)=>(triple._2==i && triple._3!=0))
-      val sum = wordCounts.map((triple)=>triple._3).reduce(_ + _)
-      Topic(Map(("alpha"->priors(i))), wordCounts.map((triple)=>(triple._1->(triple._3.toDouble/sum))).toMap)
+      val wordCounts = wordsTopicsCounts.filter((triple) => (triple._2 == i && triple._3 != 0))
+      val sum = wordCounts.map((triple) => triple._3).reduce(_ + _)
+      Topic(Map(("alpha" -> priors(i))), wordCounts.map((triple) => (triple._1 -> (triple._3.toDouble / sum))).toMap)
     }).toList
 
     res
   }
 
   def getLabelsToTopicDists = {
-    val result = scala.collection.mutable.Map[SentimentLabel.Type,List[Array[Double]]]().withDefaultValue(Nil)
+    val result = scala.collection.mutable.Map[SentimentLabel.Type, List[Array[Double]]]().withDefaultValue(Nil)
     for (topicAssignment <- model.getData) {
-      val target = topicAssignment.instance.getTarget
-      val value = target.asInstanceOf[FeatureVector].getValues()(0)
-      val label = SentimentLabel.fromDouble(value)
-      result(label) = model.getTopicProbabilities(topicAssignment.topicSequence) :: result(label)
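+      // Instances injected only for topic training (e.g. via -T in SplitLDAExperiment)
+      // carry the IGNORE_INSTANCE id and are excluded from the per-label distributions.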
+      val name = topicAssignment.instance.getName()
+      if (name != updown.app.experiment.topic.Constants.IGNORE_INSTANCE) {
+        val target = topicAssignment.instance.getTarget
+        val value = target.asInstanceOf[FeatureVector].getValues()(0)
+        val label = SentimentLabel.fromDouble(value)
+        result(label) = model.getTopicProbabilities(topicAssignment.topicSequence) :: result(label)
+      }
     }
     result.toMap // immutize
   }
 
   def inferTopics(tweet: GoldLabeledTweet) = {
     val instance = tweet match {
-        case GoldLabeledTweet(id, userid, features, goldLabel) =>
-          val featureSequence = new FeatureSequence(alphabet, features.length)
-          for (feature <- features) {
-            featureSequence.add(feature)
-          }
-          new Instance(featureSequence, goldLabel, id, null)
-      }
+      case GoldLabeledTweet(id, userid, features, goldLabel) =>
+        val featureSequence = new FeatureSequence(alphabet, features.length)
+        for (feature <- features) {
+          featureSequence.add(feature)
+        }
+        new Instance(featureSequence, goldLabel, id, null)
+    }
     model.getInferencer.getSampledDistribution(instance, numIterations, 1, 1)
   }