Commits

vvcephei committed d213e84

working on topic experiments now

  • Parent commits a1113d7

Files changed (10)

src/main/scala/updown/app/NFoldExperiment.scala

 
 import updown.data.io.TweetFeatureReader
 import updown.data.{SentimentLabel, GoldLabeledTweet}
+import org.clapper.argot.ArgotParser._
+import org.clapper.argot.{ArgotUsageException, ArgotParser}
+import org.clapper.argot.ArgotConverters._
+import com.weiglewilczek.slf4s.Logging
+import updown.util.Statistics
 
-abstract class NFoldExperiment {
+abstract class NFoldExperiment extends Logging {
+  // this exists purely to make the ArgotConverters appear used to IDEA
+  convertByte _
+
+  def doExperiment(testSet: List[GoldLabeledTweet], trainSet: List[GoldLabeledTweet]):
+    (Double, List[(updown.data.SentimentLabel.Type, Double, Double, Double)])
+
   def generateTrials(inputFile: String, nFolds: Int): Iterator[(List[GoldLabeledTweet], List[GoldLabeledTweet])] = {
     val foldsToTweets = (for ((fold, list) <- TweetFeatureReader(inputFile).zipWithIndex.groupBy((pair) => {
       val (_, index) = pair;
     }).iterator
   }
 
-  def reportResults(resultTuple: (Double, scala.List[(SentimentLabel.Type, Double, Double, Double)])): String = {
-    val (accuracy, labelResultsList) = resultTuple
-    "Results:\n" +
-      "%12s%6.2f\n".format("Accuracy", accuracy) +
-      "%12s%11s%8s%9s\n".format("Label", "Precision", "Recall", "F-Score") +
-      (for ((label, precision, recall, fScore) <- labelResultsList) yield {
-        "%12s%11.2f%8.2f%9.2f".format(SentimentLabel.toEnglishName(label), precision, recall, fScore)
-      }).mkString("\n") + "\n"
+
+
+  def initializeAverageList(list: List[(updown.data.SentimentLabel.Type, Double, Double, Double)]): List[(updown.data.SentimentLabel.Type, Double, Double, Double)] = {
+    if (list.length == 0)
+      Nil
+    else {
+      val ((lLabel, _, _, _) :: ls) = list
+      (lLabel, 0.0, 0.0, 0.0) :: initializeAverageList(ls)
+    }
+  }
+
+  def addAll(list: List[(updown.data.SentimentLabel.Type, Double, Double, Double)], to: List[(updown.data.SentimentLabel.Type, Double, Double, Double)]): List[(updown.data.SentimentLabel.Type, Double, Double, Double)] = {
+    if (list.length == 0)
+      Nil
+    else {
+      val ((lLabel, lPrecision, lRecall, lFScore) :: ls) = list
+      val ((tLabel, tPrecision, tRecall, tFScore) :: ts) = to
+      assert(lLabel == tLabel)
+      (lLabel, lPrecision + tPrecision, lRecall + tRecall, lFScore + tFScore) :: addAll(ls, ts)
+    }
+  }
+
+  def divideBy(list: List[(updown.data.SentimentLabel.Type, Double, Double, Double)], by: Double): List[(updown.data.SentimentLabel.Type, Double, Double, Double)] = {
+    if (list.length == 0)
+      Nil
+    else {
+      val ((lLabel, lPrecision, lRecall, lFScore) :: ls) = list
+      (lLabel, lPrecision / by, lRecall / by, lFScore / by) :: divideBy(ls, by)
+    }
+  }
+
+
+  def averageResults(results: scala.List[(Double, scala.List[(SentimentLabel.Type, Double, Double, Double)])]): (Double, scala.List[(SentimentLabel.Type, Double, Double, Double)]) = {
+    var avgAccuracy = 0.0
+    var avgLabelResultsList = initializeAverageList(results(0)._2)
+    for ((accuracy, labelResults) <- results) {
+      avgAccuracy += accuracy
+      avgLabelResultsList = addAll(labelResults, avgLabelResultsList)
+    }
+    avgAccuracy /= results.length
+    avgLabelResultsList = divideBy(avgLabelResultsList, results.length)
+    (avgAccuracy, avgLabelResultsList)
+
+  }
+
+  def main(args: Array[String]) {
+    val parser = new ArgotParser(this.getClass.getName, preUsage = Some("Updown"))
+    val goldInputFile = parser.option[String](List("g", "gold"), "gold", "gold labeled input")
+    val n = parser.option[Int](List("n", "folds"), "FOLDS", "the number of folds for the experiment (default 10)")
+
+    try {
+      parser.parse(args)
+
+      val nFolds: Int = n.value.getOrElse(10)
+
+      if (goldInputFile.value == None) {
+        parser.usage("You must specify a gold labeled input file via -g.")
+      }
+
+      val inputFile = goldInputFile.value.get
+      val results =
+        (for ((testSet, trainSet) <- generateTrials(inputFile, nFolds)) yield {
+          doExperiment(testSet, trainSet)
+        }).toList
+
+      val averages = averageResults(results)
+      System.err.println("\n" + Statistics.reportResults(averages))
+    }
+    catch {
+      case e: ArgotUsageException => println(e.message); sys.exit(0)
+    }
   }
 }
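
The fold-averaging helpers are easiest to follow with concrete numbers. A quick sketch (values invented for illustration) of what averageResults produces for two folds, as it would be called from inside a concrete experiment:

  // Hypothetical per-fold results: (accuracy, per-label (precision, recall, F-score))
  val results = List(
    (0.80, List((SentimentLabel.Positive, 0.90, 0.70, 0.79))),
    (0.60, List((SentimentLabel.Positive, 0.50, 0.50, 0.50))))

  averageResults(results)
  // => (0.70, List((SentimentLabel.Positive, 0.70, 0.60, 0.645)))
  // Accuracy and each per-label triple are averaged element-wise across folds;
  // addAll asserts that every fold reports its labels in the same order.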

src/main/scala/updown/app/NFoldMaxentExperiment.scala

 package updown.app
 
-import org.clapper.argot.{ArgotUsageException, ArgotParser}
-import org.clapper.argot.ArgotConverters._
 import updown.data.{SystemLabeledTweet, GoldLabeledTweet, SentimentLabel}
+import updown.util.Statistics
 
 object NFoldMaxentExperiment extends NFoldExperiment {
-  // this exists purely to make the ArgotConverters appear used to IDEA
-  convertByte _
+  def doExperiment(testSet: List[GoldLabeledTweet], trainSet: List[GoldLabeledTweet]) = {
+    logger.info("performing Maxent experiment")
+    logger.debug("training model")
+    val model = TrainMaxentModel.trainWithGoldLabeledTweetIterator(trainSet.iterator)
 
-  def doExperiment(inputFile: String, nFolds: Int) = {
-    (for ((testSet, trainSet) <- generateTrials(inputFile, nFolds)) yield {
-      val model = TrainMaxentModel.trainWithGoldLabeledTweetIterator(trainSet.iterator)
-
-      PerTweetEvaluator.getEvalStats(for (tweet <- testSet) yield {
-        tweet match {
-          case GoldLabeledTweet(id, userid, features, goldLabel) =>
-            SystemLabeledTweet(id, userid, features, goldLabel,
-              SentimentLabel.figureItOut(model.getBestOutcome(model.eval(features.toArray))))
-        }
-      })
-    }).toList
-  }
-
-  def initializeAverageList(list: List[(updown.data.SentimentLabel.Type, Double, Double, Double)]): List[(updown.data.SentimentLabel.Type, Double, Double, Double)] = {
-    if (list.length == 0)
-      Nil
-    else {
-      val ((lLabel, _, _, _) :: ls) = list
-      (lLabel, 0.0, 0.0, 0.0) :: initializeAverageList(ls)
-    }
-  }
-
-  def addAll(list: List[(updown.data.SentimentLabel.Type, Double, Double, Double)], to: List[(updown.data.SentimentLabel.Type, Double, Double, Double)]): List[(updown.data.SentimentLabel.Type, Double, Double, Double)] = {
-    if (list.length == 0)
-      Nil
-    else {
-      val ((lLabel, lPrecision, lRecall, lFScore) :: ls) = list
-      val ((tLabel, tPrecision, tRecall, tFScore) :: ts) = to
-      assert(lLabel == tLabel)
-      (lLabel, lPrecision + tPrecision, lRecall + tRecall, lFScore + tFScore) :: addAll(ls, ts)
-    }
-  }
-
-  def divideBy(list: List[(updown.data.SentimentLabel.Type, Double, Double, Double)], by: Double): List[(updown.data.SentimentLabel.Type, Double, Double, Double)] = {
-    if (list.length == 0)
-      Nil
-    else {
-      val ((lLabel, lPrecision, lRecall, lFScore) :: ls) = list
-      (lLabel, lPrecision / by, lRecall / by, lFScore / by) :: divideBy(ls, by)
-    }
-  }
-
-
-  def averageResults(results: scala.List[(Double, scala.List[(SentimentLabel.Type, Double, Double, Double)])]): (Double, scala.List[(SentimentLabel.Type, Double, Double, Double)]) = {
-    var avgAccuracy = 0.0
-    var avgLabelResultsList = initializeAverageList(results(0)._2)
-    for ((accuracy, labelResults) <- results) {
-      avgAccuracy += accuracy
-      avgLabelResultsList = addAll(labelResults, avgLabelResultsList)
-    }
-    avgAccuracy /= results.length
-    avgLabelResultsList = divideBy(avgLabelResultsList, results.length)
-    //    println(results.mkString("\n"))
-    //    println("Averages:")
-    //    println("(Accuracy, List((Label, Precision, Recall, F-Score)")
-    //    println((avgAccuracy, avgLabelResultsList))
-    (avgAccuracy, avgLabelResultsList)
-
-  }
-
-  def main(args: Array[String]) {
-    val parser = new ArgotParser("updown run updown.app.PerTweetEvaluator", preUsage = Some("Updown"))
-    val goldInputFile = parser.option[String](List("g", "gold"), "gold", "gold labeled input")
-    val n = parser.option[Int](List("n", "folds"), "FOLDS", "the number of folds for the experiment (default 10)")
-
-    try {
-      parser.parse(args)
-    }
-    catch {
-      case e: ArgotUsageException => println(e.message); sys.exit(0)
-    }
-
-    val nFolds: Int = n.value.getOrElse(10)
-
-    if (goldInputFile.value == None) {
-      println("You must specify a gold labeled input file via -g.")
-      sys.exit(1)
-    }
-
-    val inputFile = goldInputFile.value.get
-    val results = doExperiment(inputFile, nFolds)
-    val averages = averageResults(results)
-    System.err.println("\n"+reportResults(averages))
+    logger.debug("testing model")
+    val res = Statistics.getEvalStats(for (tweet <- testSet) yield {
+      tweet match {
+        case GoldLabeledTweet(id, userid, features, goldLabel) =>
+          SystemLabeledTweet(id, userid, features, goldLabel,
+            SentimentLabel.figureItOut(model.getBestOutcome(model.eval(features.toArray))))
+      }
+    })
+    logger.info(Statistics.reportResults(res))
+    res
   }
 }
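
With this refactoring, a concrete experiment only has to supply doExperiment for one train/test split; argument parsing, fold generation, and result averaging now live in NFoldExperiment. A rough sketch of what a new experiment would look like (hypothetical always-Positive baseline, not part of this commit):

  package updown.app

  import updown.data.{SystemLabeledTweet, GoldLabeledTweet, SentimentLabel}
  import updown.util.Statistics

  object NFoldAlwaysPositiveExperiment extends NFoldExperiment {
    def doExperiment(testSet: List[GoldLabeledTweet], trainSet: List[GoldLabeledTweet]) = {
      logger.info("performing always-Positive baseline experiment")
      // ignore the training fold and call every test tweet Positive
      val res = Statistics.getEvalStats(for (tweet <- testSet) yield {
        val GoldLabeledTweet(id, userid, features, goldLabel) = tweet
        SystemLabeledTweet(id, userid, features, goldLabel, SentimentLabel.Positive)
      })
      logger.info(Statistics.reportResults(res))
      res
    }
  }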

src/main/scala/updown/app/NFoldTopicExperiment.scala

+package updown.app
+
+import updown.data.{SystemLabeledTweet, GoldLabeledTweet, SentimentLabel}
+import updown.util.{Statistics, LDATopicModel, TopicModel}
+
+object NFoldTopicExperiment extends NFoldExperiment {
+
+
+  def label(model: TopicModel, tweet: GoldLabeledTweet, goodTopic: Int, badTopic: Int): SystemLabeledTweet = {
+    val GoldLabeledTweet(id, userid, features, goldLabel) = tweet
+    val topicDistribution = model.inferTopics(tweet)
+    val sortedDist = topicDistribution.zipWithIndex.sortBy((i) => 1.0 - i._1).map((i) => i._2)
+
+    SystemLabeledTweet(id, userid, features, goldLabel,
+      if (goodTopic == -1 || badTopic == -1) {
+        assert(goodTopic == badTopic)
+        SentimentLabel.Abstained
+      }
+      else if (sortedDist(0) == goodTopic) SentimentLabel.Positive
+      else if (sortedDist(0) == badTopic) SentimentLabel.Negative
+      else if (sortedDist(1) == goodTopic) SentimentLabel.Positive
+      else if (sortedDist(1) == badTopic) SentimentLabel.Negative
+      else if (sortedDist(2) == goodTopic) SentimentLabel.Positive
+      else SentimentLabel.Negative
+    )
+  }
+
+  def evaluate(model: TopicModel, testSet: scala.List[GoldLabeledTweet]): (Double, scala.List[(SentimentLabel.Type, Double, Double, Double)]) = {
+    val labelToTopicDist = model.getTopicsPerTarget
+    val badDist = labelToTopicDist(SentimentLabel.Negative).zipWithIndex.sortBy((i) => 1.0 - i._1).map((i) => i._2)
+    val goodDist = labelToTopicDist(SentimentLabel.Positive).zipWithIndex.sortBy((i) => 1.0 - i._1).map((i) => i._2)
+    val goodTopic = goodDist(0)
+    val badTopic = if (goodTopic != badDist(0)) badDist(0) else badDist(1)
+
+    val res = Statistics.getEvalStats(for (tweet <- testSet) yield {
+      label(model, tweet, goodTopic, badTopic)
+    })
+    logger.debug(res.toString)
+    logger.info(Statistics.reportResults(res))
+    res
+  }
+
+  def doExperiment(testSet: List[GoldLabeledTweet], trainSet: List[GoldLabeledTweet]) = {
+    val model: TopicModel = new LDATopicModel(trainSet, 3, 1000, 100, 0.1)
+
+    logger.info("topic distribution:\n     :" + model.getTopicPriors)
+    logger.info({
+      val labelToTopicDist = model.getTopicsPerTarget
+      "topic distribution over labels:\n" + (for ((k, v) <- labelToTopicDist) yield "%5s:%s".format(k, v)).mkString("\n")
+    })
+    logger.info({
+      val topics = model.getTopics
+      "topic distributions\n" +
+        (for (i <- 0 until 3) yield "%5s: Topic(%s,%s)".format(i, topics(i).prior, topics(i).distribution.toList.sortBy((pair) => (1 - pair._2)))).mkString("\n")
+    })
+    evaluate(model, testSet)
+  }
+}
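
To make the good-topic/bad-topic heuristic concrete, a hypothetical illustration (numbers invented, numTopics = 3):

  // Per-label topic distributions estimated from the training fold:
  //   Positive -> List(0.1, 0.2, 0.7)   => goodTopic = 2 (Positive's top topic)
  //   Negative -> List(0.5, 0.3, 0.2)   => badTopic  = 0 (Negative's top topic, distinct from goodTopic)
  // A test tweet with inferred distribution List(0.2, 0.5, 0.3) ranks its topics 1, 2, 0:
  // the top topic is neither good nor bad, but the second-ranked topic is the good
  // one, so label() assigns SentimentLabel.Positive to it.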

src/main/scala/updown/app/PerTweetEvaluator.scala

   // this exists purely to make the ArgotConverters appear used to IDEA
   convertByte _
 
-  val accurracy: (Double, Double) => Double =
-    (correct, total) => correct / total
-  val precision: (Double, Double) => Double =
-    (numCorrectlyLabeled, totalNumLabeled) => numCorrectlyLabeled / totalNumLabeled
-  val recall: (Double, Double) => Double =
-    (numCorrectlyLabeled, numberThatShouldHaveBeenLabeled) => numCorrectlyLabeled / numberThatShouldHaveBeenLabeled
-  val fScore: (Double, Double) => Double =
-    (precision, recall) => 2.0 * precision * recall / (precision + recall)
-
-  def getEvalStats(tweets: scala.List[SystemLabeledTweet]): (Double, List[(SentimentLabel.Type, Double, Double, Double)]) = {
-    val (correct, total, _, _) = tabulate(tweets)
-
-    (accurracy(correct, total.toDouble),
-      (for (label <- SentimentLabel.values) yield {
-        val goldList = tweets.filter((tweet) => tweet.goldLabel == label)
-        val systemList = tweets.filter((tweet) => tweet.systemLabel == label)
-        val labelPrecision = precision(
-          systemList.filter((tweet) => tweet.goldLabel == label).length,
-          systemList.length)
-        val labelRecall = recall(
-          goldList.filter((tweet) => tweet.systemLabel == label).length,
-          goldList.length
-        )
-        (label, labelPrecision, labelRecall, fScore(labelPrecision, labelRecall))
-      }).toList)
-  }
-
   def tabulate(tweets: scala.List[SystemLabeledTweet]): (Double, Int, Int, String) = {
     var correct = 0.0
     var total = 0

src/main/scala/updown/app/model/MaxentEventStreamFactory.scala

   val DEFAULT_DELIMITER = ","
 
   def apply(fileName: String): EventStream = {
-    getWithStringIterator(scala.io.Source.fromFile(fileName).getLines)
+    getWithStringIterator(scala.io.Source.fromFile(fileName).getLines())
   }
 
   def getWithStringIterator(iterator: Iterator[String]): EventStream = {
     new BasicEventStream(new DataStream {
       def nextToken(): AnyRef = {
-        val GoldLabeledTweet(tweetid, userid, features, label) = TweetFeatureReader.parseLine(iterator.next())
+        val GoldLabeledTweet(_, _, features, label) = TweetFeatureReader.parseLine(iterator.next())
         (features ::: (label :: Nil)).mkString(DEFAULT_DELIMITER)
       }
 
   def getWithGoldLabeledTweetIterator(iterator: Iterator[GoldLabeledTweet]): EventStream = {
     new BasicEventStream(new DataStream {
       def nextToken(): AnyRef = {
-        val GoldLabeledTweet(tweetid, userid, features, label) = iterator.next()
+        val GoldLabeledTweet(_, _, features, label) = iterator.next()
         (features ::: (label :: Nil)).mkString(DEFAULT_DELIMITER)
       }
 

src/main/scala/updown/preproc/GenericPreprocessor.scala

   def main(args: Array[String]) {
     logger.debug(args.toList.toString)
     // don't forget that this is linked to the pipeStages dict below
-    val availablePipes = Set("addBiGrams", "twokenize", "twokenizeSkipGtOneGrams", "removeStopwords", "splitSpace")
+    val availablePipes = Set("addBiGrams", "twokenize", "twokenizeSkipGtOneGrams", "removeStopwords", "splitSpace", "filterAlpha")
 
     // PARSE ARGS
     val parser = new ArgotParser("updown run updown.preproc.PreprocStanfordTweets", preUsage = Some("Updown"))
           ("twokenize" -> TokenizationPipes.twokenize),
           ("twokenizeSkipGtOneGrams" -> TokenizationPipes.twokenizeSkipGtOneGrams),
           ("removeStopwords" -> TokenizationPipes.filterOnStopset(stopSet)),
+          ("filterAlpha") -> TokenizationPipes.filterOnRegex("\\p{Alpha}+"),
           ("splitSpace" -> TokenizationPipes.splitOnDelimiter(" "))
         )
       // had to predefine the available pipes so they could be printed in the usage string, before the stopset can be parsed.

src/main/scala/updown/util/LDATopicModel.scala

+package updown.util
+
+import cc.mallet.topics.ParallelTopicModel
+import cc.mallet.types._
+import scala.collection.JavaConversions._
+import updown.data.{SentimentLabel, GoldLabeledTweet}
+
+class LDATopicModel(tweets: List[GoldLabeledTweet], numTopics: Int, numIterations: Int, alphaSum: Double, beta: Double) extends TopicModel {
+  private final val MAX_THREADS = 20
+
+  private val (alphabet, instanceList) = getInstanceList(tweets)
+  private val model = new ParallelTopicModel(numTopics, alphaSum, beta)
+  model.addInstances(instanceList)
+  model.setNumThreads(numTopics min MAX_THREADS)
+  model.setNumIterations(numIterations)
+  model.estimate()
+
+  def getTopics: List[Topic] = {
+    val priors = getTopicPriors
+    val topicsToAlphaIds = scala.collection.mutable.Map[Int,List[(Int,Double)]]()
+
+    val wordsTopicsCounts = (for ((topicCounts, typeIndex) <- model.typeTopicCounts.zipWithIndex) yield {
+      val word = alphabet.lookupObject(typeIndex).toString
+      (for (topicCount <- topicCounts) yield {
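+        // Mallet's ParallelTopicModel packs each (topic, count) pair into a single
+        // int: the low topicBits hold the topic id, the remaining high bits the count.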
+        val topic = topicCount & model.topicMask
+        val count = topicCount >> model.topicBits
+        (word,topic,count)
+      }).iterator
+    }).iterator.flatten.toList
+
+
+    val res = (for (i <- 0 until numTopics) yield {
+      val wordCounts = wordsTopicsCounts.filter((triple)=>(triple._2==i && triple._3!=0))
+      val sum = wordCounts.map((triple)=>triple._3).reduce(_ + _)
+      Topic(priors(i), wordCounts.map((triple)=>(triple._1->(triple._3.toDouble/sum))).toMap)
+    }).toList
+    res
+  }
+
+  def getTopicPriors: List[Double] = {
+    val result: Array[Double] = new Array[Double](numTopics)
+    var sum = 0.0
+    for (topicAssignment <- model.getData) {
+      val temp: Array[Double] = model.getTopicProbabilities(topicAssignment.topicSequence)
+      for (i <- 0 until result.length) {
+        result(i) += temp(i)
+        sum += temp(i)
+      }
+    }
+    result.toList.map((double: Double) => double / sum)
+  }
+
+  def getTopicsPerInstance = {
+    (for (topicAssignment <- model.getData) yield {
+      val source = topicAssignment.instance.getName.toString
+      val dist = model.getTopicProbabilities(topicAssignment.topicSequence)
+      (source, dist.toList)
+    }).toList
+  }
+
+  def getTopicsPerTarget = {
+    val result = scala.collection.mutable.Map[SentimentLabel.Type,List[Double]]()
+    for (topicAssignment <- model.getData) {
+      val target = topicAssignment.instance.getTarget.asInstanceOf[SentimentLabel.Type]
+      result(target) = result.getOrElse(target, (new Array[Double](numTopics)).toList).zip(model.getTopicProbabilities(topicAssignment.topicSequence).toList).map((pair) => pair._1+pair._2)
+    }
+    (for ((key, value) <- result) yield {
+      val sum = value.reduce( _ + _ )
+      (key->value.map(_ / sum))
+    }).toMap
+  }
+
+  def inferTopics(tweet: GoldLabeledTweet): List[Double] = {
+    val instance = tweet match {
+        case GoldLabeledTweet(id, userid, features, goldLabel) =>
+          val featureSequence = new FeatureSequence(alphabet, features.length)
+          for (feature <- features) {
+            featureSequence.add(feature)
+          }
+          new Instance(featureSequence, goldLabel, id, null)
+      }
+    model.getInferencer.getSampledDistribution(instance, numIterations, 1, 1).toList
+  }
+}
+
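
A minimal usage sketch (trainSet and testTweet are assumed to come from NFoldExperiment.generateTrials; the hyperparameters mirror those used in NFoldTopicExperiment):

  val model: TopicModel = new LDATopicModel(trainSet, 3, 1000, 100, 0.1)
  model.getTopicPriors                 // overall weight of each of the three topics
  model.getTopicsPerTarget             // normalized topic distribution per gold sentiment label
  model.getTopics.map(_.distribution)  // per-topic word distributions
  model.inferTopics(testTweet)         // topic distribution inferred for one held-out tweet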

src/main/scala/updown/util/Statistics.scala

+package updown.util
+
+import updown.data.{SentimentLabel, SystemLabeledTweet}
+
+object Statistics {
+
+  val accuracy: (Double, Double) => Double =
+    (correct, total) => correct / total
+  val precision: (Double, Double) => Double =
+    (numCorrectlyLabeled, totalNumLabeled) => numCorrectlyLabeled / totalNumLabeled
+  val recall: (Double, Double) => Double =
+    (numCorrectlyLabeled, numberThatShouldHaveBeenLabeled) => numCorrectlyLabeled / numberThatShouldHaveBeenLabeled
+  val fScore: (Double, Double) => Double =
+    (precision, recall) => 2.0 * precision * recall / (precision + recall)
+
+  def tabulate(tweets: scala.List[SystemLabeledTweet]): (Double, Int) = {
+    var correct = 0.0
+    var total = 0
+    var numAbstained = tweets.count(_.systemLabel == null)
+
+    for (tweet <- tweets) {
+//      println(tweet.systemLabel + "|" + tweet.goldLabel)
+      /*
+       * val normedTweet = tweet.normalize("alpha")
+      *  val normedNormedTweet = normedTweet.normalize("int")
+      *  println(normedTweet.systemLabel + "|" + normedTweet.goldLabel + "\t" + normedNormedTweet.systemLabel + "|" + normedNormedTweet.goldLabel)
+      */
+      //      val normedTweet = tweet.normalize("alpha")
+      if (tweet.systemLabel == tweet.goldLabel) {
+        correct += 1
+      }
+
+      total += 1
+    }
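+    // abstained tweets (systemLabel == null) get one-third credit, i.e. chance-level accuracy over three labels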
+    correct += numAbstained.toFloat / 3
+
+    (correct, total)
+  }
+
+  def getEvalStats(tweets: scala.List[SystemLabeledTweet]): (Double, List[(SentimentLabel.Type, Double, Double, Double)]) = {
+    val (correct, total) = tabulate(tweets)
+
+    (accuracy(correct, total.toDouble),
+      (for (label <- SentimentLabel.values) yield {
+        val goldList = tweets.filter((tweet) => tweet.goldLabel == label)
+        val systemList = tweets.filter((tweet) => tweet.systemLabel == label)
+        val labelPrecision = precision(
+          systemList.filter((tweet) => tweet.goldLabel == label).length,
+          systemList.length)
+        val labelRecall = recall(
+          goldList.filter((tweet) => tweet.systemLabel == label).length,
+          goldList.length
+        )
+        (label, labelPrecision, labelRecall, fScore(labelPrecision, labelRecall))
+      }).toList)
+  }
+
+  def reportResults(resultTuple: (Double, scala.List[(SentimentLabel.Type, Double, Double, Double)])): String = {
+    val (accuracy, labelResultsList) = resultTuple
+    "Results:\n" +
+      "%12s%6.2f\n".format("Accuracy", accuracy) +
+      "%12s%11s%8s%9s\n".format("Label", "Precision", "Recall", "F-Score") +
+      (for ((label, precision, recall, fScore) <- labelResultsList) yield {
+        "%12s%11.2f%8.2f%9.2f".format(SentimentLabel.toEnglishName(label), precision, recall, fScore)
+      }).mkString("\n") + "\n"
+  }
+}
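
A small hypothetical sanity check of getEvalStats (assuming tweet ids, user ids, and features are plain strings):

  val tweets = List(
    SystemLabeledTweet("1", "u1", Nil, SentimentLabel.Positive, SentimentLabel.Positive),
    SystemLabeledTweet("2", "u2", Nil, SentimentLabel.Positive, SentimentLabel.Negative),
    SystemLabeledTweet("3", "u3", Nil, SentimentLabel.Negative, SentimentLabel.Negative),
    SystemLabeledTweet("4", "u4", Nil, SentimentLabel.Negative, SentimentLabel.Negative))

  Statistics.getEvalStats(tweets)
  // accuracy = 3/4 = 0.75
  // Positive: precision 1.00, recall 0.50, F-score 0.67
  // Negative: precision 0.67, recall 1.00, F-score 0.80
  // (labels that never occur in the sample come out as NaN, since 0.0 / 0.0 is NaN)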

src/main/scala/updown/util/TokenizationPipes.scala

 package updown.util
 
+import util.matching.Regex
+
 object TokenizationPipes {
   val twokenize: (List[String]) => List[String] =
     (ss) => ss.map((s) => Twokenize(s)).flatten
     (stopSet) =>
       (ss) => ss.filter((s) => !stopSet.contains(s))
 
+  val filterOnRegex: (String) => (List[String]) => List[String] =
+    (regex) =>
+      (ss) => ss.filter((s) => s.matches(regex))
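+  // e.g. (hypothetical tokens, not from this commit):
+  //   filterOnRegex("\\p{Alpha}+")(List("gr8", "movie", "!!", "loved", "it2")) == List("movie", "loved")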
+
   /*
     A really diligent implementation would put (n-1) "$"s at the beginning and end of the
     list, but I kind of doubt that's what we really want, so I'm not going to bother right now.

src/main/scala/updown/util/TopicModel.scala

+package updown.util
+
+import cc.mallet.types._
+import updown.data.{SentimentLabel, GoldLabeledTweet}
+
+case class Topic(prior:Double, distribution: Map[String,Double])
+
+abstract class TopicModel {
+  protected def getInstanceList(tweetList: List[GoldLabeledTweet]): (Alphabet, InstanceList) = {
+    val alphabet = new Alphabet()
+    val instances = (for (tweet <- tweetList) yield {
+      tweet match {
+        case GoldLabeledTweet(id, userid, features, goldLabel) =>
+          val featureSequence = new FeatureSequence(alphabet, features.length)
+          for (feature <- features) {
+            featureSequence.add(feature)
+          }
+          new Instance(featureSequence, goldLabel, id, null)
+      }
+    }).toList
+
+    val instanceList = new InstanceList(alphabet, null)
+    for (instance <- instances) {
+      instanceList.add(instance)
+    }
+    (alphabet, instanceList)
+  }
+
+  def getTopics: List[Topic]
+  def getTopicPriors: List[Double]
+  def getTopicsPerInstance: List[(String,List[Double])]
+  def getTopicsPerTarget: Map[SentimentLabel.Type,List[Double]]
+  def inferTopics(tweet: GoldLabeledTweet): List[Double]
+}