Commits

vvcephei committed e479892

refactored all preprocessors (except emoticon) to use GenericPreprocessor

Comments (0)

Files changed (27)

bin/compare-output.sh

 EOF
 }
 
+export CP_CONFIG=$UPDOWN_DIR/config/quiet
+
 OUT=~/.updown_output
 mkdir -p $OUT
 OUTPUT=$OUT/output
 #!/bin/bash
 
-JARS=`echo $UPDOWN_DIR/lib/*.jar $UPDOWN_DIR/lib_managed/*/*.jar $UPDOWN_DIR/output/*.jar $UPDOWN_DIR/target/*.jar | tr ' ' ':'`
+CP_CONFIG=${CP_CONFIG:-$UPDOWN_DIR/config/main}
+
+JARS=`echo $UPDOWN_DIR/lib/*.jar $UPDOWN_DIR/lib_managed/*/*.jar $UPDOWN_DIR/lib_managed/*/*/*.jar $UPDOWN_DIR/lib_managed/*/*/*/*.jar $UPDOWN_DIR/output/*.jar $UPDOWN_DIR/target/*.jar | tr ' ' ':'`
 SCALA_LIB="$HOME/.sbt/boot/scala-2.9.1/lib/scala-library.jar"
 
-CP="$UPDOWN_DIR/target/classes:$SCALA_LIB:$JARS:$CLASSPATH:$UPDOWN_DIR/config"
+CP="$UPDOWN_DIR/target/classes:$SCALA_LIB:$JARS:$CLASSPATH:$CP_CONFIG"
 
 FIRSTARG="$1"
 
 
 elif [ $CMD = 'preproc-stanford' ]; then
 
-     $SCALA_COMMAND updown.preproc.PreprocStanfordTweets $*
+     $SCALA_COMMAND updown.preproc.impl.PreprocStanfordTweets $*
 
 elif [ $CMD = 'preproc-shamma' ]; then
 
-     $SCALA_COMMAND updown.preproc.PreprocShammaTweets $*
+     $SCALA_COMMAND updown.preproc.impl.PreprocShammaTweets $*
 
 elif [ $CMD = 'preproc-hcr' ]; then
 
-     $SCALA_COMMAND updown.preproc.PreprocHCRTweets $*
+     $SCALA_COMMAND updown.preproc.impl.PreprocHCRTweets $*
 
 elif [ $CMD = 'per-tweet-eval' ]; then
 

config/log4j.properties

-log4j.rootLogger=DEBUG, stderr
-log4j.appender.stderr=org.apache.log4j.ConsoleAppender
-log4j.appender.stderr.layout=org.apache.log4j.PatternLayout
-
-# Print the date in ISO 8601 format
-log4j.appender.stderr.target=System.err
-log4j.appender.stderr.layout.ConversionPattern=%-d{ISO8601} [%t] %-5p %c %x - %m%n
-

src/main/scala/updown/app/experiment/NFoldExperiment.scala

 
   def doExperiment(testSet: List[GoldLabeledTweet], trainSet: List[GoldLabeledTweet]):
   (Double, List[(updown.data.SentimentLabel.Type, Double, Double, Double)])
-  def after():Int
+
+  def after(): Int
 
   def generateTrials(inputFile: String, nFolds: Int): Iterator[(List[GoldLabeledTweet], List[GoldLabeledTweet])] = {
-    val foldsToTweets = (for ((fold, list) <- TweetFeatureReader(inputFile).zipWithIndex.groupBy((pair) => {
-      val (_, index) = pair;
-      index % nFolds
-    })) yield {
-      (fold, list.map((pair) => {
-        val (tweet, _) = pair;
-        tweet
-      }))
-    }).toList
+    val polToTweetLists = TweetFeatureReader(inputFile).groupBy((tweet) => tweet.goldLabel)
+
+    val minListLength = (for ((pol, tweetList) <- polToTweetLists) yield tweetList.length).min
+    logger.info("taking %d items from each polarity class. This was the minimum number in any class".format(minListLength))
+    val allTweetsFolded =
+      (for (index <- 0 until minListLength) yield {
+          (for ((pol, tweetList) <- polToTweetLists) yield {
+            (pol, index, (index % nFolds, tweetList(index)))
+          }).toList.map{case(pol,index,item)=>item}
+          // this is really strange. If I just emit the item, it only emits every nth one.
+          // Somehow, emitting a tuple and then unmapping it fixes the problem.
+          // I'm guessing this is because the input is a map, and it is trying to make the output a map as well.
+      }).toList.flatten
+
+    val foldsToTweets = allTweetsFolded.groupBy{case(fold, tweet) => fold}
+      .map{case(fold,list)=>(fold,list.map{case(fold,tweet)=>tweet})}
 
     (for ((heldOutFold, heldOutData) <- foldsToTweets) yield {
-      (heldOutData,
-        foldsToTweets.filter((pair) => {
-          val (listFold, _) = pair;
-          listFold != heldOutFold
-        }).map((pair) => {
-          val (_, tweets) = pair;
-          tweets
-        }).flatten)
+      (heldOutData, foldsToTweets.filter{case(setNo,list)=>setNo != heldOutFold}.map{case(setNo,list)=>list}.flatten.toList)
     }).iterator
   }
 
       val results =
         (for ((testSet, trainSet) <- generateTrials(inputFile, nFolds)) yield {
           experimentalRun += 1
-          logger.debug("starting run "+experimentalRun)
+          logger.debug("starting run " + experimentalRun)
           val result = doExperiment(testSet, trainSet)
-          logger.debug("ending run "+experimentalRun)
+          logger.debug("ending run " + experimentalRun)
           result
         }).toList
 

src/main/scala/updown/app/experiment/topic/NFoldMajorityTopicExperiment.scala

     val GoldLabeledTweet(id, userid, features, goldLabel) = tweet
     val topicDistribution = model.inferTopics(tweet)
     val sortedDist = topicDistribution.zipWithIndex.sortBy((i) => 1.0 - i._1).map((i) => i._2)
+    val chosenTopic = topicDistribution.indexOf(topicDistribution.max)
 
-    // for now, we'll always guess positive or negative, never neutral
     SystemLabeledTweet(id, userid, features, goldLabel,
-      if (goodTopic == badTopic) SentimentLabel.Abstained
-      else if (sortedDist(0) == goodTopic) SentimentLabel.Positive
-      else if (sortedDist(0) == badTopic) SentimentLabel.Negative
-      else if (sortedDist(1) == goodTopic) SentimentLabel.Positive
-      else if (sortedDist(1) == badTopic) SentimentLabel.Negative
-      else if (sortedDist(2) == goodTopic) SentimentLabel.Positive
-      else SentimentLabel.Negative
+      if (chosenTopic == goodTopic) SentimentLabel.Positive
+      else if (chosenTopic == badTopic) SentimentLabel.Negative
+      else SentimentLabel.Neutral
     )
   }
 
   def evaluate(model: TopicModel, testSet: scala.List[GoldLabeledTweet]): (Double, scala.List[(SentimentLabel.Type, Double, Double, Double)]) = {
     val labelToTopicDist = model.getTopicsPerTarget
 
+    //This approach will only work if there is a very clear sentiment-topic correlation.
+    val badTopic = labelToTopicDist(SentimentLabel.Negative).indexOf(labelToTopicDist(SentimentLabel.Negative).max)
+    val goodTopic = labelToTopicDist(SentimentLabel.Positive).indexOf(labelToTopicDist(SentimentLabel.Positive).max)
+    val neutralTopic = if (labelToTopicDist.contains(SentimentLabel.Neutral)) labelToTopicDist(SentimentLabel.Neutral).indexOf(labelToTopicDist(SentimentLabel.Neutral).max) else -1
+    logger.info("goodTopic:%d badTopic:%d neutralTopic:%d".format(goodTopic, badTopic, neutralTopic))
 
-    val badDist = labelToTopicDist(SentimentLabel.Negative).zipWithIndex.sortBy((i) => 1.0 - i._1).map((i) => i._2)
-    logger.debug("badDist: "+badDist.toString)
-    val goodDist = labelToTopicDist(SentimentLabel.Positive).zipWithIndex.sortBy((i) => 1.0 - i._1).map((i) => i._2)
-    logger.debug("goodDist: "+goodDist.toString)
-
-    val (goodTopic, badTopic, neutralTopic): (Int, Int, Int) =
-    if (labelToTopicDist.contains(SentimentLabel.Neutral)) {
-      val neutralDist = labelToTopicDist(SentimentLabel.Neutral).zipWithIndex.sortBy((i) => 1.0 - i._1).map((i) => i._2)
-      logger.debug("neutralDist: "+neutralDist.toString)
-      val neutralTopic = neutralDist(0)
-        if (goodDist(0) != neutralTopic) {
-          if (goodDist(0) != badDist(0)) {
-            (goodDist(0), neutralTopic, badDist(0))
-          } else {
-            //then we have a pathological case
-            logger.warn("pathological topic distribution: %s".format(labelToTopicDist.toString))
-            (-1, -1, -1)
-          }
-        } else {
-          val goodTopic = goodDist(1)
-          val badTopic =
-          if (badDist(0) != neutralTopic){
-             badDist(0)
-          } else {
-            badDist(1)
-          }
-          if (goodTopic == badTopic){
-            // then we have a pathological case
-            logger.warn("pathological topic distribution: %s".format(labelToTopicDist.toString))
-            (-1, -1, -1)
-          } else {
-            (goodTopic, neutralTopic, badTopic)
-          }
-        }
-      } else {
-        // there were no neutral training instances
-        if (goodDist(0) == badDist(0)) {
-          val neutralTopic = goodDist(0)
-          if (goodDist(1) == badDist(1)) {
-            // then we have a pathological case, and the topics are not sentimental
-            logger.warn("pathological topic distribution: %s".format(labelToTopicDist.toString))
-            (-1, -1, -1)
-          } else {
-            // then the neutral topic was dominant in both cases, and the second topic held the sentiment
-            (goodDist(1), neutralTopic, badDist(1))
-          }
-        } else {
-          // then the sentimental topic was dominant, and we just have to find the neutral topic
-          val goodTopic = goodDist(0)
-          val badTopic = badDist(0)
-          if (goodDist(1) != badTopic) {
-            (goodTopic, goodDist(1), badTopic)
-          } else {
-            (goodTopic, goodDist(2), badTopic)
-          }
-        }
-      }
-    assert ((goodTopic == -1 && badTopic == -1 && neutralTopic == -1) ||
-      (goodTopic != badTopic && badTopic != neutralTopic && goodTopic != neutralTopic))
-    logger.info("goodTopic:%d badTopic:%d neutralTopic:%d".format(goodTopic, badTopic, neutralTopic))
+    if (goodTopic == badTopic){
+      logger.error("Pathological distribution. No clear topics for bad/good labels. Exiting...")
+      System.exit(1)
+    } else if (neutralTopic != -1 && (badTopic == neutralTopic || goodTopic == neutralTopic)) {
+      logger.warn("No clear distribution for the neutral label. ")
+    }
 
     val res = Statistics.getEvalStats(for (tweet <- testSet) yield {
       label(model, tweet, goodTopic, badTopic)

src/main/scala/updown/app/experiment/topic/NFoldTopicExperiment.scala

       numTopics = numTopicsOption.value.get
     }
 
+    // Thanks to a bug in Mallet, we have to cap alphaSum
+    val alphaSum = 300 min (alpha * numTopics)
 
-    val model: TopicModel = new LDATopicModel(trainSet, numTopics, iterations, alpha * numTopics, beta)
+
+    logger.debug("alphaSum: " + alphaSum)
+    val model: TopicModel = new LDATopicModel(trainSet, numTopics, iterations, alphaSum, beta)
     logger.debug("topic distribution:\n     :" + model.getTopicPriors)
     logger.debug({
       val labelToTopicDist = model.getTopicsPerTarget

src/main/scala/updown/data/SentimentLabel.scala

   val Negative = Value("-1")
   val Negative2 = Value("-2")
   val Abstained = Value("A")
+  val Unknown = Value("U")
   // this is the end of the enum definition. the rest of this object just demonstrates other
   //  stuff you can do.
 
   private val _NEG_NAME2 = "superNegative"
   private val _NEU_NAME = "neutral"
   private val _ABS_NAME = "abstained"
+  private val _UNK_NAME = "unknown"
 
   def unitSentiment(label:SentimentLabel.Type) = {
     label match {
       case Negative2 => _NEG_NAME2
       case Neutral  => _NEU_NAME
       case Abstained  => _ABS_NAME
+      case Unknown  => _UNK_NAME
     }
   }
 
       case `_NEG_NAME2` => Negative2
       case `_NEU_NAME` => Neutral
       case `_ABS_NAME` => Abstained
+      case `_UNK_NAME` => Unknown
     }
   }
 
   def figureItOut(name:String) = {
+    try {
     name.toLowerCase match {
       case `_POS_NAME` |"pos"|"p"|"+"|"1" => Positive
       case `_POS_NAME2` |"pos2"|"2" => Positive2
       case `_NEG_NAME2`|"neg2"|"-2" => Negative2
       case `_NEU_NAME`|"neu"|"neut"|"0" => Neutral
       case `_ABS_NAME` => Abstained
+      case `_UNK_NAME` => Unknown
+    }
+    } catch {
+      case e:scala.MatchError =>
+        System.err.println("couldn't figure out: \"%s\"".format(name))
+        throw e
     }
   }
 }

src/main/scala/updown/preproc/GenericPreprocessor.scala

 import org.clapper.argot.{ArgotUsageException, ArgotParser, ArgotConverters}
 import ArgotConverters._
 import updown.data.SentimentLabel
-import updown.util.{TokenizationPipes, Twokenize}
-import java.util.Collections
+import updown.util.TokenizationPipes
 import com.weiglewilczek.slf4s.Logging
+import collection.immutable.List._
+import java.io.{File, FileOutputStream, OutputStreamWriter}
 
 abstract class GenericPreprocessor extends Logging {
   // this is here to make ArgotConverters appear used to IDEA.
   var pipeStages: Map[String, (List[String]) => List[String]] =
     Map[String, (List[String]) => List[String]](
       ("lowerCase" -> TokenizationPipes.toLowercase),
-      ("addBiGrams" -> TokenizationPipes.addNGrams(2)),
+      ("addBigrams" -> TokenizationPipes.addNGrams(2)),
+      ("basicTokenize" -> TokenizationPipes.basicTokenize),
       ("twokenize" -> TokenizationPipes.twokenize),
       ("twokenizeSkipGtOneGrams" -> TokenizationPipes.twokenizeSkipGtOneGrams),
       ("filterAlpha") -> TokenizationPipes.filterOnRegex("\\p{Alpha}+"),
       ("filterAlphaQuote") -> TokenizationPipes.filterOnRegex("(\\p{Alpha}|')+"),
       ("splitSpace" -> TokenizationPipes.splitOnDelimiter(" "))
     )
+  val defaultPipeline = "twokenize|removeStopwords"
   val parser = new ArgotParser("updown run updown.preproc.PreprocStanfordTweets", preUsage = Some("Updown"))
   val inputFile = parser.option[String](List("i", "input"), "input", "path to stanford data file")
   val stopListFile = parser.option[String](List("s", "stoplist"), "stoplist", "path to stoplist file")
    ("specify the desired pipe stages separated by |: \"addBigrams|twokenize\". " +
       "Available options are in %s.").format(pipeStages.keySet))
 
-  def getInstanceIterator(fileName: String, polarity: String): Iterator[(String, String, SentimentLabel.Type, String)]
+  val targetFile = parser.option[String](List("t", "target"), "target", "target file")
+  val featureFile = parser.option[String](List("f", "feature"), "feature", "feature file")
 
-  def getInputIterator(inputOption: Option[String]): Iterator[(String, String, SentimentLabel.Type, String)] = {
+
+  def getInstanceIterator(fileName: String, polarity: String): Iterator[(String, String, Either[SentimentLabel.Type, Map[String, SentimentLabel.Type]], String)]
+
+  def getInputIterator(inputOption: Option[String]): Iterator[(String, String, Either[SentimentLabel.Type, Map[String, SentimentLabel.Type]], String)] = {
     logger.debug("entering getInputIterator")
     inputOption match {
       case Some(fileNameList) =>
         (for ((name, polarity) <- fileNameList.split("\\s*,\\s*").map((pair) => {
           val plist = pair.split("\\s*->\\s*")
-          (plist(0) -> plist(1))
+          if (plist.length > 1) {
+            (plist(0) -> plist(1))
+          } else {
+            logger.debug("the polarity for %s is not included on the command line, expecting it in the text.".format(plist(0)))
+            (plist(0) -> SentimentLabel.toEnglishName(SentimentLabel.Unknown))
+          }
         }
         ).toMap) yield {
           getInstanceIterator(name, polarity)
         (for (line <- scala.io.Source.stdin.getLines()) yield {
           line.split("|") match {
             case Array(id, reviewer, polarityString, text) =>
-              (id, reviewer, SentimentLabel.figureItOut(polarityString), text)
+              (id, reviewer, Left(SentimentLabel.figureItOut(polarityString)), text)
             case _ =>
               logger.error("Input must be of the form id|reviewer|polarity|text.")
-              ("", "", SentimentLabel.Neutral, "")
+              ("", "", Left(SentimentLabel.Abstained), "")
           }
         })
     }
   }
 
-  def main(args: Array[String]) {
-    logger.debug(args.toList.toString)
-    // don't forget that this is linked to the pipeStages dict below
-
-    // PARSE ARGS
-
-    try {
-      parser.parse(args)
-
-      // SET UP IO
-      var lineCount =
-        startId.value match {
-          case Some(id) => id
-          case None => 0
-        }
-
-      logger.debug("Inputfile: %s".format(inputFile.value))
-      val inputLines: Iterator[(String, String, SentimentLabel.Type, String)] =
-        getInputIterator(inputFile.value)
-
-
-      val stopSet: Set[String] =
-        stopListFile.value match {
-          case Some(fileName) =>
-            scala.io.Source.fromFile(fileName).getLines.toSet
-          case None => Set("a", "the", ".")
-        }
-      val tokpipe: (String, List[String]=>List[String]) = ("removeStopwords", TokenizationPipes.filterOnStopset(stopSet))
-      pipeStages = pipeStages + tokpipe
-
-
-      logger.debug("Pipeline option: %s".format(textPipeline.value))
-      val pipeline: List[(List[String]) => List[String]] =
-        if (textPipeline.value.isDefined) {
-          val arg: String = textPipeline.value.get
-          (for (pipeStage <- arg.split("\\|")) yield {
-            if (pipeStages.keySet.contains(pipeStage)) {
-              pipeStages(pipeStage)
-            } else {
-              parser.usage("invalid pipeStage: %s".format(pipeStage))
-            }
-          }).toList
-        } else {
-          List(pipeStages("twokenize"), pipeStages("removeStopwords"))
-        }
-      logger.debug("Pipeline: %s".format(pipeline))
-
-
-      // RUN
-      for ((id, reviewer, polarity, text) <- inputLines) {
-        println(
-          "%s|%s|%s|%s".format(
-            if (id == "") lineCount else id,
-            reviewer,
-            runThroughPipeLine(text.replaceAll(",", ""), pipeline).mkString(","),
-            polarity))
-        lineCount += 1
-      }
-      logger.debug("Done!")
-    }
-    catch {
-      case e: ArgotUsageException =>
-        println(e.message)
-        System.exit(1)
-    }
-  }
-
   def runThroughPipeLine(text: String, pipeLine: List[(List[String]) => List[String]]): List[String] = {
     var res = List(text)
     for (pipeStage <- pipeLine) {
     }
     res
   }
+
+  def writeInstance(id: String, reviewer: String, text: String, polarity: String, writer: OutputStreamWriter) {
+    writer.write("%s|%s|%s|%s\n".format(id, reviewer, text, polarity))
+  }
+
+  def writeTarget(id: String, target: String, writer: OutputStreamWriter) {
+    writer.write("%s|%s\n".format(id, target))
+  }
+
+  def main(args: Array[String]) {
+    logger.debug(args.toList.toString)
+    try {
+      parser.parse(args)
+
+      // SET UP IO
+
+      logger.debug("Inputfile: %s".format(inputFile.value))
+      val inputLines = getInputIterator(inputFile.value)
+      val targetWriter = new OutputStreamWriter(
+        targetFile.value match {
+          case Some(fileName) => new FileOutputStream(new File(fileName))
+          case None => System.out
+        }, "UTF-8")
+
+      // Note: if you want to squelch output entirely, you can initialize the writer with
+      // new java.io.OutputStream() { public void write ( int b ) { } }
+
+      val featureWriter = new OutputStreamWriter(
+        featureFile.value match {
+          case Some(fileName) => new FileOutputStream(new File(fileName))
+          case None => System.out
+        }, "UTF-8")
+
+      val stopSet: Set[String] =
+        stopListFile.value match {
+          case Some(fileName) =>
+            scala.io.Source.fromFile(fileName).getLines.toSet
+          case None => Set("a", "the", ".")
+        }
+      val tokpipe: (String, List[String] => List[String]) = ("removeStopwords", TokenizationPipes.filterOnStopset(stopSet))
+      pipeStages = pipeStages + tokpipe
+
+
+      logger.debug("Pipeline option: %s".format(textPipeline.value))
+      val pipeline: List[(List[String]) => List[String]] = {
+        val arg: String =
+          if (textPipeline.value.isDefined) {
+            textPipeline.value.get
+          } else {
+            defaultPipeline
+          }
+        (for (pipeStage <- arg.split("\\|")) yield {
+          if (pipeStages.keySet.contains(pipeStage)) {
+            pipeStages(pipeStage)
+          } else {
+            parser.usage("invalid pipeStage: %s".format(pipeStage))
+          }
+        }).toList
+      }
+      logger.debug("Pipeline: %s".format(pipeline))
+
+      // STATS
+      val idNumStart =
+        startId.value match {
+          case Some(id) => id
+          case None => 0
+        }
+      var numLines = 0
+      var numSkipped = 0
+      var numClasses = scala.collection.mutable.Map[SentimentLabel.Type, Int]().withDefaultValue(0)
+      var numLabels = 0
+      // RUN
+      for ((id, reviewer, polarityChoice, text) <- inputLines) {
+        val outputID = if (id == "") (idNumStart + numLines).toString else id
+        val outputText = runThroughPipeLine(text, pipeline).map((s) => s.replaceAll(",", "-COMMA-").replaceAll("\\|", "-PIPE-")).mkString(",")
+        polarityChoice match {
+          case Left(polarity) =>
+            // no targets
+            if (polarity != SentimentLabel.Abstained) {
+              writeInstance(outputID, reviewer, outputText, polarity.toString, featureWriter)
+              numLines += 1
+              numClasses(polarity) += 1
+              numLabels += 1
+            } else {
+              numClasses(SentimentLabel.Abstained) += 1
+            }
+          case Right(polarityMap) =>
+            // map of target -> polarity
+            val labelList = polarityMap.map {
+              case (target, label) => label
+            }.toList
+            val targetList = polarityMap.map {
+              case (target, label) => target
+            }.toList
+            if (labelList.filter((label) => label != SentimentLabel.Abstained).length > 0) {
+              writeInstance(outputID, reviewer, outputText, labelList.mkString(","), featureWriter)
+              writeTarget(outputID, targetList.mkString(","), targetWriter)
+              numLines += 1
+              numLabels += polarityMap.size
+              for ((_, label) <- polarityMap) {
+                numClasses(label) += 1
+              }
+            } else {
+              numClasses(SentimentLabel.Abstained) += 1
+            }
+        }
+      }
+      featureWriter.flush()
+      targetWriter.flush()
+      logger.info("Stats:\n"+
+        "Preprocessed " + numLines + " tweets. " +
+        "Assigned %d labels.\n".format(numLabels) +
+        (for ((label, count) <- numClasses if label != SentimentLabel.Abstained) yield
+          "%20s: %10d instances (%2.2f%%)"
+            .format(
+            SentimentLabel.toEnglishName(label),
+            count,
+            count.toFloat / numLabels * 100)).mkString("\n") +
+        "\n\n%20s: %10d instances"
+                .format(
+                "skipped",
+                numClasses(SentimentLabel.Abstained))
+      )
+      // These may close stdout, so make sure they are last!
+      featureWriter.close()
+      targetWriter.close()
+    }
+    catch {
+      case e: ArgotUsageException =>
+        println(e.message)
+        System.exit(1)
+    }
+  }
+
 }

src/main/scala/updown/preproc/PreprocFlatFilesCat.scala

-package updown.preproc
-
-import updown.data.SentimentLabel
-import org.clapper.argot.ArgotConverters._
-/**
- * This preprocessor is suitable for any file that contains one instance per line with no labels or ids. This variation will concatenate all lines in each file and create just one instance per file.
- */
-object PreprocFlatFilesCat extends GenericPreprocessor {
-  val minDocSizeOption = parser.option[Int](List("minDocSize"), "INT", "concatenate inputs until the docsize (in characters) reaches INT")
-
-  def getInstanceIterator(fileName: String, polarity: String): Iterator[(String, String, SentimentLabel.Type, String)] = {
-    try {
-      val minDocSize =
-      (if (minDocSizeOption.value.isDefined)
-        minDocSizeOption.value.get
-      else
-        20000)
-      var totalLength = 0
-      var fileLines = ""
-      var result = List[(String, String, SentimentLabel.Type, String)]()
-      val source = scala.io.Source.fromFile(fileName, "UTF-8").getLines
-      for ((line, index) <- source.zipWithIndex) {
-        fileLines += line.replace("|", "")
-        if (fileLines.length > minDocSize) {
-          result = (fileName + index, "reviewer", SentimentLabel.figureItOut(polarity), fileLines) :: result
-          logger.info("processed %d inputs.".format(index))
-          totalLength += fileLines.length
-          fileLines = ""
-        }
-      }
-      if (fileLines!=""){
-        result = (fileName + "Remainder", "reviewer", SentimentLabel.figureItOut(polarity), fileLines) :: result
-        totalLength += fileLines.length
-      }
-      logger.info("average length: %f".format(totalLength.toFloat / result.length))
-      result.iterator
-    } catch {
-      case e: MatchError =>
-        logger.error("Couldn't figure out what sentiment '%s' is supposed to be." +
-          " Try using 'pos', 'neg', or 'neu'. Skipping %s...".format(polarity, fileName))
-        Iterator[(String, String, SentimentLabel.Type, String)]()
-    }
-  }
-}

src/main/scala/updown/preproc/PreprocHCRTweets.scala

-package updown.preproc
-
-import model.{TweetParse}
-import updown.util._
-import java.io._
-import au.com.bytecode.opencsv.CSVReader
-import scala.collection.immutable._
-
-import org.clapper.argot._
-import updown.data.SentimentLabel
-
-case class SuccessfulHCRParse(tweetid: String, username: String,
-                              sentTargList: List[(SentimentLabel.Type, String)],
-                              features: Iterable[String]) extends TweetParse
-
-case class FailedHCRParse(reason: String) extends TweetParse
-
-object PreprocHCRTweets {
-
-  import ArgotConverters._
-
-  val parser = new ArgotParser("updown run updown.preproc.PreprocHCRTweets", preUsage = Some("Updown"))
-
-  val inputFile = parser.option[String](List("i", "input"), "input", "csv input")
-  val stopListFile = parser.option[String](List("s", "stoplist"), "stoplist", "stoplist words")
-  val targetFile = parser.option[String](List("t", "target"), "target", "target file")
-  val featureFile = parser.option[String](List("f", "feature"), "feature", "feature file")
-  val ignoreNeutral = parser.flag[Boolean](List("ignoreNeutral"), "set this flag if you want to ignore neutral annotations")
-
-  val HCR_POS = "positive"
-  val HCR_NEG = "negative"
-  val HCR_NEU = "neutral"
-
-  val PARSE_FAIL_NO_SENT = "NO_SENT"
-  val PARSE_FAIL_INVAL_SENT = "INVAL_SENT"
-  val PARSE_FAIL_NO_TWEET_ID = "NO_TWEET_ID"
-  val PARSE_FAIL_NO_USERNAME = "NO_USERNAME"
-  val PARSE_FAIL_NO_TWEET = "NO_TWEET"
-  val PARSE_FAIL_NO_TARGET = "NO_TARGET"
-
-  def processOneLine(numFields: Int, fields: Array[String], stoplist: Set[String]): TweetParse = {
-    // tweet id,user id,username,content,sentiment,target,annotator id,comment,dispute
-    val INDEX_TWEET_ID = 0
-    //val INDEX_USER_ID = 1 // unused, leaving here for documentation
-    val INDEX_USER_NAME = 2
-    val INDEX_CONTENT = 3
-
-    val ITERATE_START = 4
-    val ITERATE_END = numFields - 3
-
-    //    val INDEX_ANNOTATOR_ID = numFields - 3 // unused, leaving here for documentation
-    //    val INDEX_COMMENT = numFields - 2 // unused, leaving here for documentation
-    //    val INDEX_DISPUTE = numFields - 1 // unused, leaving here for documentation
-
-    if (fields.length < 5) {
-      return FailedHCRParse(PARSE_FAIL_NO_SENT)
-    }
-
-    val tweetid = if (fields(INDEX_TWEET_ID).trim.matches("\\d+")) fields(INDEX_TWEET_ID).trim else "" // why are we doing this? why not just take whatever is there as the id?
-    val username = if (fields.length > INDEX_USER_NAME) fields(INDEX_USER_NAME).trim else ""
-    val tweet = if (fields.length > INDEX_CONTENT) fields(INDEX_CONTENT).trim else ""
-    var sentimentList = List[SentimentLabel.Type]()
-    var targetList = List[String]()
-    for (i <- ITERATE_START until ITERATE_END by 2) {
-      val sentiment = if (fields.length > i) fields(i).trim else ""
-      if (!(sentiment == HCR_POS || sentiment == HCR_NEG || sentiment == HCR_NEU))
-        return FailedHCRParse(PARSE_FAIL_INVAL_SENT)
-
-      sentimentList = (sentiment match {
-        case `HCR_POS` => SentimentLabel.Positive
-        case `HCR_NEU` => SentimentLabel.Neutral
-        case `HCR_NEG` => SentimentLabel.Negative
-      }) :: sentimentList
-
-      targetList = (if (fields.length > i + 1) fields(i + 1).trim else "") :: targetList
-      if (targetList(0) == "")
-        return FailedHCRParse(PARSE_FAIL_NO_TARGET)
-    }
-    val sentTargList = sentimentList zip targetList
-
-    if (tweetid == "")
-      return FailedHCRParse(PARSE_FAIL_NO_TWEET_ID)
-    if (username == "")
-      return FailedHCRParse(PARSE_FAIL_NO_USERNAME)
-    if (tweet == "")
-      return FailedHCRParse(PARSE_FAIL_NO_TWEET)
-
-    val tokens = BasicTokenizer(tweet)
-    val features = tokens.filterNot(stoplist(_)) ::: StringUtil.generateBigrams(tokens)
-
-    SuccessfulHCRParse(tweetid, username, sentTargList, features)
-  }
-
-  def writeOutput(featureWriter: OutputStreamWriter, tweetid: String,
-                  username: String, features: Iterable[String],
-                  sentTargList: List[(SentimentLabel.Type, String)], targetWriter: OutputStreamWriter) {
-    var label = ""
-    var target = ""
-    for ((sentiment, targetString) <- sentTargList) {
-      label += (if (label != "") "," else "") + sentiment
-      target += (if (target != "") "," else "") + targetString
-    }
-    featureWriter.write("%s|%s|%s|%s\n".format(tweetid, username, features.mkString(",").replace("|", ""), label))
-    targetWriter.write("%s|%s\n".format(tweetid, target))
-  }
-
-  def main(args: Array[String]) {
-    try {
-      parser.parse(args)
-    }
-    catch {
-      case e: ArgotUsageException => println(e.message);
-      sys.exit(0)
-    }
-
-    // dumb, I know, but a boolean flag turns out to be an Option, which is even dumber
-    val ignoreNeut = if (ignoreNeutral.value == None) false else true
-
-    if (inputFile.value == None) {
-      println("You must specify a input data file via --in or --input ")
-      sys.exit(0)
-    }
-    if (stopListFile.value == None) {
-      println("You must specify a stoplist file via -s ")
-      sys.exit(0)
-    }
-
-
-    val reader = new CSVReader(new InputStreamReader(new FileInputStream(new File(inputFile.value.get)), "UTF-8"))
-    val stoplist = scala.io.Source.fromFile(stopListFile.value.get, "utf-8").getLines().toSet
-    val targetWriter = new OutputStreamWriter(
-      (if (targetFile.value != None)
-        new FileOutputStream(new File(targetFile.value.get))
-      else
-        System.err), "UTF-8")
-
-    val featureWriter = new OutputStreamWriter(
-      (if (featureFile.value != None)
-        new FileOutputStream(new File(featureFile.value.get))
-      else
-        System.out), "UTF-8")
-
-
-    var numTweets = 0
-    var numCounted = 0
-    var numPos = 0 //takes on a new meaning with multiple target labels
-    var numNeg = 0 //same deal here
-    var numNeu = 0
-    var noTweetID = 0
-    var noUserName = 0
-    var noTweet = 0
-    var noSentiment = 0
-    var invalSentiment = 0
-    var noTarget = 0
-
-
-    var fields = reader.readNext
-    // Assumes there is a header!!!
-    val numFields = fields.length
-    fields = reader.readNext
-    while (fields != null) {
-      numTweets += 1
-      processOneLine(numFields, fields, stoplist) match {
-        case SuccessfulHCRParse(tweetid, username, sentTargList, features) =>
-          for ((sentiment, target) <- sentTargList) {
-            numCounted += 1
-            sentiment match {
-              case SentimentLabel.Positive => numPos += 1
-              case SentimentLabel.Negative => numNeg += 1
-              case SentimentLabel.Neutral =>
-                if (!ignoreNeut)
-                  numNeu += 1
-                else
-                  numCounted -= 1
-            }
-          }
-          writeOutput(featureWriter, tweetid, username, features, sentTargList, targetWriter)
-        case FailedHCRParse(PARSE_FAIL_NO_SENT) =>
-          noSentiment += 1
-        case FailedHCRParse(PARSE_FAIL_INVAL_SENT) =>
-          invalSentiment += 1
-        case FailedHCRParse(PARSE_FAIL_NO_TWEET) =>
-          noTweet += 1
-        case FailedHCRParse(PARSE_FAIL_NO_TWEET_ID) =>
-          noTweetID += 1
-        case FailedHCRParse(PARSE_FAIL_NO_USERNAME) =>
-          noUserName += 1
-        case FailedHCRParse(PARSE_FAIL_NO_TARGET) =>
-          noTarget += 1
-      }
-      fields = reader.readNext
-    }
-
-    targetWriter.flush()
-    featureWriter.flush()
-
-    val log = System.err
-
-    log.println("Preprocessed " + numCounted +
-      " tweets. Fraction positive: " + (numPos.toFloat / numCounted) +
-      "\tFraction Negative: " + (numNeg.toFloat / numCounted)
-      + "\tFraction Neutral: " + (numNeu.toFloat / numCounted))
-    log.println("Num pos tweets: " + numPos + ".\t Num neg tweets: " + numNeg + ".\t Num neutral tweets: " + numNeu)
-    log.println((numTweets - numCounted) + " were numNotCounted" +
-      "\nand num of noSentiment: " + noSentiment +
-      "\nand num of invalSentiment: " + invalSentiment +
-      "\nand num of noTarget " + noTarget)
-    log.println("noTweet: " + noTweet + " noUserName: " + noUserName + " noTweetID: " + noTweetID)
-
-    reader.close()
-    targetWriter.close()
-    featureWriter.close()
-
-  }
-}

src/main/scala/updown/preproc/PreprocPangLeePolarityCorpus.scala

-package updown.preproc
-
-import updown.data.SentimentLabel
-import java.io.File
-
-/**
- * This preprocessor is suitable for any directory that contains files which should each be mapped to one instance
- * whose polarity is signified by the label given to the directory in the inputOption
- */
-object PreprocPangLeePolarityCorpus extends GenericPreprocessor {
-
-  def getInstanceIterator(fileName: String, polarity: String): Iterator[(String, String, SentimentLabel.Type, String)] = {
-    try {
-      val dir = new File(fileName)
-      assert(dir.isDirectory)
-      (for (file: File <- dir.listFiles()) yield
-        (file.getName,
-          "reviewer",
-          SentimentLabel.figureItOut(polarity),
-          scala.io.Source.fromFile(file, "ISO-8859-1").getLines().mkString(" ").replace("|", "")
-          )
-        ).iterator
-    } catch {
-      case e: MatchError =>
-        logger.error("Couldn't figure out what sentiment '%s' is supposed to be." +
-          " Try using 'pos', 'neg', or 'neu'. Skipping %s...".format(polarity, fileName))
-        Iterator[(String, String, SentimentLabel.Type, String)]()
-    }
-  }
-}

src/main/scala/updown/preproc/PreprocPangLeeSentenceCorpus.scala

-package updown.preproc
-
-import updown.data.SentimentLabel
-
-/**
- * This preprocessor is suitable for any file that contains one instance per line with no labels or ids
- */
-object PreprocPangLeeSentenceCorpus extends GenericPreprocessor {
-
-  def getInstanceIterator(fileName: String, polarity: String): Iterator[(String, String, SentimentLabel.Type, String)] = {
-    try {
-      for (line <- scala.io.Source.fromFile(fileName, "ISO-8859-1").getLines) yield {
-        ("", "reviewer", SentimentLabel.figureItOut(polarity), line.replace("|", ""))
-      }
-    } catch {
-      case e: MatchError =>
-        logger.error("Couldn't figure out what sentiment '%s' is supposed to be." +
-          " Try using 'pos', 'neg', or 'neu'. Skipping %s...".format(polarity, fileName))
-        Iterator[(String, String, SentimentLabel.Type, String)]()
-    }
-  }
-}

src/main/scala/updown/preproc/PreprocShammaTweets.scala

-package updown.preproc
-
-import model.TweetParse
-import updown.util._
-
-import org.clapper.argot._
-import updown.data.SentimentLabel
-
-case class SuccessfulShammaParse(tweetid: String, username: String, label: SentimentLabel.Type, iaa: Double, features: Iterable[String]) extends TweetParse
-
-object PreprocShammaTweets {
-  import ArgotConverters._
-
-  val parser = new ArgotParser("updown preproc-shamma", preUsage=Some("Updown"))
-  
-  val inputFile = parser.option[String](List("i","input"),"input", "path to shamma's Obama-McCain debate data file")
-  val stopListFile =  parser.option[String](List("s","stoplist"),"stoplist", "path to stoplist file")
-  
-  val lineRE = """^(\d+)\t[^\t]+\t([^\t]+)\t([^\t]+)\t[^\t]*\t(.*)$""".r
-  val ratingRE = """\d""".r
-
-  val SHAMMA_POS = "2"
-  val SHAMMA_NEG = "1"
-  val SHAMMA_MIXED = "3"
-  val SHAMMA_OTHER = "4"
-
-  def processOneLine(line: String, stoplist: Set[String]): Any = {
-    val roughTokens = line.split("\t")
-
-    if (!line.startsWith("#") && roughTokens.length >= 8 && line.length > 0 && Character.isDigit(line(0))) {
-      val lineRE(tweetid, tweet, username, ratingsRaw) = line
-      val ratings = ratingRE.findAllIn(ratingsRaw).toList
-
-      // we only consider tweets that were evaluated by 3 or more annotators
-      if (ratings.length >= 3) {
-        val numPos = ratings.count(_ == SHAMMA_POS)
-        val numNeg = ratings.count(_ == SHAMMA_NEG)
-        val posFraction = numPos.toFloat / ratings.length
-        val negFraction = numNeg.toFloat / ratings.length
-        val majorityFraction = math.max(posFraction, negFraction)
-        //only consider non-tie classifications
-        if (majorityFraction > .5) {
-          val label = if (posFraction > negFraction) SentimentLabel.Positive else SentimentLabel.Negative
-          val tokens = BasicTokenizer(tweet)
-          val features = tokens.filterNot(stoplist(_)) ::: StringUtil.generateBigrams(tokens)
-
-          SuccessfulShammaParse(tweetid, username, label, majorityFraction, features)
-        }
-      }
-    }
-  }
-
-  def main(args: Array[String]) {
-    try{parser.parse(args)}
-    catch { case e: ArgotUsageException => println(e.message); sys.exit(0) }
-    
-    if(inputFile.value == None){
-      println("You must specify a input data file via -i")
-      sys.exit(0)
-    }
-    if(stopListFile.value == None){
-      println("You must specify a stoplist file via -s ")
-      sys.exit(0)
-    }
-    
-    val lines = scala.io.Source.fromFile(inputFile.value.get).getLines
-    val stoplist = scala.io.Source.fromFile(stopListFile.value.get).getLines.toSet
-
-    var numTweets = 0
-    var numPosTweets = 0
-    var averageIAA = 0.0
-    for (line <- lines) {
-
-      processOneLine(line, stoplist) match {
-        case SuccessfulShammaParse(tweetid, username, label, iaa, features) =>
-          numTweets += 1
-          averageIAA += iaa
-          if (label == SentimentLabel.Positive) numPosTweets += 1
-          printf("%s|%s|%s|%s\n", tweetid, username, features.mkString(",").replace("|", ""), label.toString)
-        case _ => ()
-      }
-    }
-
-    System.err.println("Preprocessed " + numTweets + " tweets. Fraction positive: " + (numPosTweets.toFloat / numTweets))
-    System.err.println("Average inter-annotator agreement: " + averageIAA / numTweets)
-  }
-}

src/main/scala/updown/preproc/PreprocStanfordTweets.scala

-package updown.preproc
-
-import model.TweetParse
-import updown.util._
-
-import org.clapper.argot._
-import updown.data.SentimentLabel
-
-case class SuccessfulStanfordParse(tweetid: String, username: String, label: SentimentLabel.Type, features: Iterable[String]) extends TweetParse
-
-object PreprocStanfordTweets {
-  //IDEA will try to remove this import, but it is not unused. Make sure it stays here.
-  // See http://devnet.jetbrains.com/message/5301770;jsessionid=5C12AD4FD62857DAD611E8EEED52DF6A
-  import ArgotConverters._
-  
-  val parser = new ArgotParser("updown run updown.preproc.PreprocStanfordTweets", preUsage = Some("Updown"))
-
-  val inputFile = parser.option[String](List("i", "input"), "input", "path to stanford data file")
-  val stopListFile = parser.option[String](List("s", "stoplist"), "stoplist", "path to stoplist file")
-
-  val lineRE = """^(\d+);;(\d+);;[^;]+;;[^;]+;;([^;]+);;(.*)$""".r
-
-  // TODO: verify the meanings of these values
-  val STAN_POS = "4"
-  val STAN_NEU = "2"
-  val STAN_NEG = "0"
-
-  def processOneLine(line: String, stoplist: Set[String]): Any = {
-    val lineRE(sentimentRaw, tweetid, username, tweet) = line
-    if (sentimentRaw == STAN_POS || sentimentRaw == STAN_NEG) {
-      val tokens = BasicTokenizer(tweet)
-      val features = tokens.filterNot(stoplist(_)) ::: StringUtil.generateBigrams(tokens)
-      val label = if (sentimentRaw == STAN_POS) SentimentLabel.Positive else SentimentLabel.Negative
-
-      SuccessfulStanfordParse(tweetid, username, label, features)
-    }
-  }
-
-  def main(args: Array[String]) {
-    try {
-      parser.parse(args)
-    }
-    catch {
-      case e: ArgotUsageException => println(e.message); sys.exit(0)
-    }
-
-    if (inputFile.value == None) {
-      println("You must specify a input data file via -i ")
-      sys.exit(0)
-    }
-    if (stopListFile.value == None) {
-      println("You must specify a stoplist file via -s ")
-      sys.exit(0)
-    }
-
-    val lines = scala.io.Source.fromFile(inputFile.value.get).getLines
-    val stoplist = scala.io.Source.fromFile(stopListFile.value.get).getLines.toSet
-
-    var numTweets = 0
-    var numPos = 0
-    for (line <- lines) {
-      processOneLine(line, stoplist) match {
-        case SuccessfulStanfordParse(tweetid, username, label, features) =>
-          numTweets += 1
-          if (label == SentimentLabel.Positive)
-            numPos += 1
-          printf("%s|%s|%s|%s\n", tweetid, username, features.mkString(",").replace("|", ""), label.toString)
-        case _ => ()
-      }
-    }
-
-    System.err.println("Preprocessed " + numTweets + " tweets. Fraction positive: " + (numPos.toFloat / numTweets))
-  }
-}

src/main/scala/updown/preproc/PreprocTSVFiles.scala

-package updown.preproc
-
-import updown.data.SentimentLabel
-
-/**
- * This preprocessor is suitable for any file that contains one instance per line with no labels or ids
- */
-object PreprocTSVFiles extends GenericPreprocessor {
-
-  val lineRegex = "(\\S*)\\s*(\\S*)\\s*(.*)".r
-
-  def getInstanceIterator(fileName: String, polarity: String): Iterator[(String, String, SentimentLabel.Type, String)] = {
-    try {
-      for (line <- scala.io.Source.fromFile(fileName, "UTF-8").getLines) yield {
-        val lineRegex(id, label, text) = line
-        logger.debug("id:%s label:%s opol:%s pol:%s".format(id, label, polarity, SentimentLabel.figureItOut(polarity)))
-        (id, "reviewer", SentimentLabel.figureItOut(polarity), text.replace("|", ""))
-      }
-    } catch {
-      case e: MatchError =>
-        logger.error("Couldn't figure out what sentiment '%s' is supposed to be." +
-          " Try using 'pos', 'neg', or 'neu'. Skipping %s...".format(polarity, fileName))
-        Iterator[(String, String, SentimentLabel.Type, String)]()
-    }
-  }
-}

src/main/scala/updown/preproc/PreprocTSVFilesCat.scala

-package updown.preproc
-
-import updown.data.SentimentLabel
-import io.BufferedSource
-import org.clapper.argot.ArgotConverters
-import ArgotConverters._
-
-/**
- * This preprocessor is suitable for any file that contains one instance per line with no labels or ids. This variation will concatenate all lines in each file and create just one instance per file.
- */
-object PreprocTSVFilesCat extends GenericPreprocessor {
-  val minDocSizeOption = parser.option[Int](List("minDocSize"), "INT", "concatenate inputs until the docsize (in characters) reaches INT")
-
-  val lineRegex = "(\\S*)\\s*(\\S*)\\s*(.*)".r
-
-  def getInstanceIterator(fileName: String, polarity: String): Iterator[(String, String, SentimentLabel.Type, String)] = {
-    try {
-      val minDocSize =
-      (if (minDocSizeOption.value.isDefined)
-        minDocSizeOption.value.get
-      else
-        20000)
-      var totalLength = 0
-      var fileLines = ""
-      var result = List[(String, String, SentimentLabel.Type, String)]()
-      val source = scala.io.Source.fromFile(fileName, "UTF-8").getLines
-      for ((line, index) <- source.zipWithIndex) {
-
-        val lineRegex(id, label, text) = line
-        //        logger.debug("id:%s label:%s opol:%s pol:%s".format(id, label, polarity, SentimentLabel.figureItOut(polarity)))
-        fileLines += text.replace("|", "")
-        if (fileLines.length > minDocSize) {
-          result = (fileName + index, "reviewer", SentimentLabel.figureItOut(polarity), fileLines) :: result
-          logger.info("processed %d inputs.".format(index))
-          totalLength += fileLines.length
-          fileLines = ""
-        }
-      }
-      if (fileLines!=""){
-        result = (fileName + "Remainder", "reviewer", SentimentLabel.figureItOut(polarity), fileLines) :: result
-        totalLength += fileLines.length
-      }
-      logger.info("average length: %f".format(totalLength.toFloat / result.length))
-      result.iterator
-    } catch {
-      case e: MatchError =>
-        logger.error("Couldn't figure out what sentiment '%s' is supposed to be." +
-          " Try using 'pos', 'neg', or 'neu'. Skipping %s...".format(polarity, fileName))
-        Iterator[(String, String, SentimentLabel.Type, String)]()
-    }
-  }
-}

src/main/scala/updown/preproc/StripIds.scala

-package updown.preproc
-
-object StripIds {
-
-  val lineRE = """^[^|]+\|[^|]+\|(.*)$""".r
-
-  def main(args: Array[String]) {
-    for(line <- scala.io.Source.fromFile(args(0),"utf-8").getLines()) {
-      val lineRE(stripped) = line
-      println(stripped)
-    }
-  }
-}

src/main/scala/updown/preproc/impl/PreprocHCRTweets.scala

+package updown.preproc.impl
+
+import updown.data.SentimentLabel
+import updown.preproc.GenericPreprocessor
+import au.com.bytecode.opencsv.CSVReader
+import java.io.{File, FileInputStream, InputStreamReader}
+
+/**
+ * Preprocessor for the Health Care Reform (HCR) tweet corpus, distributed as a
+ * CSV file: tweet id, user id, user name, tweet text, then repeating groups of
+ * five annotation columns of which the first two are (sentiment, target).
+ */
+object PreprocHCRTweets extends GenericPreprocessor {
+
+  val lineRE = """^([^,]*),([^,]*),([^,]*),([^,]*),(.*)$""".r
+
+  // Sentiment strings as they appear in the HCR annotation columns.
+  val HCR_POS = "positive"
+  val HCR_NEG = "negative"
+  val HCR_NEU = "neutral"
+
+  override val defaultPipeline = "basicTokenize|addBigrams|removeStopwords"
+
+  /**
+   * Folds a flat list of annotation fields (groups of five: sentiment, target,
+   * and three ignored columns) into a target -> label map. Unrecognized
+   * sentiment strings map to Abstained. getInstanceIterator pads the list to a
+   * multiple of 5 before calling this.
+   */
+  def getTargetToLabelMap(labelInfo: List[String]): Map[String, SentimentLabel.Type] = {
+    labelInfo match {
+      case sentiment :: target :: _ :: _ :: _ :: rest =>
+        val label =
+        sentiment match {
+          case `HCR_POS` => SentimentLabel.Positive
+          case `HCR_NEG` => SentimentLabel.Negative
+          case `HCR_NEU` => SentimentLabel.Neutral
+          case _ => SentimentLabel.Abstained
+        }
+        getTargetToLabelMap(rest) + ((target, label))
+      // Nil, or a malformed remainder shorter than 5 fields (previously a
+      // MatchError for the latter).
+      case _ => Map.empty
+    }
+  }
+
+  def getInstanceIterator(fileName: String, polarity: String): Iterator[(String, String, Either[SentimentLabel.Type, Map[String, SentimentLabel.Type]], String)] = {
+    // CSVReader exposes no iterator, so read the whole file up front.
+    // Was `inputFile.value.get`, which ignored the fileName argument (unlike
+    // every other preprocessor) and broke multi-file input.
+    val reader = new CSVReader(new InputStreamReader(new FileInputStream(new File(fileName)), "UTF-8"))
+    var fields = reader.readNext() // burn the header column
+    fields = reader.readNext()
+    var lines = List[List[String]]()
+    while (fields != null) {
+      lines = fields.toList.map((s)=>s.trim) :: lines
+      fields = reader.readNext()
+    }
+    reader.close() // previously leaked
+    lines = lines.reverse
+
+    // now that we have one, we can use it
+    (for (fields: List[String] <- lines) yield {
+      val (tid :: (uid :: (uname :: (tweet :: labelInfo)))) = fields
+      // Pad the trailing annotation columns to a multiple of 5 so the
+      // recursive decomposition in getTargetToLabelMap consumes them cleanly.
+      val newLabelInfo =
+        (labelInfo.length % 5) match {
+          case 0 => labelInfo
+          case 1 => labelInfo ::: List("", "", "", "")
+          case 2 => labelInfo ::: List("", "", "")
+          case 3 => labelInfo ::: List("", "")
+          case 4 => labelInfo ::: List("")
+        }
+      assert(newLabelInfo.length % 5 == 0)
+      val targetToLabelMap: Map[String, SentimentLabel.Type] = getTargetToLabelMap(newLabelInfo)
+      (tid, uname, Right(targetToLabelMap), tweet)
+    }).iterator
+  }
+}

src/main/scala/updown/preproc/impl/PreprocPangLeePolarityCorpus.scala

+package updown.preproc.impl
+
+import updown.data.SentimentLabel
+import java.io.File
+import updown.preproc.GenericPreprocessor
+
+/**
+ * This preprocessor is suitable for any directory that contains files which should each be mapped to one instance
+ * whose polarity is signified by the label given to the directory in the inputOption
+ */
+object PreprocPangLeePolarityCorpus extends GenericPreprocessor {
+
+  /**
+   * Yields one instance per file in the directory `fileName`, each labeled with
+   * `polarity`. Pipe characters are stripped from the text because '|' is the
+   * output field separator.
+   */
+  def getInstanceIterator(fileName: String, polarity: String): Iterator[(String, String, Either[SentimentLabel.Type, Map[String, SentimentLabel.Type]], String)] = {
+    try {
+      val dir = new File(fileName)
+      assert(dir.isDirectory)
+      (for (file: File <- dir.listFiles()) yield
+        (file.getName,
+          "reviewer",
+          Left(SentimentLabel.figureItOut(polarity)),
+          scala.io.Source.fromFile(file, "ISO-8859-1").getLines().mkString(" ").replace("|", "")
+          )
+        ).iterator
+    } catch {
+      case e: MatchError =>
+        // Parenthesized so format() applies to the whole message: previously it
+        // bound only to the second literal, leaving the first '%s' unfilled and
+        // printing the polarity where the file name belonged.
+        logger.error(("Couldn't figure out what sentiment '%s' is supposed to be." +
+          " Try using 'pos', 'neg', or 'neu'. Skipping %s...").format(polarity, fileName))
+        Iterator[(String, String, Either[SentimentLabel.Type, Map[String, SentimentLabel.Type]], String)]()
+    }
+  }
+}

src/main/scala/updown/preproc/impl/PreprocPangLeeSentenceCorpus.scala

+package updown.preproc.impl
+
+import updown.data.SentimentLabel
+import updown.preproc.GenericPreprocessor
+
+/**
+ * This preprocessor is suitable for any file that contains one instance per line with no labels or ids
+ */
+object PreprocPangLeeSentenceCorpus extends GenericPreprocessor {
+
+  def getInstanceIterator(fileName: String, polarity: String): Iterator[(String, String, Either[SentimentLabel.Type, Map[String, SentimentLabel.Type]], String)] = {
+    try {
+      // Resolve the label eagerly: the for/yield below builds a *lazy*
+      // iterator, so a MatchError raised inside it during iteration would
+      // escape this try block entirely. Hoisting also avoids recomputing the
+      // label once per line.
+      val label = SentimentLabel.figureItOut(polarity)
+      for (line <- scala.io.Source.fromFile(fileName, "ISO-8859-1").getLines) yield {
+        // '|' is the output field separator, so strip it from the text.
+        ("", "reviewer", Left(label), line.replace("|", ""))
+      }
+    } catch {
+      case e: MatchError =>
+        // Parenthesized so format() covers the whole concatenated message.
+        logger.error(("Couldn't figure out what sentiment '%s' is supposed to be." +
+          " Try using 'pos', 'neg', or 'neu'. Skipping %s...").format(polarity, fileName))
+        Iterator[(String, String, Either[SentimentLabel.Type, Map[String, SentimentLabel.Type]], String)]()
+    }
+  }
+}

src/main/scala/updown/preproc/impl/PreprocShammaTweets.scala

+package updown.preproc.impl
+
+import updown.data.SentimentLabel
+import updown.preproc.GenericPreprocessor
+
+/**
+ * Preprocessor for Shamma's Obama-McCain debate tweet corpus: tab-separated
+ * lines containing a numeric tweet id, the tweet text, the user name, and one
+ * numeric rating per annotator in the trailing columns.
+ */
+object PreprocShammaTweets extends GenericPreprocessor {
+
+  // Capture groups (per the destructuring below): id, tweet text, user name,
+  // raw ratings columns.
+  val lineRE = """^(\d+)\t[^\t]+\t([^\t]+)\t([^\t]+)\t[^\t]*\t(.*)$""".r
+  // Each single digit found in the ratings columns is one annotator's rating.
+  val ratingRE = """\d""".r
+
+  // Rating codes used by the Shamma annotators.
+  val SHAMMA_POS = "2"
+  val SHAMMA_NEG = "1"
+  val SHAMMA_MIXED = "3"
+  val SHAMMA_OTHER = "4"
+
+  override val defaultPipeline = "basicTokenize|addBigrams|removeStopwords"
+
+
+  /**
+   * Collapses per-annotator ratings into a single label by strict majority
+   * vote. Returns Abstained when fewer than 3 annotators rated the tweet or
+   * when no strict (>50%) positive/negative majority exists.
+   */
+  def getSingleRating(ratings: List[String]): SentimentLabel.Type = {
+    // we only consider tweets that were evaluated by 3 or more annotators
+    if (ratings.length >= 3) {
+      val fracPos = ratings.count(_ == SHAMMA_POS).toFloat / ratings.length
+      val fracNeg = ratings.count(_ == SHAMMA_NEG).toFloat / ratings.length
+
+      //only consider non-tie classifications
+      if (fracPos > .5 && fracPos > fracNeg) {
+        SentimentLabel.Positive
+      } else if (fracNeg > .5 && fracNeg > fracPos) {
+        SentimentLabel.Negative
+      } else {
+        SentimentLabel.Abstained
+      }
+    } else {
+      SentimentLabel.Abstained
+    }
+  }
+
+  def getInstanceIterator(fileName: String, polarity: String): Iterator[(String, String, Either[SentimentLabel.Type, Map[String, SentimentLabel.Type]], String)] = {
+    for (line <- scala.io.Source.fromFile(fileName, "UTF-8").getLines) yield {
+      val roughTokens = line.split("\t")
+
+      // Rough filter: skip comments, short rows, and rows that do not begin
+      // with a numeric tweet id.
+      if (!line.startsWith("#") && roughTokens.length >= 8 && line.length > 0 && Character.isDigit(line(0))) {
+        // NOTE(review): a line passing the rough filter but not matching
+        // lineRE raises a MatchError here — confirm inputs are well-formed.
+        val lineRE(id, tweet, username, ratingsRaw) = line
+
+        val label = getSingleRating(ratingRE.findAllIn(ratingsRaw).toList)
+        logger.debug("id:%s label:%s".format(id, SentimentLabel.toEnglishName(label)))
+        (id, username, Left(label), tweet)
+      } else {
+        // Placeholder instance for non-data lines; presumably filtered out
+        // downstream via its Abstained label — TODO confirm GenericPreprocessor
+        // drops Abstained instances.
+        ("","",Left(SentimentLabel.Abstained),"")
+      }
+    }
+  }
+}

src/main/scala/updown/preproc/impl/PreprocStanfordTweets.scala

+package updown.preproc.impl
+
+import updown.util._
+
+import org.clapper.argot._
+import updown.data.SentimentLabel
+import updown.preproc.model.TweetParse
+import updown.preproc.GenericPreprocessor
+
+/**
+ * Preprocessor for the Stanford sentiment tweet corpus: fields separated by
+ * ";;", with sentiment encoded as 0 (negative), 2 (neutral), or 4 (positive).
+ */
+object PreprocStanfordTweets extends GenericPreprocessor {
+  //IDEA will try to remove this import, but it is not unused. Make sure it stays here.
+  // See http://devnet.jetbrains.com/message/5301770;jsessionid=5C12AD4FD62857DAD611E8EEED52DF6A
+
+  import ArgotConverters._
+
+  // Capture groups: sentiment code, tweet id, user name, tweet text.
+  val lineRE = """^(\d+);;(\d+);;[^;]+;;[^;]+;;([^;]+);;(.*)$""".r
+
+  // Stanford corpus sentiment codes.
+  val STAN_POS = "4"
+  val STAN_NEU = "2"
+  val STAN_NEG = "0"
+
+  override val defaultPipeline = "basicTokenize|addBigrams|removeStopwords"
+
+  def getInstanceIterator(fileName: String, polarity: String): Iterator[(String, String, Either[SentimentLabel.Type, Map[String, SentimentLabel.Type]], String)] = {
+    for (line <- scala.io.Source.fromFile(fileName, "UTF-8").getLines) yield {
+      val lineRE(sentimentRaw, id, username, tweet) = line
+      val label = sentimentRaw match {
+        case STAN_POS => SentimentLabel.Positive
+        case STAN_NEU => SentimentLabel.Neutral
+        case STAN_NEG => SentimentLabel.Negative
+        // Previously non-exhaustive: an unrecognized code aborted the whole
+        // run with a MatchError mid-iteration. Abstain instead, matching the
+        // convention used by PreprocShammaTweets.
+        case _ => SentimentLabel.Abstained
+      }
+      logger.debug("id:%s label:%s".format(id, label))
+      (id, username, Left(label), tweet)
+    }
+  }
+}

src/main/scala/updown/preproc/impl/PreprocTSVFiles.scala

+package updown.preproc.impl
+
+import updown.data.SentimentLabel
+import updown.preproc.GenericPreprocessor
+
+/**
+ * This preprocessor is suitable for any file that contains one instance per line with no labels or ids
+ */
+object PreprocTSVFiles extends GenericPreprocessor {
+
+  // Capture groups: per-line id, per-line label (ignored for polarity), text.
+  val lineRegex = "(\\S*)\\s*(\\S*)\\s*(.*)".r
+
+  def getInstanceIterator(fileName: String, polarity: String): Iterator[(String, String, Either[SentimentLabel.Type, Map[String, SentimentLabel.Type]], String)] = {
+    try {
+      // Resolve the label eagerly: the for/yield below is a *lazy* iterator,
+      // so a MatchError thrown while iterating would escape this try block.
+      // Hoisting also avoids recomputing the label once per line.
+      val filePolarity = SentimentLabel.figureItOut(polarity)
+      for (line <- scala.io.Source.fromFile(fileName, "UTF-8").getLines) yield {
+        val lineRegex(id, label, text) = line
+        logger.debug("id:%s label:%s opol:%s pol:%s".format(id, label, polarity, filePolarity))
+        // '|' is the output field separator, so strip it from the text.
+        (id, "reviewer", Left(filePolarity), text.replace("|", ""))
+      }
+    } catch {
+      case e: MatchError =>
+        // Parenthesized so format() covers the whole concatenated message.
+        logger.error(("Couldn't figure out what sentiment '%s' is supposed to be." +
+          " Try using 'pos', 'neg', or 'neu'. Skipping %s...").format(polarity, fileName))
+        Iterator[(String, String, Either[SentimentLabel.Type, Map[String, SentimentLabel.Type]], String)]()
+    }
+  }
+}

src/main/scala/updown/preproc/impl/PreprocTSVFilesCat.scala

+package updown.preproc.impl
+
+import updown.data.SentimentLabel
+import io.BufferedSource
+import org.clapper.argot.ArgotConverters
+import ArgotConverters._
+import updown.preproc.GenericPreprocessor
+
+/**
+ * This preprocessor is suitable for any file that contains one instance per line with no labels or ids. This variation will concatenate all lines in each file and create just one instance per file.
+ */
+object PreprocTSVFilesCat extends GenericPreprocessor {
+  val minDocSizeOption = parser.option[Int](List("minDocSize"), "INT", "concatenate inputs until the docsize (in characters) reaches INT")
+
+  // Capture groups: per-line id, per-line label (both ignored), text.
+  val lineRegex = "(\\S*)\\s*(\\S*)\\s*(.*)".r
+
+  def getInstanceIterator(fileName: String, polarity: String): Iterator[(String, String, Either[SentimentLabel.Type, Map[String, SentimentLabel.Type]], String)] = {
+    try {
+      val minDocSize = minDocSizeOption.value.getOrElse(20000)
+      // Resolve once; also keeps any MatchError inside this (strict) try block.
+      val filePolarity = SentimentLabel.figureItOut(polarity)
+      var totalLength = 0
+      // StringBuilder instead of repeated String += (which is O(n^2) over a
+      // large file).
+      val chunk = new StringBuilder
+      var result = List[(String, String, Either[SentimentLabel.Type, Map[String, SentimentLabel.Type]], String)]()
+      val source = scala.io.Source.fromFile(fileName, "UTF-8").getLines
+      for ((line, index) <- source.zipWithIndex) {
+
+        val lineRegex(id, label, text) = line
+        // '|' is the output field separator, so strip it from the text.
+        chunk.append(text.replace("|", ""))
+        if (chunk.length > minDocSize) {
+          result = (fileName + index, "reviewer", Left(filePolarity), chunk.toString) :: result
+          logger.info("processed %d inputs.".format(index))
+          totalLength += chunk.length
+          chunk.clear()
+        }
+      }
+      if (chunk.length > 0) {
+        // Emit whatever is left over as a final, possibly short, instance.
+        result = (fileName + "Remainder", "reviewer", Left(filePolarity), chunk.toString) :: result
+        totalLength += chunk.length
+      }
+      if (result.nonEmpty) // avoid logging NaN for an empty input
+        logger.info("average length: %f".format(totalLength.toFloat / result.length))
+      result.iterator
+    } catch {
+      case e: MatchError =>
+        // Parenthesized so format() covers the whole concatenated message.
+        logger.error(("Couldn't figure out what sentiment '%s' is supposed to be." +
+          " Try using 'pos', 'neg', or 'neu'. Skipping %s...").format(polarity, fileName))
+        Iterator[(String, String, Either[SentimentLabel.Type, Map[String, SentimentLabel.Type]], String)]()
+    }
+  }
+}

src/main/scala/updown/util/BasicTokenizer.scala

   def apply(s: String): List[String] = tokenize(s)
 
   def tokenize(s: String): List[String] = {
-    s.split(" ").map(StringUtil.preprocessKeepHash(_)).filter(_.length > 0).toList
+    // Split on *runs of whitespace*: "\\s+". The pattern "[\\s+]" introduced
+    // here is a character class matching any whitespace char OR a literal '+',
+    // which wrongly splits tokens containing plus signs.
+    s.split("\\s+").map(StringUtil.preprocessKeepHash(_)).filter(_.length > 0).toList
   }
 }

src/main/scala/updown/util/TokenizationPipes.scala

     else
       Twokenize(s)).flatten
 
+  // Applies BasicTokenizer to every input string and flattens the results
+  // into one token list (pipeline stage name: "basicTokenize").
+  val basicTokenize: (List[String]) => List[String] =
+    (ss) => ss.map((s)=>BasicTokenizer(s)).flatten
+
   val toLowercase: (List[String]) => List[String] =
     (ss) => ss.map((s) => s.toLowerCase)
 

src/main/scala/updown/util/WordleUtils.scala

 
 
 import com.weiglewilczek.slf4s.Logging
-import java.io.{BufferedWriter, File}
+import java.io.{IOException, BufferedWriter, File}
 
 object WordleUtils extends Logging {
 
       if (sourceFile.isFile) {
         val sourcePath = sourceFile.getAbsolutePath
         val destPath = sourcePath + ".png"
-        if (htmlOutputWriter.isDefined){
+        if (htmlOutputWriter.isDefined) {
           htmlOutputWriter.get.write("<div class=wordle><span class=name>%s</span><img src=\"%s\"/></div>".format(sourceFile.getName, destPath))
         }
 
         logger.debug(sourceFile.getAbsolutePath() + " exists.");
         val command = makeWordlesCommand(jarPath, configPath, sourcePath, destPath);
         logger.debug("Spawning: " + command);
-        children = Runtime.getRuntime().exec(command) :: children
+        try {
+          children = Runtime.getRuntime().exec(command) :: children
+        } catch {
+          case s: IOException => logger.error("couldn't launch wordle program. Probably, out of memory.")
+        }
       } else {
         logger.error("%s is not a file".format(topicFile))
       }