Commits

vvcephei committed dd4e01d

implemented a new preprocessor that guarantees a minimum length for train/test instances

  • Parent commit 19a6497

Files changed (4)

 JARS=`echo $UPDOWN_DIR/lib/*.jar $UPDOWN_DIR/lib_managed/*/*.jar $UPDOWN_DIR/output/*.jar $UPDOWN_DIR/target/*.jar | tr ' ' ':'`
 SCALA_LIB="$HOME/.sbt/boot/scala-2.9.1/lib/scala-library.jar"
 
-CP="$UPDOWN_DIR/target/classes:$SCALA_LIB:$JARS:$CLASSPATH"
+CP="$UPDOWN_DIR/target/classes:$SCALA_LIB:$JARS:$CLASSPATH:$UPDOWN_DIR/config"
 
 FIRSTARG="$1"
 

File src/main/scala/updown/preproc/GenericPreprocessor.scala

 abstract class GenericPreprocessor extends Logging {
   // this is here to make ArgotConverters appear used to IDEA.
   convertString _
+  var pipeStages: Map[String, (List[String]) => List[String]] =
+    Map[String, (List[String]) => List[String]](
+      ("lowerCase" -> TokenizationPipes.toLowercase),
+      ("addBiGrams" -> TokenizationPipes.addNGrams(2)),
+      ("twokenize" -> TokenizationPipes.twokenize),
+      ("twokenizeSkipGtOneGrams" -> TokenizationPipes.twokenizeSkipGtOneGrams),
+      ("filterAlpha") -> TokenizationPipes.filterOnRegex("\\p{Alpha}+"),
+      ("filterAlphaQuote") -> TokenizationPipes.filterOnRegex("(\\p{Alpha}|')+"),
+      ("splitSpace" -> TokenizationPipes.splitOnDelimiter(" "))
+    )
+  val parser = new ArgotParser("updown run updown.preproc.GenericPreprocessor", preUsage = Some("Updown"))
+  val inputFile = parser.option[String](List("i", "input"), "input", "path to input data file")
+  val stopListFile = parser.option[String](List("s", "stoplist"), "stoplist", "path to stoplist file")
+  val startId = parser.option[Int](List("start-id"), "ID", "id at which to start numbering lines")
+  val textPipeline = parser.option[String](List("textPipeline"), "PIPELINE",
+    ("specify the desired pipe stages seperated by |: \"addBiGrams|twokenize\". " +
+      "Available options are in %s.").format(pipeStages.keySet))
 
   def getInstanceIterator(fileName: String, polarity: String): Iterator[(String, String, SentimentLabel.Type, String)]
 
   def main(args: Array[String]) {
     logger.debug(args.toList.toString)
-    // don't forget that this is linked to the pipeStages dict below
-    val availablePipes = Set("lowerCase", "addBiGrams", "twokenize", "twokenizeSkipGtOneGrams", "removeStopwords", "splitSpace", "filterAlpha", "filterAlphaQuote")
 
     // PARSE ARGS
-    val parser = new ArgotParser("updown run updown.preproc.PreprocStanfordTweets", preUsage = Some("Updown"))
-    val inputFile = parser.option[String](List("i", "input"), "input", "path to stanford data file")
-    val stopListFile = parser.option[String](List("s", "stoplist"), "stoplist", "path to stoplist file")
-    val startId = parser.option[Int](List("start-id"), "ID", "id at which to start numbering lines")
-    val textPipeline = parser.option[String](List("textPipeline"), "PIPELINE",
-      ("specify the desired pipe stages seperated by |: \"addBiGrams|twokenize\". " +
-        "Available options are in %s.").format(availablePipes))
+
     try {
       parser.parse(args)
 
             scala.io.Source.fromFile(fileName).getLines.toSet
           case None => Set("a", "the", ".")
         }
+      val tokpipe: (String, List[String] => List[String]) = ("removeStopwords", TokenizationPipes.filterOnStopset(stopSet))
+      pipeStages = pipeStages + tokpipe
 
 
-      val pipeStages: Map[String, (List[String]) => List[String]] =
-        Map[String, (List[String]) => List[String]](
-          ("lowerCase" -> TokenizationPipes.toLowercase),
-          ("addBiGrams" -> TokenizationPipes.addNGrams(2)),
-          ("twokenize" -> TokenizationPipes.twokenize),
-          ("twokenizeSkipGtOneGrams" -> TokenizationPipes.twokenizeSkipGtOneGrams),
-          ("removeStopwords" -> TokenizationPipes.filterOnStopset(stopSet)),
-          ("filterAlpha") -> TokenizationPipes.filterOnRegex("\\p{Alpha}+"),
-          ("filterAlphaQuote") -> TokenizationPipes.filterOnRegex("(\\p{Alpha}|')+"),
-          ("splitSpace" -> TokenizationPipes.splitOnDelimiter(" "))
-        )
-      // had to predefine the available pipes so they could be printed in the usage string, before the stopset can be parsed.
-      assert(pipeStages.keySet == availablePipes)
-
       logger.debug("Pipeline option: %s".format(textPipeline.value))
       val pipeline: List[(List[String]) => List[String]] =
         if (textPipeline.value.isDefined) {

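For orientation, here is how a --textPipeline value such as "lowerCase|splitSpace|addBiGrams" plausibly resolves against pipeStages: split the spec on "|", look each stage name up in the map, and thread the token list through the stages in order. The stage bodies below are illustrative stand-ins, not the real TokenizationPipes implementations:

    object PipelineSketch {
      // Stand-ins for the real TokenizationPipes stages (assumption).
      val pipeStages: Map[String, List[String] => List[String]] = Map(
        "lowerCase"  -> ((ts: List[String]) => ts.map(_.toLowerCase)),
        "splitSpace" -> ((ts: List[String]) => ts.flatMap(_.split(" ").toList)),
        "addBiGrams" -> ((ts: List[String]) => ts ::: ts.sliding(2).map(_.mkString(" ")).toList)
      )

      def main(args: Array[String]) {
        val spec = "lowerCase|splitSpace|addBiGrams"
        // Resolve each named stage, then fold the token list through them in order.
        val pipeline = spec.split("\\|").toList.map(pipeStages)
        val result = pipeline.foldLeft(List("Good Morning Twitter"))((toks, stage) => stage(toks))
        println(result) // List(good, morning, twitter, good morning, morning twitter)
      }
    }
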
File src/main/scala/updown/preproc/PreprocFlatFilesCat.scala

+package updown.preproc
+
+import updown.data.SentimentLabel
+import org.clapper.argot.ArgotConverters._
+/**
+ * This preprocessor is suitable for any file that contains one instance per line with no labels or ids. This variation concatenates the lines in each file until a minimum document size (in characters) is reached, emitting one instance per chunk plus one for any remainder.
+ */
+object PreprocFlatFilesCat extends GenericPreprocessor {
+  val minDocSizeOption = parser.option[Int](List("minDocSize"), "INT", "concatenate input lines until the document size (in characters) exceeds INT")
+
+  def getInstanceIterator(fileName: String, polarity: String): Iterator[(String, String, SentimentLabel.Type, String)] = {
+    try {
+      val minDocSize = minDocSizeOption.value.getOrElse(20000)
+      var totalLength = 0
+      var fileLines = ""
+      var result = List[(String, String, SentimentLabel.Type, String)]()
+      val source = scala.io.Source.fromFile(fileName, "UTF-8").getLines
+      for ((line, index) <- source.zipWithIndex) {
+        fileLines += line.replace("|", "")
+        if (fileLines.length > minDocSize) {
+          result = (fileName + index, "reviewer", SentimentLabel.figureItOut(polarity), fileLines) :: result
+          logger.info("processed %d inputs.".format(index))
+          totalLength += fileLines.length
+          fileLines = ""
+        }
+      }
+      if (fileLines!=""){
+        result = (fileName + "Remainder", "reviewer", SentimentLabel.figureItOut(polarity), fileLines) :: result
+        totalLength += fileLines.length
+      }
+      logger.info("average length: %f".format(totalLength.toFloat / result.length))
+      result.iterator
+    } catch {
+      case e: MatchError =>
+        logger.error("Couldn't figure out what sentiment '%s' is supposed to be." +
+          " Try using 'pos', 'neg', or 'neu'. Skipping %s...".format(polarity, fileName))
+        Iterator[(String, String, SentimentLabel.Type, String)]()
+    }
+  }
+}
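
The accumulate-until-threshold loop above is the heart of this commit. A self-contained sketch of the same rule with a tiny threshold, just to make the chunk boundaries concrete (names and values here are illustrative):

    object ChunkSketch {
      def main(args: Array[String]) {
        val minDocSize = 10
        val lines = List("abcd", "efgh", "ijkl", "mn")
        var buf = ""
        var chunks = List[String]()
        for (line <- lines) {
          buf += line
          if (buf.length > minDocSize) { // same strictly-greater test as the preprocessor
            chunks = buf :: chunks
            buf = ""
          }
        }
        if (buf != "") chunks = buf :: chunks // the remainder becomes its own instance
        println(chunks.reverse) // List(abcdefghijkl, mn)
      }
    }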

File src/main/scala/updown/preproc/PreprocTSVFilesCat.scala

+package updown.preproc
+
+import updown.data.SentimentLabel
+import org.clapper.argot.ArgotConverters._
+
+/**
+ * This preprocessor is suitable for files with one instance per line, where each line begins with an id and a label followed by the text. This variation concatenates the text fields in each file until a minimum document size (in characters) is reached, emitting one instance per chunk plus one for any remainder.
+ */
+object PreprocTSVFilesCat extends GenericPreprocessor {
+  val minDocSizeOption = parser.option[Int](List("minDocSize"), "INT", "concatenate input lines until the document size (in characters) exceeds INT")
+
+  val lineRegex = "(\\S*)\\s*(\\S*)\\s*(.*)".r
+
+  def getInstanceIterator(fileName: String, polarity: String): Iterator[(String, String, SentimentLabel.Type, String)] = {
+    try {
+      val minDocSize = minDocSizeOption.value.getOrElse(20000)
+      var totalLength = 0
+      var fileLines = ""
+      var result = List[(String, String, SentimentLabel.Type, String)]()
+      val source = scala.io.Source.fromFile(fileName, "UTF-8").getLines
+      for ((line, index) <- source.zipWithIndex) {
+
+        val lineRegex(id, label, text) = line
+        //        logger.debug("id:%s label:%s opol:%s pol:%s".format(id, label, polarity, SentimentLabel.figureItOut(polarity)))
+        fileLines += text.replace("|", "")
+        if (fileLines.length > minDocSize) {
+          result = (fileName + index, "reviewer", SentimentLabel.figureItOut(polarity), fileLines) :: result
+          logger.info("processed %d inputs.".format(index))
+          totalLength += fileLines.length
+          fileLines = ""
+        }
+      }
+      if (fileLines!=""){
+        result = (fileName + "Remainder", "reviewer", SentimentLabel.figureItOut(polarity), fileLines) :: result
+        totalLength += fileLines.length
+      }
+      logger.info("average length: %f".format(totalLength.toFloat / result.length))
+      result.iterator
+    } catch {
+      case e: MatchError =>
+        logger.error("Couldn't figure out what sentiment '%s' is supposed to be." +
+          " Try using 'pos', 'neg', or 'neu'. Skipping %s...".format(polarity, fileName))
+        Iterator[(String, String, SentimentLabel.Type, String)]()
+    }
+  }
+}
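
The TSV variant differs from PreprocFlatFilesCat only in peeling a whitespace-delimited id and label off each line before concatenating the text field. A quick check of how lineRegex decomposes a line (the sample values are made up):

    object RegexSketch {
      def main(args: Array[String]) {
        val lineRegex = "(\\S*)\\s*(\\S*)\\s*(.*)".r
        // \S* stops at whitespace, so the first two fields become id and label.
        val lineRegex(id, label, text) = "42\tpos\tgreat movie, would watch again"
        println((id, label, text)) // (42,pos,great movie, would watch again)
      }
    }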