vvcephei committed dd4e01d

implemented a new preprocessor that guarantees a minimum length for train/test instances
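
A plausible invocation of one of the new preprocessors, assuming the updown runner script named in the parser's usage string (the input path here is made up; --minDocSize falls back to 20000 characters when omitted):

    updown run updown.preproc.PreprocTSVFilesCat --input reviews.tsv --textPipeline "lowerCase|splitSpace" --minDocSize 20000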


Files changed (4)

 JARS=`echo $UPDOWN_DIR/lib/*.jar $UPDOWN_DIR/lib_managed/*/*.jar $UPDOWN_DIR/output/*.jar $UPDOWN_DIR/target/*.jar | tr ' ' ':'`
 SCALA_LIB="$HOME/.sbt/boot/scala-2.9.1/lib/scala-library.jar"
 
-CP="$UPDOWN_DIR/target/classes:$SCALA_LIB:$JARS:$CLASSPATH"
+CP="$UPDOWN_DIR/target/classes:$SCALA_LIB:$JARS:$CLASSPATH:$UPDOWN_DIR/config"
 
 FIRSTARG="$1"
 

src/main/scala/updown/preproc/GenericPreprocessor.scala

 abstract class GenericPreprocessor extends Logging {
   // this is here to make ArgotConverters appear used to IDEA.
   convertString _
+  var pipeStages: Map[String, (List[String]) => List[String]] =
+    Map[String, (List[String]) => List[String]](
+      ("lowerCase" -> TokenizationPipes.toLowercase),
+      ("addBiGrams" -> TokenizationPipes.addNGrams(2)),
+      ("twokenize" -> TokenizationPipes.twokenize),
+      ("twokenizeSkipGtOneGrams" -> TokenizationPipes.twokenizeSkipGtOneGrams),
+      ("filterAlpha" -> TokenizationPipes.filterOnRegex("\\p{Alpha}+")),
+      ("filterAlphaQuote" -> TokenizationPipes.filterOnRegex("(\\p{Alpha}|')+")),
+      ("splitSpace" -> TokenizationPipes.splitOnDelimiter(" "))
+    )
+  val parser = new ArgotParser("updown run updown.preproc.PreprocStanfordTweets", preUsage = Some("Updown"))
+  val inputFile = parser.option[String](List("i", "input"), "input", "path to stanford data file")
+  val stopListFile = parser.option[String](List("s", "stoplist"), "stoplist", "path to stoplist file")
+  val startId = parser.option[Int](List("start-id"), "ID", "id at which to start numbering lines")
+  val textPipeline = parser.option[String](List("textPipeline"), "PIPELINE",
+    ("specify the desired pipe stages separated by |: \"addBiGrams|twokenize\". " +
+      "Available options are in %s.").format(pipeStages.keySet))
 
   def getInstanceIterator(fileName: String, polarity: String): Iterator[(String, String, SentimentLabel.Type, String)]
 
   def main(args: Array[String]) {
     logger.debug(args.toList.toString)
-    // don't forget that this is linked to the pipeStages dict below
-    val availablePipes = Set("lowerCase", "addBiGrams", "twokenize", "twokenizeSkipGtOneGrams", "removeStopwords", "splitSpace", "filterAlpha", "filterAlphaQuote")
 
     // PARSE ARGS
-    val parser = new ArgotParser("updown run updown.preproc.PreprocStanfordTweets", preUsage = Some("Updown"))
-    val inputFile = parser.option[String](List("i", "input"), "input", "path to stanford data file")
-    val stopListFile = parser.option[String](List("s", "stoplist"), "stoplist", "path to stoplist file")
-    val startId = parser.option[Int](List("start-id"), "ID", "id at which to start numbering lines")
-    val textPipeline = parser.option[String](List("textPipeline"), "PIPELINE",
-      ("specify the desired pipe stages seperated by |: \"addBiGrams|twokenize\". " +
-        "Available options are in %s.").format(availablePipes))
+
     try {
       parser.parse(args)
 
             scala.io.Source.fromFile(fileName).getLines.toSet
           case None => Set("a", "the", ".")
         }
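+      // the stopword pipe depends on the parsed stop set, so it is added to
+      // the predefined stages here rather than in the map above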
+      val tokpipe: (String, List[String] => List[String]) = ("removeStopwords", TokenizationPipes.filterOnStopset(stopSet))
+      pipeStages = pipeStages + tokpipe
 
 
-      val pipeStages: Map[String, (List[String]) => List[String]] =
-        Map[String, (List[String]) => List[String]](
-          ("lowerCase" -> TokenizationPipes.toLowercase),
-          ("addBiGrams" -> TokenizationPipes.addNGrams(2)),
-          ("twokenize" -> TokenizationPipes.twokenize),
-          ("twokenizeSkipGtOneGrams" -> TokenizationPipes.twokenizeSkipGtOneGrams),
-          ("removeStopwords" -> TokenizationPipes.filterOnStopset(stopSet)),
-          ("filterAlpha") -> TokenizationPipes.filterOnRegex("\\p{Alpha}+"),
-          ("filterAlphaQuote") -> TokenizationPipes.filterOnRegex("(\\p{Alpha}|')+"),
-          ("splitSpace" -> TokenizationPipes.splitOnDelimiter(" "))
-        )
-      // had to predefine the available pipes so they could be printed in the usage string, before the stopset can be parsed.
-      assert(pipeStages.keySet == availablePipes)
-
       logger.debug("Pipeline option: %s".format(textPipeline.value))
       val pipeline: List[(List[String]) => List[String]] =
         if (textPipeline.value.isDefined) {

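Each registered pipe stage is a plain (List[String]) => List[String] function, so a parsed pipeline is applied by threading the token list through the stages in order. The application site falls outside this hunk, but a minimal sketch of that step (runPipeline is a hypothetical name):

    def runPipeline(text: String, pipeline: List[(List[String]) => List[String]]): List[String] =
      pipeline.foldLeft(List(text))((tokens, stage) => stage(tokens))
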
src/main/scala/updown/preproc/PreprocFlatFilesCat.scala

+package updown.preproc
+
+import updown.data.SentimentLabel
+import org.clapper.argot.ArgotConverters._
+/**
+ * This preprocessor is suitable for any file that contains one instance per line with no labels or ids.
+ * This variation concatenates consecutive lines until a chunk reaches minDocSize characters, emitting one
+ * instance per chunk, so every instance (except a possible short remainder) has a guaranteed minimum length.
+ */
+object PreprocFlatFilesCat extends GenericPreprocessor {
+  val minDocSizeOption = parser.option[Int](List("minDocSize"), "INT", "concatenate inputs until the document size (in characters) reaches INT")
+
+  def getInstanceIterator(fileName: String, polarity: String): Iterator[(String, String, SentimentLabel.Type, String)] = {
+    try {
+      val minDocSize = minDocSizeOption.value.getOrElse(20000)
+      var totalLength = 0
+      var fileLines = ""
+      var result = List[(String, String, SentimentLabel.Type, String)]()
+      val source = scala.io.Source.fromFile(fileName, "UTF-8").getLines
+      for ((line, index) <- source.zipWithIndex) {
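+        // drop '|' characters from the text (presumably reserved as a field delimiter in the output format)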
+        fileLines += line.replace("|", "")
+        if (fileLines.length > minDocSize) {
+          result = (fileName + index, "reviewer", SentimentLabel.figureItOut(polarity), fileLines) :: result
+          logger.info("processed %d inputs.".format(index))
+          totalLength += fileLines.length
+          fileLines = ""
+        }
+      }
+      if (fileLines != "") {
+        result = (fileName + "Remainder", "reviewer", SentimentLabel.figureItOut(polarity), fileLines) :: result
+        totalLength += fileLines.length
+      }
+      logger.info("average length: %f".format(totalLength.toFloat / result.length))
+      result.iterator
+    } catch {
+      case e: MatchError =>
+        logger.error(("Couldn't figure out what sentiment '%s' is supposed to be." +
+          " Try using 'pos', 'neg', or 'neu'. Skipping %s...").format(polarity, fileName))
+        Iterator[(String, String, SentimentLabel.Type, String)]()
+    }
+  }
+}

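The chunking loop above is what enforces the minimum instance length promised in the commit message: lines accumulate in a buffer, an instance is emitted as soon as the buffer exceeds minDocSize characters, and any short tail becomes a final "Remainder" instance. The same logic, distilled into a standalone sketch (names hypothetical):

    def chunk(lines: Iterator[String], minDocSize: Int): List[String] = {
      var buf = ""
      var out = List[String]()
      for (line <- lines) {
        buf += line
        if (buf.length > minDocSize) {
          out = buf :: out // every chunk emitted here exceeds minDocSize
          buf = ""
        }
      }
      if (buf != "") out = buf :: out // only the remainder may fall short
      out.reverse
    }
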
src/main/scala/updown/preproc/PreprocTSVFilesCat.scala

+package updown.preproc
+
+import updown.data.SentimentLabel
+import org.clapper.argot.ArgotConverters._
+
+/**
+ * This preprocessor is suitable for files with one instance per line in "id label text" form (whitespace
+ * separated). This variation concatenates the text fields of consecutive lines until a chunk reaches
+ * minDocSize characters, so every instance (except a possible short remainder) has a guaranteed minimum length.
+ */
+object PreprocTSVFilesCat extends GenericPreprocessor {
+  val minDocSizeOption = parser.option[Int](List("minDocSize"), "INT", "concatenate inputs until the document size (in characters) reaches INT")
+
+  val lineRegex = "(\\S*)\\s*(\\S*)\\s*(.*)".r
+
+  def getInstanceIterator(fileName: String, polarity: String): Iterator[(String, String, SentimentLabel.Type, String)] = {
+    try {
+      val minDocSize = minDocSizeOption.value.getOrElse(20000)
+      var totalLength = 0
+      var fileLines = ""
+      var result = List[(String, String, SentimentLabel.Type, String)]()
+      val source = scala.io.Source.fromFile(fileName, "UTF-8").getLines
+      for ((line, index) <- source.zipWithIndex) {
+
+        val lineRegex(id, label, text) = line
+        //        logger.debug("id:%s label:%s opol:%s pol:%s".format(id, label, polarity, SentimentLabel.figureItOut(polarity)))
+        fileLines += text.replace("|", "")
+        if (fileLines.length > minDocSize) {
+          result = (fileName + index, "reviewer", SentimentLabel.figureItOut(polarity), fileLines) :: result
+          logger.info("processed %d inputs.".format(index))
+          totalLength += fileLines.length
+          fileLines = ""
+        }
+      }
+      if (fileLines != "") {
+        result = (fileName + "Remainder", "reviewer", SentimentLabel.figureItOut(polarity), fileLines) :: result
+        totalLength += fileLines.length
+      }
+      logger.info("average length: %f".format(totalLength.toFloat / result.length))
+      result.iterator
+    } catch {
+      case e: MatchError =>
+        logger.error(("Couldn't figure out what sentiment '%s' is supposed to be." +
+          " Try using 'pos', 'neg', or 'neu'. Skipping %s...").format(polarity, fileName))
+        Iterator[(String, String, SentimentLabel.Type, String)]()
+    }
+  }
+}
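
Unlike the flat-file variant, the TSV variant peels an id and a label off each line before concatenating the text. A quick check of the extraction pattern (the sample line is made up):

    val lineRegex = "(\\S*)\\s*(\\S*)\\s*(.*)".r
    val lineRegex(id, label, text) = "42\tpos\tgreat fun , would read again"
    // id == "42", label == "pos", text == "great fun , would read again"
    // A line that fails to match throws a MatchError, which the catch block
    // above reports and then skips the rest of the file.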