Commits

vvcephei committed fe0e609

adding preprocessor for reviews and also sandboxing a new paradigm for tokenization

Files changed (3)

lib/mallet.jar

Binary file added.

src/main/scala/updown/preproc/PreprocPangLeeSentenceCorpus.scala

+package updown.preproc
+
+import org.clapper.argot.{ArgotUsageException, ArgotConverters, ArgotParser}
+import ArgotConverters._
+import updown.util.{Twokenize, TokenizationPipes}
+
+object PreprocPangLeeSentenceCorpus {
+  // reference convertString so the ArgotConverters import appears used to IDEA
+  convertString _
+
+  def main(args: Array[String]) {
+    // PARSE ARGS
+    val parser = new ArgotParser("updown run updown.preproc.PreprocPangLeeSentenceCorpus", preUsage = Some("Updown"))
+    val inputFile = parser.option[String](List("i", "input"), "input", "path to Pang-Lee data file")
+    val stopListFile = parser.option[String](List("s", "stoplist"), "stoplist", "path to stoplist file")
+    try {
+      parser.parse(args)
+    }
+    catch {
+      case e: ArgotUsageException =>
+        println(e.message)
+        sys.exit(0)
+    }
+
+    // SET UP IO
+    val inputLines =
+      inputFile.value match {
+        case Some(fileName) => scala.io.Source.fromFile(fileName, "ISO-8859-1").getLines()
+        case None => scala.io.Source.stdin.getLines()
+      }
+
+    val stopSet: Set[String] =
+      stopListFile.value match {
+        case Some(fileName) => scala.io.Source.fromFile(fileName).getLines().toSet
+        case None => Set("a") // trivial default stopset when none is supplied
+      }
+
+    // RUN
+    val tokenizationPipeline: List[(List[String]) => List[String]] = List(
+      TokenizationPipes.toLowercase,
+      // "\s+" rather than "\s", so runs of whitespace don't yield empty tokens
+      TokenizationPipes.splitOnDelimiter("\\s+"),
+//      TokenizationPipes.addNGrams(2),
+      TokenizationPipes.filterOnStopset(stopSet)
+    )
+
+    for (line <- inputLines) {
+      // thread the raw line through each pipeline stage in turn
+      val resultLine = tokenizationPipeline.foldLeft(List(line))((ss, f) => f(ss))
+      println(line) // original line, then both tokenizations for comparison
+      println("-> " + resultLine.mkString("\t"))
+      println("=> " + Twokenize(line).mkString("\t"))
+    }
+  }
+}
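
A side note on the tokenization paradigm being sandboxed here: each pipe is a List[String] => List[String], so a whole pipeline is just a left fold over its stages. The sketch below mirrors the pipeline in PreprocPangLeeSentenceCorpus; the PipelineSketch object, sample sentence, stopset, and expected output are all invented for illustration.

    import updown.util.TokenizationPipes

    object PipelineSketch {
      def main(args: Array[String]) {
        // toy pipeline mirroring the one above, with an invented stopset
        val pipeline: List[(List[String]) => List[String]] = List(
          TokenizationPipes.toLowercase,
          TokenizationPipes.splitOnDelimiter("\\s+"),
          TokenizationPipes.filterOnStopset(Set("a"))
        )
        // fold the raw line through every stage in order
        val tokens = pipeline.foldLeft(List("This is a Test"))((ss, f) => f(ss))
        println(tokens) // prints: List(this, is, test)
      }
    }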

src/main/scala/updown/util/TokenizationPipes.scala

+package updown.util
+
+object TokenizationPipes {
+  val toLowercase: (List[String]) => List[String] =
+    (ss) => ss.map((s) => s.toLowerCase)
+
+  val splitOnDelimiter: (String) => (List[String]) => List[String] =
+    (d) =>
+      (ss) => ss.flatMap((s) => s.split(d).toList)
+
+  val filterOnStopset: (Set[String]) => (List[String]) => List[String] =
+    (stopSet) =>
+      (ss) => ss.filter((s) => !stopSet.contains(s))
+
+  /*
+    A really diligent implementation would put (n-1) "$"s at the beginning and end of the
+    list, but I kind of doubt that's what we really want, so I'm not going to bother right now.
+  */
+  val nGrams: (Int) => (List[String]) => List[String] =
+    (n) =>
+      (ss) => ("$" :: ss ::: ("$" :: Nil)).sliding(n).map(innerList => innerList.mkString(" ")).toList
+
+  val addNGrams: (Int) => (List[String]) => List[String] =
+    (n) =>
+      (ss) => ss ::: nGrams(n)(ss)
+}
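
And a similarly hedged sketch of what the n-gram pipes produce; the NGramSketch object, input tokens, and the outputs shown in comments are invented for illustration.

    import updown.util.TokenizationPipes

    object NGramSketch {
      def main(args: Array[String]) {
        val tokens = List("the", "quick", "fox")
        // bigrams over the "$"-padded token list
        println(TokenizationPipes.nGrams(2)(tokens))
        // prints: List($ the, the quick, quick fox, fox $)
        // unigrams followed by their bigrams
        println(TokenizationPipes.addNGrams(2)(tokens))
        // prints: List(the, quick, fox, $ the, the quick, quick fox, fox $)
      }
    }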