Commits

vvcephei committed e7ca7b2

working on review preproc

  • Parent commits fe0e609

Files changed (1)

File src/main/scala/updown/preproc/PreprocPangLeeSentenceCorpus.scala

 
 import org.clapper.argot.{ArgotUsageException, ArgotConverters, ArgotParser}
 import ArgotConverters._
-import updown.util.{Twokenize, TokenizationPipes}
+import updown.util.Twokenize
 
 object PreprocPangLeeSentenceCorpus {
   // this import reference is here so that IDEA considers ArgotConverters used.
     }
 
     // SET UP IO
-    val inputLines =
+    val inputLines: Iterator[String] =
       inputFile.value match {
-        case Some(fileName) => scala.io.Source.fromFile(fileName, "ISO-8859-1").getLines()
+        case Some(fileNameList) =>
+          // The option value is a comma-separated list of "fileName->polarity" pairs.
+          val fileNameMap: Map[String, String] =
+            fileNameList.split("\\s*,\\s*").map { pair =>
+              val plist = pair.split("\\s*->\\s*")
+              (plist(0), plist(1))
+            }.toMap
+
+          // Prefix every line with its file's polarity, stripping '|' from the
+          // text so it cannot collide with the field delimiter used below.
+          fileNameMap.iterator.flatMap { case (name, polarity) =>
+            scala.io.Source.fromFile(name, "ISO-8859-1").getLines().map(line =>
+              "%s|%s".format(polarity, line.replace("|", "")))
+          }
+
         case None => scala.io.Source.stdin.getLines()
       }
 
     val stopSet: Set[String] =
       stopListFile.value match {
-        case Some(fileName) => scala.io.Source.fromFile(fileName).getLines.toSet
-        case None => Set("a")
+        case Some(fileName) =>
+          scala.io.Source.fromFile(fileName).getLines.toSet
+        case None => Set("a", "the", ".")
       }
 
     // RUN
-    val tokenizationPipeline: List[(List[String]) => List[String]] = List(
-      TokenizationPipes.toLowercase,
-      TokenizationPipes.splitOnDelimiter("\\s"),
-//      TokenizationPipes.addNGrams(2),
-      TokenizationPipes.filterOnStopset(stopSet)
-    )
+    // x numbers the output records; it becomes the first (id) field.
+    var x = 0
+    for (line <- inputLines) {
+      // Strip the field delimiter from the text before tokenizing: whatever
+      // tokenizer you use, you must remove any character you later intend to
+      // use as a token or field delimiter.
 
-    for (line <- inputLines) {
-      var resultLine = List(line)
-      for (f <- tokenizationPipeline) {
-        resultLine = f(resultLine)
+      line.split('|') match {
+        case Array(polarity, text) =>
+          println("%s|%s|%s|%s".format(
+            x,
+            "",
+            Twokenize(text.replaceAll(",", "")).toList
+              .filter(s => !stopSet.contains(s))
+              .mkString(","),
+            polarity))
+          x += 1
+        case _ => // skip lines that are not in "polarity|text" form
       }
-      println(line)
-      println("-> " + resultLine.mkString("\t"))
-      println("=> " + Twokenize(line).mkString("\t"))
     }
   }
 }
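
For reference, below is a minimal self-contained sketch (not part of this commit) of the preprocessing flow the new loop implements. It reproduces the polarity labeling and the id|label|tokens|polarity output using in-memory lines instead of files; toyTokenize is a hypothetical stand-in for updown.util.Twokenize, and the review snippets are made up.

object PreprocSketch extends App {
  // Hypothetical stand-in for updown.util.Twokenize: plain whitespace split.
  def toyTokenize(text: String): List[String] =
    text.split("\\s+").toList.filter(_.nonEmpty)

  // In-memory stand-in for the "fileName->polarity" inputs the patch reads.
  val corpora: Map[String, List[String]] = Map(
    "POS" -> List("a great , great film ."),
    "NEG" -> List("the plot | was thin ."))

  val stopSet = Set("a", "the", ".")

  // Label each line with its polarity, stripping '|' so it cannot collide
  // with the field delimiter (the same trick the patch uses).
  val inputLines: Iterator[String] = corpora.iterator.flatMap {
    case (polarity, lines) =>
      lines.iterator.map(l => "%s|%s".format(polarity, l.replace("|", "")))
  }

  var x = 0
  for (line <- inputLines) {
    line.split('|') match {
      case Array(polarity, text) =>
        println("%s|%s|%s|%s".format(
          x,
          "",
          toyTokenize(text.replaceAll(",", ""))
            .filter(s => !stopSet.contains(s)).mkString(","),
          polarity))
        x += 1
      case _ => // skip lines not in "polarity|text" form
    }
  }
}

Running the sketch prints 0||great,great,film|POS and 1||plot,was,thin|NEG, which shows why both the '|' field delimiter and the ',' token delimiter are stripped from the text before they are reused as delimiters.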