1. Christopher Johnson
  2. polify

Commits

vvcephei  committed e23754a

implementing 10-fold experiments and a new preprocessor

  • Participants
  • Parent commits 8715807
  • Branches default

Comments (0)

Files changed (10)

File src/main/scala/updown/app/JuntoClassifier.scala

View file
  • Ignore whitespace
     PerTweetEvaluator(systemLabeledTweets)
     PerUserEvaluator(systemLabeledTweets)
     if (targetsInputFile.value != None) {
-      val targets = new scala.collection.mutable.HashMap[String, String]
-
-      scala.io.Source.fromFile(targetsInputFile.value.get, "utf-8").getLines.foreach(p => targets.put(p.split("\t")(0).trim, p.split("\t")(1).trim))
+      val targets =
+        (for (line <- scala.io.Source.fromFile(targetsInputFile.value.get, "UTF-8").getLines) yield {
+          val arr = line.split("\\|")
+          (arr(0)->arr(1))
+        }).toMap
       PerTargetEvaluator(systemLabeledTweets, targets)
     }
 
     PerTweetEvaluator.apply(systemLabeledTestTweets)
     PerUserEvaluator.evaluate(systemLabeledTestTweets)
     if (targetsInputFile.value != None) {
-      val targets = new scala.collection.mutable.HashMap[String, String]
-
-      scala.io.Source.fromFile(targetsInputFile.value.get, "utf-8").getLines.foreach(p => targets.put(p.split("\t")(0).trim, p.split("\t")(1).trim))
+//      val targets = new scala.collection.mutable.HashMap[String, String]
+//
+//      scala.io.Source.fromFile(targetsInputFile.value.get, "utf-8").getLines
+//        .foreach(p => targets.put(p.split("\t")(0).trim, p.split("\t")(1).trim))
+      val targets: Map[String, String] =
+        (for (line <- scala.io.Source.fromFile(targetsInputFile.value.get, "UTF-8").getLines) yield {
+          val arr = line.split("\\|")
+          (arr(0)->arr(1))
+        }).toMap
       PerTargetEvaluator(systemLabeledTestTweets, targets)
     }
   }

File src/main/scala/updown/app/LexicalRatioClassifier.scala

View file
  • Ignore whitespace
     PerUserEvaluator(tweets)
 
     if (targetsInputFile.value != None) {
-      val targets = new scala.collection.mutable.HashMap[String, String]
-      scala.io.Source.fromFile(targetsInputFile.value.get, "utf-8").getLines.foreach(p => targets.put(p.split("|")(0).trim, p.split("|")(1).trim))
+//      val targets = new scala.collection.mutable.HashMap[String, String]
+//      scala.io.Source.fromFile(targetsInputFile.value.get, "utf-8").getLines.foreach(p => targets.put(p.split("|")(0).trim, p.split("|")(1).trim))
+      val targets: Map[String, String] =
+        (for (line <- scala.io.Source.fromFile(targetsInputFile.value.get, "UTF-8").getLines) yield {
+          val arr = line.split("\\|")
+          (arr(0)->arr(1))
+        }).toMap
       PerTargetEvaluator(tweets, targets)
     }
   }

File src/main/scala/updown/app/NFoldMaxentExperiment.scala

View file
  • Ignore whitespace
     val inputFile = goldInputFile.value.get
     val results = doExperiment(inputFile, nFolds)
     val averages = averageResults(results)
-    println("\n"+reportResults(averages))
+    System.err.println("\n"+reportResults(averages))
   }
 }

File src/main/scala/updown/app/PerTargetEvaluator.scala

View file
  • Ignore whitespace
   val NEG = "NEG"
   val NEU = "NEU"
 
-  def computeEvaluation(tweets: scala.List[SystemLabeledTweet], targets: HashMap[String, String]):
+  def computeEvaluation(tweets: scala.List[SystemLabeledTweet], targets: Map[String, String]):
   (List[(String, Double)], Int, HashMap[String, List[SystemLabeledTweet]]) = {
     val targetsToTweets = new scala.collection.mutable.HashMap[String, List[SystemLabeledTweet]] {
       override def default(s: String) = List()
     (targetsToAccuracies, numAbstained, targetsToTweets)
   }
 
-  def apply(tweets: List[SystemLabeledTweet], targets: scala.collection.mutable.HashMap[String, String]) = {
+  def apply(tweets: List[SystemLabeledTweet], targets: Map[String, String]) = {
     val (targetsToAccuracies, numAbstained, targetsToTweets) = computeEvaluation(tweets, targets)
 
-    println("\n***** PER TARGET EVAL *****")
+
+    System.err.println("\n***** PER TARGET EVAL *****")
     if (numAbstained > 0)
-      println(numAbstained + " tweets were abstained on; assuming half (" + (numAbstained.toFloat / 2) + ") were correct.")
+      System.err.println(numAbstained + " tweets were abstained on; assuming half (" + (numAbstained.toFloat / 2) + ") were correct.")
     for ((target, accuracy) <- targetsToAccuracies) {
-      println(target + ": " + accuracy + " (" + targetsToTweets(target).length + ")")
+      System.err.println(target + ": " + accuracy + " (" + targetsToTweets(target).length + ")")
     }
   }
 
       }
     })
 
-    val targets = new scala.collection.mutable.HashMap[String, String]
-
-    scala.io.Source.fromFile(targetsInputFile.value.get).getLines.foreach(p => targets.put(p.split("\t")(0).trim, p.split("\t")(1).trim))
+//    val targets = new scala.collection.mutable.HashMap[String, String]
+//
+//    scala.io.Source.fromFile(targetsInputFile.value.get).getLines.foreach(p => targets.put(p.split("\t")(0).trim, p.split("\t")(1).trim))
 
     //targets.foreach(p => println(p._1+" "+p._2))
 
+    val targets: Map[String, String] =
+        (for (line <- scala.io.Source.fromFile(targetsInputFile.value.get, "UTF-8").getLines) yield {
+          val arr = line.split("\\|")
+          (arr(0)->arr(1))
+        }).toMap
     apply(tweets, targets)
   }
 }

File src/main/scala/updown/app/PerTweetEvaluator.scala

View file
  • Ignore whitespace
     var numAbstained = tweets.count(_.systemLabel == null)
 
     for (tweet <- tweets) {
-      println(tweet.systemLabel + "|" + tweet.goldLabel)
+//      println(tweet.systemLabel + "|" + tweet.goldLabel)
       /*
        * val normedTweet = tweet.normalize("alpha")
       *  val normedNormedTweet = normedTweet.normalize("int")
 
     val (correct, total, abstained, message) = tabulate(tweets)
 
-    println("\n***** PER TWEET EVAL *****")
-    println("Accuracy: %.2f (%.2f/%d)".format(correct / total, correct, total))
-    println(message)
+    System.err.println("\n***** PER TWEET EVAL *****\n" +
+      "Accuracy: %.2f (%.2f/%d)".format(correct / total, correct, total)+"\n" +
+      message)
   }
 
 

File src/main/scala/updown/app/PerUserEvaluator.scala

View file
  • Ignore whitespace
   def evaluate(tweets: List[SystemLabeledTweet]) = {
     val (total, abstained, error, message) = computeEvaluation(tweets)
 
-    println("\n***** PER USER EVAL *****")
+    System.err.println("\n***** PER USER EVAL *****")
 
     if (abstained > 0) {
-      println(abstained + " tweets were abstained on; assuming one-third (" + (abstained / 3) + ") were positive.")
+      System.err.println(abstained + " tweets were abstained on; assuming one-third (" + (abstained / 3) + ") were positive.")
     }
-    println("Number of users evaluated: %d %s".format(total, message))
-    if (total > 0) println("Mean squared error: %f".format(error))
-    println(message)
+    System.err.println("Number of users evaluated: %d %s".format(total, message))
+    if (total > 0) System.err.println("Mean squared error: %f".format(error))
+    System.err.println(message)
   }
 
   def main(args: Array[String]) {

File src/main/scala/updown/preproc/GenericPreprocessor.scala

View file
  • Ignore whitespace
+package updown.preproc
+
+import org.clapper.argot.{ArgotUsageException, ArgotParser, ArgotConverters}
+import ArgotConverters._
+import updown.data.SentimentLabel
+import updown.util.{TokenizationPipes, Twokenize}
+import java.util.Collections
+
+abstract class GenericPreprocessor {
+  // this is here to make ArgotConverters appear used to IDEA.
+  convertString _
+
+  def getInstanceIterator(fileName: String, polarity: String): Iterator[(String, String, SentimentLabel.Type, String)]
+
+  def getInputIterator(inputOption: Option[String]): Iterator[(String, String, SentimentLabel.Type, String)] = {
+    inputOption match {
+      case Some(fileNameList) =>
+        (for ((name, polarity) <- fileNameList.split("\\s*,\\s*").map((pair) => {
+          val plist = pair.split("\\s*->\\s*")
+          (plist(0) -> plist(1))
+        }
+        ).toMap) yield {
+          getInstanceIterator(name, polarity)
+        }).iterator.flatten
+
+      case None =>
+        (for (line <- scala.io.Source.stdin.getLines()) yield {
+          line.split("\\|") match {
+            case Array(id, reviewer, polarityString, text) =>
+              (id, reviewer, SentimentLabel.figureItOut(polarityString), text)
+            case _ =>
+              System.err.println("Input must be of the form id|reviewer|polarity|text.")
+              ("", "", SentimentLabel.Neutral, "")
+          }
+        })
+    }
+  }
+
+  def main(args: Array[String]) {
+    // don't forget that this is linked to the pipeStages dict below
+    val availablePipes = Set("addBiGrams", "twokenize", "twokenizeSkipGtOneGrams", "removeStopwords", "splitSpace")
+
+    // PARSE ARGS
+    val parser = new ArgotParser("updown run updown.preproc.PreprocStanfordTweets", preUsage = Some("Updown"))
+    val inputFile = parser.option[String](List("i", "input"), "input", "path to stanford data file")
+    val stopListFile = parser.option[String](List("s", "stoplist"), "stoplist", "path to stoplist file")
+    val startId = parser.option[Int](List("start-id"), "ID", "id at which to start numbering lines")
+    val textPipeline = parser.option[String](List("textPipeline"), "PIPELINE",
+      ("specify the desired pipe stages separated by |: \"addBiGrams|twokenize\". " +
+        "Available options are in %s.").format(availablePipes))
+    val debugOption = parser.flag[Boolean](List("d","debug"), "show debugging output")
+    try {
+      parser.parse(args)
+      val debug = debugOption.value.isDefined
+
+      // SET UP IO
+      var lineCount =
+        startId.value match {
+          case Some(id) => id
+          case None => 0
+        }
+
+      if (debug) System.err.println("Inputfile: %s".format(inputFile.value))
+      val inputLines: Iterator[(String, String, SentimentLabel.Type, String)] =
+        getInputIterator(inputFile.value)
+
+
+      val stopSet: Set[String] =
+        stopListFile.value match {
+          case Some(fileName) =>
+            scala.io.Source.fromFile(fileName).getLines.toSet
+          case None => Set("a", "the", ".")
+        }
+
+
+      val pipeStages: Map[String, (List[String]) => List[String]] =
+        Map[String, (List[String]) => List[String]](
+          ("addBiGrams" -> TokenizationPipes.addNGrams(2)),
+          ("twokenize" -> TokenizationPipes.twokenize),
+          ("twokenizeSkipGtOneGrams" -> TokenizationPipes.twokenizeSkipGtOneGrams),
+          ("removeStopwords" -> TokenizationPipes.filterOnStopset(stopSet)),
+          ("splitSpace" -> TokenizationPipes.splitOnDelimiter(" "))
+        )
+      // had to predefine the available pipes so they could be printed in the usage string, before the stopset can be parsed.
+      assert(pipeStages.keySet == availablePipes)
+
+      if (debug) System.err.println("Pipeline option: %s".format(textPipeline.value))
+      val pipeline: List[(List[String]) => List[String]] =
+        if (textPipeline.value.isDefined) {
+          val arg: String = textPipeline.value.get
+          (for (pipeStage <- arg.split("\\|")) yield {
+            if (pipeStages.keySet.contains(pipeStage)) {
+              pipeStages(pipeStage)
+            } else {
+              parser.usage("invalid pipeStage: %s".format(pipeStage))
+            }
+          }).toList
+        } else {
+          List(pipeStages("twokenize"), pipeStages("removeStopwords"))
+        }
+      if (debug) System.err.println("Pipeline: %s".format(pipeline))
+
+
+      // RUN
+      for ((id, reviewer, polarity, text) <- inputLines) {
+        println(
+          "%s|%s|%s|%s".format(
+            if (id == "") lineCount else id,
+            reviewer,
+            runThroughPipeLine(text.replaceAll(",", ""), pipeline).mkString(","),
+            polarity))
+        lineCount += 1
+      }
+      if (debug) System.err.println("Done!")
+    }
+    catch {
+      case e: ArgotUsageException =>
+        println(e.message)
+        System.exit(1)
+    }
+  }
+
+  def runThroughPipeLine(text: String, pipeLine: List[(List[String]) => List[String]]): List[String] = {
+    var res = List(text)
+    for (pipeStage <- pipeLine) {
+      res = pipeStage(res)
+    }
+    res
+  }
+}

File src/main/scala/updown/preproc/PreprocPangLeePolarityCorpus.scala

View file
  • Ignore whitespace
 package updown.preproc
 
-import org.clapper.argot.{ArgotUsageException, ArgotConverters, ArgotParser}
-import ArgotConverters._
-import updown.util.Twokenize
 import updown.data.SentimentLabel
 import java.io.File
-import io.Source
 
 /**
  * This preprocessor is suitable for any directory that contains files which should each be mapped to one instance
  * whose polarity is signified by the label given to the directory in the inputOption
  */
-object PreprocPangLeePolarityCorpus {
-  // this is here to make ArgotConverters appear used to IDEA.
-  convertString _
+object PreprocPangLeePolarityCorpus extends GenericPreprocessor {
 
-  def getInputIterator(inputOption: Option[String]): Iterator[String] = {
-    inputOption match {
-      case Some(fileNameList) =>
-        (for ((name, polarity) <- fileNameList.split("\\s*,\\s*").map((pair) => {
-          val plist = pair.split("\\s*->\\s*")
-          (plist(0) -> plist(1))
-        }
-        ).toMap) yield {
-          try {
-            val canonicalPol = SentimentLabel.figureItOut(polarity)
-            val dir = new File(name)
-            assert(dir.isDirectory)
-            (for (file <- dir.listFiles()) yield {
-              val text: Source = scala.io.Source.fromFile(file, "ISO-8859-1")
-              val string = text.getLines().mkString(" ")
-              "%s|%s|%s".format(file.getName, canonicalPol, string.replace("|", ""))
-            }).iterator
-          } catch {
-            case e: MatchError =>
-              System.err.println("Couldn't figure out what sentiment '%s' is supposed to be." +
-                " Try using 'pos', 'neg', or 'neu'. Skipping %s...".format(polarity, name))
-              Iterator[String]()
-            case e =>
-              System.err.println("Caught some error. Skipping " + name)
-              e.printStackTrace()
-              Iterator[String]()
-          }
-        }).iterator.flatten
-
-      case None => scala.io.Source.stdin.getLines()
-    }
-  }
-
-  def main(args: Array[String]) {
-    // PARSE ARGS
-    val parser = new ArgotParser("updown run updown.preproc.PreprocStanfordTweets", preUsage = Some("Updown"))
-    val inputFile = parser.option[String](List("i", "input"), "input", "path to stanford data file")
-    val stopListFile = parser.option[String](List("s", "stoplist"), "stoplist", "path to stoplist file")
-    val startId = parser.option[Int](List("start-id"), "ID", "id at which to start numbering lines")
+  def getInstanceIterator(fileName: String, polarity: String): Iterator[(String, String, SentimentLabel.Type, String)] = {
     try {
-      parser.parse(args)
-    }
-    catch {
-      case e: ArgotUsageException =>
-        println(e.message)
-        sys.exit(0)
-    }
-
-    // SET UP IO
-    var instanceID =
-      startId.value match {
-        case Some(id) => id
-        case None => 0
-      }
-
-    val inputLines: Iterator[String] =
-      getInputIterator(inputFile.value)
-
-    val stopSet: Set[String] =
-      stopListFile.value match {
-        case Some(fileName) =>
-          scala.io.Source.fromFile(fileName).getLines.toSet
-        case None => Set("a", "the", ".")
-      }
-
-    // RUN
-    for (line <- inputLines) {
-      line.split('|') match {
-        case Array(filename, polarity, text) =>
-          println(
-            "%s|%s|%s|%s".format(
-              filename,
-              "reviewer",
-              Twokenize(text.replaceAll(",", "")).toList.filter((s) => !stopSet.contains(s)).mkString(","),
-              polarity)
+      val dir = new File(fileName)
+      assert(dir.isDirectory)
+      (for (file: File <- dir.listFiles()) yield
+        (file.getName,
+          "reviewer",
+          SentimentLabel.figureItOut(polarity),
+          scala.io.Source.fromFile(file, "ISO-8859-1").getLines().mkString(" ").replace("|", "")
           )
-      }
-      instanceID += 1
+        ).iterator
+    } catch {
+      case e: MatchError =>
+        System.err.println("Couldn't figure out what sentiment '%s' is supposed to be." +
+          " Try using 'pos', 'neg', or 'neu'. Skipping %s...".format(polarity, fileName))
+        Iterator[(String, String, SentimentLabel.Type, String)]()
     }
   }
 }

File src/main/scala/updown/preproc/PreprocPangLeeSentenceCorpus.scala

View file
  • Ignore whitespace
 package updown.preproc
 
-import org.clapper.argot.{ArgotUsageException, ArgotConverters, ArgotParser}
-import ArgotConverters._
-import updown.util.Twokenize
 import updown.data.SentimentLabel
 
 /**
  * This preprocessor is suitable for any file that contains one instance per line with no labels or ids
  */
-object PreprocPangLeeSentenceCorpus {
-  // this is here to make ArgotConverters appear used to IDEA.
-  convertString _
+object PreprocPangLeeSentenceCorpus extends GenericPreprocessor {
 
-  def getInputIterator(inputOption: Option[String]): Iterator[String] = {
-    inputOption match {
-      case Some(fileNameList) =>
-        (for ((name, polarity) <- fileNameList.split("\\s*,\\s*").map((pair) => {
-          val plist = pair.split("\\s*->\\s*")
-          (plist(0) -> plist(1))
-        }
-        ).toMap) yield {
-          try {
-            val canonicalPol = SentimentLabel.figureItOut(polarity)
-            for (line <- scala.io.Source.fromFile(name, "ISO-8859-1").getLines) yield {
-              "%s|%s".format(canonicalPol, line.replace("|", ""))
-            }
-          } catch {
-            case e: MatchError =>
-              System.err.println("Couldn't figure out what sentiment '%s' is supposed to be." +
-                " Try using 'pos', 'neg', or 'neu'. Skipping %s...".format(polarity, name))
-              Iterator[String]()
-            case e =>
-              System.err.println("Caught some error. Skipping " + name)
-              e.printStackTrace()
-              Iterator[String]()
-          }
-        }).iterator.flatten
-
-      case None => scala.io.Source.stdin.getLines()
-    }
-  }
-
-  def main(args: Array[String]) {
-    // PARSE ARGS
-    val parser = new ArgotParser("updown run updown.preproc.PreprocStanfordTweets", preUsage = Some("Updown"))
-    val inputFile = parser.option[String](List("i", "input"), "input", "path to stanford data file")
-    val stopListFile = parser.option[String](List("s", "stoplist"), "stoplist", "path to stoplist file")
-    val startId = parser.option[Int](List("start-id"), "ID", "id at which to start numbering lines")
+  def getInstanceIterator(fileName: String, polarity: String): Iterator[(String, String, SentimentLabel.Type, String)] = {
     try {
-      parser.parse(args)
-    }
-    catch {
-      case e: ArgotUsageException =>
-        println(e.message)
-        sys.exit(0)
-    }
-
-    // SET UP IO
-    var instanceID =
-      startId.value match {
-        case Some(id) => id
-        case None => 0
+      for (line <- scala.io.Source.fromFile(fileName, "ISO-8859-1").getLines) yield {
+        ("", "reviewer", SentimentLabel.figureItOut(polarity), line.replace("|", ""))
       }
-
-    val inputLines: Iterator[String] =
-      getInputIterator(inputFile.value)
-
-    val stopSet: Set[String] =
-      stopListFile.value match {
-        case Some(fileName) =>
-          scala.io.Source.fromFile(fileName).getLines.toSet
-        case None => Set("a", "the", ".")
-      }
-
-    // RUN
-    for (line <- inputLines) {
-      line.split('|') match {
-        case Array(polarity, text) =>
-          println(
-            "%s|%s|%s|%s".format(
-              instanceID,
-              "reviewer",
-              Twokenize(text.replaceAll(",", "")).toList.filter((s) => !stopSet.contains(s)).mkString(","),
-              polarity)
-          )
-      }
-      instanceID += 1
+    } catch {
+      case e: MatchError =>
+        System.err.println("Couldn't figure out what sentiment '%s' is supposed to be." +
+          " Try using 'pos', 'neg', or 'neu'. Skipping %s...".format(polarity, fileName))
+        Iterator[(String, String, SentimentLabel.Type, String)]()
     }
   }
 }

File src/main/scala/updown/util/TokenizationPipes.scala

View file
  • Ignore whitespace
 package updown.util
 
 object TokenizationPipes {
+  val twokenize: (List[String]) => List[String] =
+    (ss) => ss.map((s) => Twokenize(s)).flatten
+
+  val twokenizeSkipGtOneGrams: (List[String]) => List[String] =
+    (ss) => ss.map((s) => if (s.contains(" "))
+      List(s)
+    else
+      Twokenize(s)).flatten
+
   val toLowercase: (List[String]) => List[String] =
     (ss) => ss.map((s) => s.toLowerCase)