Commits

vvcephei committed ad5cd4e

Moved the maxent dependency from build.sbt to a jar in lib/.
The reason is that maxent was dumping status information to
stdout and polluting my output, so I changed the relevant
System.out.print statements to System.err.print and rebuilt
the jar.

Also made a bunch of unrelated changes for my thesis.
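
For context: jars dropped into lib/ are picked up by sbt automatically as
unmanaged dependencies, which is why no build.sbt entry replaces the removed
one. The line below is purely illustrative and just restates sbt's default:

  // build.sbt (illustrative): every jar under unmanagedBase goes on the
  // classpath; lib/ is already the default, so nothing needs declaring.
  unmanagedBase <<= baseDirectory { base => base / "lib" }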

  • Parent commits af2e8e1


Files changed (12)

File .gitignore

 *.iml
 out/
 .idea/
+.idea

File build.sbt

 retrieveManaged := true
 
 libraryDependencies ++= Seq(
-  "org.apache.opennlp" % "opennlp-maxent" % "3.0.1-incubating",
+//  "org.apache.opennlp" % "opennlp-maxent" % "3.0.1-incubating",
   "org.clapper" %% "argot" % "0.3.5",
   "com.weiglewilczek.slf4s" %% "slf4s" % "1.0.7",
   "org.scalatest" %% "scalatest" % "1.6.1" % "test"

File lib/opennlp-maxent-3.0.3-incubating-SNAPSHOT.jar

+/home/john/repos/opennlp/opennlp-maxent/target/opennlp-maxent-3.0.3-incubating-SNAPSHOT.jar

File src/main/scala/updown/app/TrainMaxentModel.scala

  *
  * or, using the -s flag, train from a simple file that has the format:
  * feature1,feature2,feature3,...,featureN,label
- * 
+ *
  * @author Mike Speriosu
  */
 object TrainMaxentModel {
   val DEFAULT_CUTOFF = 5
 
 
-  def apply(fileName: String, iterations: Int, cutoff: Int): AbstractModel =
+  def apply(fileName: String, iterations: Int, cutoff: Int): AbstractModel = {
+    GIS.PRINT_MESSAGES = false
     GIS.trainModel(MaxentEventStreamFactory(fileName), iterations, cutoff)
+  }
 
   def apply(fileName: String): AbstractModel = apply(fileName, DEFAULT_ITERATIONS, DEFAULT_CUTOFF)
 
     val dataStream = new PlainTextByLineDataStream(reader)
     val eventStream = new BasicEventStream(dataStream, ",")
 
+    GIS.PRINT_MESSAGES = false
     GIS.trainModel(eventStream, iterations, cutoff)
   }
 
-  def trainWithStringIterator(iterator: Iterator[String], iterations: Int, cutoff: Int): AbstractModel =
+  def trainWithStringIterator(iterator: Iterator[String], iterations: Int, cutoff: Int): AbstractModel = {
+    GIS.PRINT_MESSAGES = false
+
     GIS.trainModel(MaxentEventStreamFactory.getWithStringIterator(iterator), iterations, cutoff)
+  }
 
   //  def apply[String](iterator: Iterator[String]): AbstractModel = apply(iterator, DEFAULT_ITERATIONS, DEFAULT_CUTOFF)
 
-  def trainWithGoldLabeledTweetIterator(iterator: Iterator[GoldLabeledTweet], iterations: Int, cutoff: Int): AbstractModel =
+  def trainWithGoldLabeledTweetIterator(iterator: Iterator[GoldLabeledTweet], iterations: Int, cutoff: Int): AbstractModel = {
+    GIS.PRINT_MESSAGES = false
     GIS.trainModel(MaxentEventStreamFactory.getWithGoldLabeledTweetIterator(iterator), iterations, cutoff)
+  }
 
   def trainWithGoldLabeledTweetIterator(iterator: Iterator[GoldLabeledTweet]): AbstractModel = trainWithGoldLabeledTweetIterator(iterator, DEFAULT_ITERATIONS, DEFAULT_CUTOFF)
 
     }
 
 
-    val model: AbstractModel = if(simple.value == None)
-      apply(inputFile.value.get,iterations.value.getOrElse(DEFAULT_ITERATIONS), cutoff.value.getOrElse(DEFAULT_CUTOFF))
+    val model: AbstractModel = if (simple.value == None)
+      apply(inputFile.value.get, iterations.value.getOrElse(DEFAULT_ITERATIONS), cutoff.value.getOrElse(DEFAULT_CUTOFF))
     else
-      trainSimple(inputFile.value.get,iterations.value.getOrElse(DEFAULT_ITERATIONS), cutoff.value.getOrElse(DEFAULT_CUTOFF))
+      trainSimple(inputFile.value.get, iterations.value.getOrElse(DEFAULT_ITERATIONS), cutoff.value.getOrElse(DEFAULT_CUTOFF))
 
     val modelWriter = new BinaryGISModelWriter(model, new File(outputFile.value.get))
     modelWriter.persist()
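
GIS.PRINT_MESSAGES only mutes the status lines that maxent guards with that
flag; anything the library still writes straight to System.out needs a
JVM-level redirect. A minimal sketch of such a guard (quietly is a
hypothetical helper, not part of this commit):

  import java.io.{OutputStream, PrintStream}

  def quietly[T](body: => T): T = {
    val saved = System.out
    // Swallow everything written to System.out while body runs.
    System.setOut(new PrintStream(new OutputStream { def write(b: Int) {} }))
    try body finally System.setOut(saved)
  }

  // usage: val model = quietly { GIS.trainModel(eventStream, iterations, cutoff) }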

File src/main/scala/updown/app/experiment/Experiment.scala

 abstract class Experiment extends Logging {
   val parser = new ArgotParser(this.getClass.getName, preUsage = Some("Updown"))
   val targetsInputFile = parser.option[String](List("t", "targets"), "targets", "targets")
+  val reportFormatO = parser.option[String]("format", "tex|txt", "Report format; default is txt.") {
+    (s, opt) =>
+      s.toLowerCase() match {
+        case "tex" => "tex"
+        case "txt" => "txt"
+        case _     => parser.usage("Format must be 'tex' or 'txt'.")
+      }
+  }
+  val reportNameO = parser.option[String]("name", "STRING", "The name to use in reports. If unspecified, the name will " +
+    "be composed of the input files.")
 
-  def report(labeledTweets: List[SystemLabeledTweet]) {
-    logger.info("Overall:\n" + Statistics.getEvalStats("", labeledTweets).toString)
+
+  def reportTex(experimentName: String, labeledTweets: List[SystemLabeledTweet]) {
+    val outputName = reportNameO.value match {
+      case Some(s: String) => s
+      case _ => experimentName
+    }
+    val ExperimentalResult(_, eN, accuracy, classes) = Statistics.getEvalStats("", labeledTweets)
+    // Curried helper: varName(prefix)(suffix)(value) emits \newcommand{\<prefix><suffix>}{<value>}.
+    lazy val varName: (String) => (String) => (String) => String =
+      (prefix) => (suffix) => (value) => "\\newcommand{\\%s%s}{%s}".format(prefix, suffix, value)
+    lazy val nName = varName("n")(outputName)
+    lazy val accName = varName("acc")(outputName)
+    lazy val fposName = varName("fpos")(outputName)
+    lazy val fnegName = varName("fneg")(outputName)
+    println("% " + outputName)
+    println(nName("%d".format(eN)))
+    println(accName("%.2f".format(accuracy)))
+    val classesMap = classes.groupBy(res => res.label)
+    println(fposName("%.2f".format(classesMap(SentimentLabel.Positive)(0).f)))
+    println(fnegName("%.2f".format(classesMap(SentimentLabel.Negative)(0).f)))
+  }
+
+  def reportTxt(experimentName: String, labeledTweets: List[SystemLabeledTweet]) {
+    println("\n-----------------------------------------------------")
+    println(experimentName + ":")
+    println("Overall:\n" + Statistics.getEvalStats("", labeledTweets).toString)
 
     val (msePerUser, nUsers) = Statistics.getMSEPerUser(labeledTweets)
-    logger.info("Per-user Summary:\nN users:%d\n%s\n%s".format(nUsers, "%15s %5s %7s".format("Label", "MSE", "√(MSE)"),
+    println("Per-user Summary:\nN users:%d\n%s\n%s".format(nUsers, "%15s %5s %7s".format("Label", "MSE", "√(MSE)"),
       msePerUser.map {
         case (label, mse) => "%15s %.3f   %.3f".format(SentimentLabel.toEnglishName(label), mse, math.sqrt(mse))
       }.mkString("\n")))
         }
         val (statsPerTarget, nTargets) = Statistics.getEvalStatsPerTarget("", targetedTweets)
         if (statsPerTarget.length > 0) {
-          logger.info("Per-target:\nN targets: %d\n%s".format(nTargets, statsPerTarget.mkString("\n")))
+          println("\nPer-target:\nN targets: %d\n%s".format(nTargets, statsPerTarget.mkString("\n")))
         } else
-          logger.info("Per-target: No targets were over the threshold")
+          println("\nPer-target: No targets were over the threshold")
       case None =>
-        logger.info("Per-target: No target file provided")
+        println("\nPer-target: No target file provided")
+    }
+  }
+
+  def report(experimentName: String, labeledTweets: List[SystemLabeledTweet]) {
+    reportFormatO.value match {
+      case Some("tex") => reportTex(experimentName, labeledTweets)
+      case _ => reportTxt(experimentName, labeledTweets)
     }
   }
 }
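
With a hypothetical run reported under the name Foo, reportTex as written
above prints lines of this shape (values made up), ready to be \input into
a TeX document:

  % Foo
  \newcommand{\nFoo}{1234}
  \newcommand{\accFoo}{0.78}
  \newcommand{\fposFoo}{0.81}
  \newcommand{\fnegFoo}{0.74}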

File src/main/scala/updown/app/experiment/NFoldExperiment.scala

           logger.debug("starting run " + experimentalRun)
           val result = doExperiment(trainSet, testSet)
           logger.debug("ending run " + experimentalRun)
-          logger.info("Intermediate:")
-          report(result)
+//          logger.info("Intermediate:")
+//          report(inputFile.toString, result)
           result
         }).toList
 
       val result = results.flatten
       logger.info("Final Result:")
-      report(result)
+      report(inputFile.toString, result)
 //      println("\n" + Statistics.averageResults("%d-fold Average".format(nFolds), results).toString)
       logger.debug("running cleanup code")
     }

File src/main/scala/updown/app/experiment/SplitExperiment.scala

           result
       }
       
-      report(result)
+      report(trainFileName.toString + "->" + testFileName.toString, result)
       logger.debug("running cleanup code")
       System.exit(after())
     }

File src/main/scala/updown/app/experiment/StaticExperiment.scala

       val labeledTweets = doExperiment(TweetFeatureReader(dataFileName))
       logger.debug("ending run")
 
-      report(labeledTweets)
+      report(dataFileName, labeledTweets)
 
       logger.debug("running cleanup code")
       System.exit(after())

File src/main/scala/updown/app/experiment/labelprop/SplitJuntoExperiment.scala

       (followerGraphFile.value,followerGraphFileTest.value) match {
         case (Some(filename: String),Some(filenameTest: String)) =>
           createTransductiveGraph(trainTweets, filename, testTweets, filenameTest, edgeSeedSet, getNgramWeight)
+        case _ => throw new MatchError((followerGraphFile.value, followerGraphFileTest.value))
       }
 
     logger.debug("running label prop")

File src/main/scala/updown/app/experiment/maxent/NFoldMaxentExperiment.scala

             SentimentLabel.figureItOut(model.getBestOutcome(model.eval(features.toArray))))
       }
     }).toList
-    logger.info(res.toString)
     res
   }
   def after():Int=0

File src/main/scala/updown/preproc/GenericPreprocessor.scala

     )
   val defaultPipeline = "twokenize|removeStopwords"
   val parser = new ArgotParser("updown run updown.preproc.PreprocStanfordTweets", preUsage = Some("Updown"))
-  val inputFile = parser.option[String](List("i", "input"), "input", "path to stanford data file")
+  val inputFile = parser.option[String](List("i", "input"), "input", "path to data file")
   val stopListFile = parser.option[String](List("s", "stoplist"), "stoplist", "path to stoplist file")
   val startId = parser.option[Int](List("start-id"), "ID", "id at which to start numbering lines")
   val textPipeline = parser.option[String](List("textPipeline"), "PIPELINE",
     writer.write("%s|%s\n".format(id, target))
   }
 
+  def before() {}
+
   def main(args: Array[String]) {
     logger.debug(args.toList.toString)
     try {
       parser.parse(args)
-
+      before()
       // SET UP IO
 
       logger.debug("Inputfile: %s".format(inputFile.value))

File src/main/scala/updown/preproc/impl/PreprocMDSDReviews.scala

 object PreprocMDSDReviews extends GenericPreprocessor {
   override val defaultPipeline = "basicTokenize"
 
+  override def before() {
+    pipeStages = pipeStages + (("filterBigrams", (ss: List[String]) => {
+      ss.filterNot(_.contains("_"))
+    }))
+  }
+
   val getTokensFromLine: (String) => (List[String], SentimentLabel.Type) = line => {
     lazy val getTokensFromLineHelper: (List[String], List[String], SentimentLabel.Type) => (List[String], SentimentLabel.Type) =
       (inputs, tokens, label) => {
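
The filterBigrams stage registered in before() simply drops underscore-joined
tokens; a quick check with made-up input:

  scala> List("not_good", "great").filterNot(_.contains("_"))
  res0: List[String] = List(great)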