Commits

Mike Speriosu committed 04bf0cf

Added LexRatio classifier, PerUserEvaluator.

Comments (0)

Files changed (5)

src/main/scala/updown/app/JuntoClassifier.scala

   val modelInputFile = parser.option[String](List("m", "model"), "model", "model input")
   val mpqaInputFile = parser.option[String](List("p", "mpqa"), "mpqa", "MPQA sentiment lexicon input file")
   val followerGraphFile = parser.option[String](List("f", "follower-graph"), "follower-graph", "twitter follower graph input file")
-  val refCorpusProbsFile = parser.option[String](List("r", "reference-corpus-probabilities"), "reference-corpus-probabilities", "reference corpus probabilities input file")
+  val refCorpusProbsFile = parser.option[String](List("r", "reference-corpus-probabilities"), "ref-corp-probs", "reference corpus probabilities input file")
 
   val mu1 = parser.option[Double](List("u", "mu1"), "mu1", "seed injection probability")
   val iterations = parser.option[Int](List("n", "iterations"), "iterations", "number of iterations")
 
     val graph = createGraph(tweets, followerGraphFile.value.get, modelInputFile.value.get, mpqaInputFile.value.get)
 
-    //graph.WriteToFileWithAlphabet("input-graph")
-    graph.SaveEstimatedScores("input-graph")
+    //graph.SaveEstimatedScores("input-graph")
 
     JuntoRunner(graph, mu1.value.getOrElse(DEFAULT_MU1), .01, .01, iterations.value.getOrElse(DEFAULT_ITERATIONS), false)
 
-    graph.SaveEstimatedScores("output-graph")
+    //graph.SaveEstimatedScores("output-graph")
 
     val tweetIdsToPredictedLabels = new scala.collection.mutable.HashMap[String, String]
 
     }
 
     PerTweetEvaluator.evaluate(tweets)
+    PerUserEvaluator.evaluate(tweets)
   }
   
   def createGraph(tweets: List[Tweet], followerGraphFile: String, modelInputFile: String, mpqaInputFile: String) = {
     val edges = getTweetNgramEdges(tweets) ::: getFollowerEdges(followerGraphFile) ::: getUserTweetEdges(tweets)
-    val seeds = getMaxentSeeds(tweets, modelInputFile) ::: getMPQASeeds(MPQALexicon(mpqaInputFile))/* ::: getEmoticonSeeds*/
+    val seeds = getMaxentSeeds(tweets, modelInputFile) ::: getMPQASeeds(MPQALexicon(mpqaInputFile)) ::: getEmoticonSeeds
     GraphBuilder(edges, seeds)
   }
 

src/main/scala/updown/app/LexicalRatioClassifier.scala

+package updown.app
+
+import org.clapper.argot._
+
+import updown.lex._
+import updown.data._
+import updown.data.io._
+
+/**
+ * Lexicon-ratio baseline: labels each tweet by comparing how many of its
+ * features are positive vs. negative entries in the MPQA lexicon, then runs
+ * the per-tweet and per-user evaluations on the result.
+ *
+ * Command-line options: -g (gold labeled input), -p (MPQA lexicon file).
+ */
+object LexicalRatioClassifier {
+
+  val POS = "POS"
+  val NEG = "NEG"
+
+  import ArgotConverters._
+  val parser = new ArgotParser("updown run updown.app.LexicalRatioClassifier", preUsage=Some("Updown"))
+  
+  val goldInputFile = parser.option[String](List("g", "gold"), "gold", "gold labeled input")
+  val mpqaInputFile = parser.option[String](List("p", "mpqa"), "mpqa", "MPQA sentiment lexicon input file")
+
+  /**
+   * Parses the arguments, labels every tweet in the gold file and prints
+   * the evaluation reports.
+   *
+   * @param args command-line arguments (see the options declared above)
+   */
+  def main(args: Array[String]): Unit = {
+    try { parser.parse(args) }
+    catch { case e: ArgotUsageException => println(e.message); sys.exit(0)}
+
+    // Both inputs are mandatory: report the first missing one and stop.
+    val goldPath = goldInputFile.value.getOrElse {
+      println("You must specify a gold labeled input file via -g.")
+      sys.exit(0)
+    }
+    val mpqaPath = mpqaInputFile.value.getOrElse {
+      println("You must specify an MPQA sentiment lexicon file via -p.")
+      sys.exit(0)
+    }
+
+    val tweets = TweetFeatureReader(goldPath)
+
+    val lexicon = MPQALexicon(mpqaPath)
+
+    for(tweet <- tweets) {
+
+      var numPosWords = 0
+      var numNegWords = 0
+      // Only features present in the lexicon contribute to the counts.
+      for(feature <- tweet.features if lexicon.contains(feature)) {
+        val entry = lexicon(feature)
+        if(entry.isPositive) numPosWords += 1
+        if(entry.isNegative) numNegWords += 1
+      }
+
+      // On a tie the classifier abstains: systemLabel is left untouched.
+      // The evaluators count tweets whose systemLabel is null as abstentions
+      // (presumably the reader leaves it null -- verify in TweetFeatureReader).
+      if(numPosWords > numNegWords) tweet.systemLabel = POS
+      else if(numNegWords > numPosWords) tweet.systemLabel = NEG
+    }
+
+    PerTweetEvaluator(tweets)
+    PerUserEvaluator(tweets)
+  }
+}

src/main/scala/updown/app/PerTweetEvaluator.scala

  */
 object PerTweetEvaluator {
 
+  val POS = "POS"
+  val NEG = "NEG"
+
   import ArgotConverters._
   val parser = new ArgotParser("updown run updown.app.PerTweetEvaluator", preUsage=Some("Updown"))
 
   val modelInputFile = parser.option[String](List("m", "model"), "model", "model input")
   val goldInputFile = parser.option[String](List("g", "gold"), "gold", "gold labeled input")
 
+  def apply(tweets: List[Tweet]) = evaluate(tweets)
+
   def evaluate(tweets: List[Tweet]) = {
-    var correct = 0
+    var correct = 0.0
     var total = 0
+    var numAbstained = tweets.count(_.systemLabel == null)
 
     for(tweet <- tweets) {
       if(tweet.systemLabel == tweet.goldLabel) {
       total += 1
     }
 
+    println("\n***** PER TWEET EVAL *****")
+
+    if(numAbstained > 0) {
+      correct += numAbstained.toFloat / 2
+      println(numAbstained + " tweets were abstained on; assuming half (" + (numAbstained.toFloat/2) + ") were correct.")
+    }
     println("Accuracy: "+(correct.toFloat/total)+" ("+correct+"/"+total+")")
   }
 
 
     val goldLines = scala.io.Source.fromFile(goldInputFile.value.get).getLines.toList
 
-    var correct = 0
-    var total = 0
-
-    for(tweet <- TweetFeatureReader(goldInputFile.value.get)) {
+    val tweets = TweetFeatureReader(goldInputFile.value.get)
+    
+    for(tweet <- tweets) {
       val result = model.eval(tweet.features.toArray)
       
       val posProb = result(0)
       val negProb = result(1)
 
-      val systemLabel = if(posProb >= negProb) "POS" else "NEG"
+      tweet.systemLabel = if(posProb >= negProb) POS else NEG
 
-      if(systemLabel == tweet.goldLabel) correct += 1
-      
-      total += 1
     }
 
-    println("Accuracy: "+(correct.toFloat/total)+" ("+correct+"/"+total+")")
+    evaluate(tweets)
   }
 }

src/main/scala/updown/app/PerUserEvaluator.scala

+package updown.app
+
+import updown.data._
+import updown.data.io._
+
+import java.io._
+
+import opennlp.maxent._
+import opennlp.maxent.io._
+import opennlp.model._
+
+import org.clapper.argot._
+
+/**
+ * Evaluates system output per user: for each user, compares the fraction of
+ * tweets labeled positive in the gold standard with the fraction labeled
+ * positive by the system, and reports the mean squared error over users.
+ *
+ * Tweets whose systemLabel is null are treated as abstentions and counted
+ * as half a positive prediction each.
+ *
+ * @author Mike Speriosu
+ */
+object PerUserEvaluator {
+
+  import ArgotConverters._
+  val parser = new ArgotParser("updown run updown.app.PerUserEvaluator", preUsage=Some("Updown"))
+
+  val modelInputFile = parser.option[String](List("m", "model"), "model", "model input")
+  val goldInputFile = parser.option[String](List("g", "gold"), "gold", "gold labeled input")
+
+  val POS = "POS"
+  val NEG = "NEG"
+
+  /** Minimum number of tweets a user needs in order to be evaluated. */
+  val DEFAULT_MIN_TPU = 1
+
+  /** Shorthand for [[evaluate]]. */
+  def apply(tweets: List[Tweet]): Unit = evaluate(tweets)
+
+  /**
+   * Prints the per-user evaluation report for already-labeled tweets.
+   *
+   * @param tweets tweets carrying a gold label and a system label
+   *               (null system label = abstained)
+   */
+  def evaluate(tweets: List[Tweet]): Unit = {
+    var totalError = 0.0
+    var totalNumAbstained = 0
+    var numUsersEvaluated = 0
+    val usersToTweets = new scala.collection.mutable.HashMap[String, List[Tweet]]
+
+    val minTPU = DEFAULT_MIN_TPU
+
+    // Prepend (O(1)) rather than append: only per-user counts are used below,
+    // so the order of tweets within a user's list is irrelevant.
+    for(tweet <- tweets)
+      usersToTweets(tweet.userid) = tweet :: usersToTweets.getOrElse(tweet.userid, Nil)
+
+    for((userid, curTweets) <- usersToTweets) {
+      val numTweets = curTweets.length
+
+      if(numTweets >= minTPU) {
+        var numAbstained = 0
+        var numGoldPos = 0
+        var numSysPos = 0.0
+        for(tweet <- curTweets) {
+          if(tweet.goldLabel == POS) numGoldPos += 1
+          if(tweet.systemLabel == POS) numSysPos += 1
+          else if(tweet.systemLabel == null) numAbstained += 1
+        }
+
+        // Each abstention counts as half a positive prediction.
+        numSysPos += numAbstained / 2.0
+        totalError += math.pow((numGoldPos - numSysPos) / numTweets, 2)
+        totalNumAbstained += numAbstained
+        numUsersEvaluated += 1
+      }
+    }
+
+    println("\n***** PER USER EVAL *****")
+
+    if(totalNumAbstained > 0)
+      println(totalNumAbstained + " tweets were abstained on; assuming half (" + (totalNumAbstained.toFloat/2) + ") were positive.")
+
+    println("Number of users evaluated: " + numUsersEvaluated + " (min of " + minTPU + " tweets per user)")
+
+    // Average over the users that actually contributed an error term, not
+    // over all users seen; with nothing evaluated the mean is undefined.
+    if(numUsersEvaluated > 0)
+      println("Mean squared error: " + (totalError / numUsersEvaluated))
+    else
+      println("Mean squared error: N/A (no users were evaluated)")
+  }
+
+  /**
+   * Loads a binary GIS maxent model, labels every tweet in the gold file
+   * with the more probable of POS/NEG and prints the per-user report.
+   *
+   * @param args command-line arguments: -m (model input), -g (gold input)
+   */
+  def main(args: Array[String]): Unit = {
+    try { parser.parse(args) }
+    catch { case e: ArgotUsageException => println(e.message); sys.exit(0) }
+
+    // Both inputs are mandatory: report the first missing one and stop.
+    val modelPath = modelInputFile.value.getOrElse {
+      println("You must specify a model input file via -m.")
+      sys.exit(0)
+    }
+    val goldPath = goldInputFile.value.getOrElse {
+      println("You must specify a gold labeled input file via -g.")
+      sys.exit(0)
+    }
+
+    // Close the model stream once the model has been read.
+    val dataInputStream = new DataInputStream(new FileInputStream(modelPath))
+    val model =
+      try { new BinaryGISModelReader(dataInputStream).getModel }
+      finally { dataInputStream.close() }
+
+    val tweets = TweetFeatureReader(goldPath)
+
+    for(tweet <- tweets) {
+      val result = model.eval(tweet.features.toArray)
+
+      // Outcome 0 is read as POS and outcome 1 as NEG, as in the original
+      // code -- NOTE(review): confirm against the model's outcome order.
+      val posProb = result(0)
+      val negProb = result(1)
+
+      tweet.systemLabel = if(posProb >= negProb) POS else NEG
+    }
+
+    evaluate(tweets)
+  }
+}

src/main/scala/updown/lex/MPQALexicon.scala

 
 class MPQALexicon(entries: scala.collection.mutable.HashMap[String, MPQAEntry]) {
   def apply(s:String) = entries(s)
+  def contains(s:String) = entries.contains(s)
   val keySet = entries.keySet
 }