Commits

vvcephei committed 3fa10a1

done messing with experiments for now

  • Participants
  • Parent commits c3bf2ba

Comments (0)

Files changed (8)

File src/main/scala/updown/app/JuntoClassifier.scala

       }
     }
 
+    System.err.println("predicted nPos:%d nNeg:%d nNeu:%d".format(
+    tweetIdsToPredictedLabels.count(i=>i._2==SentimentLabel.Positive),
+    tweetIdsToPredictedLabels.count(i=>i._2==SentimentLabel.Negative),
+    tweetIdsToPredictedLabels.count(i=>i._2==SentimentLabel.Neutral)
+    ))
     val systemLabeledTweets =
       for (GoldLabeledTweet(id, userid, features, goldLabel) <- goldLabeledTweets) yield {
         SystemLabeledTweet(id, userid, features, goldLabel,
   val refCorpusProbsFile = parser.option[String](List("r", "reference-corpus-probabilities"), "ref-corp-probs", "reference corpus probabilities input file")
 
   val edgeSeedSetOption = parser.option[String](List("e", "edge-seed-set-selection"), "edge-seed-set-selection", "edge/seed set selection")
-  val targetsInputFile = parser.option[String](List("t", "targets"), "targets", "targets")
+  val targetsInputFile = parser.option[String](List("t", "targets"), "targets", "targets (TRAIN)")
+  val targetsInputFileTest = parser.option[String](List("U", "targets-test"), "targets-test", "targets (TEST)")
 
   val mu1 = parser.option[Double](List("u", "mu1"), "mu1", "seed injection probability")
   val iterations = parser.option[Int](List("n", "iterations"), "iterations", "number of iterations")

File src/main/scala/updown/app/experiment/Experiment.scala

     logger.info("Overall:\n" + Statistics.getEvalStats("", labeledTweets).toString)
 
     val (msePerUser, nUsers) = Statistics.getMSEPerUser(labeledTweets)
-    logger.info("Per-user Summary:\nN users:%d\n%s\n%s".format(nUsers, "%15s %5s".format("Label","MSE"),msePerUser.map{case LabelResult(_,label,_,_,_,mse)=>"%15s %.3f".format(SentimentLabel.toEnglishName(label),mse)}.mkString("\n")))
+    logger.info("Per-user Summary:\nN users:%d\n%s\n%s".format(nUsers, "%15s %5s %7s".format("Label", "MSE", "√(MSE)"),
+      msePerUser.map {
+        case (label, mse) => "%15s %.3f   %.3f".format(SentimentLabel.toEnglishName(label), mse, math.sqrt(mse))
+      }.mkString("\n")))
 
     targetsInputFile.value match {
       case Some(filename) =>
             TargetedSystemLabeledTweet(id, uid, features, gLabel, sLabel, targets(id))
         }
         val (statsPerTarget, nTargets) = Statistics.getEvalStatsPerTarget("", targetedTweets)
-        if (statsPerTarget.length > 0){
+        if (statsPerTarget.length > 0) {
           logger.info("Per-target:\nN targets: %d\n%s".format(nTargets, statsPerTarget.mkString("\n")))
-        }else
+        } else
           logger.info("Per-target: No targets were over the threshold")
       case None =>
         logger.info("Per-target: No target file provided")

File src/main/scala/updown/app/experiment/ExperimentalResult.scala

 import updown.data.SentimentLabel
 
 case class ExperimentalResult(name: String, n: Int, accuracy: Double, classes: List[LabelResult]) {
-  def header: String = "\n%15s%5s%11s%8s%9s%9s\n".format("Label", "N", "Precision", "Recall", "F-Score", "MSE")
+  def header: String = "\n%15s%5s%11s%8s%9s\n".format("Label", "N", "Precision", "Recall", "F-Score")
 
   override def toString(): String =
     "%s Results:\n".format(name) +
     val otherClassesMap = (other.classes.groupBy((labelResult) => labelResult.label).map((tup) => {
       val (k, (v: LabelResult) :: vs) = tup
       (k, v)
-    }).toMap).withDefaultValue(LabelResult(0, SentimentLabel.Abstained, 0.0, 0.0, 0.0, 0.0))
+    }).toMap).withDefaultValue(LabelResult(0, SentimentLabel.Abstained, 0.0, 0.0, 0.0))
     ExperimentalResult(name, n + other.n, accuracy + other.accuracy,
       (for ((label, classResult) <- classesMap.toList) yield classResult + otherClassesMap(label)).toList
     )
     val otherClassesMap = (other.classes.groupBy((labelResult) => labelResult.label).map((tup) => {
       val (k, (v: LabelResult) :: vs) = tup
       (k, v)
-    }).toMap).withDefaultValue(LabelResult(0, SentimentLabel.Abstained, 0.0, 0.0, 0.0, 0.0))
+    }).toMap).withDefaultValue(LabelResult(0, SentimentLabel.Abstained, 0.0, 0.0, 0.0))
     ExperimentalResult(name, n * other.n, accuracy * other.accuracy,
       (for ((label, classResult) <- classesMap.toList) yield classResult * otherClassesMap(label)).toList
     )
 }
 
 
-case class LabelResult(n: Int, label: SentimentLabel.Type, precision: Double, recall: Double, f: Double, mse: Double) {
-  override def toString(): String = "%15s%5d%11.2f%8.2f%9.2f%9.2f".format(SentimentLabel.toEnglishName(label), n, precision, recall, f, mse)
+case class LabelResult(n: Int, label: SentimentLabel.Type, precision: Double, recall: Double, f: Double) {
+  override def toString(): String = "%15s%5d%11.2f%8.2f%9.2f".format(SentimentLabel.toEnglishName(label), n, precision, recall, f)
 
   def +(other: LabelResult): LabelResult = {
     assert(label == other.label)
-    LabelResult(n + other.n, label, precision + other.precision, recall + other.recall, f + other.f, mse + other.mse)
+    LabelResult(n + other.n, label, precision + other.precision, recall + other.recall, f + other.f)
   }
 
   def *(other: LabelResult): LabelResult = {
     assert(label == other.label)
-    LabelResult(n * other.n, label, precision * other.precision, recall * other.recall, f * other.f, mse * other.mse)
+    LabelResult(n * other.n, label, precision * other.precision, recall * other.recall, f * other.f)
   }
 
-  def /(scalar: Double): LabelResult = LabelResult((n.toFloat / scalar).toInt, label, precision / scalar, recall / scalar, f / scalar, mse / scalar)
+  def /(scalar: Double): LabelResult = LabelResult((n.toFloat / scalar).toInt, label, precision / scalar, recall / scalar, f / scalar)
 
-  def *(scalar: Double): LabelResult = LabelResult((n.toFloat * scalar).toInt, label, precision * scalar, recall * scalar, f * scalar, mse / scalar)
+  def *(scalar: Double): LabelResult = LabelResult((n.toFloat * scalar).toInt, label, precision * scalar, recall * scalar, f * scalar)
 }

File src/main/scala/updown/app/experiment/SplitExperiment.scala

 import updown.util.Statistics
 import org.clapper.argot.{ArgotUsageException, ArgotParser}
 import updown.data.{SystemLabeledTweet, GoldLabeledTweet}
+import java.util.Arrays
 
 abstract class SplitExperiment extends Experiment {
   // this exists purely to make the ArgotConverters appear used to IDEA
   convertByte _
-  val goldTrainSet = parser.option[String](List("e", "test"), "FILE", "gold labeled training data")
-  val goldTestSet = parser.option[String](List("g", "train"), "FILE", "gold labeled test data")
+  val goldTrainSet = parser.option[String](List("G", "train"), "FILE", "gold labeled training data")
+  val goldTestSet = parser.option[String](List("g", "test"), "FILE", "gold labeled test data")
+  val targetsInputFileTest = parser.option[String](List("s", "targetsTest"), "targetsTestFile", "targets (TEST)")
 
   def doExperiment(testSet: List[GoldLabeledTweet], trainSet: List[GoldLabeledTweet]): List[SystemLabeledTweet]
 
   def main(args: Array[String]) {
     try {
       parser.parse(args)
-
       val trainFileName =
         goldTrainSet.value match {
           case Some(filename) => filename
-          case None => parser.usage("You must specify a gold labeled training file via -i.")
+          case None => parser.usage("You must specify a gold labeled training file")
         }
       val testFileName =
         goldTestSet.value match {
           case Some(filename) => filename
-          case None => parser.usage("You must specify a gold labeled test file via -i.")
+          case None => parser.usage("You must specify a gold labeled test file via -g.")
         }
       val result =
       {

File src/main/scala/updown/app/experiment/labelprop/JuntoExperiment.scala

+package updown.app.experiment.labelprop
+
+import opennlp.maxent.io.BinaryGISModelReader
+import org.clapper.argot.ArgotParser._
+import updown.lex.MPQALexicon
+import opennlp.model.AbstractModel
+import upenn.junto.config.{Edge, Label, GraphBuilder}
+import updown.data.{ProbabilityLexicon, SystemLabeledTweet, GoldLabeledTweet, SentimentLabel}
+import java.util.zip.GZIPInputStream
+import java.io.{ObjectInputStream, FileInputStream, DataInputStream}
+import upenn.junto.app.JuntoRunner
+import scala.collection.JavaConversions._
+import updown.app.experiment.{Experiment, StaticExperiment}
+import upenn.junto.graph.Graph
+import com.weiglewilczek.slf4s.Logging
+
+trait JuntoExperiment extends Logging {
+  val DEFAULT_MU1 = .005
+  val DEFAULT_ITERATIONS = 100
+  val DEFAULT_EDGE_SEED_SET = "nfmoe"
+  val nodeRE = """^(.+_)(.+)$""".r
+  val posEmoticons = """:) :D =D =) :] =] :-) :-D :-] ;) ;D ;] ;-) ;-D ;-]""".split(" ")
+  val negEmoticons = """:( =( :[ =[ :-( :-[ :’( :’[ D:""".split(" ")
+
+  val TWEET_ = "tweet_"
+  val USER_ = "user_"
+  val NGRAM_ = "ngram_"
+  val POS = "POS"
+  val NEG = "NEG"
+  val NEU = "NEU"
+
+  // for weighting MPQA seeds
+  val BIG = 0.9
+  val BIG_COMP = .1
+  val SMALL = 0.8
+  val SMALL_COMP = .2
+
+
+  val getNgramWeightFn: (Any, List[GoldLabeledTweet]) => ((String) => Double) =
+    (refCorpusFileOption, trainSet) => {
+      refCorpusFileOption match {
+        case Some(filename: String) =>
+          val refCorpusNgramProbs = loadRefCorpusNgramProbs(filename)
+          val thisCorpusNgramProbs = computeNgramProbs(trainSet)
+          (ngram) => {
+            val numerator = thisCorpusNgramProbs(ngram)
+            val denominator = refCorpusNgramProbs.getNgramProb(ngram)
+
+            if (denominator == 0.0) 0.0 //ngram not found in reference corpus; assume NOT relevant to this corpus
+            else if (numerator > denominator) math.log(numerator / denominator)
+            else 0.0
+          }
+
+        case None => (str) => 1.0
+      }
+    }
+
+  def getTweetNgramEdges(tweets: List[GoldLabeledTweet], getNgramWeight: (String) => Double): List[Edge] = {
+    (for (tweet <- tweets) yield {
+      for (ngram <- tweet.features) yield {
+        val weight = getNgramWeight(ngram)
+        if (weight > 0.0) Some(new Edge(TWEET_ + tweet.id, NGRAM_ + ngram, weight)) else None
+      }
+    }).flatten.flatten
+  }
+
+  def getFollowerEdges(followerGraphFile: String): List[Edge] = {
+    (for (line <- scala.io.Source.fromFile(followerGraphFile, "utf-8").getLines) yield {
+      val tokens = line.split("\t")
+      if (tokens.length < 2 || tokens(0).length == 0 || tokens(1).length == 0) None else Some(new Edge(USER_ + tokens(0), USER_ + tokens(1), 1.0))
+    }).flatten.toList
+  }
+
+  def getUserTweetEdges(tweets: List[GoldLabeledTweet]): List[Edge] = (for (tweet <- tweets) yield new Edge(USER_ + tweet.userid, TWEET_ + tweet.id, 1.0))
+
+  // Seed each tweet vertex with the maxent model's per-class probabilities.
+  // Missing outcomes (index not found in the model) default to probability 0.
+  def getMaxentSeeds(tweets: List[GoldLabeledTweet], model: AbstractModel): List[Label] = {
+    val labels = model.getDataStructures()(2).asInstanceOf[Array[String]]
+    val posIndex = labels.indexOf("1")
+    val negIndex = labels.indexOf("-1")
+    val neuIndex = labels.indexOf("0")
+
+    var nNeut = 0
+    val res =
+    (for (tweet <- tweets) yield {
+      val result = model.eval(tweet.features.toArray)
+      val posProb = if (posIndex >= 0) result(posIndex) else 0.0
+      val negProb = if (negIndex >= 0) result(negIndex) else 0.0
+      val neuProb = if (neuIndex >= 0) result(neuIndex) else 0.0
+      // Count tweets the model scores as most-likely neutral. (The previous
+      // test, `neuIndex > 0`, is constant across the loop and counted every
+      // tweet whenever the model had a neutral outcome at all.)
+      if (neuProb >= posProb && neuProb >= negProb)
+        nNeut += 1
+
+      new Label(TWEET_ + tweet.id, POS, posProb) :: new Label(TWEET_ + tweet.id, NEG, negProb) :: new Label(TWEET_ + tweet.id, NEU, neuProb) :: Nil
+    }).flatten
+    // Diagnostic output only.
+    println("neutrals:%d".format(nNeut))
+    res
+  }
+
+  def getMPQASeeds(lexicon: MPQALexicon): List[Label] = {
+    (for (word <- lexicon.keySet.toList) yield {
+      val entry = lexicon(word)
+      val posWeight =
+        if (entry.isStrong && entry.isPositive) BIG
+        else if (entry.isWeak && entry.isPositive) SMALL
+        else if (entry.isStrong && entry.isNegative) BIG_COMP
+        else /*if(entry.isWeak && entry.isNegative)*/ SMALL_COMP
+
+      val negWeight =
+        if (entry.isStrong && entry.isPositive) BIG_COMP
+        else if (entry.isWeak && entry.isPositive) SMALL_COMP
+        else if (entry.isStrong && entry.isNegative) BIG
+        else /*if(entry.isWeak && entry.isNegative)*/ SMALL
+
+      val neuWeight = 0.5 //Matt has little to no inkling of what is appropriate here.
+
+
+      new Label(NGRAM_ + word, POS, posWeight) :: new Label(NGRAM_ + word, NEG, negWeight) :: new Label(NGRAM_ + word, NEU, neuWeight) :: Nil
+    }).flatten
+  }
+
+  def getEmoticonSeeds(): List[Label] = {
+    (for (emo <- posEmoticons) yield {
+      new Label(NGRAM_ + emo, POS, BIG) ::
+        new Label(NGRAM_ + emo, NEG, BIG_COMP) :: Nil
+    }).toList.flatten :::
+      (for (emo <- negEmoticons) yield {
+        new Label(NGRAM_ + emo, NEG, BIG) ::
+          new Label(NGRAM_ + emo, POS, BIG_COMP) :: Nil
+      }).toList.flatten/* :::
+      (for (emo <- negEmoticons) yield {
+        new Label(NGRAM_ + emo, NEG, BIG) ::
+          new Label(NGRAM_ + emo, POS, BIG_COMP) :: Nil
+      }).toList.flatten*/
+  }
+
+  def createGraph(tweets: List[GoldLabeledTweet], followerGraphFile: String, model: AbstractModel, lexicon: MPQALexicon, edgeSeedSet: String, getNgramWeight: (String) => Double) = {
+    val edges = (if (edgeSeedSet.contains("n")) getTweetNgramEdges(tweets, getNgramWeight) else Nil) :::
+      (if (edgeSeedSet.contains("f")) (getFollowerEdges(followerGraphFile) ::: getUserTweetEdges(tweets)) else Nil)
+    val seeds = (if (edgeSeedSet.contains("m")) getMaxentSeeds(tweets, model) else Nil) :::
+      (if (edgeSeedSet.contains("o")) getMPQASeeds(lexicon) else Nil) :::
+      (if (edgeSeedSet.contains("e")) getEmoticonSeeds else Nil)
+    GraphBuilder(edges, seeds)
+  }
+
+  def loadRefCorpusNgramProbs(filename: String): ProbabilityLexicon /*scala.collection.mutable.HashMap[String, Double]*/ = {
+    val refProbs = new ObjectInputStream(new GZIPInputStream(new FileInputStream(filename))).readObject()
+
+    refProbs match {
+      case refProbLex: ProbabilityLexicon => refProbLex
+      case _ => throw new ClassCastException
+    }
+  }
+
+  def getWordCount(tweets: List[GoldLabeledTweet]): Int = {
+    (for (tweet <- tweets) yield {
+      (for (feature <- tweet.features) yield {
+        1
+      }).sum
+    }).sum
+  }
+
+  // Relative frequency of each ngram feature across the whole corpus.
+  // Unknown ngrams read back as 0.0 via the overridden default.
+  def computeNgramProbs(tweets: List[GoldLabeledTweet]): scala.collection.mutable.HashMap[String, Double] = {
+    val probs = new scala.collection.mutable.HashMap[String, Double] {
+      override def default(s: String) = 0.0
+    }
+    for (tweet <- tweets) {
+      for (feature <- tweet.features) {
+        probs.put(feature, probs(feature) + 1.0)
+      }
+    }
+
+    // Hoisted out of the normalization loop: getWordCount is O(corpus), and
+    // calling it once per vocabulary entry made this O(vocab * corpus).
+    val totalWords = getWordCount(tweets).toDouble
+    probs.foreach(p => probs.put(p._1, p._2 / totalWords))
+
+    probs
+  }
+
+  // Read propagated label scores off the graph: each "tweet_*" vertex becomes
+  // a hard prediction (argmax over POS/NEG/NEU scores, ties resolved
+  // NEU > POS > NEG by the if-chain below); qualifying "ngram_*" vertices are
+  // collected into polarity maps when a top-N output file was requested.
+  // Returns one SystemLabeledTweet per input tweet, Abstained if its vertex
+  // never received a prediction.
+  def evaluateGraphResults(tweets: scala.List[GoldLabeledTweet], graph: Graph, lexicon: MPQALexicon, getNgramWeight: (String) => Double, topNOutputFileOption: Option[String]): List[SystemLabeledTweet] = {
+    val tweetIdsToPredictedLabels = new scala.collection.mutable.HashMap[String, SentimentLabel.Type]
+
+
+
+    logger.debug("testing model")
+    // Per-ngram polarity scores, filled only in the top-N branch below.
+    // NOTE(review): these maps are populated but never written out in this
+    // method — confirm whether the top-N dump was meant to happen here.
+    val ngramsToPositivity = new scala.collection.mutable.HashMap[String, Double]
+    val ngramsToNegativity = new scala.collection.mutable.HashMap[String, Double]
+    val ngramsToNeutrality = new scala.collection.mutable.HashMap[String, Double]
+
+    val thisCorpusNgramProbs = computeNgramProbs(tweets)
+
+    for ((id, vertex) <- graph._vertices) {
+      // Vertex ids look like "tweet_<id>", "user_<id>", "ngram_<token>".
+      val nodeRE(nodeType, nodeName) = id
+      if (nodeType == TWEET_) {
+        val predictions = vertex.GetEstimatedLabelScores
+        val posProb = predictions.get(POS)
+        val negProb = predictions.get(NEG)
+        val neuProb = predictions.get(NEU)
+        val maxProb = math.max(posProb, math.max(negProb, neuProb))
+
+        tweetIdsToPredictedLabels(nodeName) =
+          if (neuProb == maxProb)
+            SentimentLabel.Neutral
+          else if (posProb == maxProb)
+            SentimentLabel.Positive
+          else
+            SentimentLabel.Negative
+      }
+      // Report an ngram only if: top-N output requested, not already in the
+      // MPQA lexicon, at least as probable here as in the reference corpus
+      // (weight >= 1.0), and observed >= 5 times (prob * word count = count).
+      else if (topNOutputFileOption != None && nodeType == NGRAM_ && !lexicon.contains(nodeName)
+        && getNgramWeight(nodeName) >= 1.0 && thisCorpusNgramProbs(nodeName) * getWordCount(tweets) >= 5.0) {
+        val predictions = vertex.GetEstimatedLabelScores
+        val posProb = predictions.get(POS)
+        val negProb = predictions.get(NEG)
+        val neuProb = predictions.get(NEU)
+
+        ngramsToPositivity.put(nodeName, posProb)
+        ngramsToNegativity.put(nodeName, negProb)
+        ngramsToNeutrality.put(nodeName, neuProb)
+
+      }
+    }
+    logger.debug("predicted nPos:%d nNeg:%d nNeu:%d".format(
+      tweetIdsToPredictedLabels.count(i => i._2 == SentimentLabel.Positive),
+      tweetIdsToPredictedLabels.count(i => i._2 == SentimentLabel.Negative),
+      tweetIdsToPredictedLabels.count(i => i._2 == SentimentLabel.Neutral)
+    ))
+    // Tweets whose vertex never got a prediction are marked Abstained.
+    val res = for (tweet <- tweets) yield {
+
+      tweet match {
+        case GoldLabeledTweet(id, userid, features, goldLabel) =>
+          SystemLabeledTweet(id, userid, features, goldLabel,
+            if (tweetIdsToPredictedLabels.contains(id)) {
+              tweetIdsToPredictedLabels(id)
+            } else {
+              SentimentLabel.Abstained
+            })
+      }
+    }
+    res
+  }
+
+
+
+  def after(): Int = 0
+}

File src/main/scala/updown/app/experiment/labelprop/SplitJuntoExperiment.scala

+package updown.app.experiment.labelprop
+
+import opennlp.maxent.io.BinaryGISModelReader
+import updown.lex.MPQALexicon
+import java.io.{FileInputStream, DataInputStream}
+import upenn.junto.app.JuntoRunner
+import updown.app.experiment.{SplitExperiment, StaticExperiment}
+
+import org.clapper.argot.ArgotConverters._
+import upenn.junto.config.GraphBuilder._
+import upenn.junto.config.{Label, GraphBuilder}
+import updown.data.{SentimentLabel, SystemLabeledTweet, GoldLabeledTweet}
+
+object SplitJuntoExperiment extends SplitExperiment with JuntoExperiment {
+
+  import org.clapper.argot.ArgotConverters._
+
+  val modelInputFile = parser.option[String](List("m", "model"), "model", "model input")
+  val mpqaInputFile = parser.option[String](List("p", "mpqa"), "mpqa", "MPQA sentiment lexicon input file")
+  val followerGraphFile = parser.option[String](List("f", "follower-graph"), "follower-graph", "twitter follower graph input file")
+  val followerGraphFileTest = parser.option[String](List("h", "follower-graph-test"), "follower-graph-test", "twitter follower graph input (TEST)")
+  val refCorpusProbsFile = parser.option[String](List("r", "reference-corpus-probabilities"), "ref-corp-probs", "reference corpus probabilities input file")
+
+  val edgeSeedSetOption = parser.option[String](List("e", "edge-seed-set-selection"), "edge-seed-set-selection", "edge/seed set selection")
+  val topNOutputFile = parser.option[String](List("z", "top-n-file"), "top-n-file", "top-n-file")
+
+  val mu1 = parser.option[Double](List("u", "mu1"), "mu1", "seed injection probability")
+  val iterations = parser.option[Int](List("n", "iterations"), "iterations", "number of iterations")
+
+  def createTransductiveGraph(trainTweets: List[GoldLabeledTweet], followerGraphFile: String, testTweets: List[GoldLabeledTweet], followerGraphFileTest: String, edgeSeedSet: String, getNgramWeight: (String) => Double) = {
+    val totalTweets = trainTweets ::: testTweets
+    val edges = (if (edgeSeedSet.contains("n")) getTweetNgramEdges(totalTweets, getNgramWeight) else Nil) :::
+      (if (edgeSeedSet.contains("f")) (getFollowerEdges(followerGraphFile) ::: getUserTweetEdges(totalTweets) :::
+        getFollowerEdges(followerGraphFileTest))
+      else Nil)
+    val seeds = getGoldSeeds(trainTweets)
+    GraphBuilder(edges, seeds)
+  }
+  // Seed each training tweet's vertex with its GOLD label at full weight.
+  def getGoldSeeds(tweets: List[GoldLabeledTweet]): List[Label] = {
+    for (tweet <- tweets) yield {
+      tweet match {
+        // BUGFIX: all three cases previously seeded POS, so every training
+        // tweet injected positive mass regardless of its gold label.
+        case GoldLabeledTweet(id, _, _, SentimentLabel.Positive) => new Label(TWEET_ + id, POS, 1.0)
+        case GoldLabeledTweet(id, _, _, SentimentLabel.Negative) => new Label(TWEET_ + id, NEG, 1.0)
+        case GoldLabeledTweet(id, _, _, SentimentLabel.Neutral) => new Label(TWEET_ + id, NEU, 1.0)
+      }
+    }
+  }
+
+  // Transductive Junto run: build a train+test graph, seed with gold train
+  // labels, propagate, and read predictions back off the test tweets.
+  def doExperiment(trainTweets: List[GoldLabeledTweet], testTweets: List[GoldLabeledTweet]) = {
+    logger.info("performing Junto experiment")
+    logger.debug("loading model")
+    // NOTE(review): `model` is required and loaded but never used below —
+    // the transductive graph is seeded from gold labels, not maxent. Confirm
+    // whether -m should remain mandatory for this experiment.
+    val model =
+      modelInputFile.value match {
+        case Some(filename) =>
+          // Use the matched filename rather than re-reading the option.
+          new BinaryGISModelReader(new DataInputStream(new FileInputStream(filename))).getModel
+        case None =>
+          parser.usage("You must specify a model input file")
+      }
+
+    val lexicon =
+      mpqaInputFile.value match {
+        case Some(filename: String) =>
+          MPQALexicon(filename)
+        case None =>
+          parser.usage("You must specify a lexicon file.")
+      }
+
+
+
+    val edgeSeedSet = edgeSeedSetOption.value.getOrElse(DEFAULT_EDGE_SEED_SET)
+
+    // NOTE(review): ngram weights are derived from the TEST set here, while
+    // the static experiment derives them from its training data — confirm
+    // this is intentional.
+    val getNgramWeight = getNgramWeightFn(refCorpusProbsFile.value, testTweets)
+
+    val graph =
+      (followerGraphFile.value, followerGraphFileTest.value) match {
+        case (Some(filename: String), Some(filenameTest: String)) =>
+          createTransductiveGraph(trainTweets, filename, testTweets, filenameTest, edgeSeedSet, getNgramWeight)
+        case _ =>
+          // Previously a non-exhaustive match: missing either file threw a
+          // MatchError. Fail with a usage message instead.
+          parser.usage("You must specify follower graph files for both train (-f) and test (-h)")
+      }
+
+    logger.debug("running label prop")
+    JuntoRunner(graph, mu1.value.getOrElse(DEFAULT_MU1), .01, .01, iterations.value.getOrElse(DEFAULT_ITERATIONS), false)
+
+
+    val res: List[SystemLabeledTweet] = evaluateGraphResults(testTweets, graph, lexicon, getNgramWeight, topNOutputFile.value)
+    res
+  }
+}

File src/main/scala/updown/app/experiment/labelprop/StaticJuntoExperiment.scala

 import upenn.junto.app.JuntoRunner
 import scala.collection.JavaConversions._
 
-object StaticJuntoExperiment extends StaticExperiment {
-  val DEFAULT_MU1 = .005
-  val DEFAULT_ITERATIONS = 100
-  val DEFAULT_EDGE_SEED_SET = "nfmoe"
-  val nodeRE = """^(.+_)(.+)$""".r
-  val posEmoticons = """:) :D =D =) :] =] :-) :-D :-] ;) ;D ;] ;-) ;-D ;-]""".split(" ")
-  val negEmoticons = """:( =( :[ =[ :-( :-[ :’( :’[ D:""".split(" ")
-
-  val TWEET_ = "tweet_"
-  val USER_ = "user_"
-  val NGRAM_ = "ngram_"
-  val POS = "POS"
-  val NEG = "NEG"
-  val NEU = "NEU"
-
-  // for weighting MPQA seeds
-  val BIG = 0.9
-  val BIG_COMP = .1
-  val SMALL = 0.8
-  val SMALL_COMP = .2
-
+object StaticJuntoExperiment extends StaticExperiment with JuntoExperiment{
   val modelInputFile = parser.option[String](List("m", "model"), "model", "model input")
   val mpqaInputFile = parser.option[String](List("p", "mpqa"), "mpqa", "MPQA sentiment lexicon input file")
   val followerGraphFile = parser.option[String](List("f", "follower-graph"), "follower-graph", "twitter follower graph input file")
   val mu1 = parser.option[Double](List("u", "mu1"), "mu1", "seed injection probability")
   val iterations = parser.option[Int](List("n", "iterations"), "iterations", "number of iterations")
 
-  val getNgramWeightFn: (Any, List[GoldLabeledTweet]) => ((String) => Double) =
-    (refCorpusFileOption, trainSet) => {
-      refCorpusFileOption match {
-        case Some(filename: String) =>
-          val refCorpusNgramProbs = loadRefCorpusNgramProbs(filename)
-          val thisCorpusNgramProbs = computeNgramProbs(trainSet)
-          (ngram) => {
-            val numerator = thisCorpusNgramProbs(ngram)
-            val denominator = refCorpusNgramProbs.getNgramProb(ngram)
-
-            if (denominator == 0.0) 0.0 //ngram not found in reference corpus; assume NOT relevant to this corpus
-            else if (numerator > denominator) math.log(numerator / denominator)
-            else 0.0
-          }
-
-        case None => (str) => 1.0
-      }
-    }
-
-  def getTweetNgramEdges(tweets: List[GoldLabeledTweet], getNgramWeight: (String) => Double): List[Edge] = {
-    (for (tweet <- tweets) yield {
-      for (ngram <- tweet.features) yield {
-        val weight = getNgramWeight(ngram)
-        if (weight > 0.0) Some(new Edge(TWEET_ + tweet.id, NGRAM_ + ngram, weight)) else None
-      }
-    }).flatten.flatten
-  }
-
-  def getFollowerEdges(followerGraphFile: String): List[Edge] = {
-    (for (line <- scala.io.Source.fromFile(followerGraphFile, "utf-8").getLines) yield {
-      val tokens = line.split("\t")
-      if (tokens.length < 2 || tokens(0).length == 0 || tokens(1).length == 0) None else Some(new Edge(USER_ + tokens(0), USER_ + tokens(1), 1.0))
-    }).flatten.toList
-  }
-
-  def getUserTweetEdges(tweets: List[GoldLabeledTweet]): List[Edge] = (for (tweet <- tweets) yield new Edge(USER_ + tweet.userid, TWEET_ + tweet.id, 1.0))
-
-  def getMaxentSeeds(tweets: List[GoldLabeledTweet], model: AbstractModel): List[Label] = {
-    val labels = model.getDataStructures()(2).asInstanceOf[Array[String]]
-    val posIndex = labels.indexOf("1")
-    val negIndex = labels.indexOf("-1")
-    val neuIndex = labels.indexOf("0")
-
-    (for (tweet <- tweets) yield {
-      val result = model.eval(tweet.features.toArray)
-      val posProb = if (posIndex >= 0) result(posIndex) else 0.0
-      val negProb = if (negIndex >= 0) result(negIndex) else 0.0
-      val neuProb = if (neuIndex >= 0) result(neuIndex) else 0.5
-
-      new Label(TWEET_ + tweet.id, POS, posProb) :: new Label(TWEET_ + tweet.id, NEG, negProb) :: new Label(TWEET_ + tweet.id, NEU, neuProb) :: Nil
-    }).flatten
-  }
-
-  def getMPQASeeds(lexicon: MPQALexicon): List[Label] = {
-    (for (word <- lexicon.keySet.toList) yield {
-      val entry = lexicon(word)
-      val posWeight =
-        if (entry.isStrong && entry.isPositive) BIG
-        else if (entry.isWeak && entry.isPositive) SMALL
-        else if (entry.isStrong && entry.isNegative) BIG_COMP
-        else /*if(entry.isWeak && entry.isNegative)*/ SMALL_COMP
-
-      val negWeight =
-        if (entry.isStrong && entry.isPositive) BIG_COMP
-        else if (entry.isWeak && entry.isPositive) SMALL_COMP
-        else if (entry.isStrong && entry.isNegative) BIG
-        else /*if(entry.isWeak && entry.isNegative)*/ SMALL
-
-      val neuWeight = 0.5 //Matt has little to no inkling of what is appropriate here.
-
-
-      new Label(NGRAM_ + word, POS, posWeight) :: new Label(NGRAM_ + word, NEG, negWeight) :: new Label(NGRAM_ + word, NEU, neuWeight) :: Nil
-    }).flatten
-  }
-
-  def getEmoticonSeeds(): List[Label] = {
-    (for (emo <- posEmoticons) yield {
-      new Label(NGRAM_ + emo, POS, BIG) ::
-        new Label(NGRAM_ + emo, NEG, BIG_COMP) :: Nil
-    }).toList.flatten :::
-      (for (emo <- negEmoticons) yield {
-        new Label(NGRAM_ + emo, NEG, BIG) ::
-          new Label(NGRAM_ + emo, POS, BIG_COMP) :: Nil
-      }).toList.flatten/* :::
-      (for (emo <- negEmoticons) yield {
-        new Label(NGRAM_ + emo, NEG, BIG) ::
-          new Label(NGRAM_ + emo, POS, BIG_COMP) :: Nil
-      }).toList.flatten*/
-  }
-
-  def createGraph(tweets: List[GoldLabeledTweet], followerGraphFile: String, model: AbstractModel, lexicon: MPQALexicon, edgeSeedSet: String, getNgramWeight: (String) => Double) = {
-    val edges = (if (edgeSeedSet.contains("n")) getTweetNgramEdges(tweets, getNgramWeight) else Nil) :::
-      (if (edgeSeedSet.contains("f")) (getFollowerEdges(followerGraphFile) ::: getUserTweetEdges(tweets)) else Nil)
-    val seeds = (if (edgeSeedSet.contains("m")) getMaxentSeeds(tweets, model) else Nil) :::
-      (if (edgeSeedSet.contains("o")) getMPQASeeds(lexicon) else Nil) :::
-      (if (edgeSeedSet.contains("e")) getEmoticonSeeds else Nil)
-    GraphBuilder(edges, seeds)
-  }
-
-  def loadRefCorpusNgramProbs(filename: String): ProbabilityLexicon /*scala.collection.mutable.HashMap[String, Double]*/ = {
-    val refProbs = new ObjectInputStream(new GZIPInputStream(new FileInputStream(filename))).readObject()
-
-    refProbs match {
-      case refProbLex: ProbabilityLexicon => refProbLex
-      case _ => throw new ClassCastException
-    }
-  }
-
-  def getWordCount(tweets: List[GoldLabeledTweet]): Int = {
-    (for (tweet <- tweets) yield {
-      (for (feature <- tweet.features) yield {
-        1
-      }).sum
-    }).sum
-  }
-
-  def computeNgramProbs(tweets: List[GoldLabeledTweet]): scala.collection.mutable.HashMap[String, Double] = {
-    val probs = new scala.collection.mutable.HashMap[String, Double] {
-      override def default(s: String) = 0.0
-    }
-    for (tweet <- tweets) {
-      for (feature <- tweet.features) {
-        probs.put(feature, probs(feature) + 1.0)
-      }
-    }
-
-    probs.foreach(p => probs.put(p._1, p._2 / getWordCount(tweets)))
-
-    probs
-  }
 
   def doExperiment(tweets: List[GoldLabeledTweet]) = {
     logger.info("performing Junto experiment")
 
     val lexicon =
       mpqaInputFile.value match {
-        case Some(filename) =>
+        case Some(filename: String) =>
           MPQALexicon(filename)
         case None =>
           parser.usage("You must specify a lexicon file.")
 
     val graph =
       followerGraphFile.value match {
-        case Some(filename) =>
+        case Some(filename: String) =>
           createGraph(tweets, filename, model, lexicon, edgeSeedSet, getNgramWeight)
       }
 
     JuntoRunner(graph, mu1.value.getOrElse(DEFAULT_MU1), .01, .01, iterations.value.getOrElse(DEFAULT_ITERATIONS), false)
 
 
-    val tweetIdsToPredictedLabels = new scala.collection.mutable.HashMap[String, SentimentLabel.Type]
-
-
-
-    logger.debug("testing model")
-    val ngramsToPositivity = new scala.collection.mutable.HashMap[String, Double]
-    val ngramsToNegativity = new scala.collection.mutable.HashMap[String, Double]
-    val ngramsToNeutrality = new scala.collection.mutable.HashMap[String, Double]
-
-    val thisCorpusNgramProbs = computeNgramProbs(tweets)
-
-    for ((id, vertex) <- graph._vertices) {
-      val nodeRE(nodeType, nodeName) = id
-      if (nodeType == TWEET_) {
-        val predictions = vertex.GetEstimatedLabelScores
-        val posProb = predictions.get(POS)
-        val negProb = predictions.get(NEG)
-        val neuProb = predictions.get(NEU)
-        val maxProb = math.max(posProb, math.max(negProb, neuProb))
-
-        tweetIdsToPredictedLabels(nodeName) =
-          if (neuProb == maxProb)
-            SentimentLabel.Neutral
-          else if (posProb == maxProb)
-            SentimentLabel.Positive
-          else
-            SentimentLabel.Negative
-      }
-      else if (topNOutputFile.value != None && nodeType == NGRAM_ && !lexicon.contains(nodeName)
-        && getNgramWeight(nodeName) >= 1.0 && thisCorpusNgramProbs(nodeName) * getWordCount(tweets) >= 5.0) {
-        val predictions = vertex.GetEstimatedLabelScores
-        val posProb = predictions.get(POS)
-        val negProb = predictions.get(NEG)
-        val neuProb = predictions.get(NEU)
-
-        ngramsToPositivity.put(nodeName, posProb)
-        ngramsToNegativity.put(nodeName, negProb)
-        ngramsToNeutrality.put(nodeName, neuProb)
-
-      }
-    }
-    logger.info("predicted nPos:%d nNeg:%d nNeu:%d".format(
-    tweetIdsToPredictedLabels.count(i=>i._2==SentimentLabel.Positive),
-    tweetIdsToPredictedLabels.count(i=>i._2==SentimentLabel.Negative),
-    tweetIdsToPredictedLabels.count(i=>i._2==SentimentLabel.Neutral)
-    ))
-    val res = for (tweet <- tweets) yield {
-
-      tweet match {
-        case GoldLabeledTweet(id, userid, features, goldLabel) =>
-          SystemLabeledTweet(id, userid, features, goldLabel,
-            if (tweetIdsToPredictedLabels.contains(id)) {
-              tweetIdsToPredictedLabels(id)
-            } else {
-              SentimentLabel.Abstained
-            })
-      }
-    }
+    val res: List[SystemLabeledTweet] = evaluateGraphResults(tweets, graph, lexicon, getNgramWeight, topNOutputFile.value)
     res
   }
-
-  def after(): Int = 0
 }

File src/main/scala/updown/util/Statistics.scala

   def averageResults(newName: String, results: scala.List[ExperimentalResult]): ExperimentalResult = {
     var avgAccuracy = 0.0
     var avgN = 0.0
-    var avgLabelResults = scala.collection.mutable.Map[SentimentLabel.Type, LabelResult]().withDefault((label) => LabelResult(0, label, 0.0, 0.0, 0.0, 0.0))
+    var avgLabelResults = scala.collection.mutable.Map[SentimentLabel.Type, LabelResult]().withDefault((label) => LabelResult(0, label, 0.0, 0.0, 0.0))
     // first, sum
     for (ExperimentalResult(name, n, accuracy, classes) <- results) {
       avgAccuracy += accuracy
       avgN += n
-      for (LabelResult(n, label, precision, recall, f, mse) <- classes) {
-        val LabelResult(oN, oLabel, oPrecision, oRecall, oF, oMse) = avgLabelResults(label)
-        avgLabelResults(label) = LabelResult(n + oN, label, precision + oPrecision, recall + oRecall, f + oF, mse + oMse)
+      for (LabelResult(n, label, precision, recall, f) <- classes) {
+        val LabelResult(oN, oLabel, oPrecision, oRecall, oF) = avgLabelResults(label)
+        avgLabelResults(label) = LabelResult(n + oN, label, precision + oPrecision, recall + oRecall, f + oF)
       }
     }
     // then, scale
     val N = results.length
     ExperimentalResult(newName, (avgN / N).toInt, avgAccuracy / N,
-      (for ((_, LabelResult(n, label, precision, recall, f, mse)) <- avgLabelResults.toList.sortBy {
+      (for ((_, LabelResult(n, label, precision, recall, f)) <- avgLabelResults.toList.sortBy {
         case (k, v) => SentimentLabel.ordinality(k)
       }) yield {
-        LabelResult(n / N, label, precision / N, recall / N, f / N, mse / N)
+        LabelResult(n / N, label, precision / N, recall / N, f / N)
       }).toList)
   }
 
           goldList.length
         )
 
-        LabelResult(goldList.length, label, labelPrecision, labelRecall, fScore(labelPrecision, labelRecall), math.pow((goldList.length-systemList.length)/tweets.length, 2))
+        LabelResult(goldList.length, label, labelPrecision, labelRecall, fScore(labelPrecision, labelRecall))
       }).toList)
   }
 
-  def getMSEPerUser(tweets: scala.List[SystemLabeledTweet]): (List[LabelResult], Int) = {
+  def getMSEPerUser(tweets: scala.List[SystemLabeledTweet]): (List[(SentimentLabel.Type,Double)], Int) = {
     var totalError = 0.0;
     var totalErrorPos = 0.0
     var totalErrorNeg = 0.0
     totalErrorNeg /= usersToTweetsFiltered.size
     totalErrorNeu /= usersToTweetsFiltered.size
 
-    (List(LabelResult(-1, SentimentLabel.Positive, -1, -1, -1, totalErrorPos),
-      LabelResult(-1, SentimentLabel.Negative, -1, -1, -1, totalErrorNeg),
-      LabelResult(-1, SentimentLabel.Neutral, -1, -1, -1, totalErrorNeu)), usersToTweetsFiltered.size)
+    (List((SentimentLabel.Positive, totalErrorPos),
+      (SentimentLabel.Negative, totalErrorNeg),
+      (SentimentLabel.Neutral, totalErrorNeu)), usersToTweetsFiltered.size)
   }
 
   def getEvalStatsPerTarget(resultName: String, tweets: scala.List[TargetedSystemLabeledTweet]): (List[ExperimentalResult], Int) = {