Commits

Mike Speriosu committed 0613bc8

Updated JuntoClassifier and Shamma preproc.

  • Parent commits be0e5a9


Files changed (2)

File src/main/scala/updown/app/JuntoClassifier.scala

 
     if(topNOutputFile.value != None) {
       val tnout = new BufferedWriter(new FileWriter(topNOutputFile.value.get))
-      val topNPos = ngramsToPositivity.toList/*.filterNot(p => lexicon.contains(p._1))*/.sortWith((x, y) => x._2 >= y._2).slice(0, TOP_N)
-      val topNNeg = ngramsToPositivity.toList.sortWith((x, y) => x._2 <= y._2).slice(0, TOP_N)//ngramsToNegativity.toList/*.filterNot(p => lexicon.contains(p._1))*/.sortWith((x, y) => x._2 >= y._2).slice(0, TOP_N)
+      //val topNPos = ngramsToPositivity.toList/*.filterNot(p => lexicon.contains(p._1))*/.sortWith((x, y) => x._2 >= y._2).slice(0, TOP_N)
+      //val topNNeg = ngramsToPositivity.toList.sortWith((x, y) => x._2 <= y._2).slice(0, TOP_N)//ngramsToNegativity.toList/*.filterNot(p => lexicon.contains(p._1))*/.sortWith((x, y) => x._2 >= y._2).slice(0, TOP_N)
 
-      topNPos.foreach(p => tnout.write(p._1+" "+p._2+"\n"))
-      tnout.write("\n\n\n")
-      topNNeg.foreach(p => tnout.write(p._1+" "+p._2+"\n"))
+      val ngramsToRatios = ngramsToPositivity.toList.map(p => (p._1, p._2 / ngramsToNegativity(p._1)))
+
+      //topNPos.foreach(p => tnout.write(p._1+" "+p._2+"\n"))
+      //tnout.write("\n\n\n")
+      //topNNeg.foreach(p => tnout.write(p._1+" "+p._2+"\n"))
+      val mostPos = ngramsToRatios.sortWith((x, y) => x._2 >= y._2).slice(0, TOP_N)
+      mostPos.foreach(p => tnout.write(p._1+"\t"+p._2+"\n"))
+      mostPos.foreach(p => tnout.write(p._1+", "))
+      tnout.write("\n\n\n\n")
+      val mostNeg = ngramsToRatios.sortWith((x, y) => x._2 <= y._2).slice(0, TOP_N)
+      mostNeg.foreach(p => tnout.write(p._1+"\t"+p._2+"\n"))
+      mostNeg.foreach(p => tnout.write(p._1+", "))
+      tnout.write("\n")
 
       tnout.close
     }
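
The block added above ranks each n-gram by the ratio of its propagated positivity score to its negativity score, then writes the TOP_N highest- and lowest-ratio n-grams to the top-N output file. Below is a minimal, runnable sketch of that ranking, with invented score maps and values standing in for ngramsToPositivity and ngramsToNegativity:

    // A minimal sketch of the ratio ranking, assuming the two score maps already
    // exist; the maps, values, and TOP_N here are invented for illustration.
    object RatioRankingSketch extends App {
      val TOP_N = 2
      val ngramsToPositivity = Map("great" -> 0.9, "mccain" -> 0.5, "awful" -> 0.1)
      val ngramsToNegativity = Map("great" -> 0.1, "mccain" -> 0.5, "awful" -> 0.9)

      // Score each n-gram by how much positive mass it carries relative to negative mass.
      val ngramsToRatios =
        ngramsToPositivity.toList.map { case (ngram, pos) => (ngram, pos / ngramsToNegativity(ngram)) }

      // Highest ratios are the most positive n-grams, lowest the most negative.
      val mostPos = ngramsToRatios.sortBy(-_._2).take(TOP_N)
      val mostNeg = ngramsToRatios.sortBy(_._2).take(TOP_N)

      mostPos.foreach { case (ngram, r) => println(ngram + "\t" + r) }
      println()
      mostNeg.foreach { case (ngram, r) => println(ngram + "\t" + r) }
    }

Running it prints the two highest-ratio (most positive) and two lowest-ratio (most negative) n-grams with their ratios, mirroring what the commit writes to tnout.
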
         val weight = getNgramWeight(ngram)
         //println(TWEET_ + tweet.id + "   " + NGRAM_ + ngram + "   " + weight)
         if(weight > 0.0) {
+          //if(ngram == "mccain") println("mccain: " + weight)
           Some(new Edge(TWEET_ + tweet.id, NGRAM_ + ngram, weight))
         }
         else
       thisCorpusNgramProbs = computeNgramProbs(totalTweets)
     }
 
-    val graph = createTransductiveGraph(trainTweets, /*followerGraphFile.value.get, */ testTweets, /*followerGraphFileTest.value.get, */ edgeSeedSet)
+    val graph = createTransductiveGraph(trainTweets, followerGraphFile.value.get, testTweets, followerGraphFileTest.value.get, edgeSeedSet)
 
     JuntoRunner(graph, mu1.value.getOrElse(DEFAULT_MU1), .01, .01, iterations.value.getOrElse(DEFAULT_ITERATIONS), false)
 
     }
   }
 
-  def createTransductiveGraph(trainTweets: List[Tweet], /*followerGraphFile: String, */ testTweets: List[Tweet], /* followerGraphFileTest: String, */ edgeSeedSet: String) = {
+  def createTransductiveGraph(trainTweets: List[Tweet], followerGraphFile: String, testTweets: List[Tweet],  followerGraphFileTest: String, edgeSeedSet: String) = {
     val totalTweets = trainTweets ::: testTweets
-    val edges = (if(edgeSeedSet.contains("n")) getTweetNgramEdges(totalTweets) else Nil) /*:::
+    val edges = (if(edgeSeedSet.contains("n")) getTweetNgramEdges(totalTweets) else Nil) :::
                 (if(edgeSeedSet.contains("f")) (getFollowerEdges(followerGraphFile) ::: getUserTweetEdges(totalTweets) :::
-                                                getFollowerEdges(followerGraphFileTest)) else Nil)*/
+                                                getFollowerEdges(followerGraphFileTest)) else Nil)
     val seeds = getGoldSeeds(trainTweets)
     /*val seeds = (if(edgeSeedSet.contains("m")) getMaxentSeeds(tweets, modelInputFile) else Nil) :::
                 (if(edgeSeedSet.contains("o")) getMPQASeeds(MPQALexicon(mpqaInputFile)) else Nil) :::
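
With the follower-graph arguments restored, createTransductiveGraph now combines tweet-to-n-gram edges (enabled by "n" in edgeSeedSet) with follower-to-follower and user-to-tweet edges over both the training and test follower graphs (enabled by "f"). The sketch below shows roughly how that edge set is assembled; Tweet, Edge, and the helper functions are simplified stand-ins rather than the actual updown/Junto implementations, and the follower-file parsing is elided:

    // A rough sketch of the transductive edge assembly with the follower graphs
    // switched back on. All types and helpers below are illustrative stand-ins.
    object TransductiveGraphSketch {
      case class Tweet(id: String, userid: String, ngrams: List[String])
      case class Edge(from: String, to: String, weight: Double)

      def getTweetNgramEdges(tweets: List[Tweet]): List[Edge] =
        for (t <- tweets; ng <- t.ngrams) yield Edge("tweet_" + t.id, "ngram_" + ng, 1.0)

      def getUserTweetEdges(tweets: List[Tweet]): List[Edge] =
        tweets.map(t => Edge("user_" + t.userid, "tweet_" + t.id, 1.0))

      def getFollowerEdges(followerGraphFile: String): List[Edge] =
        Nil // would parse follower/followee pairs from the given file

      def buildEdges(edgeSeedSet: String,
                     trainTweets: List[Tweet], testTweets: List[Tweet],
                     followerGraphFile: String, followerGraphFileTest: String): List[Edge] = {
        val totalTweets = trainTweets ::: testTweets
        // "n" turns on tweet--n-gram edges; "f" turns on follower and user--tweet edges.
        (if (edgeSeedSet.contains("n")) getTweetNgramEdges(totalTweets) else Nil) :::
        (if (edgeSeedSet.contains("f"))
           getFollowerEdges(followerGraphFile) ::: getUserTweetEdges(totalTweets) :::
             getFollowerEdges(followerGraphFileTest)
         else Nil)
      }
    }
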

File src/main/scala/updown/preproc/PreprocShammaTweets.scala

 
     var numTweets = 0
     var numPosTweets = 0
+    var averageIAA = 0.0
     for(line <- lines) {
 
       val roughTokens = line.split("\t")
           //println(ratings.length)
           //println("posFraction: " + posFraction)
           //println("negFraction: " + negFraction)
-          if(math.max(posFraction, negFraction) > .5) {
+          val majorityFraction = math.max(posFraction, negFraction)
+          if(majorityFraction > .5) {
             val label = if(posFraction > negFraction) "1" else "-1"
             if(label == "1") numPosTweets += 1
             numTweets += 1
+            averageIAA += majorityFraction
             
             val tokens = BasicTokenizer(tweet)//TwokenizeWrapper(tweet)
             val features = tokens.filterNot(stoplist(_)) ::: StringUtil.generateBigrams(tokens)
     }
 
     System.err.println("Preprocessed " + numTweets + " tweets. Fraction positive: " + (numPosTweets.toFloat/numTweets))
+    System.err.println("Average inter-annotator agreement: " + averageIAA/numTweets)
   }
 }
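
The preprocessor now also tracks inter-annotator agreement: for every tweet that has a majority label, its majority fraction is accumulated, and the mean is reported alongside the positive-fraction statistic. A small sketch of that bookkeeping, assuming the per-tweet annotator ratings are already parsed (the rating lists below are invented):

    // A small sketch of the agreement bookkeeping added in this commit;
    // the rating lists are illustrative, not from the Shamma data.
    object AverageIAASketch extends App {
      val ratingsPerTweet = List(List("1", "1", "-1"), List("-1", "-1", "-1"), List("1", "-1"))

      var numTweets = 0
      var averageIAA = 0.0
      for (ratings <- ratingsPerTweet) {
        val posFraction = ratings.count(_ == "1").toDouble / ratings.length
        val negFraction = ratings.count(_ == "-1").toDouble / ratings.length
        val majorityFraction = math.max(posFraction, negFraction)
        if (majorityFraction > .5) {     // only tweets with a clear majority label are kept
          numTweets += 1
          averageIAA += majorityFraction // running sum; divided by numTweets below
        }
      }
      System.err.println("Average inter-annotator agreement: " + averageIAA / numTweets)
    }
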