Commits

Mike Speriosu committed d42a03e

Finished making reference corpus n-gram probabilities more memory efficient.

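The savings come from two changes visible in the diffs below: words are interned to Int ids, so bigram counts are keyed on (Int, Int) pairs rather than String pairs, and n-grams whose counts fall below a threshold are pruned before the model is serialized. A minimal standalone sketch of the interning idea (the object name and main method here are hypothetical, not the project's API):

    // Minimal sketch of word interning: each distinct word gets a small Int id,
    // so n-gram counts key on Ints rather than full Strings.
    object InternSketch {
      val wordsToInts = new scala.collection.mutable.HashMap[String, Int] {
        override def default(s: String) = -1
      }
      var vocabSize = 0

      def intern(word: String): Int = {
        if (!wordsToInts.contains(word)) {
          wordsToInts.put(word, vocabSize)
          vocabSize += 1
        }
        wordsToInts(word)
      }

      def main(args: Array[String]): Unit = {
        val bigramCounts = new scala.collection.mutable.HashMap[(Int, Int), Int] {
          override def default(k: (Int, Int)) = 0
        }
        val key = (intern("good"), intern("luck"))
        bigramCounts.put(key, bigramCounts(key) + 1)
        println(bigramCounts) // e.g. Map((0,1) -> 1)
      }
    }
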
Files changed (4)

src/main/resources/eng/model/ngramProbs.ser.gz

Binary file modified.

src/main/scala/updown/app/JuntoClassifier.scala

     val edgeSeedSet = edgeSeedSetOption.value.getOrElse(DEFAULT_EDGE_SEED_SET)
 
     val tweets = TweetFeatureReader(goldInputFile.value.get)
+    //tweets.foreach(println)
 
     if (refCorpusProbsFile.value != None) {
       refCorpusNgramProbs = loadRefCorpusNgramProbs(refCorpusProbsFile.value.get)

…

     else {
       val numerator = thisCorpusNgramProbs(ngram)
       val denominator = refCorpusNgramProbs.getNgramProb(ngram)
+      //println(ngram+" "+denominator)
 
       if (denominator == 0.0) //ngram not found in reference corpus; assume NOT relevant to this corpus
         return 0.0

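For context, the weight computed in the second hunk above is the ratio of an n-gram's probability in the tweet corpus to its probability in the reference corpus, with a fallback of 0.0 when the reference corpus has never seen the n-gram. A self-contained sketch of that logic, where the two Maps are made-up stand-ins for thisCorpusNgramProbs and refCorpusNgramProbs.getNgramProb:

    object NgramWeightSketch {
      // Hypothetical probabilities standing in for the two corpora.
      val thisCorpusProbs = Map("lol" -> 0.004, "the" -> 0.05).withDefaultValue(0.0)
      val refCorpusProbs  = Map("the" -> 0.06).withDefaultValue(0.0)

      def getNgramWeight(ngram: String): Double = {
        val numerator = thisCorpusProbs(ngram)
        val denominator = refCorpusProbs(ngram)
        if (denominator == 0.0) 0.0 // ngram not found in reference corpus; assume NOT relevant
        else numerator / denominator
      }

      def main(args: Array[String]): Unit = {
        println(getNgramWeight("lol")) // 0.0: absent from the reference corpus
        println(getNgramWeight("the")) // ~0.83: common in both corpora
      }
    }
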
src/main/scala/updown/data/NgramProbabilityCalculator.scala

 
 object NgramProbabilityCalculator {
 
-  val COUNT_THRESHOLD = 5.0
+  val COUNT_THRESHOLD = 10
   //val underThresholdProbs = new scala.collection.mutable.HashMap[String, Double] { override def default(s: String) = 0.0 }
   //val ngramProbabilities = new scala.collection.mutable.HashMap[String, Double] { override def default(s: String) = 0.0 }
   val probLex = new ProbabilityLexicon
 
     //ngramProbabilities/*Pruned*/.foreach(p => ngramProbabilities/*Pruned*/.put(p._1, p._2 / wordCount))
 
-    println("Final word count was " + probLex.size)
+    println("Words: " + probLex.totalUnigramCount + ", vocab: " + probLex.vocabSize)
+    println("Removing words under threshold of " + COUNT_THRESHOLD)
+    probLex.removeUnderThreshold(COUNT_THRESHOLD)
+    println("Words: " + probLex.totalUnigramCount + ", vocab: " + probLex.vocabSize)
 
     /*println(ngramProbabilities("lol"))
     println(ngramProbabilities("the"))
   }
 
   def processFile(file: File): Unit = {
-    println(file)
+    println(file + " tweets: " + tweetCount + " words: " + probLex.totalUnigramCount + " vocab: " + probLex.vocabSize)
     
     val fileInputStream = new FileInputStream(file)
     fileInputStream.read(); // otherwise null pointer

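The calculator's new flow is count, report, prune, report again, so the effect of the threshold shows up in the log. A toy version of that flow using the same HashMap-with-default idiom as the real code (the object name and input string are made up):

    object PruneFlowSketch {
      val COUNT_THRESHOLD = 3 // the real calculator now uses 10

      def main(args: Array[String]): Unit = {
        val counts = new scala.collection.mutable.HashMap[String, Int] {
          override def default(s: String) = 0
        }
        "the the the the lol lol brb".split(" ").foreach(w => counts.put(w, counts(w) + 1))

        println("Words: " + counts.values.sum + ", vocab: " + counts.size)
        // Collect keys first; removing while iterating a mutable HashMap is unsafe.
        val rare = counts.filter(_._2 < COUNT_THRESHOLD).keys.toList
        rare.foreach(counts.remove)
        println("Words: " + counts.values.sum + ", vocab: " + counts.size)
      }
    }
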
src/main/scala/updown/data/ProbabilityLexicon.scala

 package updown.data
 
-class ProbabilityLexicon {
+class ProbabilityLexicon extends Serializable {
 
   var vocabSize = 0
   val wordsToInts = new scala.collection.mutable.HashMap[String, Int] { override def default(s: String) = -1 }
   def observeUnigram(word: String): Unit = {
     if(!wordsToInts.contains(word)) {
       wordsToInts.put(word, vocabSize)
+      //intsToWords.put(vocabSize, word)
       vocabSize += 1
     }
 
 
     if(!wordsToInts.contains(word1)) {
       wordsToInts.put(word1, vocabSize)
+      //intsToWords.put(vocabSize, word1)
       vocabSize += 1
     }
     if(!wordsToInts.contains(word2)) {
       wordsToInts.put(word2, vocabSize)
+      //intsToWords.put(vocabSize, word2)
       vocabSize += 1
     }
 
     bigramCounts((wordsToInts(word1), wordsToInts(word2))).toDouble / totalUnigramCount
   }
 
-  def size = totalUnigramCount
+  // Prune every unigram and bigram whose count falls below the threshold,
+  // shrinking the lexicon (and its serialized form) accordingly.
+  def removeUnderThreshold(threshold: Int): Unit = {
+
+    // Temporary reverse index so rare words can be removed from wordsToInts
+    // without a linear scan per word.
+    val intsToWords = new scala.collection.mutable.HashMap[Int, String]
+    wordsToInts.foreach(p => intsToWords.put(p._2, p._1))
+
+    // Collect the keys first: removing entries from a mutable HashMap while
+    // iterating over it is not safe.
+    val rareBigrams = bigramCounts.filter(_._2 < threshold).keys.toList
+    rareBigrams.foreach(bigramCounts.remove)
+
+    val rareUnigrams = unigramCounts.filter(_._2 < threshold).toList
+    rareUnigrams.foreach { case (wordId, count) =>
+      totalUnigramCount -= count
+      vocabSize -= 1
+      unigramCounts.remove(wordId)
+      //wordsToInts.remove(wordsToInts.find(w => w._2 == wordId).get._1) // super slow but memory-efficient way to do this
+      wordsToInts.remove(intsToWords(wordId))
+      intsToWords.remove(wordId)
+    }
+  }
+
 }
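
The new extends Serializable is what allows the pruned lexicon to be written out as the ngramProbs.ser.gz binary at the top of this commit. A sketch of that round trip, assuming plain Java object serialization over a GZIP stream; the project's actual save/load code may differ:

    import java.io._
    import java.util.zip.{GZIPInputStream, GZIPOutputStream}

    // Hypothetical stand-in for ProbabilityLexicon, kept tiny for the example.
    @SerialVersionUID(1L)
    class TinyLexicon extends Serializable {
      val wordsToInts = new scala.collection.mutable.HashMap[String, Int]
    }

    object SerializeSketch {
      def main(args: Array[String]): Unit = {
        val lex = new TinyLexicon
        lex.wordsToInts.put("lol", 0)

        // Write the model once, gzipped, as the calculator does.
        val out = new ObjectOutputStream(new GZIPOutputStream(new FileOutputStream("tiny.ser.gz")))
        out.writeObject(lex)
        out.close() // close() finishes the GZIP stream and flushes to disk

        // Read it back, as JuntoClassifier does at startup.
        val in = new ObjectInputStream(new GZIPInputStream(new FileInputStream("tiny.ser.gz")))
        val loaded = in.readObject().asInstanceOf[TinyLexicon]
        in.close()
        println(loaded.wordsToInts) // e.g. Map(lol -> 0)
      }
    }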