Commits

Anonymous committed 1f85359 Merge

merge

Files changed (5)

 #!/bin/bash
 
 JARS=`echo $UPDOWN_DIR/lib/*.jar $UPDOWN_DIR/lib_managed/*/*.jar $UPDOWN_DIR/output/*.jar $UPDOWN_DIR/target/*.jar | tr ' ' ':'`
-SCALA_LIB="$UPDOWN_DIR/project/boot/scala-2.9.0/lib/scala-library.jar"
+SCALA_LIB="$UPDOWN_DIR/project/boot/scala-2.9.1/lib/scala-library.jar"
 
 CP=$UPDOWN_DIR/target/classes:$SCALA_LIB:$JARS:$CLASSPATH
 

project/build.properties

 project.name=updown
 sbt.version=0.7.7
 project.version=0.1
-build.scala.versions=2.9.0
+build.scala.versions=2.9.1

src/main/scala/updown/app/JuntoClassifier.scala

 
   val nodeRE = """^(.+_)(.+)$""".r
 
-  var refCorpusNgramProbs: scala.collection.mutable.HashMap[String, Double] = null
+  var refCorpusNgramProbs: ProbabilityLexicon/*scala.collection.mutable.HashMap[String, Double]*/ = null
   var thisCorpusNgramProbs: scala.collection.mutable.HashMap[String, Double] = null
 
   var wordCount = 0
       }).toList.flatten
   }
 
-  def loadRefCorpusNgramProbs(filename: String): scala.collection.mutable.HashMap[String, Double] = {
+  def loadRefCorpusNgramProbs(filename: String): ProbabilityLexicon/*scala.collection.mutable.HashMap[String, Double]*/ = {
     val gis = new GZIPInputStream(new FileInputStream(filename))
     val ois = new ObjectInputStream(gis)
     val refProbs = ois.readObject()
 
     refProbs match {
-      case refProbsHM: scala.collection.mutable.HashMap[String, Double] => refProbsHM
+      //case refProbsHM: scala.collection.mutable.HashMap[String, Double] => refProbsHM
+      case refProbLex: ProbabilityLexicon => refProbLex
       case _ => throw new ClassCastException
     }
   }
       return 1.0
     else {
       val numerator = thisCorpusNgramProbs(ngram)
-      val denominator = refCorpusNgramProbs(ngram)
+      val denominator = refCorpusNgramProbs.getNgramProb(ngram)
 
       if (denominator == 0.0) //ngram not found in reference corpus; assume NOT relevant to this corpus
         return 0.0
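
For context, a minimal sketch (not the committed method) of the relevance score this hunk feeds: thisProb stands in for the HashMap lookup and refProb for the new ProbabilityLexicon lookup, which yields 0.0 for ngrams it has never observed.

    def relevanceSketch(thisProb: Double, refProb: Double): Double =
      if (refProb == 0.0) 0.0   // ngram absent from the reference corpus: assume not relevant
      else thisProb / refProb   // ratio of this-corpus to reference-corpus probability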

src/main/scala/updown/data/NgramProbabilityCalculator.scala

 object NgramProbabilityCalculator {
 
   val COUNT_THRESHOLD = 5.0
-  val ngramProbabilities = new scala.collection.mutable.HashMap[String, Double] { override def default(s: String) = 0.0 }
+  //val underThresholdProbs = new scala.collection.mutable.HashMap[String, Double] { override def default(s: String) = 0.0 }
+  //val ngramProbabilities = new scala.collection.mutable.HashMap[String, Double] { override def default(s: String) = 0.0 }
+  val probLex = new ProbabilityLexicon
   var tweetCount = 0
   var tweetsToProcess = 1000
-  var wordCount = 0
+  //var wordCount = 0
 
   def main(args: Array[String]) = {
     val inFile = new File(args(0))
       processFile(inFile)
     }
 
-    ngramProbabilities.foreach(p => if(p._2 < COUNT_THRESHOLD) ngramProbabilities.remove(p._1))
+    //ngramProbabilities.foreach(p => if(p._2 < COUNT_THRESHOLD) ngramProbabilities.remove(p._1))
     
     //val ngramProbabilitiesPruned = ngramProbabilities.filter(_._2 >= COUNT_THRESHOLD)
 
-    ngramProbabilities/*Pruned*/.foreach(p => ngramProbabilities/*Pruned*/.put(p._1, p._2 / wordCount))
+    //ngramProbabilities/*Pruned*/.foreach(p => ngramProbabilities/*Pruned*/.put(p._1, p._2 / wordCount))
 
-    println("Final word count was " + wordCount)
+    println("Final word count was " + probLex.size)
 
     /*println(ngramProbabilities("lol"))
     println(ngramProbabilities("the"))
     print("Serializing to " + args(1) + " ...");
     val gos = new GZIPOutputStream(new FileOutputStream(args(1)))
     val oos = new ObjectOutputStream(gos)
-    oos.writeObject(ngramProbabilities/*Pruned*/)
+    oos.writeObject(probLex/*ngramProbabilities/*Pruned*/*/)
     oos.close()
     println("done.")
   }
         val tokens = tweet.split(" ")
         if(tokens.length >= 1) {
           val unigrams = tokens.map(StringUtil.preprocessKeepHash(_)).toList
-          wordCount += unigrams.length
+          //wordCount += unigrams.length
           val bigramsFromUnigrams =
             if(unigrams.length >= 2)
               unigrams.sliding(2).map(bi => bi(0)+" "+bi(1)).toList
 
           for(ngram <- unigrams ::: bigrams) {
             //println("adding: " + ngram)
-            ngramProbabilities.put(ngram, ngramProbabilities(ngram) + 1.0)
+            probLex.observeNgram(ngram)
           }
         }
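
The calculator now accumulates counts in probLex and serializes the whole lexicon, which JuntoClassifier deserializes above. A minimal round-trip sketch under the assumption that ProbabilityLexicon is Serializable; the path probs.ser.gz is hypothetical:

    import java.io._
    import java.util.zip.{GZIPInputStream, GZIPOutputStream}
    import updown.data.ProbabilityLexicon

    val lex = new ProbabilityLexicon
    lex.observeNgram("example")                    // hypothetical single observation

    // write, as NgramProbabilityCalculator does
    val oos = new ObjectOutputStream(new GZIPOutputStream(new FileOutputStream("probs.ser.gz")))
    oos.writeObject(lex)
    oos.close()

    // read back, as JuntoClassifier.loadRefCorpusNgramProbs does
    val ois = new ObjectInputStream(new GZIPInputStream(new FileInputStream("probs.ser.gz")))
    val restored = ois.readObject() match {
      case p: ProbabilityLexicon => p
      case _ => throw new ClassCastException
    }
    ois.close()
    println(restored.getNgramProb("example"))      // 1.0: the only unigram observed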
 

src/main/scala/updown/data/ProbabilityLexicon.scala

+package updown.data
+
+// Integer-keyed unigram and bigram counts with probability lookups, replacing the
+// raw HashMap[String, Double]; it is written to disk with ObjectOutputStream by
+// NgramProbabilityCalculator and read back by JuntoClassifier, so it must be Serializable.
+class ProbabilityLexicon extends Serializable {
+
+  var vocabSize = 0
+  val wordsToInts = new scala.collection.mutable.HashMap[String, Int] { override def default(s: String) = -1 }
+
+  val unigramCounts = new scala.collection.mutable.HashMap[Int, Int] { override def default(n: Int) = 0 }
+  var totalUnigramCount = 0
+
+  val bigramCounts = new scala.collection.mutable.HashMap[(Int, Int), Int] { override def default(t: (Int, Int)) = 0 }
+
+  // a space-separated ngram is recorded as a bigram; anything else as a unigram
+  def observeNgram(ngram: String): Unit = {
+    if(ngram.contains(" ")) {
+      val tokens = ngram.split(" ")
+      observeBigram(tokens(0), tokens(1))
+    }
+    else {
+      observeUnigram(ngram)
+    }
+  }
+
+  def observeUnigram(word: String): Unit = {
+    if(!wordsToInts.contains(word)) {
+      wordsToInts.put(word, vocabSize)
+      vocabSize += 1
+    }
+
+    val wordID = wordsToInts(word)
+    unigramCounts.put(wordID, unigramCounts(wordID) + 1)    
+
+    totalUnigramCount += 1
+  }
+
+  def observeBigram(word1: String, word2: String): Unit = {
+
+    if(!wordsToInts.contains(word1)) {
+      wordsToInts.put(word1, vocabSize)
+      vocabSize += 1
+    }
+    if(!wordsToInts.contains(word2)) {
+      wordsToInts.put(word2, vocabSize)
+      vocabSize += 1
+    }
+
+    val idPair = (wordsToInts(word1), wordsToInts(word2))
+
+    bigramCounts.put(idPair, bigramCounts(idPair) + 1)
+  }
+
+  def getNgramProb(ngram: String): Double = {
+    if(ngram.contains(" ")) {
+      val tokens = ngram.split(" ")
+      getBigramProb(tokens(0), tokens(1))
+    }
+    else
+      getUnigramProb(ngram)
+  }
+
+  def getUnigramProb(word: String): Double = {
+    // unseen words map to ID -1 with count 0, so this returns 0.0 for them
+    unigramCounts(wordsToInts(word)).toDouble / totalUnigramCount
+  }
+
+  def getBigramProb(word1: String, word2: String): Double = {
+    // note: bigram counts are normalized by the total unigram count, not a bigram total
+    bigramCounts((wordsToInts(word1), wordsToInts(word2))).toDouble / totalUnigramCount
+  }
+
+  // total number of unigram observations; reported as the word count by NgramProbabilityCalculator
+  def size = totalUnigramCount
+}
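
A short usage sketch of the new class, following the preprocessing convention above (space-separated bigrams, plain unigrams); the sample strings are made up:

    import updown.data.ProbabilityLexicon

    val lex = new ProbabilityLexicon
    List("the", "the", "cat", "the cat").foreach(lex.observeNgram)
    println(lex.getNgramProb("the"))      // 2 / 3: two of three observed unigrams
    println(lex.getNgramProb("the cat"))  // 1 / 3: one bigram over the unigram total
    println(lex.size)                     // 3 unigram observations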