Commits

Anonymous committed b841bd7

added some more tests

  • Participants
  • Parent commits c7e5721

Comments (0)

Files changed (7)

 
 elif [ $CMD = 'scct' ]; then
 
-    $JAVA_COMMAND -jar $UPDOWN_DIR/project/build/sbt-launch-0.7.7.jar test-coverage "$@"
+    $JAVA_COMMAND -jar $UPDOWN_DIR/project/build/sbt-launch-0.7.7.jar clean test-coverage "$@"
     firefox $UPDOWN_DIR/target/coverage-report/index.html
 
 elif [ $CMD = 'preproc-stanford' ]; then

src/main/scala/updown/preproc/PreprocEmoticonTweets.scala

 import java.io._
 
 import org.clapper.argot._
+import updown.data.SentimentLabel
 
 object PreprocEmoticonTweets {
   import ArgotConverters._
   
   val topNUnigrams = new scala.collection.mutable.HashMap[String, Int] { override def default(s: String) = 0 }
   val TOP_N = 1000
-  
-  // TODO: refactor this into a seperate enum
-  val NUM_POS = "1"
-  val NUM_NEU = "0"
-  val NUM_NEG = "-1"
 
   val parser = new ArgotParser("updown run updown.preproc.PreprocEmoticonTweets", preUsage=Some("Updown"))
   val inputPositiveFile = parser.option[String](List("p","positive"),"positive", "text file with positive emoticons")
     val engDict = scala.io.Source.fromFile(dictFile.value.get,"utf-8").getLines.toSet
     val countTopN = if(countArg.value == None) false else true
     
-    preprocFile(inputPositiveFile.value.get, NUM_POS, out, stoplist, engDict, countTopN) //happy
-    preprocFile(inputNegativeFile.value.get, NUM_NEG, out, stoplist, engDict, countTopN) //sad
+    preprocFile(inputPositiveFile.value.get, SentimentLabel.Positive, out, stoplist, engDict, countTopN) //happy
+    preprocFile(inputNegativeFile.value.get, SentimentLabel.Negative, out, stoplist, engDict, countTopN) //sad
     
     /*
      * 
      *I have no idea what a neutral emoticon is
      */
-    //preprocFile(args(2), NUM_NEU, out, stoplist, engDict, countTopN)
+    //preprocFile(args(2), SentimentLabel.Neutral, out, stoplist, engDict, countTopN)
     out.close
 
     if(countTopN) {
     }
   }
 
-  def preprocFile(inFilename: String, label: String, out: OutputStreamWriter, stoplist: Set[String], engDict: Set[String], countTopN: Boolean) = {
+  def preprocFile(inFilename: String, label: SentimentLabel.Type, out: OutputStreamWriter, stoplist: Set[String], engDict: Set[String], countTopN: Boolean) = {
     for(line <- scala.io.Source.fromFile(inFilename,"utf-8").getLines) {
       val tokens = BasicTokenizer(line)//TwokenizeWrapper(line)
       if(isEnglish(tokens, engDict)) {

src/main/scala/updown/preproc/StripIds.scala

 
   val lineRE = """^[^|]+\|[^|]+\|(.*)$""".r
 
-  def main(args: Array[String]) = {
-    for(line <- scala.io.Source.fromFile(args(0),"utf-8").getLines) {
+  def main(args: Array[String]) {
+    for(line <- scala.io.Source.fromFile(args(0),"utf-8").getLines()) {
       val lineRE(stripped) = line
       println(stripped)
     }

src/main/scala/updown/preproc/UsernamifyEdges.scala

 package updown.preproc
 
 object UsernamifyEdges {
-  def main(args: Array[String]) = {
-    
+  def main(args: Array[String]) {
     val userIdsToUsernames = new scala.collection.mutable.HashMap[Int, String]
 
-    for(line <- scala.io.Source.fromFile(args(1),"utf-8").getLines) {
+    for(line <- scala.io.Source.fromFile(args(1),"utf-8").getLines()) {
       val tokens = line.split("\t")
       if(tokens.length >= 2 && tokens(0).length > 0 && tokens(1).length > 0) {
         userIdsToUsernames.put(tokens(0).toInt, tokens(1))
       }
     }
 
-    for(line <- scala.io.Source.fromFile(args(0),"utf-8").getLines) {
+    for(line <- scala.io.Source.fromFile(args(0),"utf-8").getLines()) {
       val tokens = line.split("\t")
       if(tokens.length >= 2 && tokens(0).length > 0 && tokens(1).length > 0) {
         val userId1 = tokens(0).toInt

src/main/scala/updown/util/Twokenize.scala

  tokenized Array[String] of the input text Twokenize.tokenize("foobar
  baz.") => ['foobar', 'baz', '.']
 
- The main method reads from stdin like it's python counterpart and
+ The main method reads from stdin like its python counterpart and
  calls the above method on each line
 
   > scalac twokenize.scala

src/test/scala/StringUtilTest.scala

+import org.scalatest.FlatSpec
+import updown.util.StringUtil
+
+/**
+ * Behavioural spec for the `StringUtil` helpers exercised below:
+ * `stripPunc`, `preprocess`, `stripPuncKeepHash`, `preprocessKeepHash`
+ * and `generateBigrams`.
+ *
+ * Each test description quotes the exact input used in its assertion, so a
+ * failure report identifies the offending case without opening this file.
+ */
+class StringUtilTest extends FlatSpec {
+  // Punctuation surrounding the word is expected to be removed.
+  "stripPunc" should "turn /.,@$#asdf';.@#$% into asdf" in {
+    assert(StringUtil.stripPunc("/.,@$#asdf';.@#$%") === "asdf")
+  }
+  // The expected value is also lower-cased ("ASdf" -> "asdf").
+  "preprocess" should "turn /.,@$#ASdf';.@#$% into asdf" in {
+    assert(StringUtil.preprocess("/.,@$#ASdf';.@#$%") === "asdf")
+  }
+  // A token made only of punctuation is expected to come back unchanged.
+  it should "turn /.,@$#';.@#$% into /.,@$#';.@#$%" in {
+    assert(StringUtil.preprocess("/.,@$#';.@#$%") === "/.,@$#';.@#$%")
+  }
+  // Same stripping as above, except the '#' directly before the word is kept.
+  "stripPuncKeepHash" should "turn /.,@$#asdf';.@#$% into #asdf" in {
+    assert(StringUtil.stripPuncKeepHash("/.,@$#asdf';.@#$%") === "#asdf")
+  }
+  "preprocessKeepHash" should "turn /.,@$#ASdf';.@#$% into #asdf" in {
+    assert(StringUtil.preprocessKeepHash("/.,@$#ASdf';.@#$%") === "#asdf")
+  }
+  // A token made only of punctuation is expected to come back unchanged.
+  it should "turn /.,@$#';.@#$% into /.,@$#';.@#$%" in {
+    assert(StringUtil.preprocessKeepHash("/.,@$#';.@#$%") === "/.,@$#';.@#$%")
+  }
+  // Expected bigrams carry a "$" boundary marker before the first and after the last unigram.
+  "generateBigrams" should "return bigrams for a list of unigrams" in {
+    assert(StringUtil.generateBigrams(List("a", "b", "c")) === List("$ a", "a b", "b c", "c $"))
+  }
+  // No boundary-only bigram is expected for empty input.
+  it should "return Nil for an empty list" in {
+    assert(StringUtil.generateBigrams(List()) === Nil)
+  }
+  it should "return bigrams for a list of length 1" in {
+    assert(StringUtil.generateBigrams(List("a")) === List("$ a", "a $"))
+  }
+  it should "return bigrams for a list of length 2" in {
+    assert(StringUtil.generateBigrams(List("a", "b")) === List("$ a", "a b", "b $"))
+  }
+}

src/test/scala/TwokenizeTest.scala

+import org.scalatest.FlatSpec
+import updown.util.Twokenize
+
+/**
+ * Spec for `Twokenize`: a sample tweet is tokenized and compared against a
+ * hand-written token list.
+ */
+class TwokenizeTest extends FlatSpec {
+  "Twokenize" should "tokenize a simple tweet" in {
+    val tweet = "@stellargirl I loooooooovvvvvveee my Kindle2. Not that the DX is cool, but the 2 is fantastic in its own right."
+
+    // Expected output: the @-mention stays in one piece, while sentence
+    // punctuation ("." and ",") appears as separate tokens.
+    val expectedTokens = List(
+      "@stellargirl", "I", "loooooooovvvvvveee", "my", "Kindle2", ".",
+      "Not", "that", "the", "DX", "is", "cool", ",",
+      "but", "the", "2", "is", "fantastic", "in", "its", "own", "right", ".")
+
+    assert(Twokenize(tweet) === expectedTokens)
+  }
+}
+}