vvcephei avatar vvcephei committed dc2a0ba Merge

merge

Comments (0)

Files changed (2)

src/main/scala/updown/app/NeutralTweetFinder.scala

+package updown.app
+
+import updown.util._
+import updown.lex._
+
+import java.io._
+import org.apache.tools.bzip2._
+import java.util.zip._
+import org.json._
+
+import scala.collection.JavaConversions._
+
+/**
+ * Command-line tool that scans bzip2-compressed files of JSON tweets (one JSON
+ * object per line) and writes the text of "neutral" tweets, one per line, to an
+ * output file.
+ *
+ * A tweet is written when every token is in the English dictionary, no token has
+ * a positive or negative entry in the MPQA lexicon, and at least
+ * STOPWORD_THRESHOLD tokens are in the stoplist.
+ *
+ * Arguments: input file or directory, output file, English dictionary (one word
+ * per line), stoplist (one word per line), MPQA lexicon file.
+ */
+object NeutralTweetFinder {
+
+  //val url = """^.*(http://)|(.co)|(.org)|(.uk)|(.gov)|(.be).*$""".r
+
+  /** Running count of tweets written so far, across all processed files. */
+  var numWritten = 0
+  /** Minimum number of stoplist tokens a tweet needs in order to be kept. */
+  val STOPWORD_THRESHOLD = 2
+  /** Upper bound on the number of tweets written in one run. */
+  val MAX_TO_WRITE = 1000000
+
+  /**
+   * Entry point: loads the word lists and lexicon, then processes the input
+   * file, or every regular file directly inside the input directory, until the
+   * input is exhausted or MAX_TO_WRITE tweets have been written.
+   *
+   * @param args input path, output path, dictionary path, stoplist path, lexicon path
+   */
+  def main(args: Array[String]): Unit = {
+    if(args.length < 5) {
+      System.err.println("Usage: NeutralTweetFinder <input file or dir> <output file> <English dictionary> <stoplist> <MPQA lexicon>")
+      System.exit(1)
+    }
+
+    val inFile = new File(args(0))
+    // Load all read-only resources before the output file is created, so a bad
+    // dictionary path neither truncates an existing output nor leaks the writer.
+    val engDict = readLines(args(2))
+    val stoplist = readLines(args(3))
+    val lexicon = MPQALexicon(args(4))
+
+    val files: List[File] =
+      if(inFile.isDirectory) {
+        // listFiles returns null on an I/O error; treat that as "no files".
+        Option(inFile.listFiles).toList.flatMap(_.toList).filter(f => !f.isDirectory)
+      } else if(inFile.isFile) {
+        List(inFile)
+      } else {
+        System.err.println("Input not found: "+inFile)
+        List[File]()
+      }
+
+    // Write UTF-8 explicitly, matching the encoding used for the word lists.
+    val out = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(args(1)), "UTF-8"))
+    try {
+      val remaining = files.iterator
+      while(remaining.hasNext && numWritten < MAX_TO_WRITE) {
+        processFile(remaining.next(), out, engDict, stoplist, lexicon)
+        out.flush()
+      }
+    } finally {
+      out.close()
+    }
+  }
+
+  /** Reads a UTF-8 text file into a set of its lines, closing the file afterwards. */
+  private def readLines(path: String): Set[String] = {
+    val source = scala.io.Source.fromFile(path, "utf-8")
+    try source.getLines().toSet finally source.close()
+  }
+
+  /**
+   * Reads one bzip2-compressed file of JSON tweets line by line and passes the
+   * "text" field of each parsable line to processTweet. Stops early once
+   * MAX_TO_WRITE tweets have been written.
+   *
+   * Lines that cannot be handled (for example malformed JSON or a missing
+   * "text" field) are reported with a stack trace and skipped; I/O failures are
+   * propagated to the caller.
+   *
+   * @param file     bzip2 file to read
+   * @param out      writer receiving accepted tweets
+   * @param engDict  set of accepted English words
+   * @param stoplist set of stopwords
+   * @param lexicon  MPQA polarity lexicon
+   */
+  def processFile(file: File, out: BufferedWriter, engDict: Set[String], stoplist: Set[String], lexicon: MPQALexicon): Unit = {
+    println(file)
+
+    val fis = new FileInputStream(file)
+    try {
+      // Ant's CBZip2InputStream expects the stream to be positioned just after
+      // the two-byte "BZ" magic, so skip those two bytes first.
+      fis.read(); fis.read()
+      val in = new BufferedReader(new InputStreamReader(new CBZip2InputStream(fis), "UTF-8"))
+
+      var curLine = in.readLine()
+      while(curLine != null && numWritten < MAX_TO_WRITE) {
+        try {
+          val jso = new JSONObject(new JSONTokener(curLine))
+          val tweet = jso.getString("text")
+
+          processTweet(tweet, out, engDict, stoplist, lexicon)
+        } catch {
+          // A failed write must abort the run rather than be skipped like a bad line.
+          case e: IOException => throw e
+          case e: Exception => e.printStackTrace()
+        }
+
+        curLine = in.readLine()
+      }
+    } finally {
+      // Closing the underlying file stream releases the file handle even when
+      // decompression or reading fails part-way through.
+      fis.close()
+    }
+  }
+
+  /**
+   * Tokenizes a tweet and writes it to `out` (newlines replaced by spaces,
+   * followed by a newline) if it qualifies as neutral; increments numWritten
+   * for each tweet written.
+   *
+   * @param tweet    raw tweet text
+   * @param out      writer receiving accepted tweets
+   * @param engDict  set of accepted English words
+   * @param stoplist set of stopwords
+   * @param lexicon  MPQA polarity lexicon
+   */
+  def processTweet(tweet: String, out: BufferedWriter, engDict: Set[String], stoplist: Set[String], lexicon: MPQALexicon): Unit = {
+    val tokens = BasicTokenizer(tweet)
+
+    var stopwordCount = 0
+    for(token <- tokens) {
+      // Any token outside the English dictionary disqualifies the whole tweet.
+      if(!engDict(token)) {
+        return
+      }
+
+      // Any token with a positive or negative lexicon entry disqualifies it too.
+      if(lexicon.contains(token)) {
+        val entry = lexicon(token)
+        if(entry.isPositive || entry.isNegative)
+          return
+      }
+
+      if(stoplist(token))
+        stopwordCount += 1
+    }
+
+    if(stopwordCount >= STOPWORD_THRESHOLD) {
+      out.write(tweet.replaceAll("\n", " ")+"\n")
+      numWritten += 1
+    }
+  }
+}

src/main/scala/updown/lex/MPQALexicon.scala

   val NEG = "NEG"
   val NEU = "NEU"
 
-  //  val mpqaLineRE = """^.*word1=(\w+).*mpqapolarity=(neutral)|(weakneg)|(strongneg)|(weakpos)|(strongpos).*$""".r
+  //val mpqaLineRE = """^.*word1=(\w+).*mpqapolarity=(neutral)|(weakneg)|(strongneg)|(weakpos)|(strongpos).*$""".r
   /* Broke up one regex into two. Note that the two below are not equivalent to the one above. */
   val wordRE = """word1=(\w+)""".r
-  val polarityRE = """mpqapolarity=(neutral)|(neg)|(pos)$""".r
-  val subjectivityRE = """mpqapolarity=(strong)|(weak)$""".r
+  val polarityRE = """mpqapolarity=\w*(neutral|neg|pos)""".r
+  val subjectivityRE = """mpqapolarity=\w*(strong|weak)""".r
 
   def parseLine(line: String, entries: HashMap[String, MPQAEntry]): Any = {
     //whoa! interesting line of code. but causes runtime match error. shouldn't we be passing character seq and not strings?..
     val polarityOption = polarityRE.findFirstIn(line)
     val subjectivityOption = subjectivityRE.findFirstIn(line)
 
-    if (wordOption != None && polarityOption != None && subjectivityOption != None) {
+    if (wordOption != None && polarityOption != None/* && subjectivityOption != None*/) {
       val word = wordOption.get.substring(6)
-      val polarity = polarityOption.get.toUpperCase
-      val subjectivity = subjectivityOption.get.split("=").last
+      val rawPolarity = polarityOption.get.toUpperCase.split("=").last
+      val polarity = if(rawPolarity.endsWith(POS)) POS
+                     else if(rawPolarity.endsWith(NEG)) NEG
+                     else NEU
+      val subjectivity = if(subjectivityOption != None) subjectivityOption.get.split("=").last else NEU
 
       entries.put(word, new MPQAEntry(word, polarity, subjectivity))
     }
     }
   }
 }
+
+/**
+ * Manual smoke test for MPQALexicon: loads the lexicon file named by the first
+ * argument, prints how many entries it holds, then prints the entries for a
+ * few sample words.
+ */
+object MPQALexiconTest {
+  def main(args: Array[String]): Unit = {
+    val lex = MPQALexicon(args(0))
+    val entryCount = lex.keySet.size
+    println("Number of entries: "+entryCount)
+
+    // Probe words are looked up in the same order as before.
+    for(word <- List("great", "glee", "awful"))
+      println(lex(word))
+  }
+}
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.