Commits

vvcephei committed 1c5038c Merge

merge

Comments (0)

Files changed (5)

Add a comment to this file

models/maxent-eng-3way.mxm.gz

Binary file added.

Add a comment to this file

models/maxent-eng.mxm.gz

Binary file modified.

src/main/scala/updown/app/TrainMaxentModel.scala

 
 /**
  * Train a maxent model from labeled tweet input where each line has the format:
- * TWEET_ID|USER_ID|feature1,feature2,feature3,...,featureN,label
+ * TWEET_ID|USER_ID|feature1,feature2,feature3,...,featureN|label
  *
+ * or, using the -s flag, train from a simple file that has the format:
+ * feature1,feature2,feature3,...,featureN,label
+ * 
  * @author Mike Speriosu
  */
 object TrainMaxentModel {
 
   /**
    * Convenience overload: delegates to the three-argument `apply`, supplying
    * DEFAULT_ITERATIONS and DEFAULT_CUTOFF for the training parameters.
    *
    * @param fileName path of the input file handed to the three-argument `apply`
    * @return the model produced by the three-argument `apply`
    */
   def apply(fileName: String): AbstractModel = {
     apply(fileName, DEFAULT_ITERATIONS, DEFAULT_CUTOFF)
   }
 
+  /**
+   * Train a maxent model from a "simple" feature file, i.e. one whose lines have the form
+   * feature1,feature2,...,featureN,label with no tweet id or user id columns.
+   *
+   * @param fileName   path of the comma-separated feature file to read
+   * @param iterations iteration count handed to GIS.trainModel
+   * @param cutoff     feature cutoff handed to GIS.trainModel
+   * @return the model returned by GIS.trainModel
+   */
+  def trainSimple(fileName: String, iterations: Int, cutoff: Int): AbstractModel = {
+    val reader = new BufferedReader(new FileReader(fileName))
+    try {
+      val dataStream = new PlainTextByLineDataStream(reader)
+      val eventStream = new BasicEventStream(dataStream, ",")
+
+      GIS.trainModel(eventStream, iterations, cutoff)
+    } finally {
+      // Release the file handle whether training succeeds or throws.
+      reader.close()
+    }
+  }
+
   /**
    * Train a maxent model from an iterator of strings. The iterator is wrapped into an
    * event stream by MaxentEventStreamFactory.getWithStringIterator and then handed to
    * GIS.trainModel.
    *
    * @param iterator   source of input strings (presumably one training example per
    *                   element; confirm against MaxentEventStreamFactory)
    * @param iterations iteration count handed to GIS.trainModel
    * @param cutoff     feature cutoff handed to GIS.trainModel
    * @return the model returned by GIS.trainModel
    */
   def trainWithStringIterator(iterator: Iterator[String], iterations: Int, cutoff: Int): AbstractModel = {
     val eventStream = MaxentEventStreamFactory.getWithStringIterator(iterator)
     GIS.trainModel(eventStream, iterations, cutoff)
   }
 
     val outputFile = parser.option[String](List("m", "output"), "output", "model output") //Matt votes to change abbrev from "m" to "o"...
     val iterations = parser.option[Int](List("n", "iterations"), "iterations", "number of iterations (default = " + DEFAULT_ITERATIONS + ")")
     val cutoff = parser.option[Int](List("c", "cutoff"), "cutoff", "number of times a feature must be seen to be used (default = " + DEFAULT_CUTOFF + ")")
+    val simple = parser.option[String](List("s", "simple"), "simple", "read tweets in simple format, without userid and tweetid")
+
     try {
       parser.parse(args)
     }
     }
 
 
-    val model: AbstractModel = apply(
-      inputFile.value.get,
-      iterations.value.getOrElse(DEFAULT_ITERATIONS),
-      cutoff.value.getOrElse(DEFAULT_CUTOFF))
+    val model: AbstractModel = if(simple.value == None)
+      apply(inputFile.value.get,iterations.value.getOrElse(DEFAULT_ITERATIONS), cutoff.value.getOrElse(DEFAULT_CUTOFF))
+    else
+      trainSimple(inputFile.value.get,iterations.value.getOrElse(DEFAULT_ITERATIONS), cutoff.value.getOrElse(DEFAULT_CUTOFF))
 
     val modelWriter = new BinaryGISModelWriter(model, new File(outputFile.value.get))
     modelWriter.persist()

src/main/scala/updown/data/io/TweetFeatureReader.scala

     GoldLabeledTweet(tweetid, userid, features, SentimentLabel.figureItOut(label))
   }
 }
+
+/*object RawTweetFeatureReader {
+  val featureRowRE = """^([^|]*)\|([^|]*)\|([^|]*)\|(.*)$""".r
+
+  def apply(inputFile: String): List[GoldLabeledTweet] = {
+
+    val lines = scala.io.Source.fromFile(inputFile, "utf-8").getLines.toList
+
+    for (line <- lines) yield {
+      parseLine(line: String): GoldLabeledTweet = {
+        
+      }
+    }
+  }
+*/

src/main/scala/updown/preproc/PreprocEmoticonTweets.scala

   val parser = new ArgotParser("updown run updown.preproc.PreprocEmoticonTweets", preUsage = Some("Updown"))
   val inputPositiveFile = parser.option[String](List("p", "positive"), "positive", "text file with positive emoticons")
   val inputNegativeFile = parser.option[String](List("n", "negative"), "negative", "text file with negative emoticons")
+  val inputNeutralFile = parser.option[String](List("u", "neutral"), "neutral", "text file with neutral tweets")
   val outputFile = parser.option[String](List("o", "output"), "ouput", "feature file output")
   val stopListFile = parser.option[String](List("l", "stoplist"), "stoplist", "stoplist words")
   val dictFile = parser.option[String](List("d", "dictionary"), "dictionary", "a dictionary-this is actually just a list of words in the language")
 
     preprocFile(inputPositiveFile.value.get, SentimentLabel.Positive, out, stoplist, engDict, countTopN) //happy
     preprocFile(inputNegativeFile.value.get, SentimentLabel.Negative, out, stoplist, engDict, countTopN) //sad
+    if(inputNeutralFile.value != None)
+      preprocFile(inputNeutralFile.value.get, SentimentLabel.Neutral, out, stoplist, engDict, countTopN) // neutral
 
-    /*
-    *
-    *I have no idea what a neutral emoticon is
-    */
-    //preprocFile(args(2), SentimentLabel.Neutral, out, stoplist, engDict, countTopN)
 
     if (countTopN) {
       val topNOut = new OutputStreamWriter(new FileOutputStream(countArg.value.get), "utf-8")
         for (feature <- features) out.write(feature + ",")
         out.write(label + "\n")
       }
-      if (isArabic(tokens, engDict)) {
+      // Why are we passing engDict to isArabic? Commenting this out for now...
+      /*if (isArabic(tokens, engDict)) {
         val bigrams = StringUtil.generateBigrams(tokens)
 
         val unigrams = tokens.filterNot(stoplist(_))
 
         for (feature <- features) out.write(feature + ",")
         out.write(label + "\n")
-      }
+      }*/
     }
   }
 
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.