Commits

vvcephei committed 8ec0e67

still working on thesis experiments

  • Participants
  • Parent commits ad5cd4e

Comments (0)

Files changed (6)

src/main/scala/updown/app/experiment/nbayes/NFoldNBayesExperiment.scala

+package updown.app.experiment.nbayes
+
+import updown.data.{SystemLabeledTweet, GoldLabeledTweet, SentimentLabel}
+import updown.app.experiment.NFoldExperiment
+import updown.util.NaiveBayesModel
+
+/**
+ * Naive Bayes implementation of the [[updown.app.experiment.NFoldExperiment]] hooks.
+ */
+object NFoldNBayesExperiment extends NFoldExperiment {
+  /**
+   * Trains a [[updown.util.NaiveBayesModel]] on `trainSet` and classifies every tweet in `testSet`.
+   *
+   * @param trainSet gold-labeled tweets handed to the model constructor
+   * @param testSet  gold-labeled tweets to classify
+   * @return one classification result per test tweet, in `testSet` order
+   */
+  def doExperiment(trainSet: List[GoldLabeledTweet], testSet: List[GoldLabeledTweet]) = {
+    logger.info("performing Naive Bayes experiment")
+
+    logger.debug("training model")
+    val classifier = new NaiveBayesModel(trainSet)
+
+    logger.debug("testing model")
+    for (goldTweet <- testSet) yield classifier.classify(goldTweet)
+  }
+
+  // Always yields 0. NOTE(review): presumably a post-run hook required by NFoldExperiment — confirm.
+  def after(): Int = 0
+}

src/main/scala/updown/app/experiment/nbayes/SplitMaxentExperiment.scala

+package updown.app.experiment.nbayes
+
+import updown.data.{SystemLabeledTweet, GoldLabeledTweet, SentimentLabel}
+import updown.app.TrainMaxentModel
+import updown.app.experiment.SplitExperiment
+
+/**
+ * Maxent implementation of the [[updown.app.experiment.SplitExperiment]] hooks.
+ */
+object SplitMaxentExperiment extends SplitExperiment {
+  /**
+   * Trains a maxent model on `trainSet` and labels every tweet in `testSet` with the model's
+   * best outcome.
+   *
+   * NOTE(review): the parameter order here is (testSet, trainSet), the reverse of
+   * NFoldNBayesExperiment.doExperiment — confirm it matches what SplitExperiment passes.
+   *
+   * @param testSet  gold-labeled tweets to classify
+   * @param trainSet gold-labeled tweets used to train the model
+   * @return one system-labeled tweet per test tweet, in `testSet` order
+   */
+  def doExperiment(testSet: List[GoldLabeledTweet], trainSet: List[GoldLabeledTweet]) = {
+    logger.info("performing Maxent experiment")
+
+    logger.debug("training model")
+    val maxent = TrainMaxentModel.trainWithGoldLabeledTweetIterator(trainSet.iterator)
+
+    logger.debug("testing model")
+    testSet.map {
+      case GoldLabeledTweet(id, userid, features, goldLabel) =>
+        val bestOutcome = maxent.getBestOutcome(maxent.eval(features.toArray))
+        SystemLabeledTweet(id, userid, features, goldLabel, SentimentLabel.figureItOut(bestOutcome))
+    }
+  }
+
+  // Always yields 0. NOTE(review): presumably a post-run hook required by SplitExperiment — confirm.
+  def after(): Int = 0
+}

src/main/scala/updown/app/experiment/nbayes/StaticMaxentExperiment.scala

+package updown.app.experiment.nbayes
+
+import updown.data.{SystemLabeledTweet, GoldLabeledTweet, SentimentLabel}
+import updown.app.experiment.StaticExperiment
+import java.io.{FileInputStream, DataInputStream}
+import opennlp.maxent.io.BinaryGISModelReader
+
+/**
+ * Maxent implementation of the [[updown.app.experiment.StaticExperiment]] hooks that evaluates
+ * a previously saved binary GIS model instead of training one.
+ */
+object StaticMaxentExperiment extends StaticExperiment {
+  import org.clapper.argot.ArgotConverters._
+  val modelInputFile = parser.option[String](List("m", "model"), "model", "model input")
+
+  /**
+   * Loads the model named by the `-m`/`--model` option and labels every tweet in `testSet`
+   * with the model's best outcome.
+   *
+   * When no model file was given, `parser.usage` is invoked with an explanatory message.
+   * NOTE(review): `parser.usage` is expected to abort (presumably by throwing) — confirm.
+   *
+   * @param testSet gold-labeled tweets to classify
+   * @return one system-labeled tweet per test tweet, in `testSet` order
+   */
+  def doExperiment(testSet: List[GoldLabeledTweet]) = {
+    logger.info("performing Maxent experiment")
+    logger.debug("loading model")
+    val model =
+      modelInputFile.value match {
+        case Some(filename) =>
+          // Use the matched filename directly and always release the file handle,
+          // even when reading the model fails.
+          val modelStream = new DataInputStream(new FileInputStream(filename))
+          try {
+            new BinaryGISModelReader(modelStream).getModel
+          } finally {
+            modelStream.close()
+          }
+        case None =>
+          parser.usage("You must specify a model input file")
+      }
+
+    logger.debug("testing model")
+    testSet.map {
+      case GoldLabeledTweet(id, userid, features, goldLabel) =>
+        SystemLabeledTweet(id, userid, features, goldLabel,
+          SentimentLabel.figureItOut(model.getBestOutcome(model.eval(features.toArray))))
+    }
+  }
+
+  // Always yields 0. NOTE(review): presumably a post-run hook required by StaticExperiment — confirm.
+  def after(): Int = 0
+}

src/main/scala/updown/util/NaiveBayesModel.scala

+package updown.util
+
+import cc.mallet.classify.NaiveBayesTrainer
+import updown.data.SystemLabeledTweet._
+import updown.data.{SystemLabeledTweet, SentimentLabel, GoldLabeledTweet}
+import cc.mallet.types._
+
+/**
+ * Sentiment classifier that trains a MALLET `NaiveBayesTrainer` on gold-labeled tweets.
+ *
+ * Training happens eagerly while the instance is being constructed.
+ *
+ * @param tweets gold-labeled tweets used as training data
+ */
+class NaiveBayesModel(tweets: List[GoldLabeledTweet]) extends MalletModel {
+  /**
+   * Converts `tweetList` into MALLET instances whose data is a feature vector over a freshly
+   * created feature alphabet and whose target is the gold label looked up in a freshly
+   * created label alphabet.
+   *
+   * @param tweetList tweets to convert
+   * @return the feature alphabet together with the populated instance list
+   */
+  override protected def getInstanceList(tweetList: List[GoldLabeledTweet]): (Alphabet, InstanceList) = {
+    val alphabet = new Alphabet()
+    val labelAlphabet = new LabelAlphabet()
+    val instances = tweetList.map {
+      case GoldLabeledTweet(id, _, features, goldLabel) =>
+        new Instance(buildFeatureVector(alphabet, features), labelAlphabet.lookupLabel(goldLabel), id, null)
+    }
+
+    val instanceList = new InstanceList(alphabet, labelAlphabet)
+    instances.foreach(instance => instanceList.add(instance))
+    (alphabet, instanceList)
+  }
+
+  /** Builds the feature vector for one tweet's features against `dictionary`. */
+  private def buildFeatureVector(dictionary: Alphabet, features: Seq[String]): FeatureVector = {
+    val featureSequence = new FeatureSequence(dictionary, features.length)
+    features.foreach(feature => featureSequence.add(feature))
+    new FeatureVector(featureSequence)
+  }
+
+  private val (alphabet, instanceList) = getInstanceList(tweets)
+  private val trainer = new NaiveBayesTrainer()
+  private val classifier = trainer.train(instanceList)
+
+  /**
+   * Classifies `tweet` with the trained classifier.
+   *
+   * NOTE(review): the feature vector is built against the training alphabet, so features
+   * unseen during training may be added to that shared alphabet here — confirm this is intended.
+   *
+   * @param tweet the tweet to classify; its gold label is carried through to the result
+   * @return the tweet's fields plus the system label derived from the classifier's best label
+   */
+  def classify(tweet: GoldLabeledTweet): SystemLabeledTweet = {
+    val GoldLabeledTweet(id, userid, features, goldLabel) = tweet
+    val instance = new Instance(buildFeatureVector(alphabet, features), goldLabel, id, null)
+    val result = classifier.classify(instance)
+    SystemLabeledTweet(id, userid, features, goldLabel,
+      SentimentLabel.figureItOut(result.getLabeling.getBestLabel.getEntry.toString))
+  }
+
+}
+

src/main/scala/updown/util/TopicModel.scala

 
 /** Pairs two String-keyed weight maps: `prior` and `distribution`. NOTE(review): keys are presumably label/word names — confirm against the code that builds topics. */
 case class Topic(prior: Map[String, Double], distribution: Map[String, Double])
 
-abstract class TopicModel {
-  protected def getInstanceList(tweetList: List[GoldLabeledTweet]): (Alphabet, InstanceList) = {
-    val alphabet = new Alphabet()
-    val labelAlphabet = new Alphabet()
-    val instances = (for (tweet <- tweetList) yield {
-      tweet match {
-        case GoldLabeledTweet(id, userid, features, goldLabel) =>
-          val featureSequence = new FeatureSequence(alphabet, features.length)
-          for (feature <- features) {
-            featureSequence.add(feature)
-          }
-          val label = new FeatureVector(
-            labelAlphabet,
-            Array[Object]("label"), Array[Double](SentimentLabel.toDouble(goldLabel)))
-          new Instance(featureSequence, label, id, null)
-      }
-    }).toList
-
-    val instanceList = new InstanceList(alphabet, null)
-    for (instance <- instances) {
-      instanceList.add(instance)
-    }
-    (alphabet, instanceList)
-  }
-
-  protected def getInstanceList(tweetList: List[GoldLabeledTweet], alphabet: Alphabet) = {
-    val instances = (for (tweet <- tweetList) yield {
-      tweet match {
-        case GoldLabeledTweet(id, userid, features, goldLabel) =>
-          val featureSequence = new FeatureSequence(alphabet, features.length)
-          for (feature <- features) {
-            featureSequence.add(feature)
-          }
-          new Instance(featureSequence, goldLabel, id, null)
-      }
-    }).toList
-
-    val instanceList = new InstanceList(alphabet, null)
-    for (instance <- instances) {
-      instanceList.add(instance)
-    }
-    instanceList
-  }
-
-    val getLabelNameArray = Array[Object](
-    SentimentLabel.toEnglishName(SentimentLabel.Positive),
-    SentimentLabel.toEnglishName(SentimentLabel.Neutral),
-    SentimentLabel.toEnglishName(SentimentLabel.Negative)
-    )
-  val getLabelFeatureArray: SentimentLabel.Type => Array[Double] =
-    (label: SentimentLabel.Type) => {
-      label match {
-        case SentimentLabel.Positive => Array[Double](1.0, 0.0, 0.0)
-        case SentimentLabel.Neutral => Array[Double](0.0, 1.0, 0.0)
-        case SentimentLabel.Negative => Array[Double](0.0, 0.0, 1.0)
-      }
-    }
-
-  protected def getInstanceListWithLabelVectors(tweetList: List[GoldLabeledTweet]): (Alphabet, InstanceList) = {
-    val alphabet = new Alphabet()
-    val labelAlphabet = new Alphabet()
-    val instances = (for (tweet <- tweetList) yield {
-      tweet match {
-        case GoldLabeledTweet(id, userid, features, goldLabel) =>
-          val featureSequence = new FeatureSequence(alphabet, features.length)
-          for (feature <- features) {
-            featureSequence.add(feature)
-          }
-          val label = new FeatureVector(
-            labelAlphabet,
-            getLabelNameArray,
-            getLabelFeatureArray(goldLabel)
-          )
-          new Instance(featureSequence, label, id, null)
-      }
-    }).toList
-
-    val instanceList = new InstanceList(alphabet, null)
-    for (instance <- instances) {
-      instanceList.add(instance)
-    }
-    (alphabet, instanceList)
-  }
+abstract class TopicModel extends MalletModel {
 
   def getTopics: List[Topic]
 

src/test/scala/updown/test/NBayesModelTest.scala

+package updown.test
+
+import org.scalatest.FlatSpec
+import updown.util.{NaiveBayesModel, BasicTokenizer}
+import updown.data.{SystemLabeledTweet, SentimentLabel, GoldLabeledTweet}
+
+/**
+ * Checks that a [[updown.util.NaiveBayesModel]] trained on two clearly separated vocabularies
+ * assigns the gold label to tweets it was trained on and to tweets held out of training.
+ */
+class NBayesModelTest extends FlatSpec {
+  // Positive tweets: built from the foo/bar/baz vocabulary.
+  val t0 = GoldLabeledTweet("0","",List("foo","bar","baz"),SentimentLabel.Positive)
+  val t1 = GoldLabeledTweet("1","",List("foo","bar","baz"),SentimentLabel.Positive)
+  val t2 = GoldLabeledTweet("2","",List("bar","baz","foo"),SentimentLabel.Positive)
+  val t3 = GoldLabeledTweet("3","",List("bar","baz","foo"),SentimentLabel.Positive)
+  val t4 = GoldLabeledTweet("4","",List("foo","bar","foo","baz"),SentimentLabel.Positive)
+
+  // Negative tweets: built from the cat/dog/bird vocabulary (t6 also contains "foo").
+  val t5 = GoldLabeledTweet("5","",List("cat","dog","bird"),SentimentLabel.Negative)
+  val t6 = GoldLabeledTweet("6","",List("foo","dog","bird"),SentimentLabel.Negative)
+  val t7 = GoldLabeledTweet("7","",List("dog","bird","cat"),SentimentLabel.Negative)
+  val t8 = GoldLabeledTweet("8","",List("dog","bird","cat"),SentimentLabel.Negative)
+  val t9 = GoldLabeledTweet("9","",List("cat","dog","cat","bird"),SentimentLabel.Negative)
+
+  // t4 and t9 are deliberately held out so both classes are checked on unseen tweets.
+  val train = List(t0,t1,t2,t3,t5,t6,t7,t8)
+
+  // Expected result: the system label equals the gold label.
+  val autoclassify:(GoldLabeledTweet) => SystemLabeledTweet =
+  g=>SystemLabeledTweet(g.id,g.userid,g.features,g.goldLabel,g.goldLabel)
+
+  "classify" should "assign the gold label to a held-out positive tweet" in {
+    val model = new NaiveBayesModel(train)
+    assert(model.classify(t4) === autoclassify(t4))
+  }
+
+  it should "assign the gold label to a held-out negative tweet" in {
+    val model = new NaiveBayesModel(train)
+    assert(model.classify(t9) === autoclassify(t9))
+  }
+
+  it should "assign the gold label to a tweet from the training set" in {
+    val model = new NaiveBayesModel(train)
+    assert(model.classify(t5) === autoclassify(t5))
+  }
+}