1. Mike Speriosu
  2. updown


vvcephei  committed af2e8e1

added importer for MDSD (Blitzer)

  • Participants
  • Parent commits fc3ffd5
  • Branches default

Comments (0)

Files changed (3)

File src/main/scala/updown/preproc/GenericPreprocessor.scala

View file
  • Ignore whitespace
 package updown.preproc
 import org.clapper.argot.{ArgotUsageException, ArgotParser, ArgotConverters}
-import ArgotConverters._
 import updown.data.SentimentLabel
 import updown.util.TokenizationPipes
 import com.weiglewilczek.slf4s.Logging
 import java.io.{File, FileOutputStream, OutputStreamWriter}
 abstract class GenericPreprocessor extends Logging {
-  // this is here to make ArgotConverters appear used to IDEA.
-  convertString _
+  import ArgotConverters._
   var pipeStages: Map[String, (List[String]) => List[String]] =
     Map[String, (List[String]) => List[String]](
       ("lowerCase" -> TokenizationPipes.toLowercase),

File src/main/scala/updown/preproc/impl/PreprocMDSDReviews.scala

View file
  • Ignore whitespace
+package updown.preproc.impl
+import updown.data.SentimentLabel
+import updown.preproc.GenericPreprocessor
+import au.com.bytecode.opencsv.CSVReader
+import java.io.{File, FileInputStream, InputStreamReader}
+object PreprocMDSDReviews extends GenericPreprocessor {
+  // Overrides the inherited default pipeline name with "basicTokenize".
+  // NOTE(review): presumably this must match a stage/pipeline name known to
+  // GenericPreprocessor (declaration not visible here) — confirm.
+  override val defaultPipeline = "basicTokenize"
+  /**
+   * Parses one line of whitespace-separated `feature:count` fields.
+   *
+   * Every field other than the label contributes `count` copies of `feature`
+   * to the token list, in the order the fields appear on the line. The field
+   * whose feature is `#label#` carries the label text instead of a count; it is
+   * converted with `SentimentLabel.figureItOut`. If no label field is present
+   * the label is `SentimentLabel.Unknown`.
+   *
+   * Blank lines and leading/trailing whitespace are tolerated (they yield no
+   * tokens). A field is split on its LAST colon, so features that themselves
+   * contain ':' are kept intact.
+   *
+   * Throws IllegalArgumentException if a field has no colon or nothing after
+   * it, and NumberFormatException if a count is not an integer.
+   */
+  val getTokensFromLine: (String) => (List[String], SentimentLabel.Type) = line => {
+    // split() yields an empty first element for blank lines or leading whitespace; drop it.
+    val fields = line.split("\\s+").toList.filter(_.nonEmpty)
+    val start: (List[String], SentimentLabel.Type) = (Nil, SentimentLabel.Unknown)
+    // Fold instead of recursing through a function value: no stack growth on long lines.
+    val (reversedTokens, label) = fields.foldLeft(start) {
+      case ((acc, currentLabel), field) =>
+        val sep = field.lastIndexOf(':')
+        if (sep < 0 || sep == field.length - 1)
+          throw new IllegalArgumentException(
+            "malformed field (expected feature:count): '%s'".format(field))
+        val left = field.substring(0, sep)
+        val right = field.substring(sep + 1)
+        left match {
+          case "#label#" => (acc, SentimentLabel.figureItOut(right))
+          // Prepend (O(count)) and reverse once at the end rather than appending each time.
+          // The copies are identical, so prepending them as a group preserves order after reverse.
+          case feature => (List.fill(right.toInt)(feature) ::: acc, currentLabel)
+        }
+    }
+    (reversedTokens.reverse, label)
+  }
+  /**
+   * Lazily reads `fileName` (UTF-8) and yields one instance per line as
+   * `(id, "unk", Left(label), text)`, where `id` is `fileName#lineIndex`
+   * (zero-based), `label` comes from [[getTokensFromLine]] and `text` is the
+   * expanded tokens joined by single spaces.
+   *
+   * The underlying file is closed as soon as the iterator is exhausted or a
+   * line fails to parse, so callers that drain the iterator do not leak a file
+   * handle. A caller that abandons the iterator early still leaves it open.
+   *
+   * @param fileName path of the file to read
+   * @param polarity not used by this implementation; kept for interface compatibility
+   */
+  def getInstanceIterator(fileName: String, polarity: String): Iterator[(String, String, Either[SentimentLabel.Type, Map[String, SentimentLabel.Type]], String)] = {
+    type Instance = (String, String, Either[SentimentLabel.Type, Map[String, SentimentLabel.Type]], String)
+    val source = scala.io.Source.fromFile(fileName, "UTF-8")
+    val numberedLines = source.getLines().zipWithIndex
+    new Iterator[Instance] {
+      // Local mutable flag: the reader must not be polled again once closed.
+      private var closed = false
+
+      private def release() {
+        if (!closed) {
+          closed = true
+          source.close()
+        }
+      }
+
+      def hasNext: Boolean = {
+        val more = !closed && numberedLines.hasNext
+        if (!more) release()
+        more
+      }
+
+      def next(): Instance = {
+        if (!hasNext) throw new NoSuchElementException("next on empty iterator")
+        val (line, index) = numberedLines.next()
+        val (tokens, label) =
+          try getTokensFromLine(line)
+          catch {
+            // Do not leak the file handle when a line is malformed; rethrow unchanged.
+            case e: RuntimeException => release(); throw e
+          }
+        ("%s#%d".format(fileName, index), "unk", Left(label), tokens.mkString(" "))
+      }
+    }
+  }

File src/test/scala/updown/test/PreprocMDSDTest.scala

View file
  • Ignore whitespace
+package updown.test
+import org.scalatest.FlatSpec
+import updown.data.SentimentLabel
+import updown.preproc.impl.{PreprocMDSDReviews, PreprocHCRTweets}
+class PreprocMDSDTest extends FlatSpec {
+  "getTokensFromLine" should "work" in {
+    val line = "your:4 in_behind:1 people_have:1 stuff_in:1 so:1 might_matter:1 durable_so:1 affected:1 shielded_stuff:1 have_pointed:2 is_better:1 leave_it:1 yes_if:1 most_have:1 digital:1 affected_by:1 because:1 plated_shielded:1 so_yes:1 no:1 months_don't:1 you:2 signal_and:1 analog:2 then_leave:1 waste:1 a_digital:1 analog_cables:1 plan:1 stuff:3 if_like:1 shielded:1 stuff_absolutely:1 swinging_around:1 this_stuff:1 signal:1 matter:1 most_people:1 are_highly:1 monster:2 better:1 a_difference:1 your_money:2 your_cables:1 stuff_monster:1 people:2 matter_if:1 the_gas-injected:1 gas-injected_gold:1 digital_signal:1 don't:1 em_interference.some:1 like_most:2 for_months:1 then:1 that_monster:1 interference.some:1 spent:1 the_stuff:1 gold:1 out_it's:1 behind_your:1 all:1 difference_because:1 monster_cables:2 you_plan:1 plated:1 you_plug:1 pointed_out:2 pointed:2 hype_for:1 if:2 outside:1 out_that:1 equipment:1 waste_your:1 more_durable:1 signals:1 absolutely:1 and_months:1 durable:1 highly_affected:1 people_you:1 cables:4 money_is:1 plug:1 because_analog:1 plug_the:1 your_equipment:1 don't_waste:1 there's_no:1 makes:1 leave:1 analog_signals:1 months_and:1 behind:1 cables_hype:1 makes_a:1 plan_on:1 elsewhere:1 all_the:1 more:1 outside_em:1 months:2 on_your:1 gold_plated:1 for_analog:1 are_more:1 equipment_then:1 most:2 by_outside:1 em:1 might:1 difference:1 around_on:1 better_spent:1 like:2 out:2 money:2 if_you:1 gas-injected:1 around:1 it_might:1 hype:1 no_need:1 highly:1 for_all:1 yes:1 spent_elsewhere:1 need:1 swinging:1 cables_it:1 need_for:1 on_swinging:1 absolutely_makes:1 cables_are:2 signals_this:1 interference.some_people:1 #label#:negative"
+    val result = (List("your", "your", "your", "your", "in_behind", "people_have", "stuff_in", "so", "might_matter", "durable_so", "affected", "shielded_stuff", "have_pointed", "have_pointed", "is_better", "leave_it", "yes_if", "most_have", "digital", "affected_by", "because", "plated_shielded", "so_yes", "no", "months_don't", "you", "you", "signal_and", "analog", "analog", "then_leave", "waste", "a_digital", "analog_cables", "plan", "stuff", "stuff", "stuff", "if_like", "shielded", "stuff_absolutely", "swinging_around", "this_stuff", "signal", "matter", "most_people", "are_highly", "monster", "monster", "better", "a_difference", "your_money", "your_money", "your_cables", "stuff_monster", "people", "people", "matter_if", "the_gas-injected", "gas-injected_gold", "digital_signal", "don't", "em_interference.some", "like_most", "like_most", "for_months", "then", "that_monster", "interference.some", "spent", "the_stuff", "gold", "out_it's", "behind_your", "all", "difference_because", "monster_cables", "monster_cables", "you_plan", "plated", "you_plug", "pointed_out", "pointed_out", "pointed", "pointed", "hype_for", "if", "if", "outside", "out_that", "equipment", "waste_your", "more_durable", "signals", "absolutely", "and_months", "durable", "highly_affected", "people_you", "cables", "cables", "cables", "cables", "money_is", "plug", "because_analog", "plug_the", "your_equipment", "don't_waste", "there's_no", "makes", "leave", "analog_signals", "months_and", "behind", "cables_hype", "makes_a", "plan_on", "elsewhere", "all_the", "more", "outside_em", "months", "months", "on_your", "gold_plated", "for_analog", "are_more", "equipment_then", "most", "most", "by_outside", "em", "might", "difference", "around_on", "better_spent", "like", "like", "out", "out", "money", "money", "if_you", "gas-injected", "around", "it_might", "hype", "no_need", "highly", "for_all", "yes", "spent_elsewhere", "need", "swinging", "cables_it", "need_for", "on_swinging", "absolutely_makes", 
"cables_are", "cables_are", "signals_this", "interference.some_people"),SentimentLabel.Negative)
+    val line2 = "i've_also:1 pain_and:1 my_im716s:1 no:1 say_about:1 a_few:1 module:4 equal:1 to_drown:1 earphones_the:1 clip:2 i've_ever:1 also_caught:1 saying:1 notch_in:1 annoying_i've:1 quality:2 quite:1 using:1 bar_none:1 none:1 was_able:1 perfect:1 site_<num>:1 away_from:1 will_recommend:1 cord:1 something_otherwise:1 same:1 mow:2 that_stops:1 out_the:1 route:1 <num>:2 logitech_ear:1 causing_the:1 site:1 me_to:1 it_back:1 it_clipped:1 otherwise:1 outperform:1 i_am:1 able_to:1 set_i:1 mainly_to:1 or_my:1 quality_w/o:1 from_saying:1 the_way:1 volume_module:2 need_to:1 steal.2)_quality:1 i_was:1 me_about:1 having:1 for_about:1 im716s:1 quite_annoying:1 them.1)_price:1 ipod:1 noise_this:1 out:1 mower.4)_conclusion:1 said:1 around:1 anyone_asking:1 set:1 lists:1 saying_these:1 performance_bar:1 jack:1 im716s_for:1 them_or:1 ear:2 removeable_and:1 muff_set:1 asking_me:1 quality_top:1 earphones:1 i'd:1 use:1 having_to:1 annoying:1 60gb_video:1 <year>:1 i've_had:1 lawn:1 a_pair:1 performance:1 a_pain:1 back_if:1 a_steal.2):1 was:1 pair:1 regretting:1 ever:1 pain:1 <num>_months:1 huge:1 regretting_my:1 module_it:1 even_for:1 volume_to:1 are_perfect:1 also:1 say:1 to_anyone:1 bend_it:1 recommend:1 of_equal:1 and_i:1 clipped:1 clip_to:1 to_mow:2 able:1 things_to:1 <num>_is:1 when:1 <year>_on:1 the_clip:2 weak.3):1 purchase_and:1 price_i'd:1 mow_in:1 only_thing:1 best:2 caught_the:1 moving_around:1 i've:3 video:1 need:1 caught:1 about_them.1):1 stops:1 asking:1 that_said:1 the_cord:1 when_moving:1 headphones:1 now_i:1 the_same:1 clipped_to:1 back:1 clip_on:1 logitech:1 muff:1 60gb:1 jack_up:1 altec's:1 steal.2):1 can_get:1 drown:1 for_<year>:1 w/o:1 go_that:1 top_notch:1 about_<num>:1 allow_me:1 you_need:1 is_weak.3):1 the_only:1 things_outperform:1 best_i've:1 around_you:1 lists_for:1 them:1 months:1 the_volume:3 way:2 perfect_is:1 get_quite:1 moving:1 pair_of:1 price:2 my:3 noise:1 weak.3)_performance:1 way_regretting:1 me:3 you:1 lawn_without:1 create:1 me_from:1 
on_things:1 things_causing:1 volume:3 top:1 same_price:1 bend:2 stops_me:1 create_inner:1 altec's_site:1 anyone:1 my_purchase:1 using_my:1 outperform_the:1 removeable:1 otherwise_it:1 said_i:1 and_get:1 few_things:1 get:2 to_say:1 that_route:1 route_with:1 way_when:1 ever_used:1 drown_out:1 i_have:1 conclusion_the:1 the_lawn:1 to_bend:2 best_headphones:1 video_ipod:1 is_removeable:1 inner:1 none_best:1 price_lists:1 if:1 the_huge:1 without:1 module_can:1 ear_muff:1 inner_ear:1 no_way:1 something:1 without_having:1 mow_the:1 bend_away:1 away:1 even:1 months_now:1 go:1 these_things:2 module_is:1 headphones_i've:1 i:4 huge_logitech:1 after_using:1 them.1):1 my_60gb:1 purchase:1 conclusion:1 mower.4):1 equal_quality:1 now:1 recommend_these:1 i'd_go:1 thing_that:1 about:3 bar:1 cord_on:1 get_in:1 if_there:1 about_them:1 few:1 use_mainly:1 i_use:1 things_allow:1 the_module:2 can_create:1 used:1 w/o_the:1 only:1 on_altec's:1 in_no:1 notch:1 allow:1 to_jack:1 things:4 the_mower.4):1 thing:1 to_something:1 the_earphones:1 causing:1 after:1 module_even:1 ear_noise:1 mainly:1 #label#:positive"
+    val result2 = (List("i've_also", "pain_and", "my_im716s", "no", "say_about", "a_few", "module", "module", "module", "module", "equal", "to_drown", "earphones_the", "clip", "clip", "i've_ever", "also_caught", "saying", "notch_in", "annoying_i've", "quality", "quality", "quite", "using", "bar_none", "none", "was_able", "perfect", "site_<num>", "away_from", "will_recommend", "cord", "something_otherwise", "same", "mow", "mow", "that_stops", "out_the", "route", "<num>", "<num>", "logitech_ear", "causing_the", "site", "me_to", "it_back", "it_clipped", "otherwise", "outperform", "i_am", "able_to", "set_i", "mainly_to", "or_my", "quality_w/o", "from_saying", "the_way", "volume_module", "volume_module", "need_to", "steal.2)_quality", "i_was", "me_about", "having", "for_about", "im716s", "quite_annoying", "them.1)_price", "ipod", "noise_this", "out", "mower.4)_conclusion", "said", "around", "anyone_asking", "set", "lists", "saying_these", "performance_bar", "jack", "im716s_for", "them_or", "ear", "ear", "removeable_and", "muff_set", "asking_me", "quality_top", "earphones", "i'd", "use", "having_to", "annoying", "60gb_video", "<year>", "i've_had", "lawn", "a_pair", "performance", "a_pain", "back_if", "a_steal.2)", "was", "pair", "regretting", "ever", "pain", "<num>_months", "huge", "regretting_my", "module_it", "even_for", "volume_to", "are_perfect", "also", "say", "to_anyone", "bend_it", "recommend", "of_equal", "and_i", "clipped", "clip_to", "to_mow", "to_mow", "able", "things_to", "<num>_is", "when", "<year>_on", "the_clip", "the_clip", "weak.3)", "purchase_and", "price_i'd", "mow_in", "only_thing", "best", "best", "caught_the", "moving_around", "i've", "i've", "i've", "video", "need", "caught", "about_them.1)", "stops", "asking", "that_said", "the_cord", "when_moving", "headphones", "now_i", "the_same", "clipped_to", "back", "clip_on", "logitech", "muff", "60gb", "jack_up", "altec's", "steal.2)", "can_get", "drown", "for_<year>", "w/o", "go_that", "top_notch", 
"about_<num>", "allow_me", "you_need", "is_weak.3)", "the_only", "things_outperform", "best_i've", "around_you", "lists_for", "them", "months", "the_volume", "the_volume", "the_volume", "way", "way", "perfect_is", "get_quite", "moving", "pair_of", "price", "price", "my", "my", "my", "noise", "weak.3)_performance", "way_regretting", "me", "me", "me", "you", "lawn_without", "create", "me_from", "on_things", "things_causing", "volume", "volume", "volume", "top", "same_price", "bend", "bend", "stops_me", "create_inner", "altec's_site", "anyone", "my_purchase", "using_my", "outperform_the", "removeable", "otherwise_it", "said_i", "and_get", "few_things", "get", "get", "to_say", "that_route", "route_with", "way_when", "ever_used", "drown_out", "i_have", "conclusion_the", "the_lawn", "to_bend", "to_bend", "best_headphones", "video_ipod", "is_removeable", "inner", "none_best", "price_lists", "if", "the_huge", "without", "module_can", "ear_muff", "inner_ear", "no_way", "something", "without_having", "mow_the", "bend_away", "away", "even", "months_now", "go", "these_things", "these_things", "module_is", "headphones_i've", "i", "i", "i", "i", "huge_logitech", "after_using", "them.1)", "my_60gb", "purchase", "conclusion", "mower.4)", "equal_quality", "now", "recommend_these", "i'd_go", "thing_that", "about", "about", "about", "bar", "cord_on", "get_in", "if_there", "about_them", "few", "use_mainly", "i_use", "things_allow", "the_module", "the_module", "can_create", "used", "w/o_the", "only", "on_altec's", "in_no", "notch", "allow", "to_jack", "things", "things", "things", "things", "the_mower.4)", "thing", "to_something", "the_earphones", "causing", "after", "module_even", "ear_noise", "mainly"),SentimentLabel.Positive)
+    assert(PreprocMDSDReviews.getTokensFromLine(line)===result)
+    assert(PreprocMDSDReviews.getTokensFromLine(line2)===result2)
+  }
+  // The new preprocessor expects a file rather than a single input line, so the
+  // line-based test below is disabled until it is decided how to adapt it.
+  /*"processOneLine" should "produce expected output" in {
+    assert(
+      pst.processOneLine(9, HCR_INPUT_FIELDS, Set("for", "you", "mr"))
+        ===
+        SuccessfulHCRParse(
+          HCR_TWEET_ID,
+          HCR_USERNAME,
+          List((HCR_SENTIMENT_GOLD,
+          HCR_TARGET)),
+          HCR_FEATURES))
+  }*/
+  // TODO: test failure modes (e.g. fields without a colon, non-numeric counts, blank lines).