Commits

abhimanu committed 7019c83

Error Analysis code for the paper

  • Participants
  • Parent commits 420b771

Comments (0)

Files changed (6)

File .cache

Binary file modified.

File src/main/scala/opennlp/textgrounder/app/RunApps.scala

     
     if(args.length==0){
     println("default args")
-    val args = new Array[String](18)
+    val args = new Array[String](22)
     args(0) = "--input-corpus"
 //    args(1) = "/home/abhimanu/textgrounder_temporal/data/corpora/temporal/docthresh-5"
-    args(1) = "/home/abhimanu/datasets/textgrounder/temporal/wiki-bio/" //wiki-bio gutts wiki-years
+    args(1) = "/home/abhimanu/datasets/textgrounder/temporal/gutts/" //wiki-bio gutts wiki-years
     args(2) = "--width-of-multi-cell"
-    args(3) = "10"
+    args(3) = "40"
     args(4) = "--eval-set"
     args(5) = "dev"
     args(6) = "--word-dist"
     //NOTE: Naive Bayes has been currently coded for only Dirichlet/JM Smoothing
     //By default Bayes is chronon-docs
     args(12) = "--smoothing-par"
-    args(13) = "0.01"					//for JM=0.99  and cikm = 0.01
+    args(13) = "0.99"					//for JM=0.99  and cikm = 0.01
     args(14) = "--bayes-prior"
     args(15) = "chronon-docs"  					//"uniform" "chronon-docs" ; default=chronon-docs
     args(16) = "--smoothing-type"  
     args(17) = "JM"  					//"Dirichlet" "JM" ; default = "JM" 
+    args(18) = "--frequency-limit"
+    args(19) = "3"
+    args(20) = "--error-limit"
+    args(21) = "107"
     
       //NOTE: always remember to set "temporal-dirichlet" before doing bayes
     TemporalDocumentApp.main(args)

File src/main/scala/opennlp/textgrounder/geolocate/Evaluation.scala

       warning("Strange, no instances found at all; perhaps --eval-format is incorrect?")
       return
     }
+    all_results_flag = all_results
     errprint("Number of instances = %s", total_instances)
     output_correct_results()
     output_incorrect_results()

File src/main/scala/opennlp/textgrounder/geolocate/Geolocate.scala

       choices = Seq("uniform", "chronon-docs"),
       help = """Chooses Bayes Prior. Options are "uniform" and "chronon-docs".
 Default '%default'.""")
-      
+
+  var analysis_word_document_frequency_limit =
+    ap.option[Int]("frequency-limit", "freq-limit", metavar = "NUM",
+      default = 1,
+      help = """ Document Frequency limit for a word.""")
+  var analysis_error_limit =
+    ap.option[Double]("error-limit", "error-limit", metavar = "NUM",
+      default = 100.0,
+      help = """ Error Limit for a document. """)
 //  var partial-cell-count =
 //    ap.option[Int]("partial-cell-count", "partial-cell", metavar = "NUM",
 //      default = 0,

File src/main/scala/opennlp/textgrounder/temporal/TemporalDirichletSmoothedWordDist.scala

   def innerToString = ", %.2f unseen mass" format unseen_mass
 
   
+  def getWordFromInt(wordInt: Int) : String = {
+    unmemoize_string(wordInt)
+  }
   
   protected def imp_add_word_distribution_partial(xworddist: WordDist, partial: Double){
     val worddist = xworddist.asInstanceOf[UnigramWordDist]

File src/main/scala/opennlp/textgrounder/temporal/TemporalEvaluation.scala

 import opennlp.textgrounder.geolocate.DistDocumentFileProcessor
 import opennlp.textgrounder.geolocate.UnigramWordDist
 import opennlp.textgrounder.geolocate.WordDist.memoizer._
+import scala.runtime.RichDouble
 
 
 abstract class TemporalTestFileEvaluator[TEvalDoc, TEvalRes](
   driver_stats, prefix, max_rank_for_credit) {
   val degree_dists = mutable.Buffer[Double]()
   val oracle_degree_dists = mutable.Buffer[Double]()
-  val error_histograms = mutable.HashMap[TemporalCoord,(Int, Double)]()
-
+  val error_histograms = mutable.HashMap[Double,(Int, Double,String)]()
+  val word_ErrorHistogram = mutable.HashMap[String,(Int, Double, Double)]()
+  val worst_docs = mutable.HashMap[TemporalDocument,(Double,Int,TemporalCell,TemporalCell)]()
+  var limit_flag = 1
   def record_result(rank: Int, pred_true_dist: Double,
       pred_degree_dist: Double) {
     super.record_result(rank, pred_true_dist)
   }
   
   def record_histogram(true_center: TemporalCoord, pred_degdist: Double){
-    var value=error_histograms.get(true_center)
+    val key = (true_center.left+true_center.right)*1.0/2
+    var value=error_histograms.get(key)
     value match{
-      case Some(x) =>{error_histograms.put(true_center,(x._1+1,x._2+pred_degdist))}
-      case None => error_histograms.put(true_center,(1,pred_degdist))
+      case Some(x) =>{error_histograms.put(key,(x._1+1,x._2+pred_degdist,true_center.toString()))}
+      case None => error_histograms.put(key,(1,pred_degdist,true_center.toString()))
     }
   }
 
+  def record_word_errors(word: String, pred_degdist:Double, limit: Int){
+    limit_flag = limit
+    var value=word_ErrorHistogram.get(word)
+    value match{
+      case Some(x) =>{word_ErrorHistogram.put(word,(x._1+1,x._2+pred_degdist,(x._2+pred_degdist)*1.0/(x._1+1)))}
+      case None => word_ErrorHistogram.put(word,(1,pred_degdist,pred_degdist))
+    }
+  }
+  
+  def record_worst_predictions(document: TemporalDocument, pred_degdist: Double, true_rank: Int, 
+      true_cell: TemporalCell, pred_cell: TemporalCell){
+    worst_docs.put(document,(pred_degdist,true_rank,true_cell,pred_cell))
+  }
+  
   def record_oracle_result(oracle_true_dist: Double,
       oracle_degree_dist: Double) {
     super.record_oracle_result(oracle_true_dist)
     errprint("  Median error distance = %.2f Years",
       median(degree_dists))
     if(all_results_flag){
-      println("\n\n\n\n\t\t=====================================\n")
-    	for(coord <- error_histograms.keySet){
-    		var value = error_histograms.get(coord)
-    		println(coord.toString()+"\t",value.get._2/value.get._1,value.get._1)
+//      val temp_map = mutable.ArrayBuffer[(String, Double,Int)]()
+      println("\n\n\n\n\t\t===================Analysis Stats==================\n")
+      println("\t\t===================Error Histogram==================\n")
+    	for(i <- error_histograms.keySet.toList.sorted){
+    	  val value = error_histograms.get(i)
+    	  println(value.get._3+"\t",value.get._2/value.get._1,value.get._1)
     	}
-      
+      println("\t\t===================Word Errors==================\n")	
+      for(value <- word_ErrorHistogram.toList sortBy {-_._2._3}){
+        if(value._2._1>limit_flag)
+    		println(value._1+"\t",value._2._2/value._2._1, value._2._3,value._2._1)
+    	}
+      println("\t\t===================Worst Docs==================\n")
+      for(value <- worst_docs.toList sortBy {-_._2._1}){
+        if(value._2._1>100)
+    		println(value._1,value._2._1, value._2._2," True Cell: "+value._2._3," Pred Cell"+value._2._4)
+    	}
     }
 //    errprint("  Median oracle true error distance = %s",
 //      km_and_miles(median(oracle_true_dists)))
 
   override def record_one_result(stats: TBasicEvalStats,
       res: TDocEvalRes) {
+    val parameters = cell_grid.table.driver.params
     stats.record_result(res.true_rank, res.pred_truedist, res.pred_degdist)
     stats.record_histogram(res.true_center, res.pred_degdist)
+    if(res.pred_degdist>parameters.analysis_error_limit)
+    	stats.record_worst_predictions(res.document,res.pred_degdist, res.true_rank, 
+        res.true_cell.asInstanceOf[TemporalCell], res.pred_cell)
+    val dist = res.document.dist.asInstanceOf[TemporalDirichletSmoothedWordDist]
+    for (word <- dist.counts.keys){
+      stats.record_word_errors(dist.getWordFromInt(word),res.pred_degdist, parameters.analysis_word_document_frequency_limit)
+    }
   }
 
   override def record_one_oracle_result(stats: TBasicEvalStats,