Commits

Ben Wing committed 9d520de

Rewriting to add --coord-strategy for mean shift, parameters for mean shift and lots of hacking to allow for evaluation that produces a point rather than necessarily a cell

  • Participants
  • Parent commits 92173ab

Comments (0)

Files changed (7)

src/main/scala/opennlp/textgrounder/geolocate/Geolocate.scala

 NOTE: Multiple --strategy options can be given, and each strategy will
 be tried, one after the other.""")
 
+  var coord_strategy =
+    ap.option[String]("coord-strategy", "cs",
+      default = "top-ranked",
+      choices = Seq("top-ranked", "mean-shift"),
+      help = """Strategy/strategies to use to choose the best coordinate for
+a document.
+
+'top-ranked' means to choose the single best-ranked cell according to the
+scoring strategy specified using '--strategy', and use its central point.
+
+'mean-shift' means to take the K best cells (according to '--k-best'),
+and then compute a single point using the mean-shift algorithm.  This
+algorithm works by steadily shifting each point towards the others by
+computing an average of the points surrounding a given point, weighted
+by a function that drops off rapidly as the distance from the point
+increases (specifically, the weighting is the same as for a Gaussian density,
+with a parameter H, specified using '--mean-shift-window', that corresponds to
+the standard deviation in the Gaussian distribution function).  The idea is
+that the points will eventually converge on the largest cluster within the
+original points.  The algorithm repeatedly moves the points closer to each
+other until either the total standard deviation of the points (i.e.
+approximately the average distance of the points from their mean) is less than
+the value specified by '--mean-shift-max-stddev', or the number of iterations
+exceeds '--mean-shift-max-iterations'.
+
+Default '%default'.""")
+
+  var k_best =
+    ap.option[Int]("k-best", "kb",
+      default = 10,
+      help = """Value of K for use in the mean-shift algorithm
+(see '--coord-strategy').  For this value of K, we choose the K best cells
+and then apply the mean-shift algorithm to the central points of those cells.
+
+Default '%default'.""")
+
+  var mean_shift_window =
+    ap.option[Double]("mean-shift-window", "msw",
+      default = 1.0,
+      help = """Window to use in the mean-shift algorithm
+(see '--coord-strategy').
+
+Default '%default'.""")
+
+  var mean_shift_max_stddev =
+    ap.option[Double]("mean-shift-max-stddev", "msms",
+      default = 1e-10,
+      help = """Maximum allowed standard deviation (i.e. approximately the
+average distance of the points from their mean) among the points selected by
+the mean-shift algorithm (see '--coord-strategy').
+
+Default '%default'.""")
+
+  var mean_shift_max_iterations =
+    ap.option[Int]("mean-shift-max-iterations", "msmi",
+      default = 100,
+      help = """Maximum number of iterations in the mean-shift algorithm
+(see '--coord-strategy').
+
+Default '%default'.""")
+
   var baseline_strategy =
     ap.multiOption[String]("baseline-strategy", "bs",
       default = Seq("internal-link"),
   override type TParam <: GeolocateDocumentParameters
   type TRunRes =
     Seq[(String, GridLocateDocumentStrategy[SphereCell, SphereCellGrid],
-         TestFileEvaluator[_,_])]
+         TestDocumentEvaluator[_,_])]
 
   var strategies: Seq[(String, GridLocateDocumentStrategy[SphereCell, SphereCellGrid])] = _
 
    *
    * The current return type is as follows:
    *
-   * Seq[(java.lang.String, GridLocateDocumentStrategy[SphereCell, SphereCellGrid], scala.collection.mutable.Map[evalobj.Document,opennlp.textgrounder.geolocate.EvaluationResult])] where val evalobj: opennlp.textgrounder.geolocate.TestFileEvaluator
+   * Seq[(java.lang.String, GridLocateDocumentStrategy[SphereCell, SphereCellGrid], scala.collection.mutable.Map[evalobj.Document,opennlp.textgrounder.geolocate.EvaluationResult])] where val evalobj: opennlp.textgrounder.geolocate.TestDocumentEvaluator
    *
    * This means you get a sequence of tuples of
    * (strategyname, strategy, results)
    * strategyname = name of strategy as given on command line
    * strategy = strategy object
    * results = map listing results for each document (an abstract type
-   * defined in TestFileEvaluator; the result type EvaluationResult
+   * defined in TestDocumentEvaluator; the result type EvaluationResult
    * is practically an abstract type, too -- the most useful dynamic
    * type in practice is DocumentEvaluationResult)
    */
 
+  def create_document_evaluator(
+      strategy: GridLocateDocumentStrategy[SphereCell, SphereCellGrid],
+      stratname: String) = {
+    // Generate reader object
+    if (params.eval_format == "pcl-travel")
+      new PCLTravelGeolocateDocumentEvaluator(strategy, stratname, this)
+    else if (params.coord_strategy =="top-ranked")
+      new RankedCorpusGeolocateDocumentEvaluator(strategy, stratname, this)
+    else
+      new MeanShiftCorpusGeolocateDocumentEvaluator(strategy, stratname, this,
+        params.k_best, params.mean_shift_window,
+        params.mean_shift_max_stddev,
+        params.mean_shift_max_iterations)
+  }
+
   def run_after_setup() = {
-    process_strategies(strategies)((stratname, strategy) => {
-      // Generate reader object
-      if (params.eval_format == "pcl-travel")
-        new PCLTravelGeolocateDocumentEvaluator(strategy, stratname, this)
-      else
-        new CorpusGeolocateDocumentEvaluator(strategy, stratname, this)
-    })
+    process_strategies(strategies)((stratname, strategy) =>
+      create_document_evaluator(strategy, stratname))
   }
 }
 

src/main/scala/opennlp/textgrounder/geolocate/Hadoop.scala

 import org.apache.hadoop.fs.Path
 
 import opennlp.textgrounder.util.argparser._
+import opennlp.textgrounder.util.distances._
 import opennlp.textgrounder.util.experiment.ExperimentMeteredTask
 import opennlp.textgrounder.util.hadoop._
 import opennlp.textgrounder.util.ioutil.FileHandler
 import opennlp.textgrounder.util.mathutil.{mean, median}
 import opennlp.textgrounder.util.printutil.{errprint, warning}
 
-import opennlp.textgrounder.gridlocate.{TextGrounderInfo,DistDocumentFileProcessor}
+import opennlp.textgrounder.gridlocate.{CorpusDocumentEvaluator,TextGrounderInfo,DistDocumentFileProcessor}
 
 /* Basic idea for hooking up Geolocate with Hadoop.  Hadoop works in terms
    of key-value pairs, as follows:
   def create_param_object(ap: ArgParser) = new TParam(ap)
   def create_driver() = new TDriver
 
-  var evaluators: Iterable[CorpusGeolocateDocumentEvaluator] = null
+  var evaluators: Iterable[CorpusDocumentEvaluator[SphereCoord,SphereDocument,_,_,_]] = null
   val task = new ExperimentMeteredTask(driver, "document", "evaluating")
 
   class HadoopDocumentFileProcessor(
   var processor: HadoopDocumentFileProcessor = _
   override def init(context: TContext) {
     super.init(context)
-    evaluators =
-      for ((stratname, strategy) <- driver.strategies)
-        yield new CorpusGeolocateDocumentEvaluator(strategy, stratname,
-          driver)
-    if (driver.params.input_corpus.length != 1) {
+    if (driver.params.eval_format != "internal")
       driver.params.parser.error(
-        "FIXME: For Hadoop, currently need exactly one corpus")
-    } else {
-      processor = new HadoopDocumentFileProcessor(context)
-      processor.read_schema_from_corpus(driver.get_file_handler,
-          driver.params.input_corpus(0))
-      context.progress
+        "For Hadoop, '--eval-format' must be 'internal'")
+    else {
+      evaluators =
+        for ((stratname, strategy) <- driver.strategies)
+          yield driver.create_document_evaluator(strategy, stratname).
+            asInstanceOf[CorpusDocumentEvaluator[
+              SphereCoord,SphereDocument,_,_,_]]
+      if (driver.params.input_corpus.length != 1) {
+        driver.params.parser.error(
+          "FIXME: For Hadoop, currently need exactly one corpus")
+      } else {
+        processor = new HadoopDocumentFileProcessor(context)
+        processor.read_schema_from_corpus(driver.get_file_handler,
+            driver.params.input_corpus(0))
+        context.progress
+      }
     }
   }
 

src/main/scala/opennlp/textgrounder/geolocate/SphereEvaluation.scala

 
 //////// Statistics for geolocating documents
 
-class SphereDocumentEvalStats(
-  driver_stats: ExperimentDriverStats,
-  prefix: String,
-  max_rank_for_credit: Int = 10
-) extends DocumentEvalStats(
-  driver_stats, prefix, max_rank_for_credit) {
+trait SphereDocumentEvalStats extends DocumentEvalStats {
   // "True dist" means actual distance in km's or whatever.
   // "Degree dist" is the distance in degrees.
   val degree_dists = mutable.Buffer[Double]()
   val oracle_degree_dists = mutable.Buffer[Double]()
 
-  def record_result(rank: Int, pred_true_dist: Double,
-      pred_degree_dist: Double) {
-    super.record_result(rank, pred_true_dist)
+  def record_predicted_degree_distance(pred_degree_dist: Double) {
     degree_dists += pred_degree_dist
   }
 
-  def record_oracle_result(oracle_true_dist: Double,
-      oracle_degree_dist: Double) {
-    super.record_oracle_result(oracle_true_dist)
+  def record_oracle_degree_distance(oracle_degree_dist: Double) {
     oracle_degree_dists += oracle_degree_dist
   }
 
 }
 
 /**
+ * SphereDocument version of `CoordDocumentEvalStats`.
+ */
+class CoordSphereDocumentEvalStats(
+  driver_stats: ExperimentDriverStats,
+  prefix: String
+) extends CoordDocumentEvalStats(driver_stats, prefix)
+  with SphereDocumentEvalStats {
+}
+
+/**
+ * SphereDocument version of `RankedDocumentEvalStats`.
+ */
+class RankedSphereDocumentEvalStats(
+  driver_stats: ExperimentDriverStats,
+  prefix: String,
+  max_rank_for_credit: Int = 10
+) extends RankedDocumentEvalStats(driver_stats, prefix, max_rank_for_credit)
+  with SphereDocumentEvalStats {
+}
+
+/**
  * Class for statistics for geolocating documents, with separate
  * sets of statistics for different intervals of error distances and
  * number of documents in true cell.
 class SphereGroupedDocumentEvalStats(
   driver_stats: ExperimentDriverStats,
   cell_grid: SphereCellGrid,
-  results_by_range: Boolean
+  results_by_range: Boolean,
+  is_ranked: Boolean
 ) extends GroupedDocumentEvalStats[
-  SphereCoord, SphereDocument, SphereCell](
-  driver_stats, cell_grid, results_by_range) {
-  type TBasicEvalStats = SphereDocumentEvalStats
-  type TDocEvalRes = SphereDocumentEvaluationResult
-  override def create_stats(prefix: String) =
-    new SphereDocumentEvalStats(driver_stats, prefix)
+  SphereCoord, SphereDocument, SphereCell, SphereCellGrid,
+  SphereDocumentEvaluationResult
+](driver_stats, cell_grid, results_by_range) {
+  override def create_stats(prefix: String) = {
+    if (is_ranked)
+      new RankedSphereDocumentEvalStats(driver_stats, prefix)
+    else
+      new CoordSphereDocumentEvalStats(driver_stats, prefix)
+  }
 
   val docs_by_degree_dist_to_true_center =
     docmap("degree_dist_to_true_center")
     new DoubleTableByRange(dist_fractions_for_error_dist,
       create_stats_for_range("degree_dist_to_pred_center", _))
 
-  override def record_one_result(stats: TBasicEvalStats,
-      res: TDocEvalRes) {
-    stats.record_result(res.true_rank, res.pred_truedist, res.pred_degdist)
+  override def record_one_result(stats: DocumentEvalStats,
+      res: SphereDocumentEvaluationResult) {
+    super.record_one_result(stats, res)
+    stats.asInstanceOf[SphereDocumentEvalStats].
+      record_predicted_degree_distance(res.pred_degdist)
   }
 
-  override def record_one_oracle_result(stats: TBasicEvalStats,
-      res: TDocEvalRes) {
-    stats.record_oracle_result(res.true_truedist, res.true_degdist)
+  override def record_one_oracle_result(stats: DocumentEvalStats,
+      res: SphereDocumentEvaluationResult) {
+    super.record_one_oracle_result(stats, res)
+    stats.asInstanceOf[SphereDocumentEvalStats].
+      record_oracle_degree_distance(res.true_degdist)
   }
 
-  override def record_result_by_range(res: TDocEvalRes) {
+  override def record_result_by_range(res: SphereDocumentEvaluationResult) {
     super.record_result_by_range(res)
 
     /* FIXME: This code specific to MultiRegularCellGrid is kind of ugly.
         fracinc * floor(frac_true_degdist / fracinc)
       val rounded_frac_true_degdist =
         fracinc * floor(frac_true_degdist / fracinc)
-      docs_by_true_dist_to_true_center(rounded_frac_true_truedist).
-        record_result(res.true_rank, res.pred_truedist, res.pred_degdist)
-      docs_by_degree_dist_to_true_center(rounded_frac_true_degdist).
-        record_result(res.true_rank, res.pred_truedist, res.pred_degdist)
+      res.record_result(docs_by_true_dist_to_true_center(
+        rounded_frac_true_truedist))
+      res.record_result(docs_by_degree_dist_to_true_center(
+        rounded_frac_true_degdist))
 
       /* For distance to center of predicted cell, which may be large, since
          predicted cell may be nowhere near the true cell.  Again we convert
          cell size */
       val frac_pred_truedist = res.pred_truedist / multigrid.km_per_cell
       val frac_pred_degdist = res.pred_degdist / multigrid.degrees_per_cell
-      docs_by_true_dist_to_pred_center.get_collector(frac_pred_truedist).
-        record_result(res.true_rank, res.pred_truedist, res.pred_degdist)
-      docs_by_degree_dist_to_pred_center.get_collector(frac_pred_degdist).
-        record_result(res.true_rank, res.pred_truedist, res.pred_degdist)
+      res.record_result(docs_by_true_dist_to_pred_center.get_collector(
+        frac_pred_truedist))
+      res.record_result(docs_by_degree_dist_to_pred_center.get_collector(
+        frac_pred_degdist))
      } else if (cell_grid.isInstanceOf[KdTreeCellGrid]) {
        // for kd trees, we do something similar to above, but round to the nearest km...
        val kdgrid = cell_grid.asInstanceOf[KdTreeCellGrid]
-       docs_by_true_dist_to_true_center(round(res.true_truedist)).
-         record_result(res.true_rank, res.pred_truedist, res.pred_degdist)
-       docs_by_degree_dist_to_true_center(round(res.true_degdist)).
-         record_result(res.true_rank, res.pred_truedist, res.pred_degdist)
+       res.record_result(docs_by_true_dist_to_true_center(
+         round(res.true_truedist)))
+       res.record_result(docs_by_degree_dist_to_true_center(
+         round(res.true_degdist)))
     }
   }
 
 //                             Main evaluation code                        //
 /////////////////////////////////////////////////////////////////////////////
 
-abstract class GeolocateDocumentEvaluator[TEvalDoc, TEvalRes](
-  strategy: GridLocateDocumentStrategy[SphereCell, SphereCellGrid],
-  stratname: String,
-  driver: GeolocateDocumentTypeDriver
-) extends DocumentEvaluator[SphereCoord, SphereDocument, SphereCell,
-  SphereCellGrid, TEvalDoc, TEvalRes](strategy, stratname, driver) {
-  type TGroupedEvalStats = SphereGroupedDocumentEvalStats
-  def create_grouped_eval_stats(driver: GridLocateDriver,
-    cell_grid: SphereCellGrid, results_by_range: Boolean) =
-    new SphereGroupedDocumentEvalStats(driver,
-      cell_grid.asInstanceOf[SphereCellGrid],
-      results_by_range)
+trait SphereDocumentEvaluationResult extends DocumentEvaluationResult[
+  SphereCoord, SphereDocument, SphereCell, SphereCellGrid
+] {
+  val xdocument: SphereDocument
+  /* The following must be declared as 'lazy' because 'xdocument' above isn't
+     initialized at creation time (which is impossible because traits can't
+     have construction parameters). */
+  /**
+   * Distance in degrees between document's coordinate and central
+   * point of true cell
+   */
+  lazy val true_degdist = xdocument.degree_distance_to_coord(true_center)
+  /**
+   * Distance in degrees between document's coordinate and predicted
+   * coordinate
+   */
+  lazy val pred_degdist = xdocument.degree_distance_to_coord(pred_coord)
 }
 
-class SphereDocumentEvaluationResult(
+/**
+ * Result of evaluating a SphereDocument using an algorithm that does
+ * cell-by-cell comparison and computes a ranking of all the cells.
+ * The predicted coordinate is the central point of the top-ranked cell,
+ * and the cell grid is derived from the cell.
+ *
+ * @param document document whose coordinate is predicted
+ * @param pred_cell top-ranked predicted cell in which the document should
+ *        belong
+ * @param true_rank rank of the document's true cell among all of the
+ *        predicted cells
+ */
+class SphereDocumentEvaluationResultCell(
   document: SphereDocument,
   pred_cell: SphereCell,
   true_rank: Int
-) extends DocumentEvaluationResult[SphereCoord, SphereDocument, SphereCell](
+) extends DocumentEvaluationResultCell[
+  SphereCoord, SphereDocument, SphereCell, SphereCellGrid
+  ](
   document, pred_cell, true_rank
-) {
-  val true_degdist = document.degree_distance_to_coord(true_center)
-  val pred_degdist = document.degree_distance_to_coord(pred_center)
+) with SphereDocumentEvaluationResult {
+  val xdocument = document
+}
+
+/**
+ * Result of evaluating a SphereDocument using an algorithm that
+ * predicts a coordinate that is not necessarily the central point of
+ * any cell (e.g. using a mean-shift algorithm).
+ *
+ * @param document document whose coordinate is predicted
+ * @param cell_grid cell grid against which error comparison should be done
+ * @param pred_coord predicted coordinate of the document
+ */
+class SphereDocumentEvaluationResultCoord(
+  document: SphereDocument,
+  cell_grid: SphereCellGrid,
+  pred_coord: SphereCoord
+) extends DocumentEvaluationResult[
+  SphereCoord, SphereDocument, SphereCell, SphereCellGrid
+  ](
+  document, cell_grid, pred_coord
+) with SphereDocumentEvaluationResult {
+  val xdocument = document
 }
 
 /**
  * Class to do document geolocating on documents from the document data, in
  * the dev or test set.
  */
-class CorpusGeolocateDocumentEvaluator(
+class RankedCorpusGeolocateDocumentEvaluator(
   strategy: GridLocateDocumentStrategy[SphereCell, SphereCellGrid],
   stratname: String,
   driver: GeolocateDocumentTypeDriver
-) extends CorpusDocumentEvaluator[
+) extends RankedCorpusDocumentEvaluator[
   SphereCoord, SphereDocument, SphereCell, SphereCellGrid,
-  SphereDocumentEvaluationResult
+  SphereDocumentEvaluationResultCell
 ](strategy, stratname, driver) {
-  // FIXME, the following 8 lines are copied from GeolocateDocumentEvaluator
-  type TGroupedEvalStats = SphereGroupedDocumentEvalStats
   def create_grouped_eval_stats(driver: GridLocateDriver,
     cell_grid: SphereCellGrid, results_by_range: Boolean) =
     new SphereGroupedDocumentEvalStats(driver,
       cell_grid.asInstanceOf[SphereCellGrid],
-      results_by_range)
-  def create_evaluation_result(document: SphereDocument, pred_cell: SphereCell,
-      true_rank: Int) =
-    new SphereDocumentEvaluationResult(document, pred_cell, true_rank)
+      results_by_range, is_ranked = true)
+  def create_cell_evaluation_result(document: SphereDocument,
+      pred_cell: SphereCell, true_rank: Int) =
+    new SphereDocumentEvaluationResultCell(document, pred_cell, true_rank)
 
   val num_nearest_neighbors = driver.params.num_nearest_neighbors
 
   def print_individual_result(doctag: String, document: SphereDocument,
-      result: SphereDocumentEvaluationResult,
+      result: SphereDocumentEvaluationResultCell,
       pred_cells: Array[(SphereCell, Double)]) {
     errprint("%s:Document %s:", doctag, document)
     // errprint("%s:Document distribution: %s", doctag, document.dist)
     errprint("%s:  Distance %.2f km to true cell center at %s",
       doctag, result.true_truedist, result.true_center)
     errprint("%s:  Distance %.2f km to predicted cell center at %s",
-      doctag, result.pred_truedist, result.pred_center)
+      doctag, result.pred_truedist, result.pred_coord)
 
     val avg_dist_of_neighbors = mean(closest_half_with_dists.map(_._2))
     errprint("%s:  Average distance from true cell center to %s closest cells' centers from %s best matches: %.2f km",
   }
 }
 
+/**
+ * Class to do document geolocating on documents from the document data, in
+ * the dev or test set.
+ */
+class MeanShiftCorpusGeolocateDocumentEvaluator(
+  strategy: GridLocateDocumentStrategy[SphereCell, SphereCellGrid],
+  stratname: String,
+  driver: GeolocateDocumentTypeDriver,
+  k_best: Int,
+  mean_shift_window: Double,
+  mean_shift_max_stddev: Double,
+  mean_shift_max_iterations: Int
+) extends MeanShiftCorpusDocumentEvaluator[
+  SphereCoord, SphereDocument, SphereCell, SphereCellGrid,
+  SphereDocumentEvaluationResult
+](strategy, stratname, driver, k_best, mean_shift_window,
+  mean_shift_max_stddev, mean_shift_max_iterations) {
+  def create_grouped_eval_stats(driver: GridLocateDriver,
+    cell_grid: SphereCellGrid, results_by_range: Boolean) =
+    new SphereGroupedDocumentEvalStats(driver,
+      cell_grid.asInstanceOf[SphereCellGrid],
+      results_by_range, is_ranked = false)
+  def create_coord_evaluation_result(document: SphereDocument,
+      cell_grid: SphereCellGrid, pred_coord: SphereCoord) =
+    new SphereDocumentEvaluationResultCoord(document, cell_grid, pred_coord)
+
+  def print_individual_result(doctag: String, document: SphereDocument,
+      result: SphereDocumentEvaluationResult) {
+    errprint("%s:Document %s:", doctag, document)
+    // errprint("%s:Document distribution: %s", doctag, document.dist)
+    errprint("%s:  %d types, %f tokens",
+      doctag, document.dist.num_word_types, document.dist.num_word_tokens)
+    errprint("%s:  true cell: %s", doctag, result.true_cell)
+
+    errprint("%s:  Distance %.2f km to true cell center at %s",
+      doctag, result.true_truedist, result.true_center)
+    errprint("%s:  Distance %.2f km to predicted cell center at %s",
+      doctag, result.pred_truedist, result.pred_coord)
+  }
+}
+
 case class TitledDocument(title: String, text: String)
 class TitledDocumentResult { }
 
   strategy: GridLocateDocumentStrategy[SphereCell, SphereCellGrid],
   stratname: String,
   driver: GeolocateDocumentTypeDriver
-) extends GeolocateDocumentEvaluator[
+) extends TestDocumentEvaluator[
   TitledDocument, TitledDocumentResult
-](strategy, stratname, driver) with DocumentIteratingEvaluator[
+](stratname, driver) with DocumentIteratingEvaluator[
   TitledDocument, TitledDocumentResult
 ] {
   def iter_documents(filehand: FileHandler, filename: String) = {
 
     new TitledDocumentResult()
   }
+
+  def output_results(isfinal: Boolean = false) {
+  }
 }
 

src/main/scala/opennlp/textgrounder/geolocate/toponym/Toponym.scala

 import opennlp.textgrounder.util.osutil._
 import opennlp.textgrounder.util.printutil.{errout, errprint, warning}
 
-import opennlp.textgrounder.gridlocate.{CombinedWordDist,EvalStats,TestFileEvaluator,DocumentIteratingEvaluator}
+import opennlp.textgrounder.gridlocate.{CombinedWordDist,EvalStats,TestDocumentEvaluator,DocumentIteratingEvaluator}
 import opennlp.textgrounder.gridlocate.GridLocateDriver.Debug._
 import opennlp.textgrounder.geolocate._
 
   strategy: GeolocateToponymStrategy,
   stratname: String,
   driver: GeolocateToponymDriver
-) extends TestFileEvaluator[
+) extends TestDocumentEvaluator[
   GeogWordDocument, ToponymEvaluationResult
 ](stratname, driver) with DocumentIteratingEvaluator[
   GeogWordDocument, ToponymEvaluationResult
     GeolocateDriver with StandaloneExperimentDriverStats {
   type TParam = GeolocateToponymParameters
   type TRunRes =
-    Seq[(String, GeolocateToponymStrategy, TestFileEvaluator[_,_])]
+    Seq[(String, GeolocateToponymStrategy, TestDocumentEvaluator[_,_])]
 
   override def handle_parameters() {
     super.handle_parameters()

src/main/scala/opennlp/textgrounder/gridlocate/Evaluation.scala

 
 //////// Statistics for locating documents
 
-case class DocumentEvaluationResult[
+/**
+ * General class for the result of evaluating a document.  Specifies a
+ * document, cell grid, and the predicted coordinate for the document.
+ * The reason that a cell grid needs to be given is that we need to
+ * retrieve the cell that the document belongs to in order to get the
+ * "central point" (center or centroid of the cell), and in general we
+ * may be operating with multiple cell grids (e.g. in the combination of
+ * uniform and k-D tree grids). (FIXME: I don't know if this is actually
+ * true.)
+ *
+ * FIXME: Perhaps we should redo the results in terms of pseudo-documents
+ * instead of cells.
+ *
+ * @tparam TCoord type of a coordinate
+ * @tparam TDoc type of a document
+ * @tparam TCell type of a cell
+ * @tparam TGrid type of a cell grid
+ *
+ * @param document document whose coordinate is predicted
+ * @param cell_grid cell grid against which error comparison should be done
+ * @param pred_coord predicted coordinate of the document
+ */
+class DocumentEvaluationResult[
   TCoord,
   TDoc <: DistDocument[TCoord],
-  TCell <: GeoCell[TCoord, TDoc]
+  TCell <: GeoCell[TCoord, TDoc],
+  TGrid <: CellGrid[TCoord, TDoc, TCell]
+](
+  val document: TDoc,
+  val cell_grid: TGrid,
+  val pred_coord: TCoord
+) {
+  /**
+   * True cell in the cell grid in which the document belongs
+   */
+  val true_cell = cell_grid.find_best_cell_for_coord(document.coord, true)
+  /**
+   * Number of documents in the true cell
+   */
+  val num_docs_in_true_cell = true_cell.combined_dist.num_docs_for_word_dist
+  /**
+   * Central point of the true cell
+   */
+  val true_center = true_cell.get_center_coord()
+  /**
+   * "True distance" (rather than e.g. degree distance) between document's
+   * coordinate and central point of true cell
+   */
+  val true_truedist = document.distance_to_coord(true_center)
+  /**
+   * "True distance" (rather than e.g. degree distance) between document's
+   * coordinate and predicted coordinate
+   */
+  val pred_truedist = document.distance_to_coord(pred_coord)
+
+  def record_result(stats: DocumentEvalStats) {
+    stats.record_predicted_distance(pred_truedist)
+  }
+}
+
+/**
+ * Subclass of `DocumentEvaluationResult` where the predicted coordinate
+ * is specifically the central point of one of the grid cells.
+ *
+ * @param document document whose coordinate is predicted
+ * @param pred_cell top-ranked predicted cell in which the document should
+ *        belong
+ * @param true_rank rank of the document's true cell among all of the
+ *        predicted cells
+ */
+class DocumentEvaluationResultCell[
+  TCoord,
+  TDoc <: DistDocument[TCoord],
+  TCell <: GeoCell[TCoord, TDoc],
+  TGrid <: CellGrid[TCoord, TDoc, TCell]
 ](
   document: TDoc,
-  pred_cell: TCell,
-  true_rank: Int
+  val pred_cell: TCell,
+  val true_rank: Int
+) extends DocumentEvaluationResult[TCoord, TDoc, TCell, TGrid](
+  document, pred_cell.cell_grid.asInstanceOf[TGrid],
+  pred_cell.get_center_coord()
 ) {
-  val true_cell =
-    pred_cell.cell_grid.find_best_cell_for_coord(document.coord, true)
-  val num_docs_in_true_cell = true_cell.combined_dist.num_docs_for_word_dist
-  val true_center = true_cell.get_center_coord()
-  val true_truedist = document.distance_to_coord(true_center)
-  val pred_center = pred_cell.get_center_coord()
-  val pred_truedist = document.distance_to_coord(pred_center)
+  override def record_result(stats: DocumentEvalStats) {
+    super.record_result(stats)
+    stats.asInstanceOf[RankedDocumentEvalStats].record_true_rank(true_rank)
+  }
 }
 
-abstract class DocumentEvalStats(
-  driver_stats: ExperimentDriverStats,
-  prefix: String,
-  max_rank_for_credit: Int = 10
-) extends EvalStatsWithRank(driver_stats, prefix, max_rank_for_credit) {
+/**
+ * A basic class for accumulating statistics from multiple evaluation
+ * results.
+ */
+trait DocumentEvalStats extends EvalStats {
   // "True dist" means actual distance in km's or whatever.
   val true_dists = mutable.Buffer[Double]()
   val oracle_true_dists = mutable.Buffer[Double]()
 
-  def record_result(rank: Int, pred_true_dist: Double) {
-    super.record_result(rank)
+  def record_predicted_distance(pred_true_dist: Double) {
     true_dists += pred_true_dist
   }
 
-  def record_oracle_result(oracle_true_dist: Double) {
+  def record_oracle_distance(oracle_true_dist: Double) {
     oracle_true_dists += oracle_true_dist
   }
 
 }
 
 /**
- * Class for statistics for locating documents, with separate
- * sets of statistics for different intervals of error distances and
- * number of documents in true cell.
+ * A class for accumulating statistics from multiple evaluation results,
+ * where the results directly specify a coordinate (rather than e.g. a cell).
  */
+abstract class CoordDocumentEvalStats(
+  driver_stats: ExperimentDriverStats,
+  prefix: String
+) extends EvalStats(driver_stats, prefix, Map[String, String]())
+  with DocumentEvalStats {
+}
 
-abstract class GroupedDocumentEvalStats[TCoord,
+/**
+ * A class for accumulating statistics from multiple evaluation results,
+ * including statistics on the rank of the true cell.
+ */
+abstract class RankedDocumentEvalStats(
+  driver_stats: ExperimentDriverStats,
+  prefix: String,
+  max_rank_for_credit: Int = 10
+) extends EvalStatsWithRank(driver_stats, prefix, max_rank_for_credit)
+  with DocumentEvalStats {
+  def record_true_rank(rank: Int) {
+    record_result(rank)
+  }
+}
+
+/**
+ * Class for accumulating statistics from multiple document evaluation results,
+ * with separate sets of statistics for different intervals of error distances
+ * and number of documents in true cell. ("Grouped" in the sense that we may be
+ * computing not only results for the documents as a whole but also for various
+ * subgroups.)
+ */
+abstract class GroupedDocumentEvalStats[
+  TCoord,
   TDoc <: DistDocument[TCoord],
-  TCell <: GeoCell[TCoord, TDoc]](
+  TCell <: GeoCell[TCoord, TDoc],
+  TGrid <: CellGrid[TCoord, TDoc, TCell],
+  -TDocEvalRes <: DocumentEvaluationResult[TCoord, TDoc, TCell, TGrid]
+](
   driver_stats: ExperimentDriverStats,
   cell_grid: CellGrid[TCoord,TDoc,TCell],
   results_by_range: Boolean
 ) {
-  type TBasicEvalStats <: DocumentEvalStats
-  type TDocEvalRes <:
-    DocumentEvaluationResult[TCoord, TDoc, TCell]
-
-  def create_stats(prefix: String): TBasicEvalStats
+  def create_stats(prefix: String): DocumentEvalStats
   def create_stats_for_range[T](prefix: String, range: T) =
     create_stats(prefix + ".byrange." + range)
 
   // and longitudinally.
   val dist_fraction_increment = 0.25
   def docmap(prefix: String) =
-    new SettingDefaultHashMap[Double, TBasicEvalStats](
+    new SettingDefaultHashMap[Double, DocumentEvalStats](
       create_stats_for_range(prefix, _))
   val docs_by_true_dist_to_true_center =
     docmap("true_dist_to_true_center")
     new DoubleTableByRange(dist_fractions_for_error_dist,
       create_stats_for_range("true_dist_to_pred_center", _))
 
-  def record_one_result(stats: TBasicEvalStats, res: TDocEvalRes) {
-    stats.record_result(res.true_rank, res.pred_truedist)
+  def record_one_result(stats: DocumentEvalStats, res: TDocEvalRes) {
+    res.record_result(stats)
   }
 
-  def record_one_oracle_result(stats: TBasicEvalStats, res: TDocEvalRes) {
-    stats.record_oracle_result(res.true_truedist)
+  def record_one_oracle_result(stats: DocumentEvalStats, res: TDocEvalRes) {
+    stats.record_oracle_distance(res.true_truedist)
   }
 
   def record_result(res: TDocEvalRes) {
 /////////////////////////////////////////////////////////////////////////////
 
 /**
- * Basic abstract class for reading documents from a test file and evaluating
- * on them.  Doesn't use any driver class. (FIXME, perhaps we should
- * integrate this into TestFileEvaluator.)
+ * Basic abstract class for evaluating a test document.  Doesn't use any
+ * driver class.
+ *
+ * TestDocumentEvaluator is currently the only subclass. (FIXME: Perhaps we
+ * should integrate the two.) The reason they are separated is because
+ * TestDocumentEvaluator makes use of a GridLocateDriver class, which
+ * encapsulates (among other things) various command-line parameters,
+ * in particular command-line parameters that allow a subset of the
+ * total set of documents to be evaluated.
  *
  * @tparam TEvalDoc Type of document to evaluate.
  * @tparam TEvalRes Type of result of evaluating a document.
+ *
+ * @param stratname Name of the strategy used to perform evaluation.
+ *   This is output in various status messages.
  */
-abstract class BasicTestFileEvaluator[TEvalDoc, TEvalRes](
+abstract class BasicTestDocumentEvaluator[TEvalDoc, TEvalRes](
   val stratname: String
 ) {
   var documents_processed = 0
   def would_stop_processing(new_processed: Int) = false
 
   /**
-   * Return true if document was actually processed and evaluated; false
-   * if skipped.
+   * Evaluate a document.  Return an object describing the results of the
+   * evaluation.
+   *
+   * @param document Document to evaluate.
+   * @param doctag A short string identifying the document (e.g. '#25'),
+   *   to be printed out at the beginning of diagnostic lines describing
+   *   the document and its evaluation results.
    */
   def evaluate_document(doc: TEvalDoc, doctag: String):
     TEvalRes
 }
 
 /**
- * Abstract class for reading documents from a test file and evaluating
- * on them.
+ * Abstract class for evaluating a test document.
  *
  * @tparam TEvalDoc Type of document to evaluate.
  * @tparam TEvalRes Type of result of evaluating a document.
  *
- * Evaluates on all of the given files, outputting periodic results and
- * results after all files are done.  If the evaluator uses documents as
- * documents (so that it doesn't need any external test files), the value
- * of 'files' should be a sequence of one item, which is null. (If an
- * empty sequence is passed in, no evaluation will happen.)
-
- * Also returns an object containing the results.
+ * @param stratname Name of the strategy used for performing evaluation.
+ * @param driver Driver class that encapsulates command-line parameters and
+ *   such.
+ *
+ * This is a subclass of `BasicTestDocumentEvaluator` which uses the command-line
+ * parameters to determine which documents should be skipped.
  */
-abstract class TestFileEvaluator[TEvalDoc, TEvalRes](
+abstract class TestDocumentEvaluator[TEvalDoc, TEvalRes](
   stratname: String,
   val driver: GridLocateDriver
-) extends BasicTestFileEvaluator[TEvalDoc, TEvalRes](stratname) {
+) extends BasicTestDocumentEvaluator[TEvalDoc, TEvalRes](stratname) {
   override val task = new MeteredTask("document", "evaluating",
     maxtime = driver.params.max_time_per_stage)
   var skip_initial = driver.params.skip_initial_test_docs
   }
 }
 
-abstract class DocumentEvaluator[
+/**
+ * Abstract class for evaluating a test document by comparing it against each
+ * of the cells in a cell grid, where each cell has an associated
+ * pseudo-document created by amalgamating all of the training documents
+ * in the cell.
+ *
 + * Abstract class for evaluating a test document where a collection of
+ * documents has been divided into "training" and "test" sets, and the
+ * training set used to construct a cell grid in which the training
+ * documents in a particular cell are amalgamated to form a pseudo-document
+ * and evaluation of a test document proceeds by comparing it against each
+ * pseudo-document in turn.
+ *
+ * This is the highest-level evaluation class that includes the concept of a
+ * coordinate that is associated with training and test documents, so that
 + * computation of error distances is possible.
+ *
+ * @tparam TCoord Type of the coordinate assigned to a document
+ * @tparam XTDoc Type of the training and test documents
+ * @tparam TCell Type of a cell in a cell grid
+ * @tparam XTGrid Type of a cell grid
+ * @tparam TEvalRes Type of result of evaluating a document.
+ *
+ * @param strategy Object encapsulating the strategy used for performing
+ *   evaluation.
+ * @param stratname Name of the strategy used for performing evaluation.
+ * @param driver Driver class that encapsulates command-line parameters and
+ *   such.
+ *
+ * Note that we are forced to use the strange names `XTDoc` and `XTGrid`
+ * because of an apparent Scala bug that prevents use of the more obvious
+ * names `TDoc` and `TGrid` due to a naming clash.  Possibly there is a
+ * solution to this problem but if so I can't figure it out.
+ */
+abstract class CorpusDocumentEvaluator[
   TCoord,
-  TDoc <: DistDocument[TCoord],
-  TCell <: GeoCell[TCoord, TDoc],
-  TGrid <: CellGrid[TCoord, TDoc, TCell],
-  TEvalDoc,
-  TEvalRes
+  XTDoc <: DistDocument[TCoord],
+  TCell <: GeoCell[TCoord, XTDoc],
+  // SCALABUG: No way to access something called 'TGrid' at this scope in the
+  // line below where it says 'type TGrid = XTGrid'
+  XTGrid <: CellGrid[TCoord, XTDoc, TCell],
+  TEvalRes <: DocumentEvaluationResult[TCoord, XTDoc, TCell, XTGrid]
 ](
-  val strategy: GridLocateDocumentStrategy[TCell, TGrid],
+  strategy: GridLocateDocumentStrategy[TCell, XTGrid],
   stratname: String,
-  driver: GridLocateDriver // GridLocateDocumentTypeDriver
-) extends TestFileEvaluator[TEvalDoc, TEvalRes](stratname, driver) {
-  type TGroupedEvalStats <: GroupedDocumentEvalStats[TCoord,TDoc,TCell]
+  driver: GridLocateDriver { type TGrid = XTGrid; type TDoc = XTDoc } // GridLocateDocumentTypeDriver
+) extends TestDocumentEvaluator[XTDoc, TEvalRes](stratname, driver) {
   def create_grouped_eval_stats(driver: GridLocateDriver, // GridLocateDocumentTypeDriver
-    cell_grid: TGrid, results_by_range: Boolean):
-    TGroupedEvalStats
+    cell_grid: XTGrid, results_by_range: Boolean):
+    GroupedDocumentEvalStats[TCoord, XTDoc, TCell, XTGrid, TEvalRes]
+
   val evalstats = create_grouped_eval_stats(driver,
     strategy.cell_grid, results_by_range = driver.params.results_by_range)
 
   def output_results(isfinal: Boolean = false) {
     evalstats.output_results(all_results = isfinal)
-  }
-}
-
-/**
- * Class to do document grid-location on documents from the document data, in
- * the dev or test set.
- */
-abstract class CorpusDocumentEvaluator[
-  TCoord,
-  XTDoc <: DistDocument[TCoord],
-  TCell <: GeoCell[TCoord, XTDoc],
-  // SCALABUG: No way access something called 'TGrid' at this scope in the
-  // line below where it says 'type TGrid = XTGrid'
-  XTGrid <: CellGrid[TCoord, XTDoc, TCell],
-  TEvalRes <: DocumentEvaluationResult[_,_,_]
-](
-  strategy: GridLocateDocumentStrategy[TCell, XTGrid],
-  stratname: String,
-  driver: GridLocateDriver { type TGrid = XTGrid; type TDoc = XTDoc } // GridLocateDocumentTypeDriver
-) extends DocumentEvaluator[
-  TCoord, XTDoc, TCell, XTGrid, XTDoc, TEvalRes
-](strategy, stratname, driver) {
-  override type TGroupedEvalStats <:
-    GroupedDocumentEvalStats[TCoord,XTDoc,TCell] { type TDocEvalRes = TEvalRes }
+ }
 
   /**
    * A file processor that reads corpora containing document metadata and
     } else false
   }
 
-  def create_evaluation_result(document: XTDoc, pred_cell: TCell,
-    true_rank: Int): TEvalRes
+  /**
+   * Compare the document to the pseudo-documents associated with each cell,
+   * using the strategy for this evaluator.  Return a tuple
+   * (pred_cells, true_rank), where:
+   *
+   *  pred_cells = List of predicted cells, from best to worst; each list
+   *     entry is actually a tuple of (cell, score) where lower scores
+   *     are better
+   *  true_rank = Rank of true cell among predicted cells
+   *
+   * @param document Document to evaluate.
+   * @param true_cell Cell in the cell grid which contains the document.
+   */
+  def return_ranked_cells(document: XTDoc, true_cell: TCell) = {
+    if (driver.params.oracle_results)
+      (Array((true_cell, 0.0)), 1)
+    else {
+      def get_computed_results() = {
+        val cells = strategy.return_ranked_cells(document.dist).toArray
+        var rank = 1
+        var broken = false
+        breakable {
+          for ((cell, value) <- cells) {
+            if (cell eq true_cell) {
+              broken = true
+              break
+            }
+            rank += 1
+          }
+        }
+        if (!broken)
+          rank = 1000000000
+        (cells, rank)
+      }
 
-  def print_individual_result(doctag: String, document: XTDoc,
-    result: TEvalRes, pred_cells: Array[(TCell, Double)])
+      get_computed_results()
+    }
+  }
 
+  /**
 +   * Actual implementation of code to evaluate a document.  Return an
 +   * object describing the results of the evaluation, and optionally
 +   * print out information on these results.
+   *
+   * @param document Document to evaluate.
+   * @param doctag A short string identifying the document (e.g. '#25'),
+   *   to be printed out at the beginning of diagnostic lines describing
+   *   the document and its evaluation results.
+   * @param true_cell Cell in the cell grid which contains the document.
+   * @param want_indiv_results Whether we should print out individual
+   *   evaluation results for the document.
+   */
+  def imp_evaluate_document(document: XTDoc, doctag: String,
+      true_cell: TCell, want_indiv_results: Boolean): TEvalRes
+
+  /**
+   * Evaluate a document, record statistics about it, etc.  Calls
+   * `imp_evaluate_document` to do the document evaluation and optionally
+   * print out information on the results, and records the results in
 +   * `evalstats`.
+   *
+   * Return an object describing the results of the evaluation.
+   *
+   * @param document Document to evaluate.
+   * @param doctag A short string identifying the document (e.g. '#25'),
+   *   to be printed out at the beginning of diagnostic lines describing
+   *   the document and its evaluation results.
+   */
   def evaluate_document(document: XTDoc, doctag: String): TEvalRes = {
-    if (would_skip_document(document, doctag)) {
-      evalstats.increment_counter("documents.skipped")
-      // SCALABUG: Doesn't automatically recognize TEvalRes as a reference
-      // type despite being a subclass of DocumentEvaluationResult
-      return null.asInstanceOf[TEvalRes]
-    }
+    assert(!would_skip_document(document, doctag))
     assert(document.dist.finished)
     val true_cell =
       strategy.cell_grid.find_best_cell_for_coord(document.coord, true)
       errprint("Evaluating document %s with %s word-dist documents in true cell",
         document, naitr)
     }
+    val want_indiv_results =
+      !driver.params.oracle_results && !driver.params.no_individual_results
+    val result = imp_evaluate_document(document, doctag, true_cell,
+      want_indiv_results)
+    evalstats.record_result(result)
+    if (result.num_docs_in_true_cell == 0) {
+      evalstats.increment_counter("documents.no_training_documents_in_cell")
+    }
+    result
+  }
+}
 
-    //val num_nearest_neighbors = 10
+/**
+ * An implementation of `CorpusDocumentEvaluator` that compares the test
+ * document against each pseudo-document in the cell grid, ranks them by
+ * score and computes the document's location by the central point of the
+ * top-ranked cell.
+ *
+ * @tparam TCoord Type of the coordinate assigned to a document
+ * @tparam XTDoc Type of the training and test documents
+ * @tparam TCell Type of a cell in a cell grid
+ * @tparam XTGrid Type of a cell grid
+ * @tparam TEvalRes Type of result of evaluating a document.
+ *
+ * @param strategy Object encapsulating the strategy used for performing
+ *   evaluation.
+ * @param stratname Name of the strategy used for performing evaluation.
+ * @param driver Driver class that encapsulates command-line parameters and
+ *   such.
+ */
+abstract class RankedCorpusDocumentEvaluator[
+  TCoord,
+  XTDoc <: DistDocument[TCoord],
+  TCell <: GeoCell[TCoord, XTDoc],
+  XTGrid <: CellGrid[TCoord, XTDoc, TCell],
+  TEvalRes <: DocumentEvaluationResult[TCoord, XTDoc, TCell, XTGrid]
+](
+  strategy: GridLocateDocumentStrategy[TCell, XTGrid],
+  stratname: String,
+  driver: GridLocateDriver { type TGrid = XTGrid; type TDoc = XTDoc } // GridLocateDocumentTypeDriver
+) extends CorpusDocumentEvaluator[
+  TCoord, XTDoc, TCell, XTGrid, TEvalRes
+](strategy, stratname, driver) {
+  /**
+   * Create an evaluation-result object describing the top-ranked
+   * predicted cell and the rank of the document's true cell among
+   * all predicted cells.
+   */
+  def create_cell_evaluation_result(document: XTDoc, pred_cell: TCell,
+    true_rank: Int): TEvalRes
 
-    /* That is:
+  /**
+   * Print out the evaluation result, possibly along with some of the
+   * top-ranked cells.
+   */
+  def print_individual_result(doctag: String, document: XTDoc,
+    result: TEvalRes, pred_cells: Array[(TCell, Double)])
 
-       pred_cells = List of predicted cells, from best to worst; each list
-          entry is actually a tuple of (cell, score) where lower scores
-          are better
-       true_rank = Rank of true cell among predicted cells
-     */
-    val (pred_cells, true_rank) =
-      if (driver.params.oracle_results)
-        (Array((true_cell, 0.0)), 1)
-      else {
-        def get_computed_results() = {
-          val cells = strategy.return_ranked_cells(document.dist).toArray
-          var rank = 1
-          var broken = false
-          breakable {
-            for ((cell, value) <- cells) {
-              if (cell eq true_cell) {
-                broken = true
-                break
-              }
-              rank += 1
-            }
-          }
-          if (!broken)
-            rank = 1000000000
-          (cells, rank)
-        }
-
-        get_computed_results()
-      }
-
-    val result = create_evaluation_result(document, pred_cells(0)._1, true_rank)
+  def imp_evaluate_document(document: XTDoc, doctag: String,
+      true_cell: TCell, want_indiv_results: Boolean): TEvalRes = {
+    val (pred_cells, true_rank) = return_ranked_cells(document, true_cell)
+    val result =
+      create_cell_evaluation_result(document, pred_cells(0)._1, true_rank)
 
     if (debug("all-scores")) {
       for (((cell, value), index) <- pred_cells.zipWithIndex) {
           cell.describe_indices(), value)
       }
     }
-    val want_indiv_results =
-      !driver.params.oracle_results && !driver.params.no_individual_results
-    evalstats.record_result(result)
-    if (result.num_docs_in_true_cell == 0) {
-      evalstats.increment_counter("documents.no_training_documents_in_cell")
-    }
     if (want_indiv_results) {
       //val cells_for_average = pred_cells.zip(pred_cells.map(_._1.center))
       //for((cell, score) <- pred_cells) {
   }
 }
 
+/**
+ * An implementation of `CorpusDocumentEvaluator` that compares the test
+ * document against each pseudo-document in the cell grid, selects the
+ * top N ranked pseudo-documents for some N, and uses the mean-shift
+ * algorithm to determine a single point that is hopefully in the middle
+ * of the strongest cluster of points among the central points of the
+ * pseudo-documents.
+ *
+ * @tparam TCoord Type of the coordinate assigned to a document
+ * @tparam XTDoc Type of the training and test documents
+ * @tparam TCell Type of a cell in a cell grid
+ * @tparam XTGrid Type of a cell grid
+ * @tparam TEvalRes Type of result of evaluating a document.
+ *
+ * @param strategy Object encapsulating the strategy used for performing
+ *   evaluation.
+ * @param stratname Name of the strategy used for performing evaluation.
+ * @param driver Driver class that encapsulates command-line parameters and
+ *   such.
+ */
+abstract class MeanShiftCorpusDocumentEvaluator[
+  TCoord,
+  XTDoc <: DistDocument[TCoord],
+  TCell <: GeoCell[TCoord, XTDoc],
+  XTGrid <: CellGrid[TCoord, XTDoc, TCell],
+  TEvalRes <: DocumentEvaluationResult[TCoord, XTDoc, TCell, XTGrid]
+](
+  strategy: GridLocateDocumentStrategy[TCell, XTGrid],
+  stratname: String,
+  driver: GridLocateDriver { type TGrid = XTGrid; type TDoc = XTDoc }, // GridLocateDocumentTypeDriver
+  k_best: Int,
+  mean_shift_window: Double,
+  mean_shift_max_stddev: Double,
+  mean_shift_max_iterations: Int
+) extends CorpusDocumentEvaluator[
+  TCoord, XTDoc, TCell, XTGrid, TEvalRes
+](strategy, stratname, driver) {
+  /**
+   * Create an evaluation-result object describing the predicted coordinate.
+   */
+  def create_coord_evaluation_result(document: XTDoc, cell_grid: XTGrid,
+    pred_coord: TCoord): TEvalRes
+
+  /**
+   * Print out the evaluation result.
+   */
+  def print_individual_result(doctag: String, document: XTDoc,
+    result: TEvalRes)
+
+  def imp_evaluate_document(document: XTDoc, doctag: String,
+      true_cell: TCell, want_indiv_results: Boolean): TEvalRes = {
+    //val num_nearest_neighbors = 10
+    // FIXME, implement the appropriate mean-shift algorithm here.
+    // Note that 'mean_shift_window' is the value of 'h' in the mean-shift
+    // algorithm; similarly for the other parameters.
+    val pred_coord: TCoord = null.asInstanceOf[TCoord] // FIXME, implement me
+    val result = create_coord_evaluation_result(document, strategy.cell_grid,
+      pred_coord)
+
+    if (want_indiv_results)
+      print_individual_result(doctag, document, result)
+
+    return result
+  }
+}
+
 trait DocumentIteratingEvaluator[TEvalDoc, TEvalRes] extends
-  TestFileEvaluator[TEvalDoc, TEvalRes] {
+  TestDocumentEvaluator[TEvalDoc, TEvalRes] {
   /**
    * Return an Iterable listing the documents retrievable from the given
    * filename.
     fileproc.process_files(filehand, files)
   }
 }
-

src/main/scala/opennlp/textgrounder/gridlocate/GridLocate.scala

   }
 
   protected def process_strategies[T](strategies: Seq[(String, T)])(
-      geneval: (String, T) => TestFileEvaluator[_,_]) = {
+      geneval: (String, T) => TestDocumentEvaluator[_,_]) = {
     for ((stratname, strategy) <- strategies) yield {
       val evalobj = geneval(stratname, strategy)
       // For --eval-format=internal, there is no eval file.  To make the

src/main/scala/opennlp/textgrounder/util/mathutil.scala

 
   abstract class MeanShift[Coord : Manifest](
       h:Double = 1.0,
-      min_variance:Double = 1e-10,
+      max_stddev:Double = 1e-10,
       max_iterations:Int = 100
     ) {
     def squared_distance(x:Coord, y:Coord): Double
     }
 
     def mean_shift(list: Seq[Coord]):Array[Coord] = {
-      var variance = min_variance + 1
+      var next_stddev = max_stddev + 1
       var numiters = 0
       val points = list.toArray
       val shifted = list.toArray
-      while (variance >= min_variance && numiters <= max_iterations) {
+      while (next_stddev >= max_stddev && numiters <= max_iterations) {
         for (j <- 0 until points.length) {
           val y = shifted(j)
           val weights =
           shifted(j) = weighted_sum(normalized_weights, points)
         }
         numiters += 1
-        variance = vec_variance(shifted)
+        next_stddev = sqrt(vec_variance(shifted))
       }
       shifted
     }