Commits

Ben Wing committed 1ccfa52 Merge

Automatic merge

  • Parent commits 116292f, aff6d40

Files changed (16)

src/main/resources/data/deu/stopwords.txt

+aber
+alle
+allem
+allen
+aller
+alles
+als
+also
+am
+an
+ander
+andere
+anderem
+anderen
+anderer
+anderes
+anderm
+andern
+anderr
+anders
+auch
+auf
+aus
+bei
+bin
+bis
+bist
+da
+damit
+dann
+der
+den
+des
+dem
+die
+das
+daß
+derselbe
+derselben
+denselben
+desselben
+demselben
+dieselbe
+dieselben
+dasselbe
+dazu
+dein
+deine
+deinem
+deinen
+deiner
+deines
+denn
+derer
+dessen
+dich
+dir
+du
+dies
+diese
+diesem
+diesen
+dieser
+dieses
+doch
+dort
+durch
+ein
+eine
+einem
+einen
+einer
+eines
+einig
+einige
+einigem
+einigen
+einiger
+einiges
+einmal
+er
+ihn
+ihm
+es
+etwas
+euer
+eure
+eurem
+euren
+eurer
+eures
+für
+gegen
+gewesen
+hab
+habe
+haben
+hat
+hatte
+hatten
+hier
+hin
+hinter
+ich
+mich
+mir
+ihr
+ihre
+ihrem
+ihren
+ihrer
+ihres
+euch
+im
+in
+indem
+ins
+ist
+jede
+jedem
+jeden
+jeder
+jedes
+jene
+jenem
+jenen
+jener
+jenes
+jetzt
+kann
+kein
+keine
+keinem
+keinen
+keiner
+keines
+können
+könnte
+machen
+man
+manche
+manchem
+manchen
+mancher
+manches
+mein
+meine
+meinem
+meinen
+meiner
+meines
+mit
+muss
+musste
+nach
+nicht
+nichts
+noch
+nun
+nur
+ob
+oder
+ohne
+sehr
+sein
+seine
+seinem
+seinen
+seiner
+seines
+selbst
+sich
+sie
+ihnen
+sind
+so
+solche
+solchem
+solchen
+solcher
+solches
+soll
+sollte
+sondern
+sonst
+über
+um
+und
+uns
+unse
+unsem
+unsen
+unser
+unses
+unter
+viel
+vom
+von
+vor
+während
+war
+waren
+warst
+was
+weg
+weil
+weiter
+welche
+welchem
+welchen
+welcher
+welches
+wenn
+werde
+werden
+wie
+wieder
+will
+wir
+wird
+wirst
+wo
+wollen
+wollte
+würde
+würden
+zu
+zum
+zur
+zwar
+zwischen

src/main/resources/data/por/stopwords.txt

+de
+a
+o
+que
+e
+do
+da
+em
+um
+para
+com
+não
+uma
+os
+no
+se
+na
+por
+mais
+as
+dos
+como
+mas
+ao
+ele
+das
+seu
+sua
+ou
+quando
+muito
+nos
+já
+eu
+também
+só
+pelo
+pela
+até
+isso
+ela
+entre
+depois
+sem
+mesmo
+aos
+seus
+quem
+nas
+me
+esse
+eles
+você
+essa
+num
+nem
+suas
+meu
+às
+minha
+numa
+pelos
+elas
+qual
+nós
+lhe
+deles
+essas
+esses
+pelas
+este
+dele
+tu
+te
+vocês
+vos
+lhes
+meus
+minhas
+teu
+tua
+teus
+tuas
+nosso
+nossa
+nossos
+nossas
+dela
+delas
+esta
+estes
+estas
+aquele
+aquela
+aqueles
+aquelas
+isto
+aquilo
+estou
+está
+estamos
+estão
+estive
+esteve
+estivemos
+estiveram
+estava
+estávamos
+estavam
+estivera
+estivéramos
+esteja
+estejamos
+estejam
+estivesse
+estivéssemos
+estivessem
+estiver
+estivermos
+estiverem
+hei
+há
+havemos
+hão
+houve
+houvemos
+houveram
+houvera
+houvéramos
+haja
+hajamos
+hajam
+houvesse
+houvéssemos
+houvessem
+houver
+houvermos
+houverem
+houverei
+houverá
+houveremos
+houverão
+houveria
+houveríamos
+houveriam
+sou
+somos
+são
+era
+éramos
+eram
+fui
+foi
+fomos
+foram
+fora
+fôramos
+seja
+sejamos
+sejam
+fosse
+fôssemos
+fossem
+for
+formos
+forem
+serei
+será
+seremos
+serão
+seria
+seríamos
+seriam
+tenho
+tem
+temos
+tém
+tinha
+tínhamos
+tinham
+tive
+teve
+tivemos
+tiveram
+tivera
+tivéramos
+tenha
+tenhamos
+tenham
+tivesse
+tivéssemos
+tivessem
+tiver
+tivermos
+tiverem
+terei
+terá
+teremos
+terão
+teria
+teríamos
+teriam

src/main/scala/opennlp/textgrounder/geolocate/DocumentPinKMLGenerator.scala

+///////////////////////////////////////////////////////////////////////////////
+//  Copyright (C) 2011 The University of Texas at Austin
+//
+//  Licensed under the Apache License, Version 2.0 (the "License");
+//  you may not use this file except in compliance with the License.
+//  You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+//  Unless required by applicable law or agreed to in writing, software
+//  distributed under the License is distributed on an "AS IS" BASIS,
+//  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//  See the License for the specific language governing permissions and
+//  limitations under the License.
+///////////////////////////////////////////////////////////////////////////////
+
+////////
+//////// DocumentPinKMLGenerator.scala
+////////
+//////// Copyright (c) 2012.
+////////
+
+package opennlp.textgrounder.geolocate
+
+import java.io._
+import javax.xml.datatype._
+import javax.xml.stream._
+import opennlp.textgrounder.topo._
+import opennlp.textgrounder.util.KMLUtil
+import opennlp.textgrounder.util.LogUtil
+import scala.collection.JavaConversions._
+import org.clapper.argot._
+
+object DocumentPinKMLGenerator {
+
+  val factory = XMLOutputFactory.newInstance
+  val rand = new scala.util.Random
+
+  import ArgotConverters._
+
+  val parser = new ArgotParser("textgrounder run opennlp.textgrounder.geolocate.DocumentPinKMLGenerator", preUsage = Some("TextGrounder"))
+  val inFile = parser.option[String](List("i", "input"), "input", "input file")
+  val kmlOutFile = parser.option[String](List("k", "kml"), "kml", "kml output file")
+  val tokenIndexOffset = parser.option[Int](List("o", "offset"), "offset", "token index offset")
+
+  def main(args: Array[String]) {
+    try {
+      parser.parse(args)
+    }
+    catch {
+      case e: ArgotUsageException => println(e.message); sys.exit(0)
+    }
+
+    if(inFile.value == None) {
+      println("You must specify an input file via -i.")
+      sys.exit(0)
+    }
+    if(kmlOutFile.value == None) {
+      println("You must specify a KML output file via -k.")
+      sys.exit(0)
+    }
+    val offset = if(tokenIndexOffset.value != None) tokenIndexOffset.value.get else 0
+
+    val outFile = new File(kmlOutFile.value.get)
+    val stream = new BufferedOutputStream(new FileOutputStream(outFile))
+    val out = factory.createXMLStreamWriter(stream, "UTF-8")
+
+    KMLUtil.writeHeader(out, inFile.value.get)
+
+    for(line <- scala.io.Source.fromFile(inFile.value.get).getLines) {
+      val tokens = line.split("\t")
+      if(tokens.length >= 3+offset) {
+        val docName = tokens(1+offset)
+        val coordTextPair = tokens(2+offset).split(",")
+        val coord = Coordinate.fromDegrees(coordTextPair(0).toDouble, coordTextPair(1).toDouble)
+        KMLUtil.writePinPlacemark(out, docName, coord)
+      }
+    }
+
+    KMLUtil.writeFooter(out)
+
+    out.close
+  }
+}
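
The tool above expects tab-separated input lines: with the default offset of 0, field 1 is taken as the document name and field 2 as a "lat,long" pair, so a line needs at least three fields. A minimal sketch of how one such line is interpreted (the values and file names below are invented for illustration):

    object InputLineSketch {
      def main(args: Array[String]) {
        val offset = 0
        // Hypothetical input line: tag, document name, "lat,long" (tab-separated)
        val line = "doc42\tAustin_TX\t30.2672,-97.7431"
        val tokens = line.split("\t")
        val docName = tokens(1 + offset)                    // "Austin_TX"
        val Array(lat, long) = tokens(2 + offset).split(",").map(_.toDouble)
        println((docName, lat, long))
        // Coordinate.fromDegrees(lat, long) is then written out as a KML pin
        // placemark via KMLUtil.writePinPlacemark, as in the loop above.
      }
    }

A corresponding invocation would look something like 'textgrounder run opennlp.textgrounder.geolocate.DocumentPinKMLGenerator -i predictions.txt -k pins.kml', with the file names being placeholders.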

src/main/scala/opennlp/textgrounder/geolocate/GenerateKML.scala

     filter_words: Seq[String],
     ignore_case: Boolean,
     stopwords: Set[String],
+    whitelist: Set[String],
     minimum_word_count: Int = 1
   ) extends DefaultUnigramWordDistConstructor(
-    factory, ignore_case, stopwords, minimum_word_count
+    factory, ignore_case, stopwords, whitelist, minimum_word_count
   ) {
 
   override def finish_before_global(dist: WordDist) {
     if (num_ngrams > 1)
       param_error("Only unigram word distribution words with GenerateKML")
     val the_stopwords = get_stopwords()
+    val the_whitelist = get_whitelist()
     /* if (num_ngrams == 2)
       new FilterBigramWordDistConstructor(factory, ...)
     else */
         params.split_kml_words,
         ignore_case = !params.preserve_case_words,
         stopwords = the_stopwords,
+        whitelist = the_whitelist,
         minimum_word_count = params.minimum_word_count)
   }
 

src/main/scala/opennlp/textgrounder/geolocate/Geolocate.scala

 NOTE: Multiple --strategy options can be given, and each strategy will
 be tried, one after the other.""")
 
+  var coord_strategy =
+    ap.option[String]("coord-strategy", "cs",
+      default = "top-ranked",
+      choices = Seq("top-ranked", "mean-shift"),
+      help = """Strategy/strategies to use to choose the best coordinate for
+a document.
+
+'top-ranked' means to choose the single best-ranked cell according to the
+scoring strategy specified using '--strategy', and use its central point.
+
+'mean-shift' means to take the K best cells (according to '--k-best'),
+and then compute a single point using the mean-shift algorithm.  This
+algorithm works by steadily shifting each point towards the others by
+computing an average of the points surrounding a given point, weighted
+by a function that drops off rapidly as the distance from the point
+increases (specifically, the weighting is the same as for a Gaussian density,
+with a parameter H, specified using '--mean-shift-window', that corresponds to
+the standard deviation in the Gaussian distribution function).  The idea is
+that the points will eventually converge on the largest cluster within the
+original points.  The algorithm repeatedly moves the points closer to each
+other until either the total standard deviation of the points (i.e.
+approximately the average distance of the points from their mean) is less than
+the value specified by '--mean-shift-max-stddev', or the number of iterations
+exceeds '--mean-shift-max-iterations'.
+
+Default '%default'.""")
+
+  var k_best =
+    ap.option[Int]("k-best", "kb",
+      default = 10,
+      help = """Value of K for use in the mean-shift algorithm
+(see '--coord-strategy').  For this value of K, we choose the K best cells
+and then apply the mean-shift algorithm to the central points of those cells.
+
+Default '%default'.""")
+
+  var mean_shift_window =
+    ap.option[Double]("mean-shift-window", "msw",
+      default = 1.0,
+      help = """Window to use in the mean-shift algorithm
+(see '--coord-strategy').
+
+Default '%default'.""")
+
+  var mean_shift_max_stddev =
+    ap.option[Double]("mean-shift-max-stddev", "msms",
+      default = 1e-10,
+      help = """Maximum allowed standard deviation (i.e. approximately the
+average distance of the points from their mean) among the points selected by
+the mean-shift algorithm (see '--coord-strategy').
+
+Default '%default'.""")
+
+  var mean_shift_max_iterations =
+    ap.option[Int]("mean-shift-max-iterations", "msmi",
+      default = 100,
+      help = """Maximum number of iterations in the mean-shift algorithm
+(see '--coord-strategy').
+
+Default '%default'.""")
+
   var baseline_strategy =
     ap.multiOption[String]("baseline-strategy", "bs",
       default = Seq("internal-link"),
   override type TParam <: GeolocateDocumentParameters
   type TRunRes =
     Seq[(String, GridLocateDocumentStrategy[SphereCell, SphereCellGrid],
-         TestFileEvaluator[_,_])]
+         TestDocumentEvaluator[_,_])]
 
   var strategies: Seq[(String, GridLocateDocumentStrategy[SphereCell, SphereCellGrid])] = _
 
    *
    * The current return type is as follows:
    *
-   * Seq[(java.lang.String, GridLocateDocumentStrategy[SphereCell, SphereCellGrid], scala.collection.mutable.Map[evalobj.Document,opennlp.textgrounder.geolocate.EvaluationResult])] where val evalobj: opennlp.textgrounder.geolocate.TestFileEvaluator
+   * Seq[(java.lang.String, GridLocateDocumentStrategy[SphereCell, SphereCellGrid], scala.collection.mutable.Map[evalobj.Document,opennlp.textgrounder.geolocate.EvaluationResult])] where val evalobj: opennlp.textgrounder.geolocate.TestDocumentEvaluator
    *
    * This means you get a sequence of tuples of
    * (strategyname, strategy, results)
    * strategyname = name of strategy as given on command line
    * strategy = strategy object
    * results = map listing results for each document (an abstract type
-   * defined in TestFileEvaluator; the result type EvaluationResult
+   * defined in TestDocumentEvaluator; the result type EvaluationResult
    * is practically an abstract type, too -- the most useful dynamic
    * type in practice is DocumentEvaluationResult)
    */
 
+  def create_document_evaluator(
+      strategy: GridLocateDocumentStrategy[SphereCell, SphereCellGrid],
+      stratname: String) = {
+    // Generate reader object
+    if (params.eval_format == "pcl-travel")
+      new PCLTravelGeolocateDocumentEvaluator(strategy, stratname, this)
+    else if (params.coord_strategy == "top-ranked")
+      new RankedSphereCellGridEvaluator(strategy, stratname, this)
+    else
+      new MeanShiftSphereCellGridEvaluator(strategy, stratname, this,
+        params.k_best, params.mean_shift_window,
+        params.mean_shift_max_stddev,
+        params.mean_shift_max_iterations)
+  }
+
   def run_after_setup() = {
-    process_strategies(strategies)((stratname, strategy) => {
-      // Generate reader object
-      if (params.eval_format == "pcl-travel")
-        new PCLTravelGeolocateDocumentEvaluator(strategy, stratname, this)
-      else
-        new CorpusGeolocateDocumentEvaluator(strategy, stratname, this)
-    })
+    process_strategies(strategies)((stratname, strategy) =>
+      create_document_evaluator(strategy, stratname))
   }
 }
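
The '--coord-strategy' help text above describes the mean-shift procedure in prose. Below is a minimal, self-contained sketch of that loop for plain 2-D points, matching the description (Gaussian weighting with window H, stopping once the points' standard deviation drops below a threshold or an iteration cap is reached). It is only an illustration; the actual code path for SphereCoords goes through MeanShiftSphereCellGridEvaluator and SphereMeanShift.

    object MeanShiftSketch {
      type Point = (Double, Double)

      // Gaussian weight with window (bandwidth) h, as in '--mean-shift-window'.
      def weight(p: Point, q: Point, h: Double) = {
        val dsq = math.pow(p._1 - q._1, 2) + math.pow(p._2 - q._2, 2)
        math.exp(-dsq / (2 * h * h))
      }

      def stddev(pts: Seq[Point]) = {
        val mx = pts.map(_._1).sum / pts.size
        val my = pts.map(_._2).sum / pts.size
        math.sqrt(pts.map(p =>
          math.pow(p._1 - mx, 2) + math.pow(p._2 - my, 2)).sum / pts.size)
      }

      // Repeatedly shift each point to a weighted average of all points,
      // mirroring '--mean-shift-max-stddev' and '--mean-shift-max-iterations'.
      def meanShift(points: Seq[Point], h: Double, maxStddev: Double,
          maxIterations: Int): Seq[Point] = {
        var pts = points
        var iter = 0
        while (stddev(pts) >= maxStddev && iter < maxIterations) {
          pts = pts.map { p =>
            val ws = pts.map(q => weight(p, q, h))
            val wsum = ws.sum
            val x = pts.zip(ws).map { case (q, w) => w * q._1 }.sum / wsum
            val y = pts.zip(ws).map { case (q, w) => w * q._2 }.sum / wsum
            (x, y)
          }
          iter += 1
        }
        pts
      }
    }

With '--coord-strategy mean-shift', this kind of loop is applied to the central points of the '--k-best' top-ranked cells, and the resulting cluster location is used as the predicted coordinate.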
 

src/main/scala/opennlp/textgrounder/geolocate/Hadoop.scala

 import org.apache.hadoop.fs.Path
 
 import opennlp.textgrounder.util.argparser._
+import opennlp.textgrounder.util.distances._
 import opennlp.textgrounder.util.experiment.ExperimentMeteredTask
 import opennlp.textgrounder.util.hadoop._
 import opennlp.textgrounder.util.ioutil.FileHandler
 import opennlp.textgrounder.util.mathutil.{mean, median}
 import opennlp.textgrounder.util.printutil.{errprint, warning}
 
-import opennlp.textgrounder.gridlocate.{TextGrounderInfo,DistDocumentFileProcessor}
+import opennlp.textgrounder.gridlocate.{CellGridEvaluator,TextGrounderInfo,DistDocumentFileProcessor}
 
 /* Basic idea for hooking up Geolocate with Hadoop.  Hadoop works in terms
    of key-value pairs, as follows:
   def create_param_object(ap: ArgParser) = new TParam(ap)
   def create_driver() = new TDriver
 
-  var evaluators: Iterable[CorpusGeolocateDocumentEvaluator] = null
+  var evaluators: Iterable[CellGridEvaluator[SphereCoord,SphereDocument,_,_,_]] = null
   val task = new ExperimentMeteredTask(driver, "document", "evaluating")
 
   class HadoopDocumentFileProcessor(
   var processor: HadoopDocumentFileProcessor = _
   override def init(context: TContext) {
     super.init(context)
-    evaluators =
-      for ((stratname, strategy) <- driver.strategies)
-        yield new CorpusGeolocateDocumentEvaluator(strategy, stratname,
-          driver)
-    if (driver.params.input_corpus.length != 1) {
+    if (driver.params.eval_format != "internal")
       driver.params.parser.error(
-        "FIXME: For Hadoop, currently need exactly one corpus")
-    } else {
-      processor = new HadoopDocumentFileProcessor(context)
-      processor.read_schema_from_corpus(driver.get_file_handler,
-          driver.params.input_corpus(0))
-      context.progress
+        "For Hadoop, '--eval-format' must be 'internal'")
+    else {
+      evaluators =
+        for ((stratname, strategy) <- driver.strategies)
+          yield driver.create_document_evaluator(strategy, stratname).
+            asInstanceOf[CellGridEvaluator[
+              SphereCoord,SphereDocument,_,_,_]]
+      if (driver.params.input_corpus.length != 1) {
+        driver.params.parser.error(
+          "FIXME: For Hadoop, currently need exactly one corpus")
+      } else {
+        processor = new HadoopDocumentFileProcessor(context)
+        processor.read_schema_from_corpus(driver.get_file_handler,
+            driver.params.input_corpus(0))
+        context.progress
+      }
     }
   }
 

src/main/scala/opennlp/textgrounder/geolocate/SphereDocument.scala

 
   def distance_to_coord(coord2: SphereCoord) = spheredist(coord, coord2)
   def degree_distance_to_coord(coord2: SphereCoord) = degree_dist(coord, coord2)
+  def output_distance(dist: Double) = km_and_miles(dist)
 }
 
 /**

src/main/scala/opennlp/textgrounder/geolocate/SphereEvaluation.scala

 
 //////// Statistics for geolocating documents
 
-class SphereDocumentEvalStats(
-  driver_stats: ExperimentDriverStats,
-  prefix: String,
-  max_rank_for_credit: Int = 10
-) extends DocumentEvalStats(
-  driver_stats, prefix, max_rank_for_credit) {
+/**
+ * A general trait for encapsulating SphereDocument-specific behavior.
+ * In this case, this is largely the computation of "degree distances" in
+ * addition to "true distances", and making sure results are output in
+ * miles and km.
+ */
+trait SphereDocumentEvalStats extends DocumentEvalStats {
   // "True dist" means actual distance in km's or whatever.
   // "Degree dist" is the distance in degrees.
   val degree_dists = mutable.Buffer[Double]()
   val oracle_degree_dists = mutable.Buffer[Double]()
 
-  def record_result(rank: Int, pred_true_dist: Double,
-      pred_degree_dist: Double) {
-    super.record_result(rank, pred_true_dist)
+  def record_predicted_degree_distance(pred_degree_dist: Double) {
     degree_dists += pred_degree_dist
   }
 
-  def record_oracle_result(oracle_true_dist: Double,
-      oracle_degree_dist: Double) {
-    super.record_oracle_result(oracle_true_dist)
+  def record_oracle_degree_distance(oracle_degree_dist: Double) {
     oracle_degree_dists += oracle_degree_dist
   }
 
-  protected def km_and_miles(kmdist: Double) = {
-    "%.2f km (%.2f miles)" format (kmdist, kmdist / km_per_mile)
-  }
-
   protected def output_result_with_units(kmdist: Double) = km_and_miles(kmdist)
 
   override def output_incorrect_results() {
 }
 
 /**
- * Class for statistics for geolocating documents, with separate
- * sets of statistics for different intervals of error distances and
- * number of documents in true cell.
+ * SphereDocument version of `CoordDocumentEvalStats`.
  */
+class CoordSphereDocumentEvalStats(
+  driver_stats: ExperimentDriverStats,
+  prefix: String
+) extends CoordDocumentEvalStats(driver_stats, prefix)
+  with SphereDocumentEvalStats {
+}
 
-class SphereGroupedDocumentEvalStats(
+/**
+ * SphereDocument version of `RankedDocumentEvalStats`.
+ */
+class RankedSphereDocumentEvalStats(
+  driver_stats: ExperimentDriverStats,
+  prefix: String,
+  max_rank_for_credit: Int = 10
+) extends RankedDocumentEvalStats(driver_stats, prefix, max_rank_for_credit)
+  with SphereDocumentEvalStats {
+}
+
+/**
+ * SphereDocument version of `GroupedDocumentEvalStats`.  This keeps separate
+ * sets of statistics for different subgroups of the test documents, i.e.
+ * those within particular ranges of one or more quantities of interest.
+ */
+class GroupedSphereDocumentEvalStats(
   driver_stats: ExperimentDriverStats,
   cell_grid: SphereCellGrid,
-  results_by_range: Boolean
+  results_by_range: Boolean,
+  is_ranked: Boolean
 ) extends GroupedDocumentEvalStats[
-  SphereCoord, SphereDocument, SphereCell](
-  driver_stats, cell_grid, results_by_range) {
-  type TBasicEvalStats = SphereDocumentEvalStats
-  type TDocEvalRes = SphereDocumentEvaluationResult
-  override def create_stats(prefix: String) =
-    new SphereDocumentEvalStats(driver_stats, prefix)
+  SphereCoord, SphereDocument, SphereCell, SphereCellGrid,
+  SphereDocumentEvaluationResult
+](driver_stats, cell_grid, results_by_range) {
+  override def create_stats(prefix: String) = {
+    if (is_ranked)
+      new RankedSphereDocumentEvalStats(driver_stats, prefix)
+    else
+      new CoordSphereDocumentEvalStats(driver_stats, prefix)
+  }
 
   val docs_by_degree_dist_to_true_center =
     docmap("degree_dist_to_true_center")
     new DoubleTableByRange(dist_fractions_for_error_dist,
       create_stats_for_range("degree_dist_to_pred_center", _))
 
-  override def record_one_result(stats: TBasicEvalStats,
-      res: TDocEvalRes) {
-    stats.record_result(res.true_rank, res.pred_truedist, res.pred_degdist)
+  override def record_one_result(stats: DocumentEvalStats,
+      res: SphereDocumentEvaluationResult) {
+    super.record_one_result(stats, res)
+    stats.asInstanceOf[SphereDocumentEvalStats].
+      record_predicted_degree_distance(res.pred_degdist)
   }
 
-  override def record_one_oracle_result(stats: TBasicEvalStats,
-      res: TDocEvalRes) {
-    stats.record_oracle_result(res.true_truedist, res.true_degdist)
+  override def record_one_oracle_result(stats: DocumentEvalStats,
+      res: SphereDocumentEvaluationResult) {
+    super.record_one_oracle_result(stats, res)
+    stats.asInstanceOf[SphereDocumentEvalStats].
+      record_oracle_degree_distance(res.true_degdist)
   }
 
-  override def record_result_by_range(res: TDocEvalRes) {
+  override def record_result_by_range(res: SphereDocumentEvaluationResult) {
     super.record_result_by_range(res)
 
     /* FIXME: This code specific to MultiRegularCellGrid is kind of ugly.
        fracinc * floor(frac_true_truedist / fracinc)
       val rounded_frac_true_degdist =
         fracinc * floor(frac_true_degdist / fracinc)
-      docs_by_true_dist_to_true_center(rounded_frac_true_truedist).
-        record_result(res.true_rank, res.pred_truedist, res.pred_degdist)
-      docs_by_degree_dist_to_true_center(rounded_frac_true_degdist).
-        record_result(res.true_rank, res.pred_truedist, res.pred_degdist)
+      res.record_result(docs_by_true_dist_to_true_center(
+        rounded_frac_true_truedist))
+      res.record_result(docs_by_degree_dist_to_true_center(
+        rounded_frac_true_degdist))
 
       /* For distance to center of predicted cell, which may be large, since
          predicted cell may be nowhere near the true cell.  Again we convert
          cell size */
       val frac_pred_truedist = res.pred_truedist / multigrid.km_per_cell
       val frac_pred_degdist = res.pred_degdist / multigrid.degrees_per_cell
-      docs_by_true_dist_to_pred_center.get_collector(frac_pred_truedist).
-        record_result(res.true_rank, res.pred_truedist, res.pred_degdist)
-      docs_by_degree_dist_to_pred_center.get_collector(frac_pred_degdist).
-        record_result(res.true_rank, res.pred_truedist, res.pred_degdist)
+      res.record_result(docs_by_true_dist_to_pred_center.get_collector(
+        frac_pred_truedist))
+      res.record_result(docs_by_degree_dist_to_pred_center.get_collector(
+        frac_pred_degdist))
      } else if (cell_grid.isInstanceOf[KdTreeCellGrid]) {
        // for kd trees, we do something similar to above, but round to the nearest km...
        val kdgrid = cell_grid.asInstanceOf[KdTreeCellGrid]
-       docs_by_true_dist_to_true_center(round(res.true_truedist)).
-         record_result(res.true_rank, res.pred_truedist, res.pred_degdist)
-       docs_by_degree_dist_to_true_center(round(res.true_degdist)).
-         record_result(res.true_rank, res.pred_truedist, res.pred_degdist)
+       res.record_result(docs_by_true_dist_to_true_center(
+         round(res.true_truedist)))
+       res.record_result(docs_by_degree_dist_to_true_center(
+         round(res.true_degdist)))
     }
   }
 
 //                             Main evaluation code                        //
 /////////////////////////////////////////////////////////////////////////////
 
-abstract class GeolocateDocumentEvaluator[TEvalDoc, TEvalRes](
+/**
+ * A general trait holding SphereDocument-specific code for storing the
+ * result of evaluation on a document.  Here we simply compute the
+ * true and predicted "degree distances" -- i.e. measured in degrees,
+ * rather than in actual distance along a great circle.
+ */
+trait SphereDocumentEvaluationResult extends DocumentEvaluationResult[
+  SphereCoord, SphereDocument, SphereCell, SphereCellGrid
+] {
+  val xdocument: SphereDocument
+  /* The following must be declared as 'lazy' because 'xdocument' above isn't
+     initialized at creation time (which is impossible because traits can't
+     have construction parameters). */
+  /**
+   * Distance in degrees between document's coordinate and central
+   * point of true cell
+   */
+  lazy val true_degdist = xdocument.degree_distance_to_coord(true_center)
+  /**
+   * Distance in degrees between document's coordinate and predicted
+   * coordinate
+   */
+  lazy val pred_degdist = xdocument.degree_distance_to_coord(pred_coord)
+}
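
The 'lazy' comment above reflects a general Scala initialization-order issue: a trait's eager vals are initialized before the body of the implementing class runs, so a trait member computed from an abstract val would see null. A minimal standalone sketch of the problem and the fix, with names invented for illustration:

    trait HasLength {
      val source: String           // abstract; supplied by the implementing class
      // val n = source.length     // would NPE: trait init runs while 'source' is still null
      lazy val n = source.length   // deferred until first access, after 'source' is set
    }

    class Doc(s: String) extends HasLength {
      val source = s               // assigned in the class body, after trait initialization
    }
    // new Doc("abc").n == 3

This is the same pattern as 'val xdocument = document' in the concrete result classes below.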
+
+/**
+ * Result of evaluating a SphereDocument using an algorithm that does
+ * cell-by-cell comparison and computes a ranking of all the cells.
+ * The predicted coordinate is the central point of the top-ranked cell,
+ * and the cell grid is derived from the cell.
+ *
+ * @param document document whose coordinate is predicted
+ * @param pred_cell top-ranked predicted cell in which the document should
+ *        belong
+ * @param true_rank rank of the document's true cell among all of the
+ *        predicted cells
+ */
+class RankedSphereDocumentEvaluationResult(
+  document: SphereDocument,
+  pred_cell: SphereCell,
+  true_rank: Int
+) extends RankedDocumentEvaluationResult[
+  SphereCoord, SphereDocument, SphereCell, SphereCellGrid
+  ](
+  document, pred_cell, true_rank
+) with SphereDocumentEvaluationResult {
+  val xdocument = document
+}
+
+/**
+ * Result of evaluating a SphereDocument using an algorithm that
+ * predicts a coordinate that is not necessarily the central point of
+ * any cell (e.g. using a mean-shift algorithm).
+ *
+ * @param document document whose coordinate is predicted
+ * @param cell_grid cell grid against which error comparison should be done
+ * @param pred_coord predicted coordinate of the document
+ */
+class CoordSphereDocumentEvaluationResult(
+  document: SphereDocument,
+  cell_grid: SphereCellGrid,
+  pred_coord: SphereCoord
+) extends CoordDocumentEvaluationResult[
+  SphereCoord, SphereDocument, SphereCell, SphereCellGrid
+  ](
+  document, cell_grid, pred_coord
+) with SphereDocumentEvaluationResult {
+  val xdocument = document
+}
+
+/**
+ * Specialization of `RankedCellGridEvaluator` for SphereCoords (latitude/
+ * longitude coordinates on the surface of a sphere).  Class for evaluating
+ * (geolocating) a test document using a strategy that ranks the cells in the
+ * cell grid and picks the central point of the top-ranked one.
+ */
+class RankedSphereCellGridEvaluator(
   strategy: GridLocateDocumentStrategy[SphereCell, SphereCellGrid],
   stratname: String,
   driver: GeolocateDocumentTypeDriver
-) extends DocumentEvaluator[SphereCoord, SphereDocument, SphereCell,
-  SphereCellGrid, TEvalDoc, TEvalRes](strategy, stratname, driver) {
-  type TGroupedEvalStats = SphereGroupedDocumentEvalStats
-  def create_grouped_eval_stats(driver: GridLocateDriver,
-    cell_grid: SphereCellGrid, results_by_range: Boolean) =
-    new SphereGroupedDocumentEvalStats(driver,
-      cell_grid.asInstanceOf[SphereCellGrid],
-      results_by_range)
-}
-
-class SphereDocumentEvaluationResult(
-  document: SphereDocument,
-  pred_cell: SphereCell,
-  true_rank: Int
-) extends DocumentEvaluationResult[SphereCoord, SphereDocument, SphereCell](
-  document, pred_cell, true_rank
-) {
-  val true_degdist = document.degree_distance_to_coord(true_center)
-  val pred_degdist = document.degree_distance_to_coord(pred_center)
-}
-
-/**
- * Class to do document geolocating on documents from the document data, in
- * the dev or test set.
- */
-class CorpusGeolocateDocumentEvaluator(
-  strategy: GridLocateDocumentStrategy[SphereCell, SphereCellGrid],
-  stratname: String,
-  driver: GeolocateDocumentTypeDriver
-) extends CorpusDocumentEvaluator[
+) extends RankedCellGridEvaluator[
   SphereCoord, SphereDocument, SphereCell, SphereCellGrid,
   SphereDocumentEvaluationResult
 ](strategy, stratname, driver) {
-  // FIXME, the following 8 lines are copied from GeolocateDocumentEvaluator
-  type TGroupedEvalStats = SphereGroupedDocumentEvalStats
   def create_grouped_eval_stats(driver: GridLocateDriver,
     cell_grid: SphereCellGrid, results_by_range: Boolean) =
-    new SphereGroupedDocumentEvalStats(driver,
-      cell_grid.asInstanceOf[SphereCellGrid],
-      results_by_range)
-  def create_evaluation_result(document: SphereDocument, pred_cell: SphereCell,
-      true_rank: Int) =
-    new SphereDocumentEvaluationResult(document, pred_cell, true_rank)
+    new GroupedSphereDocumentEvalStats(
+      driver, cell_grid, results_by_range, is_ranked = true)
+  def create_cell_evaluation_result(document: SphereDocument,
+      pred_cell: SphereCell, true_rank: Int) =
+    new RankedSphereDocumentEvaluationResult(document, pred_cell, true_rank)
 
-  val num_nearest_neighbors = driver.params.num_nearest_neighbors
-
-  def print_individual_result(doctag: String, document: SphereDocument,
+  override def print_individual_result(doctag: String, document: SphereDocument,
       result: SphereDocumentEvaluationResult,
       pred_cells: Array[(SphereCell, Double)]) {
-    errprint("%s:Document %s:", doctag, document)
-    // errprint("%s:Document distribution: %s", doctag, document.dist)
-    errprint("%s:  %d types, %f tokens",
-      doctag, document.dist.num_word_types, document.dist.num_word_tokens)
-    errprint("%s:  true cell at rank: %s", doctag, result.true_rank)
-    errprint("%s:  true cell: %s", doctag, result.true_cell)
-    for (i <- 0 until 5) {
-      errprint("%s:  Predicted cell (at rank %s, kl-div %s): %s",
-        doctag, i + 1, pred_cells(i)._2, pred_cells(i)._1)
-    }
+    super.print_individual_result(doctag, document, result, pred_cells)
 
-    //for (num_nearest_neighbors <- 2 to 100 by 2) {
-    val kNN = pred_cells.take(num_nearest_neighbors).map(_._1)
-    val kNNranks = pred_cells.take(num_nearest_neighbors).zipWithIndex.map(p => (p._1._1, p._2+1)).toMap
-    val closest_half_with_dists = kNN.map(n => (n, spheredist(n.get_center_coord, document.coord))).sortWith(_._2 < _._2).take(num_nearest_neighbors/2)
-
-    closest_half_with_dists.zipWithIndex.foreach(c => errprint("%s:  #%s close neighbor: %s; error distance: %.2f km",
-      doctag, kNNranks(c._1._1), c._1._1.get_center_coord, c._1._2))
-
-    errprint("%s:  Distance %.2f km to true cell center at %s",
-      doctag, result.true_truedist, result.true_center)
-    errprint("%s:  Distance %.2f km to predicted cell center at %s",
-      doctag, result.pred_truedist, result.pred_center)
-
-    val avg_dist_of_neighbors = mean(closest_half_with_dists.map(_._2))
-    errprint("%s:  Average distance from true cell center to %s closest cells' centers from %s best matches: %.2f km",
-      doctag, (num_nearest_neighbors/2), num_nearest_neighbors, avg_dist_of_neighbors)
-
-    if(avg_dist_of_neighbors < result.pred_truedist)
-      driver.increment_local_counter("instances.num_where_avg_dist_of_neighbors_beats_pred_truedist.%s" format num_nearest_neighbors)
-    //}
-
-  
     assert(doctag(0) == '#')
     if (debug("gridrank") ||
       (debuglist("gridrank") contains doctag.drop(1))) {
   }
 }
 
+/**
+ * Specialization of `MeanShiftCellGridEvaluator` for SphereCoords (latitude/
+ * longitude coordinates on the surface of a sphere).  Class for evaluating
+ * (geolocating) a test document using a mean-shift strategy, i.e. picking the
+ * K-best-ranked cells and using the mean-shift algorithm to derive a single
+ * point that hopefully should be in the center of the largest cluster.
+ */
+class MeanShiftSphereCellGridEvaluator(
+  strategy: GridLocateDocumentStrategy[SphereCell, SphereCellGrid],
+  stratname: String,
+  driver: GeolocateDocumentTypeDriver,
+  k_best: Int,
+  mean_shift_window: Double,
+  mean_shift_max_stddev: Double,
+  mean_shift_max_iterations: Int
+) extends MeanShiftCellGridEvaluator[
+  SphereCoord, SphereDocument, SphereCell, SphereCellGrid,
+  SphereDocumentEvaluationResult
+](strategy, stratname, driver, k_best, mean_shift_window,
+  mean_shift_max_stddev, mean_shift_max_iterations) {
+  def create_grouped_eval_stats(driver: GridLocateDriver,
+    cell_grid: SphereCellGrid, results_by_range: Boolean) =
+    new GroupedSphereDocumentEvalStats(
+      driver, cell_grid, results_by_range, is_ranked = false)
+  def create_coord_evaluation_result(document: SphereDocument,
+      cell_grid: SphereCellGrid, pred_coord: SphereCoord) =
+    new CoordSphereDocumentEvaluationResult(document, cell_grid, pred_coord)
+  def create_mean_shift_obj(h: Double, max_stddev: Double,
+    max_iterations: Int) = new SphereMeanShift(h, max_stddev, max_iterations)
+}
+
 case class TitledDocument(title: String, text: String)
 class TitledDocumentResult { }
 
   strategy: GridLocateDocumentStrategy[SphereCell, SphereCellGrid],
   stratname: String,
   driver: GeolocateDocumentTypeDriver
-) extends GeolocateDocumentEvaluator[
+) extends TestDocumentEvaluator[
   TitledDocument, TitledDocumentResult
-](strategy, stratname, driver) with DocumentIteratingEvaluator[
+](stratname, driver) with DocumentIteratingEvaluator[
   TitledDocument, TitledDocumentResult
 ] {
   def iter_documents(filehand: FileHandler, filename: String) = {
 
     new TitledDocumentResult()
   }
+
+  def output_results(isfinal: Boolean = false) {
+  }
 }
 

src/main/scala/opennlp/textgrounder/geolocate/toponym/Toponym.scala

 import opennlp.textgrounder.util.osutil._
 import opennlp.textgrounder.util.printutil.{errout, errprint, warning}
 
-import opennlp.textgrounder.gridlocate.{CombinedWordDist,EvalStats,TestFileEvaluator,DocumentIteratingEvaluator}
+import opennlp.textgrounder.gridlocate.{CombinedWordDist,EvalStats,TestDocumentEvaluator,DocumentIteratingEvaluator}
 import opennlp.textgrounder.gridlocate.GridLocateDriver.Debug._
 import opennlp.textgrounder.geolocate._
 
   strategy: GeolocateToponymStrategy,
   stratname: String,
   driver: GeolocateToponymDriver
-) extends TestFileEvaluator[
+) extends TestDocumentEvaluator[
   GeogWordDocument, ToponymEvaluationResult
 ](stratname, driver) with DocumentIteratingEvaluator[
   GeogWordDocument, ToponymEvaluationResult
     GeolocateDriver with StandaloneExperimentDriverStats {
   type TParam = GeolocateToponymParameters
   type TRunRes =
-    Seq[(String, GeolocateToponymStrategy, TestFileEvaluator[_,_])]
+    Seq[(String, GeolocateToponymStrategy, TestDocumentEvaluator[_,_])]
 
   override def handle_parameters() {
     super.handle_parameters()

src/main/scala/opennlp/textgrounder/gridlocate/DistDocument.scala

   def struct: scala.xml.Elem
 
   def distance_to_coord(coord2: TCoord): Double
+
+  /**
+   * Output a distance with attached units
+   */
+  def output_distance(dist: Double): String
 }
 
 object DistDocument {
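
SphereDocument (in the hunk above) implements this new method by delegating to km_and_miles; the formatter itself appears in the code removed from SphereEvaluation.scala, and the new util.distances import in Hadoop.scala suggests it now lives there as a shared helper, though that is only a guess from this diff. A minimal sketch of such an implementation for a kilometer-based coordinate type, with the helper inlined purely for illustration:

    object DistanceFormatSketch {
      // Mirrors the formatter removed from SphereEvaluation.scala.
      val km_per_mile = 1.609344
      def km_and_miles(kmdist: Double) =
        "%.2f km (%.2f miles)" format (kmdist, kmdist / km_per_mile)
      // In a DistDocument subclass whose distance_to_coord returns kilometers,
      // output_distance would simply be: def output_distance(dist: Double) = km_and_miles(dist)
    }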

src/main/scala/opennlp/textgrounder/gridlocate/Evaluation.scala

 
 import opennlp.textgrounder.util.collectionutil._
 import opennlp.textgrounder.util.experiment.ExperimentDriverStats
-import opennlp.textgrounder.util.mathutil.{mean, median}
+import opennlp.textgrounder.util.mathutil._
 import opennlp.textgrounder.util.ioutil.{FileHandler, FileProcessor}
 import opennlp.textgrounder.util.MeteredTask
 import opennlp.textgrounder.util.osutil.{curtimehuman, output_resource_usage}
 
 //////// Statistics for locating documents
 
-case class DocumentEvaluationResult[
+/**
+ * General class for the result of evaluating a document.  Specifies a
+ * document, cell grid, and the predicted coordinate for the document.
+ * The reason that a cell grid needs to be given is that we need to
+ * retrieve the cell that the document belongs to in order to get the
+ * "central point" (center or centroid of the cell), and in general we
+ * may be operating with multiple cell grids (e.g. in the combination of
+ * uniform and k-D tree grids). (FIXME: I don't know if this is actually
+ * true.)
+ *
+ * FIXME: Perhaps we should redo the results in terms of pseudo-documents
+ * instead of cells.
+ *
+ * @tparam TCoord type of a coordinate
+ * @tparam TDoc type of a document
+ * @tparam TCell type of a cell
+ * @tparam TGrid type of a cell grid
+ *
+ * @param document document whose coordinate is predicted
+ * @param cell_grid cell grid against which error comparison should be done
+ * @param pred_coord predicted coordinate of the document
+ */
+class DocumentEvaluationResult[
   TCoord,
   TDoc <: DistDocument[TCoord],
-  TCell <: GeoCell[TCoord, TDoc]
+  TCell <: GeoCell[TCoord, TDoc],
+  TGrid <: CellGrid[TCoord, TDoc, TCell]
+](
+  val document: TDoc,
+  val cell_grid: TGrid,
+  val pred_coord: TCoord
+) {
+  /**
+   * True cell in the cell grid in which the document belongs
+   */
+  val true_cell = cell_grid.find_best_cell_for_coord(document.coord, true)
+  /**
+   * Number of documents in the true cell
+   */
+  val num_docs_in_true_cell = true_cell.combined_dist.num_docs_for_word_dist
+  /**
+   * Central point of the true cell
+   */
+  val true_center = true_cell.get_center_coord()
+  /**
+   * "True distance" (rather than e.g. degree distance) between document's
+   * coordinate and central point of true cell
+   */
+  val true_truedist = document.distance_to_coord(true_center)
+  /**
+   * "True distance" (rather than e.g. degree distance) between document's
+   * coordinate and predicted coordinate
+   */
+  val pred_truedist = document.distance_to_coord(pred_coord)
+
+  def record_result(stats: DocumentEvalStats) {
+    stats.record_predicted_distance(pred_truedist)
+  }
+}
+
+/**
+ * Subclass of `DocumentEvaluationResult` where the predicted coordinate
+ * is a point, not necessarily the central point of one of the grid cells.
+ *
+ * @tparam TCoord type of a coordinate
+ * @tparam TDoc type of a document
+ * @tparam TCell type of a cell
+ * @tparam TGrid type of a cell grid
+ *
+ * @param document document whose coordinate is predicted
+ * @param cell_grid cell grid against which error comparison should be done
+ * @param pred_coord predicted coordinate of the document
+ */
+class CoordDocumentEvaluationResult[
+  TCoord,
+  TDoc <: DistDocument[TCoord],
+  TCell <: GeoCell[TCoord, TDoc],
+  TGrid <: CellGrid[TCoord, TDoc, TCell]
 ](
   document: TDoc,
-  pred_cell: TCell,
-  true_rank: Int
+  cell_grid: TGrid,
+  pred_coord: TCoord
+) extends DocumentEvaluationResult[TCoord, TDoc, TCell, TGrid](
+  document, cell_grid, pred_coord
 ) {
-  val true_cell =
-    pred_cell.cell_grid.find_best_cell_for_coord(document.coord, true)
-  val num_docs_in_true_cell = true_cell.combined_dist.num_docs_for_word_dist
-  val true_center = true_cell.get_center_coord()
-  val true_truedist = document.distance_to_coord(true_center)
-  val pred_center = pred_cell.get_center_coord()
-  val pred_truedist = document.distance_to_coord(pred_center)
+  override def record_result(stats: DocumentEvalStats) {
+    super.record_result(stats)
+    // It doesn't really make sense to record a result as "correct" or
+    // "incorrect" but we need to record something; just do "false"
+    // FIXME: Fix the incorrect assumption here that "correct" or
+    // "incorrect" always exists.
+    stats.asInstanceOf[CoordDocumentEvalStats].record_result(false)
+  }
 }
 
-abstract class DocumentEvalStats(
-  driver_stats: ExperimentDriverStats,
-  prefix: String,
-  max_rank_for_credit: Int = 10
-) extends EvalStatsWithRank(driver_stats, prefix, max_rank_for_credit) {
+/**
+ * Subclass of `DocumentEvaluationResult` where the predicted coordinate
+ * is specifically the central point of one of the grid cells.
+ *
+ * @tparam TCoord type of a coordinate
+ * @tparam TDoc type of a document
+ * @tparam TCell type of a cell
+ * @tparam TGrid type of a cell grid
+ *
+ * @param document document whose coordinate is predicted
+ * @param pred_cell top-ranked predicted cell in which the document should
+ *        belong
+ * @param true_rank rank of the document's true cell among all of the
+ *        predicted cells
+ */
+class RankedDocumentEvaluationResult[
+  TCoord,
+  TDoc <: DistDocument[TCoord],
+  TCell <: GeoCell[TCoord, TDoc],
+  TGrid <: CellGrid[TCoord, TDoc, TCell]
+](
+  document: TDoc,
+  val pred_cell: TCell,
+  val true_rank: Int
+) extends DocumentEvaluationResult[TCoord, TDoc, TCell, TGrid](
+  document, pred_cell.cell_grid.asInstanceOf[TGrid],
+  pred_cell.get_center_coord()
+) {
+  override def record_result(stats: DocumentEvalStats) {
+    super.record_result(stats)
+    stats.asInstanceOf[RankedDocumentEvalStats].record_true_rank(true_rank)
+  }
+}
+
+/**
+ * A basic class for accumulating statistics from multiple evaluation
+ * results.
+ */
+trait DocumentEvalStats extends EvalStats {
   // "True dist" means actual distance in km's or whatever.
   val true_dists = mutable.Buffer[Double]()
   val oracle_true_dists = mutable.Buffer[Double]()
 
-  def record_result(rank: Int, pred_true_dist: Double) {
-    super.record_result(rank)
+  def record_predicted_distance(pred_true_dist: Double) {
     true_dists += pred_true_dist
   }
 
-  def record_oracle_result(oracle_true_dist: Double) {
+  def record_oracle_distance(oracle_true_dist: Double) {
     oracle_true_dists += oracle_true_dist
   }
 
 }
 
 /**
- * Class for statistics for locating documents, with separate
- * sets of statistics for different intervals of error distances and
- * number of documents in true cell.
+ * A class for accumulating statistics from multiple evaluation results,
+ * where the results directly specify a coordinate (rather than e.g. a cell).
  */
+abstract class CoordDocumentEvalStats(
+  driver_stats: ExperimentDriverStats,
+  prefix: String
+) extends EvalStats(driver_stats, prefix, Map[String, String]())
+  with DocumentEvalStats {
+}
 
-abstract class GroupedDocumentEvalStats[TCoord,
+/**
+ * A class for accumulating statistics from multiple evaluation results,
+ * including statistics on the rank of the true cell.
+ */
+abstract class RankedDocumentEvalStats(
+  driver_stats: ExperimentDriverStats,
+  prefix: String,
+  max_rank_for_credit: Int = 10
+) extends EvalStatsWithRank(driver_stats, prefix, max_rank_for_credit)
+  with DocumentEvalStats {
+  def record_true_rank(rank: Int) {
+    record_result(rank)
+  }
+}
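
A design note on this refactoring: the stats classes no longer expose a single rank-plus-distance record_result; instead each evaluation result records itself into the stats object (see record_one_result further down), and the rank- or coordinate-specific pieces are added by overriding record_result in the result subclasses, which downcast the stats. A compressed sketch of that flow, with the real classes reduced to bare stubs and names shortened:

    // Stand-ins for DocumentEvalStats / RankedDocumentEvalStats and
    // DocumentEvaluationResult / RankedDocumentEvaluationResult.
    trait Stats { def recordPredictedDistance(d: Double) { /* accumulate */ } }
    trait RankedStats extends Stats { def recordTrueRank(r: Int) { /* accumulate */ } }

    class Result(val predDist: Double) {
      def record(stats: Stats) { stats.recordPredictedDistance(predDist) }
    }
    class RankedResult(predDist: Double, val trueRank: Int) extends Result(predDist) {
      override def record(stats: Stats) {
        super.record(stats)
        // Same downcast as in the real record_result overrides.
        stats.asInstanceOf[RankedStats].recordTrueRank(trueRank)
      }
    }
    // Grouped-stats code can call result.record(stats) without knowing whether
    // the result is ranked or coordinate-based.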
+
+/**
+ * Class for accumulating statistics from multiple document evaluation results,
+ * with separate sets of statistics for different intervals of error distances
+ * and number of documents in true cell. ("Grouped" in the sense that we may be
+ * computing not only results for the documents as a whole but also for various
+ * subgroups.)
+ *
+ * @tparam TCoord type of a coordinate
+ * @tparam TDoc type of a document
+ * @tparam TCell type of a cell
+ * @tparam TGrid type of a cell grid
+ * @tparam TEvalRes type of object holding result of evaluating a document
+ *
+ * @param driver_stats Object (possibly a trait) through which global-level
+ *   program statistics can be accumulated (in a Hadoop context, this maps
+ *   to counters).
+ * @param cell_grid Cell grid against which results were derived.
+ * @param results_by_range If true, record more detailed range-by-range
+ *   subresults.  Not on by default because Hadoop may choke on the large
+ *   number of counters created this way.
+ */
+abstract class GroupedDocumentEvalStats[
+  TCoord,
   TDoc <: DistDocument[TCoord],
-  TCell <: GeoCell[TCoord, TDoc]](
+  TCell <: GeoCell[TCoord, TDoc],
+  TGrid <: CellGrid[TCoord, TDoc, TCell],
+  TEvalRes <: DocumentEvaluationResult[TCoord, TDoc, TCell, TGrid]
+](
   driver_stats: ExperimentDriverStats,
-  cell_grid: CellGrid[TCoord,TDoc,TCell],
+  cell_grid: TGrid,
   results_by_range: Boolean
 ) {
-  type TBasicEvalStats <: DocumentEvalStats
-  type TDocEvalRes <:
-    DocumentEvaluationResult[TCoord, TDoc, TCell]
-
-  def create_stats(prefix: String): TBasicEvalStats
+  def create_stats(prefix: String): DocumentEvalStats
   def create_stats_for_range[T](prefix: String, range: T) =
     create_stats(prefix + ".byrange." + range)
 
   // and longitudinally.
   val dist_fraction_increment = 0.25
   def docmap(prefix: String) =
-    new SettingDefaultHashMap[Double, TBasicEvalStats](
+    new SettingDefaultHashMap[Double, DocumentEvalStats](
       create_stats_for_range(prefix, _))
   val docs_by_true_dist_to_true_center =
     docmap("true_dist_to_true_center")
     new DoubleTableByRange(dist_fractions_for_error_dist,
       create_stats_for_range("true_dist_to_pred_center", _))
 
-  def record_one_result(stats: TBasicEvalStats, res: TDocEvalRes) {
-    stats.record_result(res.true_rank, res.pred_truedist)
+  def record_one_result(stats: DocumentEvalStats, res: TEvalRes) {
+    res.record_result(stats)
   }
 
-  def record_one_oracle_result(stats: TBasicEvalStats, res: TDocEvalRes) {
-    stats.record_oracle_result(res.true_truedist)
+  def record_one_oracle_result(stats: DocumentEvalStats, res: TEvalRes) {
+    stats.record_oracle_distance(res.true_truedist)
   }
 
-  def record_result(res: TDocEvalRes) {
+  def record_result(res: TEvalRes) {
     record_one_result(all_document, res)
     record_one_oracle_result(all_document, res)
     // Stephen says recording so many counters leads to crashes (at the 51st
       record_result_by_range(res)
   }
 
-  def record_result_by_range(res: TDocEvalRes) {
+  def record_result_by_range(res: TEvalRes) {
     val naitr = docs_by_naitr.get_collector(res.num_docs_in_true_cell)
     record_one_result(naitr, res)
   }
 /////////////////////////////////////////////////////////////////////////////
 
 /**
- * Basic abstract class for reading documents from a test file and evaluating
- * on them.  Doesn't use any driver class. (FIXME, perhaps we should
- * integrate this into TestFileEvaluator.)
+ * Basic abstract class for evaluating a test document.  Doesn't use any
+ * driver class.
+ *
+ * TestDocumentEvaluator is currently the only subclass. (FIXME: Perhaps we
+ * should integrate the two.) The reason they are separated is that
+ * TestDocumentEvaluator makes use of a GridLocateDriver class, which
+ * encapsulates (among other things) various command-line parameters,
+ * in particular command-line parameters that allow a subset of the
+ * total set of documents to be evaluated.
  *
  * @tparam TEvalDoc Type of document to evaluate.
  * @tparam TEvalRes Type of result of evaluating a document.
+ *
+ * @param stratname Name of the strategy used to perform evaluation.
+ *   This is output in various status messages.
  */
-abstract class BasicTestFileEvaluator[TEvalDoc, TEvalRes](
+abstract class BasicTestDocumentEvaluator[TEvalDoc, TEvalRes](
   val stratname: String
 ) {
   var documents_processed = 0
   def would_stop_processing(new_processed: Int) = false
 
   /**
-   * Return true if document was actually processed and evaluated; false
-   * if skipped.
+   * Evaluate a document.  Return an object describing the results of the
+   * evaluation.
+   *
+   * @param document Document to evaluate.
+   * @param doctag A short string identifying the document (e.g. '#25'),
+   *   to be printed out at the beginning of diagnostic lines describing
+   *   the document and its evaluation results.
    */
   def evaluate_document(doc: TEvalDoc, doctag: String):
     TEvalRes
 }
 
 /**
- * Abstract class for reading documents from a test file and evaluating
- * on them.
+ * Abstract class for evaluating a test document.
  *
  * @tparam TEvalDoc Type of document to evaluate.
  * @tparam TEvalRes Type of result of evaluating a document.
  *
- * Evaluates on all of the given files, outputting periodic results and
- * results after all files are done.  If the evaluator uses documents as
- * documents (so that it doesn't need any external test files), the value
- * of 'files' should be a sequence of one item, which is null. (If an
- * empty sequence is passed in, no evaluation will happen.)
-
- * Also returns an object containing the results.
+ * @param stratname Name of the strategy used for performing evaluation.
+ * @param driver Driver class that encapsulates command-line parameters and
+ *   such.
+ *
+ * This is a subclass of `BasicTestDocumentEvaluator` which uses the command-line
+ * parameters to determine which documents should be skipped.
  */
-abstract class TestFileEvaluator[TEvalDoc, TEvalRes](
+abstract class TestDocumentEvaluator[TEvalDoc, TEvalRes](
   stratname: String,
   val driver: GridLocateDriver
-) extends BasicTestFileEvaluator[TEvalDoc, TEvalRes](stratname) {
+) extends BasicTestDocumentEvaluator[TEvalDoc, TEvalRes](stratname) {
   override val task = new MeteredTask("document", "evaluating",
     maxtime = driver.params.max_time_per_stage)
   var skip_initial = driver.params.skip_initial_test_docs
   }
 }
 
-abstract class DocumentEvaluator[
+/**
+ * Abstract class for evaluating a test document by comparing it against each
+ * of the cells in a cell grid, where each cell has an associated
+ * pseudo-document created by amalgamating all of the training documents
+ * in the cell.
+ *
+ * Abstract class for evaluating a test document where a collection of
+ * documents has been divided into "training" and "test" sets, the
+ * training set is used to construct a cell grid in which the training
+ * documents in a particular cell are amalgamated to form a pseudo-document,
+ * and evaluation of a test document proceeds by comparing it against each
+ * pseudo-document in turn.
+ *
+ * This is the highest-level evaluation class that includes the concept of a
+ * coordinate that is associated with training and test documents, so that
+ * computation of error distances is possible.
+ *
+ * @tparam TCoord Type of the coordinate assigned to a document
+ * @tparam XTDoc Type of the training and test documents
+ * @tparam TCell Type of a cell in a cell grid
+ * @tparam XTGrid Type of a cell grid
+ * @tparam TEvalRes Type of result of evaluating a document.
+ *
+ * @param strategy Object encapsulating the strategy used for performing
+ *   evaluation.
+ * @param stratname Name of the strategy used for performing evaluation.
+ * @param driver Driver class that encapsulates command-line parameters and
+ *   such.
+ *
+ * Note that we are forced to use the strange names `XTDoc` and `XTGrid`
+ * because of an apparent Scala bug that prevents use of the more obvious
+ * names `TDoc` and `TGrid` due to a naming clash.  Possibly there is a
+ * solution to this problem but if so I can't figure it out.
+ */
+abstract class CellGridEvaluator[
   TCoord,
-  TDoc <: DistDocument[TCoord],
-  TCell <: GeoCell[TCoord, TDoc],
-  TGrid <: CellGrid[TCoord, TDoc, TCell],
-  TEvalDoc,
-  TEvalRes
+  XTDoc <: DistDocument[TCoord],
+  TCell <: GeoCell[TCoord, XTDoc],
+  // SCALABUG: No way to access something called 'TGrid' at this scope in the
+  // line below where it says 'type TGrid = XTGrid'
+  XTGrid <: CellGrid[TCoord, XTDoc, TCell],
+  TEvalRes <: DocumentEvaluationResult[TCoord, XTDoc, TCell, XTGrid]
 ](
-  val strategy: GridLocateDocumentStrategy[TCell, TGrid],
+  strategy: GridLocateDocumentStrategy[TCell, XTGrid],
   stratname: String,
-  driver: GridLocateDriver // GridLocateDocumentTypeDriver
-) extends TestFileEvaluator[TEvalDoc, TEvalRes](stratname, driver) {
-  type TGroupedEvalStats <: GroupedDocumentEvalStats[TCoord,TDoc,TCell]
+  driver: GridLocateDriver { type TGrid = XTGrid; type TDoc = XTDoc } // GridLocateDocumentTypeDriver
+) extends TestDocumentEvaluator[XTDoc, TEvalRes](stratname, driver) {
   def create_grouped_eval_stats(driver: GridLocateDriver, // GridLocateDocumentTypeDriver
-    cell_grid: TGrid, results_by_range: Boolean):
-    TGroupedEvalStats
+    cell_grid: XTGrid, results_by_range: Boolean):
+    GroupedDocumentEvalStats[TCoord, XTDoc, TCell, XTGrid, TEvalRes]
+
   val evalstats = create_grouped_eval_stats(driver,
     strategy.cell_grid, results_by_range = driver.params.results_by_range)
 
   def output_results(isfinal: Boolean = false) {
     evalstats.output_results(all_results = isfinal)
-  }
-}
-
-/**
- * Class to do document grid-location on documents from the document data, in
- * the dev or test set.
- */
-abstract class CorpusDocumentEvaluator[
-  TCoord,
-  XTDoc <: DistDocument[TCoord],
-  TCell <: GeoCell[TCoord, XTDoc],
-  // SCALABUG: No way access something called 'TGrid' at this scope in the
-  // line below where it says 'type TGrid = XTGrid'
-  XTGrid <: CellGrid[TCoord, XTDoc, TCell],
-  TEvalRes <: DocumentEvaluationResult[_,_,_]
-](
-  strategy: GridLocateDocumentStrategy[TCell, XTGrid],
-  stratname: String,
-  driver: GridLocateDriver { type TGrid = XTGrid; type TDoc = XTDoc } // GridLocateDocumentTypeDriver
-) extends DocumentEvaluator[
-  TCoord, XTDoc, TCell, XTGrid, XTDoc, TEvalRes
-](strategy, stratname, driver) {
-  override type TGroupedEvalStats <:
-    GroupedDocumentEvalStats[TCoord,XTDoc,TCell] { type TDocEvalRes = TEvalRes }
+ }
 
   /**
    * A file processor that reads corpora containing document metadata and
     } else false
   }
 
-  def create_evaluation_result(document: XTDoc, pred_cell: TCell,
-    true_rank: Int): TEvalRes
+  /**
+   * Compare the document to the pseudo-documents associated with each cell,
+   * using the strategy for this evaluator.  Return a tuple
+   * (pred_cells, true_rank), where:
+   *
+   *  pred_cells = List of predicted cells, from best to worst; each list
+   *     entry is actually a tuple of (cell, score) where lower scores
+   *     are better
+   *  true_rank = Rank of true cell among predicted cells
+   *
+   * @param document Document to evaluate.
+   * @param true_cell Cell in the cell grid which contains the document.
+   */
+  def return_ranked_cells(document: XTDoc, true_cell: TCell) = {
+    if (driver.params.oracle_results)
+      (Array((true_cell, 0.0)), 1)
+    else {
+      def get_computed_results() = {
+        val cells = strategy.return_ranked_cells(document.dist).toArray
+        var rank = 1
+        var broken = false
+        breakable {
+          for ((cell, value) <- cells) {
+            if (cell eq true_cell) {
+              broken = true
+              break
+            }
+            rank += 1
+          }
+        }
+        if (!broken)
+          rank = 1000000000
+        (cells, rank)
+      }
 
-  def print_individual_result(doctag: String, document: XTDoc,
-    result: TEvalRes, pred_cells: Array[(TCell, Double)])
+      get_computed_results()
+    }
+  }
 
+  /**
+   * Actual implementation of code to evaluate a document.  Return an
+   * object describing the results of the evaluation, and
+   * optionally print out information on these results.
+   *
+   * @param document Document to evaluate.
+   * @param doctag A short string identifying the document (e.g. '#25'),
+   *   to be printed out at the beginning of diagnostic lines describing
+   *   the document and its evaluation results.
+   * @param true_cell Cell in the cell grid which contains the document.
+   * @param want_indiv_results Whether we should print out individual
+   *   evaluation results for the document.
+   */
+  def imp_evaluate_document(document: XTDoc, doctag: String,
+      true_cell: TCell, want_indiv_results: Boolean): TEvalRes
+
+  /**
+   * Evaluate a document, record statistics about it, etc.  Calls
+   * `imp_evaluate_document` to do the document evaluation and optionally
+   * print out information on the results, and records the results in
+   * `evalstat`.
+   *
+   * Return an object describing the results of the evaluation.
+   *
+   * @param document Document to evaluate.
+   * @param doctag A short string identifying the document (e.g. '#25'),
+   *   to be printed out at the beginning of diagnostic lines describing
+   *   the document and its evaluation results.
+   */
   def evaluate_document(document: XTDoc, doctag: String): TEvalRes = {
-    if (would_skip_document(document, doctag)) {
-      evalstats.increment_counter("documents.skipped")
-      // SCALABUG: Doesn't automatically recognize TEvalRes as a reference
-      // type despite being a subclass of DocumentEvaluationResult
-      return null.asInstanceOf[TEvalRes]
-    }
+    assert(!would_skip_document(document, doctag))
     assert(document.dist.finished)
     val true_cell =
       strategy.cell_grid.find_best_cell_for_coord(document.coord, true)
       errprint("Evaluating document %s with %s word-dist documents in true cell",
         document, naitr)
     }
+    val want_indiv_results =
+      !driver.params.oracle_results && !driver.params.no_individual_results
+    val result = imp_evaluate_document(document, doctag, true_cell,
+      want_indiv_results)
+    evalstats.record_result(result)
+    if (result.num_docs_in_true_cell == 0) {
+      evalstats.increment_counter("documents.no_training_documents_in_cell")
+    }
+    result
+  }
+}
 
-    //val num_nearest_neighbors = 10
+/**
+ * An implementation of `CellGridEvaluator` that compares the test
+ * document against each pseudo-document in the cell grid, ranks them by
+ * score and takes the document's location to be the central point of the
+ * top-ranked cell.
+ *
+ * @tparam TCoord Type of the coordinate assigned to a document
+ * @tparam XTDoc Type of the training and test documents
+ * @tparam TCell Type of a cell in a cell grid
+ * @tparam XTGrid Type of a cell grid
+ * @tparam TEvalRes Type of result of evaluating a document.
+ *
+ * @param strategy Object encapsulating the strategy used for performing
+ *   evaluation.
+ * @param stratname Name of the strategy used for performing evaluation.
+ * @param driver Driver class that encapsulates command-line parameters and
+ *   such.
+ */
+abstract class RankedCellGridEvaluator[
+  TCoord,
+  XTDoc <: DistDocument[TCoord],
+  TCell <: GeoCell[TCoord, XTDoc],
+  XTGrid <: CellGrid[TCoord, XTDoc, TCell],
+  TEvalRes <: DocumentEvaluationResult[TCoord, XTDoc, TCell, XTGrid]
+](
+  strategy: GridLocateDocumentStrategy[TCell, XTGrid],
+  stratname: String,
+  driver: GridLocateDriver { type TGrid = XTGrid; type TDoc = XTDoc } // GridLocateDocumentTypeDriver
+) extends CellGridEvaluator[
+  TCoord, XTDoc, TCell, XTGrid, TEvalRes
+](strategy, stratname, driver) {
+  /**
+   * Create an evaluation-result object describing the top-ranked
+   * predicted cell and the rank of the document's true cell among
+   * all predicted cells.
+   */
+  def create_cell_evaluation_result(document: XTDoc, pred_cell: TCell,
+    true_rank: Int): TEvalRes
 
-    /* That is:
+  val num_nearest_neighbors = driver.params.num_nearest_neighbors
 
-       pred_cells = List of predicted cells, from best to worst; each list
-          entry is actually a tuple of (cell, score) where lower scores
-          are better
-       true_rank = Rank of true cell among predicted cells
-     */
-    val (pred_cells, true_rank) =