Ben Wing committed 8b25e36

Rename/rationalize the names of various classes, add documentation

Files changed (5)

src/main/scala/opennlp/textgrounder/geolocate/Geolocate.scala

     if (params.eval_format == "pcl-travel")
       new PCLTravelGeolocateDocumentEvaluator(strategy, stratname, this)
     else if (params.coord_strategy == "top-ranked")
-      new RankedCorpusGeolocateDocumentEvaluator(strategy, stratname, this)
+      new RankedSphereCellGridEvaluator(strategy, stratname, this)
     else
-      new MeanShiftCorpusGeolocateDocumentEvaluator(strategy, stratname, this,
+      new MeanShiftSphereCellGridEvaluator(strategy, stratname, this,
         params.k_best, params.mean_shift_window,
         params.mean_shift_max_stddev,
         params.mean_shift_max_iterations)

src/main/scala/opennlp/textgrounder/geolocate/Hadoop.scala

 import opennlp.textgrounder.util.mathutil.{mean, median}
 import opennlp.textgrounder.util.printutil.{errprint, warning}
 
-import opennlp.textgrounder.gridlocate.{CorpusDocumentEvaluator,TextGrounderInfo,DistDocumentFileProcessor}
+import opennlp.textgrounder.gridlocate.{CellGridEvaluator,TextGrounderInfo,DistDocumentFileProcessor}
 
 /* Basic idea for hooking up Geolocate with Hadoop.  Hadoop works in terms
    of key-value pairs, as follows:
   def create_param_object(ap: ArgParser) = new TParam(ap)
   def create_driver() = new TDriver
 
-  var evaluators: Iterable[CorpusDocumentEvaluator[SphereCoord,SphereDocument,_,_,_]] = null
+  var evaluators: Iterable[CellGridEvaluator[SphereCoord,SphereDocument,_,_,_]] = null
   val task = new ExperimentMeteredTask(driver, "document", "evaluating")
 
   class HadoopDocumentFileProcessor(
       evaluators =
         for ((stratname, strategy) <- driver.strategies)
           yield driver.create_document_evaluator(strategy, stratname).
-            asInstanceOf[CorpusDocumentEvaluator[
+            asInstanceOf[CellGridEvaluator[
               SphereCoord,SphereDocument,_,_,_]]
       if (driver.params.input_corpus.length != 1) {
         driver.params.parser.error(

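The comment above is truncated here, but it introduces Hadoop's key-value model. As a rough, hypothetical sketch of that flow in plain Scala (no Hadoop API; `EvalResult`, `evaluate` and `map` are invented stand-ins, not TextGrounder code), a mapper would take one document and emit one (strategy-name, result) pair per configured strategy, mirroring how `evaluators` above is built with one evaluator per strategy:

  // Hypothetical sketch only -- not actual TextGrounder or Hadoop code.
  object KeyValueSketch {
    case class EvalResult(docId: String, errorKm: Double)

    // Stand-in for running one strategy's evaluator over one document.
    def evaluate(stratname: String, docId: String, text: String) =
      EvalResult(docId, errorKm = 0.0) // real code would geolocate `text`

    // Mapper: (docId, text) in; (stratname, result) pairs out.
    def map(docId: String, text: String,
        strategies: Iterable[String]): Iterable[(String, EvalResult)] =
      for (stratname <- strategies)
        yield (stratname, evaluate(stratname, docId, text))
  }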
src/main/scala/opennlp/textgrounder/geolocate/SphereEvaluation.scala

 
 //////// Statistics for geolocating documents
 
+/**
+ * A general trait for encapsulating SphereDocument-specific behavior.
+ * Chiefly this means computing "degree distances" in addition to
+ * "true distances", and making sure results are output in both miles
+ * and km.
+ */
 trait SphereDocumentEvalStats extends DocumentEvalStats {
   // "True dist" means actual distance in km's or whatever.
   // "Degree dist" is the distance in degrees.
 }
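For concreteness, here is one plausible reading of the two distance notions in the comment above, as a self-contained sketch (the formulas are assumptions; TextGrounder's actual implementations are not part of this diff):

  import math.{sin, cos, asin, sqrt, pow, toRadians}

  object DistanceSketch {
    val EarthRadiusKm = 6371.0

    // "True dist": great-circle distance in km (haversine formula).
    def trueDist(lat1: Double, long1: Double,
        lat2: Double, long2: Double) = {
      val a = pow(sin(toRadians(lat2 - lat1) / 2), 2) +
        cos(toRadians(lat1)) * cos(toRadians(lat2)) *
        pow(sin(toRadians(long2 - long1) / 2), 2)
      2 * EarthRadiusKm * asin(sqrt(a))
    }

    // "Degree dist": plain Euclidean distance in degrees.
    def degreeDist(lat1: Double, long1: Double,
        lat2: Double, long2: Double) =
      sqrt(pow(lat2 - lat1, 2) + pow(long2 - long1, 2))
  }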
 
 /**
- * Class for statistics for geolocating documents, with separate
- * sets of statistics for different intervals of error distances and
- * number of documents in true cell.
+ * SphereDocument version of `GroupedDocumentEvalStats`.  This keeps separate
+ * sets of statistics for different subgroups of the test documents, i.e.
+ * those within particular ranges of one or more quantities of interest.
  */
-
-class SphereGroupedDocumentEvalStats(
+class GroupedSphereDocumentEvalStats(
   driver_stats: ExperimentDriverStats,
   cell_grid: SphereCellGrid,
   results_by_range: Boolean,
 //                             Main evaluation code                        //
 /////////////////////////////////////////////////////////////////////////////
 
+/**
+ * A general trait holding SphereDocument-specific code for storing the
+ * result of evaluation on a document.  Here we simply compute the
+ * true and predicted "degree distances" -- i.e. measured in degrees,
+ * rather than as actual distance along a great circle.
+ */
 trait SphereDocumentEvaluationResult extends DocumentEvaluationResult[
   SphereCoord, SphereDocument, SphereCell, SphereCellGrid
 ] {
  * @param true_rank rank of the document's true cell among all of the
 *        predicted cells
  */
-class SphereDocumentEvaluationResultCell(
+class RankedSphereDocumentEvaluationResult(
   document: SphereDocument,
   pred_cell: SphereCell,
   true_rank: Int
-) extends DocumentEvaluationResultCell[
+) extends RankedDocumentEvaluationResult[
   SphereCoord, SphereDocument, SphereCell, SphereCellGrid
   ](
   document, pred_cell, true_rank
  * @param cell_grid cell grid against which error comparison should be done
  * @param pred_coord predicted coordinate of the document
  */
-class SphereDocumentEvaluationResultCoord(
+class CoordSphereDocumentEvaluationResult(
   document: SphereDocument,
   cell_grid: SphereCellGrid,
   pred_coord: SphereCoord
 }
 
 /**
- * Class to do document geolocating on documents from the document data, in
- * the dev or test set.
+ * Specialization of `RankedCellGridEvaluator` for SphereCoords (latitude/
+ * longitude coordinates on the surface of a sphere).  Class for evaluating
+ * (geolocating) a test document using a strategy that ranks the cells in the
+ * cell grid and picks the central point of the top-ranked one.
  */
-class RankedCorpusGeolocateDocumentEvaluator(
+class RankedSphereCellGridEvaluator(
   strategy: GridLocateDocumentStrategy[SphereCell, SphereCellGrid],
   stratname: String,
   driver: GeolocateDocumentTypeDriver
-) extends RankedCorpusDocumentEvaluator[
+) extends RankedCellGridEvaluator[
   SphereCoord, SphereDocument, SphereCell, SphereCellGrid,
-  SphereDocumentEvaluationResultCell
+  RankedSphereDocumentEvaluationResult
 ](strategy, stratname, driver) {
   def create_grouped_eval_stats(driver: GridLocateDriver,
     cell_grid: SphereCellGrid, results_by_range: Boolean) =
-    new SphereGroupedDocumentEvalStats(driver,
+    new GroupedSphereDocumentEvalStats(driver,
       cell_grid.asInstanceOf[SphereCellGrid],
       results_by_range, is_ranked = true)
   def create_cell_evaluation_result(document: SphereDocument,
       pred_cell: SphereCell, true_rank: Int) =
-    new SphereDocumentEvaluationResultCell(document, pred_cell, true_rank)
+    new RankedSphereDocumentEvaluationResult(document, pred_cell, true_rank)
 
   val num_nearest_neighbors = driver.params.num_nearest_neighbors
 
   def print_individual_result(doctag: String, document: SphereDocument,
-      result: SphereDocumentEvaluationResultCell,
+      result: RankedSphereDocumentEvaluationResult,
       pred_cells: Array[(SphereCell, Double)]) {
     errprint("%s:Document %s:", doctag, document)
     // errprint("%s:Document distribution: %s", doctag, document.dist)
 }
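Stripped of the framework types, the core step the RankedSphereCellGridEvaluator comment describes -- rank every cell, predict the center of the winner -- is just the following (toy types, not the actual classes):

  object RankedSketch {
    case class Coord(lat: Double, long: Double)
    case class Cell(center: Coord)

    // Given (cell, score) pairs from some ranking strategy, predict the
    // document's location as the central point of the top-ranked cell.
    def predict(scoredCells: Seq[(Cell, Double)]): Coord =
      scoredCells.maxBy(_._2)._1.center
  }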
 
 /**
- * Class to do document geolocating on documents from the document data, in
- * the dev or test set.
+ * Specialization of `MeanShiftCellGridEvaluator` for SphereCoords (latitude/
+ * longitude coordinates on the surface of a sphere).  Class for evaluating
+ * (geolocating) a test document using a mean-shift strategy, i.e. picking the
+ * K-best-ranked cells and using the mean-shift algorithm to derive a single
+ * point that is hopefully in the center of the largest cluster.
  */
-class MeanShiftCorpusGeolocateDocumentEvaluator(
+class MeanShiftSphereCellGridEvaluator(
   strategy: GridLocateDocumentStrategy[SphereCell, SphereCellGrid],
   stratname: String,
   driver: GeolocateDocumentTypeDriver,
   mean_shift_window: Double,
   mean_shift_max_stddev: Double,
   mean_shift_max_iterations: Int
-) extends MeanShiftCorpusDocumentEvaluator[
+) extends MeanShiftCellGridEvaluator[
   SphereCoord, SphereDocument, SphereCell, SphereCellGrid,
   SphereDocumentEvaluationResult
 ](strategy, stratname, driver, k_best, mean_shift_window,
   mean_shift_max_stddev, mean_shift_max_iterations) {
   def create_grouped_eval_stats(driver: GridLocateDriver,
     cell_grid: SphereCellGrid, results_by_range: Boolean) =
-    new SphereGroupedDocumentEvalStats(driver,
-      cell_grid.asInstanceOf[SphereCellGrid],
-      results_by_range, is_ranked = false)
+    new GroupedSphereDocumentEvalStats(driver,
+      cell_grid, results_by_range, is_ranked = false)
   def create_coord_evaluation_result(document: SphereDocument,
       cell_grid: SphereCellGrid, pred_coord: SphereCoord) =
-    new SphereDocumentEvaluationResultCoord(document, cell_grid, pred_coord)
+    new CoordSphereDocumentEvaluationResult(document, cell_grid, pred_coord)
 
   def print_individual_result(doctag: String, document: SphereDocument,
       result: SphereDocumentEvaluationResult) {
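A minimal sketch of the mean-shift loop the class comment describes, run over the centers of the K best cells (flat window; the loop body is an assumption -- the actual MeanShiftCellGridEvaluator presumably also uses mean_shift_max_stddev in its convergence test):

  object MeanShiftSketch {
    case class Pt(x: Double, y: Double)
    def dist(a: Pt, b: Pt) = math.hypot(a.x - b.x, a.y - b.y)

    // Repeatedly recenter on the mean of all points within `window` of the
    // current estimate, seeded at the top-ranked cell's center, until the
    // estimate stops moving or the iteration cap is hit.
    def meanShift(points: Seq[Pt], window: Double,
        maxIterations: Int): Pt = {
      var current = points.head
      var moved = true
      var iter = 0
      while (moved && iter < maxIterations) {
        val near = points.filter(p => dist(p, current) <= window)
        if (near.isEmpty) moved = false // degenerate: nothing in the window
        else {
          val next = Pt(near.map(_.x).sum / near.size,
                        near.map(_.y).sum / near.size)
          moved = dist(next, current) > 1e-10
          current = next
        }
        iter += 1
      }
      current
    }
  }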

src/main/scala/opennlp/textgrounder/gridlocate/Evaluation.scala

  * Subclass of `DocumentEvaluationResult` where the predicted coordinate
  * is specifically the central point of one of the grid cells.
  *
+ * @tparam TCoord type of a coordinate
+ * @tparam TDoc type of a document
+ * @tparam TCell type of a cell
+ * @tparam TGrid type of a cell grid
+ *
  * @param document document whose coordinate is predicted
  * @param pred_cell top-ranked predicted cell in which the document should
  *        belong
  * @param true_rank rank of the document's true cell among all of the
 *        predicted cells
  */
-class DocumentEvaluationResultCell[
+class RankedDocumentEvaluationResult[
   TCoord,
   TDoc <: DistDocument[TCoord],
   TCell <: GeoCell[TCoord, TDoc],
  * and number of documents in true cell. ("Grouped" in the sense that we may be
  * computing not only results for the documents as a whole but also for various
  * subgroups.)
+ *
+ * @tparam TCoord type of a coordinate
+ * @tparam TDoc type of a document
+ * @tparam TCell type of a cell
+ * @tparam TGrid type of a cell grid
+ * @tparam TEvalRes type of object holding result of evaluating a document
+ *
+ * @param driver_stats Object (possibly a trait) through which global-level
+ *   program statistics can be accumulated (in a Hadoop context, this maps
+ *   to counters).
+ * @param cell_grid Cell grid against which results were derived.
+ * @param results_by_range If true, record more detailed range-by-range
+ *   subresults.  Not on by default because Hadoop may choke on the large
+ *   number of counters created this way.
  */
 abstract class GroupedDocumentEvalStats[
   TCoord,
   TDoc <: DistDocument[TCoord],
   TCell <: GeoCell[TCoord, TDoc],
   TGrid <: CellGrid[TCoord, TDoc, TCell],
-  -TDocEvalRes <: DocumentEvaluationResult[TCoord, TDoc, TCell, TGrid]
+  /* The following - sign is necessary (indicating contravariance) because of
+     the use of subclasses like RankedSphereDocumentEvaluationResult in
+     RankedSphereCellGridEvaluator, whereas GroupedSphereDocumentEvalStats
+     merely uses SphereDocumentEvaluationResult. */
+  -TEvalRes <: DocumentEvaluationResult[TCoord, TDoc, TCell, TGrid]
 ](
   driver_stats: ExperimentDriverStats,
   cell_grid: CellGrid[TCoord,TDoc,TCell],
     new DoubleTableByRange(dist_fractions_for_error_dist,
       create_stats_for_range("true_dist_to_pred_center", _))
 
-  def record_one_result(stats: DocumentEvalStats, res: TDocEvalRes) {
+  def record_one_result(stats: DocumentEvalStats, res: TEvalRes) {
     res.record_result(stats)
   }
 
-  def record_one_oracle_result(stats: DocumentEvalStats, res: TDocEvalRes) {
+  def record_one_oracle_result(stats: DocumentEvalStats, res: TEvalRes) {
     stats.record_oracle_distance(res.true_truedist)
   }
 
-  def record_result(res: TDocEvalRes) {
+  def record_result(res: TEvalRes) {
     record_one_result(all_document, res)
     record_one_oracle_result(all_document, res)
     // Stephen says recording so many counters leads to crashes (at the 51st
       record_result_by_range(res)
   }
 
-  def record_result_by_range(res: TDocEvalRes) {
+  def record_result_by_range(res: TEvalRes) {
     val naitr = docs_by_naitr.get_collector(res.num_docs_in_true_cell)
     record_one_result(naitr, res)
   }
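The contravariance comment added above can be seen in miniature with toy classes standing in for the real ones:

  class Result                       // cf. SphereDocumentEvaluationResult
  class RankedResult extends Result  // cf. RankedSphereDocumentEvaluationResult

  class Stats[-TEvalRes] {           // contravariant, as in GroupedDocumentEvalStats
    def record_result(res: TEvalRes) {}
  }

  object ContravarianceDemo {
    // Stats built for the supertype can record any of its subtypes, so a
    // Stats[Result] is usable wherever a Stats[RankedResult] is expected --
    // the direction needed when RankedSphereCellGridEvaluator (parameterized
    // on the Ranked subtype) is handed a GroupedSphereDocumentEvalStats
    // (parameterized on the plain supertype).
    val general: Stats[Result] = new Stats[Result]
    val specific: Stats[RankedResult] = general // compiles only because of -TEvalRes
  }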
  * names `TDoc` and `TGrid` due to a naming clash.  Possibly there is a
  * solution to this problem but if so I can't figure it out.
  */
-abstract class CorpusDocumentEvaluator[
+abstract class CellGridEvaluator[
   TCoord,
   XTDoc <: DistDocument[TCoord],
   TCell <: GeoCell[TCoord, XTDoc],
 }
 
 /**
- * An implementation of `CorpusDocumentEvaluator` that compares the test
+ * An implementation of `CellGridEvaluator` that compares the test
  * document against each pseudo-document in the cell grid, ranks them by
  * score and computes the document's location by the central point of the
  * top-ranked cell.
  * @param driver Driver class that encapsulates command-line parameters and
  *   such.
  */
-abstract class RankedCorpusDocumentEvaluator[
+abstract class RankedCellGridEvaluator[
   TCoord,
   XTDoc <: DistDocument[TCoord],
   TCell <: GeoCell[TCoord, XTDoc],
   strategy: GridLocateDocumentStrategy[TCell, XTGrid],
   stratname: String,
   driver: GridLocateDriver { type TGrid = XTGrid; type TDoc = XTDoc } // GridLocateDocumentTypeDriver
-) extends CorpusDocumentEvaluator[
+) extends CellGridEvaluator[
   TCoord, XTDoc, TCell, XTGrid, TEvalRes
 ](strategy, stratname, driver) {
   /**
 }
 
 /**
- * An implementation of `CorpusDocumentEvaluator` that compares the test
+ * An implementation of `CellGridEvaluator` that compares the test
  * document against each pseudo-document in the cell grid, selects the
  * top N ranked pseudo-documents for some N, and uses the mean-shift
  * algorithm to determine a single point that is hopefully in the middle
  * @param driver Driver class that encapsulates command-line parameters and
  *   such.
  */
-abstract class MeanShiftCorpusDocumentEvaluator[
+abstract class MeanShiftCellGridEvaluator[
   TCoord,
   XTDoc <: DistDocument[TCoord],
   TCell <: GeoCell[TCoord, XTDoc],
   mean_shift_window: Double,
   mean_shift_max_stddev: Double,
   mean_shift_max_iterations: Int
-) extends CorpusDocumentEvaluator[
+) extends CellGridEvaluator[
   TCoord, XTDoc, TCell, XTGrid, TEvalRes
 ](strategy, stratname, driver) {
   /**
   }
 }
 
+/**
+ * A trait used when '--eval-format' is not 'internal', i.e. the test documents
+ * don't come from the same corpus used to supply the training documents,
+ * but come from some separate text file.  This is a general interface for
+ * iterating over files and returning the test documents in those files
+ * (possibly more than one per file).
+ */
 trait DocumentIteratingEvaluator[TEvalDoc, TEvalRes] extends
   TestDocumentEvaluator[TEvalDoc, TEvalRes] {
   /**

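A minimal sketch of the file-iteration interface the new DocumentIteratingEvaluator comment describes (hypothetical member name and types; the trait's actual members are truncated out of this diff):

  import scala.io.Source

  object DocIterSketch {
    // Yield the test documents found in each file -- here simply one
    // document per non-blank line, possibly more than one per file.  Real
    // code would parse the corpus format and close each Source when done.
    def iter_documents(files: Iterable[String]): Iterator[String] =
      files.iterator.flatMap { path =>
        Source.fromFile(path).getLines().filter(_.nonEmpty)
      }
  }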
src/main/scala/opennlp/textgrounder/gridlocate/GridLocate.scala

   var Params: GridLocateParameters = _
   val Debug: DebugSettings = new DebugSettings
 
-  // Debug flags (from CorpusGeolocateDocumentEvaluator) -- need to set them
+  // Debug flags (from SphereCellGridEvaluator) -- need to set them
   // here before we parse the command-line debug settings. (FIXME, should
   // be a better way that introduces fewer long-range dependencies like
   // this)