Commits

Stephen Roller  committed 9df7dbb

Manually merge work that got obliterated during the merge conflict fiasco from a few weeks ago.

  • Participants
  • Parent commits 2b384ee

Comments (0)

Files changed (4)

File src/main/java/ags/utils/KdTree.java

     // All types
     private final int                  dimensions;
     private final KdTree<T>            parent;
-    private  int                       bucketSize;
+    private int                        bucketSize;
  
     // Leaf only
     private double[][]                 locations;
      */
     private KdTree(KdTree<T> parent, boolean right) {
         this.dimensions = parent.dimensions;
+        this.bucketSize = parent.bucketSize;
  
         // Init as leaf
         this.locations = new double[Math.max(bucketSize, parent.locationCount)][];

File src/main/scala/opennlp/textgrounder/geolocate/Evaluation.scala

         record_result(res.true_rank, res.pred_truedist, res.pred_degdist)
       docs_by_degree_dist_to_pred_center.get_collector(frac_pred_degdist).
         record_result(res.true_rank, res.pred_truedist, res.pred_degdist)
+     } else if (cell_grid.isInstanceOf[KdTreeCellGrid]) {
+       // for kd trees, we do something similar to above, but round to the nearest km...
+       val kdgrid = cell_grid.asInstanceOf[KdTreeCellGrid]
+       all_document.record_oracle_result(res.true_truedist, res.true_degdist)
+       docs_by_true_dist_to_true_center(round(res.true_truedist)).
+         record_result(res.true_rank, res.pred_truedist, res.pred_degdist)
+       docs_by_degree_dist_to_true_center(round(res.true_degdist)).
+         record_result(res.true_rank, res.pred_truedist, res.pred_degdist)
     }
   }
 

File src/main/scala/opennlp/textgrounder/geolocate/Geolocate.scala

 tiling cell to compute each multi cell.  If the value is more than
 1, the multi cells overlap.""")
 
+  //// Options for using KD trees, and related parameters
+  var use_kd_tree = // "kd-tree" option; read as params.use_kd_tree in initialize_cell_grid (presumably this same field) to pick KdTreeCellGrid
+    ap.option[Boolean]("kd-tree", "kd", "kdtree", metavar = "BOOL", 
+      default = false,
+      help = """Specifies we should use a KD tree rather than uniform
+grid cell. Default %default.""")
+
+  var kd_bucketsize = // "kd-bucket-size" option; handed to the KdTreeCellGrid constructor as params.kd_bucketsize
+    ap.option[Int]("kd-bucket-size", "kdbs", "bucket-size", default=200,
+      metavar = "INT",
+      help = """Bucket size before splitting a leaf into two children.
+Default %default.""")
+
+  var kd_center_or_centroid = // "center-method" option; KdTreeCell.get_center_coord compares it against "center"
+    ap.option[String]("center-method", "cm", metavar = "CENTER_METHOD",
+      default = "centroid",
+      choices = Seq("centroid", "center"),
+      help = """Chooses whether to use center or centroid for cell
+center calculation. Options are either 'centroid' or 'center'.
+Default '%default'.""")
+
+
   //// Options used when creating word distributions
   var word_dist =
     ap.option[String]("word-dist", "wd",
   }
 
   // Build the cell grid for `table`: a KdTreeCellGrid when the
   // use_kd_tree option is set, otherwise a MultiRegularCellGrid.
   protected def initialize_cell_grid(table: DistDocumentTable) = {
-    new MultiRegularCellGrid(degrees_per_cell,
-      params.width_of_multi_cell, table)
+    if (params.use_kd_tree) // kd-tree grid, sized by the kd_bucketsize option
+      new KdTreeCellGrid(table, params.kd_bucketsize)
+    else // same regular grid that was built unconditionally before this change
+      new MultiRegularCellGrid(degrees_per_cell,
+        params.width_of_multi_cell, table)
   }
 
   protected def initialize_word_dist_factory() = {

File src/main/scala/opennlp/textgrounder/geolocate/KDTreeCellGrid.scala

 package opennlp.textgrounder.geolocate
 
 import scala.collection.JavaConversions._
-import scala.collection.immutable.Map
+import scala.collection.mutable.Map
 
 import ags.utils.KdTree
 
 import opennlp.textgrounder.util.distances.Coord
 
+import GeolocateDriver.Args
+
 class KdTreeCell(
   cellgrid: KdTreeCellGrid,
   val kdleaf : KdTree[DistDocument]) extends RectangularCell(cellgrid) {
     /** Return the documents held in this cell's kd-tree leaf (`kdleaf.getData()`). */
     def iterate_documents () : Iterable[DistDocument] = {
         kdleaf.getData()
     }
+
+    /**
+     * Return the coordinate used as this cell's center.
+     *
+     * When the center-method option (`Args.kd_center_or_centroid`) is
+     * "center", this defers to the superclass implementation.  Otherwise
+     * it returns the centroid, i.e. the arithmetic mean of the latitudes
+     * and longitudes of the documents stored in `kdleaf`.
+     *
+     * An empty leaf has no centroid (the mean would be 0.0 / 0, i.e. NaN),
+     * so in that case the superclass center is returned as well.
+     *
+     * NOTE(review): plain averaging of longitudes does not account for
+     * wrap-around at +/-180 degrees -- confirm this is acceptable.
+     */
+    override def get_center_coord () = {
+      val num_docs = kdleaf.size
+      if (Args.kd_center_or_centroid == "center" || num_docs == 0) {
+        // center method (also the fallback for a leaf with no documents)
+        super.get_center_coord
+      } else {
+        // centroid method
+        var sum_lat = 0.0
+        var sum_long = 0.0
+        for (art <- kdleaf.getData) {
+          sum_lat += art.coord.lat
+          sum_long += art.coord.long
+        }
+        Coord(sum_lat / num_docs, sum_long / num_docs)
+      }
+    }
     
     def describe_indices () : String = {
         "Placeholder"
    */
   var total_num_cells: Int = 0 // set to the number of kd-tree leaves by initialize_cells
   var kdtree : KdTree[DistDocument] = new KdTree[DistDocument](2, bucketSize); // presumably 2 = number of dimensions (lat, long) -- TODO confirm against KdTree's constructor
+  val leaves_to_cell : Map[KdTree[DistDocument], KdTreeCell] = Map(); // mutable map from kd-tree leaf to its cell; filled by initialize_cells
 
   /**
    * Find the correct cell for the given coordinates.  If no such cell
    * exists, return null.
    *
    * The cell is looked up in `leaves_to_cell` by the kd-tree leaf that
    * `kdtree.getLeaf` returns for (lat, long).  A leaf with no entry in
    * the map (e.g. before `initialize_cells` has populated it) yields
    * null, as documented, rather than a NoSuchElementException from
    * `Map.apply`.
    */
   def find_best_cell_for_coord(coord: Coord): KdTreeCell = {
-      new KdTreeCell(this, kdtree.getLeaf(Array(coord.lat, coord.long)))
+      leaves_to_cell.getOrElse(
+        kdtree.getLeaf(Array(coord.lat, coord.long)), null)
   }
 
   /**
    * Record the number of kd-tree leaves in `total_num_cells` and
    * `num_non_empty_cells`, then create one KdTreeCell per leaf, call
    * `generate_dist` on it, and store it in `leaves_to_cell` keyed by
    * that leaf.
    */
   def initialize_cells: Unit = {
       total_num_cells = kdtree.getLeaves.size
+      num_non_empty_cells = total_num_cells // NOTE(review): counts every leaf, empty or not -- confirm intended
+
+      for (leaf <- kdtree.getLeaves) {
+        val c = new KdTreeCell(this, leaf)
+        c.generate_dist
+        leaves_to_cell.update(leaf, c) // cached so later lookups reuse this cell instead of building a new one
+      }
   }
 
   /**
    *   but have no corresponding word counts given in the counts file.)
    */
   def iter_nonempty_cells(nonempty_word_dist: Boolean = false): Iterable[GeoCell] = {
       // NOTE(review): with the default nonempty_word_dist = false the filter
       // below passes every leaf, including empty ones -- confirm this matches
       // what callers expect from a method named iter_nonempty_cells.
-      for (leaf <- kdtree.getLeaves)
-          yield new KdTreeCell(this, leaf)
+      for (leaf <- kdtree.getLeaves
+        if (leaf.size() > 0 || !nonempty_word_dist)) // leaves with size() == 0 are dropped only when nonempty_word_dist is true
+          yield leaves_to_cell(leaf) // cell stored by initialize_cells; Map.apply throws if the leaf has no entry
   }
 }