Commits

Ben Wing committed merge commit d36bb54

Automatic merge

  • Parent commits 8278a35, 92173ab

Files changed (34)

File bin/tg-geolocate

 wikipedia         Run on Wikipedia.
 twitter           Run on the GeoText twitter corpus.
 twitter-wiki      Run on a combination of both of the above corpora.
+gutonly-small     Run on the GeoTwitterUT small corpus.
 
 The option '--hadoop' causes the application to be run using Hadoop
 (a package for large-scale parallel processing).  Before doing this,

File bin/tg-get-corpus-args

 twitter           Run on the GeoText twitter corpus, with document threshold 5.
 twitter-wiki      Run on a combination of the 'wikipedia' and 'twitter'
                   corpora.
+gutonly-small     Run on the GeoTwitterUT small corpus.
 *                 Run on some other corpus in the corpus dir (located
                   at $TG_CORPUS_DIR).
 
       echo --input-corpus $TG_CORPUS_DIR/twitter-geotext/$1 ;;
     twitter-wiki )
       output_args wikipedia; output_args twitter ;;
+    gutonly-small )
+      echo --input-corpus $TG_CORPUS_DIR/gutonly-small ;;
     * )
       echo --input-corpus $TG_CORPUS_DIR/$1 ;;
   esac

File src/main/java/opennlp/textgrounder/util/KMLUtil.java

     w.writeEndElement(); // PolyStyle
     w.writeStartElement("LineStyle");
     KMLUtil.writeWithCharacters(w, "color", "ff0000ff");
-    KMLUtil.writeWithCharacters(w, "with", "3");
+    KMLUtil.writeWithCharacters(w, "width", "3");
     w.writeEndElement(); // LineStyle
     w.writeStartElement("IconStyle");
     w.writeEmptyElement("Icon");
     w.writeEndElement(); // Style
 
     w.writeStartElement("Style");
+    w.writeAttribute("id", "whiteLine");
+    w.writeStartElement("LineStyle");
+    KMLUtil.writeWithCharacters(w, "color", "ffffffff");
+    KMLUtil.writeWithCharacters(w, "width", "3");
+    w.writeEndElement(); // LineStyle
+    w.writeEndElement(); // Style
+
+    w.writeStartElement("Style");
+    w.writeAttribute("id", "redLine");
+    w.writeStartElement("LineStyle");
+    KMLUtil.writeWithCharacters(w, "color", "ff0000ff");
+    KMLUtil.writeWithCharacters(w, "width", "3");
+    w.writeEndElement(); // LineStyle
+    w.writeEndElement(); // Style
+
+    w.writeStartElement("Style");
     w.writeAttribute("id", "blue");
     w.writeStartElement("IconStyle");
-    //KMLUtil.writeWithCharacters(w, "color", "ffff0000");
     w.writeStartElement("Icon");
     KMLUtil.writeWithCharacters(w, "href", "http://maps.google.com/mapfiles/kml/paddle/blu-blank-lv.png");
-    //KMLUtil.writeWithCharacters(w, "scale", "0.5);
     w.writeEndElement(); // Icon
     w.writeEndElement(); // IconStyle
     w.writeEndElement(); // Style
     w.writeStartElement("Style");
     w.writeAttribute("id", "yellow");
     w.writeStartElement("IconStyle");
-    //KMLUtil.writeWithCharacters(w, "color", "ffff0000");
     w.writeStartElement("Icon");
     KMLUtil.writeWithCharacters(w, "href", "http://maps.google.com/mapfiles/kml/paddle/ylw-blank-lv.png");
-    //KMLUtil.writeWithCharacters(w, "scale", "10.0");
+    w.writeEndElement(); // Icon
+    w.writeEndElement(); // IconStyle
+    w.writeEndElement(); // Style
+
+    w.writeStartElement("Style");
+    w.writeAttribute("id", "green");
+    w.writeStartElement("IconStyle");
+    w.writeStartElement("Icon");
+    KMLUtil.writeWithCharacters(w, "href", "http://maps.google.com/mapfiles/kml/paddle/grn-blank-lv.png");
     w.writeEndElement(); // Icon
     w.writeEndElement(); // IconStyle
     w.writeEndElement(); // Style
 
     public static void writeLinePlacemark(XMLStreamWriter w, Coordinate coord1, Coordinate coord2)
                                           throws XMLStreamException {
+        writeLinePlacemark(w, coord1, coord2, "whiteLine");
+    }
+
+    public static void writeLinePlacemark(XMLStreamWriter w, Coordinate coord1, Coordinate coord2, String styleUrl)
+                                          throws XMLStreamException {
+        w.writeStartElement("Placemark");
+        KMLUtil.writeWithCharacters(w, "styleUrl", "#"+styleUrl);
+        w.writeStartElement("LineString");
+        KMLUtil.writeWithCharacters(w, "gx:altitudeOffset", "0");
+        KMLUtil.writeWithCharacters(w, "extrude", "1");
+        KMLUtil.writeWithCharacters(w, "tessellate", "1");
+        KMLUtil.writeWithCharacters(w, "altitudeMode", "clampToGround");
+        KMLUtil.writeWithCharacters(w, "gx:drawOrder", "0");
+        w.writeStartElement("coordinates");
+        w.writeCharacters(df.format(coord1.getLngDegrees())+","+df.format(coord1.getLatDegrees())+",0\n");
+        w.writeCharacters(df.format(coord2.getLngDegrees())+","+df.format(coord2.getLatDegrees())+",0\n");
+        w.writeEndElement(); // coordinates
+        w.writeEndElement(); // LineString
+        w.writeEndElement(); // Placemark
+    }
+
+    public static void writeArcLinePlacemark(XMLStreamWriter w, Coordinate coord1, Coordinate coord2)
+                                             throws XMLStreamException {
         w.writeStartElement("Placemark");
         KMLUtil.writeWithCharacters(w, "styleUrl", "#bar");
         w.writeStartElement("LineString");
     w.writeEndElement(); // Point
     w.writeEndElement(); // Placemark*/
 
-      writePlacemark(w, name, coord, radius);
+    writePlacemark(w, name, coord, radius);
 
     w.writeStartElement("Placemark");
     KMLUtil.writeWithCharacters(w, "name", name + " POLYGON");

File src/main/scala/opennlp/textgrounder/geolocate/DocumentRankerByError.scala

     }
 
     val docsAndErrors:List[(String, Double, Coordinate, Coordinate)] =
-      (for((docName, trueCoord, predCoord) <- LogUtil.parseLogFile(logFile.value.get)) yield {
+      (for((docName, trueCoord, predCoord, neighbors) <- LogUtil.parseLogFile(logFile.value.get)) yield {
         val dist = trueCoord.distanceInKm(predCoord)
 
         (docName, dist, trueCoord, predCoord)
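
This change (and the matching ones in ErrorKMLGenerator and KNNKMLGenerator) assumes LogUtil.parseLogFile now yields a fourth element holding the nearest-neighbor list. An illustrative sketch of consuming the extended tuple (element types are inferred from the call sites in this diff, not from LogUtil itself; `path` stands for the log file name):

    for ((docName, trueCoord, predCoord, neighbors) <- LogUtil.parseLogFile(path)) {
      val errorKm = trueCoord.distanceInKm(predCoord)
      for ((neighborCoord, rank) <- neighbors)
        println(docName + ": rank " + rank + " neighbor at " + neighborCoord +
                " (prediction error " + errorKm + " km)")
    }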

File src/main/scala/opennlp/textgrounder/geolocate/ErrorKMLGenerator.scala

 
 object ErrorKMLGenerator {
 
-  /*val DOC_PREFIX = "Document "
-  //val DOC_PREFIX = "Article "
-  val TRUE_COORD_PREFIX = ") at ("
-  val PRED_COORD_PREFIX = " predicted cell center at ("*/
-  
-
   val factory = XMLOutputFactory.newInstance
 
-  /*def parseLogFile(filename: String): List[(String, Coordinate, Coordinate)] = {
-    val lines = scala.io.Source.fromFile(filename).getLines
-
-    var docName:String = null
-    var trueCoord:Coordinate = null
-    var predCoord:Coordinate = null
-
-    (for(line <- lines) yield {
-      if(line.startsWith("#")) {
-
-        if(line.contains(DOC_PREFIX)) {
-          var startIndex = line.indexOf(DOC_PREFIX) + DOC_PREFIX.length
-          var endIndex = line.indexOf("(", startIndex)
-          docName = line.slice(startIndex, endIndex)
-          
-          startIndex = line.indexOf(TRUE_COORD_PREFIX) + TRUE_COORD_PREFIX.length
-          endIndex = line.indexOf(")", startIndex)
-          val rawCoords = line.slice(startIndex, endIndex).split(",")
-          trueCoord = Coordinate.fromDegrees(rawCoords(0).toDouble, rawCoords(1).toDouble)
-          None
-        }
-
-        else if(line.contains(PRED_COORD_PREFIX)) {
-          val startIndex = line.indexOf(PRED_COORD_PREFIX) + PRED_COORD_PREFIX.length
-          val endIndex = line.indexOf(")", startIndex)
-          val rawCoords = line.slice(startIndex, endIndex).split(",")
-          predCoord = Coordinate.fromDegrees(rawCoords(0).toDouble, rawCoords(1).toDouble)
-
-          Some((docName, trueCoord, predCoord))
-        }
-
-        else None
-      }
-      else None
-    }).flatten.toList
-
-  }*/
-
   import ArgotConverters._
 
   val parser = new ArgotParser("textgrounder run opennlp.textgrounder.geolocate.ErrorKMLGenerator", preUsage = Some("TextGrounder"))
 
     KMLUtil.writeHeader(out, "errors-at-"+(if(usePred.value == None) "true" else "pred"))
 
-    for((docName, trueCoord, predCoordOrig) <- LogUtil.parseLogFile(logFile.value.get)) {
+    for((docName, trueCoord, predCoordOrig, neighbors) <- LogUtil.parseLogFile(logFile.value.get)) {
       val predCoord = Coordinate.fromDegrees(predCoordOrig.getLatDegrees() + (rand.nextDouble() - 0.5) * .1,
                                              predCoordOrig.getLngDegrees() + (rand.nextDouble() - 0.5) * .1);
 
       val coord1 = if(usePred.value == None) trueCoord else predCoord
       val coord2 = if(usePred.value == None) predCoord else trueCoord
 
-      KMLUtil.writeLinePlacemark(out, coord1, coord2);
+      KMLUtil.writeArcLinePlacemark(out, coord1, coord2);
       KMLUtil.writePinPlacemark(out, docName, coord1, "yellow");
       //KMLUtil.writePlacemark(out, docName, coord1, KMLUtil.RADIUS);
       KMLUtil.writePinPlacemark(out, docName, coord2, "blue");

File src/main/scala/opennlp/textgrounder/geolocate/GenerateKML.scala

 }
 
 
-/* A factory that filters the distributions to contain only the words we
+/* A constructor that filters the distributions to contain only the words we
    care about, to save memory and time. */
-class FilterPseudoGoodTuringSmoothedWordDistFactory(
-    filter_words: Seq[String]
-  ) extends PseudoGoodTuringSmoothedWordDistFactory {
-  val oov = "-OOV-"
-  override def set_unigram_word_dist(doc: GenericDistDocument,
-      keys: Array[String], values: Array[Int], num_words: Int,
-      is_training_set: Boolean) {
-    val (newkeys, newvalues) =
-      (for ((k, v) <- (keys zip values).take(num_words);
-           newk = if (filter_words contains k) k else oov)
-         yield (k, v)).unzip
-    doc.dist = new PseudoGoodTuringSmoothedWordDist(this,
-        newkeys.toArray, newvalues.toArray, newkeys.length,
-        note_globally = is_training_set)
+class FilterUnigramWordDistConstructor(
+    factory: WordDistFactory,
+    filter_words: Seq[String],
+    ignore_case: Boolean,
+    stopwords: Set[String],
+    minimum_word_count: Int = 1
+  ) extends DefaultUnigramWordDistConstructor(
+    factory, ignore_case, stopwords, minimum_word_count
+  ) {
+
+  override def finish_before_global(dist: WordDist) {
+    super.finish_before_global(dist)
+
+    val counts = dist.asInstanceOf[UnigramWordDist].counts
+    val oov = memoize_string("-OOV-")
+
+    // Filter the words we don't care about, to save memory and time.
+    for ((word, count) <- counts
+         if !(filter_words contains unmemoize_string(word))) {
+      counts -= word
+      counts(oov) += count
+    }
   }
 }
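
The filter above folds every word outside `filter_words` into a single -OOV- count. A self-contained toy of the same fold, using plain strings instead of memoized words (the counts are invented):

    import scala.collection.mutable

    val filterWords = Seq("austin", "dallas")
    val counts = mutable.Map("austin" -> 3, "the" -> 10, "dallas" -> 2, "river" -> 4)
    val oov = "-OOV-"
    // Snapshot with toList so we can safely remove entries while iterating.
    for ((word, count) <- counts.toList if !(filterWords contains word)) {
      counts -= word
      counts(oov) = counts.getOrElse(oov, 0) + count
    }
    // counts is now Map("austin" -> 3, "dallas" -> 2, "-OOV-" -> 14)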
 
     params.split_kml_words = params.kml_words.split(',')
   }
 
-  override def initialize_word_dist_suffix() = 
-    DistDocument.unigram_counts_suffix
-  override def initialize_word_dist_factory() = {
-    new FilterPseudoGoodTuringSmoothedWordDistFactory(
-      params.split_kml_words)
+  override protected def initialize_word_dist_constructor(
+      factory: WordDistFactory) = {
+    if (num_ngrams > 1)
+      param_error("Only unigram word distributions work with GenerateKML")
+    val the_stopwords = get_stopwords()
+    /* if (num_ngrams == 2)
+      new FilterBigramWordDistConstructor(factory, ...)
+    else */
+      new FilterUnigramWordDistConstructor(
+        factory,
+        params.split_kml_words,
+        ignore_case = !params.preserve_case_words,
+        stopwords = the_stopwords,
+        minimum_word_count = params.minimum_word_count)
   }
 
   /**

File src/main/scala/opennlp/textgrounder/geolocate/KNNKMLGenerator.scala

+///////////////////////////////////////////////////////////////////////////////
+//  Copyright (C) 2011 The University of Texas at Austin
+//
+//  Licensed under the Apache License, Version 2.0 (the "License");
+//  you may not use this file except in compliance with the License.
+//  You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+//  Unless required by applicable law or agreed to in writing, software
+//  distributed under the License is distributed on an "AS IS" BASIS,
+//  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//  See the License for the specific language governing permissions and
+//  limitations under the License.
+///////////////////////////////////////////////////////////////////////////////
+
+////////
+//////// KNNKMLGenerator.scala
+////////
+//////// Copyright (c) 2012.
+////////
+
+package opennlp.textgrounder.geolocate
+
+import java.io._
+import javax.xml.datatype._
+import javax.xml.stream._
+import opennlp.textgrounder.topo._
+import opennlp.textgrounder.util.KMLUtil
+import opennlp.textgrounder.util.LogUtil
+import scala.collection.JavaConversions._
+import org.clapper.argot._
+
+object KNNKMLGenerator {
+
+  val factory = XMLOutputFactory.newInstance
+  val rand = new scala.util.Random
+
+  import ArgotConverters._
+
+  val parser = new ArgotParser("textgrounder run opennlp.textgrounder.geolocate.KNNKMLGenerator", preUsage = Some("TextGrounder"))
+  val logFile = parser.option[String](List("l", "log"), "log", "log input file")
+  val kmlOutFile = parser.option[String](List("k", "kml"), "kml", "kml output file")
+
+  def main(args: Array[String]) {
+    try {
+      parser.parse(args)
+    }
+    catch {
+      case e: ArgotUsageException => println(e.message); sys.exit(0)
+    }
+
+    if(logFile.value == None) {
+      println("You must specify a log input file via -l.")
+      sys.exit(0)
+    }
+    if(kmlOutFile.value == None) {
+      println("You must specify a KML output file via -k.")
+      sys.exit(0)
+    }
+
+    val outFile = new File(kmlOutFile.value.get)
+    val stream = new BufferedOutputStream(new FileOutputStream(outFile))
+    val out = factory.createXMLStreamWriter(stream, "UTF-8")
+
+    KMLUtil.writeHeader(out, "knn")
+
+    for((docName, trueCoord, predCoord, neighbors) <- LogUtil.parseLogFile(logFile.value.get)) {
+
+      val jPredCoord = jitter(predCoord)
+
+      KMLUtil.writePinPlacemark(out, docName, trueCoord)
+      KMLUtil.writePinPlacemark(out, docName, jPredCoord, "blue")
+      KMLUtil.writePlacemark(out, "#1", jPredCoord, KMLUtil.RADIUS*10)
+      KMLUtil.writeLinePlacemark(out, trueCoord, jPredCoord, "redLine")
+
+      for((neighbor, rank) <- neighbors) {
+        val jNeighbor = jitter(neighbor)
+        /*if(rank == 1) {
+          KMLUtil.writePlacemark(out, "#1", neighbor, KMLUtil.RADIUS*10)
+        }*/
+        if(rank != 1) {
+          KMLUtil.writePlacemark(out, "#"+rank, jNeighbor, KMLUtil.RADIUS*10)
+          KMLUtil.writePinPlacemark(out, docName, jNeighbor, "green")
+          /*if(!neighbor.equals(predCoord))*/ KMLUtil.writeLinePlacemark(out, trueCoord, jNeighbor)
+        }
+      }
+
+    }
+
+    KMLUtil.writeFooter(out)
+
+    out.close
+  }
+
+  def jitter(coord:Coordinate): Coordinate = {
+    Coordinate.fromDegrees(coord.getLatDegrees() + (rand.nextDouble() - 0.5) * .1,
+                           coord.getLngDegrees() + (rand.nextDouble() - 0.5) * .1);
+  }
+}

File src/main/scala/opennlp/textgrounder/geolocate/SphereEvaluation.scala

       true_rank: Int) =
     new SphereDocumentEvaluationResult(document, pred_cell, true_rank)
 
+  val num_nearest_neighbors = driver.params.num_nearest_neighbors
+
   def print_individual_result(doctag: String, document: SphereDocument,
       result: SphereDocumentEvaluationResult,
       pred_cells: Array[(SphereCell, Double)]) {
       errprint("%s:  Predicted cell (at rank %s, kl-div %s): %s",
         doctag, i + 1, pred_cells(i)._2, pred_cells(i)._1)
     }
+
+    //for (num_nearest_neighbors <- 2 to 100 by 2) {
+    val kNN = pred_cells.take(num_nearest_neighbors).map(_._1)
+    val kNNranks = pred_cells.take(num_nearest_neighbors).zipWithIndex.map(p => (p._1._1, p._2+1)).toMap
+    val closest_half_with_dists = kNN.map(n => (n, spheredist(n.get_center_coord, document.coord))).sortWith(_._2 < _._2).take(num_nearest_neighbors/2)
+
+    closest_half_with_dists.zipWithIndex.foreach(c => errprint("%s:  #%s close neighbor: %s; error distance: %.2f km",
+      doctag, kNNranks(c._1._1), c._1._1.get_center_coord, c._1._2))
+
     errprint("%s:  Distance %.2f km to true cell center at %s",
       doctag, result.true_truedist, result.true_center)
     errprint("%s:  Distance %.2f km to predicted cell center at %s",
       doctag, result.pred_truedist, result.pred_center)
+
+    val avg_dist_of_neighbors = mean(closest_half_with_dists.map(_._2))
+    errprint("%s:  Average distance from true cell center to %s closest cells' centers from %s best matches: %.2f km",
+      doctag, (num_nearest_neighbors/2), num_nearest_neighbors, avg_dist_of_neighbors)
+
+    if(avg_dist_of_neighbors < result.pred_truedist)
+      driver.increment_local_counter("instances.num_where_avg_dist_of_neighbors_beats_pred_truedist.%s" format num_nearest_neighbors)
+    //}
+
+  
     assert(doctag(0) == '#')
     if (debug("gridrank") ||
       (debuglist("gridrank") contains doctag.drop(1))) {
 
   def evaluate_document(doc: TitledDocument, doctag: String) = {
     val dist = driver.word_dist_factory.create_word_dist()
-    val the_stopwords =
-      if (driver.params.include_stopwords_in_document_dists) Set[String]()
-      else driver.stopwords
-    for (text <- Seq(doc.title, doc.text)) {
-      dist.add_document(split_text_into_words(text, ignore_punc = true),
-        ignore_case = !driver.params.preserve_case_words,
-        stopwords = the_stopwords)
-    }
-    dist.finish(minimum_word_count = driver.params.minimum_word_count)
+    for (text <- Seq(doc.title, doc.text))
+      dist.add_document(split_text_into_words(text, ignore_punc = true))
+    dist.finish_before_global()
+    dist.finish_after_global()
     val cells = strategy.return_ranked_cells(dist)
     errprint("")
     errprint("Document with title: %s", doc.title)

File src/main/scala/opennlp/textgrounder/geolocate/toponym/Toponym.scala

     combined_dist = new CombinedWordDist(word_dist_factory)
     for (loc <- Seq(this) ++ goodlocs if loc.docmatch != null)
       yield combined_dist.add_document(loc.docmatch)
-    combined_dist.word_dist.finish(minimum_word_count =
-      Params.minimum_word_count)
+    combined_dist.word_dist.finish_before_global()
+    combined_dist.word_dist.finish_after_global()
   }
 
   def contains(coord: SphereCoord) = boundary contains coord
           warning("Couldn't find existing cell distribution for document %s",
             this)
           combined_dist = new CombinedWordDist(table.word_dist_factory)
-          combined_dist.word_dist.finish()
+          combined_dist.word_dist.finish_before_global()
+          combined_dist.word_dist.finish_after_global()
         }
       }
       combined_dist

File src/main/scala/opennlp/textgrounder/gridlocate/Cell.scala

   def is_empty() = num_docs_for_links == 0
 
   /**
-   *  Add the given document to the total distribution seen so far
+   *  Add the given document to the total distribution seen so far.
+   *  `partial` is a scaling factor (between 0.0 and 1.0) used for
+   *  interpolating multiple distributions.
    */
-  def add_document(doc: DistDocument[_]) {
+  def add_document(doc: DistDocument[_], partial: Double = 1.0) {
     /* Formerly, we arranged things so that we were passed in all documents,
        regardless of the split.  The reason for this was that the decision
        was made to accumulate link counts from all documents, even in the
       if (Params.max_time_per_stage == 0.0 && Params.num_training_docs == 0)
         warning("Saw document %s without distribution", doc)
     } else {
-      word_dist.add_word_distribution(doc.dist)
-      num_docs_for_word_dist += 1
-    }
-  }
-
-  def add_document_partial(doc: DistDocument[_],partial: Double) {
-    assert (doc.split == "training")
-
-    /* Add link count of document to cell. */
-    doc.incoming_links match {
-      // Might be None, for unknown link count
-      case Some(x) => incoming_links += x
-      case _ =>
-    }
-    num_docs_for_links += 1
-
-    if (doc.dist == null) {
-      if (Params.max_time_per_stage == 0.0 && Params.num_training_docs == 0)
-        warning("Saw document %s without distribution", doc)
-    } else {
-
-      word_dist.add_word_distribution_partial(doc.dist,partial)
+      word_dist.add_word_distribution(doc.dist, partial)
       num_docs_for_word_dist += 1
     }
   }
    */
   def finish() {
     assert(!finished)
-    combined_dist.word_dist.finish(
-      minimum_word_count = cell_grid.table.driver.params.minimum_word_count)
+    combined_dist.word_dist.finish_before_global()
+    combined_dist.word_dist.finish_after_global()
   }
 }
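
The new `partial` parameter above scales a document's contribution when it is added to a cell, which is what makes interpolating several distributions possible. A self-contained toy of the same weighting (not the actual CombinedWordDist code; words, counts and weights are made up):

    import scala.collection.mutable

    // Analogous to calling add_document(doc, partial = w) once per document.
    def addWeighted(total: mutable.Map[String, Double],
                    docCounts: Map[String, Int], partial: Double) {
      for ((word, c) <- docCounts)
        total(word) = total.getOrElse(word, 0.0) + partial * c
    }

    val total = mutable.Map[String, Double]()
    addWeighted(total, Map("austin" -> 4, "river" -> 1), partial = 0.7)
    addWeighted(total, Map("austin" -> 2, "lake" -> 3), partial = 0.3)
    // total: "austin" -> 3.4, "river" -> 0.7, "lake" -> 0.9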
 

File src/main/scala/opennlp/textgrounder/gridlocate/DistDocument.scala

         val is_training_set = (this.split == "training")
         val is_eval_set = (this.split == table.driver.params.eval_set)
         assert (is_training_set || is_eval_set)
-        table.word_dist_factory.initialize_distribution(this, value,
-          is_training_set)
-        dist.finish_before_global(minimum_word_count =
-          table.driver.params.minimum_word_count)
+        table.word_dist_factory.constructor.initialize_distribution(this,
+          value, is_training_set)
+        dist.finish_before_global()
       }
-      case _ => () // Just eat the  other parameters
+      case _ => () // Just eat the other parameters
     }
   }
 

File src/main/scala/opennlp/textgrounder/gridlocate/Evaluation.scala

         document, naitr)
     }
 
+    //val num_nearest_neighbors = 10
+
     /* That is:
 
        pred_cells = List of predicted cells, from best to worst; each list
     if (result.num_docs_in_true_cell == 0) {
       evalstats.increment_counter("documents.no_training_documents_in_cell")
     }
-    if (want_indiv_results)
+    if (want_indiv_results) {
+      //val cells_for_average = pred_cells.zip(pred_cells.map(_._1.center))
+      //for((cell, score) <- pred_cells) {
+      //  val scell = cell.asInstanceOf[GeoCell[GeoCoord, GeoDoc]]
+      //}
       print_individual_result(doctag, document, result, pred_cells)
+    }
 
     return result
   }

File src/main/scala/opennlp/textgrounder/gridlocate/GridLocate.scala

 ) extends GridLocateDocumentStrategy[TCell, TGrid](cell_grid) {
   def return_ranked_cells(word_dist: WordDist) = {
     val cells = cell_grid.iter_nonempty_cells()
-    val shuffled = (new Random()).shuffle(cells)
-    (for (cell <- shuffled) yield (cell, 0.0))
+      val shuffled = (new Random()).shuffle(cells)
+      (for (cell <- shuffled) yield (cell, 0.0))
+    }
   }
-}
-
-/**
- * Class that implements a simple baseline strategy -- pick the "most
- * popular" cell (the one either with the largest number of documents, or
- * the most number of links pointing to it, if `internal_link` is true).
- */
-
-class MostPopularCellGridLocateDocumentStrategy[
-  TCell <: GenericGeoCell,
-  TGrid <: CellGenericCellGrid[TCell]
-](
-  cell_grid: TGrid,
-  internal_link: Boolean
-) extends GridLocateDocumentStrategy[TCell, TGrid](cell_grid) {
-  var cached_ranked_mps: Iterable[(TCell, Double)] = null
-  def return_ranked_cells(word_dist: WordDist) = {
-    if (cached_ranked_mps == null) {
-      cached_ranked_mps = (
-        (for (cell <- cell_grid.iter_nonempty_cells())
-          yield (cell,
-            (if (internal_link)
-               cell.combined_dist.incoming_links
-             else
-               cell.combined_dist.num_docs_for_links).toDouble)).
-        toArray sortWith (_._2 > _._2))
-    }
-    cached_ranked_mps
-  }
-}
-
-/**
- * Abstract class that implements a strategy for document geolocation that
- * involves directly comparing the document distribution against each cell
- * in turn and computing a score.
- *
- * @param prefer_minimum If true, lower scores are better; if false, higher
- *   scores are better.
- */
-abstract class MinMaxScoreStrategy[
-  TCell <: GenericGeoCell,
-  TGrid <: CellGenericCellGrid[TCell]
-](
-  cell_grid: TGrid,
-  prefer_minimum: Boolean
-) extends GridLocateDocumentStrategy[TCell, TGrid](cell_grid) {
-  /**
-   * Function to return the score of a document distribution against a
-   * cell.
-   */
-  def score_cell(word_dist: WordDist, cell: TCell): Double
 
   /**
-   * Compare a word distribution (for a document, typically) against all
-   * cells. Return a sequence of tuples (cell, score) where 'cell'
-   * indicates the cell and 'score' the score.
+   * Class that implements a simple baseline strategy -- pick the "most
+   * popular" cell (the one either with the largest number of documents, or
+   * the largest number of links pointing to it, if `internal_link` is true).
    */
-  def return_ranked_cells(word_dist: WordDist) = {
-    val old = true
-    val cell_buf =
-      if (old) {
-      /*
-       The "old" (non-parallel) way of doing things; Stephen resurrected it when
-       merging the Dirichlet stuff.  Attempting to use the parallel method
-       caused an assertion failure after about 1200 of 1895 documents using
-       GeoText.
+
+  class MostPopularCellGridLocateDocumentStrategy[
+    TCell <: GenericGeoCell,
+    TGrid <: CellGenericCellGrid[TCell]
+  ](
+    cell_grid: TGrid,
+    internal_link: Boolean
+  ) extends GridLocateDocumentStrategy[TCell, TGrid](cell_grid) {
+    var cached_ranked_mps: Iterable[(TCell, Double)] = null
+    def return_ranked_cells(word_dist: WordDist) = {
+      if (cached_ranked_mps == null) {
+        cached_ranked_mps = (
+          (for (cell <- cell_grid.iter_nonempty_cells())
+            yield (cell,
+              (if (internal_link)
+                 cell.combined_dist.incoming_links
+               else
+                 cell.combined_dist.num_docs_for_links).toDouble)).
+          toArray sortWith (_._2 > _._2))
+      }
+      cached_ranked_mps
+    }
+  }
+
+  /**
+   * Abstract class that implements a strategy for document geolocation that
+   * involves directly comparing the document distribution against each cell
+   * in turn and computing a score.
+   *
+   * @param prefer_minimum If true, lower scores are better; if false, higher
+   *   scores are better.
+   */
+  abstract class MinMaxScoreStrategy[
+    TCell <: GenericGeoCell,
+    TGrid <: CellGenericCellGrid[TCell]
+  ](
+    cell_grid: TGrid,
+    prefer_minimum: Boolean
+  ) extends GridLocateDocumentStrategy[TCell, TGrid](cell_grid) {
+    /**
+     * Function to return the score of a document distribution against a
+     * cell.
+     */
+    def score_cell(word_dist: WordDist, cell: TCell): Double
+
+    /**
+     * Compare a word distribution (for a document, typically) against all
+     * cells. Return a sequence of tuples (cell, score) where 'cell'
+     * indicates the cell and 'score' the score.
+     */
+    def return_ranked_cells(word_dist: WordDist) = {
+      val old = true
+      val cell_buf =
+        if (old) {
+        /*
+         The "old" (non-parallel) way of doing things; Stephen resurrected it when
+         merging the Dirichlet stuff.  Attempting to use the parallel method
+         caused an assertion failure after about 1200 of 1895 documents using
+         GeoText.
+         */
+          val buffer = mutable.Buffer[(TCell, Double)]()
+
+          for (cell <- cell_grid.iter_nonempty_cells(nonempty_word_dist = true)) {
+            if (debug("lots")) {
+              errprint("Nonempty cell at indices %s = location %s, num_documents = %s",
+                cell.describe_indices(), cell.describe_location(),
+                cell.combined_dist.num_docs_for_word_dist)
+            }
+
+            val score = score_cell(word_dist, cell)
+            buffer += ((cell, score))
+          }
+          buffer
+        } else {
+          /* The new way of doing things */
+          val cells = cell_grid.iter_nonempty_cells(nonempty_word_dist = true)
+          cells.par.map(c => (c, score_cell(word_dist, c))).toBuffer
+        }
+
+      /* SCALABUG:
+         If written simply as 'cell_buf sortWith (_._2 < _._2)',
+         return type is mutable.Buffer.  However, if written as an
+         if/then as follows, return type is Iterable, even though both
+         forks have the same type of mutable.buffer!
        */
-        val buffer = mutable.Buffer[(TCell, Double)]()
+      val retval =
+        if (prefer_minimum)
+          cell_buf sortWith (_._2 < _._2)
+        else
+          cell_buf sortWith (_._2 > _._2)
+      /* If using the new way, this code applies for debugging (old way has
+         the debugging code embedded into it). */
+      if (!old && debug("lots")) {
+        for ((cell, score) <- retval)
+          errprint("Nonempty cell at indices %s = location %s, num_documents = %s, score = %s",
+            cell.describe_indices(), cell.describe_location(),
+            cell.combined_dist.num_docs_for_word_dist, score)
+      }
+      retval
+    }
+  }
 
-        for (cell <- cell_grid.iter_nonempty_cells(nonempty_word_dist = true)) {
-          if (debug("lots")) {
-            errprint("Nonempty cell at indices %s = location %s, num_documents = %s",
-              cell.describe_indices(), cell.describe_location(),
-              cell.combined_dist.num_docs_for_word_dist)
-          }
+  /**
+   * Class that implements a strategy for document geolocation by computing
+   * the KL-divergence between document and cell (approximately, how much
+   * the word distributions differ).  Note that the KL-divergence as currently
+   * implemented uses the smoothed word distributions.
+   *
+   * @param partial If true (the default), only do "partial" KL-divergence.
+   * This only computes the divergence involving words in the document
+   * distribution, rather than considering all words in the vocabulary.
+   * @param symmetric If true, do a symmetric KL-divergence by computing
+   * the divergence in both directions and averaging the two values.
+   * (Not by default; the comparison is fundamentally asymmetric in
+   * any case since it's comparing documents against cells.)
+   */
+  class KLDivergenceStrategy[
+    TCell <: GenericGeoCell,
+    TGrid <: CellGenericCellGrid[TCell]
+  ](
+    cell_grid: TGrid,
+    partial: Boolean = true,
+    symmetric: Boolean = false
+  ) extends MinMaxScoreStrategy[TCell, TGrid](cell_grid, true) {
 
-          val score = score_cell(word_dist, cell)
-          buffer += ((cell, score))
+    def score_cell(word_dist: WordDist, cell: TCell) = {
+      val cell_word_dist = cell.combined_dist.word_dist
+      var kldiv =
+        word_dist.kl_divergence(cell_word_dist, partial = partial)
+      if (symmetric) {
+        val kldiv2 = cell_word_dist.kl_divergence(word_dist, partial = partial)
+        kldiv = (kldiv + kldiv2) / 2.0
+      }
+      kldiv
+    }
+
+    override def return_ranked_cells(word_dist: WordDist) = {
+      val cells = super.return_ranked_cells(word_dist)
+
+      if (debug("kldiv") && word_dist.isInstanceOf[FastSlowKLDivergence]) {
+        val fast_slow_dist = word_dist.asInstanceOf[FastSlowKLDivergence]
+        // Print out the words that contribute most to the KL divergence, for
+        // the top-ranked cells
+        val num_contrib_cells = 5
+        val num_contrib_words = 25
+        errprint("")
+        errprint("KL-divergence debugging info:")
+        for (((cell, _), i) <- cells.take(num_contrib_cells) zipWithIndex) {
+          val (_, contribs) =
+            fast_slow_dist.slow_kl_divergence_debug(
+              cell.combined_dist.word_dist, partial = partial,
+              return_contributing_words = true)
+          errprint("  At rank #%s, cell %s:", i + 1, cell)
+          errprint("    %30s  %s", "Word", "KL-div contribution")
+          errprint("    %s", "-" * 50)
+          // sort by absolute value of second element of tuple, in reverse order
+          val items = (contribs.toArray sortWith ((x, y) => abs(x._2) > abs(y._2))).
+            take(num_contrib_words)
+          for ((word, contribval) <- items)
+            errprint("    %30s  %s", word, contribval)
+          errprint("")
         }
-        buffer
-      } else {
-        /* The new way of doing things */
-        val cells = cell_grid.iter_nonempty_cells(nonempty_word_dist = true)
-        cells.par.map(c => (c, score_cell(word_dist, c))).toBuffer
       }
 
-    /* SCALABUG:
-       If written simply as 'cell_buf sortWith (_._2 < _._2)',
-       return type is mutable.Buffer.  However, if written as an
-       if/then as follows, return type is Iterable, even though both
-       forks have the same type of mutable.buffer!
-     */
-    val retval =
-      if (prefer_minimum)
-        cell_buf sortWith (_._2 < _._2)
-      else
-        cell_buf sortWith (_._2 > _._2)
-    /* If using the new way, this code applies for debugging (old way has
-       the debugging code embedded into it). */
-    if (!old && debug("lots")) {
-      for ((cell, score) <- retval)
-        errprint("Nonempty cell at indices %s = location %s, num_documents = %s, score = %s",
-          cell.describe_indices(), cell.describe_location(),
-          cell.combined_dist.num_docs_for_word_dist, score)
+      cells
     }
-    retval
-  }
-}
-
-/**
- * Class that implements a strategy for document geolocation by computing
- * the KL-divergence between document and cell (approximately, how much
- * the word distributions differ).  Note that the KL-divergence as currently
- * implemented uses the smoothed word distributions.
- *
- * @param partial If true (the default), only do "partial" KL-divergence.
- * This only computes the divergence involving words in the document
- * distribution, rather than considering all words in the vocabulary.
- * @param symmetric If true, do a symmetric KL-divergence by computing
- * the divergence in both directions and averaging the two values.
- * (Not by default; the comparison is fundamentally asymmetric in
- * any case since it's comparing documents against cells.)
- */
-class KLDivergenceStrategy[
-  TCell <: GenericGeoCell,
-  TGrid <: CellGenericCellGrid[TCell]
-](
-  cell_grid: TGrid,
-  partial: Boolean = true,
-  symmetric: Boolean = false
-) extends MinMaxScoreStrategy[TCell, TGrid](cell_grid, true) {
-
-  def score_cell(word_dist: WordDist, cell: TCell) = {
-    val cell_word_dist = cell.combined_dist.word_dist
-    var kldiv =
-      word_dist.kl_divergence(cell_word_dist, partial = partial)
-    if (symmetric) {
-      val kldiv2 = cell_word_dist.kl_divergence(word_dist, partial = partial)
-      kldiv = (kldiv + kldiv2) / 2.0
-    }
-    kldiv
   }
 
-  override def return_ranked_cells(word_dist: WordDist) = {
-    val cells = super.return_ranked_cells(word_dist)
+  /**
+   * Class that implements a strategy for document geolocation by computing
+   * the cosine similarity between the distributions of document and cell.
+   * FIXME: We really should transform the distributions by TF/IDF before
+   * doing this.
+   *
+   * @param smoothed If true, use the smoothed word distributions. (By default,
+   * use unsmoothed distributions.)
+   * @param partial If true, only do "partial" cosine similarity.
+   * This only computes the similarity involving words in the document
+   * distribution, rather than considering all words in the vocabulary.
+   */
+  class CosineSimilarityStrategy[
+    TCell <: GenericGeoCell,
+    TGrid <: CellGenericCellGrid[TCell]
+  ](
+    cell_grid: TGrid,
+    smoothed: Boolean = false,
+    partial: Boolean = false
+  ) extends MinMaxScoreStrategy[TCell, TGrid](cell_grid, true) {
 
-    if (debug("kldiv") && word_dist.isInstanceOf[FastSlowKLDivergence]) {
-      val fast_slow_dist = word_dist.asInstanceOf[FastSlowKLDivergence]
-      // Print out the words that contribute most to the KL divergence, for
-      // the top-ranked cells
-      val num_contrib_cells = 5
-      val num_contrib_words = 25
-      errprint("")
-      errprint("KL-divergence debugging info:")
-      for (((cell, _), i) <- cells.take(num_contrib_cells) zipWithIndex) {
-        val (_, contribs) =
-          fast_slow_dist.slow_kl_divergence_debug(
-            cell.combined_dist.word_dist, partial = partial,
-            return_contributing_words = true)
-        errprint("  At rank #%s, cell %s:", i + 1, cell)
-        errprint("    %30s  %s", "Word", "KL-div contribution")
-        errprint("    %s", "-" * 50)
-        // sort by absolute value of second element of tuple, in reverse order
-        val items = (contribs.toArray sortWith ((x, y) => abs(x._2) > abs(y._2))).
-          take(num_contrib_words)
-        for ((word, contribval) <- items)
-          errprint("    %30s  %s", word, contribval)
-        errprint("")
+    def score_cell(word_dist: WordDist, cell: TCell) = {
+      var cossim =
+        word_dist.cosine_similarity(cell.combined_dist.word_dist,
+          partial = partial, smoothed = smoothed)
+      assert(cossim >= 0.0)
+      // Just in case of round-off problems
+      assert(cossim <= 1.002)
+      cossim = 1.002 - cossim
+      cossim
+    }
+  }
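
As a quick illustration of the cosine scoring described above (independent of WordDist; the two count vectors are invented):

    // Cosine similarity between a document's and a cell's unigram counts.
    def cosine(a: Map[String, Double], b: Map[String, Double]): Double = {
      val dot = a.keys.map(w => a(w) * b.getOrElse(w, 0.0)).sum
      val norm = (m: Map[String, Double]) => math.sqrt(m.values.map(v => v * v).sum)
      dot / (norm(a) * norm(b))
    }

    cosine(Map("austin" -> 2.0, "river" -> 1.0),
           Map("austin" -> 5.0, "lake" -> 3.0))  // ~0.767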
+
+  /** Use a Naive Bayes strategy for comparing document and cell. */
+  class NaiveBayesDocumentStrategy[
+    TCell <: GenericGeoCell,
+    TGrid <: CellGenericCellGrid[TCell]
+  ](
+    cell_grid: TGrid,
+    use_baseline: Boolean = true
+  ) extends MinMaxScoreStrategy[TCell, TGrid](cell_grid, false) {
+
+    def score_cell(word_dist: WordDist, cell: TCell) = {
+      val params = cell_grid.table.driver.params
+      // Determine respective weightings
+      val (word_weight, baseline_weight) = (
+        if (use_baseline) {
+          if (params.naive_bayes_weighting == "equal") (1.0, 1.0)
+          else {
+            val bw = params.naive_bayes_baseline_weight.toDouble
+            ((1.0 - bw) / word_dist.num_word_tokens, bw)
+          }
+        } else (1.0, 0.0))
+
+      val word_logprob =
+        cell.combined_dist.word_dist.get_nbayes_logprob(word_dist)
+      val baseline_logprob =
+        log(cell.combined_dist.num_docs_for_links.toDouble /
+            cell_grid.total_num_docs_for_links)
+      val logprob = (word_weight * word_logprob +
+        baseline_weight * baseline_logprob)
+      logprob
+    }
+  }
+
+  abstract class AverageCellProbabilityStrategy[
+    TCell <: GenericGeoCell,
+    XTGrid <: CellGenericCellGrid[TCell]
+  ](
+    cell_grid: XTGrid
+  ) extends GridLocateDocumentStrategy[TCell, XTGrid](cell_grid) {
+    type TCellDistFactory <:
+      CellDistFactory[_, _ <: GenericDistDocument, TCell] { type TGrid = XTGrid }
+    def create_cell_dist_factory(lru_cache_size: Int): TCellDistFactory
+
+    val cdist_factory =
+      create_cell_dist_factory(cell_grid.table.driver.params.lru_cache_size)
+
+    def return_ranked_cells(word_dist: WordDist) = {
+      val celldist =
+        cdist_factory.get_cell_dist_for_word_dist(cell_grid, word_dist)
+      celldist.get_ranked_cells()
+    }
+  }
+
+  /////////////////////////////////////////////////////////////////////////////
+  //                                Segmentation                             //
+  /////////////////////////////////////////////////////////////////////////////
+
+  // General idea: Keep track of best possible segmentations up to a maximum
+  // number of segments.  Either do it using a maximum number of segmentations
+  // (e.g. 100 or 1000) or all within a given factor of the best score (the
+  // "beam width", e.g. 10^-4).  Then given the existing best segmentations,
+  // we search for new segmentations with more segments by looking at all
+  // possible ways of segmenting each of the existing best segments, and
+  // finding the best score for each of these.  This is a slow process -- for
+  // each segmentation, we have to iterate over all segments, and for each
+  // segment we have to look at all possible ways of splitting it, and for
+  // each split we have to look at all assignments of cells to the two
+  // new segments.  It also seems that we're likely to consider the same
+  // segmentation multiple times.
+  //
+  // In the case of per-word cell dists, we can maybe speed things up by
+  // computing the non-normalized distributions over each paragraph and then
+  // summing them up as necessary.
+
+  /////////////////////////////////////////////////////////////////////////////
+  //                                   Stopwords                             //
+  /////////////////////////////////////////////////////////////////////////////
+
+  object Stopwords {
+    val stopwords_file_in_tg = "data/lists/stopwords.english"
+
+    // Read in the list of stopwords from the given filename.
+    def read_stopwords(filehand: FileHandler, stopwords_filename: String) = {
+      def compute_stopwords_filename(filename: String) = {
+        if (filename != null) filename
+        else {
+          val tgdir = TextGrounderInfo.get_textgrounder_dir
+          // Concatenate directory and rest in most robust way
+          filehand.join_filename(tgdir, stopwords_file_in_tg)
+        }
+      }
+      val filename = compute_stopwords_filename(stopwords_filename)
+      errprint("Reading stopwords from %s...", filename)
+      filehand.openr(filename).toSet
+    }
+  }
+
+  /////////////////////////////////////////////////////////////////////////////
+  //                                  Main code                              //
+  /////////////////////////////////////////////////////////////////////////////
+
+  /**
+   * General class retrieving command-line arguments or storing programmatic
+   * configuration parameters for a Cell-grid-based application.
+   *
+   * @param parser If specified, should be a parser for retrieving the
+   *   value of command-line arguments from the command line.  Provided
+   *   that the parser has been created and initialized by creating a
+   *   previous instance of this same class with the same parser (a
+   *   "shadow field" class), the variables below will be initialized with
+   *   the values given by the user on the command line.  Otherwise, they
+   *   will be initialized with the default values for the parameters.
+   *   Because they are vars, they can be freely set to other values.
+   *
+   */
+  class GridLocateParameters(parser: ArgParser = null) extends
+      ArgParserParameters(parser) {
+    protected val ap =
+      if (parser == null) new ArgParser("unknown") else parser
+
+    //// Input files
+    var stopwords_file =
+      ap.option[String]("stopwords-file",
+        metavar = "FILE",
+        help = """File containing list of stopwords.  If not specified,
+  a default list of English stopwords (stored in the TextGrounder distribution)
+  is used.""")
+
+    var input_corpus =
+      ap.multiOption[String]("i", "input-corpus",
+        metavar = "DIR",
+        help = """Directory containing an input corpus.  Documents in the
+  corpus can be Wikipedia articles, individual tweets in Twitter, the set of all
+  tweets for a given user, etc.  The corpus generally contains one or more
+  "views" on the raw data comprising the corpus, with different views
+  corresponding to differing ways of representing the original text of the
+  documents -- as raw, word-split text; as unigram word counts; as bigram word
+  counts; etc.  Each such view has a schema file and one or more document files.
+  The latter contains all the data for describing each document, including
+  title, split (training, dev or test) and other metadata, as well as the text
+  or word counts that are used to create the textual distribution of the
+  document.  The document files are laid out in a very simple database format,
+  consisting of one document per line, where each line is composed of a fixed
+  number of fields, separated by TAB characters. (E.g. one field would list
+  the title, another the split, another all the word counts, etc.) A separate
+  schema file lists the name of each expected field.  Some of these names
+  (e.g. "title", "split", "text", "coord") have pre-defined meanings, but
+  arbitrary names are allowed, so that additional corpus-specific information
+  can be provided (e.g. retweet info for tweets that were retweeted from some
+  other tweet, redirect info when a Wikipedia article is a redirect to another
+  article, etc.).
+
+  Multiple such files can be given by specifying the option multiple
+  times.""")
+    var eval_file =
+      ap.multiOption[String]("e", "eval-file",
+        metavar = "FILE",
+        help = """File or directory containing files to evaluate on.
+  Multiple such files/directories can be given by specifying the option multiple
+  times.  If a directory is given, all files in the directory will be
+  considered (but if an error occurs upon parsing a file, it will be ignored).
+  Each file is read in and then disambiguation is performed.  Not used during
+  document geolocation when --eval-format=internal (the default).""")
+
+    var num_nearest_neighbors =
+      ap.option[Int]("num-nearest-neighbors", "knn", default = 4,
+        help = """Number of nearest neighbors (k in kNN); default is %default.""")
+
+    //// Options indicating which documents to train on or evaluate
+    var eval_set =
+      ap.option[String]("eval-set", "es", metavar = "SET",
+        default = "dev",
+        choices = Seq("dev", "test"),
+        aliases = Map("dev" -> Seq("devel")),
+        help = """Set to use for evaluation during document geolocation when
+  when --eval-format=internal ('dev' or 'devel' for the development set,
+  'test' for the test set).  Default '%default'.""")
+    var num_training_docs =
+      ap.option[Int]("num-training-docs", "ntrain", metavar = "NUM",
+        default = 0,
+        help = """Maximum number of training documents to use.
+  0 means no limit.  Default 0, i.e. no limit.""")
+    var num_test_docs =
+      ap.option[Int]("num-test-docs", "ntest", metavar = "NUM",
+        default = 0,
+        help = """Maximum number of test (evaluation) documents to process.
+  0 means no limit.  Default 0, i.e. no limit.""")
+    var skip_initial_test_docs =
+      ap.option[Int]("skip-initial-test-docs", "skip-initial", metavar = "NUM",
+        default = 0,
+        help = """Skip this many test docs at beginning.  Default 0, i.e.
+  don't skip any documents.""")
+    var every_nth_test_doc =
+      ap.option[Int]("every-nth-test-doc", "every-nth", metavar = "NUM",
+        default = 1,
+        help = """Only process every Nth test doc.  Default 1, i.e.
+  process all.""")
+    //  def skip_every_n_test_docs =
+    //    ap.option[Int]("skip-every-n-test-docs", "skip-n", default = 0,
+    //      help = """Skip this many after each one processed.  Default 0.""")
+
+    //// Options used when creating word distributions
+    var word_dist =
+      ap.option[String]("word-dist", "wd",
+        default = "pseudo-good-turing",
+        choices = Seq("pseudo-good-turing", "pseudo-good-turing-bigram", "dirichlet", "jelinek-mercer"),
+        aliases = Map("jelinek-mercer" -> Seq("jelinek")),
+        help = """Type of word distribution to use.  Possibilities are
+  'pseudo-good-turing' (a simplified version of Good-Turing over a unigram
+  distribution), 'dirichlet' (Dirichlet smoothing over a unigram distribution),
+  'jelinek' or 'jelinek-mercer' (Jelinek-Mercer smoothing over a unigram
+  distribution), and 'pseudo-good-turing-bigram' (a non-smoothed bigram
+  distribution??).  Default '%default'.
+
+  Note that all three involve some type of discounting, i.e. taking away a
+  certain amount of probability mass compared with the maximum-likelihood
+  distribution (which estimates 0 probability for words unobserved in a
+  particular document), so that unobserved words can be assigned positive
+  probability, based on their probability across all documents (i.e. their
+  global distribution).  The difference is in how the discounting factor is
+  computed, as well as the default value for whether to do interpolation
+  (always mix the global distribution in) or back-off (use the global
+  distribution only for words not seen in the document).  Jelinek-Mercer
+  and Dirichlet do interpolation by default, while pseudo-Good-Turing
+  does back-off; but this can be overridden using --interpolate.
+  Jelinek-Mercer uses a fixed discounting factor; Dirichlet uses a
+  discounting factor that gets smaller and smaller the larger the document,
+  while pseudo-Good-Turing uses a discounting factor that reserves total
+  mass for unobserved words that is equal to the total mass observed
+  for words seen once.""")
+    var interpolate =
+      ap.option[String]("interpolate",
+        default = "default",
+        choices = Seq("yes", "no", "default"),
+        aliases = Map("yes" -> Seq("interpolate"), "no" -> Seq("backoff")),
+        help = """Whether to do interpolation rather than back-off.
+  Possibilities are 'yes', 'no', and 'default' (which means 'yes' when doing
+  Dirichlet or Jelinek-Mercer smoothing, 'no' when doing pseudo-Good-Turing
+  smoothing).""")
+    var jelinek_factor =
+      ap.option[Double]("jelinek-factor", "jf",
+        default = 0.3,
+        help = """Smoothing factor when doing Jelinek-Mercer smoothing.
+  The higher the value, the more relative weight to give to the global
+  distribution vis-a-vis the document-specific distribution.  This
+  should be a value between 0.0 (no smoothing at all) and 1.0 (total
+  smoothing, i.e. use only the global distribution and ignore
+  document-specific distributions entirely).  Default %default.""")
+    var dirichlet_factor =
+      ap.option[Double]("dirichlet-factor", "df",
+        default = 500,
+        help = """Smoothing factor when doing Dirichlet smoothing.
+  The higher the value, the more relative weight to give to the global
+  distribution vis-a-vis the document-specific distribution.  Default
+  %default.""")
+    var preserve_case_words =
+      ap.flag("preserve-case-words", "pcw",
+        help = """Don't fold the case of words used to compute and
+  match against document distributions.  Note that in toponym resolution,
+  this applies only to words in documents (currently used only in Naive Bayes
+  matching), not to toponyms, which are always matched case-insensitively.""")
+    var include_stopwords_in_document_dists =
+      ap.flag("include-stopwords-in-document-dists",
+        help = """Include stopwords when computing word distributions.""")
+    var minimum_word_count =
+      ap.option[Int]("minimum-word-count", "mwc", metavar = "NUM",
+        default = 1,
+        help = """Minimum count of words to consider in word
+  distributions.  Words whose count is less than this value are ignored.""")
+
+    //// Options used when doing Naive Bayes geolocation
+    var naive_bayes_weighting =
+      ap.option[String]("naive-bayes-weighting", "nbw", metavar = "STRATEGY",
+        default = "equal",
+        choices = Seq("equal", "equal-words", "distance-weighted"),
+        help = """Strategy for weighting the different probabilities
+  that go into Naive Bayes.  If 'equal', do pure Naive Bayes, weighting the
+  prior probability (baseline) and all word probabilities the same.  If
+  'equal-words', weight all the words the same but collectively weight all words
+  against the baseline, giving the baseline weight according to --baseline-weight
+  and assigning the remainder to the words.  If 'distance-weighted', similar to
+  'equal-words' but don't weight each word the same as each other word; instead,
+  weight the words according to distance from the toponym.""")
+    var naive_bayes_baseline_weight =
+      ap.option[Double]("naive-bayes-baseline-weight", "nbbw",
+        metavar = "WEIGHT",
+        default = 0.5,
+        help = """Relative weight to assign to the baseline (prior
+  probability) when doing weighted Naive Bayes.  Default %default.""")
+
+    //// Options used when doing ACP geolocation
+    var lru_cache_size =
+      ap.option[Int]("lru-cache-size", "lru", metavar = "SIZE",
+        default = 400,
+        help = """Number of entries in the LRU cache.  Default %default.
+  Used only when --strategy=average-cell-probability.""")
+
+    //// Debugging/output options
+    var max_time_per_stage =
+      ap.option[Double]("max-time-per-stage", "mts", metavar = "SECONDS",
+        default = 0.0,
+        help = """Maximum time per stage in seconds.  If 0, no limit.
+  Used for testing purposes.  Default 0, i.e. no limit.""")
+    var no_individual_results =
+      ap.flag("no-individual-results", "no-results",
+        help = """Don't show individual results for each test document.""")
+    var results_by_range =
+      ap.flag("results-by-range",
+        help = """Show results by range (of error distances and number of
+  documents in true cell).  Not on by default as counters are used for this,
+  and setting so many counters breaks some Hadoop installations.""")
+    var oracle_results =
+      ap.flag("oracle-results",
+        help = """Only compute oracle results (much faster).""")
+    var debug =
+      ap.option[String]("d", "debug", metavar = "FLAGS",
+        help = """Output debug info of the given types.  Multiple debug
+  parameters can be specified, indicating different types of info to output.
+  Separate parameters by spaces, colons or semicolons.  Params can be boolean,
+  if given alone, or valueful, if given as PARAM=VALUE.  Certain params are
+  list-valued; multiple values are specified by including the parameter
+  multiple times, or by separating values by a comma.
+
+  The best way to figure out the possible parameters is by reading the
+  source code. (Look for references to debug("foo") for boolean params,
+  debugval("foo") for valueful params, or debuglist("foo") for list-valued
+  params.) Some known debug flags:
+
+  gridrank: For the given test document number (starting at 1), output
+  a grid of the predicted rank for cells around the true cell.
+  Multiple documents can have the rank output, e.g. --debug 'gridrank=45,58'
+  (This will output info for documents 45 and 58.) This output can be
+  postprocessed to generate nice graphs; this is used e.g. in Wing's thesis.
+
+  gridranksize: Size of the grid, in numbers of documents on a side.
+  This is a single number, and the grid will be a square centered on the
+  true cell. (Default currently 11.)
+
+  kldiv: Print out words contributing most to KL divergence.
+
+  wordcountdocs: Regenerate document file, filtering out documents not
+  seen in any counts file.
+
+  some, lots, tons: General info of various sorts. (Document me.)
+
+  cell: Print out info on each cell of the Earth as it's generated.  Also
+  triggers some additional info during toponym resolution. (Document me.)
+
+  commontop: Extra info for debugging
+   --baseline-strategy=link-most-common-toponym.
+
+  pcl-travel: Extra info for debugging --eval-format=pcl-travel.
+  """)
+
+  }
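
To make the smoothing options above concrete, here is a toy Jelinek-Mercer estimate of the kind the --word-dist, --interpolate and --jelinek-factor help text describes: the document's maximum-likelihood probability is mixed with the global distribution using a fixed factor (illustrative only, not the actual WordDist code; the counts and global-probability function are invented):

    // p(w | doc) = (1 - lambda) * p_ML(w | doc) + lambda * p_global(w)
    def jelinekMercer(word: String, docCounts: Map[String, Int],
                      globalProb: String => Double, lambda: Double): Double = {
      val docTokens = docCounts.values.sum.toDouble
      val pML = docCounts.getOrElse(word, 0) / docTokens
      (1.0 - lambda) * pML + lambda * globalProb(word)
    }

    // With lambda = 0.3 (the --jelinek-factor default), a word unseen in the
    // document still receives 0.3 times its global probability.
    jelinekMercer("lake", Map("austin" -> 4, "river" -> 1),
                  w => 0.001, lambda = 0.3)  // = 0.0003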
+
+  class DebugSettings {
+    // Debug params.  Different params indicate different info to output.
+    // Specified using --debug.  Multiple params are separated by spaces,
+    // colons or semicolons.  Params can be boolean, if given alone, or
+    // valueful, if given as PARAM=VALUE.  Certain params are list-valued;
+    // multiple values are specified by including the parameter multiple
+    // times, or by separating values by a comma.
+    val debug = booleanmap[String]()
+    val debugval = stringmap[String]()
+    val debuglist = bufmap[String, String]()
+
+    var list_debug_params = Set[String]()
+
+    // Register a list-valued debug param.
+    def register_list_debug_param(param: String) {
+      list_debug_params += param
+    }
+
+    def parse_debug_spec(debugspec: String) {
+      val params = """[:;\s]+""".r.split(debugspec)
+      // Allow params with values, and allow lists of values to be given
+      // by repeating the param
+      for (f <- params) {
+        if (f contains '=') {
+          val Array(param, value) = f.split("=", 2)
+          if (list_debug_params contains param) {
+            val values = value.split("[,]")
+            debuglist(param) ++= values
+          } else
+            debugval(param) = value
+        } else
+          debug(f) = true
       }
     }
-
-    cells
-  }
-}
-
-/**
- * Class that implements a strategy for document geolocation by computing
- * the cosine similarity between the distributions of document and cell.
- * FIXME: We really should transform the distributions by TF/IDF before
- * doing this.
- *
- * @param smoothed If true, use the smoothed word distributions. (By default,
- * use unsmoothed distributions.)
- * @param partial If true, only do "partial" cosine similarity.
- * This only computes the similarity involving words in the document
- * distribution, rather than considering all words in the vocabulary.
- */
-class CosineSimilarityStrategy[
-  TCell <: GenericGeoCell,
-  TGrid <: CellGenericCellGrid[TCell]
-](
-  cell_grid: TGrid,
-  smoothed: Boolean = false,
-  partial: Boolean = false
-) extends MinMaxScoreStrategy[TCell, TGrid](cell_grid, true) {
-
-  def score_cell(word_dist: WordDist, cell: TCell) = {
-    var cossim =
-      word_dist.cosine_similarity(cell.combined_dist.word_dist,
-        partial = partial, smoothed = smoothed)
-    assert(cossim >= 0.0)
-    // Just in case of round-off problems
-    assert(cossim <= 1.002)
-    cossim = 1.002 - cossim
-    cossim
-  }
-}
-
-/** Use a Naive Bayes strategy for comparing document and cell. */
-class NaiveBayesDocumentStrategy[
-  TCell <: GenericGeoCell,
-  TGrid <: CellGenericCellGrid[TCell]
-](
-  cell_grid: TGrid,
-  use_baseline: Boolean = true
-) extends MinMaxScoreStrategy[TCell, TGrid](cell_grid, false) {
-
-  def score_cell(word_dist: WordDist, cell: TCell) = {
-    val params = cell_grid.table.driver.params
-    // Determine respective weightings
-    val (word_weight, baseline_weight) = (
-      if (use_baseline) {
-        if (params.naive_bayes_weighting == "equal") (1.0, 1.0)
-        else {
-          val bw = params.naive_bayes_baseline_weight.toDouble
-          ((1.0 - bw) / word_dist.num_word_tokens, bw)
-        }
-      } else (1.0, 0.0))
-
-    val word_logprob =
-      cell.combined_dist.word_dist.get_nbayes_logprob(word_dist)
-    val baseline_logprob =
-      log(cell.combined_dist.num_docs_for_links.toDouble /
-          cell_grid.total_num_docs_for_links)
-    val logprob = (word_weight * word_logprob +
-      baseline_weight * baseline_logprob)
-    logprob
-  }
-}
-
-abstract class AverageCellProbabilityStrategy[
-  TCell <: GenericGeoCell,
-  XTGrid <: CellGenericCellGrid[TCell]
-](
-  cell_grid: XTGrid
-) extends GridLocateDocumentStrategy[TCell, XTGrid](cell_grid) {
-  type TCellDistFactory <:
-    CellDistFactory[_, _ <: GenericDistDocument, TCell] { type TGrid = XTGrid }
-  def create_cell_dist_factory(lru_cache_size: Int): TCellDistFactory
-
-  val cdist_factory =
-    create_cell_dist_factory(cell_grid.table.driver.params.lru_cache_size)
-
-  def return_ranked_cells(word_dist: WordDist) = {
-    val celldist =
-      cdist_factory.get_cell_dist_for_word_dist(cell_grid, word_dist)
-    celldist.get_ranked_cells()
-  }
-}
-
-/////////////////////////////////////////////////////////////////////////////
-//                                Segmentation                             //
-/////////////////////////////////////////////////////////////////////////////
-
-// General idea: Keep track of best possible segmentations up to a maximum
-// number of segments.  Either do it using a maximum number of segmentations
-// (e.g. 100 or 1000) or all within a given factor of the best score (the
-// "beam width", e.g. 10^-4).  Then given the existing best segmentations,
-// we search for new segmentations with more segments by looking at all
-// possible ways of segmenting each of the existing best segments, and
-// finding the best score for each of these.  This is a slow process -- for
-// each segmentation, we have to iterate over all segments, and for each
-// segment we have to look at all possible ways of splitting it, and for
-// each split we have to look at all assignments of cells to the two
-// new segments.  It also seems that we're likely to consider the same
-// segmentation multiple times.
-//
-// In the case of per-word cell dists, we can maybe speed things up by
-// computing the non-normalized distributions over each paragraph and then
-// summing them up as necessary.
-
-/////////////////////////////////////////////////////////////////////////////
-//                                   Stopwords                             //
-/////////////////////////////////////////////////////////////////////////////
-
-object Stopwords {
-  val stopwords_file_in_tg = "data/lists/stopwords.english"
-
-  // Read in the list of stopwords from the given filename.
-  def read_stopwords(filehand: FileHandler, stopwords_filename: String) = {
-    def compute_stopwords_filename(filename: String) = {
-      if (filename != null) filename
-      else {
-        val tgdir = TextGrounderInfo.get_textgrounder_dir
-        // Concatenate directory and rest in most robust way
-        filehand.join_filename(tgdir, stopwords_file_in_tg)
-      }
-    }
-    val filename = compute_stopwords_filename(stopwords_filename)
-    errprint("Reading stopwords from %s...", filename)
-    filehand.openr(filename).toSet
-  }
-}
-
-/////////////////////////////////////////////////////////////////////////////
-//                                  Main code                              //
-/////////////////////////////////////////////////////////////////////////////
-
-/**
- * General class retrieving command-line arguments or storing programmatic
- * configuration parameters for a Cell-grid-based application.
- *
- * @param parser If specified, should be a parser for retrieving the
- *   value of command-line arguments from the command line.  Provided
- *   that the parser has been created and initialized by creating a
- *   previous instance of this same class with the same parser (a
- *   "shadow field" class), the variables below will be initialized with
- *   the values given by the user on the command line.  Otherwise, they
- *   will be initialized with the default values for the parameters.
- *   Because they are vars, they can be freely set to other values.
- *
- */
-class GridLocateParameters(parser: ArgParser = null) extends
-    ArgParserParameters(parser) {
-  protected val ap =
-    if (parser == null) new ArgParser("unknown") else parser
-
-  //// Input files
-  var stopwords_file =
-    ap.option[String]("stopwords-file",
-      metavar = "FILE",
-      help = """File containing list of stopwords.  If not specified,
-a default list of English stopwords (stored in the TextGrounder distribution)
-is used.""")
-
-  var input_corpus =
-    ap.multiOption[String]("i", "input-corpus",
-      metavar = "DIR",
-      help = """Directory containing an input corpus.  Documents in the
-corpus can be Wikipedia articles, individual tweets in Twitter, the set of all
-tweets for a given user, etc.  The corpus generally contains one or more
-"views" on the raw data comprising the corpus, with different views
-corresponding to differing ways of representing the original text of the
-documents -- as raw, word-split text; as unigram word counts; as bigram word
-counts; etc.  Each such view has a schema file and one or more document files.
-The latter contains all the data for describing each document, including
-title, split (training, dev or test) and other metadata, as well as the text
-or word counts that are used to create the textual distribution of the
-document.  The document files are laid out in a very simple database format,
-consisting of one document per line, where each line is composed of a fixed
-number of fields, separated by TAB characters. (E.g. one field would list
-the title, another the split, another all the word counts, etc.) A separate
-schema file lists the name of each expected field.  Some of these names
-(e.g. "title", "split", "text", "coord") have pre-defined meanings, but
-arbitrary names are allowed, so that additional corpus-specific information
-can be provided (e.g. retweet info for tweets that were retweeted from some
-other tweet, redirect info when a Wikipedia article is a redirect to another
-article, etc.).
-
-Multiple such files can be given by specifying the option multiple
-times.""")
-  var eval_file =
-    ap.multiOption[String]("e", "eval-file",
-      metavar = "FILE",
-      help = """File or directory containing files to evaluate on.
-Multiple such files/directories can be given by specifying the option multiple
-times.  If a directory is given, all files in the directory will be
-considered (but if an error occurs upon parsing a file, it will be ignored).
-Each file is read in and then disambiguation is performed.  Not used during
-document geolocation when --eval-format=internal (the default).""")
-
-  //// Options indicating which documents to train on or evaluate
-  var eval_set =
-    ap.option[String]("eval-set", "es", metavar = "SET",
-      default = "dev",
-      choices = Seq("dev", "test"),
-      aliases = Map("dev" -> Seq("devel")),
-      help = """Set to use for evaluation during document geolocation when
-when --eval-format=internal ('dev' or 'devel' for the development set,
-'test' for the test set).  Default '%default'.""")
-  var num_training_docs =
-    ap.option[Int]("num-training-docs", "ntrain", metavar = "NUM",
-      default = 0,
-      help = """Maximum number of training documents to use.
-0 means no limit.  Default 0, i.e. no limit.""")
-  var num_test_docs =
-    ap.option[Int]("num-test-docs", "ntest", metavar = "NUM",
-      default = 0,
-      help = """Maximum number of test (evaluation) documents to process.
-0 means no limit.  Default 0, i.e. no limit.""")
-  var skip_initial_test_docs =
-    ap.option[Int]("skip-initial-test-docs", "skip-initial", metavar = "NUM",
-      default = 0,
-      help = """Skip this many test docs at beginning.  Default 0, i.e.
-don't skip any documents.""")
-  var every_nth_test_doc =
-    ap.option[Int]("every-nth-test-doc", "every-nth", metavar = "NUM",
-      default = 1,
-      help = """Only process every Nth test doc.  Default 1, i.e.
-process all.""")
-  //  def skip_every_n_test_docs =
-  //    ap.option[Int]("skip-every-n-test-docs", "skip-n", default = 0,
-  //      help = """Skip this many after each one processed.  Default 0.""")
-
-  //// Options used when creating word distributions
-  var word_dist =
-    ap.option[String]("word-dist", "wd",
-      default = "pseudo-good-turing-unigram",
-      choices = Seq("pseudo-good-turing-unigram", "pseudo-good-turing-bigram", "dirichlet"),
-      help = """Type of word distribution to use.  Possibilities are
-'pseudo-good-turing-unigram' (a simplified version of Good-Turing over a unigram
-distribution), 'pseudo-good-turing-bigram' (a non-smoothed bigram distribution),
-and 'dirichlet' (Dirichlet smoothing over a unigram distribution).
-Default '%default'.""")
-  var preserve_case_words =
-    ap.flag("preserve-case-words", "pcw",
-      help = """Don't fold the case of words used to compute and
-match against document distributions.  Note that in toponym resolution,
-this applies only to words in documents (currently used only in Naive Bayes
-matching), not to toponyms, which are always matched case-insensitively.""")
-  var include_stopwords_in_document_dists =
-    ap.flag("include-stopwords-in-document-dists",
-      help = """Include stopwords when computing word distributions.""")
-  var minimum_word_count =
-    ap.option[Int]("minimum-word-count", "mwc", metavar = "NUM",
-      default = 1,
-      help = """Minimum count of words to consider in word
-distributions.  Words whose count is less than this value are ignored.""")
-
-  //// Options used when doing Naive Bayes geolocation
-  var naive_bayes_weighting =
-    ap.option[String]("naive-bayes-weighting", "nbw", metavar = "STRATEGY",
-      default = "equal",
-      choices = Seq("equal", "equal-words", "distance-weighted"),
-      help = """Strategy for weighting the different probabilities
-that go into Naive Bayes.  If 'equal', do pure Naive Bayes, weighting the
-prior probability (baseline) and all word probabilities the same.  If
-'equal-words', weight all the words the same but collectively weight all words
-against the baseline, giving the baseline weight according to --baseline-weight
-and assigning the remainder to the words.  If 'distance-weighted', similar to
-'equal-words' but don't weight each word the same as each other word; instead,
-weight the words according to distance from the toponym.""")
-  var naive_bayes_baseline_weight =
-    ap.option[Double]("naive-bayes-baseline-weight", "nbbw",
-      metavar = "WEIGHT",
-      default = 0.5,
-      help = """Relative weight to assign to the baseline (prior
-probability) when doing weighted Naive Bayes.  Default %default.""")
-
-  //// Options used when doing ACP geolocation
-  var lru_cache_size =
-    ap.option[Int]("lru-cache-size", "lru", metavar = "SIZE",
-      default = 400,
-      help = """Number of entries in the LRU cache.  Default %default.
-Used only when --strategy=average-cell-probability.""")
-
-  //// Debugging/output options
-  var max_time_per_stage =
-    ap.option[Double]("max-time-per-stage", "mts", metavar = "SECONDS",
-      default = 0.0,
-      help = """Maximum time per stage in seconds.  If 0, no limit.
-Used for testing purposes.  Default 0, i.e. no limit.""")
-  var no_individual_results =
-    ap.flag("no-individual-results", "no-results",
-      help = """Don't show individual results for each test document.""")
-  var results_by_range =
-    ap.flag("results-by-range",
-      help = """Show results by range (of error distances and number of
-documents in true cell).  Not on by default as counters are used for this,
-and setting so many counters breaks some Hadoop installations.""")
-  var oracle_results =
-    ap.flag("oracle-results",
-      help = """Only compute oracle results (much faster).""")
-  var debug =
-    ap.option[String]("d", "debug", metavar = "FLAGS",
-      help = """Output debug info of the given types.  Multiple debug
-parameters can be specified, indicating different types of info to output.
-Separate parameters by spaces, colons or semicolons.  Params can be boolean,
-if given alone, or valueful, if given as PARAM=VALUE.  Certain params are
-list-valued; multiple values are specified by including the parameter
-multiple times, or by separating values by a comma.
-
-The best way to figure out the possible parameters is by reading the
-source code. (Look for references to debug("foo") for boolean params,
-debugval("foo") for valueful params, or debuglist("foo") for list-valued
-params.) Some known debug flags:
-
-gridrank: For the given test document number (starting at 1), output
-a grid of the predicted rank for cells around the true cell.
-Multiple documents can have the rank output, e.g. --debug 'gridrank=45,58'
-(This will output info for documents 45 and 58.) This output can be
-postprocessed to generate nice graphs; this is used e.g. in Wing's thesis.
-
-gridranksize: Size of the grid, in numbers of documents on a side.
-This is a single number, and the grid will be a square centered on the
-true cell. (Default currently 11.)
-
-kldiv: Print out words contributing most to KL divergence.
-
-wordcountdocs: Regenerate document file, filtering out documents not
-seen in any counts file.
-
-some, lots, tons: General info of various sorts. (Document me.)
-
-cell: Print out info on each cell of the Earth as it's generated.  Also
-triggers some additional info during toponym resolution. (Document me.)
-
-commontop: Extra info for debugging
- --baseline-strategy=link-most-common-toponym.
-
-pcl-travel: Extra info for debugging --eval-format=pcl-travel.
-""")
-
-}
-
-class DebugSettings {
-  // Debug params.  Different params indicate different info to output.
-  // Specified using --debug.  Multiple params are separated by spaces,
-  // colons or semicolons.  Params can be boolean, if given alone, or
-  // valueful, if given as PARAM=VALUE.  Certain params are list-valued;
-  // multiple values are specified by including the parameter multiple
-  // times, or by separating values by a comma.
-  val debug = booleanmap[String]()
-  val debugval = stringmap[String]()
-  val debuglist = bufmap[String, String]()
-
-  var list_debug_params = Set[String]()
-
-  // Register a list-valued debug param.
-  def register_list_debug_param(param: String) {
-    list_debug_params += param
   }
 
-  def parse_debug_spec(debugspec: String) {
-    val params = """[:;\s]+""".r.split(debugspec)
-    // Allow params with values, and allow lists of values to be given
-    // by repeating the param
-    for (f <- params) {
-      if (f contains '=') {
-        val Array(param, value) = f.split("=", 2)
-        if (list_debug_params contains param) {
-          val values = "[,]".split(value)
-          debuglist(param) ++= values
-        } else
-          debugval(param) = value
-      } else
-        debug(f) = true
+  /**
+   * Base class for programmatic access to document/etc. geolocation.
+   * Subclasses are for particular apps, e.g. GeolocateDocumentDriver for
+   * document-level geolocation.
+   *
+   * NOTE: The code used to store a number of values in singleton objects,
+   * with no clear interface provided for resetting them, which effectively
+   * limited it to one geolocation instance per JVM.  By now, most of the
+   * singleton objects have been removed, and it should not be difficult to
+   * remove the remaining limitations so that multiple drivers per JVM
+   * (though possibly not running at the same time) can be supported.
+   *
+   * Basic operation:
+   *
+   * 1. Create an instance of the appropriate subclass of GeolocateParameters
+   * (e.g. GeolocateDocumentParameters for document geolocation) and populate
+   * it with the appropriate parameters.  Don't pass in any ArgParser instance,