Mike Speriosu committed 5b20606

Added whitelist capability.

Comments (0)

Files changed (3)

src/main/scala/opennlp/textgrounder/geolocate/GenerateKML.scala

     filter_words: Seq[String],
     ignore_case: Boolean,
     stopwords: Set[String],
+    whitelist: Set[String],
     minimum_word_count: Int = 1
   ) extends DefaultUnigramWordDistConstructor(
-    factory, ignore_case, stopwords, minimum_word_count
+    factory, ignore_case, stopwords, whitelist, minimum_word_count
   ) {
 
   override def finish_before_global(dist: WordDist) {
     if (num_ngrams > 1)
       param_error("Only unigram word distribution words with GenerateKML")
     val the_stopwords = get_stopwords()
+    val the_whitelist = get_whitelist()
     /* if (num_ngrams == 2)
       new FilterBigramWordDistConstructor(factory, ...)
     else */
         params.split_kml_words,
         ignore_case = !params.preserve_case_words,
         stopwords = the_stopwords,
+        whitelist = the_whitelist,
         minimum_word_count = params.minimum_word_count)
   }
 

src/main/scala/opennlp/textgrounder/gridlocate/GridLocate.scala

   }
 
   /////////////////////////////////////////////////////////////////////////////
+  //                                   Whitelist                             //
+  /////////////////////////////////////////////////////////////////////////////
+
+  /**
+   * Helpers for loading the optional word whitelist.
+   */
+  object Whitelist {
+    /**
+     * Read the whitelist named by `whitelist_filename` through `filehand`.
+     *
+     * @param filehand File handler used to open the whitelist file.
+     * @param whitelist_filename Name of the whitelist file; may be null or
+     *   empty, in which case no file is opened.
+     * @return The set of items produced by `filehand.openr` for the file, or
+     *   the empty set when no filename was given.
+     */
+    def read_whitelist(filehand: FileHandler, whitelist_filename: String): Set[String] =
+      whitelist_filename match {
+        // A missing (null) or empty filename means "no whitelist".
+        case null | "" => Set.empty[String]
+        case filename => filehand.openr(filename).toSet
+      }
+  }
+
+  /////////////////////////////////////////////////////////////////////////////
   //                                  Main code                              //
   /////////////////////////////////////////////////////////////////////////////
 
   a default list of English stopwords (stored in the TextGrounder distribution)
   is used.""")
 
+    // Command-line option "whitelist-file": name of a file holding the word
+    // whitelist. The value is later passed to Whitelist.read_whitelist, which
+    // treats a null or empty filename as "no whitelist".
+    var whitelist_file =
+      ap.option[String]("whitelist-file",
+         metavar = "FILE",
+         help = """File containing a whitelist of words. If specified, ONLY
+  words on the list will be read from any corpora; other words will be ignored.
+  If not specified, all words (except those on the stopword list) will be
+  read.""")
+
     var input_corpus =
       ap.multiOption[String]("i", "input-corpus",
         metavar = "DIR",
     type TDocTable <: DistDocumentTable[_, TDoc, TGrid]
     override type TParam <: GridLocateParameters
     var stopwords: Set[String] = _
+    var whitelist: Set[String] = _
     var cell_grid: TGrid = _
     var document_table: TDocTable = _
     var word_dist_factory: WordDistFactory = _
       else stopwords
     }
 
+    /** Return the whitelist currently stored in the `whitelist` field. */
+    protected def get_whitelist() = whitelist
+
     protected def initialize_word_dist_constructor(factory: WordDistFactory) = {
       val the_stopwords = get_stopwords()
+      val the_whitelist = get_whitelist()
       /* if (num_ngrams == 2)
         new DefaultBigramWordDistConstructor(factory, ...)
       else */
           factory,
           ignore_case = !params.preserve_case_words,
           stopwords = the_stopwords,
+          whitelist = the_whitelist,
           minimum_word_count = params.minimum_word_count)
     }
 
     Stopwords.read_stopwords(get_file_handler, params.stopwords_file)
   }
 
+  /**
+   * Load the whitelist from the file named by `params.whitelist_file`,
+   * delegating to `Whitelist.read_whitelist` with this object's file handler.
+   */
+  protected def read_whitelist() = {
+    val filehand = get_file_handler
+    Whitelist.read_whitelist(filehand, params.whitelist_file)
+  }
+
   protected def read_documents(table: TDocTable) {
     for (fn <- params.input_corpus)
       table.read_training_documents(get_file_handler, fn,
 
   def setup_for_run() {
     stopwords = read_stopwords()
+    whitelist = read_whitelist()
     word_dist_factory = initialize_word_dist_factory()
     word_dist_constructor = initialize_word_dist_constructor(word_dist_factory)
     word_dist_factory.set_word_dist_constructor(word_dist_constructor)
     document_table = initialize_document_table(word_dist_factory)
     cell_grid = initialize_cell_grid(document_table)
-    // This accesses the stopwords through the pointer to this in
+    // This accesses the stopwords and whitelist through the pointer to this in
     // document_table.
     read_documents(document_table)
     if (debug("stop-after-reading-dists")) {

src/main/scala/opennlp/textgrounder/worddist/UnigramWordDist.scala

   factory: WordDistFactory,
   ignore_case: Boolean,
   stopwords: Set[String],
+  whitelist: Set[String],
   minimum_word_count: Int = 1
 ) extends WordDistConstructor(factory: WordDistFactory) {
   /**
   protected def add_word_with_count(counts: WordDoubleMap,
       word: String, count: Int) {
+    // Normalize the word with maybe_lowercase, then add `count` to its entry
+    // only if it is not a stopword and passes the whitelist (an empty
+    // whitelist admits every word).
     val lword = maybe_lowercase(word)
-    if (!(stopwords contains lword))
+    val is_stopword = stopwords contains lword
+    val is_whitelisted = whitelist.isEmpty || (whitelist contains lword)
+    if (!is_stopword && is_whitelisted)
       counts(memoize_string(lword)) += count
   }
 
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.