Commits

Anonymous committed cb8551b

BulkLoader can prune high-frequency features

  • Participants
  • Parent commits 0528cbc

Comments (0)

Files changed (2)

File src/cc/mallet/pipe/FeatureDocFreqPipe.java

+package cc.mallet.pipe;
+
+import cc.mallet.types.*;
+import gnu.trove.*;
+import java.io.*;
+
+/** 
+ *  Counts, for each feature in the data alphabet, the number of instances
+ *   (documents) in which that feature occurs at least once, so that features
+ *   appearing in too large a proportion of documents can be pruned.
+ * <p>
+ *  Pruning features can be a good way to save memory and computation.
+ *   However, in order to use Vectors2Vectors, you need to write out the unpruned
+ *   instance list, read it back into memory, collect statistics, create new 
+ *   instances, and then write everything back out.
+ * <p>
+ *  This class supports a simpler method that makes two passes over the data:
+ *   one to collect statistics and create an augmented "stop list", and a
+ *   second to actually create instances.
+ */
+
+public class FeatureDocFreqPipe extends Pipe {
+		
+	FeatureCounter counter; // per-feature tally; incremented once per instance that contains the feature
+	int numInstances; // number of instances that have passed through pipe()
+	/** Creates a pipe backed by a newly created data alphabet and no target alphabet. */
+	public FeatureDocFreqPipe() {
+		super(new Alphabet(), null);
+
+		counter = new FeatureCounter(this.getDataAlphabet());
+		numInstances = 0;
+	}
+	/** Creates a pipe that counts features of the supplied data alphabet; both alphabets are handed to the Pipe superclass. */
+	public FeatureDocFreqPipe(Alphabet dataAlphabet, Alphabet targetAlphabet) {
+		super(dataAlphabet, targetAlphabet);
+
+		counter = new FeatureCounter(dataAlphabet);
+		numInstances = 0;
+	}
+	/**
+	 *  Record which features occur in this instance, then add one to the
+	 *   tally of each distinct feature and one to the instance count.
+	 *
+	 * @param instance an instance whose data is a FeatureSequence
+	 * @return the same instance that was passed in
+	 * @throws IllegalArgumentException if the instance data is not a FeatureSequence.
+	 *   NOTE(review): when getData() returns null the instanceof test fails and
+	 *   building the message calls getClass() on null, so a NullPointerException
+	 *   is thrown instead.
+	 */
+	public Instance pipe(Instance instance) {
+		
+		TIntIntHashMap localCounter = new TIntIntHashMap(); // occurrences within this instance only; just its key set is used below
+	
+		if (instance.getData() instanceof FeatureSequence) {
+				
+			FeatureSequence features = (FeatureSequence) instance.getData();
+
+			for (int position = 0; position < features.size(); position++) {
+				localCounter.adjustOrPutValue(features.getIndexAtPosition(position), 1, 1);
+			}
+
+		}
+		else {
+			throw new IllegalArgumentException("Looking for a FeatureSequence, found a " + 
+											   instance.getData().getClass());
+		}
+
+		for (int feature: localCounter.keys()) { // one increment per distinct feature, however often it repeats in the instance
+			counter.increment(feature);
+		}
+
+		numInstances++;
+
+		return instance;
+	}
+
+	/** 
+	 *  Add all pruned words to the internal stoplist of a SimpleTokenizer.
+	 *   A word is pruned when the fraction of piped instances that contain it
+	 *   is strictly greater than the cutoff.
+	 * 
+	 * @param tokenizer The tokenizer whose stop(...) method receives each pruned word.
+	 * @param docFrequencyCutoff Remove words that occur in greater than this proportion of documents. 0.05 corresponds to IDF >= 3.
+	 */
+	public void addPrunedWordsToStoplist(SimpleTokenizer tokenizer, double docFrequencyCutoff) {
+		Alphabet currentAlphabet = getDataAlphabet();
+
+        for (int feature = 0; feature < currentAlphabet.size(); feature++) {
+            if ((double) counter.get(feature) / numInstances > docFrequencyCutoff) { // cast forces floating-point division
+                tokenizer.stop((String) currentAlphabet.lookupObject(feature)); // assumes alphabet entries are Strings
+            }
+        }
+	}
+
+	static final long serialVersionUID = 1; // serialization version identifier
+
+}

File src/cc/mallet/util/BulkLoader.java

         (BulkLoader.class, "prune-count", "N", false, 0,
          "Reduce features to those that occur more than N times.", null);
 	
+    /** Document-proportion cutoff for pruning: features found in more than this fraction of documents are removed. Pruning by document frequency only runs when the value is below 1.0, the value supplied here as the default. */
+    static CommandOption.Double docProportionCutoff = new CommandOption.Double
+        (BulkLoader.class, "prune-doc-frequency", "N", false, 1.0,
+         "Remove features that occur in more than (N*100)% of documents. 0.05 is equivalent to IDF of 3.0.", null);
+	
     /**
      *  Read the data from inputFile, then write all the words
      *   that do not occur <tt>pruneCount.value</tt> times or more to the pruned word file.
         SimpleTokenizer st = prunedTokenizer.deepClone();
 		StringList2FeatureSequence sl2fs = new StringList2FeatureSequence(alphabet);
 		FeatureCountPipe featureCounter = new FeatureCountPipe(alphabet, null);
+		FeatureDocFreqPipe docCounter = new FeatureDocFreqPipe(alphabet, null);
 
 		if (! preserveCase.value) {
 			pipes.add(csl);
 		}
 		pipes.add(st);
 		pipes.add(sl2fs);
-		pipes.add(featureCounter);
+		if (pruneCount.value > 0) {
+			pipes.add(featureCounter);
+		}
+		if (docProportionCutoff.value < 1.0) {
+			pipes.add(docCounter);
+		}
 
 		Pipe serialPipe = new SerialPipes(pipes);
 
             iterator.next();
 		}
 
-		featureCounter.addPrunedWordsToStoplist(prunedTokenizer, pruneCount.value);
+		if (pruneCount.value > 0) {
+			featureCounter.addPrunedWordsToStoplist(prunedTokenizer, pruneCount.value);
+		}
+		if (docProportionCutoff.value < 1.0) {
+			docCounter.addPrunedWordsToStoplist(prunedTokenizer, docProportionCutoff.value);
+		}
 	}
 
 
 			tokenizer = new SimpleTokenizer(SimpleTokenizer.USE_EMPTY_STOPLIST);
 		}
 
-		if (pruneCount.value > 0) {
+		if (pruneCount.value > 0 || docProportionCutoff.value < 1.0) {
 			generateStoplist(tokenizer);
 		}