Commits

yanchuan sim committed 8af58df

handy script to prune vocabulary

Comments (0)

Files changed (1)

+#!/usr/bin/env python
+
+import argparse, sys, codecs 
+import ycutils.corpus
+
+parser = argparse.ArgumentParser(description='Prune vocabulary size. If any of the numbers are < 1.0, the lower/top proportion will be used instead.')
+parser.add_argument('vocab_file', type=str, help='Vocab file to prune.')
+parser.add_argument('save_as', type=str, help='Save pruned vocabulary file.')
+
+parser.add_argument('-f', '--min-freq', type=float, default=None, help='Minimum vocabulary frequency.')
+parser.add_argument('-F', '--max-freq', type=float, default=None, help='Maximum vocabulary frequency.')
+parser.add_argument('-d', '--min-docfreq', type=float, default=None, help='Minimum document frequency.')
+parser.add_argument('-D', '--max-docfreq', type=float, default=None, help='Maximum document frequency.')
+parser.add_argument('-i', '--min-idf', type=float, default=None, help='Minimum inverse document frequency.')
+parser.add_argument('-I', '--max-idf', type=float, default=None, help='Maximum inverse document frequency.')
+
+A = parser.parse_args()
+
+print >>sys.stderr, 'Loading vocabulary file...'
+vocab = ycutils.corpus.CorpusVocabulary(from_filename=A.vocab_file)
+
+freqs = []
+docfreqs = []
+idfs = []
+for (w, (freq, df, idf)) in vocab.iteritems():
+  freqs.append(freq)
+  docfreqs.append(df)
+  idfs.append(idf)
+#end for
+
+freqs.sort()
+docfreqs.sort()
+idfs.sort()
+L = len(vocab)
+
+if A.min_freq and A.min_freq < 1.0: A.min_freq = freqs[int(A.min_freq * L)]
+if A.max_freq and A.max_freq < 1.0: A.max_freq = freqs[int(A.max_freq * L)]
+if A.min_docfreq and A.min_docfreq < 1.0: A.min_docfreq = freqs[int(A.min_docfreq * L)]
+if A.max_docfreq and A.max_docfreq < 1.0: A.max_docfreq = freqs[int(A.max_docfreq * L)]
+if A.min_idf and A.min_idf < 1.0: A.min_idf = freqs[int(A.min_idf * L)]
+if A.max_idf and A.max_idf < 1.0: A.max_idf = freqs[int(A.max_idf * L)]
+
+print >>sys.stderr, 'Minimums={}'.format((A.min_freq, A.min_docfreq, A.min_idf))
+print >>sys.stderr, 'Maximums={}'.format((A.max_freq, A.max_docfreq, A.max_idf))
+
+print >>sys.stderr, 'Filtering vocabulary items...'
+vocab.filter(minimums=(A.min_freq, A.min_docfreq, A.min_idf), maximums=(A.max_freq, A.max_docfreq, A.max_idf))
+
+print >>sys.stderr, 'Saving pruned vocabulary...'
+with codecs.open(A.save_as, 'w', 'utf-8') as vocab_file:
+  vocab.to_file(vocab_file)
+print >>sys.stderr, 'Saved {} word types to {}.'.format(len(vocab), A.save_as)
+