1. yanchuan sim
  2. yc-pyutils

Commits

yanchuan sim  committed 029e59b

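Fixed bug with < 1.0 (proportional) thresholds: docfreq limits now index docfreqs instead of freqs, and proportional idf limits are disabled.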

  • Participants
  • Parent commits 43ff4b2
  • Branches master

Comments (0)

Files changed (1)

File prune-vocab.py

View file
  • Ignore whitespace
 import argparse, sys, codecs 
 import ycutils.corpus
 
-parser = argparse.ArgumentParser(description='Prune vocabulary size. If any of the numbers are < 1.0, the lower/top proportion will be used instead.')
+parser = argparse.ArgumentParser(description='Prune vocabulary size. If freq/docfreq are < 1.0, the lower/top proportion will be used instead.')
 parser.add_argument('vocab_file', type=str, help='Vocab file to prune.')
 parser.add_argument('save_as', type=str, help='Save pruned vocabulary file.')
 
 
 if A.min_freq and A.min_freq < 1.0: A.min_freq = freqs[int(A.min_freq * L)]
 if A.max_freq and A.max_freq < 1.0: A.max_freq = freqs[int(A.max_freq * L)]
-if A.min_docfreq and A.min_docfreq < 1.0: A.min_docfreq = freqs[int(A.min_docfreq * L)]
-if A.max_docfreq and A.max_docfreq < 1.0: A.max_docfreq = freqs[int(A.max_docfreq * L)]
-if A.min_idf and A.min_idf < 1.0: A.min_idf = freqs[int(A.min_idf * L)]
-if A.max_idf and A.max_idf < 1.0: A.max_idf = freqs[int(A.max_idf * L)]
+if A.min_docfreq and A.min_docfreq < 1.0: A.min_docfreq = docfreqs[int(A.min_docfreq * L)]
+if A.max_docfreq and A.max_docfreq < 1.0: A.max_docfreq = docfreqs[int(A.max_docfreq * L)]
+# if A.min_idf and A.min_idf < 1.0: A.min_idf = freqs[int(A.min_idf * L)]
+# if A.max_idf and A.max_idf < 1.0: A.max_idf = freqs[int(A.max_idf * L)]
 
 print >>sys.stderr, 'Minimums={}'.format((A.min_freq, A.min_docfreq, A.min_idf))
 print >>sys.stderr, 'Maximums={}'.format((A.max_freq, A.max_docfreq, A.max_idf))