yanchuan sim / yc-pyutils

Commits

yanchuan sim committed 48a39f3

added script for building vocabulary

  • Parent commit: ab9ad60
  • Branch: master


Files changed (1)

File scripts/build-vocab.py

+#!/usr/bin/env python
+
+import argparse, sys, unicodedata, os, codecs, collections, time, string
+import regex as re
+import ycutils.tokenize, ycutils.tfidf, ycutils.bagofwords, ycutils.corpus
+
+parser = argparse.ArgumentParser(description='Extract terms in tokenized text corpus and build vocab file.')
+
+parser.add_argument('vocab_file', metavar='<vocab_file>', type=argparse.FileType('w'), help='Location to save vocabulary file.')
+
+group = parser.add_argument_group(title='Input/output options', description='If none of these are specified, input defaults to STDIN. This script expects input to be UTF-8 or ASCII and already tokenized (i.e. each line is a sentence and tokens are separated by spaces).')
+group.add_argument('-f', '--files', metavar='<file>', type=file, nargs='+', default=[], help='Input is a list of files.')
+group.add_argument('-l', '--fileslist', metavar='<files_list>', type=file, default=False, help='Input is a file containing a list of filenames.')
+group.add_argument('-d', '--input-dir', type=str, metavar='<input_dir>', help='Recursively searches <input_dir> for files and extracts terms from them.')
+group.add_argument('--ignore-comments', action='store_true', default=False, help='Ignore lines that start with \'#\' (default: false).')
+group.add_argument('--multi-file-format', action='store_true', default=False, help='Input is in the multi-file format (as produced by `tokenize-docs.py`). Treats each line starting with `#` that follows an empty line as the start of a new document.')
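+
+# Illustrative sketch of the multi-file format (assumed, based on the option
+# description above -- the authoritative producer is `tokenize-docs.py`):
+#
+#   # document-1-title
+#   first tokenized sentence of document 1 .
+#   second sentence .
+#
+#   # document-2-title
+#   a sentence in document 2 .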
+
+group = parser.add_argument_group(title='Text options')
+group.add_argument('--strip-unicode', action='store_true', default=False, help='Strip input text of unicode and force everything to be ASCII. This is quite aggressive and uses the Unidecode library.')
+group.add_argument('--filter-stopwords', type=file, nargs='?', metavar='<stopword_file>', default=None, const='/dev/null', help='Filter stopwords from input texts. You can specify a text file (UTF-8 encoding) containing a list of stopwords, one on each line.')
+group.add_argument('--ignore-rare', type=float, metavar='N', default=0.0, help='If N > 1, filters words that appear fewer than N times in each document. If N < 1, filters away the rarest N proportion of word types.')
+
+group = parser.add_argument_group(title='N-gram options')
+group.add_argument('--ngrams', type=int, metavar='n', nargs='*', default=[1], help='Generate n-gram terms (default: unigram only).')
+group.add_argument('--ngram-separator', type=str, metavar='<separator>', default='_', help='Separator string for n-grams (default: \'_\').')
+group.add_argument('--ngram-stopwords', type=str, metavar='<stopwords>', nargs='*', default=['__PUNCT__'], help='Additional stopwords (beyond those from --filter-stopwords) to ignore when generating n-grams, e.g. __PUNCT__ (default: __PUNCT__). N-grams cannot "cross" stopwords.')
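+
+# Illustrative example (hypothetical tokens): with --ngrams 1 2 and the default
+# stopword __PUNCT__, the sentence 'the cat __PUNCT__ sat' yields the unigrams
+# the, cat, sat and the bigram the_cat -- but not cat_sat, since n-grams
+# cannot cross the __PUNCT__ stopword.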
+
+group = parser.add_mutually_exclusive_group()
+group.add_argument('--stemming', action='store_true', default=False, help='Apply Porter stemming to tokens and use stemmed tokens only.')
+group.add_argument('--plus-stemming', action='store_true', default=False, help='Apply Porter stemming to tokens and include them in addition to non-stemmed terms.')
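+
+# (For reference: Porter stemming typically maps e.g. 'running' -> 'run' and
+# 'ponies' -> 'poni'.)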
+
+group = parser.add_argument_group(title='TF-IDF options')
+group.add_argument('--no-tfidf', action='store_true', default=False, help='Only build the list of terms, without computing TF-IDF information.')
+group.add_argument('--idf-smoothing', type=float, default=1e-7, metavar='<lambda>', help='Laplace smoothing factor for computing IDF.')
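+
+# The smoothing presumably enters the IDF in the usual Laplace form
+# (an assumption -- the exact formula is defined in ycutils.corpus):
+#   idf(w) = log((num_docs + lambda) / (doc_freq(w) + lambda))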
+
+def do_document(i=None, input_f=None):
+  global corpus, A
+
+  if i: print >>sys.stderr, '{:<8d} {}: reading'.format(i, input_f.name),
+  else: print >>sys.stderr, '{}: reading'.format(input_f.name),
+  start_time = time.time()
+  
+  title = input_f.name
+
+  bow = ycutils.bagofwords.BOW()
+  prev_empty = True
+  sent_count = 0
+  token_count = 0
+
+  for line in input_f:
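+    # Strip a leading UTF-8 BOM (some files begin with one) plus surrounding whitespace.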
+    line = line.lstrip(unicode(codecs.BOM_UTF8, 'utf8')).strip()
+    if not line:
+      prev_empty = True
+      continue
+    #end if
+
+    if line.startswith('#'):
+      if A.ignore_comments:
+        continue
+      elif A.multi_file_format and prev_empty:
+        if len(bow) > 0:
+          if A.ignore_rare: bow.filter_rare_terms(A.ignore_rare)
+          if A.no_tfidf: corpus.update(bow.iterkeys())
+          else: corpus.add_bow(bow, title=title)
+          bow = ycutils.bagofwords.BOW()
+        #end if
+
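+        # A '#' line right after a blank line marks a new document; fold it
+        # into the title so each document in the file gets a distinct name.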
+        title = input_f.name + line
+        prev_empty = False
+        continue
+      #end if
+    #end if
+
+    prev_empty = False
+
+    if A.strip_unicode: line = ycutils.tokenize.to_ascii(line)
+
+    if A.stemming:
+      sent = ycutils.tokenize.stem_tokens(line.split())
+    elif A.plus_stemming:
+      sent = line.split()
+      stem_sent = ycutils.tokenize.stem_tokens(sent)
+    else:
+      sent = line.split()
+
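+    # Count this sentence's n-gram terms; with --plus-stemming, the stemmed
+    # variants are added alongside the surface forms.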
+    bow.add_tokens(ycutils.tokenize.ngram_tokens(sent, A.ngrams, sep_char=A.ngram_separator, filter_stopwords=A.ngram_stopwords))
+    if A.plus_stemming: bow.add_tokens(ycutils.tokenize.ngram_tokens(stem_sent, A.ngrams, sep_char=A.ngram_separator, filter_stopwords=A.ngram_stopwords))
+
+    sent_count += 1
+    token_count += len(sent)
+  #end for
+
+  input_f.close()
+
+  if A.ignore_rare: bow.filter_rare_terms(A.ignore_rare)
+  if A.no_tfidf: corpus.update(bow.iterkeys())
+  else: corpus.add_bow(bow, title=title)
+
+  print >>sys.stderr, 'ok!'
+  print >>sys.stderr, '\t{:8d} sentences {:8d} tokens {:.2f} seconds'.format(sent_count, token_count, time.time() - start_time)
+#end def
+
+
+A = parser.parse_args()
+
+sys.stdout = codecs.getwriter('utf-8')(sys.stdout)
+sys.stdin = codecs.getreader('utf-8')(sys.stdin)
+A.vocab_file = codecs.getwriter('utf-8')(A.vocab_file)
+
+total_start_time = time.time()
+
+filter_stopwords = set()
+if A.filter_stopwords:
+  if A.filter_stopwords.name == '/dev/null':
+    filter_stopwords = ycutils.tokenize.STOPWORDS
+    print >>sys.stderr, 'Using default stopword list ({} items)...'.format(len(filter_stopwords))
+
+  else:
+    A.filter_stopwords = codecs.getreader('utf-8')(A.filter_stopwords)
+
+    for line in A.filter_stopwords:
+      line = line.strip()
+      if line.startswith('#') or not line: continue
+      filter_stopwords.add(line.lower())
+    #end for
+    print >>sys.stderr, 'Using stopword list {} ({} items)...'.format(A.filter_stopwords.name, len(filter_stopwords))
+  #end if
+#end if
+
+A.ngram_stopwords = set(A.ngram_stopwords) | filter_stopwords
+print >>sys.stderr, 'Will extract {}.'.format(', '.join(['{}-grams'.format(n) for n in A.ngrams]))
+
+done_something = False
+
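+# With --no-tfidf a plain set of term types suffices; otherwise collect full
+# bags-of-words so document frequencies can be computed.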
+if A.no_tfidf: corpus = set()
+else:
+  corpus = ycutils.corpus.Corpus()
+  corpus.IDF_LAPLACE_SMOOTHING = A.idf_smoothing
+#end if
+
+if A.files:
+  done_something = True
+
+  print >>sys.stderr, 'Processing filenames from command line.'
+
+  for i, f in enumerate(A.files, start=1):
+    do_document(i, input_f=codecs.getreader('utf-8')(f))
+#end if
+
+if A.fileslist:
+  done_something = True
+
+  print >>sys.stderr, 'Processing filenames from {}'.format(A.fileslist.name)
+
+  A.fileslist = codecs.getreader('utf-8')(A.fileslist)
+  for i, line in enumerate(A.fileslist, start=1):
+    line = line.strip()
+    if line.startswith('#') or not line: continue
+    do_document(i, input_f=codecs.open(line, 'r', 'utf-8'))
+  #end for
+#end if
+
+if A.input_dir:
+  done_something = True
+
+  input_dir = A.input_dir
+  print >>sys.stderr, 'Processing files from {}'.format(input_dir)
+
+  i = 1
+  for (dirpath, dirnames, filenames) in os.walk(input_dir):
+    dirnames.sort()  # sort in place so os.walk recurses into subdirectories in order
+    filenames.sort()
+    for fname in filenames:
+      do_document(i, input_f=codecs.open(os.path.join(dirpath, fname), 'r', 'utf-8'))
+      i += 1
+    #end for
+  #end for
+#end if
+
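+# No input option was given: fall back to reading a single document from STDIN.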
+if not done_something: do_document(input_f=sys.stdin)
+
+print >>sys.stderr, 'Building vocabulary...'
+
+if A.no_tfidf:
+  for w in sorted(corpus): print >>A.vocab_file, w
+else:
+  cv = ycutils.corpus.CorpusVocabulary(corpus)
+  cv.to_file(A.vocab_file)
+  print >>sys.stderr, 'Saved {} word types to {}.'.format(len(cv), A.vocab_file.name)
+#end if
+
+print >>sys.stderr, 'Took {:.3f} seconds.'.format(time.time() - total_start_time)
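+
+# Example invocations (hypothetical paths, for illustration only):
+#   ./build-vocab.py vocab.txt -f corpus/*.tok --ngrams 1 2 --filter-stopwords
+#   cat tokenized.txt | ./build-vocab.py vocab.txt --no-tfidf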