yanchuan sim / yc-pyutils

Commits

yanchuan sim committed 1807c5d

fix bugs. added -f options

  • Parent commits 48a39f3
  • Branches master

Comments (0)

Files changed (2)

File scripts/prune-vocab.py

 #!/usr/bin/env python
-import argparse, sys, codecs 
+
+import argparse, sys, codecs
 import ycutils.corpus
 
 parser = argparse.ArgumentParser(description='Prune vocabulary size. If freq/docfreq are < 1.0, the lower/top proportion will be used instead.')
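
The freq/docfreq convention described in that help text (values below 1.0 are treated as a proportion of the vocabulary rather than an absolute count) amounts to roughly the following. This is an illustrative sketch only; the function and argument names are hypothetical and not taken from prune-vocab.py:

def prune_vocab(term_counts, min_freq):
  # term_counts: dict mapping term -> frequency (hypothetical helper).
  # min_freq < 1.0: drop the lowest-frequency min_freq fraction of terms.
  # min_freq >= 1.0: drop terms occurring fewer than min_freq times.
  if min_freq < 1.0:
    ranked = sorted(term_counts, key=term_counts.get)
    drop = set(ranked[:int(len(ranked) * min_freq)])
    return dict((t, c) for t, c in term_counts.iteritems() if t not in drop)
  return dict((t, c) for t, c in term_counts.iteritems() if c >= min_freq)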

File scripts/tokenize-docs.py

 
 parser = argparse.ArgumentParser(description='Tokenize text using `ycutils.tokenize`. When displaying multiple tokenized files to STDOUT, each file on the output is marked with a line `# <filename>`. The end of a file is marked with an empty line.')
 
-parser.add_argument('--strip-unicode', action='store_true', default=False, help='Strip input text of unicode and force everything to be ASCII. This is quite aggressive and uses the Unidecode library.')
-parser.add_argument('--filter-stopwords', type=file, nargs='?', metavar='<stopword_file>', default=None, const='/dev/null', help='Filter stopwords from input texts. You can specify a text file (UTF-8 encoding) containing a list of stopwords, one on each line.')
-
-group = parser.add_argument_group(description='Input/output options (if none of these are specified, defaults to STDIN and STDOUT). This script expects input to be UTF-8 or ASCII.')
-group.add_argument('-f', '--fileslist', metavar='<files_list>', type=file, default=False, help='Input is a list of files. Tokenized files will be printed to STDOUT.')
-group.add_argument('-F', '--filepairs', metavar='<file_pairs>', type=file, default=False, help='Input is a pair of input/output file on each line, separated by a single <TAB> character.')
+group = parser.add_argument_group(title='Input/output options', description='If none of these are specified, defaults to STDIN and STDOUT. This script expects input to be UTF-8 or ASCII.')
+group.add_argument('-f', '--files', metavar='<file>', type=file, nargs='+', default=[], help='Input is a list of files. Tokenized files will be printed to STDOUT.')
+group.add_argument('-l', '--fileslist', metavar='<files_list>', type=file, default=False, help='Input is a file containing a list of filenames. Tokenized files will be printed to STDOUT.')
+group.add_argument('-L', '--filepairs', metavar='<file_pairs>', type=file, default=False, help='Input is a file containing pairs of input/output files on each line, separated by a single <TAB> character.')
 group.add_argument('-D', '--io-dir', type=str, nargs=2, metavar='<dir>', help='Input a pair of directories. The files/directory structure in <input_dir> will be mirrored in <output_dir>. Non existing directories will be created.')
 group.add_argument('-d', '--input-dir', type=str, metavar='<input_dir>', help='Input is a directory of files and tokenized files will be printed to STDOUT. This is recursive.')
 
-group = parser.add_argument_group(description='Sentence splitting options')
+group = parser.add_argument_group(title='Sentence splitting options')
 group.add_argument('-P', '--no-split-paragraph', action='store_true', default=False, help='Split text by paragraphs first (i.e splitting on instances of `\\n\\n`) (default: split paragraphs).')
 group.add_argument('-S', '--no-split-sentence', action='store_true', default=False, help='Split text by sentences (using Splitta library) (default: split sentences).')
 
-group = parser.add_argument_group(description='Tag options')
+group = parser.add_argument_group(title='Tag options')
 group.add_argument('-T', '--ignore-tags', choices=['separator', 'punctuation', 'symbol', 'phone', 'time', 'date', 'url', 'email', 'number', 'money'], nargs='*', default=['separator'], help='Specify categories of tags to ignore (default: separator).')
 
-group = parser.add_argument_group(description='Normalization options')
+group = parser.add_argument_group(title='Normalization options')
 group.add_argument('-N', '--normalize-tags', choices=['phone', 'time', 'date', 'url', 'email', 'number', 'money', 'punctuation', 'symbol', 'consecutive', 'case'], nargs='*', default=['phone', 'time', 'date', 'url', 'email', 'number', 'money', 'punctuation', 'symbol', 'consecutive', 'case'], help='Specify tagged categories to normalize. Defaults to everything: [phone, time, date, url, email, number, money, punctuation, symbol, consecutive, case].')
 group.add_argument('--hyphens', choices=['keep', 'del', 'split'], default='split', help='Specify what to do with hyphenated words: keep, del or split (default: split).')
 group.add_argument('--clitics', choices=['keep', 'del', 'split'], default='del', help='Specify what to do with non-negative clitics: keep, del or split (default: del).')
 group.add_argument('--neg-clitics', choices=['keep', 'del', 'split'], default='keep', help='Specify what to do with negative clitics: keep, del or split (default: keep).')
 group.add_argument('--no-normalize-clitics-quote', action='store_true', default=False, help='Standardize the single quote used in clitics to the ASCII version (default: yes).')
-
-group = parser.add_argument_group(description='Other options')
 group.add_argument('--stemming', action='store_true', default=False, help='Apply Porter stemming to tokens.')
+group.add_argument('--strip-unicode', action='store_true', default=False, help='Strip input text of unicode and force everything to be ASCII. This is quite aggressive and uses the Unidecode library.')
+group.add_argument('--filter-stopwords', type=file, nargs='?', metavar='<stopword_file>', default=None, const='/dev/null', help='Filter stopwords from input texts. You can specify a text file (UTF-8 encoding) containing a list of stopwords, one on each line.')
+
+group = parser.add_argument_group(title='N-gram options')
 group.add_argument('--ngrams', type=int, metavar='n', nargs='*', default=[1], help='Generate n-gram terms (default: unigram only).')
-group.add_argument('--ngram-stopwords', type=str, metavar='<stopwords>', nargs='*', default=['__PUNCT__'], help='Additional stopword (on top of those specified by --filter-stopwords) to ignore when generating n-grams, for instance __PUNCT__ (default: __PUNCT__).')
 group.add_argument('--ngram-separator', type=str, metavar='<separator>', default='_', help='Separator string for n-grams (default: \'_\').')
+group.add_argument('--ngram-stopwords', type=str, metavar='<stopwords>', nargs='*', default=['__PUNCT__'], help='Additional stopwords (beyond those specified by --filter-stopwords) to ignore when generating n-grams, for instance __PUNCT__ (default: __PUNCT__). N-grams can\'t \"cross\" stopwords.')
 
 def tokenize_text(text, filter_stopwords):
   global A, normalize
   ngram_sents = []
   for sent in sents:
     ngram_sent = []
-    for n in A.ngrams: ngram_sent += ycutils.tokenize.ngram_tokens(sent, n, sep_char=A.ngram_separator, filter_stopwords=filter_stopwords | set(A.ngram_stopwords))
+    for n in A.ngrams: ngram_sent += ycutils.tokenize.ngram_tokens(sent, n, sep_char=A.ngram_separator, filter_stopwords=A.ngram_stopwords)
 
     ngram_sents.append(ngram_sent)
   #end for
 def do_document(i=None, input_f=None, output_f=None):
   global filter_stopwords
 
-  if input_f:
-    print >>sys.stderr, '{:<8d} {} reading,'.format(i, input_f),
-    start_time = time.time()
-    with codecs.open(input_f, 'r', 'utf-8') as f: text = f.read()
-  else:
-    print >>sys.stderr, 'STDIN: reading,',
-    start_time = time.time()
-    text = sys.stdin.read()
-  #end if
+  if i: print >>sys.stderr, '{:<8d} {}: reading'.format(i, input_f.name),
+  else: print >>sys.stderr, '{}: reading'.format(input_f.name),
+
+  start_time = time.time()
+  text = input_f.read()
+  text = text.lstrip(unicode(codecs.BOM_UTF8, 'utf8'))
 
   print >>sys.stderr, 'tokenizing,',
   sents = tokenize_text(text, filter_stopwords)
 normalize.append('neg-clitics-' + A.neg_clitics)
 normalize += [] if A.no_normalize_clitics_quote else ['clitics-normalize']
 
+A.ngram_stopwords = set(A.ngram_stopwords) | filter_stopwords
+print >>sys.stderr, 'Will generate {}.'.format(', '.join(['{}-grams'.format(n) for n in A.ngrams]))
+
 done_something = False
 
+if A.files:
+  done_something = True
+
+  print >>sys.stderr, 'Processing filenames from command line.'
+
+  for i, f in enumerate(A.files, start=1):
+    do_document(i, input_f=codecs.getreader('utf-8')(f))
+#end if
+
 if A.fileslist:
   done_something = True
 
   for i, line in enumerate(A.fileslist, start=1):
     line = line.strip()
     if line.startswith('#') or not line: continue
-    do_document(i, input_f=line)
+    do_document(i, input_f=codecs.open(line, 'r', 'utf-8'))
   #end for
 #end if
 
     if line.startswith('#') or not line: continue
     input_f, output_f = line.split(u'\t')
 
-    do_document(i, input_f=input_f, output_f=output_f)
+    do_document(i, input_f=codecs.open(input_f, 'r', 'utf-8'), output_f=output_f)
   #end for
 #end if
 
 
       if not os.path.exists(new_dir): os.makedirs(new_dir, 0755)
 
-      do_document(i, input_f=src_path, output_f=dst_path)
+      do_document(i, input_f=codecs.open(src_path, 'r', 'utf-8'), output_f=dst_path)
       i += 1
     #end for
   #end for
     filenames.sort()
     for fname in filenames:
       dirnames.sort()
-      do_document(i, input_f=os.path.join(dirpath, fname))
+      do_document(i, input_f=codecs.open(os.path.join(dirpath, fname), 'r', 'utf-8'))
       i += 1
     #end for
   #end for
 #end if
 
-if not done_something: do_document()
+if not done_something: do_document(input_f=sys.stdin)
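
For reference, the decode-and-strip-BOM pattern that do_document now relies on, pulled out as a standalone sketch (not part of the commit; the helper name is made up): byte-mode handles, such as those produced by argparse's type=file or sys.stdin, are wrapped with codecs.getreader('utf-8'), paths are opened with codecs.open, and a leading UTF-8 byte-order mark is dropped so it cannot end up glued to the first token.

import codecs, sys

def open_utf8(stream_or_path):
  # Hypothetical helper illustrating the pattern above (Python 2).
  if hasattr(stream_or_path, 'read'):
    f = codecs.getreader('utf-8')(stream_or_path)  # already-open byte stream
  else:
    f = codecs.open(stream_or_path, 'r', 'utf-8')  # path on disk
  text = f.read()
  # Strip a leading BOM, if any, before tokenization.
  return text.lstrip(unicode(codecs.BOM_UTF8, 'utf8'))

if __name__ == '__main__':
  print >>sys.stderr, 'read {} characters from STDIN'.format(len(open_utf8(sys.stdin)))

With this commit the same pattern covers every input mode above, e.g. an invocation like tokenize-docs.py -f doc1.txt doc2.txt (filenames hypothetical) tokenizes each named file and prints it to STDOUT prefixed with a line of the form # <filename>.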