Commits

yanchuan sim committed 907ca7f

fail on bad unicode data

Files changed (1)

scripts/tokenize-docs.py

   else: print >>sys.stderr, '{}: reading'.format(input_f.name),
 
   start_time = time.time()
-  text = input_f.read()
+  try:
+    text = input_f.read()
+  except UnicodeDecodeError:
+    print >>sys.stderr, '\nNot encoded in UTF-8 or ASCII: {}'.format(input_f.name)
+    print >>sys.stderr, 'Exiting.'
+    sys.exit(-1)
+  #end try
   text = text.lstrip(unicode(codecs.BOM_UTF8, 'utf8'))
 
   print >>sys.stderr, 'tokenizing,',
   i = 1
   for (dirpath, dirnames, filenames) in os.walk(input_dir):
     filenames.sort()
+    dirnames.sort()
     for fname in filenames:
-      dirnames.sort()
-
       src_path = os.path.join(dirpath, fname)
       rel_path = os.path.relpath(src_path, input_dir)
       new_dir = os.path.join(output_dir, os.path.dirname(rel_path))
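The diff does not show how input_f is opened; for the new except clause to fire, it has to be a unicode-aware handle, for example one returned by codecs.open(..., encoding='utf-8'). The following is a minimal standalone sketch of the pattern this commit adds, with the filename 'example.txt' and the codecs.open() call assumed purely for illustration (Python 2, like the script):

    import codecs
    import sys

    # Assumption: the script opens its input as a decoding stream; 'example.txt'
    # is a placeholder filename.
    input_f = codecs.open('example.txt', 'r', encoding='utf-8')
    try:
        # .read() on a decoding stream raises UnicodeDecodeError on bytes
        # that are not valid UTF-8, instead of returning garbage text.
        text = input_f.read()
    except UnicodeDecodeError:
        print >>sys.stderr, '\nNot encoded in UTF-8 or ASCII: {}'.format(input_f.name)
        print >>sys.stderr, 'Exiting.'
        sys.exit(-1)
    # Strip a leading BOM, as the surrounding script does.
    text = text.lstrip(unicode(codecs.BOM_UTF8, 'utf8'))

The second hunk moves dirnames.sort() out of the per-file loop and next to filenames.sort(), so each directory's subdirectory list is sorted in place exactly once, and is sorted even when the directory contains no files. Because os.walk descends using the (mutated) dirnames list, this keeps the traversal order deterministic.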