Commits

Lars Yencken committed 1b55709

Allows one-shot evaluation and multiple input formats.

Comments (0)

Files changed (1)

 
 "This module performs pure segmentation and alignment only."
 
+import os
 import sys
 import optparse
+import warnings
 
 from gpalign import potentials
 from gpalign import dictionary
         for each kanji input row.
     """
     # read in edict dictionary
-    if not options.edict:
+    format = options.format
+    if format == 'simple':
         print 'Reading evaluation entries'
         entries, num_rejected = dictionary.evaluation_entries(input_file)
-    else:
+    elif format == 'edict':
         print 'Reading edict entries'
         entries, num_rejected = dictionary.edict_entries(input_file)
+    else:
+        raise Exception('unknown format: %s' % format)
 
     print '--> Found %d entries (rejected %d)' % (len(entries), num_rejected)
 
 
     alignment_model.finish()
 
-    if not options.edict:
+    if options.evaluate:
         evaluate.evaluate_alignment(output_file, output_file + '.eval')
 
 #----------------------------------------------------------------------------#
     """
     usage = \
 """%prog [options] input_file output_file
+       %prog [options] --evaluate
 
 An efficient implementation of the Baldwin-Tanaka automated grapheme-phoneme
-alignment algorithm based on TF-IDF. By default, it uses an evaluation data set
-as input, and prints an accuracy analysis for the score."""
+alignment algorithm based on TF-IDF. If passed --evaluate, it uses a bundled 
+evaluation data set as input, and prints an accuracy analysis for the 
+score."""
 
     parser = optparse.OptionParser(usage)
 
             type='int', default=4,
             help='The longest kanji run to be handled in the first pass [4]')
 
-    parser.add_option('--edict', action='store_true',
-            dest='edict', help='Use the EDICT dictionary as input [False]')
+    parser.add_option('--format', action='store', dest='format',
+            default='simple', 
+            help='The format of the input file [simple]/edict')
+    
+    parser.add_option('--evaluate', action='store_true', dest='evaluate',
+            help='Perform a run against the evaluation data.')
 
     parser.add_option('-a', '--alpha', action='store', dest='alpha',
             default=2.5, type='float',
 #----------------------------------------------------------------------------#
 
 def main(argv):
-    """ The main method for this module.
-    """
     parser = create_option_parser()
     (options, args) = parser.parse_args(argv)
 
-    if len(args) != 2:
-        parser.print_help()
-        sys.exit(1)
-
-    input_file, output_file = args
-
     if options.random:
         options.tf_heuristic = False
         options.idf_heuristic = False
-
+    
+    if (options.evaluate and args) \
+            or (not options.evaluate and len(args) != 2) \
+            or options.format not in ('simple', 'edict'):
+        parser.print_help()
+        sys.exit(1)
+    
+    if options.evaluate:
+        input_file = os.path.join(os.path.dirname(__file__), 'data',
+                'eval-alignment.data')
+        with warnings.catch_warnings():
+            warnings.simplefilter('ignore')
+            output_file = os.tempnam()    
+    else:
+        input_file, output_file = args
+    
     perform_segmentation(input_file, output_file, options)
+    
+    if options.evaluate:
+        os.remove(output_file)
 
 #----------------------------------------------------------------------------#
 
         sys.exit(1)
 
 #----------------------------------------------------------------------------#
-