#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
#  gpalign-py / src /  (full commit)
#
#  gpalign
#  Created by Lars Yencken on 2005-05-14.
#  Copyright 2005-2010 Lars Yencken. All rights reserved.

"This module performs pure segmentation and alignment only."

import os
import sys
import optparse
import warnings

from gpalign import potentials
from gpalign import dictionary
from gpalign.alignment import AlignmentModel
from gpalign.reading_model import ReadingModel
from gpalign import evaluate

def perform_segmentation(input_file, output_file, options):
    """ The main method for this module. Performs the entire segmentation run,
        taking a dictionary file as input and producing a segmented output
        for each kanji input row.

        input_file  -- path of the dictionary to read; parsed according to
                       options.format ('simple' or 'edict').
        output_file -- path handed to AlignmentModel; when options.evaluate
                       is set, the evaluation is written to
                       output_file + '.eval'.
        options     -- the parsed command-line options.

        Raises an Exception if options.format is not a known format.
    """
    # read in the dictionary, using the reader that matches the input format
    input_format = options.format
    if input_format == 'simple':
        print('Reading evaluation entries')
        entries, num_rejected = dictionary.evaluation_entries(input_file)
    elif input_format == 'edict':
        print('Reading edict entries')
        entries, num_rejected = dictionary.edict_entries(input_file)
    else:
        raise Exception('unknown format: %s' % input_format)

    print('--> Found %d entries (rejected %d)' % (len(entries), num_rejected))

    print('Separating long and short entries')
    # NOTE(review): the second argument was truncated in the source;
    # options.longest_run ("longest kanji run handled in the first pass")
    # is the assumed threshold -- confirm against
    # dictionary.separate_entries.
    short_entries, long_entries = dictionary.separate_entries(entries,
            options.longest_run)
    print('--> %d short, %d long' % (len(short_entries), len(long_entries)))

    alignment_model = AlignmentModel(output_file, options)

    # the reading model is only built when kanjidic pruning is enabled
    if options.use_kanjidic:
        reading_model = ReadingModel()
    else:
        reading_model = None

    print('PASS 1: SHORT ENTRIES')
    _resolve_entries(alignment_model, reading_model, short_entries, options)

    print('PASS 2: LONG ENTRIES')
    _resolve_entries(alignment_model, reading_model, long_entries, options)

    # NOTE(review): if AlignmentModel buffers its output, it may need to be
    # flushed/closed before the evaluation below reads output_file -- confirm.
    if options.evaluate:
        evaluate.evaluate_alignment(output_file, output_file + '.eval')


def _resolve_entries(model, reading_model, entries, options):
    """ Runs a single alignment pass over the given entries.

        Candidate alignments are generated for every entry. Entries with
        exactly one candidate are passed to model.add_resolved(); the
        remaining ambiguous ones are first pruned by the reading model (when
        options.use_kanjidic is set) and then passed to model.disambiguate().

        model         -- the alignment model updated by this pass.
        reading_model -- object providing prune_alignments(); only used when
                         options.use_kanjidic is set.
        entries       -- the dictionary entries to align.
        options       -- the parsed command-line options.
    """
    print('Generating possible alignments')
    unique, ambiguous = potentials.generate_alignments(entries, options)
    print('--> %d unique, %d ambiguous' % (len(unique), len(ambiguous)))
    # whatever is neither unique nor ambiguous yielded no candidate at all
    num_overconstrained = len(entries) - (len(unique) + len(ambiguous))
    print('--> %d overconstrained' % num_overconstrained)

    if options.use_kanjidic:
        print('Disambiguating using kanjidic')
        more_unique, ambiguous = reading_model.prune_alignments(ambiguous)
        print('--> %d unique, %d ambiguous' % (len(more_unique),
                len(ambiguous)))
        unique.extend(more_unique)
        del more_unique

    print('Disambiguating readings using statistical model')
    print('--> Processing %d unique entries' % len(unique))
    model.add_resolved(unique)
    # drop our reference early; the entry lists can be large
    del unique

    print('--> Beginning statistical disambiguation of %d entries' %
            len(ambiguous))
    model.disambiguate(ambiguous)
    del ambiguous


def create_option_parser():
    """ Creates an option parser instance to handle command-line options.

        Returns an optparse.OptionParser configured with every option this
        script understands; positional arguments are left to the caller.
    """
    # NOTE(review): the final sentence of the usage text was truncated in
    # the source; "resulting alignment." is an assumed ending -- confirm.
    usage = \
"""%prog [options] input_file output_file
       %prog [options] --evaluate

An efficient implementation of the Baldwin-Tanaka automated grapheme-phoneme
alignment algorithm based on TF-IDF. If passed --evaluate, it uses a bundled 
evaluation data set as input, and prints an accuracy analysis for the 
resulting alignment."""

    parser = optparse.OptionParser(usage)

    parser.add_option('--max-per-kanji', action='store', dest='max_per_kanji',
            type='int', default=5,
            help='The maximum number of kana aligned to one kanji [5]')

    parser.add_option('--no-kanjidic', action='store_false',
            dest='use_kanjidic', default=True,
            help='Disables the kanjidic reading model')

    # --idf-only switches the tf heuristic OFF (and vice versa below), hence
    # the crossed option/dest names
    parser.add_option('--idf-only', action='store_false', dest='tf_heuristic',
            default=True, help='Only uses the idf heuristic [False]')

    parser.add_option('--tf-only', action='store_false', dest='idf_heuristic',
            default=True, help='Only uses the tf heuristic [False]')

    parser.add_option('--random', action='store_true', dest='random',
            help='Choose a random entry to disambiguate each time [False]')

    parser.add_option('--longest-run', action='store', dest='longest_run',
            type='int', default=4,
            help='The longest kanji run to be handled in the first pass [4]')

    # 'simple' is the documented default; without it options.format would be
    # None and every run lacking an explicit --format would be rejected
    parser.add_option('--format', action='store', dest='format',
            default='simple',
            help='The format of the input file [simple]/edict')

    parser.add_option('--evaluate', action='store_true', dest='evaluate',
            help='Perform a run against the evaluation data.')

    parser.add_option('-a', '--alpha', action='store', dest='alpha',
            default=2.5, type='float',
            help='The smoothing value to use in tf-idf [2.5]')

    parser.add_option('-s', '--solved', action='store', dest='solved',
            default=0.07, type='float',
            help='The weight of solved frequencies in the tf-idf equation [0.07]')

    parser.add_option('-m', '--max-potentials', action='store',
            dest='max_potentials', type='int', default=120,
            help='The maximum number of potential alignments for an entry [120]')

    parser.add_option('--non-iterative', action='store_false',
            dest='iterative', default=True,
            help='Disables iterative alignment, instead taking one pass [False]')

    parser.add_option('-u', '--unsolved', action='store', dest='unsolved',
            default=0.13, type='float',
            help='The weight of unsolved frequencies in the tf-idf equation [0.13]')

    parser.add_option('--dump-model', action='store', dest='model_output',
            help="At the end of alignment, dump the model " + \
            "generated to the given file.")

    parser.add_option('--use-model', action='store', dest='model_input',
            help="Instead of generating a model, use this one.")

    return parser


def main(argv):
    """ Command-line entry point.

        argv -- the command-line arguments WITHOUT the program name; the
                positional-argument count check below relies on this.

        Prints the help text and exits if the arguments are inconsistent:
        --evaluate takes no positional arguments, a normal run takes exactly
        an input and an output file, and --format must be 'simple' or
        'edict'.
    """
    parser = create_option_parser()
    (options, args) = parser.parse_args(argv)

    # random selection replaces both scoring heuristics
    if options.random:
        options.tf_heuristic = False
        options.idf_heuristic = False

    if (options.evaluate and args) \
            or (not options.evaluate and len(args) != 2) \
            or options.format not in ('simple', 'edict'):
        # NOTE(review): this branch body was missing from the source; the
        # exit status of 1 is assumed.
        parser.print_help()
        sys.exit(1)

    if options.evaluate:
        # NOTE(review): the data file name was truncated in the source;
        # 'eval-alignment.data' is assumed -- confirm against the bundled
        # data directory.
        input_file = os.path.join(os.path.dirname(__file__), 'data',
                'eval-alignment.data')
        with warnings.catch_warnings():
            # os.tempnam() emits a RuntimeWarning about its insecurity;
            # silence it for this throw-away file name only
            warnings.simplefilter('ignore')
            output_file = os.tempnam()
    else:
        input_file, output_file = args

    perform_segmentation(input_file, output_file, options)

    if options.evaluate:
        # NOTE(review): this branch body was missing from the source;
        # removing the temporary output file is the assumed intent.
        if os.path.exists(output_file):
            os.remove(output_file)


if __name__ == '__main__':
    try:
        # main() expects the arguments without the program name
        main(sys.argv[1:])
    except KeyboardInterrupt:
        # we cancel runs often, so do it nicely
        sys.stderr.write('\nAborting run!\n')