# 1. Lars Yencken
# 2. gpalign-py
#
# gpalign-py / src / align.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#  align.py
#  gpalign
#  Created by Lars Yencken on 2005-05-14.
#  Copyright 2005-2010 Lars Yencken. All rights reserved.

"This module performs pure segmentation and alignment only."

import sys
import optparse

from gpalign import potentials
from gpalign import dictionary
from gpalign.alignment import AlignmentModel
from gpalign.reading_model import ReadingModel
from gpalign import evaluate

def perform_segmentation(input_file, output_file, options):
    """ The main method for this module. Performs the entire segmentation run,
        taking an edict dictionary as input and producing a segmented output
        for each kanji input row.
    # read in edict dictionary
    if not options.edict:
        print 'Reading evaluation entries'
        entries, num_rejected = dictionary.evaluation_entries(input_file)
        print 'Reading edict entries'
        entries, num_rejected = dictionary.edict_entries(input_file)

    print '--> Found %d entries (rejected %d)' % (len(entries), num_rejected)

    print 'Separating long and short entries'
    short_entries, long_entries = dictionary.separate_entries(entries,
    print '--> %d short, %d long' % (len(short_entries), len(long_entries))

    alignment_model = AlignmentModel(output_file, options)

    if options.use_kanjidic:
        reading_model = ReadingModel()
        reading_model = None

    print 'PASS 1: SHORT ENTRIES'
    _resolve_entries(alignment_model, reading_model, short_entries, options)

    print 'PASS 2: LONG ENTRIES'
    _resolve_entries(alignment_model, reading_model, long_entries, options)


    if not options.edict:
        evaluate.evaluate_alignment(output_file, output_file + '.eval')


def _resolve_entries(model, reading_model, entries, options):
    print 'Generating possible alignments'
    unique, ambiguous = potentials.generate_alignments(entries, options)
    print '--> %d unique, %d ambiguous' % (len(unique), len(ambiguous))
    print '--> %d overconstrained' % \
            (len(entries) - (len(unique) + len(ambiguous)))

    if options.use_kanjidic:
        print 'Disambiguating using kanjidic'
        more_unique, ambiguous = reading_model.prune_alignments(ambiguous)
        print '--> %d unique, %d ambiguous' % (len(more_unique),
        unique.extend(more_unique); del more_unique

    print 'Disambiguating readings using statistical model'
    print '--> Processing %d unique entries' % len(unique)
    model.add_resolved(unique); del unique
    print '--> Beginning statistical disambiguation of %d entries' % \
    model.disambiguate(ambiguous); del ambiguous


def create_option_parser():
    """ Creates an option parser instance to handle command-line options.
    """
    usage = \
"""%prog [options] input_file output_file

An efficient implementation of the Baldwin-Tanaka automated grapheme-phoneme
alignment algorithm based on TF-IDF. By default, it uses an evaluation data set
as input, and prints an accuracy analysis for the score."""

    parser = optparse.OptionParser(usage)

    parser.add_option('--max-per-kanji', action='store', dest='max_per_kanji',
            type='int', default=5,
            help='The maximum number of kana aligned to one kanji [5]')

    parser.add_option('--no-kanjidic', action='store_false',
            dest='use_kanjidic', default=True,
            help='Disables the kanjidic reading model')

    parser.add_option('--idf-only', action='store_false', dest='tf_heuristic',
            default=True, help='Only uses the idf heuristic [False]')

    parser.add_option('--tf-only', action='store_false', dest='idf_heuristic',
            default=True, help='Only uses the tf heuristic [False]')

    parser.add_option('--random', action='store_true', dest='random',
            help='Choose a random entry to disambiguate each time [False]')

    parser.add_option('--longest-run', action='store', dest='longest_run',
            type='int', default=4,
            help='The longest kanji run to be handled in the first pass [4]')

    parser.add_option('--edict', action='store_true',
            dest='edict', help='Use the EDICT dictionary as input [False]')

    parser.add_option('-a', '--alpha', action='store', dest='alpha',
            default=2.5, type='float',
            help='The smoothing value to use in tf-idf [2.5]')

    parser.add_option('-s', '--solved', action='store', dest='solved',
            default=0.07, type='float',
            help='The weight of solved frequencies in the tf-idf equation [0.07]')

    parser.add_option('-m', '--max-potentials', action='store',
            dest='max_potentials', type='int', default=120,
            help='The maximum number of potential alignments for an entry [120]')

    parser.add_option('--non-iterative', action='store_false',
            dest='iterative', default=True,
            help='Disables iterative alignment, instead taking one pass [False]')

    parser.add_option('-u', '--unsolved', action='store', dest='unsolved',
            default=0.13, type='float',
            help='The weight of unsolved frequencies in the tf-idf equation [0.13]')

    parser.add_option('--dump-model', action='store', dest='model_output',
            help="At the end of alignment, dump the model " + \
            "generated to the given file.")

    parser.add_option('--use-model', action='store', dest='model_input',
            help="Instead of generating a model, use this one.")

    return parser


def main(argv):
    """ The main method for this module. Parses command-line options from
        argv and runs the segmentation over the given input and output files.
    """
    parser = create_option_parser()
    (options, args) = parser.parse_args(argv)

    if len(args) != 2:
        # exactly one input file and one output file are required
        parser.print_help()
        sys.exit(1)

    input_file, output_file = args

    if options.random:
        # random selection replaces both tf and idf heuristics
        options.tf_heuristic = False
        options.idf_heuristic = False

    perform_segmentation(input_file, output_file, options)


if __name__ == '__main__':
    try:
        main(sys.argv[1:])
    except KeyboardInterrupt:
        # we cancel runs often, so do it nicely
        print >> sys.stderr, '\nAborting run!'
        sys.exit(1)