Source

gpalign-py / src / segment.py

Full commit
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# 
#  segment.py
#  gpalign
#  
#  Created by Lars Yencken on 2005-05-14.
#  Copyright 2005-2010 Lars Yencken. All rights reserved.
# 

""" This module is an executable script performing grapheme-phoneme alignment
    based on papers by Baldwin and Tanaka.
"""

#----------------------------------------------------------------------------#

import os, sys
import optparse

import potentials
import dictionary
from alignment import AlignmentModel
from reading_model import ReadingModel
from okurigana_model import OkuriganaModel
import evaluate
import settings

#----------------------------------------------------------------------------#

def perform_segmentation(output_file, options):
    """ Runs the whole segmentation pipeline, writing results to output_file.

        Entries are loaded from settings.DATA_DIR: the edict dictionary when
        options.edict is set, otherwise the evaluation data. They are split
        into short and long entries using options.longest_run, and each
        group is resolved in its own pass against a shared alignment model.
        Okurigana adjustments are then applied to output_file, and for
        non-edict runs the evaluation is run, using output_file + '.eval'.
    """
    # choose the entry source for this run
    if options.edict:
        print('Reading edict entries')
        edict_path = os.path.join(settings.DATA_DIR, 'edict.bz2')
        entries, num_rejected = dictionary.edict_entries(edict_path)
    else:
        print('Reading evaluation entries')
        eval_path = os.path.join(settings.DATA_DIR, 'evaluation.data')
        entries, num_rejected = dictionary.evaluation_entries(eval_path)
    print('--> Found %d entries (rejected %d)' % (len(entries), num_rejected))

    print('Separating long and short entries')
    short_entries, long_entries = dictionary.separate_entries(
            entries, options.longest_run)
    print('--> %d short, %d long' % (len(short_entries), len(long_entries)))

    alignment_model = AlignmentModel(output_file, options)

    # without kanjidic there is no reading model and no okurigana seed data
    reading_model = None
    kanjidic_okurigana = {}
    if options.use_kanjidic:
        reading_model = ReadingModel()
        kanjidic_okurigana = reading_model.get_okurigana()

    # both passes feed the same alignment model, short entries first
    print('PASS 1: SHORT ENTRIES')
    _resolve_entries(alignment_model, reading_model, short_entries, options)

    print('PASS 2: LONG ENTRIES')
    _resolve_entries(alignment_model, reading_model, long_entries, options)

    alignment_model.finish()

    okurigana_model = OkuriganaModel(kanjidic_okurigana, options)
    okurigana_model.okurigana_adjustments(output_file)

    # only non-edict (evaluation data) runs are evaluated
    if not options.edict:
        eval_file = output_file + '.eval'
        evaluate.main([output_file, eval_file])

#----------------------------------------------------------------------------#

def _resolve_entries(model, reading_model, entries, options):
    """ Resolves one batch of entries against the alignment model.

        Potential alignments are generated for the entries and split into
        unique and ambiguous ones. When options.use_kanjidic is set, the
        reading model prunes the ambiguous set, and anything it resolves is
        added to the unique list. Unique alignments are then added to the
        model as resolved, and the model disambiguates the remainder.
    """
    print('Generating possible alignments')
    unique, ambiguous = potentials.generate_alignments(entries, options)
    # entries in neither list are reported as overconstrained
    num_overconstrained = len(entries) - len(unique) - len(ambiguous)
    print('--> %d unique, %d ambiguous' % (len(unique), len(ambiguous)))
    print('--> %d overconstrained' % num_overconstrained)

    if options.use_kanjidic:
        print('Disambiguating using kanjidic')
        pruned = reading_model.prune_alignments(ambiguous)
        more_unique, ambiguous = pruned
        print('--> %d unique, %d ambiguous' % (
                len(more_unique), len(ambiguous)))
        unique.extend(more_unique)

    print('Disambiguating readings using statistical model')
    print('--> Processing %d unique entries' % len(unique))
    model.add_resolved(unique)

    num_ambiguous = len(ambiguous)
    print('--> Beginning statistical disambiguation of %d entries' %
            num_ambiguous)
    model.disambiguate(ambiguous)

#----------------------------------------------------------------------------#

#----------------------------------------------------------------------------#
# COMMAND-LINE INTERFACE
#

def create_option_parser():
    """ Builds the optparse parser for the command-line interface.

        The script takes a single positional argument, the output file;
        main() rejects any other number of positional arguments, so the
        usage string advertises only that one.

        Returns the configured optparse.OptionParser.
    """
    usage = \
"""%prog [options] output_file

An efficient implementation of the Baldwin-Tanaka automated grapheme-phoneme
alignment algorithm based on TF-IDF."""

    parser = optparse.OptionParser(usage)

    parser.add_option('--max-per-kanji', action='store', dest='max_per_kanji',
            type='int', default=5,
            help='The maximum number of kana aligned to one kanji [5]')

    parser.add_option('--no-kanjidic', action='store_false',
            dest='use_kanjidic', default=True,
            help='Disables the kanjidic reading model')

    # --idf-only and --tf-only each work by switching off the other heuristic
    parser.add_option('--idf-only', action='store_false', dest='tf_heuristic',
            default=True, help='Only uses the idf heuristic [False]')

    parser.add_option('--tf-only', action='store_false', dest='idf_heuristic',
            default=True, help='Only uses the tf heuristic [False]')

    # explicit default so the value is False, as documented, rather than None
    parser.add_option('--random', action='store_true', dest='random',
            default=False,
            help='Choose a random entry to disambiguate each time [False]')

    parser.add_option('--longest-run', action='store', dest='longest_run',
            type='int', default=4,
            help='The longest kanji run to be handled in the first pass [4]')

    # explicit default so the value is False, as documented, rather than None
    parser.add_option('--edict', action='store_true',
            dest='edict', default=False,
            help='Indicates an edict run [False]')

    parser.add_option('-a', '--alpha', action='store', dest='alpha',
            default=2.5, type='float',
            help='The smoothing value to use in tf-idf [2.5]')

    parser.add_option('-s', '--solved', action='store', dest='solved',
            default=0.07, type='float',
            help='The weight of solved frequencies in the tf-idf equation [0.07]')

    parser.add_option('-m', '--max-potentials', action='store',
            dest='max_potentials', type='int', default=120,
            help='The maximum number of potential alignments for an entry [120]')

    parser.add_option('-u', '--unsolved', action='store', dest='unsolved',
            default=0.13, type='float',
            help='The weight of unsolved frequencies in the tf-idf equation [0.13]')

    parser.add_option('-o', '--okurigana', action='store',
            dest='okuri_threshold', type='int', default=1,
            help='The threshold used for cooccurrence-based okurigana')

    parser.add_option('--simple-okurigana', action='store_true',
            dest='simple_okurigana', default=False,
            help='Use a simple okurigana method, ignoring the main model')

    return parser

#----------------------------------------------------------------------------#

def main(argv):
    """ Command-line entry point: parses argv and runs the segmentation.

        argv is the argument list without the program name. Exactly one
        positional argument, the output file, is expected; for any other
        count the help text is printed and the process exits with status 1.
    """
    option_parser = create_option_parser()
    options, positional = option_parser.parse_args(argv)

    if len(positional) != 1:
        option_parser.print_help()
        sys.exit(1)

    (output_file,) = positional

    # random selection switches off both the tf and idf heuristics
    if options.random:
        options.tf_heuristic = options.idf_heuristic = False

    perform_segmentation(output_file, options)

#----------------------------------------------------------------------------#

if __name__ == '__main__':
    try:
        main(sys.argv[1:])
    except KeyboardInterrupt:
        # runs are cancelled often, so exit with a short message instead of
        # a traceback
        sys.stderr.write('\nAborting run!\n')
        sys.exit(1)