Lars Yencken / gpalign-py

Commits

Lars Yencken committed 3e5c4f8

Major cleanup of codebase.

- Converts all method, variable, and module names to underscore_notation (see the sketch after this list).
- Removes some redundant method calls.
- Cleans up modules based on pyflakes output.
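To illustrate the first point, here is a minimal sketch of the naming convention change (PEP 8 underscore style); the function and variable names are hypothetical, not taken from the diff below:

    # Before the cleanup: mixedCase identifiers
    def countRejectedEntries(entryList):
        numRejected = sum(1 for entry in entryList if entry is None)
        return numRejected

    # After the cleanup: underscore_notation
    def count_rejected_entries(entry_list):
        num_rejected = sum(1 for entry in entry_list if entry is None)
        return num_rejected

Module names change the same way, e.g. readingModel becomes reading_model, as seen in the imports in src/align.py below.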

  • Parent commit: e8613ca
  • Branch: default

Comments (0)

Files changed (26)

File src/align.py

 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
-#----------------------------------------------------------------------------#
-# align.py
-# Lars Yencken <lars.yencken@gmail.com>
-# vim: ts=4 sw=4 sts=4 et tw=78:
-# Sat May 14 14:49:45 EST 2005
-#
-#----------------------------------------------------------------------------#
+# 
+#  align.py
+#  gpalign
+#  
+#  Created by Lars Yencken on 2005-05-14.
+#  Copyright 2005-2010 Lars Yencken. All rights reserved.
+# 
 
-""" This module performs pure segmentation and alignment only.
-"""
 
-#----------------------------------------------------------------------------#
+"This module performs pure segmentation and alignment only."
 
 import sys
 import optparse
 from gpalign import potentials
 from gpalign import dictionary
 from gpalign.alignment import AlignmentModel
-from gpalign.readingModel import ReadingModel
+from gpalign.reading_model import ReadingModel
 from gpalign import evaluate
 
-#----------------------------------------------------------------------------#
-
-def performSegmentation(inputFile, outputFile, options):
+def perform_segmentation(input_file, output_file, options):
     """ The main method for this module. Performs the entire segmentation run,
         taking an edict dictionary as input and producing a segmented output
         for each kanji input row.
     # read in edict dictionary
     if not options.edict:
         print 'Reading evaluation entries'
-        entries, numRejected = dictionary.evaluationEntries(inputFile)
+        entries, num_rejected = dictionary.evaluation_entries(input_file)
     else:
         print 'Reading edict entries'
-        entries, numRejected = dictionary.edictEntries(inputFile)
+        entries, num_rejected = dictionary.edict_entries(input_file)
 
-    print '--> Found %d entries (rejected %d)' % (len(entries), numRejected)
+    print '--> Found %d entries (rejected %d)' % (len(entries), num_rejected)
 
     print 'Separating long and short entries'
-    shortEntries, longEntries = dictionary.separateEntries(entries,
-            options.longestRun)
-    print '--> %d short, %d long' % (len(shortEntries), len(longEntries))
+    short_entries, long_entries = dictionary.separate_entries(entries,
+            options.longest_run)
+    print '--> %d short, %d long' % (len(short_entries), len(long_entries))
 
-    alignmentModel = AlignmentModel(outputFile, options)
+    alignment_model = AlignmentModel(output_file, options)
 
-    if options.useKanjidic:
-        readingModel = ReadingModel()
-        kanjidicOkurigana = readingModel.getOkurigana()
+    if options.use_kanjidic:
+        reading_model = ReadingModel()
     else:
-        readingModel = None
-        kanjidicOkurigana = {}
+        reading_model = None
 
     print 'PASS 1: SHORT ENTRIES'
-    _resolveEntries(alignmentModel, readingModel, shortEntries, options)
-    del shortEntries
+    _resolve_entries(alignment_model, reading_model, short_entries, options)
+    del short_entries
 
     print 'PASS 2: LONG ENTRIES'
-    _resolveEntries(alignmentModel, readingModel, longEntries, options)
-    del longEntries
+    _resolve_entries(alignment_model, reading_model, long_entries, options)
+    del long_entries
 
-    del readingModel
+    del reading_model
 
-    alignmentModel.finish()
-    del alignmentModel
+    alignment_model.finish()
+    del alignment_model
 
     if not options.edict:
-        evaluate.evaluateAlignment(outputFile, outputFile + '.eval')
+        evaluate.evaluate_alignment(output_file, output_file + '.eval')
 
     return
 
 #----------------------------------------------------------------------------#
 
-def _resolveEntries(model, readingModel, entries, options):
-    """ 
-    """
+def _resolve_entries(model, reading_model, entries, options):
     print 'Generating possible alignments'
-    unique, ambiguous = potentials.generateAlignments(entries, options)
+    unique, ambiguous = potentials.generate_alignments(entries, options)
     print '--> %d unique, %d ambiguous' % (len(unique), len(ambiguous))
     print '--> %d overconstrained' % \
             (len(entries) - (len(unique) + len(ambiguous)))
 
-    if options.useKanjidic:
+    if options.use_kanjidic:
         print 'Disambiguating using kanjidic'
-        moreUnique, ambiguous = readingModel.pruneAlignments(ambiguous)
-        print '--> %d unique, %d ambiguous' % (len(moreUnique), len(ambiguous))
-        unique.extend(moreUnique); del moreUnique
+        more_unique, ambiguous = reading_model.prune_alignments(ambiguous)
+        print '--> %d unique, %d ambiguous' % (len(more_unique),
+                len(ambiguous))
+        unique.extend(more_unique); del more_unique
 
     print 'Disambiguating readings using statistical model'
     print '--> Processing %d unique entries' % len(unique)
-    model.addResolved(unique); del unique
+    model.add_resolved(unique); del unique
     print '--> Beginning statistical disambiguation of %d entries' % \
             len(ambiguous)
     model.disambiguate(ambiguous); del ambiguous
 
-    return
-
-#----------------------------------------------------------------------------#
-
-
 #----------------------------------------------------------------------------#
 # COMMAND-LINE INTERFACE
 #
 
-def createOptionParser():
+def create_option_parser():
     """ Creates an option parser instance to handle command-line options.
     """
     usage = \
-"""%prog [options] inputFile outputFile
+"""%prog [options] input_file output_file
 
 An efficient implementation of the Baldwin-Tanaka automated grapheme-phoneme
 alignment algorithm based on TF-IDF. By default, it uses an evaluation data set
 
     parser = optparse.OptionParser(usage)
 
-    parser.add_option('--max-per-kanji', action='store', dest='maxPerKanji',
+    parser.add_option('--max-per-kanji', action='store', dest='max_per_kanji',
             type='int', default=5,
             help='The maximum number of kana aligned to one kanji [5]')
 
     parser.add_option('--no-kanjidic', action='store_false',
-            dest='useKanjidic', default=True,
+            dest='use_kanjidic', default=True,
             help='Disables the kanjidic reading model')
 
-    parser.add_option('--idf-only', action='store_false', dest='tfHeuristic',
+    parser.add_option('--idf-only', action='store_false', dest='tf_heuristic',
             default=True, help='Only uses the idf heuristic [False]')
 
-    parser.add_option('--tf-only', action='store_false', dest='idfHeuristic',
+    parser.add_option('--tf-only', action='store_false', dest='idf_heuristic',
             default=True, help='Only uses the tf heuristic [False]')
 
     parser.add_option('--random', action='store_true', dest='random',
             help='Choose a random entry to disambiguate each time [False]')
 
-    parser.add_option('--longest-run', action='store', dest='longestRun',
+    parser.add_option('--longest-run', action='store', dest='longest_run',
             type='int', default=4,
             help='The longest kanji run to be handled in the first pass [4]')
 
             help='The weight of solved frequencies in the tf-idf equation [0.07]')
 
     parser.add_option('-m', '--max-potentials', action='store',
-            dest='maxPotentials', type='int', default=120,
+            dest='max_potentials', type='int', default=120,
             help='The maximum number of potential alignments for an entry [120]')
 
     parser.add_option('--non-iterative', action='store_false',
             default=0.13, type='float',
             help='The weight of unsolved frequencies in the tf-idf equation [0.13]')
 
-    parser.add_option('--dump-model', action='store', dest='modelOutput',
+    parser.add_option('--dump-model', action='store', dest='model_output',
             help="At the end of alignment, dump the model " + \
             "generated to the given file.")
 
-    parser.add_option('--use-model', action='store', dest='modelInput',
+    parser.add_option('--use-model', action='store', dest='model_input',
             help="Instead of generating a model, use this one.")
 
     return parser
 def main(argv):
     """ The main method for this module.
     """
-    parser = createOptionParser()
+    parser = create_option_parser()
     (options, args) = parser.parse_args(argv)
 
     if len(args) != 2:
         parser.print_help()
         sys.exit(1)
 
-    inputFile, outputFile = args
+    input_file, output_file = args
 
     if options.random:
-        options.tfHeuristic = False
-        options.idfHeuristic = False
+        options.tf_heuristic = False
+        options.idf_heuristic = False
 
-    performSegmentation(inputFile, outputFile, options)
-
-    return
+    perform_segmentation(input_file, output_file, options)
 
 #----------------------------------------------------------------------------#
 
 if __name__ == '__main__':
     try:
-        import psyco
-        psyco.profile()
-    except:
-        pass
-
-    try:
         main(sys.argv[1:])
     except KeyboardInterrupt:
         # we cancel runs often, so do it nicely

File src/alignment.py

 # -*- coding: utf-8 -*-
-#----------------------------------------------------------------------------#
-# alignment.py
-# Lars Yencken <lars.yencken@gmail.com>
-# vim: ts=4 sw=4 sts=4 et tw=78:
-# Mon May 16 11:24:31 EST 2005
-#
-#----------------------------------------------------------------------------#
+# 
+#  alignment.py
+#  gpalign
+#  
+#  Created by Lars Yencken on 2005-05-16.
+#  Copyright 2005-2010 Lars Yencken. All rights reserved.
+# 
 
-""" This module implements the iterative TF-IDF method.
-"""
-
-#----------------------------------------------------------------------------#
+"This module implements the iterative TF-IDF alignment method."
 
 import potentials
 from frequency import FrequencyMap
 import random
 import cPickle as pickle
 
-#----------------------------------------------------------------------------#
-
 # epsilon for testing for zero
 eps = 1e-8
 
-#----------------------------------------------------------------------------#
-
 class AlignmentModel:
     """ This class is responsible for the alignment algorithm, and all its
         internal data structures.
     """
-    #------------------------------------------------------------------------#
-    # PUBLIC METHODS
-    #
-
-    def __init__(self, outputFile, options):
+    def __init__(self, output_file, options):
         """ Creates a new instance using the list of correctly aligned
             readings.
         """
         print 'Initialising alignment model'
-        if options.modelInput:
-            print '--> Seeding from', `options.modelInput`
-            self._uniqueCounts = pickle.load(open(options.modelInput))
+        if options.model_input:
+            print '--> Seeding from', `options.model_input`
+            self._unique_counts = pickle.load(open(options.model_input))
         else:
             print '--> Seeding from empty model'
-            self._uniqueCounts = FrequencyMap()
+            self._unique_counts = FrequencyMap()
 
-        self._ambiguousCounts = FrequencyMap()
+        self._ambiguous_counts = FrequencyMap()
 
         # possibly a filename to dump our final model into
-        self._modelDumpFile = options.modelOutput
+        self._model_dump_file = options.model_output
 
         # whether to align all at once or iteratively
         self._iterative = options.iterative
 
         # we write aligned readings as we go, rather than storing them in
         # memory
-        self._output = sopen(outputFile, 'w')
-        self._outputName = outputFile
+        self._output = sopen(output_file, 'w')
+        self._output_name = output_file
 
         # ratios for the tf-idf
         self._alpha = options.alpha
         # setting either of these defaults non-zero will prevent calculation
         # of that heuristic
         if options.random:
-            self._useRandom = True
+            self._use_random = True
             print '--> Random model selected'
         else:
-            self._useRandom = False
+            self._use_random = False
 
             # only define these variables in the non-random case to ensure
             # that they never get used in the random case
-            self._defaultTf = 0
-            self._defaultIdf = 0
+            self._default_tf = 0
+            self._default_idf = 0
     
-            if not options.tfHeuristic:
+            if not options.tf_heuristic:
                 print '--> Disabling tf heuristic'
-                self._defaultTf = 1
+                self._default_tf = 1
     
-            elif not options.idfHeuristic:
+            elif not options.idf_heuristic:
                 print '--> Disabling idf heuristic'
-                self._defaultIdf = 1
+                self._default_idf = 1
             
             else:
                 print '--> Full TF-IDF enabled'
     
     #------------------------------------------------------------------------#
 
-    def addResolved(self, resolvedEntries):
+    def add_resolved(self, resolved_entries):
         """ Populates the statistical model with a number of resolved entries. 
         """
         # add all unambiguous readings to our model
-        for entry in resolvedEntries:
-            self._uniqueCounts.addCounts(entry.alignment)
-            print >> self._output, entry.toLine()
+        for entry in resolved_entries:
+            self._unique_counts.add_counts(entry.alignment)
+            print >> self._output, entry.to_line()
 
         return
 
         if not ambiguous:
             return
 
-        self._initialiseEntries(ambiguous)
-        numEntries = len(ambiguous)
+        self._initialise_entries(ambiguous)
+        num_entries = len(ambiguous)
 
-        if self._useRandom:
+        if self._use_random:
             # randomly pick the best alignment for each entry
-            self._randomAlignment(ambiguous)
+            self._random_alignment(ambiguous)
 
         elif not self._iterative:
             # perform first and only scoring iteration
             self._rescore(ambiguous)
     
-        progressBar = ProgressBar()
-        progressBar.start(100)
+        progress_bar = ProgressBar()
+        progress_bar.start(100)
 
         i = 0
-        while i < numEntries:
-            if self._iterative and not self._useRandom:
+        while i < num_entries:
+            if self._iterative and not self._use_random:
                 # perform expensive rescoring
                 self._rescore(ambiguous)
                 ambiguous.sort()
 
-            bestEntry = ambiguous.pop()
-            self._disambiguateEntry(bestEntry)
+            best_entry = ambiguous.pop()
+            self._disambiguate_entry(best_entry)
 
-            print >> self._output, bestEntry.toLine()
+            print >> self._output, best_entry.to_line()
 
             i += 1
-            progressBar.fractional(math.sqrt(i)/math.sqrt(numEntries))
+            progress_bar.fractional(math.sqrt(i)/math.sqrt(num_entries))
 
-        progressBar.finish()
-
-        return
+        progress_bar.finish()
 
     #------------------------------------------------------------------------#
     
         """
         self._output.close()
 
-        if self._modelDumpFile:
+        if self._model_dump_file:
             # dump our 
-            oStream = open(self._modelDumpFile, 'w')
-            pickle.dump(self._uniqueCounts, oStream)
-            oStream.close()
+            ostream = open(self._model_dump_file, 'w')
+            pickle.dump(self._unique_counts, ostream)
+            ostream.close()
 
-        assert self._ambiguousCounts._gSize == 0
+        assert self._ambiguous_counts._g_size == 0
 
         return
     
     # PRIVATE METHODS
     #
 
-    def _initialiseEntries(self, ambiguousEntries):
+    def _initialise_entries(self, ambiguous_entries):
         """ Updates the counts for ambiguous readings and restructures them to
             be updated.
         """
-        for i in xrange(len(ambiguousEntries)):
-            entry = ambiguousEntries[i]
+        for i in xrange(len(ambiguous_entries)):
+            entry = ambiguous_entries[i]
             alignments = entry.potentials
 
             assert len(set(alignments)) == len(alignments), \
 
             # update our counts
             for alignment in alignments:
-                self._ambiguousCounts.addCounts(alignment)
+                self._ambiguous_counts.add_counts(alignment)
 
             entry.score = 0.0
             entry.scores = [0.0]*len(alignments)
  
     #------------------------------------------------------------------------#
 
-    def _disambiguateEntry(self, entry):
-        """ Modify the entry to remove all the additional ambiguous alignments,
-            and update our internal counts.
+    def _disambiguate_entry(self, entry):
+        """ Modify the entry to remove all the additional ambiguous
+            alignments, and update our internal counts.
         """
         entry.scores = None
 
         # put this count amongst the unique ones
-        self._uniqueCounts.addCounts(entry.alignment)
+        self._unique_counts.add_counts(entry.alignment)
 
         # fill in the rest of this count
         # eliminate the bad readings from the model
         for alignment in entry.potentials:
-            self._ambiguousCounts.delCounts(alignment)
+            self._ambiguous_counts.del_counts(alignment)
 
         entry.potentials = None
         entry.aligned = True
 
     #------------------------------------------------------------------------#
 
-    def _weightedFreqs(self, gSegments, pSegments, index):
+    def _weighted_freqs(self, g_segments, p_segments, index):
         """ Weight the frequencies from the two models.
         """
-        s_gFreq, s_gpFreq, s_gpcFreq = self._uniqueCounts.frequencies(
-                gSegments, pSegments, index)
-        u_gFreq, u_gpFreq, u_gpcFreq = self._ambiguousCounts.frequencies(
-                gSegments, pSegments, index)
+        s_g_freq, s_gp_freq, s_gpc_freq = self._unique_counts.frequencies(
+                g_segments, p_segments, index)
+        u_g_freq, u_gp_freq, u_gpc_freq = self._ambiguous_counts.frequencies(
+                g_segments, p_segments, index)
 
-        gFreq = self._solved*s_gFreq + self._unsolved*u_gFreq
-        gpFreq = self._solved*s_gpFreq + self._unsolved*u_gpFreq
-        gpcFreq = self._solved*s_gpcFreq + self._unsolved*u_gpcFreq
+        g_freq = self._solved*s_g_freq + self._unsolved*u_g_freq
+        gp_freq = self._solved*s_gp_freq + self._unsolved*u_gp_freq
+        gpc_freq = self._solved*s_gpc_freq + self._unsolved*u_gpc_freq
 
-        return gFreq, gpFreq, gpcFreq
+        return g_freq, gp_freq, gpc_freq
         
     #------------------------------------------------------------------------#
 
-    def _explainAlignment(self, entry, alignment):
+    def _explain_alignment(self, entry, alignment):
         """
         """
-        bestScore, allAlignments = entry
-        print '--->', bestScore,
-        potentials.printAlignment(alignment)
-        allAlignments.sort()
-        allAlignments.reverse()
-        for otherScore, otherAlignment in allAlignments:
-            print '----->', otherScore,
-            potentials.printAlignment(otherAlignment)
+        best_score, all_alignments = entry
+        print '--->', best_score,
+        potentials.print_alignment(alignment)
+        all_alignments.sort()
+        all_alignments.reverse()
+        for other_score, other_alignment in all_alignments:
+            print '----->', other_score,
+            potentials.print_alignment(other_alignment)
     
         return
 
     #------------------------------------------------------------------------#
 
-    def _randomAlignment(self, entries):
+    def _random_alignment(self, entries):
         """ Picks a random alignment for each entry in a list of ambiguous
             entries. 
         """
-        for ambiguousEntry in entries:
-            ambiguousEntry.alignment = random.sample(
-                    ambiguousEntry.potentials, 1)[0]
+        for ambiguous_entry in entries:
+            ambiguous_entry.alignment = random.sample(
+                    ambiguous_entry.potentials, 1)[0]
         return
 
     #------------------------------------------------------------------------#
         """ Calculates the tf-idf score of the alignment passed in based on
             the current model.
         """
-        kanjiScript = scripts.Script.Kanji
-        currentScores = []
+        kanji_script = scripts.Script.Kanji
+        current_scores = []
 
-        gSegments, pSegments = alignment
-        for i in range(len(gSegments)):
-            if not scripts.script_type(gSegments[i]) == kanjiScript:
+        g_segments, p_segments = alignment
+        for i in range(len(g_segments)):
+            if not scripts.script_type(g_segments[i]) == kanji_script:
                 continue
 
-            gFreq, gpFreq, gpcFreq = self._weightedFreqs(gSegments,
-                    pSegments, i)
+            g_freq, gp_freq, gpc_freq = self._weighted_freqs(g_segments,
+                    p_segments, i)
 
-            tf = self._defaultTf or \
-                (gpFreq + self._alpha - self._unsolved) / gFreq
+            tf = self._default_tf or \
+                (gp_freq + self._alpha - self._unsolved) / g_freq
 
-            idf = self._defaultIdf or \
-                math.log(gpFreq/(gpcFreq + self._alpha - self._unsolved))
+            idf = self._default_idf or \
+                math.log(gp_freq/(gpc_freq + self._alpha - self._unsolved))
 
-            currentScores.append(tf*idf)
+            current_scores.append(tf*idf)
  
-        newScore = sum(currentScores) / float(len(currentScores))
+        new_score = sum(current_scores) / float(len(current_scores))
 
-        return newScore
+        return new_score
 
     #------------------------------------------------------------------------#
 

File src/detectOkurigana.py

-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-#----------------------------------------------------------------------------#
-# detectOkurigana.py
-# Lars Yencken <lars.yencken@gmail.com>
-# vim: ts=4 sw=4 sts=4 et tw=78:
-# Sat May 14 14:49:45 EST 2005
-#
-#----------------------------------------------------------------------------#
-
-""" This module is an executable script performing grapheme-phoneme alignment
-    based on papers by Baldwin and Tanaka.
-"""
-
-#----------------------------------------------------------------------------#
-
-import os, sys
-import optparse
-
-from okuriganaModel import OkuriganaModel
-import evaluate
-import settings
-
-#----------------------------------------------------------------------------#
-
-def detectOkurigana(outputFile, options):
-    """ Performs just okurigana detection and alignment alteration.
-    """
-    okuriganaModel = OkuriganaModel(options)
-
-    inputFile = options.inputFile or os.path.join(settings.DATA_DIR,
-            'eval-okurigana.data')
-    okuriganaModel.okuriganaAdjustments(inputFile, outputFile)
-
-    if not options.inputFile:
-        evaluate.evaluateOkurigana(outputFile, outputFile + '.eval')
-
-    return
-
-#----------------------------------------------------------------------------#
-
-
-#----------------------------------------------------------------------------#
-# COMMAND-LINE INTERFACE
-#
-
-def createOptionParser():
-    """ Creates an option parser instance to handle command-line options.
-    """
-    usage = \
-"""%prog [options] outputFile
-
-An efficient implementation of the Baldwin-Tanaka automated grapheme-phoneme
-alignment algorithm based on TF-IDF."""
-
-    parser = optparse.OptionParser(usage)
-
-    parser.add_option('-t', '--threshold', action='store',
-            dest='okuriThreshold', type='int', default=1,
-            help='The threshold used for cooccurrence-based okurigana')
-
-    parser.add_option('--simple', action='store_true',
-            dest='simpleOkurigana', default=False,
-            help='Use a simple okurigana method, ignoring the main model')
-
-    parser.add_option('--no-kanjidic', action='store_false',
-            dest='useKanjidic', default=True,
-            help='Disables the kanjidic reading model')
-
-    parser.add_option('--no-cooccurrence', action='store_false',
-            dest='useCooccurrence', default=True,
-            help='Disables cooccurrence entries from edict')
-
-    parser.add_option('--no-verbs', action='store_false',
-            dest='useVerbs', default=True,
-            help='Disables verb entries from edict')
-
-    parser.add_option('-i', '--input', action='store', dest='inputFile',
-            help="Specify a custom input file to use.")
-
-    return parser
-
-#----------------------------------------------------------------------------#
-
-def main(argv):
-    """ The main method for this module.
-    """
-    parser = createOptionParser()
-    (options, args) = parser.parse_args(argv)
-
-    if len(args) != 1:
-        parser.print_help()
-        sys.exit(1)
-
-    outputFile = args[0]
-
-    detectOkurigana(outputFile, options)
-
-    return
-
-#----------------------------------------------------------------------------#
-
-if __name__ == '__main__':
-    try:
-        import psyco
-        psyco.profile()
-    except:
-        pass
-
-    try:
-        main(sys.argv[1:])
-    except KeyboardInterrupt:
-        # we cancel runs often, so do it nicely
-        print >> sys.stderr, '\nAborting run!'
-        sys.exit(1)
-
-#----------------------------------------------------------------------------#
-

File src/detect_okurigana.py

+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# 
+#  detect_okurigana.py
+#  gpalign
+#  
+#  Created by Lars Yencken on 2005-05-14.
+#  Copyright 2005-2010 Lars Yencken. All rights reserved.
+# 
+
+"""
+This module is an executable script performing grapheme-phoneme alignment
+based on papers by Baldwin and Tanaka.
+"""
+
+import os, sys
+import optparse
+
+from okurigana_model import OkuriganaModel
+import evaluate
+import settings
+
+def detect_okurigana(output_file, options):
+    """ Performs just okurigana detection and alignment alteration.
+    """
+    okurigana_model = OkuriganaModel(options)
+
+    input_file = options.input_file or os.path.join(settings.DATA_DIR,
+            'eval-okurigana.data')
+    okurigana_model.okurigana_adjustments(input_file, output_file)
+
+    if not options.input_file:
+        evaluate.evaluate_okurigana(output_file, output_file + '.eval')
+
+def create_option_parser():
+    """ Creates an option parser instance to handle command-line options.
+    """
+    usage = \
+"""%prog [options] output_file
+
+An efficient implementation of the Baldwin-Tanaka automated grapheme-phoneme
+alignment algorithm based on TF-IDF."""
+
+    parser = optparse.OptionParser(usage)
+
+    parser.add_option('-t', '--threshold', action='store',
+            dest='okuri_threshold', type='int', default=1,
+            help='The threshold used for cooccurrence-based okurigana')
+
+    parser.add_option('--simple', action='store_true',
+            dest='simple_okurigana', default=False,
+            help='Use a simple okurigana method, ignoring the main model')
+
+    parser.add_option('--no-kanjidic', action='store_false',
+            dest='use_kanjidic', default=True,
+            help='Disables the kanjidic reading model')
+
+    parser.add_option('--no-cooccurrence', action='store_false',
+            dest='use_cooccurrence', default=True,
+            help='Disables cooccurrence entries from edict')
+
+    parser.add_option('--no-verbs', action='store_false',
+            dest='use_verbs', default=True,
+            help='Disables verb entries from edict')
+
+    parser.add_option('-i', '--input', action='store', dest='input_file',
+            help="Specify a custom input file to use.")
+
+    return parser
+
+def main(argv):
+    """ The main method for this module.
+    """
+    parser = create_option_parser()
+    (options, args) = parser.parse_args(argv)
+
+    if len(args) != 1:
+        parser.print_help()
+        sys.exit(1)
+
+    output_file = args[0]
+
+    detect_okurigana(output_file, options)
+
+if __name__ == '__main__':
+    try:
+        main(sys.argv[1:])
+    except KeyboardInterrupt:
+        # we cancel runs often, so do it nicely
+        print >> sys.stderr, '\nAborting run!'
+        sys.exit(1)

File src/dictionary.py

 # -*- coding: utf-8 -*-
-#----------------------------------------------------------------------------#
-# dictionary.py
-# Lars Yencken <lars.yencken@gmail.com>
-# vim: ts=4 sw=4 sts=4 et tw=78:
-# Mon May 16 10:50:57 EST 2005
-#
-#----------------------------------------------------------------------------#
+# 
+#  dictionary.py
+#  gpalign
+#  
+#  Created by Lars Yencken on 2005-05-16.
+#  Copyright 2005-2010 Lars Yencken. All rights reserved.
+# 
 
-""" This module is responsible for parsing input data sets for
-    grapheme/phoneme string pairs to align. Its main methods are
-    edictEntries() and evaluationEntries().
 """
-
-#----------------------------------------------------------------------------#
+This module is responsible for parsing input data sets for grapheme/phoneme
+string pairs to align. Its main methods are edict_entries() and
+evaluation_entries().
+"""
 
 from os import path
 
 from entry import Entry
 import settings
 
-#----------------------------------------------------------------------------#
-# PUBLIC METHODS
-#
-
-def edictEntries(inputFile):
-    """ Determines all the kanji entries available in the input file. The input
-        file is assumed to be in edict format.
+def edict_entries(input_file):
     """
-    inputStream = sopen(inputFile)
-    rejectionStream = sopen(path.join(settings.LOG_DIR, 'rejected-entries'),
+    Determines all the kanji entries available in the input file. The
+    input file is assumed to be in edict format.
+    """
+    istream = sopen(input_file)
+    rejection_stream = sopen(path.join(settings.LOG_DIR, 'rejected-entries'),
             'w')
 
     entries = []
-    numRejected = 0
-    for line in inputStream:
-        lineParts = line.split()
-        gString = lineParts[0]
-        pString = lineParts[1][1:-1]
+    num_rejected = 0
+    for line in istream:
+        parts = line.split()
+        g_string = parts[0]
+        p_string = parts[1][1:-1]
         
-        if _validEntry(gString, pString):
-            entries.append(Entry(gString, pString))
+        if _valid_entry(g_string, p_string):
+            entries.append(Entry(g_string, p_string))
         else:
-            numRejected += 1
-            rejectionStream.write(line)
+            num_rejected += 1
+            rejection_stream.write(line)
 
-    return entries, numRejected
+    return entries, num_rejected
 
-#----------------------------------------------------------------------------#
-
-def evaluationEntries(inputFile):
+def evaluation_entries(input_file):
     """ Get entries from a file formatted like an evaluation type instead of
         in edict format.
     """
     entries = []
-    inputStream = sopen(inputFile, 'r')
+    istream = sopen(input_file, 'r')
 
-    rejectionStream = sopen(path.join(settings.LOG_DIR, 'rejected-entries'),
+    rejection_stream = sopen(path.join(settings.LOG_DIR, 'rejected-entries'),
             'w')
 
-    numRejected = 0
-    for line in inputStream:
-        gString, pString = line.split(':')[0].split()
+    num_rejected = 0
+    for line in istream:
+        g_string, p_string = line.split(':')[0].split()
         
-        if _validEntry(gString, pString):
-            entries.append(Entry(gString, pString))
+        if _valid_entry(g_string, p_string):
+            entries.append(Entry(g_string, p_string))
         else:
-            numRejected += 1
-            rejectionStream.write(line)
+            num_rejected += 1
+            rejection_stream.write(line)
 
-    return entries, numRejected
+    return entries, num_rejected
+
+def separate_entries(entries, max_run_length=3):
+    """ Split out the longest entries for later processing.
+    """
+    short_entries = []
+    long_entries = []
+
+    for entry in entries:
+        if _longest_kanji_run(entry.g_string) > max_run_length:
+            long_entries.append(entry)
+        else:
+            short_entries.append(entry)
+    
+    return short_entries, long_entries
 
 #----------------------------------------------------------------------------#
 
-def separateEntries(entries, maxRunLength=3):
-    """ Split out the longest entries for later processing.
-    """
-    shortEntries = []
-    longEntries = []
-
-    for entry in entries:
-        if _longestKanjiRun(entry.gString) > maxRunLength:
-            longEntries.append(entry)
-        else:
-            shortEntries.append(entry)
-    
-    return shortEntries, longEntries
-
-#----------------------------------------------------------------------------#
-
-#----------------------------------------------------------------------------#
-# PRIVATE METHODS
-#
-
-def _validEntry(gString, pString):
+def _valid_entry(g_string, p_string):
     """ Returns True if the word is only kanji and kana, False otherwise.
     """
     # throw out any grapheme string which contains ascii
-    if scripts.Script.Ascii in map(scripts.script_type, gString): 
+    if scripts.Script.Ascii in map(scripts.script_type, g_string): 
         return False
 
     # throw out any reading which non-kana readings
     isKana = lambda x: x in (scripts.Script.Hiragana, scripts.Script.Katakana)
 
-    hasNonKana = (filter(isKana, map(scripts.script_type, pString)) != [])
+    has_non_kana = (filter(isKana, map(scripts.script_type, p_string)) != [])
 
-    return hasNonKana
-    
-#----------------------------------------------------------------------------#
+    return has_non_kana
 
-def _longestKanjiRun(gString):
+def _longest_kanji_run(g_string):
     """ Works out the longest number of kanji in a row in the grapheme string.
     """
     run = 0
     longest = 0
-    kanjiScript = scripts.Script.Kanji
-    for char in gString:
-        if scripts.script_type(char) == kanjiScript:
+    kanji_script = scripts.Script.Kanji
+    for char in g_string:
+        if scripts.script_type(char) == kanji_script:
             run += 1
         else:
             if run > longest:
     
     return longest
 
-#----------------------------------------------------------------------------#

File src/entry.py

 # -*- coding: utf-8 -*-
-#----------------------------------------------------------------------------#
-# entry.py
-# Lars Yencken <lars.yencken@gmail.com>
-# vim: ts=4 sw=4 sts=4 et tw=78:
-# Thu Aug 25 15:28:58 EST 2005
-#
-#----------------------------------------------------------------------------#
+# 
+#  entry.py
+#  gpalign
+#  
+#  Created by Lars Yencken on 2005-08-25.
+#  Copyright 2005-2010 Lars Yencken. All rights reserved.
+# 
 
 from cjktools import scripts
 
-#----------------------------------------------------------------------------#
-
-#class Entry(object):
-#    """A grapheme-phoneme pair."""
-#    def __init__(self, arg):
-#        super(Entry, self).__init__()
-#        self.arg = arg
-        
-
-#----------------------------------------------------------------------------#
-
 class Entry:
-    """ A single grapheme-phoneme pair undergoing alignment.
-    """
-    def __init__(self, gString, pString):
-        """ Creates a new instance.
-
-            @param gString: The grapheme string
-            @param pString: The phoneme string
-            @param potentials: Potential alignments pre-calculated
-            @param score: The current scoring
-        """
-        self.pString = pString
-        self.gString_original = gString
+    "A single grapheme-phoneme pair undergoing alignment."
+    def __init__(self, g_string, p_string):
+        self.p_string = p_string
+        self.g_string_original = g_string
 
         # normalise the graphical form
-        if u'々' in gString:
-            gString = self._insertDuplicateKanji(gString)
-        self.gString = gString
+        if u'々' in g_string:
+            g_string = self._insert_duplicate_kanji(g_string)
+        self.g_string = g_string
 
         # have we aligned yet?
         self.aligned = False
         self.potentials = None
         self.scores = None
 
-        return
+    def __cmp__(self, rhs):
+        return cmp(self.score, rhs.score)
 
-    def __cmp__(self, otherEntry):
-        return cmp(self.score, otherEntry.score)
-
-    def toString(self):
+    def to_string(self):
         if self.aligned:
-            gSegments, pSegments = self.alignment
+            g_segments, p_segments = self.alignment
             retStr = 'Entry(%s <-> %s)' % \
-                    ('|'.join(gSegments), '|'.join(pSegments))
+                    ('|'.join(g_segments), '|'.join(p_segments))
         elif self.potentials:
             retStr = 'Entry(%s <-> %s, %d potentials)' % \
-                    (self.gString, self.pString, len(self.potentials))
+                    (self.g_string, self.p_string, len(self.potentials))
         else:
-            retStr = 'Entry(%s <-> %s)' % (self.gString, self.pString)
+            retStr = 'Entry(%s <-> %s)' % (self.g_string, self.p_string)
         return retStr
 
     def __str__(self):
-        return self.toString()
+        return self.to_string()
     
     def __repr__(self):
-        return self.toString()
+        return self.to_string()
 
-    def toLine(self):
+    def to_line(self):
         """ Prints the final alignment in our desired output format. 
         """
         assert self.aligned
 
         alignment = ' '.join(map(lambda x: '|'.join(x), self.alignment))
 
-        original = '%s %s'%(self.gString_original, ''.join(self.alignment[1]))
+        original = '%s %s' % (
+                self.g_string_original,
+                ''.join(self.alignment[1])
+            )
     
         return ':'.join((original, alignment))
 
-    def _insertDuplicateKanji(self, gString):
+    def _insert_duplicate_kanji(self, g_string):
         result = []
-        kanjiScript = scripts.Script.Kanji
-        for i, c in enumerate(gString):
-            if c == u'々' and i > 0 and scripts.script_type(gString[i-1]) == kanjiScript:
+        kanji_script = scripts.Script.Kanji
+        for i, c in enumerate(g_string):
+            if c == u'々' and i > 0 and \
+                    scripts.script_type(g_string[i-1]) == kanji_script:
                 # Insert a duplicate of the previous kanji
-                result.append(gString[i-1])
+                result.append(g_string[i-1])
             else:
                 result.append(c)
 
 
     def __hash__(self):
         if not self.alignment:
-            return hash(self.gString + self.pString)
+            return hash(self.g_string + self.p_string)
         else:
             return hash(tuple(self.alignment))
 

File src/errors.py

 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
-#----------------------------------------------------------------------------#
-# errors.py
-# Lars Yencken <lars.yencken@gmail.com>
-# vim: ts=4 sw=4 sts=4 et tw=78:
-# Wed May 25 23:45:40 EST 2005
-#
-#----------------------------------------------------------------------------#
+# 
+#  errors.py
+#  gpalign
+#  
+#  Created by Lars Yencken on 2005-05-25.
+#  Copyright 2005-2010 Lars Yencken. All rights reserved.
+# 
 
-import okuriganaModel
+import okurigana_model
 
 from cjktools.common import sopen
 
-#----------------------------------------------------------------------------#
-
-def separateErrors(baseFile):
-    """ Separates out the errors from the alignments, and tries to classify
-        them.
+def separate_errors(base_file):
     """
-    newUtfFile = lambda x: sopen(baseFile + x, 'w')
-
-    inputFile = sopen(baseFile, 'r')
+    Separates out the errors from the alignments, and tries to classify them.
+    """
+    input_file = sopen(base_file, 'r')
 
     good = set()
     bad = set()
-    badOkuri = set()
-    badGapping = set()
-    badAlign = set()
-    badConstrain = set()
+    bad_okuri = set()
+    bad_gapping = set()
+    bad_align = set()
+    bad_constrain = set()
 
-    for line in inputFile:
-        original, testCase, correctCase = _parseLine(line)
+    for line in input_file:
+        original, test_case, correct_case = _parse_line(line)
 
-        if testCase == correctCase:
+        if test_case == correct_case:
             good.add(line)
             continue
 
-        if testCase == [('???',)]:
-            badConstrain.add(line)
+        if test_case == [('???',)]:
+            bad_constrain.add(line)
             bad.add(line)
             continue
 
         # the rest of the cases are bad
-        if _detectGapping(correctCase):
-            badGapping.add(line)
+        if _detect_gapping(correct_case):
+            bad_gapping.add(line)
             bad.add(line)
             continue
 
-        if _badAlignment(testCase, correctCase):
-            badAlign.add(line)
+        if _bad_alignment(test_case, correct_case):
+            bad_align.add(line)
 
-        elif _badOkurigana(correctCase, testCase):
-            badOkuri.add(line)
+        elif _bad_okurigana(correct_case, test_case):
+            bad_okuri.add(line)
 
         bad.add(line)
     
     total = len(good.union(bad))
-    badOther = bad.difference(badGapping.union(badAlign).union(badOkuri).union(
-            badConstrain))
+    bad_other = bad.difference(bad_gapping.union(bad_align).union(
+            bad_okuri).union(bad_constrain))
 
-    _linesToFile(good, '.good', baseFile)
-    _linesToFile(bad, '.bad', baseFile)
-    _linesToFile(badOkuri, '.bad.okuri', baseFile)
-    _linesToFile(badGapping, '.bad.gapping', baseFile)
-    _linesToFile(badAlign, '.bad.align', baseFile)
-    _linesToFile(badOther, '.bad.other', baseFile)
-    _linesToFile(badConstrain, '.bad.constrain', baseFile)
+    _lines_to_file(good, '.good', base_file)
+    _lines_to_file(bad, '.bad', base_file)
+    _lines_to_file(bad_okuri, '.bad.okuri', base_file)
+    _lines_to_file(bad_gapping, '.bad.gapping', base_file)
+    _lines_to_file(bad_align, '.bad.align', base_file)
+    _lines_to_file(bad_other, '.bad.other', base_file)
+    _lines_to_file(bad_constrain, '.bad.constrain', base_file)
 
-    nGood, nBad, nBadOkuri, nBadGapping, nBadAlign, nUnknown, nConstrain = \
+    (n_good, n_bad, n_bad_okuri, n_bad_gapping, n_bad_align, n_unknown,
+            n_constrain) = \
             map(
                 len,
-                (good, bad, badOkuri, badGapping, badAlign, badOther,
-                badConstrain)
+                (good, bad, bad_okuri, bad_gapping, bad_align, bad_other,
+                bad_constrain)
             )
 
     print '%d total alignments' % total
-    print '--> %.2f%% correct (%d)' % ((100*nGood / float(total)),nGood)
-    print '--> %.2f%% in error (%d)' % ((100*nBad / float(total)),nBad)
-    print '----> %.2f%% okurigana (%d)' % ((100*nBadOkuri / float(total)),\
-            nBadOkuri)
-    print '----> %.2f%% gapping (%d)' % ((100*nBadGapping / float(total)),\
-            nBadGapping)
-    print '----> %.2f%% align (%d)' % ((100*nBadAlign / float(total)),\
-            nBadAlign)
-    print '----> %.2f%% overconstrained (%d)' % ((100*nConstrain / \
-            float(total)), nConstrain)
-    print '----> %.2f%% unknown (%d)' % ((100*(nUnknown)/float(total)),\
-            nUnknown)
-
-    return
+    print '--> %.2f%% correct (%d)' % ((100*n_good / float(total)),n_good)
+    print '--> %.2f%% in error (%d)' % ((100*n_bad / float(total)),n_bad)
+    print '----> %.2f%% okurigana (%d)' % ((100*n_bad_okuri / float(total)),\
+            n_bad_okuri)
+    print '----> %.2f%% gapping (%d)' % ((100*n_bad_gapping / float(total)),\
+            n_bad_gapping)
+    print '----> %.2f%% align (%d)' % ((100*n_bad_align / float(total)),\
+            n_bad_align)
+    print '----> %.2f%% overconstrained (%d)' % ((100*n_constrain / \
+            float(total)), n_constrain)
+    print '----> %.2f%% unknown (%d)' % ((100*(n_unknown)/float(total)),\
+            n_unknown)
 
 #----------------------------------------------------------------------------#
 
-def _parseLine(line):
-    lineTuple = line.strip().split(':', 2)
+def _parse_line(line):
+    line_tuple = line.strip().split(':', 2)
 
     segment = lambda x: tuple(x.strip('|').split('|'))
-    lineTuple = map(lambda x: map(segment, x.split(' ',1)), lineTuple)
+    line_tuple = map(lambda x: map(segment, x.split(' ',1)), line_tuple)
 
-    return lineTuple
+    return line_tuple
 
 #----------------------------------------------------------------------------#
 
-def _linesToFile(lineSet, extension, baseName):
-    oStream = sopen(baseName + extension, 'w')
-    oStream.writelines(lineSet)
-    oStream.close()
-    return 
+def _lines_to_file(line_set, extension, base_name):
+    ostream = sopen(base_name + extension, 'w')
+    ostream.writelines(line_set)
+    ostream.close() 
 
 #----------------------------------------------------------------------------#
 
-def _badAlignment(testCase, correctCase):
+def _bad_alignment(test_case, correct_case):
     """ Determines whether this case is a bad alignment case.
     """
-    gSegments, pSegments = testCase
-    cgSegments, cpSegments = correctCase
+    g_segments, p_segments = test_case
+    cg_segments, cp_segments = correct_case
 
-    if okuriganaModel.alignmentHasOkurigana(cgSegments, cpSegments):
-        testCase = okuriganaModel.removeOkurigana(testCase[0], testCase[1])
-        correctCase = okuriganaModel.removeOkurigana(correctCase[0],
-                correctCase[1])
+    if okurigana_model.alignment_has_okurigana(cg_segments, cp_segments):
+        test_case = okurigana_model.remove_okurigana(test_case[0],
+                test_case[1])
+        correct_case = okurigana_model.remove_okurigana(correct_case[0],
+                correct_case[1])
 
-    return testCase != correctCase
+    return test_case != correct_case
 
 #----------------------------------------------------------------------------#
 
-def _badOkurigana(testCase, correctCase):
-    gSegments, pSegments = testCase
-    cgSegments, cpSegments = correctCase
+def _bad_okurigana(test_case, correct_case):
+    g_segments, p_segments = test_case
+    cg_segments, cp_segments = correct_case
 
-    if okuriganaModel.alignmentHasOkurigana(cgSegments, cpSegments):
-        if okuriganaModel.alignmentHasOkurigana(gSegments, pSegments):
+    if okurigana_model.alignment_has_okurigana(cg_segments, cp_segments):
+        if okurigana_model.alignment_has_okurigana(g_segments, p_segments):
             return True
         else:
             # we forgot to add okurigana
             return False
     else:
         # have we mistakenly added okurigana?
-        return okuriganaModel.alignmentHasOkurigana(gSegments, pSegments)
+        return okurigana_model.alignment_has_okurigana(g_segments, p_segments)
 
 #----------------------------------------------------------------------------#
 
-def _detectGapping(correctCase):
+def _detect_gapping(correct_case):
     """ Determines whether this was a case of grapheme gapping. Tell-tale
         signs: a '<' in the phoneme segment.
     """
-    gSegments, pSegments = correctCase
-    for segment in pSegments:
+    g_segments, p_segments = correct_case
+    for segment in p_segments:
         if '<' in segment:
             return True
     else:

File src/evaluate.py

 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
-#----------------------------------------------------------------------------#
-# evaluate.py
-# Lars Yencken <lars.yencken@gmail.com>
-# vim: ts=4 sw=4 sts=4 et tw=78:
-# Fri Aug 12 11:41:18 EST 2005
-#
-#----------------------------------------------------------------------------#
+# 
+#  evaluate.py
+#  gpalign
+#  
+#  Created by Lars Yencken on 2005-08-12.
+#  Copyright 2005-2010 Lars Yencken. All rights reserved.
+# 
 
 import os, sys
 import optparse
+import operator
 
 from simplestats import sequences
 from cjktools.common import sopen
 
 #----------------------------------------------------------------------------#
 
-def evaluateAlignment(predictionFile, resultsFile):
+def evaluate_alignment(prediction_file, results_file):
     """ Evaluates the alignments provided in the prediction file, writing the
         results to the results file.
     """
-    validationFile = os.path.join(settings.DATA_DIR, 'eval-alignment.data')
+    validation_file = os.path.join(settings.DATA_DIR, 'eval-alignment.data')
 
-    iStream = sopen(predictionFile, 'r')
     results = {}
 
-    validationCases = _listEntries(validationFile)
-    validationDict = dict(validationCases)
+    validation_cases = _list_entries(validation_file)
+    validation_dict = dict(validation_cases)
 
-    predictionCases = _listEntries(predictionFile)
-    predictionDict = dict(predictionCases)
+    prediction_cases = _list_entries(prediction_file)
+    prediction_dict = dict(prediction_cases)
 
-    matching = lambda x: x in validationCases
-    good, bad = sequences.separate(matching, predictionCases)
+    matching = lambda x: x in validation_cases
+    good, bad = sequences.separate(matching, prediction_cases)
 
     results['good'] = good
 
-    addCorrect = lambda x: x + (validationDict[x[0]],)
-    bad = map(addCorrect, bad)
+    add_correct = lambda x: x + (validation_dict[x[0]],)
+    bad = map(add_correct, bad)
 
     results['bad'] = bad
 
-    orFunc = lambda x, y: x or y
-    hasGapping = lambda x: reduce(orFunc, map(lambda y: '<' in y, x[2]))
-    gapping, align = sequences.separate(hasGapping, bad)
+    has_gapping = lambda x: reduce(
+            operator.or_,
+            map(lambda y: '<' in y, x[2])
+        )
+    gapping, align = sequences.separate(has_gapping, bad)
 
     results['gapping'] = gapping
     results['align'] = align
 
-    isMissing = lambda x: not predictionDict.has_key(x[0])
-    missing = filter(isMissing, validationCases)
+    is_missing = lambda x: not prediction_dict.has_key(x[0])
+    missing = filter(is_missing, validation_cases)
     results['missing'] = missing
 
-    _writeResults(results, resultsFile)
+    _write_results(results, results_file)
 
     return
 
 #----------------------------------------------------------------------------#
 
-def evaluateOkurigana(predictionFile, resultsFile):
+def evaluate_okurigana(prediction_file, results_file):
     """ Evaluates the alignments provided in the prediction file, writing the
         results to the results file.
     """
-    validationFile = os.path.join(settings.DATA_DIR, 'eval-okurigana.data')
+    validation_file = os.path.join(settings.DATA_DIR, 'eval-okurigana.data')
 
-    iStream = sopen(predictionFile, 'r')
     results = {}
 
-    validationCases = _listEntries(validationFile)
-    validationDict = dict(validationCases)
+    validation_cases = _list_entries(validation_file)
+    validation_dict = dict(validation_cases)
 
-    predictionCases = _listEntries(predictionFile)
-    predictionDict = dict(predictionCases)
+    prediction_cases = _list_entries(prediction_file)
+    prediction_dict = dict(prediction_cases)
 
-    matching = lambda x: x in validationCases
-    good, bad = sequences.separate(matching, predictionCases)
+    matching = lambda x: x in validation_cases
+    good, bad = sequences.separate(matching, prediction_cases)
 
     results['good'] = good
 
-    addCorrect = lambda x: x + (validationDict[x[0]],)
-    bad = map(addCorrect, bad)
+    add_correct = lambda x: x + (validation_dict[x[0]],)
+    bad = map(add_correct, bad)
 
     results['okuri'] = bad
 
-    isMissing = lambda x: not predictionDict.has_key(x[0])
-    missing = filter(isMissing, validationCases)
+    is_missing = lambda x: not prediction_dict.has_key(x[0])
+    missing = filter(is_missing, validation_cases)
     results['missing'] = missing
 
     results['bad'] = bad + missing
 
-    _writeResults(results, resultsFile)
+    _write_results(results, results_file)
+
+#----------------------------------------------------------------------------#
+
+def _write_results(results_dict, results_file):
+    keys = results_dict.keys()
+    keys.sort()
+
+    summary_stream = open(results_file, 'w')
+
+    for key in keys:
+        key_entries = results_dict[key]
+        number = len(key_entries)
+        percent = 100.0*number/5000.0
+        print >> summary_stream, '%s    %4d    %6.2f%%' % (key, number,
+                percent)
+        print '%s    %4d    %6.2f%%' % (key, number, percent)
+        ostream = sopen(results_file + '.' + key, 'w')
+        for line in key_entries:
+            print >> ostream, ':'.join(line)
+        ostream.close()
 
     return
 
 #----------------------------------------------------------------------------#
 
-def _writeResults(resultsDict, resultsFile):
-    keys = resultsDict.keys()
-    keys.sort()
 
-    summaryStream = open(resultsFile, 'w')
+def _list_entries(filename):
+    entries = []
+    istream = sopen(filename, 'r')
 
-    for key in keys:
-        keyEntries = resultsDict[key]
-        number = len(keyEntries)
-        percent = 100.0*number/5000.0
-        print >> summaryStream, '%s    %4d    %6.2f%%' % (key, number, percent)
-        print '%s    %4d    %6.2f%%' % (key, number, percent)
-        oStream = sopen(resultsFile + '.' + key, 'w')
-        for line in keyEntries:
-            print >> oStream, ':'.join(line)
-        oStream.close()
-
-    return
-
-#----------------------------------------------------------------------------#
-
-
-def _listEntries(filename):
-    entries = []
-    iStream = sopen(filename, 'r')
-
-    for line in iStream:
+    for line in istream:
         key, value = line.strip().split(':', 1)
         entries.append((key, value))
 
-    iStream.close()
+    istream.close()
 
     return entries
 
 #----------------------------------------------------------------------------#
 
-def evaluate(predictionFile, validationFile, validationResults):
+def evaluate(prediction_file, validation_file, validation_results):
     """ Evaluates the predictions against the validation data, writing the
-        output to a series of files with basename validationResults.
+        output to a series of files with basename validation_results.
     """
-    testEntries = _getEntries(predictionFile)
-    correctEntries = _getEntries(validationFile)
+    test_entries = _get_entries(prediction_file)
+    correct_entries = _get_entries(validation_file)
 
-    _compareEntries(testEntries, correctEntries, validationResults)
+    _compare_entries(test_entries, correct_entries, validation_results)
 
     # split the errors into a detailed analysis
-    errors.separateErrors(validationResults)
-
-    return
+    errors.separate_errors(validation_results)
 
 #----------------------------------------------------------------------------#
 
-def _getEntries(filename):
+def _get_entries(filename):
     """ Creates a dictionary of all the entries in the given file.
     """
     lines = sopen(filename, 'r').readlines()
 
 #----------------------------------------------------------------------------#
 
-def _compareEntries(testEntries, correctEntries, resultFile):
+def _compare_entries(test_entries, correct_entries, result_file):
     """ Compares the entries from the different files.
     """
-    nLines = 0
-    nCorrect = 0
-    nMissing = 0
-    oStream = sopen(resultFile, 'w')
-    for key, alignment in correctEntries.iteritems():
-        testAlignment = testEntries.get(key, '???')
+    n_lines = 0
+    n_correct = 0
+    n_missing = 0
+    ostream = sopen(result_file, 'w')
+    for key, alignment in correct_entries.iteritems():
+        test_alignment = test_entries.get(key, '???')
 
-        if alignment == testAlignment:
-            nCorrect += 1
+        if alignment == test_alignment:
+            n_correct += 1
 
-        if testAlignment == '???':
-            nMissing += 1
+        if test_alignment == '???':
+            n_missing += 1
 
-        print >> oStream, '%s:%s:%s' % (key, testAlignment, alignment)
+        print >> ostream, '%s:%s:%s' % (key, test_alignment, alignment)
 
-        nLines += 1
+        n_lines += 1
     
-    oStream.close()
+    ostream.close()
 
-    print 'Got %.2f%% correct!' % (nCorrect*100.0/nLines)
-    if nMissing > 0:
-        print '   but %d were missing...' % nMissing
-
-    return
+    print 'Got %.2f%% correct!' % (n_correct*100.0/n_lines)
+    if n_missing > 0:
+        print '   but %d were missing...' % n_missing
 
 #----------------------------------------------------------------------------#
 
-def sortFile(filename):
-    """ Sorts the file in a line-based manner.
-    """
-    iStream = sopen(filename, 'r')
-    lines = iStream.readlines()
-    iStream.close()
+def sort_file(filename):
+    istream = sopen(filename, 'r')
+    lines = istream.readlines()
+    istream.close()
 
     lines.sort()
 
-    oStream = sopen(filename, 'w')
-    oStream.writelines(lines)
-    oStream.close()
+    ostream = sopen(filename, 'w')
+    ostream.writelines(lines)
+    ostream.close()
 
-    return
-
-#----------------------------------------------------------------------------#
-
-def createOptionParser():
-    """ Creates an option parser instance to handle command-line options.
-    """
+def create_option_parser():
     usage = "%prog [options] rawResults adjustedResults"
 
     parser = optparse.OptionParser(usage)
 
     
-    parser.add_option('-e', action='store', dest='correctFile',
+    parser.add_option('-e', action='store', dest='correct_file',
         default=os.path.join(settings.DATA_DIR, 'evaluation.data'),
         help='The file of correct evaluations')
 
     return parser
 
-#----------------------------------------------------------------------------#
-
 def main(argv):
     """ The main method for this module.
     """
-    parser = createOptionParser()
+    parser = create_option_parser()
     (options, args) = parser.parse_args(argv)
 
     try:
-        [testOutputFile, resultsFile] = args
+        [test_output_file, results_file] = args
     except:
         parser.print_help()
         sys.exit(1)
 
-    # execute new code here
-    evaluate(testOutputFile, options.correctFile, resultsFile)
-    
-    return
-
-#----------------------------------------------------------------------------#
+    evaluate(test_output_file, options.correct_file, results_file)
 
 if __name__ == '__main__':
     main(sys.argv[1:])
 
-#----------------------------------------------------------------------------#
-
-# vim: ts=4 sw=4 sts=4 et tw=78:
+# vim: ts=4 sw=4 sts=4 et tw=78:
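
As a quick orientation to the renamed evaluation entry point above, here is a minimal usage sketch. It is not part of the commit: the import path and file names are assumptions for illustration only.

    # Drive the evaluation directly from Python rather than via main().
    # Paths are placeholders; evaluate() writes its comparison output to
    # files based on the third argument.
    from gpalign import evaluate

    evaluate.evaluate(
        'output/test.alignments',   # predicted alignments
        'data/evaluation.data',     # gold-standard alignments
        'output/results',           # basename for the result and error files
    )

The same run is available from the command line through create_option_parser() and main(), with the gold-standard file overridable via the -e option.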

File src/formatEval.py

-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-#----------------------------------------------------------------------------#
-# formatEval.py
-# Lars Yencken <lars.yencken@gmail.com>
-# vim: ts=4 sw=4 sts=4 et tw=78:
-# Wed Sep  7 16:10:02 EST 1005
-#
-#----------------------------------------------------------------------------#
-
-import sys
-import optparse
-from cjktools.common import sopen
-
-from entry import Entry
-
-#----------------------------------------------------------------------------#
-
-def formatEvalFile(inputFile, outputFile):
-    entries = _parseEntries(inputFile)
-    oStream = sopen(outputFile, 'w')
-
-    for entry in entries:
-        lineA = entry.gString.ljust(10, u' ')
-        lineB = entry.pString.ljust(10, u' ')
-
-        extraA, extraB = _matchAlignents(entry.alignments[0])
-        lineA += extraA.ljust(10, u' ')
-        lineB += extraB.ljust(10, u' ')
-
-        extraA, extraB = _matchAlignents(entry.alignments[1])
-        lineA += extraA.ljust(10, u' ')
-        lineB += extraB.ljust(10, u' ')
-
-        print >> oStream, lineA
-        print >> oStream, lineB
-        print >> oStream
-
-    oStream.close()
-
-    return
-
-#----------------------------------------------------------------------------#
-
-def _matchAlignents(alignment):
-    gSegments, pSegments = map(list, alignment)
-    for i in range(len(gSegments)):
-        lenDiff = len(pSegments[i]) - len(gSegments[i])
-        gSegments[i] = gSegments[i].ljust(len(pSegments[i]), u' ')
-
-    lineA = u'|'.join(gSegments)
-    lineB = u'|'.join(pSegments)
-
-    return lineA, lineB
-
-#----------------------------------------------------------------------------#
-
-def _parseEntries(inputFile):
-    entries = []
-    for line in sopen(inputFile, 'r'):
-        base, attempt, actual = line.strip().split(':')
-
-        gString, pString = base.split()
-        entry = Entry(gString, pString)
-        fixify = lambda x: map(lambda y: y.strip('|').split('|'), 
-                x.split())
-        attempt = fixify(attempt)
-        actual = fixify(actual)
-
-        entry.alignments=[attempt, actual]
-        
-        entries.append(entry)
-
-    return entries
-
-#----------------------------------------------------------------------------#
-
-def createOptionParser():
-    """ Creates an option parser instance to handle command-line options.
-    """
-    usage = "%prog [options] inputFile outputFile"
-
-    parser = optparse.OptionParser(usage)
-
-    return parser
-
-#----------------------------------------------------------------------------#
-
-def main(argv):
-    """ The main method for this module.
-    """
-
-    parser = createOptionParser()
-    (options, args) = parser.parse_args(argv)
-
-    try:
-        [inputFile, outputFile] = args
-    except:
-        parser.print_help()
-        sys.exit(1)
-
-    # execute new code here
-    formatEvalFile(inputFile, outputFile)
-    
-    return
-
-#----------------------------------------------------------------------------#
-
-if __name__ == '__main__':
-    main(sys.argv[1:])
-
-#----------------------------------------------------------------------------#
-
-# vim: ts=4 sw=4 sts=4 et tw=78:

File src/format_eval.py

View file
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# 
+#  format_eval.py
+#  gpalign
+#  
+#  Created by Lars Yencken on 2005-09-07.
+#  Copyright 2005-2010 Lars Yencken. All rights reserved.
+# 
+
+import sys
+import optparse
+from cjktools.common import sopen
+
+from entry import Entry
+
+def format_eval_file(input_file, output_file):
+    entries = _parse_entries(input_file)
+    ostream = sopen(output_file, 'w')
+
+    for entry in entries:
+        line_a = entry.g_string.ljust(10, u' ')
+        line_b = entry.p_string.ljust(10, u' ')
+
+        extra_a, extra_b = _match_alignments(entry.alignments[0])
+        line_a += extra_a.ljust(10, u' ')
+        line_b += extra_b.ljust(10, u' ')
+
+        extra_a, extra_b = _match_alignments(entry.alignments[1])
+        line_a += extra_a.ljust(10, u' ')
+        line_b += extra_b.ljust(10, u' ')
+
+        print >> ostream, line_a
+        print >> ostream, line_b
+        print >> ostream
+
+    ostream.close()
+
+def _match_alignments(alignment):
+    g_segments, p_segments = map(list, alignment)
+    for i in range(len(g_segments)):
+        g_segments[i] = g_segments[i].ljust(len(p_segments[i]), u' ')
+
+    line_a = u'|'.join(g_segments)
+    line_b = u'|'.join(p_segments)
+
+    return line_a, line_b
+
+def _parse_entries(input_file):
+    entries = []
+    for line in sopen(input_file, 'r'):
+        base, attempt, actual = line.strip().split(':')
+
+        g_string, p_string = base.split()
+        entry = Entry(g_string, p_string)
+        fixify = lambda x: map(lambda y: y.strip('|').split('|'), 
+                x.split())
+        attempt = fixify(attempt)
+        actual = fixify(actual)
+
+        entry.alignments=[attempt, actual]
+        
+        entries.append(entry)
+
+    return entries
+
+def create_option_parser():
+    usage = "%prog [options] input_file output_file"
+
+    parser = optparse.OptionParser(usage)
+
+    return parser
+
+def main(argv):
+    parser = create_option_parser()
+    (options, args) = parser.parse_args(argv)
+
+    try:
+        [input_file, output_file] = args
+    except:
+        parser.print_help()
+        sys.exit(1)
+
+    format_eval_file(input_file, output_file)
+
+if __name__ == '__main__':
+    main(sys.argv[1:])
+
+# vim: ts=4 sw=4 sts=4 et tw=78:
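
To make the formatting above concrete: _parse_entries() reads the colon-delimited comparison lines produced by the evaluation step, where each line carries the grapheme string and its reading, then the attempted segmentation, then the gold segmentation, with segments separated by '|'. A hand-written example line (the word and its split are illustrative only):

    言葉 ことば:言|葉 こと|ば:言|葉 こと|ば

format_eval_file() then pads each segmentation into fixed-width columns so that an attempt and its gold standard can be eyeballed side by side. A sketch of running the module as a script, with placeholder file names:

    python format_eval.py output/results.bad output/results.bad.formatted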

File src/frequency.py

View file
 # -*- coding: utf-8 -*-
-#----------------------------------------------------------------------------#
-# frequency.py
-# Lars Yencken <lars.yencken@gmail.com>
-# vim: ts=4 sw=4 sts=4 et tw=78:
-# Thu Aug 11 16:01:52 EST 2005
-#
-#----------------------------------------------------------------------------#
+# 
+#  frequency.py
+#  gpalign
+#  
+#  Created by Lars Yencken on 2005-08-11.
+#  Copyright 2005-2010 Lars Yencken. All rights reserved.
+# 
 
 from cjktools import scripts
 
-#----------------------------------------------------------------------------#
-
 class FrequencyMap:
     """ The data structure within which frequency counts for tf-idf
         calculations are stored.
     def __init__(self):
         self._graphemes = {}
 
-        self._gSize = 0.0
-        self._gpSize = 0.0
-        self._gpcSize = 0.0
+        self._g_size = 0.0
+        self._gp_size = 0.0
+        self._gpc_size = 0.0
 
         return
     
-    def addCounts(self, alignment):
+    def add_counts(self, alignment):
         """ This method updates all the counts associated with the entry.
         """
-        kanjiScript = scripts.Script.Kanji
-        gSegments, pSegments = alignment
-        for i in range(len(gSegments)):
-            if scripts.script_type(gSegments[i]) == kanjiScript:
-                g, gp, gpc = self._getContext(gSegments, pSegments, i)
+        kanji_script = scripts.Script.Kanji
+        g_segments, p_segments = alignment
+        for i in range(len(g_segments)):
+            if scripts.script_type(g_segments[i]) == kanji_script:
+                g, gp, gpc = self._get_context(g_segments, p_segments, i)
 
                 if not self._graphemes.has_key(g):
                     # if we don't have g, we can't have gp, gpc
                     self._graphemes[g] = (1, {gp: (1, {gpc: 1})})
-                    self._gSize += 1
-                    self._gpSize += 1
-                    self._gpcSize += 1
+                    self._g_size += 1
+                    self._gp_size += 1
+                    self._gpc_size += 1
 
                 else:
-                    gCount, gpDict = self._graphemes[g]
-                    gCount += 1
-                    if not gpDict.has_key(gp):
+                    g_count, gp_dict = self._graphemes[g]
+                    g_count += 1
+                    if not gp_dict.has_key(gp):
                         # without gp, we also can't have gpc
-                        gpDict[gp] = (1, {gpc: 1})
-                        self._gpSize += 1
-                        self._gpcSize += 1
+                        gp_dict[gp] = (1, {gpc: 1})
+                        self._gp_size += 1
+                        self._gpc_size += 1
                     else:
-                        gpCount, gpcDict = gpDict[gp]
-                        gpCount += 1
-                        if not gpcDict.has_key(gpc):
-                            gpcDict[gpc] = 1
-                            self._gpcSize += 1
+                        gp_count, gpc_dict = gp_dict[gp]
+                        gp_count += 1
+                        if not gpc_dict.has_key(gpc):
+                            gpc_dict[gpc] = 1
+                            self._gpc_size += 1
                         else:
-                            gpcDict[gpc] += 1
-                        gpDict[gp] = gpCount, gpcDict
-                    self._graphemes[g] = gCount, gpDict
+                            gpc_dict[gpc] += 1
+                        gp_dict[gp] = gp_count, gpc_dict
+                    self._graphemes[g] = g_count, gp_dict
 
         return
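
For reference, the nested counts built up by add_counts() have the shape sketched below; this is a hand-drawn illustration rather than output from the code. The outer map is keyed by grapheme; each grapheme stores its count plus a map keyed by its reading; each reading stores its count plus a map keyed by the surrounding context returned by _get_context().

    # Illustrative shape of self._graphemes (counts invented):
    # {
    #     grapheme: (count, {
    #         reading: (count, {
    #             (left_g, left_p, right_g, right_p): count,
    #         }),
    #     }),
    # }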
     
-    def delCounts(self, alignment):
+    def del_counts(self, alignment):
         """ This method updates all the counts associated with the entry.
         """
-        kanjiScript = scripts.Script.Kanji
-        gSegments, pSegments = alignment
-        for i in range(len(gSegments)):
-            if scripts.script_type(gSegments[i]) == kanjiScript:
-                g, gp, gpc = self._getContext(gSegments, pSegments, i)
-                gCount, gpDict = self._graphemes[g]
-                gCount -= 1
-                if gCount < 1:
+        kanji_script = scripts.Script.Kanji
+        g_segments, p_segments = alignment
+        for i in range(len(g_segments)):
+            if scripts.script_type(g_segments[i]) == kanji_script:
+                g, gp, gpc = self._get_context(g_segments, p_segments, i)
+                g_count, gp_dict = self._graphemes[g]
+                g_count -= 1
+                if g_count < 1:
                     del self._graphemes[g]
-                    self._gSize -= 1
+                    self._g_size -= 1
                     continue
 
-                gpCount, gpcDict = gpDict[gp]
-                gpCount -= 1
-                if gpCount < 1:
-                    del gpDict[gp]
-                    self._gpSize -= 1
-                    self._graphemes[g] = gCount, gpDict
+                gp_count, gpc_dict = gp_dict[gp]
+                gp_count -= 1
+                if gp_count < 1:
+                    del gp_dict[gp]
+                    self._gp_size -= 1
+                    self._graphemes[g] = g_count, gp_dict
                     continue
 
-                gpcCount = gpcDict[gpc]
-                gpcCount -= 1
-                if gpcCount < 1:
-                    del gpcDict[gpc]
-                    self._gpcSize -= 1
+                gpc_count = gpc_dict[gpc]
+                gpc_count -= 1
+                if gpc_count < 1:
+                    del gpc_dict[gpc]
+                    self._gpc_size -= 1
                 else:
-                    gpcDict[gpc] = gpcCount
+                    gpc_dict[gpc] = gpc_count
 
-                gpDict[gp] = gpCount, gpcDict
-                self._graphemes[g] = gCount, gpDict
+                gp_dict[gp] = gp_count, gpc_dict
+                self._graphemes[g] = g_count, gp_dict
 
         return
         
-    def _getContext(self, gSegments, pSegments, index):
+    def _get_context(self, g_segments, p_segments, index):
         """ Determine the context needed for calculations or for frequency
             updates.
         """
-        grapheme = gSegments[index]
-        phoneme = pSegments[index]
+        grapheme = g_segments[index]
+        phoneme = p_segments[index]
 
         # determine the left context...
         if index > 0:
-            leftG = gSegments[index-1]
-            leftP = pSegments[index-1]
+            left_g = g_segments[index-1]
+            left_p = p_segments[index-1]
         else:
-            leftG = None
-            leftP = None
+            left_g = None
+            left_p = None

         # ...and the right context 
-        if index < len(gSegments) - 1:
-            rightG = gSegments[index+1]
-            rightP = pSegments[index+1]
+        if index < len(g_segments) - 1:
+            right_g = g_segments[index+1]
+            right_p = p_segments[index+1]
         else:
-            rightG = None
-            rightP = None
+            right_g = None
+            right_p = None

-        return grapheme, phoneme, (leftG, leftP, rightG, rightP)
+        return grapheme, phoneme, (left_g, left_p, right_g, right_p)
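
A hand-worked example of the triple this method returns (illustrative, not from the commit), using a two-segment alignment:

    # FrequencyMap()._get_context([u'言', u'葉'], [u'こと', u'ば'], 0)
    # -> (u'言', u'こと', (None, None, u'葉', u'ば'))

That is: the grapheme in the given slot, its reading, and its neighbourhood as (left grapheme, left reading, right grapheme, right reading), with None marking a word boundary.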
     
-    def frequencies(self, gSegments, pSegments, index):
+    def frequencies(self, g_segments, p_segments, index):
         """ Calculates the frequencies of occurence of the segment specified
             within the alignment.
         """
-        g, gp, gpc = self._getContext(gSegments, pSegments, index)
+        g, gp, gpc = self._get_context(g_segments, p_segments, index)
 
-        gFreq, gpDict = self._graphemes.get(g, (0, {}))
-        gpFreq, gpcDict = gpDict.get(gp, (0, {}))
-        gpcFreq = gpcDict.get(gpc, 0)
+        g_freq, gp_dict = self._graphemes.get(g, (0, {}))
+        gp_freq, gpc_dict = gp_dict.get(gp, (0, {}))
+        gpc_freq = gpc_dict.get(gpc, 0)
 
-        gFreq /= self._gSize
-        gpFreq /= self._gpSize
-        gpcFreq /= self._gpcSize
+        g_freq /= self._g_size
+        gp_freq /= self._gp_size
+        gpc_freq /= self._gpc_size
 
-        return gFreq, gpFreq, gpcFreq
+        return g_freq, gp_freq, gpc_freq
     
 #----------------------------------------------------------------------------#
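
Putting FrequencyMap together, a minimal end-to-end sketch; the import path, the example word and its segmentation are assumptions for illustration only:

    from gpalign.frequency import FrequencyMap

    freq_map = FrequencyMap()
    alignment = ([u'言', u'葉'], [u'こと', u'ば'])   # (grapheme segments, reading segments)

    freq_map.add_counts(alignment)              # register the alignment
    g_freq, gp_freq, gpc_freq = freq_map.frequencies(
            alignment[0], alignment[1], 0)      # scores for the first segment
    freq_map.del_counts(alignment)              # retract the alignment again

The three returned values are the grapheme, grapheme-reading and grapheme-reading-context counts for that segment, each divided by the size of the corresponding table.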

File src/okuriganaModel.py