Source

jp-grapheme-alt / src / grapheme_database.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# 
#  grapheme_database.py
#  jp-grapheme-alt
#  
#  Created by Lars Yencken on 2008-01-08.
#  Copyright 2008 Lars Yencken. All rights reserved.
# 

"""
This script builds the grapheme-alternation table which allows character
similarity search to take place.
"""

import sys, optparse

import pkg_resources
from django.db import connection
from simplestats.sequences import groups_of_n_iter
import consoleLog

import confusion_model

#----------------------------------------------------------------------------#
# PUBLIC
#----------------------------------------------------------------------------#

log = consoleLog.default

class GraphemeDatabase(object):
    def __init__(self, dataset='jyouyou__strokeEditDistance'):
        self._dataset = dataset
    
    def build(self):
        log.start('Building grapheme tables', nSteps=1)
        log.log('Storing grapheme alternations')
        alt_tree = self._store_alternations()
        log.finish()
        return

    def _store_alternations(self):
        filename = pkg_resources.resource_filename(
                'jp_grapheme_alt', 'data/%s' % self._dataset)
        graphicalModel = confusion_model.WeightedConfusionModel(
                filename, 0.9)
        assert graphicalModel.dist
        rowIter = self._iter_rows(graphicalModel)
        cursor = connection.cursor()
        cursor.execute('DELETE FROM jp_grapheme_alt_graphemealternation')
        for rowGroup in groups_of_n_iter(10000, rowIter):
            cursor.executemany(
                    """
                    INSERT INTO jp_grapheme_alt_graphemealternation
                        (base_form, surface_form, code, probability)
                    VALUES (%s, %s, %s, %s)
                    """,
                    rowGroup
                )
        cursor.close()
        return

    def _iter_rows(self, model):
        for surface_form in model.iterkeys():
            candidates = model.candidates(surface_form)
            if len(candidates) == 1:
                # No candidates other than exact matches; omit from our table.
                continue
                
            for base_form, log_prob in candidates:
                yield base_form, surface_form, 'g', log_prob

#----------------------------------------------------------------------------#

def build():
    db = GraphemeDatabase()
    db.build()
    return
    
#----------------------------------------------------------------------------#
# PRIVATE
#----------------------------------------------------------------------------#

#----------------------------------------------------------------------------#
# MODULE EPILOGUE
#----------------------------------------------------------------------------#

def _create_option_parser():
    usage = \
"""%prog [options]

Build the grapheme confusion tables for FOKS."""

    parser = optparse.OptionParser(usage)

    parser.add_option('--debug', action='store_true', dest='debug',
            default=False, help='Enables debugging mode [False]')

    return parser

#----------------------------------------------------------------------------#

def main(argv):
    parser = _create_option_parser()
    (options, args) = parser.parse_args(argv)

    if args:
        parser.print_help()
        sys.exit(1)

    # Avoid psyco in debugging mode, since it merges stack frames.
    if not options.debug:
        try:
            import psyco
            psyco.profile()
        except:
            pass

    build()

    return

#----------------------------------------------------------------------------#

if __name__ == '__main__':
    main(sys.argv[1:])

#----------------------------------------------------------------------------#
  
# vim: ts=4 sw=4 sts=4 et tw=78:
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.