1. Lars Yencken
  2. jp-grapheme-alt

Commits

Lars Yencken  committed d33df77

Imports an initial snapshot from FOKS.

  • Participants
  • Branches default
  • Tags v0.1.0

Comments (0)

Files changed (9)

File .hgignore

View file
  • Ignore whitespace
+syntax: glob
+*.orig
+*.rej
+*~
+*.o
+*.so
+*.os
+*.pyo
+*.pyc
+*.log
+build/*
+html/*
+.sconf_temp/*
+.sconsign.dblite
+cache/*
+tmp/*
+log/*
+*.swp
+MANIFEST
+dist/*
+*.deb
+.DS_Store
+data
+*.egg-info
+__version__.py
+*.bak

File .project

View file
  • Ignore whitespace
+jp-grapheme-alt

File setup.py

View file
  • Ignore whitespace
+# -*- coding: utf-8 -*-
+# 
+#  setup.py
+#  jp-grapheme-alt
+#  
+#  Created by Lars Yencken on 2009-04-20.
+#  Copyright 2009 Lars Yencken. All rights reserved.
+# 
+
+"""
+Package setup file for the jp-grapheme-alt package.
+"""
+
+#----------------------------------------------------------------------------#
+
+from setuptools import setup
+
+#----------------------------------------------------------------------------#
+
+VERSION = '0.1.0'
+
+f = open('src/__version__.py', 'w')
+f.write('# Autogenerated by setup.py\n')
+f.write('version = "%s"\n' % VERSION)
+f.close()
+
+setup(
+        name='jp-grapheme-alt',
+        description="Probabilistic misrecognition models for Japanese kanji.",
+        long_description = """
+        Models for plausible misrecognition of Japanese kanji based on
+        visual similarity.
+        """,
+        url="http://bitbucket.org/lars512/jp-grapheme-alt/",
+        version=VERSION,
+        author="Lars Yencken",
+        author_email="lljy@csse.unimelb.edu.au",
+        license="BSD",
+        install_requires=[
+                'cjktools',
+                'cjktools_data',
+                'django',
+                'django-hierarchy',
+                'south',
+            ],
+        package_dir={'jp_grapheme_alt': 'src'},
+        packages=['jp_grapheme_alt'],
+    )

File src/admin.py

View file
  • Ignore whitespace
+# -*- coding: utf-8 -*-
+# 
+#  admin.py
+#  jp-grapheme-alt
+#  
+#  Created by Lars Yencken on 2009-04-20.
+#  Copyright 2009 Lars Yencken. All rights reserved.
+# 
+
+from django.contrib import admin
+from django.db.models import get_model
+
+class GraphemeAlternationAdmin(admin.ModelAdmin):
+    list_display = ('surface_form', 'base_form', 'code', 'probability')
+    list_filter = ('code',)
+    search_fields = ('surface_form', 'base_form')
+
+admin.site.register(
+        get_model('jp_grapheme_alt', 'graphemealternation'),
+        GraphemeAlternationAdmin,
+    )

File src/confusion_model.py

View file
  • Ignore whitespace
+# -*- coding: utf-8 -*-
+# 
+#  confusion_model.py
+#  jp-grapheme-alt
+#  
+#  Created by Lars Yencken on 2007-12-29.
+#  Copyright 2007-2008 Lars Yencken. All rights reserved.
+# 
+
+"Grapheme confusion models based on orthographic or semantic similarity."
+
+#----------------------------------------------------------------------------#
+
+import os
+import math
+
+from django.conf import settings
+from cjktools.sequences import ithread
+from cjktools.common import sopen
+from simplestats import freq
+
+#----------------------------------------------------------------------------#
+
+_char_freq_file = os.path.join(settings.DATA_DIR, 'corpus',
+        'jp_char_corpus_counts.gz')
+
+#----------------------------------------------------------------------------#
+
+class FlatConfusionModel(object):
+    """
+    A generic confusion module based on precomputed similarity scores
+    between symbols.
+    
+    Acts as the distribution P(base_form|surface_form), approximated by
+    P(surface_form|base_form)P(base_form).
+    """
+    def __init__(self, data_file):
+        self.data_file = data_file
+        char_dist = freq.FreqDist.from_file(_char_freq_file)
+        freq.smooth_by_adding_one(char_dist)
+        
+        surface_given_base_dist = self._load_distractors(data_file, char_dist)
+        base_given_surface_dist = self._rescore_candidates(
+                surface_given_base_dist, char_dist)
+        self.dist = base_given_surface_dist
+
+    def prob(self, surface_form, base_form):
+        base_dist = self.dist.get(surface_form)
+        if base_dist:
+            return base_dist.get(base_form, 0.0)
+        else:
+            return (surface_form == base_form) and 1.0 or 0.0
+
+    def log_prob(self, surface_form, base_form):
+        return math.log(self.prob(surface_form, base_form))
+
+    def candidates(self, surface_form):
+        """
+        Returns a list of (base_form, log_p_base_given_surface) pairs
+        containing all known candidates.
+        """
+        base_dist = self.dist.get(surface_form)
+        if base_dist:
+            return base_dist.items()
+        else:
+            return [(surface_form, 0.0)]
+
+    @staticmethod
+    def _load_distractors(filename, char_dist):
+        """
+        Builds and returns the conditional distribution 
+        P(surface_form|base_form).
+        """
+        # First build a map: base -> (surface -> similarity)
+        neighbourhoods = {}
+        i_stream = sopen(filename)
+        for line in i_stream:
+            slots = line.rstrip().split()
+            base_form = slots.pop(0)
+            surface_forms = {base_form: 1.0}
+            for surface_form, similarity in ithread(slots):
+                similarity = float(similarity)
+                assert 0.0 <= similarity <= 1.0
+
+                if similarity == 0.0:
+                    continue
+
+                surface_forms[surface_form] = similarity
+            assert surface_forms
+            neighbourhoods[base_form] = surface_forms
+
+        i_stream.close()
+        assert neighbourhoods
+        
+        # Convert to map: base -> (surface -> probability), using
+        # P(surface|base) = P(surface)s(surface, base)/(sum(...)), 
+        # i.e. normalised.
+        surface_given_base_dist = {}
+        for base_form, neighbourhood in neighbourhoods.iteritems():
+            surface_dist = {}
+            assert neighbourhood
+            for neighbour, similarity in neighbourhood.iteritems():
+                surface_dist[neighbour] = similarity * \
+                        char_dist.prob(neighbour)
+            sum_scores = sum(surface_dist.values())
+            for surface_form in surface_dist.iterkeys():
+                surface_dist[surface_form] /= sum_scores
+            
+            assert surface_dist
+            surface_given_base_dist[base_form] = surface_dist
+            
+        return surface_given_base_dist
+        
+    @staticmethod
+    def _rescore_candidates(surface_given_base_dist, char_dist):
+        """
+        Convert our map from a P(surface_form|base_form) distribution to
+        an approximation of P(base_form|surface_form), by multiplying each
+        value by P(base_form).
+        """
+        assert char_dist
+        new_dist = {}
+        log = math.log
+        for base_form, surface_dist in surface_given_base_dist.iteritems():
+            assert surface_dist
+            base_form_prob = char_dist.prob(base_form)
+            for surface_form, p_surface_given_base in \
+                    surface_dist.iteritems():
+                base_dist = new_dist.setdefault(surface_form, {})
+                p_base_given_surface = p_surface_given_base * base_form_prob
+                base_dist[base_form] = p_base_given_surface
+        
+        for surface_form, base_dist in new_dist.iteritems():
+            sum_probs = sum(base_dist.itervalues())
+            assert sum_probs > 0
+            for base_form, simple_prob in base_dist.iteritems():
+                normalised_prob = simple_prob / sum_probs
+                assert 0 < normalised_prob <= 1
+                base_dist[base_form] = normalised_prob                
+        assert new_dist
+        return new_dist
+
+#----------------------------------------------------------------------------#
+
+class WeightedConfusionModel(FlatConfusionModel):
+    """
+    Acts as the distribution P(base_form|surface_form), approximated by
+    P(surface_form|base_form)P(base_form).
+    
+    Compared to the flat model, we now weight probabilities towards
+    surface_form == base_form by using a constant weight parameter in (0, 1).
+    """
+    def __init__(self, data_file, weight):
+        FlatConfusionModel.__init__(self, data_file)
+        assert 0 < weight < 1
+        self.weight = weight
+
+    def prob(self, surface_form, base_form):
+        raw_prob = FlatConfusionModel.prob(self, surface_form, base_form)
+        return self._adjust_using_weight(surface_form, base_form, raw_prob)
+
+    def log_prob(self, surface_form, base_form):
+        return math.log(self.prob(surface_form, base_form))
+
+    def candidates(self, surface_form):
+        raw_candidates = FlatConfusionModel.candidates(self, surface_form)
+        adjusted_candidates = []
+        for base_form, raw_log_prob in raw_candidates:
+            adjusted_log_prob = math.log(self._adjust_using_weight(
+                    surface_form, base_form, math.exp(raw_log_prob)))
+            adjusted_candidates.append((base_form, adjusted_log_prob))
+        return adjusted_candidates
+    
+    def iterkeys(self):
+        return self.dist.iterkeys()
+
+    def _adjust_using_weight(self, surface_form, base_form, raw_prob):
+        if (surface_form == base_form):
+            return self.weight + (1 - self.weight) * raw_prob
+        else:
+            return (1 - self.weight) * raw_prob
+
+#----------------------------------------------------------------------------#

File src/grapheme_database.py

View file
  • Ignore whitespace
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# 
+#  grapheme_database.py
+#  jp-grapheme-alt
+#  
+#  Created by Lars Yencken on 2008-01-08.
+#  Copyright 2008 Lars Yencken. All rights reserved.
+# 
+
+"""
+This script builds the grapheme-alternation table which allows character
+similarity search to take place.
+"""
+
+import os, sys, optparse
+
+from django.db import connection
+from django.conf import settings
+from cjktools.sequences import groups_of_n_iter
+import consoleLog
+
+import confusion_model
+
+#----------------------------------------------------------------------------#
+# PUBLIC
+#----------------------------------------------------------------------------#
+
+# Location of file giving P(surface_form|base_form) for each surface_form 
+# listed.
+_similarity_file = os.path.join(settings.DATA_DIR, 'similarity',
+                    'jyouyou__strokeEditDistance')
+ 
+dependencies = [__file__, _similarity_file]
+
+log = consoleLog.default
+
+class GraphemeDatabase(object):
+    def build(self):
+        log.start('Building grapheme tables', nSteps=1)
+        log.log('Storing grapheme alternations')
+        alt_tree = self._store_alternations()
+        log.finish()
+        return
+
+    def _store_alternations(self):
+        graphicalModel = confusion_model.WeightedConfusionModel(
+                _similarity_file, 0.9)
+        assert graphicalModel.dist
+        rowIter = self._iter_rows(graphicalModel)
+        cursor = connection.cursor()
+        cursor.execute('DELETE FROM jp_grapheme_alt_graphemealternation')
+        for rowGroup in groups_of_n_iter(10000, rowIter):
+            cursor.executemany(
+                    """
+                    INSERT INTO jp_grapheme_alt_graphemealternation
+                        (base_form, surface_form, code, probability)
+                    VALUES (%s, %s, %s, %s)
+                    """,
+                    rowGroup
+                )
+        cursor.close()
+        return
+
+    def _iter_rows(self, model):
+        for surface_form in model.iterkeys():
+            candidates = model.candidates(surface_form)
+            if len(candidates) == 1:
+                # No candidates other than exact matches; omit from our table.
+                continue
+                
+            for base_form, log_prob in candidates:
+                yield base_form, surface_form, 'g', log_prob
+
+#----------------------------------------------------------------------------#
+
+def build():
+    db = GraphemeDatabase()
+    db.build()
+    return
+    
+#----------------------------------------------------------------------------#
+# PRIVATE
+#----------------------------------------------------------------------------#
+
+#----------------------------------------------------------------------------#
+# MODULE EPILOGUE
+#----------------------------------------------------------------------------#
+
+def _create_option_parser():
+    usage = \
+"""%prog [options]
+
+Build the grapheme confusion tables for FOKS."""
+
+    parser = optparse.OptionParser(usage)
+
+    parser.add_option('--debug', action='store_true', dest='debug',
+            default=False, help='Enables debugging mode [False]')
+
+    return parser
+
+#----------------------------------------------------------------------------#
+
+def main(argv):
+    parser = _create_option_parser()
+    (options, args) = parser.parse_args(argv)
+
+    if args:
+        parser.print_help()
+        sys.exit(1)
+
+    # Avoid psyco in debugging mode, since it merges stack frames.
+    if not options.debug:
+        try:
+            import psyco
+            psyco.profile()
+        except:
+            pass
+
+    build()
+
+    return
+
+#----------------------------------------------------------------------------#
+
+# Run the build when this module is executed as a script, passing the
+# command-line arguments without the program name.
+if __name__ == '__main__':
+    main(sys.argv[1:])
+
+#----------------------------------------------------------------------------#
+  
+# vim: ts=4 sw=4 sts=4 et tw=78:

File src/models.py

View file
  • Ignore whitespace
+# -*- coding: utf-8 -*-
+# 
+#  models.py
+#  jp-grapheme-alt
+#  
+#  Created by Lars Yencken on 2009-04-20.
+#  Copyright 2009 Lars Yencken. All rights reserved.
+# 
+
+import math
+
+from django.db import models
+from django.conf import settings
+
+ALTERNATION_TYPES = (
+        ('/', 'root node'),
+        ('k', 'kanji node'),
+        ('b', 'base reading'),
+        ('v', 'vowel length'),
+        ('s', 'sequential voicing'),
+        ('g', 'sound euphony'),
+        ('p', 'palatalization'),
+        ('G', 'graphical similarity'),
+        ('S', 'semantic similarity'),
+        ('c', 'cooccurrence'),
+    )
+
+class GraphemeAlternation(models.Model):
+    "The probability of the base_form given the surface_form."
+    surface_form = models.CharField(max_length=settings.UTF8_BYTES_PER_CHAR,
+            db_index=True,
+            help_text='The observed form of the grapheme.')
+
+    base_form = models.CharField(max_length=settings.UTF8_BYTES_PER_CHAR,
+            db_index=True,
+            help_text='The underlying form from which the surface is derived.')
+
+    code = models.CharField(max_length=1, choices=ALTERNATION_TYPES,
+            help_text='The type of alternation which occurred.')
+
+    # The probability P(base_form|surface_form), approximated by
+    # P(surface_form|base_form)P(base_form).
+    probability = models.FloatField(
+            help_text='An approximation of P(base_form|surface_form).')
+
+    def __unicode__(self):
+        return 'P(%s|%s) = %g' % (
+                self.base_form,
+                self.surface_form,
+                math.exp(self.probability),
+            )

File src/test_confusion_model.py

View file
  • Ignore whitespace
+# -*- coding: utf-8 -*-
+# 
+#  test_confusion_model.py
+#  jp-grapheme-alt
+#  
+#  Created by Lars Yencken on 2008-01-04.
+#  Copyright 2008 Lars Yencken. All rights reserved.
+# 
+
+import os
+import unittest
+import doctest
+
+from cjktools.scripts import unique_kanji
+from cjktools.common import sopen
+from django.conf import settings
+
+import confusion_model
+
+#----------------------------------------------------------------------------#
+
+def suite():
+    testSuite = unittest.TestSuite((
+            unittest.makeSuite(ConfusionModelTestCase),
+            doctest.DocTestSuite(confusion_model)
+        ))
+    return testSuite
+
+#----------------------------------------------------------------------------#
+
+_allKanji = os.path.join(settings.DATA_DIR, 'lists', 'char', 'jp_jis')
+
+class ConfusionModelTestCase(unittest.TestCase):
+    """
+    This class tests the ConfusionModel class. 
+    """
+    def setUp(self):
+        filename = os.path.join(settings.DATA_DIR, 'similarity',
+                'jyouyou__yehAndLiRadical')
+        self.model = confusion_model.WeightedConfusionModel(filename, 0.9)
+        pass
+
+    def testConfusion(self):
+        for kanji in unique_kanji(sopen(_allKanji).read()):
+            candidates = self.model.candidates(kanji)
+            self.assertEqual(kanji, max(candidates, key=lambda x: x[1])[0])
+    
+    def tearDown(self):
+        pass
+
+#----------------------------------------------------------------------------#
+
+# Run the suite with the text runner when executed as a script.
+if __name__ == "__main__":
+    unittest.TextTestRunner(verbosity=1).run(suite())
+
+#----------------------------------------------------------------------------#
+
+# vim: ts=4 sw=4 sts=4 et tw=78:

File src/views.py

View file
  • Ignore whitespace
+# Create your views here.