Commits

Lars Yencken  committed f31f6e6

Adds an initial snapshot of the reading_alt app from FOKS.

  • Participants

Comments (0)

Files changed (17)

+syntax: glob
+*.orig
+*.rej
+*~
+*.o
+*.so
+*.os
+*.pyo
+*.pyc
+*.log
+build/*
+html/*
+.sconf_temp/*
+.sconsign.dblite
+cache/*
+tmp/*
+log/*
+*.swp
+MANIFEST
+dist/*
+*.deb
+.DS_Store
+data
+*.egg-info
+__version__.py
+*.bak
+jp-reading-alt
# -*- coding: utf-8 -*-
#
#  setup.py
#  jp-reading-alt
#
#  Created by Lars Yencken on 10-04-2009.
#  Copyright 2009 Lars Yencken. All rights reserved.
#

"""
Package setup file for the jp-reading-alt package.
"""

#----------------------------------------------------------------------------#

from setuptools import setup
import os
import re

#----------------------------------------------------------------------------#

VERSION = '0.1.0'

# Autogenerate src/__version__.py so the installed package knows its own
# version. The context manager guarantees the handle is closed even if a
# write fails (the original open()/close() pair leaked it on error).
with open('src/__version__.py', 'w') as version_file:
    version_file.write('# Autogenerated by setup.py\n')
    version_file.write('version = "%s"\n' % VERSION)

setup(
        name='jp-reading-alt',
        description="Probabilistic reading error models for Japanese kanji.",
        long_description = """
        Reading alternation models for Japanese kanji, considering effects of:
        sequential voicing, sound euphony, vowel length and palatalization
        errors.
        """,
        url="http://bitbucket.org/lars512/jp-reading-alt/",
        version=VERSION,
        author="Lars Yencken",
        author_email="lljy@csse.unimelb.edu.au",
        license="BSD",

        package_dir={'jp_reading_alt': 'src'},
        packages=['jp_reading_alt'],
    )

File src/__init__.py

+# -*- coding: utf-8 -*-
+#
+#  __init__.py
+#  jp-reading-alt
+# 
+#  Created by Lars Yencken on 10-04-2009.
+#  Copyright 2009 Lars Yencken. All rights reserved.
+#
+
+"""
+A django app for modelling kanji reading alternations.
+"""
+
+# vim: ts=4 sw=4 sts=4 et tw=78:
+

File src/admin.py

# -*- coding: utf-8 -*-
#
#  admin.py
#  jp-reading-alt
# 
#  Created by Lars Yencken on 10-04-2009.
#  Copyright 2009 Lars Yencken. All rights reserved.
#

"""
Django admin bindings for the reading_alt models.
"""

from django.contrib import admin

import models

class KanjiReadingAdmin(admin.ModelAdmin):
    # Columns shown in the change-list for kanji readings.
    list_display = ('kanji', 'reading', 'alternations')
    # Sidebar filter over the alternation-code string.
    list_filter = ('alternations',)

admin.site.register(models.KanjiReading, KanjiReadingAdmin)

class ReadingAlternationAdmin(admin.ModelAdmin):
    # Columns shown in the change-list for alternation steps.
    list_display = ('value', 'code', 'probability')

admin.site.register(models.ReadingAlternation, ReadingAlternationAdmin)

# vim: ts=4 sw=4 sts=4 et tw=78:

File src/alternation_model.py

+# -*- coding: utf-8 -*-
+#
+#  alternation_model.py
+#  jp-reading-alt
+# 
+#  Created by Lars Yencken on 10-04-2009.
+#  Copyright 2009 Lars Yencken. All rights reserved.
+#
+
+"An abstract phonetic alternation model."
+
+
+import math
+
+from cjktools import kana_table
+from cjktools.exceptions import AbstractMethodError
+from django.conf import settings
+
+#----------------------------------------------------------------------------#
+
class AlternationModelI(object):
    """
    Interface for alternation models. An alternation model provides
    P(r'|r, k), giving both candidates and probabilities for r'.
    """
    def prob(self, kanji, reading, alternation):
        # Fix: the original signature omitted `self`, so any instance call
        # silently misbound the arguments (kanji received the instance).
        raise AbstractMethodError

    def log_prob(self, kanji, reading, alternation):
        # Fix: `self` was missing here as well.
        raise AbstractMethodError

#----------------------------------------------------------------------------#

class SimpleAlternationModel(AlternationModelI):
    """
    An alternation model based on readings which are subsets of things. To use,
    subclass this model and implement the build_pairs() method.
    """
    #------------------------------------------------------------------------#
    # PUBLIC
    #------------------------------------------------------------------------#

    def __init__(self, alpha):
        """
        alpha -- probability mass given to alternations; the remainder is
                 kept by the canonical reading (see prob()).
        """
        self.alpha = alpha
        self.pairs = self.build_pairs()
        # Symmetric variant lookup: each member of a pair maps to the list
        # of its counterparts. (Local renamed from `map`, which shadowed
        # the builtin; the attribute name `self.map` is kept for
        # compatibility with existing callers.)
        variant_map = {}
        for key_a, key_b in self.pairs:
            variant_map.setdefault(key_a, []).append(key_b)
            variant_map.setdefault(key_b, []).append(key_a)
        self.map = variant_map

    #------------------------------------------------------------------------#

    def log_prob(self, reading, reading_variant):
        """
        Returns the log probability of this variant given the canonical
        reading.
        """
        return math.log(self.prob(reading, reading_variant))

    #------------------------------------------------------------------------#

    def prob(self, reading, reading_variant):
        """
        Returns the probability of the given reading variant being shown
        given the canonical reading.

        The canonical reading keeps (1 - alpha) of the probability mass;
        alpha is spread uniformly over the reading and all its variants.
        """
        uniform_prob = 1.0 / self._num_variants(reading)
        if reading == reading_variant:
            return (1 - self.alpha) + self.alpha * uniform_prob
        else:
            return self.alpha * uniform_prob

    #------------------------------------------------------------------------#

    def candidates(self, kanji, reading):
        """
        Return a list of (variant, log probability) reading candidates.
        The kanji argument is unused by this simple model.
        """
        variants = [reading]
        if reading in self.map:
            variants.extend(self.map[reading])

        return [
                (reading_variant, self.log_prob(reading, reading_variant))
                for reading_variant in variants
            ]

    #------------------------------------------------------------------------#
    # PRIVATE
    #------------------------------------------------------------------------#

    def build_pairs(self):
        """
        Builds a list of (short form, long form) pairs for this type of
        alternation. Subclasses must override this.
        """
        raise AbstractMethodError

    #------------------------------------------------------------------------#

    def _num_variants(self, reading):
        """
        Returns the number of variants for this particular reading,
        including the reading itself.

        Sometimes calculating this is useful without generating the
        actual candidate list, which might be exponentially large.
        """
        if reading in self.map:
            return 1 + len(self.map[reading])
        else:
            return 1

    #------------------------------------------------------------------------#
+
class VowelLengthModel(SimpleAlternationModel):
    """
    An alternation model for vowel length.
    """
    def __init__(self):
        SimpleAlternationModel.__init__(self, settings.VOWEL_LENGTH_ALPHA)

    def build_pairs(self):
        """
        Builds (short form, long form) reading pairs for vowel length
        alternations.
        """
        # Maps each vowel to the kana used to lengthen it.
        vowel_pairs = {
                u'あ': u'あ',
                u'い': u'い',
                u'う': u'う',
                u'え': u'い',
                u'お': u'う',
            }

        vowel_to_y_form = {
                u'あ':  u'ゃ',
                u'う':  u'ゅ',
                u'お':  u'ょ',
            }

        table = kana_table.KanaTable.get_cached()
        pairs = []
        for consonant in table.consonants:
            if consonant == u'あ':
                # Plain vowels double in Japanese.
                # BUG FIX: the original wrote `for vowel, long_vowel in
                # vowel_pairs`, which iterates dictionary *keys* and raises
                # ValueError when unpacking a single kana into two names;
                # only the key is needed here.
                for vowel in vowel_pairs:
                    pairs.append((vowel, 2*vowel))

            else:
                # Other consonants are more limited.
                # .items() instead of .iteritems() keeps this working on
                # both Python 2 and Python 3.
                for vowel, long_vowel in vowel_pairs.items():
                    kana = table.from_coords(consonant, vowel)
                    pairs.append((kana, kana + long_vowel))

                y_prefix = table.from_coords(consonant, u'い')
                assert y_prefix
                for vowel, y_suffix in vowel_to_y_form.items():
                    long_vowel = vowel_pairs[vowel]
                    pairs.append((
                            y_prefix + y_suffix,
                            y_prefix + y_suffix + long_vowel,
                        ))

        return pairs

    #------------------------------------------------------------------------#

    @classmethod
    def get_cached(cls):
        """Return a shared instance, constructing it on first use."""
        if not hasattr(cls, '_cached'):
            cls._cached = cls()
        return cls._cached

    #------------------------------------------------------------------------#

#----------------------------------------------------------------------------#
+
class PalatalizationModel(SimpleAlternationModel):
    """
    A probability model of palatalization for Japanese.
    """

    #------------------------------------------------------------------------#

    def __init__(self):
        SimpleAlternationModel.__init__(self, settings.PALATALIZATION_ALPHA)

    #------------------------------------------------------------------------#

    def build_pairs(self):
        """
        Builds a correspondence between palatalized and unpalatalized forms
        of kana.
        """
        # The small y-kana which palatalize each vowel column.
        y_suffixes = {
                u'あ':  u'ゃ',
                u'う':  u'ゅ',
                u'お':  u'ょ',
            }

        table = kana_table.KanaTable.get_cached()
        pairs = []
        for consonant in table.consonants:
            # e.g. for the k-row: き + ゃ pairs with か, etc.
            i_form = table.from_coords(consonant, u'い')
            pairs.extend(
                    (table.from_coords(consonant, vowel),
                        i_form + y_suffixes[vowel])
                    for vowel in u'あうお'
                )

        return pairs

    #------------------------------------------------------------------------#

    @classmethod
    def get_cached(cls):
        "Return a lazily-constructed shared instance of this model."
        if not hasattr(cls, '_cached'):
            cls._cached = cls()
        return cls._cached

    #------------------------------------------------------------------------#

#----------------------------------------------------------------------------#

File src/explain_query.py

+# -*- coding: utf-8 -*-
+#
+#  explain_query.py
+#  jp-reading-alt
+# 
+#  Created by Lars Yencken on 10-04-2009.
+#  Copyright 2009 Lars Yencken. All rights reserved.
+#
+
try:
    from itertools import product
except ImportError:
    # itertools.product is unavailable before Python 2.6; fall back to the
    # pure-Python equivalent from the itertools documentation.
    def product(*args, **kwds):
        """
        Cartesian product of the input iterables.

        product('ABCD', 'xy') --> Ax Ay Bx By Cx Cy Dx Dy
        product(range(2), repeat=3) --> 000 001 010 011 100 101 110 111
        """
        # A list comprehension, rather than `map(tuple, args) * n`, keeps
        # the fallback correct even where map() returns a lazy iterator.
        pools = [tuple(arg) for arg in args] * kwds.get('repeat', 1)
        result = [[]]
        for pool in pools:
            result = [combo + [item] for combo in result for item in pool]
        for prod in result:
            yield tuple(prod)
+
+from cjktools.scripts import script_type, Script, script_boundaries
+from cjktools import enum
+from django.db.models import get_model
+from hierarchy import model_tree, html_tree
+
def explain_reading(word, reading):
    """
    Explains how this reading leads us to this word. If no plausible
    explanation is found, the reading is presumed to be unique to the whole
    word. The query link between the reading and the word should have been 
    already tested before this method is called.

    Returns a list of HTML fragments (one per grapheme segment), or a
    single message string when the reading is unique to the word.
    """
    grapheme_segs = _maximally_segment(word)
    
    # If the reading cannot be decomposed over the segments, it is treated
    # as a whole-word (non-compositional) reading.
    reading_segs = _segment_reading(grapheme_segs, reading)
    if not reading_segs:
        return _unique_reading(word, reading)
                
    result = []
    for g_seg, r_seg in zip(grapheme_segs, reading_segs):
        if script_type(g_seg) != Script.Kanji:
            # Kana segments need no explanation; echo them verbatim.
            result.append('<span class="explanation">%s</span> ' % g_seg)
            continue
        
        # Render the chain of alternation steps which produced this
        # segment's reading as a collapsible HTML tree.
        alternation = get_model('reading_alt', 'kanjireading'
                ).objects.get(kanji=g_seg, reading=r_seg).reading_alternation
        tree = model_tree.build_path(alternation, 'value')
        tree = _collapse_tree(tree)
        result.append(html_tree.as_html_tree(tree, open_until_depth=100,
                annotate=_annotator))
        
    return result
+
# Closed set of diagnoses for why a query reading failed to match a word;
# returned (as a set of members) by error_types().
QueryError = enum.Enum(
        'NonCompositionalReading',
        'SequentialVoicing',
        'SoundEuphony',
        'ChoiceOfReading',
        'VowelLength',
        'Palatalisation',
    )
+
def error_types(word, query, real_readings):
    """
    Determines what the cause of the error is in the given reading. Needs to
    know the real readings of the word to give an answer.

    Returns a set of QueryError members. Raises ValueError if the query is
    actually a correct reading, or if it cannot be segmented.
    """
    query = query.replace(' ', '') # ignore explicit segmentations
    
    if query in real_readings:
        raise ValueError('query is a valid reading')

    # Check that at least one real reading decomposes over the graphemes.
    grapheme_segs = _maximally_segment(word)    
    for real_reading in real_readings:
        if _segment_reading(grapheme_segs, real_reading):
            break
    
    else:
        # No real readings are compositional, so this error is due to
        # non-compositionality
        return set([QueryError.NonCompositionalReading])
    
    query_segs = _segment_reading(grapheme_segs, query)
    if not query_segs:
        raise ValueError("can't segment query -- shouldn't happen")
        
    reading_segs = _match_real_segments(grapheme_segs, query_segs,
            real_readings)
    if not reading_segs:
        # We didn't align to any real reading, so our reading was bad.
        return set([QueryError.ChoiceOfReading])
    
    KanjiReading = get_model('reading_alt', 'kanjireading')
    
    # We aligned to a real reading, so we can compare segment by segment
    errors = set()
    for g_seg, r_seg, q_seg in zip(grapheme_segs, reading_segs, query_segs):
        if r_seg == q_seg or script_type(g_seg) != Script.Kanji:
            continue
        
        # The symmetric difference of the stored alternation codes names
        # what changed between the real and queried segment readings.
        real_seg_reading = KanjiReading.objects.get(kanji=g_seg, 
                reading=r_seg)
        query_seg_reading = KanjiReading.objects.get(kanji=g_seg, 
                reading=q_seg)
        real_alt_set = set(real_seg_reading.alternations)
        query_alt_set = set(query_seg_reading.alternations)
        
        diff_codes = real_alt_set.union(query_alt_set).difference(
                real_alt_set.intersection(query_alt_set))
        
        if not diff_codes:
            # Readings differ, but how?
            if r_seg[1:] == q_seg[1:] and r_seg[0] != q_seg[0]:
                # Looks like voiced variants
                diff_codes = 's'
            else:
                raise Exception('no ostensible difference between readings')
        
        for code in diff_codes:
            if code == 's':
                # 's' is shared between sequential voicing and sound euphony
                if r_seg[:-1] == q_seg[:-1] \
                        and u'っ' in (r_seg[-1], q_seg[-1]):
                    errors.add(QueryError.SoundEuphony)
                else:
                    errors.add(QueryError.SequentialVoicing)
            elif code == 'g':
                errors.add(QueryError.SoundEuphony)
            elif code == 'v':
                errors.add(QueryError.VowelLength)
            elif code == 'p':
                errors.add(QueryError.Palatalisation)
            else:
                raise ValueError('unknown error code "%s"' % code)
    
    return errors
+
def _match_real_segments(grapheme_segs, query_segs, real_readings):
    """
    Work out if the given query is a minor variant of any of the real 
    readings. If so, return the segments for the real reading; otherwise,
    return None.
    """
    # Can we align this to any single real reading?
    get_reading = get_model('reading_alt', 'kanjireading').objects.get
    for reading in real_readings:
        reading_segs = _segment_reading(grapheme_segs, reading)
        if reading_segs is None:
            # No way to align these segments
            continue
                
        for g_seg, r_seg, q_seg in zip(grapheme_segs, reading_segs, 
                query_segs):
            
            # NOTE(review): error_types() skips segments where
            # r_seg == q_seg; comparing g_seg (a grapheme) against q_seg
            # (a reading) here looks inconsistent -- confirm intended.
            if g_seg == q_seg or script_type(g_seg) != Script.Kanji:
                continue
            
            real_seg_reading = get_reading(kanji=g_seg, reading=r_seg)
            query_seg_reading = get_reading(kanji=g_seg, reading=q_seg)
            if not real_seg_reading.shares_alternation_path(
                    query_seg_reading):
                # Not based on the same reading root -- this pairing is bad
                break
        else:
            # This pairing shared all significant reading roots
            return reading_segs
    
    return None
+
+#----------------------------------------------------------------------------#
+
def _segment_reading(grapheme_segs, reading):
    """
    Segment the reading in such a way that it matches the grapheme segments,
    or return None.
    """
    # Build list of part candidates: for kanji segments, every reading we
    # know of; for non-kanji segments, the segment itself.
    KanjiReading = get_model('reading_alt', 'kanjireading')
    reading_parts = []
    for g_seg in grapheme_segs:
        if script_type(g_seg) == Script.Kanji:
            reading_parts.append([r.reading for r in
                    KanjiReading.objects.filter(kanji=g_seg)])
        else:
            reading_parts.append([g_seg])
        
    # Keep combinations whose concatenation reproduces the target reading.
    # NOTE: the cartesian product can be large for long words.
    matches = [c for c in product(*reading_parts) if ''.join(c) == 
            reading]
    
    if len(matches) == 0:
        return None
    
    return matches[0] # possibly truncating alternative matches
+
def _collapse_tree(tree):
    """
    Reduce a tree to significant differences between child and parent nodes.

    Annotates every node with a 'diff_codes' attribute (the codes it
    introduces relative to its parent; None at the root), then splices out
    children which introduce no new codes, promoting their own children in
    their place. Mutates and returns the given tree.
    """
    for node in tree.walk_preorder():
        if node.parent:
            node.attrib['diff_codes'] = set(node.attrib['code']
                    ).difference(node.parent.attrib['code'])
        else:
            node.attrib['diff_codes'] = None
            
    for node in tree.walk_postorder():
        if not node.children:
            continue
        
        # Fix: snapshot the items before iterating -- the loop body deletes
        # from node.children, and mutating a dict while iterating a live
        # .items() view is an error on Python 3 (py2's .items() already
        # returned a copy, so this is a no-op there).
        for key, child in list(node.children.items()):
            if not child.attrib['diff_codes']:
                # Prune this child, taking its children first
                if child.children:
                    for grand_key, grand_child in list(child.children.items()):
                        assert grand_key not in node.children
                        node.children[grand_key] = grand_child
                
                del node.children[key]

    return tree
+
+def _unique_reading(word, reading):
+    return u'This is a unique reading of %s, not based on its parts.</p>' %\
+            word
+
# Human-readable descriptions for the single-letter alternation codes
# attached to tree nodes; the 's' entry is further disambiguated at
# render time.
_diff_code_verbose = {
        'b':        'base reading',
        'p':        'palatalization error',
        's':        'voicing or gemination alternation',
        'v':        'vowel error',
    }
+
def _annotator(node):
    """
    Render one alternation-tree node as an HTML span annotated with a
    human-readable explanation of its alternation code.
    """
    # Unpacking asserts exactly one differing code per node -- expected
    # once the tree has been collapsed.
    diff_code, = list(node.attrib['diff_codes'])
    if diff_code == 'k':
        # Kanji nodes get no further explanation.
        return u'<span class="explanation">%s</span>' % node.label
    
    explanation = _diff_code_verbose[diff_code]
    if diff_code == 's':
        # Disambiguate the shared 's' code: a final っ looks like
        # gemination (sound euphony) rather than sequential voicing.
        if node.label.endswith(u'っ'):
            explanation = 'sound euphony'
        else:
            explanation = 'sequential voicing'
    
    return u'<span class="explanation">%s (%s)</span>' % (
            node.label, explanation)
+
def _maximally_segment(word):
    """
    Splits the word into segments, such that each kanji is in its own
    segment and consecutive kana are grouped into one segment.
    """
    # Split at script changes, then break each kanji run into
    # single-character segments.
    output_segs = []
    for segment in script_boundaries(word):
        if script_type(segment) == Script.Kanji:
            # Extending with a string appends each character separately.
            output_segs.extend(segment)
        else:
            output_segs.append(segment)

    return output_segs

File src/migrations/0001_initial.py

+# -*- coding: utf-8 -*-
+
+from south.db import db
+from django.db import models, get_model
+from django.conf import settings
+
# (code, description) choices for the single-character alternation codes
# stored in the tables created below. Mirrors the definition in models.py.
ALTERNATION_TYPES = (
        ('/', 'root node'),
        ('k', 'kanji node'),
        ('b', 'base reading'),
        ('v', 'vowel length'),
        ('s', 'sequential voicing'),
        ('g', 'sound euphony'),      # Note: lumped together with 's'
        ('p', 'palatalization'),
        ('G', 'graphical similarity'),
        ('S', 'semantic similarity'),
        ('c', 'cooccurrence'),
    )
+
class Migration:
    """South migration creating the initial reading_alt schema."""
    
    def forwards(self):
        """Create the readingalternation and kanjireading tables."""
        
        # Model 'ReadingAlternation'
        # left_visit/right_visit appear to be nested-set tree columns from
        # the hierarchy app -- confirm against HierarchicalModel.
        db.create_table('reading_alt_readingalternation', (
            ('left_visit', models.IntegerField(primary_key=True)),
            ('right_visit', models.IntegerField(db_index=True)),
            ('value', models.CharField(
                max_length=settings.MAX_READING_LENGTH * \
                settings.UTF8_BYTES_PER_CHAR)),
            ('code', models.CharField(max_length=1, choices=ALTERNATION_TYPES)),
            ('probability', models.FloatField()),
        ))
        # Model 'KanjiReading'
        db.create_table('reading_alt_kanjireading', (
            ('id', models.AutoField(verbose_name='ID', primary_key=True, auto_created=True)),
            ('reading', models.CharField(
                max_length=settings.MAX_READING_LENGTH * \
                        settings.UTF8_BYTES_PER_CHAR,
                db_index=True,
                help_text='The reading of this kanji.',
            )),
            ('kanji', models.CharField( max_length=settings.UTF8_BYTES_PER_CHAR, help_text='The kanji from which the reading derived.', )),
            ('alternations', models.CharField(max_length=len(ALTERNATION_TYPES), blank=True, null=True, help_text='The alternation codes used to get this reading.')),
            ('probability', models.FloatField( help_text='The log-probability of this reading for this kanji.')),
            ('reading_alternation', models.ForeignKey(
                get_model('reading_alt', 'readingalternation'),
                blank=True, null=True,
                help_text='The final alternation step which' \
            ' provided this reading.')),
        ))
        
        db.send_create_signal('reading_alt', ['ReadingAlternation','KanjiReading'])
    
    def backwards(self):
        """Drop the created tables, the dependent (FK) table first."""
        db.delete_table('reading_alt_kanjireading')
        db.delete_table('reading_alt_readingalternation')
+        

File src/migrations/__init__.py

Empty file added.

File src/models.py

+# -*- coding: utf-8 -*-
+#
+#  models.py
+#  jp-reading-alt
+# 
+#  Created by Lars Yencken on 10-04-2009.
+#  Copyright 2009 Lars Yencken. All rights reserved.
+#
+
+"""
+Models for the reading_alt app.
+"""
+
+from django.db import models
+from django.conf import settings
+from hierarchy.models import HierarchicalModel
+
# (code, description) choices for the single-character alternation codes
# stored on ReadingAlternation.code and within KanjiReading.alternations.
ALTERNATION_TYPES = (
        ('/', 'root node'),
        ('k', 'kanji node'),
        ('b', 'base reading'),
        ('v', 'vowel length'),
        ('s', 'sequential voicing'),
        ('g', 'sound euphony'),      # Note: lumped together with 's'
        ('p', 'palatalization'),
        ('G', 'graphical similarity'),
        ('S', 'semantic similarity'),
        ('c', 'cooccurrence'),
    )
+
class ReadingAlternation(HierarchicalModel):
    "A single reading alternation step."

    # The surface reading form we index by.
    value = models.CharField(max_length=settings.MAX_READING_LENGTH * \
            settings.UTF8_BYTES_PER_CHAR)

    # The type of alternation which occurred.
    code = models.CharField(max_length=1, choices=ALTERNATION_TYPES)

    # The probability of this transition step.
    probability = models.FloatField()

    def __unicode__(self):
        # left_visit/right_visit are presumably nested-set columns
        # inherited from HierarchicalModel -- confirm in the hierarchy app.
        return '<ReadingAlternation /%s/, %s, %.03f (%d, %d)>' % (
                self.value, self.code, self.probability, self.left_visit,
                self.right_visit)

    def __repr__(self):
        return unicode(self).encode('utf8')

    @staticmethod
    def get_alternation_root(kanji):
        """Fetch the 'kanji node' alternation record for the given kanji."""
        return ReadingAlternation.objects.get(value=kanji, code='k')
+
class KanjiReading(models.Model):
    "A reading of a given kanji after alternations have been applied."

    reading = models.CharField(
            max_length=settings.MAX_READING_LENGTH * \
                    settings.UTF8_BYTES_PER_CHAR,
            db_index=True,
            help_text='The reading of this kanji.',
        )

    kanji = models.CharField(
            max_length=settings.UTF8_BYTES_PER_CHAR,
            help_text='The kanji from which the reading derived.',
        )

    alternations = models.CharField(max_length=len(ALTERNATION_TYPES),
            blank=True, null=True,
            help_text='The alternation codes used to get this reading.')

    probability = models.FloatField(
            help_text='The log-probability of this reading for this kanji.')

    # The final alternation step which provided this reading.
    reading_alternation = models.ForeignKey(ReadingAlternation, blank=True,
            null=True, help_text='The final alternation step which' \
            ' provided this reading.')

    def __unicode__(self):
        return u'%s /%s/ (%s)' % (self.kanji, self.reading, self.alternations)

    def get_alternation_path(self):
        """
        Determine the entire path of alternations taken to get this reading
        for this kanji. Returns a list of ReadingAlternation instances.
        """
        # Ancestors are fetched as the nodes whose (left_visit, right_visit)
        # interval strictly encloses ours, ordered root-first.
        whole_path = list(
                ReadingAlternation.objects.filter(
                        left_visit__lt=self.reading_alternation.left_visit,
                        right_visit__gt=self.reading_alternation.right_visit
                    ).order_by('left_visit')
            )
        
        # The artificial root node is dropped from the returned path.
        root_node = whole_path[0]
        whole_path = whole_path[1:]
        assert root_node.value == 'root'

        return whole_path
    
    def shares_alternation_path(self, rhs):
        """
        Returns True if the two readings share a common root.
        """
        # NOTE(review): this compares index 1 of each root-stripped path
        # (i.e. the node after the kanji node) -- confirm that index is
        # the intended "root" of comparison.
        my_path = self.get_alternation_path()
        rhs_path = rhs.get_alternation_path()
        return my_path[1] == rhs_path[1]
    
    def __cmp__(self, rhs):
        # Python 2 ordering/equality by primary key.
        return cmp(self.pk, rhs.pk)

# vim: ts=4 sw=4 sts=4 et tw=78:

File src/raw_reading_model.py

+# -*- coding: utf-8 -*-
+#
+#  raw_reading_model.py
+#  jp-reading-alt
+# 
+#  Created by Lars Yencken on 10-04-2009.
+#  Copyright 2009 Lars Yencken. All rights reserved.
+#
+
+"A raw reading model, without any normalization."
+
+from os.path import join
+
+from cjktools.common import sopen
+from cjktools.smart_cache import disk_proxy_direct
+from cjktools import scripts
+from django.conf import settings
+from simplestats import ConditionalFreqDist
+
+#----------------------------------------------------------------------------#
+
+_edict_aligned_file = join(settings.DATA_DIR, 'aligned',
+        'je_edict.aligned.gz')
+
+_seg_separator = '|'
+_entry_separator = ':'
+_gp_separator = ' '
+
+#----------------------------------------------------------------------------#
+
class RawReadingModel(ConditionalFreqDist):
    """
    A reading model based on exact segment counts, without normalization.
    """
    #------------------------------------------------------------------------#
    # PUBLIC METHODS
    #------------------------------------------------------------------------#

    def __init__(self):
        """
        Constructor. Loads all counts from the aligned Edict dictionary.
        """
        ConditionalFreqDist.__init__(self)

        kanji_script = scripts.Script.Kanji
        i_stream = sopen(_edict_aligned_file, 'r')
        for line in i_stream:
            # Expected line shape (from the separator constants above):
            # "original:graphemes phonemes", where graphemes and phonemes
            # are '|'-separated parallel segment lists.
            original, alignment = line.rstrip().split(_entry_separator)
            grapheme_segs, phoneme_segs = alignment.split(_gp_separator)
            grapheme_segs = grapheme_segs.split(_seg_separator)
            phoneme_segs = scripts.to_hiragana(phoneme_segs).split(
                    _seg_separator)
            
            # Only segments containing at least one kanji are counted.
            segments = [
                    Segment(g, p) \
                    for (g,p) in zip(grapheme_segs, phoneme_segs) \
                    if kanji_script in scripts.script_types(g) \
                ]
            for segment in segments:
                self.inc(segment.graphemes, segment.phonemes)

        i_stream.close()

        return

    #------------------------------------------------------------------------#

    @classmethod
    def get_cached(cls):
        """
        Return a memory or disk cached copy. If neither of these is
        available, generate a new copy.
        """
        if not hasattr(cls, '_cached'):
            # disk_proxy_direct rebuilds the cached copy whenever this
            # module or the aligned dictionary file changes.
            fetch_kanjidic = disk_proxy_direct(
                    RawReadingModel,
                    join(settings.CACHE_DIR, 'rawReadingModel.cache'),
                    dependencies=[__file__, _edict_aligned_file],
                )
            cls._cached = fetch_kanjidic()

        return cls._cached
+
+#----------------------------------------------------------------------------#
+
class Segment(object):
    """
    A basic data structure representing grapheme string aligned to a phoneme
    string.
    """
    # Fixed slots keep per-instance memory low; many segments are created
    # when loading the aligned dictionary.
    __slots__ = ('graphemes', 'phonemes')

    def __init__(self, g, p):
        self.graphemes = g
        self.phonemes = p

    def __unicode__(self):
        return u'%s:%s' % (self.graphemes, self.phonemes)

    def __repr__(self):
        return '%s(%r, %r)' % (type(self).__name__, self.graphemes,
                self.phonemes)

    def __hash__(self):
        return hash((self.graphemes, self.phonemes))

    def __eq__(self, rhs):
        # Fix: the class defined __hash__ with only the Python 2 __cmp__,
        # leaving equality inconsistent with hashing on Python 3; an
        # explicit __eq__ keeps dict/set behavior correct on both.
        return (self.graphemes, self.phonemes) == \
                (rhs.graphemes, rhs.phonemes)

    def __ne__(self, rhs):
        return not self.__eq__(rhs)

    def __cmp__(self, rhs):
        # Python 2 ordering support, preserved for the original runtime.
        return cmp(
                (self.graphemes, self.phonemes),
                (rhs.graphemes, rhs.phonemes)
            )
+
+#----------------------------------------------------------------------------#
+

File src/reading_database.py

+# -*- coding: utf-8 -*-
+#
+#  reading_database.py
+#  jp-reading-alt
+# 
+#  Created by Lars Yencken on 10-04-2009.
+#  Copyright 2009 Lars Yencken. All rights reserved.
+#
+
+"""
+Builds the tables of readings and reading alternations.
+"""
+
+import sys, optparse
+from django.db import connection
+
+from cjktools.sequences import groups_of_n
+from cjktools.resources import kanjidic
+import consoleLog
+from consoleLog.progressBar import withProgress
+from hierarchy.tree import TreeNode
+
+from alternation_model import VowelLengthModel
+from alternation_model import PalatalizationModel
+from reading_model import VoicingAndGeminationModel
+
+#----------------------------------------------------------------------------#
+
+dependencies = []
+
+_alternation_models = [
+    ('voicing and gemination', 's', VoicingAndGeminationModel),
+    ('vowel length', 'v', VowelLengthModel),
+    ('palatalization', 'p', PalatalizationModel),
+]
+
+log = consoleLog.default
+
+#----------------------------------------------------------------------------#
+
+class ReadingDatabase(object):
+    """
+    Builds the dynamic reading scoring part of the FOKS database.
+    """
+
+    #------------------------------------------------------------------------#
+    # PUBLIC METHODS
+    #------------------------------------------------------------------------#
+
    @classmethod
    def build(cls):
        """
        Build the tables needed to generate search results at runtime. These
        tables describe readings and reading alternations for each kanji which
        might be searched for.
        """
        log.start('Building reading tables')

        log.log('Detecting unique kanji ', newLine=False)
        # Flush so the progress text appears before the slow dictionary load.
        sys.stdout.flush()
        kanji_set = set(kanjidic.Kanjidic.get_cached().keys())

        # Build the in-memory tree of readings/alternations, then persist it.
        alt_tree = cls._build_alternation_tree(kanji_set)

        cls._store_alternation_tree(alt_tree)

        log.log('Storing readings per kanji')
        cls._store_kanji_readings(alt_tree)
        cls._prune_kanji_readings()
        log.finish()
+
+    #------------------------------------------------------------------------#
+    # PRIVATE METHODS
+    #------------------------------------------------------------------------#
+
+    @classmethod
+    def _build_alternation_tree(cls, kanji_set):
+        """
+        Builds the tree of all readings and alternations. Upon completion, any
+        possible reading (erroneous or not) for a given kanji should be a leaf
+        node in the subtree for that kanji. Each fixed depth in that subtree
+        corresponds to an alternation model of some sort.
+        """
+        log.start('Building alternation tree', nSteps=3)
+        log.log('Adding base kanji set')
+        root_node = AltTreeNode('root', '/')
+        for kanji in kanji_set:
+            root_node.add_child(AltTreeNode(kanji, 'k'))
+
+        log.log('Adding good readings')
+        kjdic = kanjidic.Kanjidic.get_cached()
+        for kanji_node in root_node.children.values():
+            kanji = kanji_node.label
+            if kanji in kjdic:
+                for reading in kjdic[kanji].all_readings:
+                    kanji_node.add_child(AltTreeNode(reading, 'b'))
+
+        log.start('Adding alternation models', nSteps=len(_alternation_models))
+        i = 0
+        max_len = max(len(n) for (n, c, cl) in _alternation_models)
+        pattern = '%%-%ds ' % max_len
+        for model_name, model_code, model_class in _alternation_models:
+            log.log(pattern % model_name, newLine=False)
+            sys.stdout.flush()
+            model_obj = model_class.get_cached()
+            cls._add_alternation_model(model_obj, model_code, root_node,
+                first=(i==0))
+            i += 1
+        log.finish()
+
+        log.finish()
+
+        return root_node
+
+    #------------------------------------------------------------------------#
+
+    @classmethod
+    def _store_alternation_tree(cls, alt_tree):
+        """
+        Stores the alternation tree to the database using the nested set
+        abstraction.
+        """
+        log.start('Storing alternation tree', nSteps=2)
+        # Walk the tree, numbering all nodes.
+        log.log('Numbering tree nodes')
+        cls._number_tree(alt_tree)
+
+        # Store the tree
+        log.log('Storing the tree to the database')
+        cls._store_tree(alt_tree)
+        log.finish()
+
+    #------------------------------------------------------------------------#
+
+    @classmethod
+    def _number_tree(cls, root_node, i=1):
+        """
+        Numbers the entire tree, as required for the nested set abstraction.
+
+        @return: The update counter
+        """
+        root_node.left_visit = i
+        i += 1
+
+        for child in root_node.children.itervalues():
+            i = cls._number_tree(child, i)
+
+        root_node.right_visit = i
+        i += 1
+
+        return i
+
+    #------------------------------------------------------------------------#
+
+    @staticmethod
+    def _store_tree(root_node):
+        """
+        Stores the reading alternation tree to the database.
+        """
+        # Build our list of results.
+        def iter_results(tree):
+            for node in tree.walk():
+                yield (node.label, node.code, node.probability,
+                        node.left_visit, node.right_visit)
+            return
+
+        # Insert them to the database.
+        cursor = connection.cursor()
+        cursor.execute('DELETE FROM reading_alt_kanjireading')
+        cursor.execute('DELETE FROM reading_alt_readingalternation')
+        max_per_run = 10000
+        all_results = iter_results(root_node)
+
+        for results in groups_of_n(max_per_run, all_results):
+            cursor.executemany(
+                    """
+                    INSERT INTO reading_alt_readingalternation 
+                    (value, code, probability, left_visit, right_visit)
+                    VALUES (%s, %s, %s, %s, %s)
+                    """,
+                    results,
+                )
+
+        cursor.close()
+        return
+
+    #------------------------------------------------------------------------#
+
+    @staticmethod
+    def _add_alternation_model(model_obj, code, root_node, first=False):
+        """
+        Adds this alternation model to our current alternation tree. This
+        involves walking to each leaf node, then getting all candidates of
+        the model, and appending them as new nodes.
+
+        @param model_obj: An alternation model.
+        @type model_obj: AlternationModelI
+        @param code: A character code for the given alternation.
+        @type code: char
+        @param root_node: The root node of the entire tree.
+        @type root_node: TreeNode
+        """
+        for kanji_node in withProgress(root_node.children.values()):
+            kanji = kanji_node.label
+            leaves = list(kanji_node.walk_leaves())
+            for reading_node in leaves:
+                reading = reading_node.label
+                candidates = model_obj.candidates(kanji, reading)
+                if not first and candidates == [(reading, 0.0)]:
+                    # No changes
+                    continue
+
+                for alt_reading, log_prob in candidates:
+                    # Only tag changes with their alternation code.
+                    if alt_reading == reading:
+                        node_code = ''
+                    else:
+                        node_code = code
+                    assert alt_reading not in reading_node.children
+                    reading_node.add_child(
+                            AltTreeNode(alt_reading, node_code, log_prob))
+
+        return
+
+    #------------------------------------------------------------------------#
+
+    @staticmethod
+    def _store_kanji_readings(alt_tree):
+        """
+        Stores a separate table of only leaf-node readings
+        """
+        def iter_results(tree):
+            for kanji_node in tree.children.itervalues():
+                kanji = kanji_node.label
+
+                for leaf_node in kanji_node.walk_leaves():
+                    # Calculate the probability for this path.
+                    reading = leaf_node.label
+                    leaf_path = leaf_node.ancestors[1:]
+                    probability = sum([n.probability for n in leaf_path])
+                    codes = ''.join([n.code for n in leaf_path])
+                    yield (kanji, reading, codes, probability,
+                        leaf_path[-1].left_visit)
+            return
+
+        max_per_insert = 10000
+        all_results = iter_results(alt_tree)
+        cursor = connection.cursor()
+        cursor.execute('DELETE FROM reading_alt_kanjireading')
+
+        for results in groups_of_n(max_per_insert, all_results):
+            cursor.executemany(
+                    """
+                    INSERT INTO reading_alt_kanjireading
+                    (kanji, reading, alternations, probability,
+                        reading_alternation_id)
+                    VALUES (%s, %s, %s, %s, %s)
+                    """,
+                    results
+                )
+
+        cursor.close()
+
+        return
+
+    #------------------------------------------------------------------------#
+
+    @staticmethod
+    def _prune_kanji_readings():
+        """Prune duplicates from the database."""
+        cursor = connection.cursor()
+        cursor.execute(
+                """
+                SELECT id FROM reading_alt_kanjireading AS A
+                WHERE A.probability != (
+                    SELECT MAX(B.probability)
+                    FROM reading_alt_kanjireading AS B
+                    WHERE A.kanji = B.kanji AND A.reading = B.reading
+                )
+                """
+            )
+        ids = cursor.fetchall()
+        cursor.executemany(
+                """DELETE FROM reading_alt_kanjireading WHERE id = %s""",
+                ids
+        )
+        return
+
+    #------------------------------------------------------------------------#
+
+#----------------------------------------------------------------------------#
+
class AltTreeNode(TreeNode):
    """
    A TreeNode specialised for the reading alternation tree. Each node
    stores its alternation 'code' and log 'probability' in the attrib
    dict, and gains 'left_visit'/'right_visit' once the tree is numbered
    for the nested-set encoding.
    """
    def __init__(self, name, code, probability=0.0):
        TreeNode.__init__(self, label=name,
                attrib={'code': code, 'probability': probability})
        return

    # Property factory: exposes one attrib-dict entry as a plain attribute.
    # Deliberately has no `self` parameter — it is called as an ordinary
    # function at class-creation time, below.
    def getProperty(name):
        def getter(self):
            return self.attrib[name]
        def setter(self, value):
            self.attrib[name] = value
            return
        return property(getter, setter)

    probability = getProperty('probability')
    left_visit = getProperty('left_visit')
    right_visit = getProperty('right_visit')
    code = getProperty('code')
+
+#----------------------------------------------------------------------------#
+
def build():
    "Module-level entry point: builds all the reading tables."
    ReadingDatabase().build()
+
+#----------------------------------------------------------------------------#
+# MODULE EPILOGUE
+#----------------------------------------------------------------------------#
+
+def _create_option_parser():
+    usage = \
+"""%prog [options]
+
+Builds the reading tables required for FOKS lookup."""
+
+    parser = optparse.OptionParser(usage)
+
+    return parser
+
+#----------------------------------------------------------------------------#
+
def main(argv):
    "Command-line entry point: parse arguments and build the tables."
    parser = _create_option_parser()
    _options, extra_args = parser.parse_args(argv)

    # This tool accepts no positional arguments.
    if extra_args:
        parser.print_help()
        sys.exit(1)

    return build()
+
+#----------------------------------------------------------------------------#
+
# Allow running this module directly as a script.
if __name__ == '__main__':
    main(sys.argv[1:])
+
+#----------------------------------------------------------------------------#
+  
+# vim: ts=4 sw=4 sts=4 et tw=78:

File src/reading_model.py

+# -*- coding: utf-8 -*-
+#
+#  reading_model.py
+#  jp-reading-alt
+# 
+#  Created by Lars Yencken on 10-04-2009.
+#  Copyright 2009 Lars Yencken. All rights reserved.
+#
+
+"A reading model for FOKS search."
+
+#----------------------------------------------------------------------------#
+
+from os.path import join
+import math
+
+from cjktools.common import sopen
+from cjktools import scripts, smart_cache
+from django.conf import settings
+from simplestats.freq import ConditionalFreqDist, UnknownSymbolError
+
+import raw_reading_model
+
+#----------------------------------------------------------------------------#
+
# Frequency counts of canonical kanji readings, derived from EDICT.
# NOTE(review): both paths live under settings.DATA_DIR/corpus — confirm the
# data files are installed there in deployment.
_reading_counts_file = join(settings.DATA_DIR, 'corpus',
        'kanji_readings__edict')
# Mapping from canonical readings to their observed surface alternations.
_reading_counts_map_file = join(settings.DATA_DIR, 'corpus',
        'kanji_readings__edict.map')
+
+#----------------------------------------------------------------------------#
+
class VoicingAndGeminationModel(object):
    """
    A reading model giving P(r|k) = aP_freq(r|k) + (1-a)P(r|r*)P(r*|k),
    where r is an observed surface reading of kanji k and r* is the
    canonical reading it alternates from.
    """
    #------------------------------------------------------------------------#
    # PUBLIC METHODS
    #------------------------------------------------------------------------#

    def __init__(self):
        # Loads up the frequency distribution for P(r*|k).
        self.normalized_freq_dist = ConditionalFreqDist.from_file(
                _reading_counts_file)

        # Load up the alternation probabilities P(r|r*).
        self.alternation_dist = self._load_alternation_dist(
                _reading_counts_map_file)

        # Build a mapping from (k, r) to r*.
        self.from_canonical_reading = self._build_alternation_map()

        # Build a frequency distribution for P(r|k).
        self.raw_freq_dist = raw_reading_model.RawReadingModel.get_cached()

        self.reverse_mapping = self.get_reverse_mapping()

        return

    #------------------------------------------------------------------------#

    def prob(self, grapheme, reading, alt_reading):
        """
        Returns the probability of P(r|k), using the formula:
        P(r|k) ~ (alpha)P_raw(r|k) + (1-alpha)P(r|r*)P(r*|k).

        @param grapheme: The kanji segment k.
        @param reading: The canonical reading r*.
        @param alt_reading: The observed surface reading r.
        """
        if scripts.to_hiragana(grapheme) == scripts.to_hiragana(alt_reading):
            # Special case: where the segment is phonetic.
            return 1.0

        # We only handle entire kanji segments.
        assert scripts.script_types(grapheme) == set([scripts.Script.Kanji])

        # The mixture weight between the raw and alternation estimates.
        alpha = settings.ALTERNATION_ALPHA
        assert 0 <= alpha <= 1
        try:
            rawProb = self.raw_freq_dist.prob(grapheme, alt_reading)
        except UnknownSymbolError:
            # Never seen in the raw corpus: rely on the alternation term.
            rawProb = 0.0

        normalizedProb = self.normalized_freq_dist.prob(grapheme, reading)
        alternationProb = self.alternation_dist.prob(reading, alt_reading)

        result = alpha*rawProb + (1-alpha)*normalizedProb*alternationProb

        return result

    #------------------------------------------------------------------------#

    def log_prob(self, grapheme, reading, alt_reading):
        "Returns the natural log of prob() for the same arguments."
        return math.log(self.prob(grapheme, reading, alt_reading))

    #------------------------------------------------------------------------#

    def candidates(self, grapheme, reading):
        """
        Returns a list of (alt_reading, log_prob) pairs for the given
        grapheme and canonical reading. Unknown pairs fall back to the
        identity candidate [(reading, 0.0)].
        """
        results = []

        key = grapheme, reading

        if key not in self.from_canonical_reading:
            return [(reading, 0.0)]

        for alt_reading in self.from_canonical_reading[key]:
            results.append(
                    (alt_reading, self.log_prob(grapheme, reading, alt_reading))
                )

        return results

    #------------------------------------------------------------------------#

    def __repr__(self):
        return '<VoicingAndGeminationModel: %d entries>' % \
                len(self.normalized_freq_dist)

    #------------------------------------------------------------------------#

    @classmethod
    def get_cached(cls):
        """
        Fetches a memory-cached copy of the class.
        """
        if not hasattr(cls, '_cached'):
            # Rebuild the disk cache when this module or its data change.
            dependencies = [__file__, _reading_counts_file,
                    _reading_counts_map_file]
            cacheFile = join(settings.CACHE_DIR, 'kanjiReadingModel.cache')
            fetchKanjiReadingModel = smart_cache.disk_proxy_direct(cls,
                    cacheFile, dependencies)
            cls._cached = fetchKanjiReadingModel()

        return cls._cached

    #------------------------------------------------------------------------#

    def get_reverse_mapping(self):
        """
        Generates and returns a map from a reading to the set of possible
        grapheme candidates for that reading.
        """
        reverse_mapping = {}

        # Get the canonical reading pairs.
        for grapheme, reading, count in self.normalized_freq_dist.itercounts():
            reverse_mapping.setdefault(reading, set()).add(grapheme)

        # Get the alternation reading pairs.
        for (grapheme, reading), alt_readings in \
                self.from_canonical_reading.iteritems():
            for alt_reading in alt_readings:
                reverse_mapping.setdefault(alt_reading, set()).add(grapheme)

        return reverse_mapping

    #------------------------------------------------------------------------#

    def get_valid_reading_set(self):
        """
        Returns a set of readings which are valid for a segment.
        """
        valid_readings = set()
        for alt_readings in self.from_canonical_reading.itervalues():
            valid_readings.update(alt_readings)

        return valid_readings

    #------------------------------------------------------------------------#
    # PRIVATE METHODS
    #------------------------------------------------------------------------#

    def _load_alternation_dist(self, filename):
        """
        Loads an alternation distribution from the given file and returns
        it. This distribution gives P(r|r*).

        Each line is 'kanji reading:count ...' or
        'kanji reading:alt_reading:count ...'.
        """
        alternation_dist = ConditionalFreqDist()
        # Fixed: previously ignored the filename argument and always read
        # the module-level map file.
        i_stream = sopen(filename, 'r')
        for line in i_stream:
            fields = line.rstrip().split()
            fields.pop(0)  # The leading kanji is not needed for P(r|r*).
            for data in fields:
                data = data.split(":")
                if len(data) == 2:
                    reading, count = data
                    count = int(count)
                    alt_reading = reading
                else:
                    reading, alt_reading, count = data
                    count = int(count)

                # NOTE(review): count is parsed and validated but not used;
                # each (reading, alt_reading) pair adds 1 to the
                # distribution. Confirm whether inc() should be weighted.
                alternation_dist.inc(reading, alt_reading)

        i_stream.close()
        return alternation_dist

    #------------------------------------------------------------------------#

    def _build_alternation_map(self):
        """
        Calculates and returns an alternation map, from alternation to
        canonical reading. In other words, it maps (k, r) to r*.
        """
        # Generate an alternation distribution.
        from_canonical_reading = {}
        i_stream = sopen(_reading_counts_map_file, 'r')
        for line in i_stream:
            line = line.rstrip().split()
            kanji = line.pop(0)
            assert line

            for lineSeg in line:
                lineSeg = lineSeg.split(':')
                if len(lineSeg) == 2:
                    reading, count = lineSeg
                    alt_reading = reading
                elif len(lineSeg) == 3:
                    reading, alt_reading, count = lineSeg
                else:
                    # Fixed: Python 2-only raise-statement syntax replaced
                    # with the portable call form.
                    raise Exception("File %s is badly formatted" %
                            _reading_counts_map_file)

                key = (kanji, reading)
                from_canonical_reading.setdefault(key, set()).add(alt_reading)

        i_stream.close()

        return from_canonical_reading

    #------------------------------------------------------------------------#
+
+#----------------------------------------------------------------------------#

File src/test_alternation_model.py

+# -*- coding: utf-8 -*-
+#
+#  test_alternation_model.py
+#  jp-reading-alt
+# 
+#  Created by Lars Yencken on 10-04-2009.
+#  Copyright 2009 Lars Yencken. All rights reserved.
+#
+
+import unittest
+import doctest
+import alternation_model
+
+#----------------------------------------------------------------------------#
+
def suite():
    "Assembles the test suite for this module."
    return unittest.TestSuite((
            unittest.makeSuite(VowelLengthTestCase),
            doctest.DocTestSuite(alternation_model),
        ))
+
+#----------------------------------------------------------------------------#
+
class VowelLengthTestCase(unittest.TestCase):
    """
    Tests for the VowelLengthModel alternation model.
    """
    def setUp(self):
        self.model = alternation_model.VowelLengthModel.get_cached()

    def test_readings(self):
        "Short and long vowel variants should map to one another."
        for short_form, long_form in [(u'と', u'とう'), (u'きょ', u'きょう')]:
            self.assertEqual(self.model.map[short_form], [long_form])
            self.assertEqual(self.model.map[long_form], [short_form])
            self.assertTrue(self.model.log_prob(short_form, long_form) < 0.0)
            self.assertTrue(self.model.log_prob(long_form, short_form) < 0.0)

    def test_y_sounds(self):
        "Palatalised sounds must survive vowel-length alternation intact."
        candidates = [c[0] for c in self.model.candidates(None, u'きょう')]
        self.assertFalse(u'ょ' in candidates)
        self.assertFalse(u'き' in candidates)
        self.assertEqual(set(candidates), set([u'きょう', u'きょ']))

    def tearDown(self):
        pass
+
+#----------------------------------------------------------------------------#
+
+if __name__ == "__main__":
+    unittest.TextTestRunner(verbosity=1).run(suite())
+
+#----------------------------------------------------------------------------#
+
+# vim: ts=4 sw=4 sts=4 et tw=78:
+

File src/test_explain_query.py

+# -*- coding: utf-8 -*-
+#
+#  test_explain_query.py
+#  jp-reading-alt
+# 
+#  Created by Lars Yencken on 10-04-2009.
+#  Copyright 2009 Lars Yencken. All rights reserved.
+#
+
+import unittest
+import doctest
+
+import explain_query
+
+#----------------------------------------------------------------------------#
+
def suite():
    "Assembles the test suite for this module."
    return unittest.TestSuite((
            unittest.makeSuite(ExplainQueryTestCase),
            doctest.DocTestSuite(explain_query),
        ))
+
+#----------------------------------------------------------------------------#
+
class ExplainQueryTestCase(unittest.TestCase):
    """
    Tests for explain_query.error_types().
    """
    def setUp(self):
        pass

    def _check_errors(self, target, query, real_readings, expected):
        # Shared assertion: classify the query and compare to the
        # expected set of error types.
        self.assertEqual(
                explain_query.error_types(target, query, real_readings),
                set(expected),
            )

    def testNonCompositional(self):
        "Tests bug #10: incorrect error categorisation."
        self._check_errors(u'海豚', u'うみぶた', [u'いるか'],
                [explain_query.QueryError.NonCompositionalReading])

    def testWithSpaces(self):
        "A space inside the query should not change the classification."
        self._check_errors(u'海豚', u'うみ ぶた', [u'いるか'],
                [explain_query.QueryError.NonCompositionalReading])

    def testIncorrectReading(self):
        "A wrong choice among valid readings."
        self._check_errors(u'今日', u'いまにち', [u'きょう', u'こんにち'],
                [explain_query.QueryError.ChoiceOfReading])

    def testVowelLength(self):
        "A vowel-length error."
        self._check_errors(u'東京', u'ときょ', [u'とうきょう'],
                [explain_query.QueryError.VowelLength])

    def testSequentialVoicing(self):
        "A sequential voicing (rendaku) error."
        self._check_errors(u'辞書', u'じじょ', [u'じしょ'],
                [explain_query.QueryError.SequentialVoicing])

    def testSoundEuphony(self):
        "A sound euphony (onbin) error."
        self._check_errors(u'学期', u'がくき', [u'がっき'],
                [explain_query.QueryError.SoundEuphony])

    def testPalatalization(self):
        "A palatalization error."
        self._check_errors(u'選挙', u'せんこ', [u'せんきょ'],
                [explain_query.QueryError.Palatalisation])

    def tearDown(self):
        pass
+
+#----------------------------------------------------------------------------#
+
+if __name__ == "__main__":
+    unittest.TextTestRunner(verbosity=1).run(suite())
+
+#----------------------------------------------------------------------------#
+
+# vim: ts=4 sw=4 sts=4 et tw=78:
+

File src/test_reading_model.py

+# -*- coding: utf-8 -*-
+#
+#  test_reading_model.py
+#  jp-reading-alt
+# 
+#  Created by Lars Yencken on 10-04-2009.
+#  Copyright 2009 Lars Yencken. All rights reserved.
+#
+
+import unittest
+
+from reading_model import VoicingAndGeminationModel
+
+#----------------------------------------------------------------------------#
+
def suite():
    "Assembles the test suite for this module."
    # Fixed: the original wrote TestSuite((makeSuite(...))) without a
    # trailing comma, so the argument was the inner suite itself rather
    # than a 1-tuple; it only worked because a TestSuite is iterable.
    # The trailing comma makes it a real tuple, consistent with the
    # suite() functions in the sibling test modules.
    testSuite = unittest.TestSuite((
            unittest.makeSuite(KanjiReadingModelTestCase),
        ))
    return testSuite
+
+#----------------------------------------------------------------------------#
+
class KanjiReadingModelTestCase(unittest.TestCase):
    """
    Tests for the VoicingAndGeminationModel reading model.
    """
    def setUp(self):
        self.model = VoicingAndGeminationModel.get_cached()

    def testBasicProbability(self):
        "A known alternation scores lower than the base reading."
        grapheme = u'国'
        base_reading = u'こく'
        voiced = u'ごく'

        base_score = self.model.log_prob(grapheme, base_reading, base_reading)
        alt_score = self.model.log_prob(grapheme, base_reading, voiced)
        self.assertTrue(base_score < 0.0)
        self.assertTrue(alt_score < 0.0)
        self.assertTrue(base_score > alt_score)

    def testReverseMap(self):
        "The reverse map takes a reading back to its candidate kanji."
        mapping = self.model.get_reverse_mapping()

        self.assertTrue(u'校' in mapping[u'こう'])
        self.assertTrue(u'高' in mapping[u'こう'])

    def testCaching(self):
        "get_cached() must always hand back the same shared instance."
        self.assertTrue(self.model is VoicingAndGeminationModel.get_cached())

    def testBug159(self):
        """Tests for bug [159]: hatsu"""
        self.assertTrue(self.model.prob(u'発', u'はつ', u'はっ') > 0.0)
        candidates = [c[0] for c in self.model.candidates(u'発', u'はつ')]
        self.assertTrue(u'はっ' in candidates)

    def tearDown(self):
        pass
+
+#----------------------------------------------------------------------------#
+
+if __name__ == "__main__":
+    unittest.TextTestRunner(verbosity=1).run(suite())
+
+#----------------------------------------------------------------------------#
+
+# vim: ts=4 sw=4 sts=4 et tw=78:
+

File src/views.py

+# Create your views here.