Commits

Lars Yencken committed 00ef3ca

Update and centralize search and translation.

Comments (0)

Files changed (29)

 *.orig
 *-env
 local_settings.py
-simsearch/search/stroke.{so,os,c}
+simsearch/stroke.{so,os,c}
 *.log
 distribute*.tar.gz
 .scon*
 
 #----------------------------------------------------------------------------#
 
-SConscript('simsearch/search/SConscript', exports='env')
+SConscript('simsearch/SConscript', exports='env')
 
 #----------------------------------------------------------------------------#
 nltk
 mercurial
 flask
+simplejson

simsearch/SConscript

+# -*- coding: utf-8 -*-
+#
+#  SConscript
+#  simsearch
+# 
+#  Created by Lars Yencken on 27-08-2010.
+#  Copyright 2010 Lars Yencken. All rights reserved.
+#
+
+"""
+Scons build file for structure extensions.
+"""
+
+#----------------------------------------------------------------------------#
+
+Import('env')
+
+#----------------------------------------------------------------------------#
+
+# Run the Cython builder with stroke.c as target and stroke.pyx as source,
+# then build the result into the 'stroke' shared library.
+stroke = env.Cython('stroke.c', 'stroke.pyx')
+env.SharedLibrary('stroke', stroke)
+
+#----------------------------------------------------------------------------#

simsearch/__init__.py

 import os
 
 import flask
+from cjktools import scripts
+import mercurial.hg
+import mercurial.ui
+import mercurial.node
+import simplejson
 
-from mercurial import ui, hg, node
+import models
 
 app = flask.Flask(__name__)
 app.config.from_object('simsearch.settings')
     c = base_context()
     return flask.render_template("static/about.html", **c)
 
+@app.route('/')
+def index():
+    """
+    Renders the search display.
+
+    Reads the ``kanji`` and ``path`` query arguments. When ``kanji`` is
+    missing or is not a single kanji, the search dialog is rendered (with an
+    error message if some input was given). Otherwise the search plane for
+    that kanji is rendered, with its neighbours split into three tiers.
+    """
+    kanji = flask.request.args.get('kanji', '')
+    kanji_ok = _is_kanji(kanji)
+    context = base_context()
+
+    if not kanji or not kanji_ok:
+        # show the search dialog
+        context.update({
+                'kanji': kanji,
+                'kanji_ok': kanji_ok,
+            })
+        if kanji:
+            context['error'] = 'Please enter a single kanji only as input.'
+        return flask.render_template('search/index.html', **context)
+
+    # show the search plane instead
+
+    # make sure the path is ok; an invalid path is discarded entirely
+    path = flask.request.args.get('path', '')
+    if not all(map(_is_kanji, path)):
+        path = []
+
+    path = list(path) + [kanji]
+    node = models.Node.objects.get(pivot=kanji)
+    neighbours = [n.kanji for n in sorted(node.neighbours, reverse=True)]
+    # settings are loaded into app.config (see app.config.from_object above);
+    # a Flask application has no 'conf' attribute
+    neighbours = neighbours[:app.config['N_NEIGHBOURS_RECALLED']]
+
+    context.update({'data': simplejson.dumps({
+                    'kanji': kanji,
+                    'tier1': neighbours[:4],
+                    'tier2': neighbours[4:9],
+                    'tier3': neighbours[9:],
+                    'path': ''.join(path),
+                })})
+    return flask.render_template('search/display.html', **context)
+
+@app.route('/translate/<kanji>/')
+def translate(kanji):
+    """
+    Updates the query model before redirecting to the real translation.
+
+    Aborts with a 404 unless ``kanji`` is a single kanji. If the ``path``
+    query argument holds more than one character, consists only of kanji and
+    ends with ``kanji``, the path is passed to ``models.Node.update`` and
+    ``models.Trace.log`` before redirecting.
+    """
+    # fall back to the query string if the URL did not supply a kanji
+    kanji = kanji or flask.request.args.get('kanji')
+    if not _is_kanji(kanji):
+        flask.abort(404)
+
+    # only record paths that are well formed and finish at this kanji
+    path = flask.request.args.get('path')
+    if path and len(path) > 1 and all(map(_is_kanji, path)) \
+            and path.endswith(kanji):
+        models.Node.update(path)
+        models.Trace.log(flask.request, path)
+
+    # NOTE(review): this call looks wrong -- 'args' is passed to
+    # flask.redirect() rather than to url_for(), and url_for('translate') is
+    # built without the 'kanji' value this route requires. 'translate' is
+    # also this view's own endpoint; confirm the intended target endpoint.
+    return flask.redirect(flask.url_for('translate'), args=[kanji])
+
+@app.route('/search/json/<pivot>/')
+def search_json(pivot):
+    """
+    Returns the search display data as JSON.
+
+    Looks up the node for ``pivot`` and returns the pivot together with its
+    neighbours, sorted in descending order and split into three tiers.
+    """
+    pivot = pivot or flask.request.args.get('pivot')
+    node = models.Node.objects.get(pivot=pivot)
+    neighbours = [n.kanji for n in sorted(node.neighbours, reverse=True)]
+    # settings are loaded into app.config (see app.config.from_object above);
+    # a Flask application has no 'conf' attribute
+    neighbours = neighbours[:app.config['N_NEIGHBOURS_RECALLED']]
+
+    return flask.jsonify(
+            pivot_kanji=pivot,
+            tier1=neighbours[:4],
+            tier2=neighbours[4:9],
+            tier3=neighbours[9:],
+        )
+
+def _is_kanji(kanji):
+    "Returns True only for a one-character unicode string in the kanji script."
+    if not isinstance(kanji, unicode):
+        return False
+    if len(kanji) != 1:
+        return False
+    return scripts.script_type(kanji) == scripts.Script.Kanji
+
 def base_context():
     c = {}
     c.update(mercurial_revision())
 
 def mercurial_revision():
     project_base = os.path.join(app.config['PROJECT_ROOT'], '..')
-    repo = hg.repository(ui.ui(), project_base)
+    repo = mercurial.hg.repository(mercurial.ui.ui(), project_base)
     fctx = repo.filectx(project_base, 'tip')
 
     return {'revision': {
-                'short': node.short(fctx.node()),
+                'short': mercurial.node.short(fctx.node()),
                 'number': fctx.rev(),
             }}
 

simsearch/experiments/check_connectivity.py

 occur within the top-k neighbour list of at least one other kanji.
 """
 
-import os, sys, optparse
+import sys
+import optparse
 
-from django.conf import settings
-
+from simsearch import settings
 from simsearch.search import models
 
 def check_connectivity(k=settings.N_NEIGHBOURS_RECALLED):

simsearch/experiments/simulate_accessibility.py

 import codecs
 import random
 
-from django.conf import settings
 from simplestats import FreqDist, basic_stats
 
+from simsearch import settings
 from simsearch.search import models
 
 DEFAULT_THRESHOLD = 0.95

simsearch/experiments/simulate_search.py

 import codecs
 import random
 
-from django.conf import settings
 from consoleLog import withProgress
 
+from simsearch import settings
 from simsearch.search import stroke, models
 
 def simulate_search(output_file, strategy='greedy',

simsearch/heap_cache.py

+# -*- coding: utf-8 -*-
+#
+#  heap_cache.py
+#  simsearch
+# 
+#  Created by Lars Yencken on 30-08-2010.
+#  Copyright 2010 Lars Yencken. All rights reserved.
+#
+
+"""
+Caches to aid similarity calculation, to efficiently maintain only the highest
+similarity neighbours.
+"""
+
+import heapq
+
+class TopNHeap(object):
+    """
+    A heap which only keeps the top-n items and their weights.
+
+    Entries are stored as (weight, item) pairs; whenever more than n pairs
+    are held, the smallest pair is popped.
+    """
+    __slots__ = '_n', '_backing_list'
+
+    def __init__(self, n):
+        self._backing_list = []
+        self._n = n
+
+    def add(self, item, weight):
+        "Pushes the (weight, item) pair, then evicts the smallest if over n."
+        pairs = self._backing_list
+        heapq.heappush(pairs, (weight, item))
+        if self._n < len(pairs):
+            heapq.heappop(pairs)
+
+    def get_contents(self):
+        "Returns the backing list of (weight, item) pairs, in heap order."
+        return self._backing_list
+
+class FixedSimilarityCache(object):
+    """
+    A kanji similarity cache which only keeps the top-n most similar
+    neighbours of each kanji, plus running totals over every score added.
+    """
+    def __init__(self, n):
+        self._n = n
+        self._heaps = {}
+        self._n_seen = 0.0
+        self._sum = 0.0
+        self._sum_squared = 0.0
+
+    def add(self, kanji_a, kanji_b, similarity):
+        """
+        Offer this similarity score to the heaps of both kanji, and fold it
+        into the running totals. A kanji which already has n closer
+        neighbours will not retain it.
+        """
+        for source, target in ((kanji_a, kanji_b), (kanji_b, kanji_a)):
+            self.get_heap(source).add(target, similarity)
+        self._n_seen += 1
+        self._sum += similarity
+        self._sum_squared += similarity * similarity
+
+    def __getitem__(self, kanji):
+        return self.get_heap(kanji)
+
+    def get_heap(self, kanji):
+        "Returns the heap for this kanji, creating an empty one on first use."
+        try:
+            return self._heaps[kanji]
+        except KeyError:
+            heap = self._heaps[kanji] = TopNHeap(self._n)
+            return heap
+
+    def get_mean(self):
+        "Returns the mean of all similarity scores added so far."
+        return self._sum / self._n_seen
+
+# vim: ts=4 sw=4 sts=4 et tw=78:
+

simsearch/models.py

 import gzip
 import itertools
 
-from django.conf import settings
 import mongoengine
 from cjktools import scripts
+from cjktools.resources import kanjidic
 from nltk.probability import FreqDist, LaplaceProbDist
 
 import stroke
 import heap_cache
 
+from simsearch import settings
+
 class Similarity(mongoengine.Document):
     "Raw similarity scores for kanji pairs."
     kanji_pair = mongoengine.StringField(max_length=2, primary_key=True)
         ip = request.META['REMOTE_ADDR']
         cls(ip_address=ip, path=list(path)).save()
 
+class Translation(mongoengine.Document):
+    "A per-kanji dictionary entry of readings and translations."
+    kanji = mongoengine.StringField(max_length=1, primary_key=True)
+    on_readings = mongoengine.ListField(mongoengine.StringField())
+    kun_readings = mongoengine.ListField(mongoengine.StringField())
+    glosses = mongoengine.ListField(mongoengine.StringField())
+
+    @classmethod
+    def build(cls):
+        "Drops the collection, then saves one document per Kanjidic entry."
+        cls.drop_collection()
+        for entry in kanjidic.Kanjidic().itervalues():
+            cls(
+                    kanji=entry.kanji,
+                    on_readings=entry.on_readings,
+                    kun_readings=entry.kun_readings,
+                    glosses=entry.gloss,
+                ).save()
+
 def build():
     "Builds the database."
     cache = Similarity.build()
     Node.build(cache)
+    Translation.build()
 
 #----------------------------------------------------------------------------#
 

simsearch/search/SConscript

-# -*- coding: utf-8 -*-
-#
-#  SConscript
-#  simsearch
-# 
-#  Created by Lars Yencken on 27-08-2010.
-#  Copyright 2010 Lars Yencken. All rights reserved.
-#
-
-"""
-Scons build file for structure extensions.
-"""
-
-#----------------------------------------------------------------------------#
-
-Import('env')
-
-#----------------------------------------------------------------------------#
-
-stroke = env.Cython('stroke.c', 'stroke.pyx')
-env.SharedLibrary('stroke', stroke)
-
-#----------------------------------------------------------------------------#

simsearch/search/__init__.py

-# -*- coding: utf-8 -*-
-#
-#  __init__.py
-#  simsearch
-# 
-#  Created by Lars Yencken on 14-09-2010.
-#  Copyright 2010 Lars Yencken. All rights reserved.
-#
-
-def build():
-    import models
-    models.build()
-
-# vim: ts=4 sw=4 sts=4 et tw=78:

simsearch/search/heap_cache.py

-# -*- coding: utf-8 -*-
-#
-#  heap_cache.py
-#  simsearch
-# 
-#  Created by Lars Yencken on 30-08-2010.
-#  Copyright 2010 Lars Yencken. All rights reserved.
-#
-
-"""
-Caches to aid similarity caculation, to efficiently maintain only the highest
-similarity neighbours.
-"""
-
-import heapq
-
-class TopNHeap(object):
-    "A heap which only keeps the top-n items and their weights."
-    __slots__ = '_n', '_backing_list'
-    def __init__(self, n):
-        self._n = n
-        self._backing_list = []
-
-    def add(self, item, weight):
-        heapq.heappush(self._backing_list, (weight, item))
-        if len(self._backing_list) > self._n:
-            heapq.heappop(self._backing_list)
-
-    def get_contents(self):
-        return self._backing_list
-
-class FixedSimilarityCache(object):
-    """
-    A kanji similarity cache which only keeps the top-n most similar
-    neighbours.
-    """
-    def __init__(self, n):
-        self._n = n
-        self._heaps = {}
-        self._sum = 0.0
-        self._n_seen = 0.0
-        self._sum_squared = 0.0
-
-    def add(self, kanji_a, kanji_b, similarity):
-        """
-        Attempt to add this similarity score to the cache. If there are
-        already n closer neighbours for either kanji it will be discarded.  
-        """
-        self.get_heap(kanji_a).add(kanji_b, similarity)
-        self.get_heap(kanji_b).add(kanji_a, similarity)
-        self._n_seen += 1
-        self._sum += similarity
-        self._sum_squared += similarity * similarity
-
-
-    def __getitem__(self, kanji):
-        return self.get_heap(kanji)
-
-    def get_heap(self, kanji):
-        heap = self._heaps.get(kanji)
-        if heap is None:
-            heap = self._heaps.setdefault(kanji, TopNHeap(self._n))
-        return heap
-
-    def get_mean(self):
-        return self._sum / self._n_seen
-
-# vim: ts=4 sw=4 sts=4 et tw=78:
-

simsearch/search/management/__init__.py

Empty file removed.

simsearch/search/management/commands/__init__.py

Empty file removed.

simsearch/search/management/commands/build.py

-# -*- coding: utf-8 -*-
-#
-#  build.py
-#  simsearch
-# 
-#  Created by Lars Yencken on 27-08-2010.
-#  Copyright 2010 Lars Yencken. All rights reserved.
-#
-
-"""
-Adds a command to automatically run a build for each app used.
-"""
-
-from django.core.management.base import BaseCommand, CommandError
-from django.conf import settings
-
-class Command(BaseCommand):
-    help = 'Builds the initial similarity database.'
-
-    def handle(self, *args, **kwargs):
-        found_one = False
-        for app_name in settings.INSTALLED_APPS:
-            module = __import__(app_name)
-            for part in app_name.split('.')[1:]:
-                module = getattr(module, part)
-
-            if hasattr(module, 'build'):
-                module.build()
-                found_one = True
-
-        if not found_one:
-            raise Exception('no apps had build commands')
-
-# vim: ts=4 sw=4 sts=4 et tw=78:

simsearch/search/stroke.pyx

-# -*- coding: utf-8 -*-
-#
-#  stroke.pyx
-#  simsearch
-# 
-#  Created by Lars Yencken on 03-09-2010.
-#  Copyright 2010 Lars Yencken. All rights reserved.
-#
-
-"""
-Optimised Levenstein distance calculation between stroke signatures for two
-kanji.
-"""
-
-import os
-
-from cjktools.common import sopen
-from django.conf import settings
-
-cdef class StrokeEditDistance:
-    """The edit distance between stroke sequences for both kanji."""
-    cdef readonly signatures
-    cdef readonly object stroke_types
-    cdef readonly int n_stroke_types
-
-    def __init__(self, input_file=None):
-        self.stroke_types = {}
-        self.n_stroke_types = 0
-
-        input_file = input_file or settings.STROKE_SOURCE
-        self.signatures = {}
-        i_stream = sopen(input_file)
-        for i, line in enumerate(i_stream):
-            kanji, raw_strokes = line.rstrip().split()
-            raw_strokes = raw_strokes.split(',')
-            strokes = map(self.get_stroke_type, raw_strokes)
-            self.signatures[kanji] = strokes
-        i_stream.close()
-
-    def get_stroke_type(self, stroke):
-        try:
-            return self.stroke_types[stroke]
-        except KeyError:
-            pass
-
-        self.stroke_types[stroke] = self.n_stroke_types
-        self.n_stroke_types = self.n_stroke_types + 1
-
-        return self.n_stroke_types - 1
-    
-    def raw_distance(self, kanji_a, kanji_b):
-        s_py = self.signatures[kanji_a]
-        t_py = self.signatures[kanji_b]
-
-        return edit_distance(s_py, t_py)
-
-    def __call__(self, kanji_a, kanji_b):
-        s_py = self.signatures[kanji_a]
-        t_py = self.signatures[kanji_b]
-
-        result = edit_distance(s_py, t_py)
-        return float(result) / max(len(s_py), len(t_py))
-    
-    def __contains__(self, kanji):
-        return kanji in self.signatures
-
-#----------------------------------------------------------------------------#
-
-cdef edit_distance(s_py, t_py):
-    cdef int m, n, i, j
-    cdef int table[100][100]
-    cdef int s[100]
-    cdef int t[100]
-    cdef int up, left, diag, cost
-
-    s_len = len(s_py)
-    t_len = len(t_py)
-    if s_len > 99 or t_len > 99:
-        raise ValueError, "stroke sequences too long"
-
-    for 0 <= i < s_len:
-        table[i][0] = i
-        s[i] = s_py[i]
-    table[s_len][0] = s_len
-
-    for 0 <= j < t_len:
-        table[0][j] = j
-        t[j] = t_py[j]
-    table[0][t_len] = t_len
-
-    for 1 <= i <= s_len:
-        for 1 <= j <= t_len:
-            if s[i-1] == t[j-1]:
-                cost = 0
-            else:
-                cost = 1
-
-            up = table[i-1][j] + 1
-            left = table[i][j-1] + 1
-            diag = table[i-1][j-1] + cost
-            if up <= left:
-                if up <= diag:
-                    table[i][j] = up
-                else:
-                    table[i][j] = diag
-            else:
-                if left <= diag:
-                    table[i][j] = left
-                else:
-                    table[i][j] = diag
-
-    return table[s_len][t_len]
-
-# vim: ts=4 sw=4 sts=4 et tw=78:

simsearch/search/tests.py

-"""
-This file demonstrates two different styles of tests (one doctest and one
-unittest). These will both pass when you run "manage.py test".
-
-Replace these with more appropriate tests for your application.
-"""
-
-from django.test import TestCase
-
-class SimpleTest(TestCase):
-    def test_basic_addition(self):
-        """
-        Tests that 1 + 1 always equals 2.
-        """
-        self.failUnlessEqual(1 + 1, 2)
-
-__test__ = {"doctest": """
-Another way to test that 1 + 1 is equal to 2.
-
->>> 1 + 1 == 2
-True
-"""}
-

simsearch/search/urls.py

-# -*- coding: utf-8 -*-
-#
-#  urls.py
-#  simsearch
-# 
-#  Created by Lars Yencken on 31-08-2010.
-#  Copyright 2010 Lars Yencken. All rights reserved.
-#
-
-"""
-Urlconf for search app.
-"""
-
-from django.conf.urls.defaults import patterns, url
-
-urlpatterns = patterns('simsearch.search.views',
-    url(r'^$',                      'index',        name='search_index'),
-#    url(r'^xhr/$',                  'search_json',  name='search_json'),
-#    url(r'^xhr/(?P<pivot>.*)/$',    'search_json',  name='search_json_kanji'),
-    url(r'^target/$',               'translate',    name='search_target'),
-    url(r'^target/(?P<kanji>.*)/$', 'translate',    name='search_target_kanji'),
-)
-
-# vim: ts=4 sw=4 sts=4 et tw=78:
-

simsearch/search/views.py

-# -*- coding: utf-8 -*-
-#
-#  views.py
-#  simsearch
-# 
-#  Created by Lars Yencken on 30-08-2010.
-#  Copyright 2010 Lars Yencken. All rights reserved.
-#
-
-"""
-Views for the search app.
-"""
-
-from django.shortcuts import render_to_response
-from django.template import RequestContext
-from django.utils import simplejson
-from django.http import HttpResponse, Http404, HttpResponseRedirect
-from django.conf import settings
-from django.core.urlresolvers import reverse
-
-from cjktools import scripts
-
-import models
-
-def index(request):
-    "Renders the search display."
-    kanji = request.GET.get('kanji', '')
-    kanji_ok = _is_kanji(kanji)
-
-    if not kanji or not kanji_ok:
-        # show the search dialog
-        context = {
-                'kanji': kanji,
-                'kanji_ok': kanji_ok,
-            }
-        if kanji:
-            context['error'] = 'Please enter a single kanji only as input.'
-        return render_to_response('search/index.html', context,
-                context_instance=RequestContext(request))
-
-    # show the search plane instead
-
-    # make sure the path is ok
-    path = request.GET.get('path', '')
-    if not all(map(_is_kanji, path)):
-        path = []
-
-    path = list(path) + [kanji]
-    node = models.Node.objects.get(pivot=kanji)
-    neighbours = [n.kanji for n in sorted(node.neighbours, reverse=True)]
-    neighbours = neighbours[:settings.N_NEIGHBOURS_RECALLED]
-
-    context = {'data': simplejson.dumps({
-                    'kanji': kanji,
-                    'tier1': neighbours[:4],
-                    'tier2': neighbours[4:9],
-                    'tier3': neighbours[9:],
-                    'path': ''.join(path),
-                })}
-    return render_to_response('search/display.html', context,
-            context_instance=RequestContext(request))
-
-def translate(request, kanji=None):
-    "Updates the query model before redirecting to the real translation."
-    kanji = kanji or request.GET.get('kanji')
-    if not _is_kanji(kanji):
-        raise Http404
-
-    path = request.GET.get('path')
-    if path and len(path) > 1 and all(map(_is_kanji, path)) \
-            and path.endswith(kanji):
-        models.Node.update(path)
-        models.Trace.log(request, path)
-
-    return HttpResponseRedirect(reverse('translate_kanji', args=[kanji]))
-
-def search_json(request, pivot=None):
-    "Returns the search display data as JSON."
-    pivot = pivot or request.GET.get('pivot')
-    node = models.Node.objects.get(pivot=pivot)
-    neighbours = [n.kanji for n in sorted(node.neighbours, reverse=True)]
-    neighbours = neighbours[:settings.N_NEIGHBOURS_RECALLED]
-
-    response_dict = {
-                'pivot_kanji': pivot,
-                'tier1': neighbours[:4],
-                'tier2': neighbours[4:9],
-                'tier3': neighbours[9:],
-            }
-    return HttpResponse(
-            simplejson.dumps(response_dict),
-            mimetype='application/javascript',
-        )
-
-def _is_kanji(kanji):
-    return isinstance(kanji, unicode) and len(kanji) == 1 \
-            and scripts.script_type(kanji) == scripts.Script.Kanji
-
-# vim: ts=4 sw=4 sts=4 et tw=78:

simsearch/settings.py

 #
 
 """
-Django settings for the simsearch project.
+Settings for the simsearch project.
 """
 
 import os
 
 import mongoengine
 
-DEBUG = True
-TEMPLATE_DEBUG = DEBUG
-
-ADMINS = (
-    ('Lars Yencken', 'lars@yencken.org'),
-)
-
-MANAGERS = ADMINS
-
-# normal django database access ignored
-DATABASES = {
-#    'default': {
-#        'ENGINE': 'django_mongodb_engine.mongodb',
-#        'NAME': 'simsearch',
-#        'USER': '',
-#        'PASSWORD': '',
-#        'HOST': 'localhost',
-#        'PORT': 27017,
-#        'SUPPORTS_TRANSACTIONS': False,
-#    },
-}
-
 # custom MongoDB connection settings
 MONGODB_NAME = 'simsearch'
 MONGODB_USERNAME = None
 # Tradeoff in Pr(a|s) and likelihood of reaching a further target from s'
 UPDATE_GAMMA = 0.7
 
-# Local time zone for this installation. Choices can be found here:
-# http://en.wikipedia.org/wiki/List_of_tz_zones_by_name
-# although not all choices may be available on all operating systems.
-# On Unix systems, a value of None will cause Django to use the same
-# timezone as the operating system.
-# If running in a Windows environment this must be set to the same as your
-# system time zone.
-TIME_ZONE = 'Melbourne/Australia'
-
-# Language code for this installation. All choices can be found here:
-# http://www.i18nguy.com/unicode/language-identifiers.html
-LANGUAGE_CODE = 'en-us'
-
-SITE_ID = 1
-
-# If you set this to False, Django will make some optimizations so as not
-# to load the internationalization machinery.
-USE_I18N = True
-
-# If you set this to False, Django will not format dates, numbers and
-# calendars according to the current locale
-USE_L10N = True
-
 PROJECT_ROOT = os.path.dirname(__file__)
 
 # Absolute path to the directory that holds media.
 # Examples: "http://media.lawrence.com", "http://example.com/media/"
 MEDIA_URL = '/static/'
 
-# URL prefix for admin media -- CSS, JavaScript and images. Make sure to use a
-# trailing slash.
-# Examples: "http://foo.com/media/", "/media/".
-ADMIN_MEDIA_PREFIX = '/media/admin/'
-
-# Make this unique, and don't share it with anybody.
-SECRET_KEY = '+-_s&+=m0+bc*$40jf1s#9x(ar=vom5(lt9=&*ol$)co*u(38r'
-
-# List of callables that know how to import templates from various sources.
-TEMPLATE_LOADERS = (
-    'django.template.loaders.filesystem.Loader',
-    'django.template.loaders.app_directories.Loader',
-#     'django.template.loaders.eggs.Loader',
-)
-
-MIDDLEWARE_CLASSES = (
-    'django.middleware.common.CommonMiddleware',
-    'django.contrib.sessions.middleware.SessionMiddleware',
-    'django.middleware.csrf.CsrfViewMiddleware',
-    'django.contrib.auth.middleware.AuthenticationMiddleware',
-    'django.contrib.messages.middleware.MessageMiddleware',
-)
-
-TEMPLATE_CONTEXT_PROCESSORS = (
-    "django.contrib.auth.context_processors.auth",
-    "django.core.context_processors.debug",
-    "django.core.context_processors.i18n",
-    "django.core.context_processors.media",
-    "django.contrib.messages.context_processors.messages",
-    'simsearch.context.mercurial_revision',
-    'simsearch.context.site_settings',
-)
-
-MESSAGE_STORAGE = 'django.contrib.messages.storage.session.SessionStorage'
-
-SESSION_ENGINE = 'mongoengine.django.sessions'
-
-ROOT_URLCONF = 'simsearch.urls'
-
-TEMPLATE_DIRS = (
-    os.path.join(PROJECT_ROOT, 'templates'),
-)
-
-INSTALLED_APPS = (
-#    'django.contrib.messages',
-    'django.contrib.sessions',
-    'djangotoolbox',
-    'simsearch.search',
-    'simsearch.translate',
-    'simsearch.static',
-)
-
 # The source of stroke data for each character
 STROKE_SOURCE = None
 

simsearch/stroke.pyx

+# -*- coding: utf-8 -*-
+#
+#  stroke.pyx
+#  simsearch
+# 
+#  Created by Lars Yencken on 03-09-2010.
+#  Copyright 2010 Lars Yencken. All rights reserved.
+#
+
+"""
+Optimised Levenshtein distance calculation between stroke signatures for two
+kanji.
+"""
+
+import os
+
+from cjktools.common import sopen
+
+from simsearch import settings
+
+cdef class StrokeEditDistance:
+    """
+    The edit distance between stroke sequences for both kanji.
+
+    Stroke signatures are loaded from a file with one kanji per line,
+    followed by whitespace and a comma-separated list of stroke names. Each
+    distinct stroke name is mapped to an integer id in order of first use.
+    """
+    cdef readonly signatures
+    cdef readonly object stroke_types
+    cdef readonly int n_stroke_types
+
+    def __init__(self, input_file=None):
+        """
+        Loads the signatures from input_file, falling back to
+        settings.STROKE_SOURCE when no file is given.
+        """
+        self.stroke_types = {}
+        self.n_stroke_types = 0
+
+        input_file = input_file or settings.STROKE_SOURCE
+        self.signatures = {}
+        i_stream = sopen(input_file)
+        try:
+            for line in i_stream:
+                kanji, raw_strokes = line.rstrip().split()
+                raw_strokes = raw_strokes.split(',')
+                strokes = map(self.get_stroke_type, raw_strokes)
+                self.signatures[kanji] = strokes
+        finally:
+            # close the stream even if a malformed line raises
+            i_stream.close()
+
+    def get_stroke_type(self, stroke):
+        "Returns the integer id for this stroke name, allocating one if new."
+        try:
+            return self.stroke_types[stroke]
+        except KeyError:
+            pass
+
+        # first time this stroke name is seen: give it the next free id
+        self.stroke_types[stroke] = self.n_stroke_types
+        self.n_stroke_types = self.n_stroke_types + 1
+
+        return self.n_stroke_types - 1
+
+    def raw_distance(self, kanji_a, kanji_b):
+        "Returns the unnormalised edit distance between the two signatures."
+        s_py = self.signatures[kanji_a]
+        t_py = self.signatures[kanji_b]
+
+        return edit_distance(s_py, t_py)
+
+    def __call__(self, kanji_a, kanji_b):
+        "Returns the edit distance divided by the longer signature's length."
+        s_py = self.signatures[kanji_a]
+        t_py = self.signatures[kanji_b]
+
+        result = edit_distance(s_py, t_py)
+        return float(result) / max(len(s_py), len(t_py))
+
+    def __contains__(self, kanji):
+        return kanji in self.signatures
+
+#----------------------------------------------------------------------------#
+
+# Computes the edit distance between two sequences of integer stroke ids,
+# using unit costs for every step and a fixed-size 100x100 C table. Raises
+# ValueError if either sequence is longer than 99 elements.
+cdef edit_distance(s_py, t_py):
+    # NOTE(review): m and n are declared here but not used below
+    cdef int m, n, i, j
+    cdef int table[100][100]
+    cdef int s[100]
+    cdef int t[100]
+    cdef int up, left, diag, cost
+
+    s_len = len(s_py)
+    t_len = len(t_py)
+    # the table is indexed up to [s_len][t_len], so both must stay below 100
+    if s_len > 99 or t_len > 99:
+        raise ValueError, "stroke sequences too long"
+
+    # fill the first column and copy s_py into the C array
+    for 0 <= i < s_len:
+        table[i][0] = i
+        s[i] = s_py[i]
+    table[s_len][0] = s_len
+
+    # fill the first row and copy t_py into the C array
+    for 0 <= j < t_len:
+        table[0][j] = j
+        t[j] = t_py[j]
+    table[0][t_len] = t_len
+
+    # each remaining cell takes the cheapest of its three predecessors
+    for 1 <= i <= s_len:
+        for 1 <= j <= t_len:
+            # matching elements cost nothing on the diagonal step
+            if s[i-1] == t[j-1]:
+                cost = 0
+            else:
+                cost = 1
+
+            up = table[i-1][j] + 1
+            left = table[i][j-1] + 1
+            diag = table[i-1][j-1] + cost
+            # table[i][j] = min(up, left, diag), written out by hand
+            if up <= left:
+                if up <= diag:
+                    table[i][j] = up
+                else:
+                    table[i][j] = diag
+            else:
+                if left <= diag:
+                    table[i][j] = left
+                else:
+                    table[i][j] = diag
+
+    return table[s_len][t_len]
+
+# vim: ts=4 sw=4 sts=4 et tw=78:

simsearch/templates/search/display.html

     });
   </script>
 {% endblock %}
+

simsearch/templates/search/index.html

   <br/>
   <p align="center">
   {% if error %}
-    {{error}} (<a href="{% url help %}">help</a>)
+    {{error}} (<a href="/help/">help</a>)
   {% else %}
     Enter the kanji you want to find, or one that looks similar.
   {% endif %}

simsearch/translate/__init__.py

-# -*- coding: utf-8 -*-
-#
-#  __init__.py
-#  simsearch
-# 
-#  Created by Lars Yencken on 14-09-2010.
-#  Copyright 2010 Lars Yencken. All rights reserved.
-#
-
-def build():
-    import models
-    models.Translation.build()
-
-# vim: ts=4 sw=4 sts=4 et tw=78:

simsearch/translate/models.py

-# -*- coding: utf-8 -*-
-#
-#  models.py
-#  simsearch
-# 
-#  Created by Lars Yencken on 11-09-2010.
-#  Copyright 2010 Lars Yencken. All rights reserved.
-#
-
-"""
-A basic database model for a kanji-level dictionary.
-"""
-
-from simsearch import settings
-from cjktools.resources import kanjidic
-import mongoengine
-
-class Translation(mongoengine.Document):
-    "A per-kanji dictionary entry of readings and translations."
-    kanji = mongoengine.StringField(max_length=1, primary_key=True)
-    on_readings = mongoengine.ListField(mongoengine.StringField())
-    kun_readings = mongoengine.ListField(mongoengine.StringField())
-    glosses = mongoengine.ListField(mongoengine.StringField())
-
-    @classmethod
-    def build(cls):
-        cls.drop_collection()
-        kjd = kanjidic.Kanjidic()
-        for entry in kjd.itervalues():
-            translation = cls(
-                    kanji=entry.kanji,
-                    on_readings=entry.on_readings,
-                    kun_readings=entry.kun_readings,
-                    glosses = entry.gloss,
-                )
-            translation.save()
-
-if __name__ == '__main__':
-    Translation.build()
-
-# vim: ts=4 sw=4 sts=4 et tw=78:

simsearch/translate/tests.py

-"""
-This file demonstrates two different styles of tests (one doctest and one
-unittest). These will both pass when you run "manage.py test".
-
-Replace these with more appropriate tests for your application.
-"""
-
-from django.test import TestCase
-
-class SimpleTest(TestCase):
-    def test_basic_addition(self):
-        """
-        Tests that 1 + 1 always equals 2.
-        """
-        self.failUnlessEqual(1 + 1, 2)
-
-__test__ = {"doctest": """
-Another way to test that 1 + 1 is equal to 2.
-
->>> 1 + 1 == 2
-True
-"""}
-

simsearch/translate/urls.py

-# -*- coding: utf-8 -*-
-#
-#  urls.py
-#  simsearch
-# 
-#  Created by Lars Yencken on 11-09-2010.
-#  Copyright 2010 Lars Yencken. All rights reserved.
-#
-
-"""
-Urlconf for translate app.
-"""
-
-from django.conf.urls.defaults import patterns, url
-
-urlpatterns = patterns('simsearch.translate.views',
-    url(r'^(?P<kanji>.)/$', 'translate', name='translate_kanji'),
-)
-
-# vim: ts=4 sw=4 sts=4 et tw=78:
-

simsearch/translate/views.py

-# -*- coding: utf-8 -*-
-#
-#  views.py
-#  simsearch
-# 
-#  Created by Lars Yencken on 11-09-2010.
-#  Copyright 2010 Lars Yencken. All rights reserved.
-#
-
-from django.shortcuts import render_to_response
-from django.template import RequestContext
-from django.http import Http404
-
-import models
-
-def translate(request, kanji=None):
-    if kanji is None:
-        raise Http404
-
-    t = models.Translation.objects.get(kanji=kanji)
-    if t is None:
-        raise Http404
-
-    return render_to_response('translate/kanji.html', {'translation': t},
-            RequestContext(request))
-
-# vim: ts=4 sw=4 sts=4 et tw=78: