Commits

Lars Yencken committed 2954841

Adds initial snapshot from the FOKS dictionary.

Comments (0)

Files changed (11)

+syntax: glob
+*.orig
+*.rej
+*~
+*.o
+*.so
+*.os
+*.pyo
+*.pyc
+*.log
+build
+*.swp
+MANIFEST
+dist
+*.deb
+.DS_Store
+*.egg-info
+__version__.py
# -*- coding: utf-8 -*-
#
#  setup.py
#  jp-places
#
#  Created by Lars Yencken on 2009-04-09.
#  Copyright 2009 Lars Yencken. All rights reserved.
#

"Package setup script for the jp-places Django app."

from setuptools import setup

VERSION = '0.1.0'

# Record the package version where it can be imported at runtime; the
# context manager guarantees the handle is closed even if a write fails
# (the original open()/close() pair leaked the handle on error).
with open('src/__version__.py', 'w') as f:
    f.write('# Autogenerated by setup.py\n')
    f.write('version = "%s"\n' % VERSION)

setup(
        name='jp-places',
        description="A simple Django app providing database storage and access for Japanese place name gazetteers.",
        long_description = """
        Provides database models and access to place names in the Japan Post
        Gazetteer, as well as aliases for these places in enamdict. Uses 
        django-hierarchy to describe the hierarchical relationship between
        places.
        """,
        url="http://bitbucket.org/lars512/jp-places/",
        version=VERSION,
        author="Lars Yencken",
        author_email="lljy@csse.unimelb.edu.au",
        license="BSD",
        install_requires=[
                'django',
                'django-hierarchy',
                'cjktools', 
                'cjktools-data',
            ],

        package_dir={'jp_places': 'src'},
        packages=['jp_places'],
    )
+# -*- coding: utf-8 -*-
+# 
+#  __init__.py
+#  jp-places
+#  
+#  Created by Lars Yencken on 2009-04-09.
+#  Copyright 2009 Lars Yencken. All rights reserved.
+# 
+
+import pkg_resources
+from django.db.models import get_model
+
def build(filename=None):
    """
    Load the place-name gazetteer into the database.

    filename: optional path to a gazetteer file; when omitted, falls back
        to the gazetteer bundled with the cjktools-data package.
    """
    if filename is None:
        # The original called pkg_resources.pkg_filename(), which does not
        # exist (the API is resource_filename), and then discarded the
        # result, always passing the raw `filename` argument through.
        filename = pkg_resources.resource_filename('cjktools_data',
                'jp_post_gazetteer')
    # NOTE(review): PlaceName.from_gazetteer() appears to expect a parsed
    # tree root rather than a file path -- confirm against models.py.
    get_model('jp_places', 'placename').from_gazetteer(filename)
+# -*- coding: utf-8 -*-
+#----------------------------------------------------------------------------#
+# goitaikei.py
+# Lars Yencken <lars.yencken@gmail.com>
+# vim: ts=4 sw=4 sts=4 et tw=78:
+# Thu Jun  7 22:06:56 2007
+#
+#----------------------------------------------------------------------------#
+
+"""
+A structure model for the Goitaikei ontology. Provides a nice API for
+accessing the underlying tree structure.
+"""
+
+#----------------------------------------------------------------------------#
+
+import os
+
+from django.conf import settings
+from cjktools.common import sopen
+from cjktools.maps import invertMapping
+from cjktools.smartCache import diskProxyDirect
+from hierarchy.tree import TreeNode
+
+from foks.util.freq_dist import FreqDist
+
+#----------------------------------------------------------------------------#
+# GLOBAL
+#----------------------------------------------------------------------------#
+
+# Locations of the Goitaikei source data, resolved once at import time from
+# the project's DATA_DIR setting.
+_goitaikeiDir = os.path.join(settings.DATA_DIR, 'corpus', 'goitaikei')
+_index_file = os.path.join(_goitaikeiDir, 'NTT-CThesaurus-struct-tree.gz')
+_leaf_file = os.path.join(_goitaikeiDir, 'NTT-CThesaurus.gz')
+# Corpus word frequencies, used to attach probability estimates to nodes.
+_jpWordFreqFile = os.path.join(settings.DATA_DIR, 'corpus',
+        'jp_word_corpus_counts.gz')
+# Goitaikei's root category always carries id 1.
+_rootId = 1
+
+#----------------------------------------------------------------------------#
+# PUBLIC
+#----------------------------------------------------------------------------#
+
+class GoitaikeiHierarchy(object):
+    """
+    A dictionary style object which can be queried for the object
+    hierarchy.
+
+    The hierarchy is a tree of word-sense categories; after construction
+    each node carries a 'words' set, a 'chars' set, and probability
+    estimates ('prob', plus the cumulative subtree value 'cProb').
+    """
+    #------------------------------------------------------------------------#
+    # PUBLIC METHODS
+    #------------------------------------------------------------------------#
+
+    def __init__(self, leafFile=_leaf_file, indexFile=_index_file):
+        """
+        Constructor. Parses the Goitaikei ontology, determining the
+        GoitaikeiNode object for the root node.
+        """
+        self.root = self._build_tree(leafFile, indexFile)
+
+        # Adds per-node 'chars', 'prob' and 'cProb', and the
+        # wordsToClasses reverse index.
+        self._buildExtras()
+
+    #------------------------------------------------------------------------#
+
+    def toVector(self):
+        """Counts the number of words in each node, generating a vector."""
+        # NOTE(review): self.walk() is not defined on this class; this was
+        # probably meant to be self.root.walk() -- confirm before relying
+        # on this method.
+        empty = []
+        return [len(n.get('words', empty)) for n in self.walk()]
+
+    #------------------------------------------------------------------------#
+
+    @classmethod
+    def get_cached(cls):
+        """
+        Returns a cached instance, or builds a new one if no cached 
+        instance is available.
+        """
+        if not hasattr(cls, '_cached'):
+            # Disk-backed memoisation: the cache is invalidated whenever
+            # this module or either data file changes.
+            buildNewHierarchy = diskProxyDirect(
+                    GoitaikeiHierarchy,
+                    os.path.join(settings.CACHE_DIR, 'goitaikei.cache'),
+                    dependencies=[__file__, _index_file, _leaf_file],
+                )
+            cls._cached = buildNewHierarchy()
+
+        return cls._cached
+
+    #------------------------------------------------------------------------#
+    # PRIVATE METHODS
+    #------------------------------------------------------------------------#
+
+    def _build_tree(self, leafFile, indexFile):
+        """Build the main Goitaikei tree."""
+        # Parse a list of words for each category.
+        idToWords = self._parseWordCategories(leafFile)
+
+        # Get the tree structure of the information.
+        idToNode, idToWords = self._parseTreeStructure(indexFile, idToWords)
+
+        # Add words to the tree structure.
+        for id, words in idToWords.iteritems():
+            # Ignore silently categories missing from our id index.
+            if id in idToNode:
+                idToNode[id].attrib['words'].update(words)
+
+        return idToNode[_rootId]
+
+    #------------------------------------------------------------------------#
+
+    def _buildExtras(self):
+        """Build in secondary features for convenience."""
+        # Give each node a set of all characters used in that node, and
+        # generate a map from words to all nodes containing them.
+        wordsToClasses = {}
+        n_words = 0
+        for node in self.root.walk():
+            char_set = set()
+            for word in node['words']:
+                char_set.update(word)
+                if word not in wordsToClasses:
+                    wordsToClasses[word] = set([node])
+                else:
+                    wordsToClasses[word].add(node)
+
+                n_words += 1
+
+            node['chars'] = char_set
+
+        self.wordsToClasses = wordsToClasses
+
+        # Append node frequency.
+        # Every hierarchy word gets one extra count, so no word below has
+        # zero frequency (presumably add-one smoothing -- confirm
+        # FreqDist.inc() semantics).
+        word_dist = FreqDist.from_file(_jpWordFreqFile)
+        for word in wordsToClasses:
+            word_dist.inc(word)
+
+        # Postorder guarantees children are processed before their parent,
+        # so every child's 'cProb' is available when needed.
+        for node in self.root.walk_postorder():
+            assert 'prob' not in node
+            nodeFreq = 0.0
+            for word in node['words']:
+                # A word's frequency is shared evenly between all the
+                # classes that contain it.
+                wordFreq = word_dist.count(word)
+                nodeFreq += wordFreq / float(len(wordsToClasses[word]))
+            node['prob'] = nodeFreq / word_dist.total
+            # Cumulative probability of the subtree rooted at this node.
+            node['cProb'] = node['prob'] + \
+                    sum(n['cProb'] for n in node.children.itervalues())
+            assert 0 <= node['prob'] <= 1
+            assert 0 <= node['cProb'] <= 1
+
+        return
+
+    #------------------------------------------------------------------------#
+
+    def _loadUpperNodes(self, indexFile):
+        """
+        Loads the tree-structure of categories as a directed graph, from 
+        child to parent. Ignores category membership of these nodes.
+        """
+        # NOTE(review): apparently dead code -- never called from this
+        # class, and the self.wordToId / self.categoryParent /
+        # self.wordCategories attributes it fills are not initialised
+        # anywhere here. Confirm before removing.
+        i_stream = sopen(indexFile)
+        for line in i_stream:
+            id, word, parent, wordCategories = line.rstrip().split()
+            id = int(id)
+            assert word not in self.wordToId, "Doubling up on %s" % word
+            self.wordToId[word] = id
+            if parent != '*':
+                # A non-root node.
+                self.categoryParent[id] = int(parent)
+            else:
+                # The single root node.
+                self.categoryParent[id] = None
+
+            wordCategories = [w for w in wordCategories.split(':') if w != '*']
+            self.wordCategories[id] = set(map(int, wordCategories))
+
+        i_stream.close()
+
+        return
+
+    #------------------------------------------------------------------------#
+
+    def _loadLeafNodes(self, leafFile):
+        """
+        Loads all the words which aren't themselves categories.
+        """
+        # NOTE(review): also dead code; the lookup below contradicts the
+        # preceding assert (word was just asserted NOT to be in
+        # self.wordToId), so this would always raise KeyError if called.
+        i_stream = sopen(leafFile)
+        for line in i_stream:
+            line_objs = line.rstrip(':\n').split(':')
+            word = line_objs[0]
+            categories = map(int, line_objs[1:])
+            assert word not in self.wordToId
+            id = self.wordToId[word]
+            assert word not in self.wordCategories
+            self.wordCategories[word] = set(categories)
+        i_stream.close()
+        return
+
+    #------------------------------------------------------------------------#
+
+    def _parseWordCategories(self, _leaf_file):
+        """
+        Determine the categories of all leaf words.
+
+        Returns a map from category id to the list of member words.
+        """
+        wordToIds = {}
+        i_stream = sopen(_leaf_file)
+        for line in i_stream:
+            # Each line is word:catId1:catId2:... (trailing ':' stripped).
+            line_objs = line.rstrip(':\n').split(':')
+            word = line_objs[0]
+            category_ids = map(int, line_objs[1:])
+            wordToIds[word] = category_ids
+        i_stream.close()
+
+        idToWords = invertMapping(wordToIds)
+
+        return idToWords
+
+    #------------------------------------------------------------------------#
+
+    def _parseTreeStructure(self, indexFile, idToWords):
+        """
+        Determine the tree structure of the ontology.
+
+        Returns (idToNode, idToWords); idToWords is extended with category
+        labels that are themselves members of other categories.
+        """
+        # Parse the category structure itself.
+        idToNode = {}
+        i_stream = sopen(indexFile)
+        for line in i_stream:
+            id, word, parent, categories = line.rstrip().split()
+            id = int(id)
+
+            # Build the node itself.
+            node = TreeNode(word, attrib={'id': id, 'words': set()})
+            idToNode[id] = node
+
+            # Add link from its parent.
+            if parent != '*':
+                parent = int(parent)
+                # Parents always precede their children in the file.
+                assert parent < id
+                idToNode[parent].add_child(node)
+
+            # Store additional word categories.
+            categories = [int(w) for w in categories.split(':') if w != '*']
+            for category_id in categories:
+                if category_id in idToWords:
+                    idToWords[category_id].append(word)
+                else:
+                    idToWords[category_id] = [word]
+
+        i_stream.close()
+
+        return idToNode, idToWords
+
+    #------------------------------------------------------------------------#
+
+#----------------------------------------------------------------------------#
+# -*- coding: utf-8 -*-
+# 
+#  models.py
+#  jp-places
+#  
+#  Created by Lars Yencken on 2009-04-09.
+#  Copyright 2009 Lars Yencken. All rights reserved.
+# 
+
+from django.db import models, connection
+from hierarchy.models import HierarchicalModel
+from cjktools import scripts
+from cjktools.sequences import groupsOfNIter
+
class PlaceName(HierarchicalModel):
    "A place name with regional qualifiers, stored as a nested-set tree."
    name = models.CharField(max_length=50, db_index=True)
    reading = models.CharField(max_length=75)
    # Slash-separated left_visit ids of the node's ancestors; null for roots.
    path = models.CharField(max_length=200, null=True, blank=True)

    def get_path_as_ints(self):
        "Returns the ancestor path as a list of left_visit integers."
        return map(int, self.path.split('/'))

    def __hash__(self):
        # left_visit is unique across the nested-set tree, so it safely
        # identifies a node.
        return hash(self.left_visit)

    def __eq__(self, rhs):
        return self.left_visit == rhs.left_visit

    def copy(self):
        "Returns an unsaved copy sharing this node's tree coordinates."
        return PlaceName(name=self.name, reading=self.reading,
            left_visit=self.left_visit, right_visit=self.right_visit)

    def __unicode__(self):
        return u'%s' % (self.name)

    @staticmethod
    def from_gazetteer(root_node):
        """
        Rebuilds the whole place tree from a gazetteer.

        root_node: root of a parsed gazetteer tree; nodes must support
            dict-style attribute access, walk() (preorder) and
            walk_postorder().
        """
        # Wipe existing rows; aliases first, since they reference places.
        cursor = connection.cursor()
        cursor.execute('DELETE FROM jp_places_placenamealias')
        cursor.execute('DELETE FROM jp_places_placename')
        cursor.close()

        # Number nodes in preorder to get nested-set left_visit values.
        # (A stray iterator that consumed one walk() item to no effect has
        # been removed.)
        left_visit = 1
        for node in root_node.walk():
            node['left_visit'] = left_visit
            left_visit += 1

        # right_visit: one more than the largest value in the subtree.
        for node in root_node.walk_postorder():
            if node.children:
                node['right_visit'] = max(c['right_visit'] for c in
                    node.children.values()) + 1
            else:
                node['right_visit'] = node['left_visit'] + 1

        def iterrows(root_node):
            # One database row per node; also tags each node with the id it
            # is expected to receive, for refreshAliases() below.
            # NOTE(review): assumes the database assigns primary keys in
            # insertion order matching left_visit -- confirm.
            for node in root_node.walk():
                yield (
                        node.label,
                        scripts.toHiragana(node['reading']),
                        node['left_visit'],
                        node['right_visit'],
                        '/'.join(str(n['left_visit']) for n in
                                node.ancestors[:-1])
                    )
                node['id'] = node['left_visit']
            return

        cursor = connection.cursor()
        for rowGroup in groupsOfNIter(10000, iterrows(root_node)):
            # left_visit values must be unique within every batch.
            assert len(set(r[2] for r in rowGroup)) == len(rowGroup)
            # Insert into this app's table; the original targeted the old
            # FOKS table lexicon_placename, mismatching the DELETE above.
            cursor.executemany(
                    """
                    INSERT INTO jp_places_placename
                    (name, reading, left_visit, right_visit, path)
                    VALUES
                    (%s, %s, %s, %s, %s)
                    """,
                    rowGroup,
                )
        cursor.close()

        PlaceNameAlias.refreshAliases(root_node)
        return
+        
+#----------------------------------------------------------------------------#
+
+class PlaceNameAlias(models.Model):
+    "An alias for a place name."
+    # Alternative surface form and reading for the linked PlaceName.
+    name = models.CharField(max_length=50)
+    reading = models.CharField(max_length=75)
+    place_name = models.ForeignKey(PlaceName, related_name='aliases')
+
+    def __unicode__(self):
+        return u'<PlaceNameAlias: %s -> %s>' % (self.name,
+                self.place_name.name)
+
+    class Meta:
+        verbose_name_plural = 'place name aliases'
+
+    @staticmethod
+    def refreshAliases(root_node):
+        "Rebuilds the alias table."
+        # Skip the artificial root node; every other node is checked
+        # against the suffix table below.
+        nodeIter = root_node.walk()
+        nodeIter.next() # skip root node
+
+        for node in nodeIter:
+            for g_suffix, p_suffix, depth in PlaceNameAlias._knownSuffixes:
+                if node.label.endswith(g_suffix) \
+                        and node['reading'].endswith(p_suffix):
+
+                    # Nodes under an unknown ('?') ancestor only qualify
+                    # for depth-unrestricted ('*') suffixes.
+                    ancestor_names = [n.label for n in node.ancestors]
+                    if u'?' in ancestor_names and depth != '*':
+                        continue
+
+                    # Strip the suffix only at the tree depth where it is
+                    # expected, or at any depth for '*'.
+                    if depth in ('*', len(ancestor_names)):
+                        alias_obj = PlaceNameAlias(
+                                name=node.label[:-len(g_suffix)],
+                                # NOTE(review): assumes node['id'] was set
+                                # by PlaceName.from_gazetteer() and matches
+                                # the database primary key -- confirm.
+                                reading=node['reading'][:-len(p_suffix)],
+                                place_name_id=node['id'],
+                            )
+                        # Skip aliases that would be empty after stripping.
+                        if alias_obj.name and alias_obj.reading:
+                            alias_obj.save()
+        return
+    
+    # (kanji suffix, kana suffix, expected tree depth or '*' for any):
+    # prefecture-level suffixes at depth 3, wards at depth 4, stations
+    # anywhere.
+    _knownSuffixes = [
+            (u'都', u'と', 3),
+            (u'県', u'けん', 3),
+            (u'府', u'ふ', 3),
+            (u'道', u'どう', 3),
+            (u'区', u'く', 4),
+            (u'駅', u'えき', u'*')
+        ]
+# -*- coding: utf-8 -*-
+# 
+#  place.py
+#  jp-places
+#  
+#  Created by Lars Yencken on 2009-04-09.
+#  Copyright 2009 Lars Yencken. All rights reserved.
+# 
+
+"""
+A data form for storing and manipulating place data.
+"""
+
+import random
+
+from cjktools.common import sopen
+
class Place(object):
    """
    A location or region and its associated label. It may contain other
    locations or regions within it.
    """
    __slots__ = ('label', 'reading', 'children', 'aliases', 'db', 'parent')

    def __init__(self, label, reading=None, children=None, aliases=None,
            db=None, parent=None):
        """
        label: the place's name; may not contain spaces, since the storage
            format is space-delimited.
        reading: optional phonetic reading (also space-free).
        children: optional {label: Place} map of contained places.
        aliases: optional list of alternative names, each space-free.
        """
        # Add restrictions imposed by our storage format.
        if ' ' in label or \
                (reading and ' ' in reading) or \
                (aliases and ' ' in ''.join(aliases)):
            raise ValueError("No spaces allowed in place details")

        # Accept str on Python 3, and either str or unicode on Python 2.
        assert isinstance(label, str) or type(label).__name__ == 'unicode', \
                "Expected string, not %r" % (label,)

        self.label = label
        self.reading = reading
        self.aliases = aliases
        self.db = db
        self.parent = parent
        self.children = children or {}

    def _get_ancestors(self):
        # Walk parent links up to the root, then order root-first. A list
        # is returned (not a reversed() iterator) so callers may index and
        # slice it.
        ancestors = [self]
        node = self
        while node.parent:
            node = node.parent
            ancestors.append(node)

        ancestors.reverse()
        return ancestors
    ancestors = property(_get_ancestors)

    def __contains__(self, label):
        "True if a direct child has the given label."
        return label in self.children

    def __getitem__(self, label):
        "Returns the direct child with the given label."
        return self.children[label]

    def add_child(self, place_node):
        "Adds a child place, setting its parent link to this node."
        # The original asserted `label not in self`, which raised TypeError
        # since Place defined no containment protocol.
        assert place_node.label not in self.children
        place_node.parent = self
        self.children[place_node.label] = place_node
        return

    def __repr__(self):
        text = self.__unicode__()
        if str is bytes:
            # Python 2: repr() must return a byte string.
            text = text.encode('utf8')
        return text

    def __unicode__(self):
        return '<Place: %s%s %d children>' % (
                self.label,
                (self.reading and ' /%s/' % self.reading or ''),
                len(self.children),
            )

    def dump(self, filename):
        """
        Dump this place hierarchy to the given filename.
        """
        o_stream = sopen(filename, 'w')
        try:
            for depth, place in self.walk():
                o_stream.write(place._to_line(depth) + '\n')
        finally:
            o_stream.close()
        return

    @classmethod
    def from_file(cls, filename):
        """Construct a new place hierarchy from the given filename."""
        i_stream = sopen(filename)
        lines = iter(enumerate(i_stream))

        depth, root_node = cls._from_line(next(lines)[1])
        if depth != 0:
            raise Exception("File %s should start with a root node" %
                    filename)

        # `path` holds the chain of parents at the current depth; the last
        # entry is the node new children attach to.
        path = [root_node]
        lastDepth = depth
        last_node = root_node
        for lineNo, line in lines:
            depth, node = cls._from_line(line)
            if depth == lastDepth + 1:
                # One level deeper, the last node was the parent.
                path.append(last_node)

            elif depth == lastDepth:
                # Same level, same parent.
                pass

            elif depth < lastDepth:
                # Up one or more levels.
                depthDiff = lastDepth - depth
                path = path[:-depthDiff]

            else:
                # Jumped more than one level deeper at once.
                raise ValueError("Strange depth found %s (line %d)" % (
                        filename,
                        lineNo + 1
                    ))

            path[-1].append(node)
            last_node = node
            lastDepth = depth

        i_stream.close()

        return root_node

    def append(self, node):
        """Simulate a list appending."""
        # NOTE(review): unlike add_child(), this does not set node.parent;
        # trees built via from_file() therefore carry no parent links.
        # Setting it here would change __eq__ (which compares parents by
        # identity), so the behaviour is preserved -- confirm intent.
        self.children[node.label] = node
        return

    def find_node(self, label):
        """
        Searches for the given node name in a breadth first manner.
        Returns the (node, path) tuple of the first match.
        """
        if label == self.label:
            # Self-match: empty path. (Previously returned a bare [],
            # breaking the documented (node, path) contract.)
            return self, []

        nextFrontier = []
        frontier = [([], self)]

        while frontier or nextFrontier:
            while frontier:
                current_path, next_node = frontier.pop()
                if label in next_node:
                    # Success!
                    target_node = next_node[label]
                    target_path = current_path + [label]
                    return target_node, target_path
                else:
                    # The original iterated next_node.iteritems(), which
                    # Place never defined; expand via the children map.
                    for new_label, new_node in next_node.children.items():
                        nextFrontier.append(
                                (current_path + [new_label], new_node)
                            )
            else:
                # Exhausted this level; move one level deeper.
                frontier = nextFrontier
                nextFrontier = []
                random.shuffle(frontier)

        raise KeyError('No such node %s' % label)

    def find_all(self, label):
        """
        Finds all nodes which match the given label, returning them as a list
        of (path, node) tuples.
        """
        results = []

        for path, node in self.walk_with_path():
            if node.label == label:
                results.append((path, node))

        return results

    def walk(self):
        """
        Returns an iterator over the entire tree, yielding nodes in order
        in (depth, node) pairs.
        """
        # Use an output stack to avoid recursion. The reverse() calls ensure
        # that they get re-parsed in the same order they came out.
        output_stack = [(0, self)]
        output_stack.reverse()
        while output_stack:
            depth, place = output_stack.pop()
            yield depth, place
            children = [(depth+1, p) for p in place.children.values()]
            children.reverse()
            output_stack.extend(children)

        return

    def walk_with_path(self):
        """
        Returns an iterator over the entire tree. Each node is iterated
        over as a (path, node) pair, where path is the '/'-joined chain of
        ancestor labels.
        """
        path = []
        lastDepth = 0
        last_node = None
        for depth, node in self.walk():
            if depth == lastDepth + 1:
                # One level deeper, the last node was the parent.
                path.append(last_node.label)

            elif depth == lastDepth:
                # Same level, same parent.
                pass

            elif depth < lastDepth:
                # Up one or more levels.
                depthDiff = lastDepth - depth
                path = path[:-depthDiff]

            else:
                raise Exception("didn't expect depth %d" % depth)

            yield '/'.join(path), node

            last_node = node
            lastDepth = depth

        return

    def __eq__(self, rhs):
        # NOTE: parents are compared by identity, so two structurally equal
        # trees compare equal only when parent links are unset (see
        # append()).
        return self.label == rhs.label and \
                self.parent is rhs.parent and \
                self.aliases == rhs.aliases and \
                self.children == rhs.children

    def __cmp__(self, rhs):
        # Python 2 only; orders places by label.
        return cmp(self.label, rhs.label)

    def _to_line(self, depth):
        """
        Given a depth, returns an output line as a string.
        """
        if self.aliases:
            # Colon-join the aliases to match _from_line(); writing the raw
            # list repr embedded spaces and broke round-tripping.
            return '%d %s %s %s' % (depth, self.label, self.reading,
                    ':'.join(self.aliases))
        else:
            return '%d %s %s' % (depth, self.label, self.reading)

    @staticmethod
    def _from_line(line):
        """
        Parses a single line, returning a (depth, Place) pair.
        """
        line_objs = line.rstrip().split()
        if len(line_objs) == 3:
            depth, label, reading = line_objs
            depth = int(depth)
            if reading == 'None':
                reading = None
            return depth, Place(label, reading)

        elif len(line_objs) == 4:
            depth, label, reading, aliases = line_objs
            depth = int(depth)
            if reading == 'None':
                reading = None
            aliases = aliases.split(':')
            # Pass aliases by keyword: the third positional argument is
            # `children`, which the original filled by mistake.
            return depth, Place(label, reading, aliases=aliases)

        else:
            raise ValueError("Can't parse line %s" % line)
+

src/place_database.py

+# -*- coding: utf-8 -*-
+# 
+#  place_database.py
+#  foks
+#  
+#  Created by Lars Yencken on 2007-07-11.
+#  Copyright 2007-2008 Lars Yencken. All rights reserved.
+# 
+
+"A script to build place-name database tables for FOKS."
+
+#----------------------------------------------------------------------------#
+
+from os.path import join, exists
+
+from django.conf import settings
+from cjktools.common import sopen
+import consoleLog
+
+from foks.lexicon import models, unified_places
+
+#----------------------------------------------------------------------------#
+
+# Optional pre-built gazetteer dump; when present it is preferred over
+# rebuilding the unified hierarchy from scratch.
+_place_gazetteer = join(settings.DATA_DIR, u'place_gazetteer.gz')
+
+# Inputs whose modification should invalidate any cached build products.
+dependencies = [_place_gazetteer, unified_places.__file__, models.__file__]
+
+log = consoleLog.default
+
+#----------------------------------------------------------------------------#
+
class PlaceDatabase(object):
    """
    Generates the dictionary and hierarchy of place names and stores them
    into the lexicon_placename and lexicon_placenamealias database tables,
    all via the build() method.
    """

    def __init__(self):
        # Nothing is loaded until build() is first invoked.
        self.root_node = None
        self.built = False

    def build(self):
        """
        Builds the lexicon_placename and lexicon_placenamealias database tables.
        """
        # Idempotent: subsequent calls are no-ops.
        if self.built:
            return

        log.start('Building place tables', nSteps=2)
        log.log('Constructing place hierarchy')
        if exists(_place_gazetteer):
            # Prefer the pre-built gazetteer dump when one is available.
            self.root_node = unified_places.UnifiedHierarchy.from_file(
                    _place_gazetteer)
        else:
            self.root_node = unified_places.UnifiedHierarchy.get_cached()

        log.log('Storing places to the database')
        models.PlaceName.from_gazetteer(self.root_node)
        log.finish()

        self.built = True
+
+#----------------------------------------------------------------------------#
+
def build():
    "Module-level convenience entry point: builds the place-name tables."
    obj = PlaceDatabase()
    obj.build()

#----------------------------------------------------------------------------#

if __name__ == '__main__':
    # Plain str literal: __name__ is always a native string, so the u''
    # prefix the original used added nothing.
    build()

src/test_goitaikei.py

+# -*- coding: utf-8 -*-
+#----------------------------------------------------------------------------#
+# testGoitaikei.py
+# Lars Yencken <lars.yencken@gmail.com>
+# vim: ts=4 sw=4 sts=4 et tw=78:
+# Fri Jun  8 09:57:21 2007
+#
+#----------------------------------------------------------------------------# 
+
+import sys
+import codecs
+import unittest
+from goitaikei import GoitaikeiHierarchy
+
+# On Python 2, stdout may be a plain file object with no encoding set;
+# wrap it so unicode test labels print as UTF-8 (needed on OS X).
+if type(sys.stdout) == file:
+    # Wrap the output with a utf8 decoder for OS X.
+    sys.stdout = codecs.getwriter('utf8')(sys.stdout)
+
+#----------------------------------------------------------------------------#
+
def suite():
    "Builds the test suite for this module."
    # The trailing comma matters: without it the parentheses were plain
    # grouping, not the intended one-element tuple.
    testSuite = unittest.TestSuite((
            unittest.makeSuite(GoitaikeiTestCase),
        ))
    return testSuite
+
+#----------------------------------------------------------------------------#
+
+class GoitaikeiTestCase(unittest.TestCase):
+    """
+    This class tests the Goitaikei class. 
+    """
+    def setUp(self):
+        # Use the shared disk-cached hierarchy; building from scratch is
+        # expensive.
+        self.obj = GoitaikeiHierarchy.get_cached()
+        self.root = self.obj.root
+        # A known-good path down to the 'station' category, and a path
+        # with a bogus final component.
+        self.ekiPath = [u'具体', u'場', u'施設', u'公共施設', u'駅・港', u'駅']
+        self.badPath = [u'具体', u'場', u'sour eggs']
+        pass
+
+    def testFindNode(self):
+        """Tests finding a node by label."""
+        eki_node = self.root.find_node(u'駅')
+        self.assertEqual(eki_node.label, u'駅')
+        # Akihabara should be listed under the station category.
+        assert u'秋葉原' in eki_node.attrib['words']
+
+    def testGetPath(self):
+        """Test fetching a node at a known path."""
+        eki_node = self.root.get_path(self.ekiPath)
+        self.assertEqual(eki_node.label, u'駅')
+        assert u'秋葉原' in eki_node.attrib['words']
+        # Path lookup and label search must agree.
+        self.assertEqual(eki_node, self.root.find_node(u'駅'))
+        return
+
+    def testGetBadPath(self):
+        """Tests fetching a bad path."""
+        self.assertRaises(KeyError, self.root.get_path, self.badPath)
+
+    def testProbabilities(self):
+        """Tests that node probabilities steadily decrease down the tree."""
+        # cProb is cumulative over a node's subtree, so it can only shrink
+        # as we descend.
+        for node in self.root.walk():
+            if node.parent is not None:
+                assert node.parent['cProb'] >= node['cProb']
+        return
+
+    def testCopy(self):
+        # A copy must be deep: no node objects shared with the original.
+        c = self.root.copy()
+        
+        # Check they share the same number of nodes.
+        self.assertEqual(len(list(c.walk())), len(list(self.root.walk())))
+
+        # Mark every node in the copy, and make sure that none of the original
+        # nodes were marked (and thus shared with the copy).
+        for node in c.walk():
+            node.attrib['odd'] = 'purple monkey dishwasher'
+
+        for node in self.root.walk():
+            assert 'odd' not in node.attrib
+
+        return
+
+    def testPrune(self):
+        # NOTE(review): 2705 is tied to the bundled dataset version; update
+        # this constant when the data changes.
+        n_nodes = 2705
+        self.assertEqual(n_nodes, len(list(self.root.walk())))
+
+        # Expected to remove nothing: every node should have some words.
+        node = self.root.prune(lambda x: len(x.attrib['words']) > 0)
+        self.assertEqual(n_nodes, len(list(node.walk())))
+
+        node = self.root.prune(lambda x: len(x.attrib['words']) > 10)
+        assert 0 < len(list(node.walk())) < n_nodes
+
+        node = self.root.prune(lambda x: len(x.attrib['words']) > 100)
+        assert 0 < len(list(node.walk())) < n_nodes
+
+    def testWalk(self):
+        # Postorder contract: every child is visited before its parent.
+        for node in self.root.walk_postorder():
+            for child in node.children.values():
+                assert 'isMarked' in child.attrib
+            node.attrib['isMarked'] = True
+        return
+
+    def tearDown(self):
+        pass
+
+#----------------------------------------------------------------------------#
+
if __name__ == "__main__":
    # Run this module's suite directly when invoked as a script.
    test_runner = unittest.TextTestRunner(verbosity=1)
    test_runner.run(suite())
+
+#----------------------------------------------------------------------------#
+
+# vim: ts=4 sw=4 sts=4 et tw=78:

src/test_place.py

+# -*- coding: utf-8 -*-
+#
+#  test_place.py
+#  jp-places
+# 
+#  Created by Lars Yencken on 10-04-2009.
+#  Copyright 2009 Lars Yencken. All rights reserved.
+#
+
+import unittest
+import os
+
+from place import Place
+
+import warnings
+# os.tmpnam() (used in PlaceTestCase.setUp) emits a RuntimeWarning about
+# being a potential security risk; silence it for the test run.
+warnings.simplefilter("ignore", RuntimeWarning)
+
+#----------------------------------------------------------------------------#
+
def suite():
    "Builds the test suite for this module."
    # The trailing comma matters: without it the parentheses were plain
    # grouping, not the intended one-element tuple.
    testSuite = unittest.TestSuite((
            unittest.makeSuite(PlaceTestCase),
        ))
    return testSuite
+
+#----------------------------------------------------------------------------#
+
class PlaceTestCase(unittest.TestCase):
    """
    This class tests the Place class. 
    """
    def setUp(self):
        self.melb = Place(u'Melbourne', u'メルボルン')
        self.aust = Place(u'Australia', u'オーストラリア')
        # os.tmpnam() is deprecated and race-prone (the reason for the
        # module-level RuntimeWarning filter); create the scratch file
        # atomically instead.
        import tempfile
        fd, self.filename = tempfile.mkstemp()
        os.close(fd)

    def testBasics(self):
        "Checks construction and simple parent/child containment."
        self.assertEqual(self.melb.label, u'Melbourne')
        self.assertEqual(self.melb.reading, u'メルボルン')

        self.aust.append(self.melb)

        assert self.melb.label in self.aust.children
        return

    def test_formatting(self):
        "Round-trips a small hierarchy through dump() and from_file()."
        originalPlace = self.aust
        melbourne = self.melb
        melbourne.append(Place('St_Kilda', u'セーントキルダ'))
        melbourne.append(Place('Collingwood', u'コーリングウード'))
        originalPlace.append(melbourne)
        sydney = Place('Sydney', u'シドニー')
        # Labels containing spaces must be rejected.
        self.assertRaises(ValueError, Place, "Anja's place")
        sydney.append(Place("Anja's_place"))
        originalPlace.append(sydney)

        originalPlace.dump(self.filename)
        newCopy = Place.from_file(self.filename)
        self.assertEqual(newCopy, originalPlace)

    def tearDown(self):
        # Clean up the temp file we may have used.
        if os.path.exists(self.filename):
            os.remove(self.filename)
        return
+
+#----------------------------------------------------------------------------#
+
if __name__ == "__main__":
    # Run this module's suite directly when invoked as a script.
    test_runner = unittest.TextTestRunner(verbosity=1)
    test_runner.run(suite())
+
+#----------------------------------------------------------------------------#
+
+# vim: ts=4 sw=4 sts=4 et tw=78:

src/test_unified_places.py

+# -*- coding: utf-8 -*-
+#----------------------------------------------------------------------------#
# test_unified_places.py
+# Lars Yencken <lars.yencken@gmail.com>
+# vim: ts=4 sw=4 sts=4 et tw=78:
+# Tue Jun 12 23:54:31 2007
+#
+#----------------------------------------------------------------------------# 
+
+import unittest
+from unified_places import UnifiedHierarchy
+
+#----------------------------------------------------------------------------#
+
def suite():
    """Return a test suite covering the UnifiedHierarchy model."""
    unified_tests = unittest.makeSuite(UnifiedPlacesTestCase)
    return unittest.TestSuite(unified_tests)
+
+#----------------------------------------------------------------------------#
+
class UnifiedPlacesTestCase(unittest.TestCase):
    """
    Exercises the cached UnifiedHierarchy model: walking the tree and
    looking up prefecture readings.
    """
    def setUp(self):
        self.model = UnifiedHierarchy.get_cached()

    def testWalk(self):
        """Walking the hierarchy visits the Tokyo prefecture node."""
        labels = set(node.label for node in self.model.walk())
        assert u'東京都' in labels

    def test_prefecture_readings(self):
        """Known prefectures carry their expected kana readings."""
        japan = self.model.children[u'日本']
        expected_readings = [
                (u'大阪府', u'おおさかふ'),
                (u'東京都', u'とうきょうと'),
                (u'北海道', u'ほっかいどう'),
            ]
        for prefecture, reading in expected_readings:
            self.assertEqual(japan.children[prefecture]['reading'],
                    reading)

    def tearDown(self):
        pass
+
+#----------------------------------------------------------------------------#
+
# Run this module's test suite directly from the command line.
if __name__ == "__main__":
    unittest.TextTestRunner(verbosity=1).run(suite())
+
+#----------------------------------------------------------------------------#
+
+# vim: ts=4 sw=4 sts=4 et tw=78:
+

src/unified_places.py

+# -*- coding: utf-8 -*-
+# 
+#  unified_places.py
+#  foks
+#  
+#  Created by Lars Yencken on 2007-06-09.
+#  Copyright 2007-2008 Lars Yencken. All rights reserved.
+# 
+
+"""
This module deals with place names and geographic information. It attempts
+to reconcile information from jplaces, enamdict, Japan Post and Goitaikei,
+and to use this information to construct a new holistic resource.
+"""
+
+#----------------------------------------------------------------------------#
+
+import os
+from os.path import join
+from itertools import imap
+
+from cjktools.common import sopen
+from cjktools import scripts
+from cjktools.resources.autoFormat import loadDictionary
+from cjktools.resources.splitByCodes import loadCodedDictionary
+from consoleLog.progressBar import withProgress
+from hierarchy.tree import TreeNode
+from django.conf import settings
+
+from goitaikei import GoitaikeiHierarchy
+
+#----------------------------------------------------------------------------#
+
# Source dictionaries and cache locations, all under the deployment's
# data directory (taken from the Django settings).
DATA_DIR = settings.DATA_DIR
_enamdict_file = join(DATA_DIR, 'dict', 'je_enamdict.gz')
_jplaces_file = join(DATA_DIR, 'dict', 'je_jplaces.gz')
# Japan Post gazetteer input used by UnifiedHierarchy.create().
_gazetteer_file = join(DATA_DIR, 'jp_places.gz')
_edict_file = join(DATA_DIR, 'dict', 'je_edict.gz')
# Cached dump of the unified hierarchy, written by get_cached().
_place_file = join(DATA_DIR, 'place_gazetteer.gz')
+
+#----------------------------------------------------------------------------#
+
+class GazetteerNode(TreeNode):
+    """A simple wrapper for TreeNode, representing instead a place."""
+    @classmethod
+    def from_file(cls, filename):
+        return cls.from_gazetteer_file(filename)
+
+    @classmethod
+    def from_gazetteer_file(cls, filename):
+        """Parses an entire gazetteer file, returning the root node."""
+        i_stream = sopen(filename, 'r', 'utf8')
+        lines = iter(i_stream)
+        depth, root_node = cls._parse_line(lines.next())
+        if depth != 0:
+            raise Exception, "gazetteer should start with a root node"
+        path = [root_node]
+        for depth, node in imap(cls._parse_line, lines):
+            path[depth-1].add_child(node)
+            if depth >= len(path):
+                path.append(node)
+            else:
+                path[depth] = node
+        i_stream.close()
+        return root_node
+
+    @classmethod
+    def _parse_line(cls, line):
+        """Parses a single line of the gazetteer file."""
+        line = line.strip()
+        depth, name, reading = line.split()
+        depth = int(depth)
+        reading = (reading == 'None') and None or reading
+        return depth, cls(name, attrib={'reading': reading})
+
+    def to_file(self, filename):
+        o_stream = sopen(filename, 'w', 'utf8')
+        for node in self.walk():
+            print >> o_stream, '%d %s %s' % (len(node.ancestors)-1, node.label,
+                    node.attrib.get('reading'))
+        o_stream.close()
+
+#----------------------------------------------------------------------------#
+
# (kanji suffix, kana reading) pairs for the administrative suffixes
# on Japanese prefecture names, e.g. 大阪府 = 大阪 + 府 (ふ).
_prefecture_suffixes = [
        (u'県', u'けん'),
        (u'府', u'ふ'),
        (u'都', u'と'),
    ]
+
+class UnifiedHierarchy(GazetteerNode):
+    """A unified place hierarchy."""
+
+    #------------------------------------------------------------------------#
+    # PUBLIC METHODS
+    #------------------------------------------------------------------------#
+    def __init__(self, *args, **kwargs):
+        GazetteerNode.__init__(self, *args, **kwargs)
+        self.transliterations = {}
+
+    @classmethod
+    def create(cls):
+        root_node = UnifiedHierarchy('*')
+        print 'Building the unified place hierarchy'
+        print '├─ Loading resources'
+        unknown_places = {}
+        print '│  ├─ Edict'
+        unknown_places.update(loadCodedDictionary(_edict_file)['p'])
+        print '│  ├─ Jplaces'
+        japanese_places = dict(loadDictionary(_jplaces_file))
+        print '│  ├─ Enamdict'
+        enamdict = loadCodedDictionary(_enamdict_file)
+        unknown_places.update(enamdict['p'])
+        japanese_places.update(enamdict['st'])
+        del enamdict
+
+        print '│  ├─ Japan post gazetteer'
+        japan = GazetteerNode.from_file(_gazetteer_file)
+
+        print '│  └─ Goitaikei'
+        hierarchy = GoitaikeiHierarchy.get_cached()
+        countries = hierarchy.root.find_node(u'国')['words']
+        del hierarchy
+
+        print '├─ Adding countries'
+        root_node.add_child(japan)
+        # Add other countries from Goitaikei.
+        cls._add_countries(root_node, countries, unknown_places)
+
+        # Add readings for each prefecture.
+        cls._add_prefecture_readings(root_node, japanese_places,
+                unknown_places)
+
+        # Insert objects whose place in the hierarchy is unknown into Japan.
+        print '└─ Inserting unknown places'
+        cls.transliterations = {}
+        cls._insert_unknowns(root_node, japanese_places, inJapan=True,
+                keep_gloss=False)
+        cls._insert_unknowns(root_node, unknown_places, inJapan=False,
+                keep_gloss=True)
+
+        return root_node
+
+    #------------------------------------------------------------------------#
+
+    @classmethod
+    def get_cached(cls):
+        """Fetches a disk or memory cached version."""
+        if not hasattr(cls, '_cached'):
+            if os.path.exists(_place_file):
+                cls._cached = cls.from_file(_place_file)
+            else:
+                cls._cached = cls.create()
+                cls._cached.to_file(_place_file)
+
+        return cls._cached
+
+    #------------------------------------------------------------------------#
+
+    _separator = ':::'
+
+    def to_file(self, filename):
+        # Dump the place hierarchy.
+        GazetteerNode.to_file(self, filename)
+
+        # Dump the transliterations.
+        translitFile = filename + '.tl'
+        sep = UnifiedHierarchy._separator
+        o_stream = sopen(translitFile, 'w')
+        for word, translit in self.transliterations.iteritems():
+            translit = translit.strip()
+            if sep in word or sep in translit:
+                continue
+
+            print >> o_stream, sep.join( (word, translit) )
+        o_stream.close()
+        return
+
+    #------------------------------------------------------------------------#
+
+    @classmethod
+    def from_file(cls, filename):
+        """Fetches a disk-dumped version from a file."""
+        result_obj = cls.from_gazetteer_file(filename)
+
+        sep = UnifiedHierarchy._separator
+        transliterations = {}
+        translitFile = filename + '.tl'
+        i_stream = sopen(translitFile)
+        for line in i_stream:
+            word, translit = line.rstrip().split(sep)
+            transliterations[word] = translit
+        i_stream.close()
+        result_obj.transliterations = transliterations
+
+        return result_obj
+
+    #------------------------------------------------------------------------#
+    # PRIVATE METHODS
+    #------------------------------------------------------------------------#
+
+    @staticmethod
+    def _add_prefecture_readings(root_node, japanese_places, unknown_places):
+        """
+        Adds readings for the prefectures, which have nodes but no reading.
+        """
+        japan = root_node.children[u'日本']
+        for prefecture_node in japan.children.values():
+            label = prefecture_node.label
+            # Try to find a naive match.
+            for jp_dict in (japanese_places, unknown_places):
+                if label in jp_dict:
+                    reading_set = japanese_places[label].readings
+                    if len(reading_set) != 1:
+                        raise Exception, "Unknown reading prefecture"
+
+                    (unique_reading,) = reading_set
+                    prefecture_node.attrib['reading'] = unique_reading
+                    break
+
+            else:
+                # We didn't find an exact match. What if we drop the suffix?
+                for suffix, suffix_reading in root_node._prefecture_suffixes:
+                    if not label.endswith(suffix):
+                        continue
+
+                    base_label = label[:-len(suffix)]
+                    for jp_dict in (japanese_places, unknown_places):
+                        if base_label not in jp_dict:
+                            continue
+
+                        reading_set = japanese_places[label].readings
+                        if len(reading_set) != 1:
+                            raise Exception, "Unknown reading prefecture"
+
+                        prefecture_node.reading = reading_set[0] + \
+                                suffix_reading
+                        break
+
+            if 'reading' not in prefecture_node.attrib:
+                raise Exception, "No reading for prefecture %s" % \
+                        prefecture_node.label
+
+        return
+
+    #------------------------------------------------------------------------#
+
+    @staticmethod
+    def _add_countries(root_node, countries, pooled_dictionary):
+        """
+        Insert countries from the Goitaikei hierarchy.
+        """
+        # Split into countries and words.
+        kanji = scripts.Script.Kanji
+        kanji_countries = set([c for c in countries \
+                if scripts.containsScript(kanji, c)])
+        kanji_countries.remove(u'日本')
+        reading_countries = set([c for c in countries \
+                if not scripts.containsScript(kanji, c)])
+
+        print '│  ├─ Adding kanji countries'
+        for country in withProgress(kanji_countries):
+            if country in pooled_dictionary:
+                reading = pooled_dictionary[country].readings[0]
+                removal_reading = reading
+
+            elif country + u'国' in pooled_dictionary:
+                country += u'国'
+                reading = pooled_dictionary[country].readings[0]
+                hiragana_reading = scripts.toHiragana(reading)
+                assert hiragana_reading[-2:] in (u'くに', u'こく', u'ごく')
+                removal_reading = reading[:-2]
+
+            else:
+                # print 'No readings for %s' % country
+                continue
+
+            root_node.add_child(GazetteerNode(country,
+                    attrib={'reading': reading}))
+            if removal_reading in reading_countries:
+                reading_countries.remove(removal_reading)
+
+        print '│  └─ Adding non-kanji countries'
+        for country in withProgress(reading_countries):
+            root_node.add_child(TreeNode(country,
+                    attrib={'reading': scripts.toHiragana(country)}))
+
+        return
+
+    #------------------------------------------------------------------------#
+
+    @staticmethod
+    def _insert_unknowns(root_node, unknown_dict, inJapan=False, keep_gloss=True):
+        """Insert all the unknown objects into the base hierarchy."""
+        existing_locs = {}
+        for node in root_node.walk():
+            existing_locs[node.label] = \
+                    existing_locs.setdefault(node.label, 0) + 1
+
+        unknown_parent = GazetteerNode('?')
+        
+        for name, entry in withProgress(unknown_dict.items(), 50):
+            if name in existing_locs:
+                continue
+
+            # Ignore prefecture names.
+            matched_prefecture = False
+            for suffix, reading_suffix in _prefecture_suffixes:
+                if name + suffix in existing_locs:
+                    matched_prefecture = True
+                    print 'Skipped %s (%s)' % (name, name + suffix)
+                    break
+
+            if matched_prefecture:
+                continue
+
+            # Try station names:
+            if name.endswith(u'駅'):
+                station_name = name[:-1]
+                if existing_locs.get(station_name, 0) == 1:
+                    node = root_node.find_node(station_name)
+                    node.parent.add_child(GazetteerNode(name,
+                            attrib={'reading': entry.readings[0]}))
+                    existing_locs[name] = \
+                            existing_locs.setdefault(node.label, 0) + 1
+                    continue
+
+            existing_locs[name] = \
+                    existing_locs.setdefault(node.label, 0) + 1
+            newPlace = GazetteerNode(name,
+                    attrib={'reading': entry.readings[0]})
+            unknown_parent[name] = newPlace
+
+            # Add the translation/transliteration of this place.
+            if keep_gloss:
+                transliteration = ', '.join(entry.senses)
+                transliteration = transliteration.replace('(p)', '')
+                root_node.transliterations[name] = transliteration.strip()
+
+        # Insert the unknown place in our hierarchy.
+        if inJapan:
+            # Prefecture is unknown
+            root_node.children[u'日本'].add_child(unknown_parent)
+        else:
+            # Could be another country
+            root_node.add_child(unknown_parent)
+
+        return
+
+#----------------------------------------------------------------------------#
+
# Build (or load) the cached hierarchy when run as a script.
if __name__ == '__main__':
    UnifiedHierarchy.get_cached()
+
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.