1. Lars Yencken
  2. jp-places

Source

jp-places / src / unified_places.py

# -*- coding: utf-8 -*-
# 
#  unified_places.py
#  foks
#  
#  Created by Lars Yencken on 2007-06-09.
#  Copyright 2007-2008 Lars Yencken. All rights reserved.
# 

"""
This module deals with places and geographic information. It attempts
to reconcile information from jplaces, enamdict, Japan Post and Goitaikei,
and to use this information to construct a new holistic resource.
"""

#----------------------------------------------------------------------------#

import os
from os.path import join
from itertools import imap

from cjktools.common import sopen
from cjktools import scripts
from cjktools.resources.autoFormat import loadDictionary
from cjktools.resources.splitByCodes import loadCodedDictionary
from consoleLog.progressBar import withProgress
from hierarchy.tree import TreeNode
from django.conf import settings

from goitaikei import GoitaikeiHierarchy

#----------------------------------------------------------------------------#

# Root directory for all data files, taken from the Django settings.
DATA_DIR = settings.DATA_DIR

# Dictionary resources, kept in the 'dict' subdirectory.
_edict_file = os.path.join(DATA_DIR, 'dict', 'je_edict.gz')
_enamdict_file = os.path.join(DATA_DIR, 'dict', 'je_enamdict.gz')
_jplaces_file = os.path.join(DATA_DIR, 'dict', 'je_jplaces.gz')

# Gazetteer loaded as the Japan subtree when the hierarchy is built.
_gazetteer_file = os.path.join(DATA_DIR, 'jp_places.gz')

# Disk dump of the unified hierarchy, read and written by get_cached().
_place_file = os.path.join(DATA_DIR, 'place_gazetteer.gz')

#----------------------------------------------------------------------------#

class GazetteerNode(TreeNode):
    """
    A simple wrapper for TreeNode, representing instead a place.

    The node label is the place name, and attrib['reading'] holds its
    reading, or None when the reading is unknown. A hierarchy is stored on
    disk one node per line, as "<depth> <name> <reading>", with the root
    node first at depth 0 and every node listed before its descendants.
    """
    @classmethod
    def from_file(cls, filename):
        """Loads a place hierarchy from the given gazetteer file."""
        return cls.from_gazetteer_file(filename)

    @classmethod
    def from_gazetteer_file(cls, filename):
        """
        Parses an entire gazetteer file, returning the root node.

        Blank lines are ignored. Raises an Exception if the file contains
        no nodes, if it does not start with a depth 0 root node, or if a
        line's depth is not between 1 and one more than the depth of the
        previous node.
        """
        i_stream = sopen(filename, 'r', 'utf8')
        try:
            root_node = None
            # path[d] is the most recently parsed node at depth d.
            path = []
            for line in i_stream:
                if not line.strip():
                    continue

                depth, node = cls._parse_line(line)
                if root_node is None:
                    if depth != 0:
                        raise Exception(
                                "gazetteer should start with a root node")
                    root_node = node
                    path = [node]
                    continue

                if not 1 <= depth <= len(path):
                    raise Exception("gazetteer line has invalid depth %d: %s"
                            % (depth, line.strip()))

                path[depth - 1].add_child(node)
                # Forget nodes deeper than this one, so that a later line
                # cannot attach itself to a stale branch.
                del path[depth:]
                path.append(node)
        finally:
            # Close the stream even if parsing fails part-way through.
            i_stream.close()

        if root_node is None:
            raise Exception("gazetteer file is empty: %s" % filename)

        return root_node

    @classmethod
    def _parse_line(cls, line):
        """
        Parses a single line of the gazetteer file.

        Returns a (depth, node) pair. A reading written as the literal text
        'None' (which is how to_file() dumps a missing reading) is restored
        to None.
        """
        depth, name, reading = line.split()
        # An explicit test is needed here: the "cond and None or x" idiom
        # always evaluates to x, since None is false.
        if reading == 'None':
            reading = None
        return int(depth), cls(name, attrib={'reading': reading})

    def to_file(self, filename):
        """
        Dumps this node and everything walk() yields from it to the given
        file, in the format read by from_gazetteer_file().
        """
        # Depths are written relative to this node, so that the first line
        # is always at depth 0, as from_gazetteer_file() requires.
        base_depth = len(self.ancestors)
        o_stream = sopen(filename, 'w', 'utf8')
        try:
            for node in self.walk():
                o_stream.write('%d %s %s\n' % (
                        len(node.ancestors) - base_depth,
                        node.label,
                        node.attrib.get('reading'),
                    ))
        finally:
            o_stream.close()

#----------------------------------------------------------------------------#

# Pairs of (kanji suffix, kana reading of that suffix). They are used to
# match a prefecture label against a dictionary entry for the same name
# without its suffix, and to skip unknown places which are really a
# prefecture name with its suffix dropped.
_prefecture_suffixes = [
        (u'県', u'けん'),
        (u'府', u'ふ'),
        (u'都', u'と'),
    ]

class UnifiedHierarchy(GazetteerNode):
    """
    A unified place hierarchy.

    The tree is rooted at a '*' node whose children are countries. Besides
    the tree itself, the root node carries a `transliterations` dictionary
    mapping a place name to its gloss, which is dumped alongside the tree
    in a companion file named "<filename>.tl".
    """

    #------------------------------------------------------------------------#
    # PUBLIC METHODS
    #------------------------------------------------------------------------#
    def __init__(self, *args, **kwargs):
        GazetteerNode.__init__(self, *args, **kwargs)
        # Maps a place name to the comma-separated glosses of its entry.
        self.transliterations = {}

    @classmethod
    def create(cls):
        """
        Builds the hierarchy from scratch out of the Edict, Jplaces,
        Enamdict, Japan Post gazetteer and Goitaikei resources, and returns
        its root node. Progress is printed to stdout.
        """
        root_node = cls('*')
        print('Building the unified place hierarchy')
        print('├─ Loading resources')
        unknown_places = {}
        print('│  ├─ Edict')
        unknown_places.update(loadCodedDictionary(_edict_file)['p'])
        print('│  ├─ Jplaces')
        japanese_places = dict(loadDictionary(_jplaces_file))
        print('│  ├─ Enamdict')
        enamdict = loadCodedDictionary(_enamdict_file)
        unknown_places.update(enamdict['p'])
        japanese_places.update(enamdict['st'])
        del enamdict

        print('│  ├─ Japan post gazetteer')
        japan = GazetteerNode.from_file(_gazetteer_file)

        print('│  └─ Goitaikei')
        hierarchy = GoitaikeiHierarchy.get_cached()
        countries = hierarchy.root.find_node(u'国')['words']
        del hierarchy

        print('├─ Adding countries')
        root_node.add_child(japan)
        # Add other countries from Goitaikei.
        cls._add_countries(root_node, countries, unknown_places)

        # Add readings for each prefecture.
        cls._add_prefecture_readings(root_node, japanese_places,
                unknown_places)

        # Insert objects whose place in the hierarchy is unknown. Their
        # glosses are collected in root_node.transliterations, which
        # __init__ has already set to an empty dictionary.
        print('└─ Inserting unknown places')
        cls._insert_unknowns(root_node, japanese_places, inJapan=True,
                keep_gloss=False)
        cls._insert_unknowns(root_node, unknown_places, inJapan=False,
                keep_gloss=True)

        return root_node

    #------------------------------------------------------------------------#

    @classmethod
    def get_cached(cls):
        """
        Fetches a disk or memory cached version.

        The hierarchy is loaded from the dump file if one exists; otherwise
        it is built with create() and dumped for next time. Either way the
        result is remembered on the class for later calls.
        """
        if not hasattr(cls, '_cached'):
            if os.path.exists(_place_file):
                cls._cached = cls.from_file(_place_file)
            else:
                cls._cached = cls.create()
                cls._cached.to_file(_place_file)

        return cls._cached

    #------------------------------------------------------------------------#

    # Separates the word from its transliteration on each line of the
    # transliteration file.
    _separator = ':::'

    def to_file(self, filename):
        """
        Dumps the place hierarchy to the given file, and the
        transliterations to "<filename>.tl", one "word:::gloss" per line.
        Entries whose word or gloss contains the separator are left out,
        since they could not be split apart again on loading.
        """
        # Dump the place hierarchy.
        GazetteerNode.to_file(self, filename)

        # Dump the transliterations.
        sep = UnifiedHierarchy._separator
        o_stream = sopen(filename + '.tl', 'w', 'utf8')
        try:
            for word, translit in self.transliterations.items():
                translit = translit.strip()
                if sep in word or sep in translit:
                    continue

                o_stream.write(sep.join((word, translit)) + '\n')
        finally:
            o_stream.close()

    #------------------------------------------------------------------------#

    @classmethod
    def from_file(cls, filename):
        """
        Fetches a disk-dumped version from a file, together with the
        transliterations stored in "<filename>.tl".
        """
        result_obj = cls.from_gazetteer_file(filename)

        sep = UnifiedHierarchy._separator
        transliterations = {}
        i_stream = sopen(filename + '.tl', 'r', 'utf8')
        try:
            for line in i_stream:
                line = line.rstrip()
                if not line:
                    continue
                word, translit = line.split(sep)
                transliterations[word] = translit
        finally:
            i_stream.close()
        result_obj.transliterations = transliterations

        return result_obj

    #------------------------------------------------------------------------#
    # PRIVATE METHODS
    #------------------------------------------------------------------------#

    @staticmethod
    def _find_unique_reading(label, dictionaries):
        """
        Returns the sole reading of the given label, taken from the first
        of the dictionaries which has an entry for it, or None if none of
        them do. Raises an Exception if the entry found does not have
        exactly one reading.
        """
        for jp_dict in dictionaries:
            if label not in jp_dict:
                continue

            reading_set = jp_dict[label].readings
            if len(reading_set) != 1:
                raise Exception("Unknown reading prefecture")

            (unique_reading,) = reading_set
            return unique_reading

        return None

    @staticmethod
    def _add_prefecture_readings(root_node, japanese_places, unknown_places):
        """
        Adds readings for the prefectures, which have nodes but no reading.

        Each child of the Japan node is looked up by its full label first,
        then by its label minus a known prefecture suffix, in which case
        the suffix reading is appended to the reading found. Raises an
        Exception if no reading can be determined for some prefecture.
        """
        dictionaries = (japanese_places, unknown_places)
        find_reading = UnifiedHierarchy._find_unique_reading

        japan = root_node.children[u'日本']
        for prefecture_node in japan.children.values():
            label = prefecture_node.label

            # Try to find a naive match.
            reading = find_reading(label, dictionaries)

            if reading is None:
                # We didn't find an exact match. What if we drop the suffix?
                for suffix, suffix_reading in _prefecture_suffixes:
                    if not label.endswith(suffix):
                        continue

                    base_reading = find_reading(label[:-len(suffix)],
                            dictionaries)
                    if base_reading is not None:
                        reading = base_reading + suffix_reading
                        break

            if reading is None:
                raise Exception("No reading for prefecture %s" %
                        prefecture_node.label)

            prefecture_node.attrib['reading'] = reading

    #------------------------------------------------------------------------#

    @staticmethod
    def _add_countries(root_node, countries, pooled_dictionary):
        """
        Insert countries from the Goitaikei hierarchy.

        Country names containing kanji are added only when a reading for
        them (or for the name followed by 国) is found in
        pooled_dictionary. The remaining names are added with themselves,
        converted to hiragana, as their reading, unless they are just the
        reading of a kanji country already added.
        """
        # Split into countries and words.
        kanji = scripts.Script.Kanji
        kanji_countries = set()
        reading_countries = set()
        for c in countries:
            if scripts.containsScript(kanji, c):
                kanji_countries.add(c)
            else:
                reading_countries.add(c)

        # Japan is added separately by the caller, from the gazetteer.
        kanji_countries.discard(u'日本')

        print('│  ├─ Adding kanji countries')
        for country in withProgress(kanji_countries):
            if country in pooled_dictionary:
                reading = pooled_dictionary[country].readings[0]
                removal_reading = reading

            elif country + u'国' in pooled_dictionary:
                country += u'国'
                reading = pooled_dictionary[country].readings[0]
                hiragana_reading = scripts.toHiragana(reading)
                assert hiragana_reading[-2:] in (u'くに', u'こく', u'ごく')
                # Drop the two kana which read the 国 suffix.
                removal_reading = reading[:-2]

            else:
                # print 'No readings for %s' % country
                continue

            root_node.add_child(GazetteerNode(country,
                    attrib={'reading': reading}))
            # Don't add the same country again under its reading.
            reading_countries.discard(removal_reading)

        print('│  └─ Adding non-kanji countries')
        for country in withProgress(reading_countries):
            root_node.add_child(GazetteerNode(country,
                    attrib={'reading': scripts.toHiragana(country)}))

    #------------------------------------------------------------------------#

    @staticmethod
    def _insert_unknowns(root_node, unknown_dict, inJapan=False, keep_gloss=True):
        """
        Insert all the unknown objects into the base hierarchy.

        unknown_dict maps a place name to an entry providing `readings`
        and `senses`. Names already in the hierarchy are skipped, as are
        names which match an existing label once a prefecture suffix is
        appended. A name ending in 駅 is added as a sibling of the node for
        the name without that ending, if that label was unique in the
        hierarchy before this call. Everything else is collected under a
        new '?' node, which is attached to the Japan node if inJapan is
        set and to root_node otherwise. If keep_gloss is set, the senses
        of each place collected under '?' are recorded in
        root_node.transliterations.
        """
        # Index the hierarchy as it stands before anything is inserted:
        # every label in use, and the node for each label used only once.
        known_labels = set()
        unique_nodes = {}
        for node in root_node.walk():
            if node.label in known_labels:
                unique_nodes.pop(node.label, None)
            else:
                known_labels.add(node.label)
                unique_nodes[node.label] = node

        unknown_parent = GazetteerNode('?')

        for name, entry in withProgress(unknown_dict.items(), 50):
            if name in known_labels:
                continue

            # Ignore prefecture names.
            matched_prefecture = None
            for suffix, reading_suffix in _prefecture_suffixes:
                if name + suffix in known_labels:
                    matched_prefecture = name + suffix
                    break

            if matched_prefecture is not None:
                print('Skipped %s (%s)' % (name, matched_prefecture))
                continue

            new_place = GazetteerNode(name,
                    attrib={'reading': entry.readings[0]})
            known_labels.add(name)

            # Try station names:
            if name.endswith(u'駅'):
                anchor = unique_nodes.get(name[:-1])
                if anchor is not None and anchor.parent is not None:
                    anchor.parent.add_child(new_place)
                    continue

            # add_child() is used here, as for every other insertion in
            # this module, so that the place is linked in as a child node.
            unknown_parent.add_child(new_place)

            # Add the translation/transliteration of this place.
            if keep_gloss:
                transliteration = ', '.join(entry.senses)
                transliteration = transliteration.replace('(p)', '')
                root_node.transliterations[name] = transliteration.strip()

        # Insert the unknown place in our hierarchy.
        if inJapan:
            # Prefecture is unknown
            root_node.children[u'日本'].add_child(unknown_parent)
        else:
            # Could be another country
            root_node.add_child(unknown_parent)

#----------------------------------------------------------------------------#

if __name__ == '__main__':
    # Running this module as a script loads the unified hierarchy from its
    # disk dump, or builds and dumps it first if no dump exists yet.
    UnifiedHierarchy.get_cached()