# -*- coding: utf-8 -*-
#
#  goitaikei.py
#  jp-places
# 
#  Created by Lars Yencken on 10-04-2009.
#  Copyright 2009 Lars Yencken. All rights reserved.
#

"""
A structural model of the Goitaikei ontology. Provides a convenient API
for accessing the underlying tree structure.
"""

import os

from django.conf import settings
from cjktools.common import sopen
from cjktools.maps import invertMapping
from cjktools.smartCache import diskProxyDirect
from hierarchy.tree import TreeNode
from simplestats import FreqDist

_root_id = 1

class GoitaikeiHierarchy(object):
    """
    A dictionary-style object which can be queried for the Goitaikei
    category hierarchy.
    """
    #------------------------------------------------------------------------#
    # PUBLIC METHODS
    #------------------------------------------------------------------------#

    def __init__(self, leaf_file=None, index_file=None):
        """
        Constructor. Parses the Goitaikei ontology, determining the
        GoitaikeiNode object for the root node.
        """
        leaf_file = leaf_file or settings.GOITAIKEI_LEAF_FILE
        index_file = index_file or settings.GOITAIKEI_INDEX_FILE

        self.root = self._build_tree(leaf_file, index_file)

        self._build_extras()

    #------------------------------------------------------------------------#

    def to_vector(self):
        """
        Counts the number of words in each node, generating a vector
        ordered by the tree walk.
        """
        empty = []
        return [len(n.get('words', empty)) for n in self.root.walk()]
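
    # Usage sketch (hypothetical variable names): the vector is ordered by
    # the tree walk, so repeated calls on the same hierarchy line up
    # position by position.
    #
    #   sizes = hierarchy.to_vector()
    #   print sum(sizes)    # total word-node memberships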

    #------------------------------------------------------------------------#

    @classmethod
    def get_cached(cls, leaf_file=None, index_file=None):
        """
        Returns a cached instance, or builds a new one if no cached 
        instance is available.
        """
        leaf_file = leaf_file or settings.GOITAIKEI_LEAF_FILE
        index_file = index_file or settings.GOITAIKEI_INDEX_FILE
        if not hasattr(cls, '_cached'):
            build_new_hierarchy = diskProxyDirect(
                    GoitaikeiHierarchy,
                    os.path.join(settings.CACHE_DIR, 'goitaikei.cache'),
                    dependencies=[__file__, index_file, leaf_file],
                )
            cls._cached = build_new_hierarchy()

        return cls._cached
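
    # Usage sketch: the first call in a process builds or loads the
    # hierarchy; later calls are effectively free.
    #
    #   hierarchy = GoitaikeiHierarchy.get_cached()
    #   dog_classes = hierarchy.words_to_classes.get(u'犬', set())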

    #------------------------------------------------------------------------#
    # PRIVATE METHODS
    #------------------------------------------------------------------------#

    def _build_tree(self, leaf_file, index_file):
        """Build the main Goitaikei tree."""
        # Parse a list of words for each category.
        id_to_words = self._parse_word_categories(leaf_file)

        # Get the tree structure of the information.
        id_to_node, id_to_words = self._parse_tree_structure(
                index_file, id_to_words)
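
        # E.g. (hypothetical ids): if the leaf file assigned '犬' to
        # category 537 and node 537 appears in the index, the loop below
        # adds '犬' to that node's word set.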

        # Add words to the tree structure.
        for id, words in id_to_words.iteritems():
            # Silently ignore categories missing from our id index.
            if id in id_to_node:
                id_to_node[id].attrib['words'].update(words)

        return id_to_node[_root_id]

    #------------------------------------------------------------------------#

    def _build_extras(self):
        """Build in secondary features for convenience."""
        # Give each node a set of all characters used in that node, and
        # generate a map from words to all nodes containing them.
        words_to_classes = {}
        for node in self.root.walk():
            char_set = set()
            for word in node['words']:
                char_set.update(word)
                if word not in words_to_classes:
                    words_to_classes[word] = set([node])
                else:
                    words_to_classes[word].add(node)

            node['chars'] = char_set

        self.words_to_classes = words_to_classes

        # Attach probability estimates to each node. Each ontology word
        # gets one extra count, so that unseen words still carry some mass.
        word_dist = FreqDist.from_file(settings.JP_WORD_COUNTS_FILE)
        for word in words_to_classes:
            word_dist.inc(word)

        for node in self.root.walk_postorder():
            assert 'prob' not in node
            # Words shared between several classes have their frequency
            # split evenly between those classes.
            node_freq = 0.0
            for word in node['words']:
                word_freq = word_dist.count(word)
                node_freq += word_freq / float(len(words_to_classes[word]))
            node['prob'] = node_freq / word_dist.total
            # The postorder walk guarantees each child's c_prob is set
            # before its parent's is computed.
            node['c_prob'] = node['prob'] + \
                    sum(n['c_prob'] for n in node.children.itervalues())
            assert 0 <= node['prob'] <= 1
            assert 0 <= node['c_prob'] <= 1
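
        # Every node now carries 'chars', 'prob' and 'c_prob'; the root's
        # c_prob reaches 1.0 exactly when every corpus word appears
        # somewhere in the ontology.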

        return

    #------------------------------------------------------------------------#

    def _load_upper_nodes(self, index_file):
        """
        Loads the tree structure of categories as a directed graph from
        child to parent, along with the extra word categories listed for
        each node. Expects self.word_to_id, self.category_parent and
        self.word_categories to already exist. (Not called by the
        constructor, which uses _parse_tree_structure() instead.)
        """
        i_stream = sopen(index_file)
        for line in i_stream:
            id, word, parent, word_categories = line.rstrip().split()
            id = int(id)
            assert word not in self.word_to_id, "Doubling up on %s" % word
            self.word_to_id[word] = id
            if parent != '*':
                # A non-root node.
                self.category_parent[id] = int(parent)
            else:
                # The single root node.
                self.category_parent[id] = None

            word_categories = [w for w in word_categories.split(':')
                    if w != '*']
            self.word_categories[id] = set(map(int, word_categories))

        i_stream.close()

        return

    #------------------------------------------------------------------------#

    def _load_leaf_nodes(self, leaf_file):
        """
        Loads all the words which aren't themselves categories, recording
        each word's set of category ids. Assumes _load_upper_nodes() has
        already populated self.word_to_id. (Not called by the constructor,
        which uses _parse_word_categories() instead.)
        """
        i_stream = sopen(leaf_file)
        for line in i_stream:
            line_objs = line.rstrip(':\n').split(':')
            word = line_objs[0]
            categories = map(int, line_objs[1:])
            # Leaf words should never double as category names.
            assert word not in self.word_to_id
            assert word not in self.word_categories
            self.word_categories[word] = set(categories)
        i_stream.close()
        return

    #------------------------------------------------------------------------#

    def _parse_word_categories(self, leaf_file):
        """
        Determines the categories of all leaf words, returning a mapping
        from category id to the words it contains.
        """
        word_to_ids = {}
        i_stream = sopen(leaf_file)
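        # Each line is colon-separated: the word first, then its category
        # ids, e.g. (hypothetical ids):  犬:537:916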
        for line in i_stream:
            line_objs = line.rstrip(':\n').split(':')
            word = line_objs[0]
            category_ids = map(int, line_objs[1:])
            word_to_ids[word] = category_ids
        i_stream.close()

        id_to_words = invertMapping(word_to_ids)

        return id_to_words

    def _parse_tree_structure(self, index_file, id_to_words):
        """
        Determines the tree structure of the ontology, returning the map
        from id to node and the extended map from id to words.
        """
        # Parse the category structure itself.
        id_to_node = {}
        i_stream = sopen(index_file)
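        # Each line has four whitespace-separated fields: node id, category
        # word, parent id ('*' for the root), and a colon-separated list of
        # extra category ids ('*' when empty), e.g. (hypothetical):
        #
        #   389 動物 362 1056:2001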
        for line in i_stream:
            id, word, parent, categories = line.rstrip().split()
            id = int(id)

            # Build the node itself.
            node = TreeNode(word, attrib={'id': id, 'words': set()})
            id_to_node[id] = node

            # Add a link from its parent. The index lists parents before
            # their children, so the parent node already exists.
            if parent != '*':
                parent = int(parent)
                assert parent < id
                id_to_node[parent].add_child(node)

            # Store additional word categories.
            categories = [int(w) for w in categories.split(':') if w != '*']
            for category_id in categories:
                if category_id in id_to_words:
                    id_to_words[category_id].append(word)
                else:
                    id_to_words[category_id] = [word]

        i_stream.close()

        return id_to_node, id_to_words
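
#----------------------------------------------------------------------------#

if __name__ == '__main__':
    # Smoke-test sketch, not part of the library API: assumes Django
    # settings are configured (e.g. via DJANGO_SETTINGS_MODULE) and that
    # the Goitaikei data files are installed where the settings point.
    hierarchy = GoitaikeiHierarchy.get_cached()
    print 'Nodes:', len(hierarchy.to_vector())
    print 'Distinct words:', len(hierarchy.words_to_classes)
    print 'Root cumulative probability:', hierarchy.root['c_prob']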