# Source: jp-places / src / goitaikei.py
# -*- coding: utf-8 -*-
#----------------------------------------------------------------------------#
# goitaikei.py
# Lars Yencken <lars.yencken@gmail.com>
# vim: ts=4 sw=4 sts=4 et tw=78:
# Thu Jun  7 22:06:56 2007
#
#----------------------------------------------------------------------------#

"""
A structure model for the Goitaikei ontology. Provides a nice API for
accessing the underlying tree structure.
"""

#----------------------------------------------------------------------------#

import os

from django.conf import settings
from cjktools.common import sopen
from cjktools.maps import invertMapping
from cjktools.smartCache import diskProxyDirect
from hierarchy.tree import TreeNode
from simplestats import FreqDist

#----------------------------------------------------------------------------#
# GLOBAL
#----------------------------------------------------------------------------#

# Base directory holding the Goitaikei ontology data files.
_goitaikeiDir = os.path.join(settings.DATA_DIR, 'corpus', 'goitaikei')
# Gzipped file describing the category tree structure (one node per line).
_index_file = os.path.join(_goitaikeiDir, 'NTT-CThesaurus-struct-tree.gz')
# Gzipped file listing leaf words and their category ids.
_leaf_file = os.path.join(_goitaikeiDir, 'NTT-CThesaurus.gz')
# Gzipped corpus word-frequency counts, used to estimate node probabilities.
_jpWordFreqFile = os.path.join(settings.DATA_DIR, 'corpus',
        'jp_word_corpus_counts.gz')
# Id of the single root category in the index file.
_rootId = 1

#----------------------------------------------------------------------------#
# PUBLIC
#----------------------------------------------------------------------------#

class GoitaikeiHierarchy(object):
    """
    A dictionary style object which can be queried for the object
    hierarchy. The hierarchy is a tree of category nodes, each annotated
    with its member words, their characters, and probability estimates.
    """
    #------------------------------------------------------------------------#
    # PUBLIC METHODS
    #------------------------------------------------------------------------#

    def __init__(self, leafFile=_leaf_file, indexFile=_index_file):
        """
        Constructor. Parses the Goitaikei ontology, determining the
        root TreeNode object, then annotates the tree with secondary
        features (character sets, word index, probabilities).

        @param leafFile: Gzipped file listing leaf words and their
            category ids.
        @param indexFile: Gzipped file describing the category tree.
        """
        self.root = self._build_tree(leafFile, indexFile)

        self._buildExtras()

    #------------------------------------------------------------------------#

    def toVector(self):
        """
        Counts the number of words in each node, generating a vector
        (one entry per node, in tree-walk order).
        """
        empty = []
        # [FIX] The hierarchy object itself has no walk() method; the
        # traversal lives on the root TreeNode (cf. _buildExtras).
        return [len(n.get('words', empty)) for n in self.root.walk()]

    #------------------------------------------------------------------------#

    @classmethod
    def get_cached(cls):
        """
        Returns a cached instance, or builds a new one if no cached
        instance is available.
        """
        if not hasattr(cls, '_cached'):
            # Wrap the constructor in a disk-cache proxy so the expensive
            # parse is redone only when the data files or this module change.
            buildNewHierarchy = diskProxyDirect(
                    GoitaikeiHierarchy,
                    os.path.join(settings.CACHE_DIR, 'goitaikei.cache'),
                    dependencies=[__file__, _index_file, _leaf_file],
                )
            cls._cached = buildNewHierarchy()

        return cls._cached

    #------------------------------------------------------------------------#
    # PRIVATE METHODS
    #------------------------------------------------------------------------#

    def _build_tree(self, leafFile, indexFile):
        """
        Build the main Goitaikei tree, returning its root node.
        """
        # Parse a list of words for each category id.
        idToWords = self._parseWordCategories(leafFile)

        # Get the tree structure of the information.
        idToNode, idToWords = self._parseTreeStructure(indexFile, idToWords)

        # Add words to the tree structure.
        for nodeId, words in idToWords.iteritems():
            # Ignore silently categories missing from our id index.
            if nodeId in idToNode:
                idToNode[nodeId].attrib['words'].update(words)

        return idToNode[_rootId]

    #------------------------------------------------------------------------#

    def _buildExtras(self):
        """
        Build in secondary features for convenience: each node's character
        set, a word -> containing-nodes index, and per-node probability
        mass estimated from corpus frequency counts.
        """
        # Give each node a set of all characters used in that node, and
        # generate a map from words to all nodes containing them.
        wordsToClasses = {}
        n_words = 0
        for node in self.root.walk():
            char_set = set()
            for word in node['words']:
                char_set.update(word)
                wordsToClasses.setdefault(word, set()).add(node)
                n_words += 1

            node['chars'] = char_set

        self.wordsToClasses = wordsToClasses

        # Append node frequency. Each ontology word gets one extra count,
        # so words absent from the corpus still carry some mass.
        word_dist = FreqDist.from_file(_jpWordFreqFile)
        for word in wordsToClasses:
            word_dist.inc(word)

        # Post-order walk: children are finished before their parent, so
        # the parent can sum the children's cumulative probabilities.
        for node in self.root.walk_postorder():
            assert 'prob' not in node
            nodeFreq = 0.0
            for word in node['words']:
                # A word in several categories splits its frequency mass
                # evenly between them.
                wordFreq = word_dist.count(word)
                nodeFreq += wordFreq / float(len(wordsToClasses[word]))
            node['prob'] = nodeFreq / word_dist.total
            node['cProb'] = node['prob'] + \
                    sum(n['cProb'] for n in node.children.itervalues())
            assert 0 <= node['prob'] <= 1
            assert 0 <= node['cProb'] <= 1

        return

    #------------------------------------------------------------------------#

    def _loadUpperNodes(self, indexFile):
        """
        Loads the tree-structure of categories as a directed graph, from
        child to parent. Ignores category membership of these nodes.

        NOTE(review): not called by the current construction path
        (_build_tree uses _parseTreeStructure instead); it assumes the
        self.wordToId, self.categoryParent and self.wordCategories dicts
        already exist.
        """
        i_stream = sopen(indexFile)
        for line in i_stream:
            nodeId, word, parent, wordCategories = line.rstrip().split()
            nodeId = int(nodeId)
            assert word not in self.wordToId, "Doubling up on %s" % word
            self.wordToId[word] = nodeId
            if parent != '*':
                # A non-root node.
                self.categoryParent[nodeId] = int(parent)
            else:
                # The single root node.
                self.categoryParent[nodeId] = None

            # '*' entries are placeholders, not real category references.
            wordCategories = [w for w in wordCategories.split(':') if w != '*']
            self.wordCategories[nodeId] = set(map(int, wordCategories))

        i_stream.close()

        return

    #------------------------------------------------------------------------#

    def _loadLeafNodes(self, leafFile):
        """
        Loads all the words which aren't themselves categories.

        NOTE(review): not called by the current construction path; see
        _parseWordCategories for the live equivalent.
        """
        i_stream = sopen(leafFile)
        for line in i_stream:
            line_objs = line.rstrip(':\n').split(':')
            word = line_objs[0]
            categories = map(int, line_objs[1:])
            # [FIX] Leaf words are by definition not categories, so they
            # have no id of their own; the original looked one up in
            # self.wordToId immediately after asserting it was absent,
            # which guaranteed a KeyError. Just record their membership.
            assert word not in self.wordToId
            assert word not in self.wordCategories
            self.wordCategories[word] = set(categories)
        i_stream.close()
        return

    #------------------------------------------------------------------------#

    def _parseWordCategories(self, leafFile):
        """
        Determine the categories of all leaf words.

        @return: A mapping from category id to the words it contains.
        """
        wordToIds = {}
        i_stream = sopen(leafFile)
        for line in i_stream:
            # Each line has the form "word:id1:id2:...:".
            line_objs = line.rstrip(':\n').split(':')
            word = line_objs[0]
            category_ids = map(int, line_objs[1:])
            wordToIds[word] = category_ids
        i_stream.close()

        # Invert to a category-id -> member-words mapping.
        idToWords = invertMapping(wordToIds)

        return idToWords

    #------------------------------------------------------------------------#

    def _parseTreeStructure(self, indexFile, idToWords):
        """
        Determine the tree structure of the ontology.

        @param idToWords: Category-id -> words mapping; extended in place
            with category names which are themselves members of other
            categories.
        @return: An (idToNode, idToWords) pair.
        """
        # Parse the category structure itself.
        idToNode = {}
        i_stream = sopen(indexFile)
        for line in i_stream:
            nodeId, word, parent, categories = line.rstrip().split()
            nodeId = int(nodeId)

            # Build the node itself.
            node = TreeNode(word, attrib={'id': nodeId, 'words': set()})
            idToNode[nodeId] = node

            # Add link from its parent; parents always precede children in
            # the file (asserted), so the parent node already exists.
            if parent != '*':
                parent = int(parent)
                assert parent < nodeId
                idToNode[parent].add_child(node)

            # Store additional word categories ('*' is a placeholder).
            categories = [int(w) for w in categories.split(':') if w != '*']
            for category_id in categories:
                idToWords.setdefault(category_id, []).append(word)

        i_stream.close()

        return idToNode, idToWords

    #------------------------------------------------------------------------#

#----------------------------------------------------------------------------#