Lars Yencken avatar Lars Yencken committed 53a78c1

Changes goitaikei to use underscore naming, and to use settings for file paths.

Comments (0)

Files changed (2)

 # -*- coding: utf-8 -*-
-#----------------------------------------------------------------------------#
-# goitaikei.py
-# Lars Yencken <lars.yencken@gmail.com>
-# vim: ts=4 sw=4 sts=4 et tw=78:
-# Thu Jun  7 22:06:56 2007
 #
-#----------------------------------------------------------------------------#
+#  goitaikei.py
+#  jp-places
+# 
+#  Created by Lars Yencken on 10-04-2009.
+#  Copyright 2009 Lars Yencken. All rights reserved.
+#
 
 """
 A structure model for the Goitaikei ontology. Provides a nice API for
 accessing the underlying tree structure.
 """
 
-#----------------------------------------------------------------------------#
-
 import os
 
 from django.conf import settings
 from hierarchy.tree import TreeNode
 from simplestats import FreqDist
 
-#----------------------------------------------------------------------------#
-# GLOBAL
-#----------------------------------------------------------------------------#
-
-_goitaikeiDir = os.path.join(settings.DATA_DIR, 'corpus', 'goitaikei')
-_index_file = os.path.join(_goitaikeiDir, 'NTT-CThesaurus-struct-tree.gz')
-_leaf_file = os.path.join(_goitaikeiDir, 'NTT-CThesaurus.gz')
-_jpWordFreqFile = os.path.join(settings.DATA_DIR, 'corpus',
-        'jp_word_corpus_counts.gz')
-_rootId = 1
-
-#----------------------------------------------------------------------------#
-# PUBLIC
-#----------------------------------------------------------------------------#
+_root_id = 1
 
 class GoitaikeiHierarchy(object):
     """
     # PUBLIC METHODS
     #------------------------------------------------------------------------#
 
-    def __init__(self, leafFile=_leaf_file, indexFile=_index_file):
+    def __init__(self, leaf_file=None, index_file=None):
         """
         Constructor. Parses the Goitaikei ontology, determining the
         GoitaikeiNode object for the root node.
         """
-        self.root = self._build_tree(leafFile, indexFile)
+        leaf_file = leaf_file or settings.GOITAIKEI_LEAF_FILE
+        index_file = index_file or settings.GOITAIKEI_INDEX_FILE
 
-        self._buildExtras()
+        self.root = self._build_tree(leaf_file, index_file)
+
+        self._build_extras()
 
     #------------------------------------------------------------------------#
 
-    def toVector(self):
+    def to_vector(self):
         """Counts the number of words in each node, generating a vector."""
         empty = []
         return [len(n.get('words', empty)) for n in self.walk()]
     #------------------------------------------------------------------------#
 
     @classmethod
-    def get_cached(cls):
+    def get_cached(cls, leaf_file=None, index_file=None):
         """
         Returns a cached instance, or builds a new one if no cached 
         instance is available.
         """
+        leaf_file = leaf_file or settings.GOITAIKEI_LEAF_FILE
+        index_file = index_file or settings.GOITAIKEI_INDEX_FILE
         if not hasattr(cls, '_cached'):
-            buildNewHierarchy = diskProxyDirect(
+            build_new_hierarchy = diskProxyDirect(
                     GoitaikeiHierarchy,
                     os.path.join(settings.CACHE_DIR, 'goitaikei.cache'),
-                    dependencies=[__file__, _index_file, _leaf_file],
+                    dependencies=[__file__, index_file, leaf_file],
                 )
-            cls._cached = buildNewHierarchy()
+            cls._cached = build_new_hierarchy()
 
         return cls._cached
 
     # PRIVATE METHODS
     #------------------------------------------------------------------------#
 
-    def _build_tree(self, leafFile, indexFile):
+    def _build_tree(self, leaf_file, index_file):
         """Build the main Goitaikei tree."""
         # Parse a list of words for each category.
-        idToWords = self._parseWordCategories(leafFile)
+        id_to_words = self._parse_word_categories(leaf_file)
 
         # Get the tree structure of the information.
-        idToNode, idToWords = self._parseTreeStructure(indexFile, idToWords)
+        id_to_node, id_to_words = self._parse_tree_structure(   
+                index_file, id_to_words)
 
         # Add words to the tree structure.
-        for id, words in idToWords.iteritems():
+        for id, words in id_to_words.iteritems():
             # Ignore silently categories missing from our id index.
-            if id in idToNode:
-                idToNode[id].attrib['words'].update(words)
+            if id in id_to_node:
+                id_to_node[id].attrib['words'].update(words)
 
-        return idToNode[_rootId]
+        return id_to_node[_root_id]
 
     #------------------------------------------------------------------------#
 
-    def _buildExtras(self):
+    def _build_extras(self):
         """Build in secondary features for convenience."""
         # Give each node a set of all characters used in that node, and
         # generate a map from words to all nodes containing them.
-        wordsToClasses = {}
+        words_to_classes = {}
         n_words = 0
         for node in self.root.walk():
             char_set = set()
             for word in node['words']:
                 char_set.update(word)
-                if word not in wordsToClasses:
-                    wordsToClasses[word] = set([node])
+                if word not in words_to_classes:
+                    words_to_classes[word] = set([node])
                 else:
-                    wordsToClasses[word].add(node)
+                    words_to_classes[word].add(node)
 
                 n_words += 1
 
             node['chars'] = char_set
 
-        self.wordsToClasses = wordsToClasses
+        self.words_to_classes = words_to_classes
 
         # Append node frequency.
-        word_dist = FreqDist.from_file(_jpWordFreqFile)
-        for word in wordsToClasses:
+        word_dist = FreqDist.from_file(settings.JP_WORD_COUNTS_FILE)
+        for word in words_to_classes:
             word_dist.inc(word)
 
         for node in self.root.walk_postorder():
             assert 'prob' not in node
-            nodeFreq = 0.0
+            node_freq = 0.0
             for word in node['words']:
-                wordFreq = word_dist.count(word)
-                nodeFreq += wordFreq / float(len(wordsToClasses[word]))
-            node['prob'] = nodeFreq / word_dist.total
-            node['cProb'] = node['prob'] + \
-                    sum(n['cProb'] for n in node.children.itervalues())
+                word_freq = word_dist.count(word)
+                node_freq += word_freq / float(len(words_to_classes[word]))
+            node['prob'] = node_freq / word_dist.total
+            node['c_prob'] = node['prob'] + \
+                    sum(n['c_prob'] for n in node.children.itervalues())
             assert 0 <= node['prob'] <= 1
-            assert 0 <= node['cProb'] <= 1
+            assert 0 <= node['c_prob'] <= 1
 
         return
 
-    #------------------------------------------------------------------------#
-
-    def _loadUpperNodes(self, indexFile):
+    def _load_upper_nodes(self, index_file):
         """
         Loads the tree-structure of categories as a directed graph, from 
         child to parent. Ignores category membership of these nodes.
         """
-        i_stream = sopen(indexFile)
+        i_stream = sopen(index_file)
         for line in i_stream:
-            id, word, parent, wordCategories = line.rstrip().split()
+            id, word, parent, word_categories = line.rstrip().split()
             id = int(id)
-            assert word not in self.wordToId, "Doubling up on %s" % word
-            self.wordToId[word] = id
+            assert word not in self.word_to_id, "Doubling up on %s" % word
+            self.word_to_id[word] = id
             if parent != '*':
                 # A non-root node.
-                self.categoryParent[id] = int(parent)
+                self.category_parent[id] = int(parent)
             else:
                 # The single root node.
-                self.categoryParent[id] = None
+                self.category_parent[id] = None
 
-            wordCategories = [w for w in wordCategories.split(':') if w != '*']
-            self.wordCategories[id] = set(map(int, wordCategories))
+            word_categories = [w for w in word_categories.split(':')
+                    if w != '*']
+            self.word_categories[id] = set(map(int, word_categories))
 
         i_stream.close()
 
         return
 
-    #------------------------------------------------------------------------#
-
-    def _loadLeafNodes(self, leafFile):
+    def _load_leaf_nodes(self, leaf_file):
         """
         Loads all the words which aren't themselves categories.
         """
-        i_stream = sopen(leafFile)
+        i_stream = sopen(leaf_file)
         for line in i_stream:
             line_objs = line.rstrip(':\n').split(':')
             word = line_objs[0]
             categories = map(int, line_objs[1:])
-            assert word not in self.wordToId
-            id = self.wordToId[word]
-            assert word not in self.wordCategories
-            self.wordCategories[word] = set(categories)
+            assert word not in self.word_to_id
+            id = self.word_to_id[word]
+            assert word not in self.word_categories
+            self.word_categories[word] = set(categories)
         i_stream.close()
         return
 
-    #------------------------------------------------------------------------#
-
-    def _parseWordCategories(self, _leaf_file):
+    def _parse_word_categories(self, _leaf_file):
         """
         Determine the categories of all leaf words.
         """
-        wordToIds = {}
+        word_to_ids = {}
         i_stream = sopen(_leaf_file)
         for line in i_stream:
             line_objs = line.rstrip(':\n').split(':')
             word = line_objs[0]
             category_ids = map(int, line_objs[1:])
-            wordToIds[word] = category_ids
+            word_to_ids[word] = category_ids
         i_stream.close()
 
-        idToWords = invertMapping(wordToIds)
+        id_to_words = invertMapping(word_to_ids)
 
-        return idToWords
+        return id_to_words
 
-    #------------------------------------------------------------------------#
-
-    def _parseTreeStructure(self, indexFile, idToWords):
+    def _parse_tree_structure(self, index_file, id_to_words):
         """
         Determine the tree structure of the ontology.
         """
         # Parse the category structure itself.
-        idToNode = {}
-        i_stream = sopen(indexFile)
+        id_to_node = {}
+        i_stream = sopen(index_file)
         for line in i_stream:
             id, word, parent, categories = line.rstrip().split()
             id = int(id)
 
             # Build the node itself.
             node = TreeNode(word, attrib={'id': id, 'words': set()})
-            idToNode[id] = node
+            id_to_node[id] = node
 
             # Add link from its parent.
             if parent != '*':
                 parent = int(parent)
                 assert parent < id
-                idToNode[parent].add_child(node)
+                id_to_node[parent].add_child(node)
 
             # Store additional word categories.
             categories = [int(w) for w in categories.split(':') if w != '*']
             for category_id in categories:
-                if category_id in idToWords:
-                    idToWords[category_id].append(word)
+                if category_id in id_to_words:
+                    id_to_words[category_id].append(word)
                 else:
-                    idToWords[category_id] = [word]
+                    id_to_words[category_id] = [word]
 
         i_stream.close()
 
-        return idToNode, idToWords
+        return id_to_node, id_to_words
 
-    #------------------------------------------------------------------------#
-
-#----------------------------------------------------------------------------#

src/test_goitaikei.py

 # -*- coding: utf-8 -*-
 #----------------------------------------------------------------------------#
-# testGoitaikei.py
+# test_goitaikei.py
 # Lars Yencken <lars.yencken@gmail.com>
 # vim: ts=4 sw=4 sts=4 et tw=78:
 # Fri Jun  8 09:57:21 2007
 #----------------------------------------------------------------------------#
 
 def suite():
-    testSuite = unittest.TestSuite((
+    test_suite = unittest.TestSuite((
             unittest.makeSuite(GoitaikeiTestCase)
         ))
-    return testSuite
+    return test_suite
 
 #----------------------------------------------------------------------------#
 
     def setUp(self):
         self.obj = GoitaikeiHierarchy.get_cached()
         self.root = self.obj.root
-        self.ekiPath = [u'具体', u'場', u'施設', u'公共施設', u'駅・港', u'駅']
-        self.badPath = [u'具体', u'場', u'sour eggs']
+        self.eki_path = [u'具体', u'場', u'施設', u'公共施設', u'駅・港', u'駅']
+        self.bad_path = [u'具体', u'場', u'sour eggs']
         pass
 
-    def testFindNode(self):
+    def test_find_node(self):
         """Tests finding a node by label."""
         eki_node = self.root.find_node(u'駅')
-        self.assertEqual(eki_node.label, u'駅')
+        self.assert_equal(eki_node.label, u'駅')
         assert u'秋葉原' in eki_node.attrib['words']
 
-    def testGetPath(self):
+    def test_get_path(self):
         """Test fetching a node at a known path."""
-        eki_node = self.root.get_path(self.ekiPath)
-        self.assertEqual(eki_node.label, u'駅')
+        eki_node = self.root.get_path(self.eki_path)
+        self.assert_equal(eki_node.label, u'駅')
         assert u'秋葉原' in eki_node.attrib['words']
-        self.assertEqual(eki_node, self.root.find_node(u'駅'))
+        self.assert_equal(eki_node, self.root.find_node(u'駅'))
         return
 
-    def testGetBadPath(self):
+    def test_get_bad_path(self):
         """Tests fetching a bad path."""
-        self.assertRaises(KeyError, self.root.get_path, self.badPath)
+        self.assert_raises(KeyError, self.root.get_path, self.bad_path)
 
-    def testProbabilities(self):
+    def test_probabilities(self):
         """Tests that node probabilities steadily decrease down the tree."""
         for node in self.root.walk():
             if node.parent is not None:
-                assert node.parent['cProb'] >= node['cProb']
+                assert node.parent['c_prob'] >= node['c_prob']
         return
 
-    def testCopy(self):
+    def test_copy(self):
         c = self.root.copy()
         
         # Check they share the same number of nodes.
-        self.assertEqual(len(list(c.walk())), len(list(self.root.walk())))
+        self.assert_equal(len(list(c.walk())), len(list(self.root.walk())))
 
         # Mark every node in the copy, and make sure that none of the original
         # nodes were marked (and thus shared with the copy).
 
         return
 
-    def testPrune(self):
+    def test_prune(self):
         n_nodes = 2705
-        self.assertEqual(n_nodes, len(list(self.root.walk())))
+        self.assert_equal(n_nodes, len(list(self.root.walk())))
 
         node = self.root.prune(lambda x: len(x.attrib['words']) > 0)
-        self.assertEqual(n_nodes, len(list(node.walk())))
+        self.assert_equal(n_nodes, len(list(node.walk())))
 
         node = self.root.prune(lambda x: len(x.attrib['words']) > 10)
         assert 0 < len(list(node.walk())) < n_nodes
         node = self.root.prune(lambda x: len(x.attrib['words']) > 100)
         assert 0 < len(list(node.walk())) < n_nodes
 
-    def testWalk(self):
+    def test_walk(self):
         for node in self.root.walk_postorder():
             for child in node.children.values():
-                assert 'isMarked' in child.attrib
-            node.attrib['isMarked'] = True
+                assert 'is_marked' in child.attrib
+            node.attrib['is_marked'] = True
         return
 
     def tearDown(self):
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.