Anonymous avatar Anonymous committed 9b5c805

[svn] Issue105: Added a new script to run the theoretical rank experiment, and added
some supporting features to lookup indexes, in particular a method to
iterate over words and their readings. The script generates statistics, but
doesn't yet dump them to a latex table.

Comments (0)

Files changed (6)

eval/rankExperiment.py

+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#----------------------------------------------------------------------------#
+# rankExperiment.py
+# Lars Yencken <lars.yencken@gmail.com>
+# vim: ts=4 sw=4 sts=4 et tw=78:
+# Tue Mar 27 12:39:58 JST 2007
+#
+#----------------------------------------------------------------------------#
+
+import os, sys, optparse
+import random
+
+from tools.common import sopen
+from tools.stats import mean
+from pinyin.lookupIndex import LookupIndex
+from pinyin.common import Language
+from learning.freqDist import FreqDist
+from www import settings
+
+#----------------------------------------------------------------------------#
+# PUBLIC
+#----------------------------------------------------------------------------#
+ 
+def rankExperiment(outputFile):
+    """ Perform the rank experiment in the kana direction.
+
+        For every (word, readingCandidates) pair in the Chinese lookup
+        index, ranks the word in the query results for its best, a random,
+        and its worst reading candidate, bucketed by word length. Results
+        are printed to stdout.
+
+        NOTE(review): outputFile is currently unused -- nothing is written
+        to it yet; all statistics go to stdout.
+    """
+    chineseIndex = LookupIndex.getCached(Language.Chinese)
+
+    # Distribution over word lengths, for the summary printed at the end.
+    lengthDist = FreqDist()
+
+    # One list of ranks per word length, for each candidate-selection policy.
+    bestRankByLen = {}
+    randomRankByLen = {}
+    worstRankByLen = {}
+    for i in range(1, settings.MAX_WORD_LENGTH+1):
+        bestRankByLen[i] = []
+        worstRankByLen[i] = []
+        randomRankByLen[i] = []
+
+    for word, readingCandidates in chineseIndex.iterreadings():
+        if not readingCandidates:
+            # XXX is it ok to skip this case?
+            continue
+        lengthDist.increment(len(word))
+
+        # Candidates are (reading, score) pairs; sort best-scoring first.
+        readingCandidates.sort(key=lambda x: x[1], reverse=True)
+        bestReading = readingCandidates[0][0]
+        randomReading = random.choice(readingCandidates)[0]
+        worstReading = readingCandidates[-1][0]
+
+        # Record the rank of the word under each chosen reading (None if
+        # the word does not appear in the results for that reading).
+        bestRankByLen[len(word)].append(
+                _getRank(chineseIndex, bestReading, word)
+            )
+        randomRankByLen[len(word)].append(
+                _getRank(chineseIndex, randomReading, word)
+            )
+        worstRankByLen[len(word)].append(
+                _getRank(chineseIndex, worstReading, word)
+            )
+
+    print 'Best'
+    _printTable(bestRankByLen)
+    print '\nRandom'
+    _printTable(randomRankByLen)
+    print '\nWorst'
+    _printTable(worstRankByLen)
+
+    print 'Word length distribution:'
+    for i in range(1, settings.MAX_WORD_LENGTH+1):
+        print '%d %.2f' % (i, lengthDist.prob(i))
+
+    return
+
+#----------------------------------------------------------------------------#
+# PRIVATE
+#----------------------------------------------------------------------------#
+
+def _getRank(index, query, word):
+    """ Returns the rank of the word in the results for the given query.
+        Returns None in the case the word is not found for the given query.
+    """
+    results = index.foreignReadingQuery(query)
+    try:
+        # Ranks are 1-based: the top result has rank 1.
+        return [r.word for r in results].index(word) + 1
+    except ValueError:
+        # word was absent from the result list.
+        return None
+
+#----------------------------------------------------------------------------#
+
+def _printTable(rankDict):
+    """ Prints one line per word length: the mean rank over the found
+        words and the coverage (percentage of words that had any rank,
+        i.e. were not None).
+
+        NOTE(review): if every rank for some length is None (or the rank
+        list is empty), mean()/the division below would presumably fail --
+        confirm the inputs always contain at least one found word per
+        length.
+    """
+    ranksByLength = rankDict.items()
+    ranksByLength.sort()
+
+    for length, rankList in ranksByLength:
+        # Drop the misses (None entries) before averaging.
+        filteredRanks = [r for r in rankList if r is not None]
+        coverage = 100 * len(filteredRanks) / float(len(rankList))
+        print '%d: %.2f (%%%.2f)' % (length, mean(filteredRanks), coverage)
+
+    return
+
+#----------------------------------------------------------------------------#
+# MODULE EPILOGUE
+#----------------------------------------------------------------------------#
+
+def _createOptionParser():
+    """ Creates an option parser instance to handle command-line options.
+
+        Only a --debug flag is defined; the single positional argument
+        (the output latex filename) is validated by main().
+    """
+    usage = \
+"""%prog [options] outputFile.tex
+
+Performs a rank analysis experiment over all possible kana readings of each
+dictionary word. Writes the results to a latex table at the specified
+filename."""
+
+    parser = optparse.OptionParser(usage)
+
+    parser.add_option('--debug', action='store_true', dest='debug',
+            default=False, help='Enables debugging mode [False]')
+
+    return parser
+
+#----------------------------------------------------------------------------#
+
+def main(argv):
+    """ The main method for this module.
+
+        Expects exactly one positional argument (the output file); prints
+        usage and exits with status 1 otherwise.
+    """
+    parser = _createOptionParser()
+    (options, args) = parser.parse_args(argv)
+
+    try:
+        [outputFile] = args
+    except:
+        # NOTE(review): bare except -- this also swallows SystemExit and
+        # KeyboardInterrupt; `except ValueError:` would be sufficient here.
+        parser.print_help()
+        sys.exit(1)
+
+    # Avoid psyco in debugging mode, since it merges stack frames.
+    if not options.debug:
+        try:
+            import psyco
+            psyco.profile()
+        except:
+            # psyco is optional; run unaccelerated if it is unavailable.
+            pass
+
+    rankExperiment(outputFile)
+    
+    return
+
+#----------------------------------------------------------------------------#
+
+if __name__ == '__main__':
+    main(sys.argv[1:])
+
+#----------------------------------------------------------------------------#
+  
+# vim: ts=4 sw=4 sts=4 et tw=78:
+

pinyin/lookupIndex.py

 import dict.pinyinTable
 import learning.common
 from tools.smartCache import diskProxyDirect
+from tools.progressBar import withProgress
+from tools.common import sopen
 
 import hanziModel, hanziReadingModel
 import kanjiModel, kanjiReadingModel
         self.language = language
 
         # Build word and reading indexes.
-        self.words = wordDict.getWordDict(language)
+        self.words = wordDict.WordDict.getCached(language)
 
         # Build a transliteration index.
         self._loadTransliterations()
 
         # Build a reverse reading model.
-        self._loadReadingModels()
+        self.readingModel = getReadingModel(language)
+        self.reverseReadingMap = getReverseMap(language)
 
         self._logStream.close()
         self._logStream = None
         return '<LookupIndex object for %s>' % str(self.language)
 
     #------------------------------------------------------------------------#
+
+    def iterreadings(self):
+        """ Returns an iterator over (word, foreignReadings) pairs.
+
+            foreignReadings is a list of (reading, logProb) pairs. Words
+            containing symbols unknown to the models are skipped and
+            logged to log.out in the working directory.
+        """
+        oStream = sopen('log.out', 'w')
+        for word in withProgress(self.words.keys(), 30):
+            try:
+                readings = {}
+                for translit, logProbA in self.translit.sequenceCandidates(word,
+                        ''.join):
+                    # NOTE(review): the inner loop never uses translit and
+                    # passes word (not translit) to the reading model, so
+                    # the same reading candidates are re-scored once per
+                    # transliteration -- confirm this is intentional.
+                    for reading, logProbB in \
+                            self.readingModel.sequenceCandidates(word, ' '.join):
+                        logProb = logProbA + logProbB
+                        # NOTE(review): duplicate readings have their log
+                        # probabilities summed, not log-added -- verify
+                        # this is the intended combination rule.
+                        if readings.has_key(reading):
+                            readings[reading] += logProb
+                        else:
+                            readings[reading] = logProb
+                readings = readings.items()
+            except learning.common.UnknownSymbolError:
+                # Word uses a symbol outside the models; log and skip it.
+                print >> oStream, word
+                continue
+
+            yield word, readings
+
+        oStream.close()
+
+        return
+
+    #------------------------------------------------------------------------#
+
+    # In-memory cache of LookupIndex instances, keyed by language.
+    _cachedLookupIndex = {}
+
+    @staticmethod
+    def getCached(language):
+        """ A method for getting a lookup index which is cached in memory. 
+
+            On a cache miss the index is built via diskProxyDirect, which
+            additionally caches the constructed index on disk, invalidated
+            when this module or the reading models change.
+        """
+        if language not in LookupIndex._cachedLookupIndex:
+            method = diskProxyDirect(
+                    lambda: LookupIndex(language),
+                    os.path.join(www.settings.CACHE_DIR,
+                            'lookupIndex_%s.cache' % language),
+                    [__file__, hanziReadingModel, kanjiReadingModel],
+                )
+            # Cache miss, add a new lookup index for this language.
+            LookupIndex._cachedLookupIndex[language] = method()
+        
+        return LookupIndex._cachedLookupIndex[language]
+
+    #------------------------------------------------------------------------#
     # PRIVATE
     #------------------------------------------------------------------------#
 
             search.
         """
         # Construct a transliteration model.
-        if self.language == Language.Chinese:
-            model = kanjiModel.getTransliterationModel()
-        else:
-            assert self.language == Language.Japanese
-            model = hanziModel.getTransliterationModel()
+        model = getTransliterationModel(self.language)
 
         # Initialize an array indexed by the word length.
         translitByLength = []
                 self._logProblem(symbol, "unknown symbol for word %s" % word)
                 continue
 
+        self.translit = model # XXX added this temporarily
         self.translitReverse = translitReverse
         self.translitByLength = map(list, translitByLength)
 
 
     #------------------------------------------------------------------------#
 
-    def _loadReadingModels(self):
-        """ Loads a reverse reading model for the given lookup type. If we are
-            looking up Chinese words, this model is P(kanji | kana). If we
-            are looking up Japanese words, this model P(hanzi | pinyin).
-        """
-        if self.language == Language.Chinese:
-            readingModule = kanjiReadingModel
-        else:
-            assert self.language == Language.Japanese
-            readingModule = hanziReadingModel
-
-        self.readingModel = readingModule.getReadingModel()
-        self.reverseReadingMap = readingModule.getReverseMap()
-
-        return
-
-    #------------------------------------------------------------------------#
-
     def _resultsFromTranslitCandidates(self, candidates):
         """ We now have a list of (translit, P(reading|translit)) pairs,
             from which we need to recover the search results.
 
         return charSets
 
+    #------------------------------------------------------------------------#
 
 #----------------------------------------------------------------------------#
 
-_cachedLookupIndex = {}
-
-def getLookupIndex(language):
-    """ A method for getting a lookup index which is cached in memory. 
+def getReadingModel(language):
+    """ Returns the appropriate foreign-word reading model for dictionary
+        lookup of words in the specified language.
     """
-    global _cachedLookupIndex
-
-    if language not in _cachedLookupIndex:
-        method = diskProxyDirect(
-                lambda: LookupIndex(language),
-                os.path.join(www.settings.CACHE_DIR,
-                        'lookupIndex_%s.cache' % language),
-                [__file__, hanziReadingModel, kanjiReadingModel],
-            )
-        # Cache miss, add a new lookup index for this language.
-        _cachedLookupIndex[language] = method()
-    
-    return _cachedLookupIndex[language]
+    # Chinese lookup uses the kanji reading model; Japanese uses hanzi.
+    if language == Language.Chinese:
+        return kanjiReadingModel.getReadingModel()
+    else:
+        assert language == Language.Japanese
+        return hanziReadingModel.getReadingModel()
 
 #----------------------------------------------------------------------------#
 
+def getReverseMap(language):
+    """ Returns the reverse reading map for dictionary lookup of words in
+        the specified language.
+    """
+    if language == Language.Chinese:
+        return kanjiReadingModel.getReverseMap()
+    else:
+        assert language == Language.Japanese
+        return hanziReadingModel.getReverseMap()
+
+#----------------------------------------------------------------------------#
+
+def getTransliterationModel(language):
+    """ Returns the transliteration model for dictionary lookup of words
+        in the specified language.
+    """
+    if language == Language.Chinese:
+        return kanjiModel.getTransliterationModel()
+    else:
+        assert language == Language.Japanese
+        return hanziModel.getTransliterationModel()
+
+#----------------------------------------------------------------------------#

pinyin/queries.py

     # Determine its type.
     queryScripts = detectQueryScripts(query)
     query = kana.normalizeAscii(query)
-    index = lookupIndex.getLookupIndex(Language.Japanese)
+    index = lookupIndex.LookupIndex.getCached(Language.Japanese)
 
     # Generate the results depending on the query type.
     results = None
     # Determine its type.
     queryScripts = detectQueryScripts(query)
     query = kana.normalizeAscii(query)
-    index = lookupIndex.getLookupIndex(Language.Chinese)
+    index = lookupIndex.LookupIndex.getCached(Language.Chinese)
 
     # Generate the results depending on the query type.
     results = None

pinyin/testLookupIndex.py

     """ This class tests the LookupIndex class. 
     """
     def setUp(self):
-        self.index = getLookupIndex(Language.Chinese)
+        self.index = LookupIndex.getCached(Language.Chinese)
         pass
 
     def testNativeHanLookup(self):
     """ This class tests the LookupIndex class. 
     """
     def setUp(self):
-        self.index = getLookupIndex(Language.Japanese)
+        self.index = LookupIndex.getCached(Language.Japanese)
         pass
 
     def testNativeHanLookup(self):

pinyin/testWordDict.py

         """ Tests the Japanese side of the word dict. 
         """
         # Load the dictionary.
-        wordDict = getWordDict(Language.Japanese)
+        wordDict = WordDict.getCached(Language.Japanese)
 
         # Pick a reasonably common example.
         assert u'手紙' in wordDict
         """ Tests the Chinese side of the word dict.
         """
         # Load the dictionary.
-        wordDict = getWordDict(Language.Chinese)
+        wordDict = WordDict.getCached(Language.Chinese)
 
         # Pick a reasonably common example.
         assert u'自信' in wordDict

pinyin/wordDict.py

 
     #------------------------------------------------------------------------#
 
-#----------------------------------------------------------------------------#
+    _cache = {}
 
-def getWordDict(language):
-    """ Fetches a word dictionary for the given language.
-    """
-    method = diskProxyDirect(
-            lambda: WordDict(language),
-            _cacheFile % language,
-            [__file__, _dictFiles[language], _corpusProbs[language]],
-        )
-    return method()
+    @staticmethod
+    def getCached(language):
+        """ Fetches a word dictionary for the given language.
+
+            Instances are memoised in WordDict._cache; a miss builds one
+            via diskProxyDirect, which also caches the built dictionary on
+            disk, invalidated when this module or its data files change.
+        """
+        if not WordDict._cache.has_key(language):
+            method = diskProxyDirect(
+                    lambda: WordDict(language),
+                    _cacheFile % language,
+                    [__file__, _dictFiles[language], _corpusProbs[language]],
+                )
+            WordDict._cache[language] = method()
+
+        return WordDict._cache[language]
+
+    #------------------------------------------------------------------------#
 
 #----------------------------------------------------------------------------#
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.