Lars Yencken / gpalign-py

Commits

Lars Yencken committed c9f8bcb

Updates to use the current versions of cjktools and consoleLog instead of jptools.

Also allows configuration of a data directory, which defaults to the current
directory.

  • Parent commit: 11a6b78
  • Branch: default
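
For reviewers unfamiliar with cjktools: most of the mechanical churn below
replaces explicit codecs/BZ2File stream handling with cjktools.common.sopen.
A minimal sketch of the pattern, assuming (as the hunks below suggest) that
sopen transparently handles UTF-8 decoding and bz2 detection:

    # before: manual encoding and compression handling
    import codecs
    from bz2 import BZ2File

    if filename.endswith('.bz2'):
        stream = codecs.getreader('utf8')(BZ2File(filename, 'r'))
    else:
        stream = codecs.open(filename, 'r', 'utf8')

    # after: one call; sopen is assumed to choose encoding and compression
    from cjktools.common import sopen
    stream = sopen(filename, 'r')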


Files changed (19)

File .hgignore

 build
 dist
 src/__version__.py
+*.cache

File src/alignment.py

 import potentials
 from frequency import FrequencyMap
 
-from jptools import kana
-from jptools.progressBar import ProgressBar
+from cjktools import scripts
+from cjktools.common import sopen
+from consoleLog.progressBar import ProgressBar
 
 import math
-from sets import Set
-import codecs
 import random
 import cPickle as pickle
 
 
         # we write aligned readings as we go, rather than storing them in
         # memory
-        self._output = codecs.open(outputFile, 'w', 'utf8')
+        self._output = sopen(outputFile, 'w')
         self._outputName = outputFile
 
         # ratios for the tf-idf
             entry = ambiguousEntries[i]
             alignments = entry.potentials
 
-            assert len(Set(alignments)) == len(alignments), \
+            assert len(set(alignments)) == len(alignments), \
                     "Readings are not unique"
 
             # update our counts
         """ Calculates the tf-idf score of the alignment passed in based on
             the current model.
         """
-        kanjiScript = kana.Script.kanji
+        kanjiScript = scripts.Script.Kanji
         currentScores = []
 
         gSegments, pSegments = alignment
         for i in range(len(gSegments)):
-            if not kana.scriptType(gSegments[i]) == kanjiScript:
+            if not scripts.scriptType(gSegments[i]) == kanjiScript:
                 continue
 
             gFreq, gpFreq, gpcFreq = self._weightedFreqs(gSegments,
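
The frequencies fetched here feed a per-segment tf-idf score. A rough sketch
of the shape of such a score follows; the real weighting lives in this class,
so the names and formula below are illustrative assumptions only:

    import math

    def tfidfScore(gFreq, gpFreq, gpcFreq):
        # term-frequency factor: how often this reading occurs for the
        # grapheme in this context (all counts assumed positive)
        tf = float(gpcFreq) / gpFreq
        # idf analogue: readings rare for the grapheme weigh more, as in
        # classic idf
        idf = math.log(float(gFreq) / gpFreq)
        return tf * idf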

File src/detectOkurigana.py

-#!/usr/bin/python
+#!/usr/bin/env python
 # -*- coding: utf-8 -*-
 #----------------------------------------------------------------------------#
 # detectOkurigana.py
 
 import os, sys
 import optparse
-import pdb
-import codecs
 
 import potentials
 import dictionary
 from readingModel import ReadingModel
 from okuriganaModel import OkuriganaModel
 import evaluate
+import settings
 
 #----------------------------------------------------------------------------#
 
     """
     okuriganaModel = OkuriganaModel(options)
 
-    inputFile = options.inputFile or 'data/eval-okurigana.data'
+    inputFile = options.inputFile or os.path.join(settings.DATA_DIR,
+            'eval-okurigana.data')
     okuriganaModel.okuriganaAdjustments(inputFile, outputFile)
 
     if not options.inputFile:

File src/dictionary.py

 
 #----------------------------------------------------------------------------#
 
-import codecs
-from bz2 import BZ2File
-
-from jptools import kana
+from cjktools import scripts
+from cjktools.common import sopen
 
 from entry import Entry
 
     """ Determines all the kanji entries available in the input file. The input
         file is assumed to be in edict format.
     """
-    if inputFile.endswith('.bz2'):
-        inputStream = codecs.getreader('utf8')(BZ2File(inputFile, 'r'))
-    else:
-        inputStream = codecs.open(inputFile, 'r', 'utf8')
-
-    rejectionStream = codecs.open('logs/rejected-entries', 'w', 'utf8')
+    inputStream = sopen(inputFile)
+    rejectionStream = sopen('logs/rejected-entries', 'w')
 
     entries = []
     numRejected = 0
         in edict format.
     """
     entries = []
-    inputStream = codecs.open(inputFile, 'r', 'utf8')
+    inputStream = sopen(inputFile, 'r')
 
-    rejectionStream = codecs.open('logs/rejected-entries', 'w', 'utf8')
+    rejectionStream = sopen('logs/rejected-entries', 'w')
 
     numRejected = 0
     for line in inputStream:
-        gString, pString = line.split(':')[0].split('-')
+        gString, pString = line.split(':')[0].split()
         
         if _validEntry(gString, pString):
             entries.append(Entry(gString, pString))
     """ Returns True if the word is only kanji and kana, False otherwise.
     """
     # throw out any grapheme string which contains ascii
-    if kana.Script.ascii in map(kana.scriptType, gString): 
+    if scripts.Script.Ascii in map(scripts.scriptType, gString): 
         return False
 
     # throw out any reading which contains non-kana characters
-    isKana = lambda x: x in (kana.Script.hiragana, kana.Script.katakana)
+    isKana = lambda x: x in (scripts.Script.Hiragana, scripts.Script.Katakana)
 
-    hasNonKana = (filter(isKana, map(kana.scriptType, pString)) != [])
+    hasKana = (filter(isKana, map(scripts.scriptType, pString)) != [])
 
-    return hasNonKana
+    return hasKana
     
     """
     run = 0
     longest = 0
-    kanjiScript = kana.Script.kanji
+    kanjiScript = scripts.Script.Kanji
     for char in gString:
-        if kana.scriptType(char) == kanjiScript:
+        if scripts.scriptType(char) == kanjiScript:
             run += 1
         else:
             if run > longest:
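
Note the separator change above (split('-') → split()): entry lines in the
evaluation data now use whitespace, not '-', between the grapheme and phoneme
strings. Assuming the same layout otherwise, a hypothetical entry changes as:

    # old format
    お土産-おみやげ:お|土産-お|みやげ

    # new format, whitespace-separated
    お土産 おみやげ:お|土産 お|みやげ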

File src/entry.py

 #
 #----------------------------------------------------------------------------#
 
-from jptools import kana
+from cjktools import scripts
 
 #----------------------------------------------------------------------------#
 
 
         # normalise the graphical form
         if u'々' in gString:
-            gString = kana.insertDuplicateKanji(gString)
+            gString = self._insertDuplicateKanji(gString)
         self.gString = gString
 
         # have we aligned yet?
         """
         assert self.aligned
 
-        alignment = '-'.join(map(lambda x: '|'.join(x), self.alignment))
+        alignment = ' '.join(map(lambda x: '|'.join(x), self.alignment))
 
-        original = '%s-%s'%(self.gString_original, ''.join(self.alignment[1]))
+        original = '%s %s' % (self.gString_original, ''.join(self.alignment[1]))
     
         return ':'.join((original, alignment))
 
+    def _insertDuplicateKanji(self, gString):
+        result = []
+        kanjiScript = scripts.Script.Kanji
+        for i, c in enumerate(gString):
+            if c == u'々' and i > 0 and \
+                    scripts.scriptType(gString[i-1]) == kanjiScript:
+                # insert a duplicate of the previous kanji, taken from the
+                # normalised result so that runs of 々 resolve correctly
+                result.append(result[-1])
+            else:
+                result.append(c)
+
+        return ''.join(result)
+
     def __hash__(self):
         if not self.alignment:
             return hash(self.gString + self.pString)
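
A quick sanity check of the new helper's intended behaviour on the iteration
mark, using a hypothetical entry (人々 'people', read ひとびと):

    from entry import Entry

    entry = Entry(u'人々', u'ひとびと')
    assert entry.gString == u'人人'           # 々 expanded to the repeated kanji
    assert entry.gString_original == u'人々'  # original form kept for output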

File src/errors.py

-#!/usr/bin/python
+#!/usr/bin/env python
 # -*- coding: utf-8 -*-
 #----------------------------------------------------------------------------#
 # errors.py
 #
 #----------------------------------------------------------------------------#
 
-import codecs
-import string
-from sets import Set
-import pdb
+import okuriganaModel
 
-import okuriganaModel
+from cjktools.common import sopen
 
 #----------------------------------------------------------------------------#
 
     """ Separates out the errors from the alignments, and tries to classify
         them.
     """
-    newUtfFile = lambda x: codecs.open(baseFile + x, 'w', 'utf8')
+    newUtfFile = lambda x: sopen(baseFile + x, 'w')
 
-    inputFile = codecs.open(baseFile, 'r', 'utf8')
+    inputFile = sopen(baseFile, 'r')
 
-    good = Set()
-    bad = Set()
-    badOkuri = Set()
-    badGapping = Set()
-    badAlign = Set()
-    badConstrain = Set()
+    good = set()
+    bad = set()
+    badOkuri = set()
+    badGapping = set()
+    badAlign = set()
+    badConstrain = set()
 
     for line in inputFile:
         original, testCase, correctCase = _parseLine(line)
 #----------------------------------------------------------------------------#
 
 def _linesToFile(lineSet, extension, baseName):
-    oStream = codecs.open(baseName + extension, 'w', 'utf8')
+    oStream = sopen(baseName + extension, 'w')
     oStream.writelines(lineSet)
     oStream.close()
     return 

File src/evaluate.py

-#!/usr/bin/python
+#!/usr/bin/env python
 # -*- coding: utf-8 -*-
 #----------------------------------------------------------------------------#
 # evaluate.py
 
 import os, sys
 import optparse
-import codecs
+
+from cjktools import sequences
+from cjktools.common import sopen
 
 import errors
-from jptools import functional
+import settings
 
 #----------------------------------------------------------------------------#
 
     """ Evaluates the alignments provided in the prediction file, writing the
         results to the results file.
     """
-    validationFile = 'data/eval-alignment.data'
+    validationFile = os.path.join(settings.DATA_DIR, 'eval-alignment.data')
 
-    iStream = codecs.open(predictionFile, 'r', 'utf8')
+    iStream = sopen(predictionFile, 'r')
     results = {}
 
     validationCases = _listEntries(validationFile)
     predictionDict = dict(predictionCases)
 
     matching = lambda x: x in validationCases
-    good, bad = functional.separate(matching, predictionCases)
+    good, bad = sequences.separate(matching, predictionCases)
 
     results['good'] = good
 
 
     orFunc = lambda x, y: x or y
     hasGapping = lambda x: reduce(orFunc, map(lambda y: '<' in y, x[2]))
-    gapping, align = functional.separate(hasGapping, bad)
+    gapping, align = sequences.separate(hasGapping, bad)
 
     results['gapping'] = gapping
     results['align'] = align
     """ Evaluates the alignments provided in the prediction file, writing the
         results to the results file.
     """
-    validationFile = 'data/eval-okurigana.data'
+    validationFile = os.path.join(settings.DATA_DIR, 'eval-okurigana.data')
 
-    iStream = codecs.open(predictionFile, 'r', 'utf8')
+    iStream = sopen(predictionFile, 'r')
     results = {}
 
     validationCases = _listEntries(validationFile)
     predictionDict = dict(predictionCases)
 
     matching = lambda x: x in validationCases
-    good, bad = functional.separate(matching, predictionCases)
+    good, bad = sequences.separate(matching, predictionCases)
 
     results['good'] = good
 
         percent = 100.0*number/5000.0
         print >> summaryStream, '%s    %4d    %6.2f%%' % (key, number, percent)
         print '%s    %4d    %6.2f%%' % (key, number, percent)
-        oStream = codecs.open(resultsFile + '.' + key, 'w', 'utf8')
+        oStream = sopen(resultsFile + '.' + key, 'w')
         for line in keyEntries:
             print >> oStream, ':'.join(line)
         oStream.close()
 
 def _listEntries(filename):
     entries = []
-    iStream = codecs.open(filename, 'r', 'utf8')
+    iStream = sopen(filename, 'r')
 
     for line in iStream:
         key, value = line.strip().split(':', 1)
 def _getEntries(filename):
     """ Creates a dictionary of all the entries in the given file.
     """
-    lines = codecs.open(filename, 'r', 'utf8').readlines()
+    lines = sopen(filename, 'r').readlines()
 
     entries = {}
     for line in lines:
     nLines = 0
     nCorrect = 0
     nMissing = 0
-    oStream = codecs.open(resultFile, 'w', 'utf8')
+    oStream = sopen(resultFile, 'w')
     for key, alignment in correctEntries.iteritems():
         testAlignment = testEntries.get(key, '???')
 
 def sortFile(filename):
     """ Sorts the file in a line-based manner.
     """
-    iStream = codecs.open(filename, 'r', 'utf8')
+    iStream = sopen(filename, 'r')
     lines = iStream.readlines()
     iStream.close()
 
     lines.sort()
 
-    oStream = codecs.open(filename, 'w', 'utf8')
+    oStream = sopen(filename, 'w')
     oStream.writelines(lines)
     oStream.close()
 
 
     parser = optparse.OptionParser(usage)
 
     parser.add_option('-e', action='store', dest='correctFile',
-        default='data/evaluation.data', help='The file of correct evaluations')
+        default=os.path.join(settings.DATA_DIR, 'evaluation.data'),
+        help='The file of correct evaluations')
 
     return parser
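
Here sequences.separate takes over from jptools' functional.separate: it
splits a sequence into matches and non-matches by predicate. A sketch of the
expected semantics, inferred from its use above:

    from cjktools import sequences

    evens, odds = sequences.separate(lambda x: x % 2 == 0, range(10))
    # evens == [0, 2, 4, 6, 8], odds == [1, 3, 5, 7, 9]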
 

File src/formatEval.py

-#!/usr/bin/python
+#!/usr/bin/env python
 # -*- coding: utf-8 -*-
 #----------------------------------------------------------------------------#
 # formatEval.py
 
 import os, sys
 import optparse
-import codecs
+from cjktools.common import sopen
 
 from entry import Entry
 
 
 def formatEvalFile(inputFile, outputFile):
     entries = _parseEntries(inputFile)
-    oStream = codecs.open(outputFile, 'w', 'utf8')
+    oStream = sopen(outputFile, 'w')
 
     for entry in entries:
         lineA = entry.gString.ljust(10, u' ')
 
 def _parseEntries(inputFile):
     entries = []
-    for line in codecs.open(inputFile, 'r', 'utf8'):
+    for line in sopen(inputFile, 'r'):
         base, attempt, actual = line.strip().split(':')
 
-        gString, pString = base.split('-')
+        gString, pString = base.split()
         entry = Entry(gString, pString)
         fixify = lambda x: map(lambda y: y.strip('|').split('|'), 
-                x.split('-'))
+                x.split())
         attempt = fixify(attempt)
         actual = fixify(actual)
 

File src/frequency.py

 #
 #----------------------------------------------------------------------------#
 
-from jptools import kana
+from cjktools import scripts
 
 #----------------------------------------------------------------------------#
 
     def addCounts(self, alignment):
         """ This method updates all the counts associated with the entry.
         """
-        kanjiScript = kana.Script.kanji
+        kanjiScript = scripts.Script.Kanji
         gSegments, pSegments = alignment
         for i in range(len(gSegments)):
-            if kana.scriptType(gSegments[i]) == kanjiScript:
+            if scripts.scriptType(gSegments[i]) == kanjiScript:
                 g, gp, gpc = self._getContext(gSegments, pSegments, i)
 
                 if not self._graphemes.has_key(g):
     def delCounts(self, alignment):
         """ This method updates all the counts associated with the entry.
         """
-        kanjiScript = kana.Script.kanji
+        kanjiScript = scripts.Script.Kanji
         gSegments, pSegments = alignment
         for i in range(len(gSegments)):
-            if kana.scriptType(gSegments[i]) == kanjiScript:
+            if scripts.scriptType(gSegments[i]) == kanjiScript:
                 g, gp, gpc = self._getContext(gSegments, pSegments, i)
                 gCount, gpDict = self._graphemes[g]
                 gCount -= 1

File src/gpalign.py

-#!/usr/bin/python
+#!/usr/bin/env python
 # -*- coding: utf-8 -*-
 #----------------------------------------------------------------------------#
 # align.py
 import os, sys
 import optparse
 import pdb
-import codecs
+from cjktools.common import sopen
 
 import potentials
 import dictionary
 
 #----------------------------------------------------------------------------#
 
-def performSegmentation(outputFile, options):
+def performSegmentation(inputFile, outputFile, options):
     """ The main method for this module. Performs the entire segmentation run,
         taking an edict dictionary as input and producing a segmented output
         for each kanji input row.
     # read in edict dictionary
     if not options.edict:
         print 'Reading evaluation entries'
-        entries, numRejected = dictionary.evaluationEntries(
-                'data/eval-alignment.data')
+        entries, numRejected = dictionary.evaluationEntries(inputFile)
     else:
         print 'Reading edict entries'
-        entries, numRejected = dictionary.edictEntries('data/edict.bz2')
+        entries, numRejected = dictionary.edictEntries(inputFile)
 
     print '--> Found %d entries (rejected %d)' % (len(entries), numRejected)
 
     """ Creates an option parser instance to handle command-line options.
     """
     usage = \
-"""%prog [options] outputFile
+"""%prog [options] inputFile outputFile
 
 An efficient implementation of the Baldwin-Tanaka automated grapheme-phoneme
 alignment algorithm based on TF-IDF. By default, it uses an evaluation data set
     parser = createOptionParser()
     (options, args) = parser.parse_args(argv)
 
-    if len(args) != 1:
+    if len(args) != 2:
         parser.print_help()
         sys.exit(1)
 
-    outputFile = args[0]
+    inputFile, outputFile = args
 
     if options.random:
         options.tfHeuristic = False
         options.idfHeuristic = False
 
-    performSegmentation(outputFile, options)
+    performSegmentation(inputFile, outputFile, options)
 
     return
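
With the new signature, the script takes an explicit input file rather than a
hard-coded data/ path. A typical evaluation run now looks like this (file
names are examples; edict mode is switched on by the parser's existing edict
option, defined outside this hunk):

    python src/gpalign.py data/eval-alignment.data alignment.out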
 

File src/okuriganaModel.py

 
 from os import path
 import cPickle as pickle
-from bz2 import BZ2File
-import codecs
 import re
-import pdb
 
-from jptools import kana, smartCache, enum, progressBar
-from jptools.functional import *
+from cjktools import scripts, smartCache, enum
+from cjktools.sequences import *
+from cjktools.common import sopen
 
-from sets import Set
 from entry import Entry
 from readingModel import ReadingModel
+import settings
 
         okurigana.
     """
     assert entry.alignment, "How can an empty entry have okurigana?"
-    hiragana = kana.Script.hiragana
-    kanji = kana.Script.kanji
+    hiragana = scripts.Script.Hiragana
+    kanji = scripts.Script.Kanji
 
     gSegments = entry.alignment[0]
 
     lastSegmentType = hiragana
     for i in range(len(gSegments)):
-        segmentType = kana.scriptType(gSegments[i])
+        segmentType = scripts.scriptType(gSegments[i])
 
         if segmentType == hiragana and lastSegmentType == kanji:
             # potential okurigana case
         otherwise.
     """
     for seg in gSegments:
-        if len(kana.scriptBoundaries(seg)) > 1:
+        if len(scripts.scriptBoundaries(seg)) > 1:
             return True
     else:
         return False
 
     i = 0
     while i < len(gSegments):
-        boundaries = kana.scriptBoundaries(gSegments[i])
+        boundaries = scripts.scriptBoundaries(gSegments[i])
         if len(boundaries) == 1:
             new_gSegments += (gSegments[i],)
             new_pSegments += (pSegments[i],)
             i += 1
         elif len(boundaries) == 2:
             kanjiPart, kanaPart = boundaries
-            if i == len(gSegments)-1 or kana.scriptType(kanaPart) != \
-                    kana.scriptType(gSegments[i+1]):
+            if i == len(gSegments)-1 or scripts.scriptType(kanaPart) != \
+                    scripts.scriptType(gSegments[i+1]):
                 # last segment, or differing segments
                 new_gSegments += (kanjiPart, kanaPart)
                 new_pSegments += (pSegments[i][:-len(kanaPart)],
             entries.
         """
         print "Creating a new okurigana model"
-        cacheFile = 'data/okuriganaModel.cache'
-        edictFile = 'data/edict.bz2'
+        cacheFile = path.join(settings.CACHE_DIR, 'okuriganaModel.cache')
+        edictFile = path.join(settings.DATA_DIR, 'edict.bz2')
 
         print '--> Cooccurrence threshold set to %d' % options.okuriThreshold
         self._threshold = options.okuriThreshold
 
-        self._okurigana = smartCache.useCache(cacheFile,
-                threshold=self._threshold)
-
-        if self._okurigana is None:
-            assert path.exists(edictFile)
-
-            print "--> Generating new model from edict"
-            self._okurigana = self._parseEdictEntries(edictFile)
-
-            readingModel = ReadingModel()
-            extraOkurigana = readingModel.getOkurigana()
-            self._addKanjidicOkurigana(extraOkurigana)
-            
-            print "--> Caching model for later"
-            smartCache.recache(self._okurigana, cacheFile, 
-                    ['okuriganaModel.py'],
-                    threshold=self._threshold)
-
-        else:
-            print "--> Using cached model"
+        fetchOkurigana = smartCache.diskProxyDirect(
+                self._rebuildOkurigana,
+                cacheFile,
+                dependencies=['okuriganaModel.py', edictFile],
+            )
+        self._okurigana = fetchOkurigana(edictFile, threshold=self._threshold)
 
         self._evaluationRun = not bool(options.inputFile)
         self._simpleMode = bool(options.simpleOkurigana)
         self._nFixed = 0
         return
 
+
     #------------------------------------------------------------------------#
 
     def okuriganaAdjustments(self, inputFile, outputFile):
             # aligned)
             entryIter = self._resultsInputIter(inputFile)
 
-        oStream = codecs.open(outputFile, 'w', 'utf8')
+        oStream = sopen(outputFile, 'w')
 
         for entry in entryIter:
-            origAlignment = '-'.join((entry.gString_original, entry.pString))
+            origAlignment = ' '.join((entry.gString_original, entry.pString))
             if potentialOkurigana:
                 # potential site, attempt to solve it
                 if self._simpleMode:
             
             print >> oStream, ':'.join((
                     origAlignment,
-                    '-'.join(map(lambda x: '|'.join(x), newAlignment))
+                    ' '.join(map(lambda x: '|'.join(x), newAlignment))
                 ))
 
         print '--> %d cases had shifted alignments' % self._nFixed
         counts = {}
 
         for line in iStream:
-            if not kana.hasKanji(line):
+            if not scripts.hasKanji(line):
                 continue
 
             gString = line.split()[0]
     
     #------------------------------------------------------------------------#
 
+    def _rebuildOkurigana(self, filename):
+        okuriganaMap = self._parseEdictEntries(filename)
+
+        readingModel = ReadingModel()
+        extraOkurigana = readingModel.getOkurigana()
+        self._addKanjidicOkurigana(extraOkurigana, okuriganaMap)
+        return okuriganaMap
+
+    #------------------------------------------------------------------------#
+
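
Both models now fetch through smartCache.diskProxyDirect rather than the old
useCache/recache pair. Judging from the two call sites, it wraps a builder
function with a cache file and a dependency list. A rough mock of the pattern
(not cjktools' actual implementation) for readers without the library:

    import os
    import cPickle as pickle

    def diskProxyDirect(builder, cacheFile, dependencies=()):
        """ Rebuild via builder() whenever the pickled cache is missing or
            older than any dependency; otherwise load the cached result.
        """
        def fetch(*args, **kwargs):
            if os.path.exists(cacheFile):
                cacheTime = os.path.getmtime(cacheFile)
                if all(os.path.getmtime(d) <= cacheTime
                        for d in dependencies if os.path.exists(d)):
                    return pickle.load(open(cacheFile, 'rb'))
            result = builder(*args, **kwargs)
            pickle.dump(result, open(cacheFile, 'wb'),
                    pickle.HIGHEST_PROTOCOL)
            return result
        return fetch
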
     def _parseVerbDetails(self, gString, line, okurigana):
         """ Determine whether this line defines a verb, and if so grab it's
             details for conjugation.
         """
         verbTag = re.compile('\((.*,)*v(.*)\)')
-        kanjiScript = kana.Script.kanji
+        kanjiScript = scripts.Script.Kanji
 
         tagsFound = verbTag.search(line)
         if not tagsFound:
             return
 
         for i in range(len(gString)-1, -1, -1):
-            if kana.scriptType(gString[i]) == kanjiScript:
+            if scripts.scriptType(gString[i]) == kanjiScript:
                 lastKanji = gString[i]
                 trailingKana = gString[i+1:]
                 baseEntry = (trailingKana, verbType)
 
                 if not okurigana.has_key(lastKanji):
-                    okurigana[lastKanji] = Set()
+                    okurigana[lastKanji] = set()
 
                 okurigana[lastKanji].add((trailingKana, OkuriType.verb, 
                         verbType))
 
         for gString, pString in counts.iterkeys():
             key = gString[-1]
-            thisSet = okurigana.setdefault(key, Set())
+            thisSet = okurigana.setdefault(key, set())
             okurigana[key].add((pString, OkuriType.cooccurrence, None))
 
         return
 
     #------------------------------------------------------------------------#
 
-    def _addKanjidicOkurigana(self, kanjidicOkurigana):
+    @staticmethod
+    def _addKanjidicOkurigana(kanjidicOkurigana, okuriganaMap):
         """ Adds okurigana from kanjidic into the full class dictionary of
             okurigana instances.
         """
         for kanji, okurigana in kanjidicOkurigana.iteritems():
-            possibleOkurigana = self._okurigana.setdefault(kanji, Set())
+            possibleOkurigana = okuriganaMap.setdefault(kanji, set())
             for case in okurigana:
                 possibleOkurigana.add((case, OkuriType.kanjidic, None))
 
         """ Updates counts for each okurigana occurence.
         """
 
-        kanjiScript = kana.Script.kanji
-        hiraganaScript = kana.Script.hiragana
+        kanjiScript = scripts.Script.Kanji
+        hiraganaScript = scripts.Script.Hiragana
     
-        segments = list(kana.scriptBoundaries(gString))
+        segments = list(scripts.scriptBoundaries(gString))
         segments.reverse()
 
         lastSeg = segments.pop()
-        lastSegType = kana.scriptType(lastSeg)
+        lastSegType = scripts.scriptType(lastSeg)
 
         while segments:
             thisSeg = segments.pop()
-            thisSegType = kana.scriptType(thisSeg)
+            thisSegType = scripts.scriptType(thisSeg)
 
             if thisSegType == hiraganaScript and lastSegType == kanjiScript:
                 feature = lastSeg, thisSeg
             lastChar = kanaEnding[-1]
             realBase = kanaEnding[:-1]
 
-            assert kana.isLine(lastChar, u'う')
+            assert scripts.isLine(lastChar, u'う')
             conjugates = [kanaEnding]
 
-            masuBase = realBase + kana.toLine(lastChar, u'い')
+            masuBase = realBase + scripts.toLine(lastChar, u'い')
             conjugates.append(masuBase)
             conjugates.append(masuBase + u'ます')
 
     def _evaluationInputIter(self, filename):
         """ Provide an iterator over the evaluation input.
         """
-        iStream = codecs.open(filename, 'r', 'utf8')
+        iStream = sopen(filename, 'r')
 
         toSegments = lambda x: tuple(x.split('|'))
 
             # get the pre-aligned input
             alignedInput, _correctTarget = line.split(':')[:2]
 
-            gString, pString = alignedInput.split('-')
+            gString, pString = alignedInput.split()
             gSegments = toSegments(gString)
             pSegments = toSegments(pString)
 
         """ Iterates over the entries in a results file (directly output from
             the alignment script).
         """
-        iStream = codecs.open(filename, 'r', 'utf8')
+        iStream = sopen(filename, 'r')
         entries = []
 
         toSegments = lambda x: tuple(x.split('|'))
             # although we also have the unaligned input, ignore it for now
             _unalignedInput, alignedInput = line.split(':')[:2]
 
-            gString, pString = alignedInput.split('-')
+            gString, pString = alignedInput.split()
             gSegments = toSegments(gString)
             pSegments = toSegments(pString)
 
             okurigana is okurigana, and just removing all kanji->kana
             boundaries. 
         """
-        hiragana = kana.Script.hiragana
-        kanji = kana.Script.kanji
+        hiragana = scripts.Script.Hiragana
+        kanji = scripts.Script.Kanji
 
         gSegments = entry.alignment[0]
         i = 1
         while i < len(gSegments):
-            lastSegmentType = kana.scriptType(gSegments[i-1])
-            segmentType = kana.scriptType(gSegments[i])
+            lastSegmentType = scripts.scriptType(gSegments[i-1])
+            segmentType = scripts.scriptType(gSegments[i])
 
             if segmentType == hiragana and lastSegmentType == kanji and \
                     gSegments[i] not in (u'の', u'が'):
     def _solveOkurigana(self, entry):
         """ Resolves this case using our full model.
         """
-        hiragana = kana.Script.hiragana
-        kanji = kana.Script.kanji
+        hiragana = scripts.Script.Hiragana
+        kanji = scripts.Script.Kanji
 
         gSegments = entry.alignment[0]
         i = 1
         while i < len(gSegments):
-            lastSegmentType = kana.scriptType(gSegments[i-1])
-            segmentType = kana.scriptType(gSegments[i])
+            lastSegmentType = scripts.scriptType(gSegments[i-1])
+            segmentType = scripts.scriptType(gSegments[i])
 
             if segmentType == hiragana and lastSegmentType == kanji:
                 # potential okurigana case; solve then move a variable
             return default
 
         baseOkuriOptions = self._okurigana[kanjiIndex]
-        kanaOptions = Set()
+        kanaOptions = set()
         for trailingKana, okuriType, subType in baseOkuriOptions:
             if okuriType == OkuriType.verb and self._useVerbs:
                 # verb okurigana

File src/paramSearch-align.py

-#!/usr/bin/python
+#!/usr/bin/env python
 # -*- coding: utf-8 -*-
 #----------------------------------------------------------------------------#
 # scriptedRun.py
 
 warnings.filterwarnings("ignore")
 
-from functional import frange
+from cjktools.sequences import frange
 import stats
 
 argv = sys.argv[1:]

File src/potentials.py

 
 #----------------------------------------------------------------------------#
 
-from jptools import kana
+from cjktools import scripts, kanaTable
 
 import stats
 import potentials
 
 import string
 import sys
-from sets import Set
-import codecs
+from cjktools.common import sopen
 
 #----------------------------------------------------------------------------#
 # PUBLIC METHODS
         second member is a list of (graphemeString, [potentialAlignments]).
     """
     # we record anything which we've overconstrained and can't solve
-    overconstrained = codecs.open('logs/overconstrained', 'w', 'utf8')
+    overconstrained = sopen('logs/overconstrained', 'w')
 
     uniqueEntries = []
     ambiguousEntries = []
         # too many alignments
         return
 
-    assert len(Set(finalAlignments)) == len(finalAlignments), \
+    assert len(set(finalAlignments)) == len(finalAlignments), \
             "duplicate alignments detected"
 
     if len(finalAlignments) == 1:
     """ Determine all possible segmentations of the mixed script entry string
         only, leaving the hiragana reading string untouched for now.
     """
-    kanjiScript = kana.Script.kanji
+    kanjiScript = scripts.Script.Kanji
     combinationSets = []
-    for segment in kana.scriptBoundaries(gString):
-        if len(segment) > 1 and kana.scriptType(segment) == kanjiScript:
+    for segment in scripts.scriptBoundaries(gString):
+        if len(segment) > 1 and scripts.scriptType(segment) == kanjiScript:
             combinationSets.append(stats.segmentCombinations(segment))
         else:
             combinationSets.append([(segment,)])
     """ Creates one or more segmentations which match the kanji segments with
         the reading string.
     """
-    kanjiScript = kana.Script.kanji
+    kanjiScript = scripts.Script.Kanji
 
     # where there's only one segment, no worries
     numSegments = len(gSegments)
         gSegment = gSegments[i]
         # FIXME is this needed? finalSegment = (numSegments == i+1)
 
-        if kana.scriptType(gSegment) == kanjiScript:
+        if scripts.scriptType(gSegment) == kanjiScript:
             pSegmentsList = _alignKanjiSegment(gSegment, pSegmentsList, i,
                     numSegments)
         else:
     """ Applies additional constraints to the list of alignments, returning a
         subset of that list.
     """
-    kanjiScript = kana.Script.kanji
+    kanjiScript = scripts.Script.Kanji
     keptAlignments = []
     for kanjiSeg, readingSeg in alignments:
         assert len(kanjiSeg) == len(readingSeg)
             kSeg = kanjiSeg[i]
 
             # don't allow reading segments to start with ゅ or ん
-            if kana.scriptType(kSeg) == kanjiScript and \
-                    (rSeg[0] == kana.nKana or rSeg[0] in kana.smallKanaSet):
+            if scripts.scriptType(kSeg) == kanjiScript and \
+                    (rSeg[0] == kanaTable.nKana or rSeg[0] in kanaTable.smallKana):
                 break
 
             # don't allow kanji segments with more than maxPerKanji kana each
-            rSegShort = filter(lambda x: x not in kana.smallKanaSet, rSeg)
+            rSegShort = filter(lambda x: x not in kanaTable.smallKana, rSeg)
             maxLength = options.maxPerKanji*len(kSeg)
-            if kana.scriptType(kSeg) == kanjiScript and \
+            if scripts.scriptType(kSeg) == kanjiScript and \
                     len(rSegShort) > maxLength:
                 break
         else:

File src/readingModel.py

 
 from os import path
 import cPickle as pickle
-import codecs
-import pdb
-from bz2 import BZ2File
-from sets import Set
 
-from jptools import kana, smartCache, enum
-from jptools.functional import *
+from cjktools import scripts, smartCache, enum, alternations
+from cjktools.sequences import *
+from cjktools.common import sopen
 
 from entry import Entry
+import settings
 
 #----------------------------------------------------------------------------#
 
             differences.
         """
         print 'Initialising reading model'
-        cacheFile = 'data/readingModel.cache'
-        dictFiles = 'data/kanjidic.bz2', 'data/kanjd212.bz2'
+        cacheFile = path.join(settings.CACHE_DIR, 'readingModel.cache')
+        dictFiles = [path.join(settings.DATA_DIR, d) for d in (
+                'kanjidic.bz2', 'kanjd212.bz2')]
         self._readings = {}
         self._pool = {}
         self._okuri = {}
-
-        model = smartCache.useCache(cacheFile)
-        if model is None:
-            assert path.exists(dictFiles[0]) and path.exists(dictFiles[1])
-            print '--> Creating and caching new model'
-            model = self._parseKanjiDics(dictFiles)
-
-            smartCache.recache(model, cacheFile, ['readingModel.py',
-                    'data/kanjidic.bz2', 'data/kanjd212.bz2'])
-        else:
-            print '--> Using cached model'
+        
+        fetchModel = smartCache.diskProxyDirect(
+                self._parseKanjiDics,
+                cacheFile,
+                dependencies=[__file__] + list(dictFiles),
+            )
+        model = fetchModel(dictFiles)
 
         self._readings, self._pool, self._okuri = model
 
         """
         unique = []
         ambiguous = []
-        kanjiScript = kana.Script.kanji
+        kanjiScript = scripts.Script.Kanji
         for entry in ambiguousEntries:
             remainingAlignments = []
 
             for gSegments, pSegments in entry.potentials:
                 for i in xrange(len(gSegments)):
-                    if not kana.scriptType(gSegments[i]) == kanjiScript:
+                    if not scripts.scriptType(gSegments[i]) == kanjiScript:
                         continue
 
                     if not self._validateReading(gSegments[i], pSegments[i]):
             pool of readings and reading variants.
         """
         if len(grapheme) == 1:
-            phoneme = kana.toHiragana(phoneme)
+            phoneme = scripts.toHiragana(phoneme)
             readings = self._pool.get(grapheme, [])
             return phoneme in readings
         else:
 
         for filename in filenames:
             print '----> Parsing reading information from %s' % `filename` 
-            iStream = codecs.getreader('utf8')(BZ2File(filename, 'r'))
+            iStream = sopen(filename, 'r')
 
             for line in iStream:
                 if line.startswith('#'):
                 readings[kanji] = potentialReadings
     
                 # add pooled readings for quick checking
-                pooledReadings = Set()
+                pooledReadings = set()
                 for reading, readingLoc, readingType in potentialReadings:
                     pooledReadings.add(reading)
     
-                rendakuExtras = filteredMap(kana.rendakuVariants, pooledReadings)
+                rendakuExtras = filter(None, map(alternations.rendakuVariants,
+                        pooledReadings))
                 pooledReadings = pooledReadings.union(flatten(rendakuExtras))
     
-                onbinExtras = filteredMap(kana.onbinVariants, pooledReadings)
+                onbinExtras = filter(None, map(alternations.onbinVariants,
+                        pooledReadings))
                 pooledReadings = pooledReadings.union(flatten(onbinExtras))
     
                 readingPool[kanji] = pooledReadings
     def _parseEntry(self, line):
         """ Parses a single line from kanjidic.
         """
-        katakana = kana.Script.katakana
-        hiragana = kana.Script.hiragana
+        katakana = scripts.Script.Katakana
+        hiragana = scripts.Script.Hiragana
 
         line = line.split()
         kanji, rest = line[0], line[1:]
 
-        readings = Set()
-        okurigana = Set()
+        readings = set()
+        okurigana = set()
         for item in rest:
             if item == 'T2' or item.startswith('{'):
                 # let's not include radical names or their English meanings
                 reading = item
 
             # determine whether it is an on or kun reading
-            readingScript = kana.scriptType(reading)
+            readingScript = scripts.scriptType(reading)
             if readingScript == katakana:
                 readingType = ReadingType.on
             elif readingScript == hiragana:
                 # this is not an entry we want to keep
                 continue
 
-            reading = kana.toHiragana(reading)
+            reading = scripts.toHiragana(reading)
 
             if '.' in reading:
                 # the reading is a case of okurigana
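
The rendaku and onbin expansions above widen each kanji's reading pool with
phonological variants, so voiced or geminated surface forms still match. An
illustrative round trip, assuming rendakuVariants(u'かみ') yields the voiced
form がみ (as in 紙 かみ → 手紙 てがみ):

    from cjktools import alternations
    from cjktools.sequences import flatten

    pooledReadings = set([u'かみ'])
    rendakuExtras = filter(None, map(alternations.rendakuVariants,
            pooledReadings))
    pooledReadings = pooledReadings.union(flatten(rendakuExtras))
    assert u'がみ' in pooledReadings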

File src/segment.py

-#!/usr/bin/python
+#!/usr/bin/env python
 # -*- coding: utf-8 -*-
 #----------------------------------------------------------------------------#
 # segment.py
 
 import os, sys
 import optparse
-import pdb
-import codecs
-
-sys.stdout = codecs.getwriter('utf8')(sys.stdout)
+from cjktools.common import sopen
 
 import potentials
 import dictionary
 from readingModel import ReadingModel
 from okuriganaModel import OkuriganaModel
 import evaluate
+import settings
 
 #----------------------------------------------------------------------------#
 
     if not options.edict:
         print 'Reading evaluation entries'
         entries, numRejected = dictionary.evaluationEntries(
-                'data/evaluation.data')
+                os.path.join(settings.DATA_DIR, 'evaluation.data'))
     else:
         print 'Reading edict entries'
         entries, numRejected = dictionary.edictEntries(
-                'data/edict.bz2')
+                os.path.join(settings.DATA_DIR, 'edict.bz2'))
     print '--> Found %d entries (rejected %d)' % (len(entries), numRejected)
 
     print 'Separating long and short entries'

File src/settings.py

+# -*- coding: utf-8 -*-
+# 
+#  settings.py
+#  src
+#  
+#  Created by Lars Yencken on 2009-03-18.
+#  Copyright 2009 Lars Yencken. All rights reserved.
+# 
+
+import sys
+from os import path
+
+# Both default to the directory containing the invoked script; when scripts
+# are run by bare name, this resolves to the current directory.
+DATA_DIR = path.dirname(sys.argv[0])
+CACHE_DIR = path.dirname(sys.argv[0])
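
Callers resolve their data files against these settings instead of a
hard-coded data/ prefix, as the other hunks in this commit do:

    import os
    import settings

    edictFile = os.path.join(settings.DATA_DIR, 'edict.bz2')

Since path.dirname(sys.argv[0]) is empty when a script is invoked by bare
name, lookups then fall back to the current directory, matching the default
described in the commit message.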

File src/tools/profiler.py

-#!/usr/bin/python
+#!/usr/bin/env python
 # -*- coding: utf-8 -*-
 #----------------------------------------------------------------------------#
 # profile.py

File src/tools/separateErrors.py

-#!/usr/bin/python
+#!/usr/bin/env python
 # -*- coding: utf-8 -*-
 #----------------------------------------------------------------------------#
 # separateErrors.py
 import os, sys
 import optparse
 
-import codecs
+from cjktools.common import sopen
 
-import kana
+from cjktools import scripts
 
     """ Separates out the errors from the alignments, and tries to classify
         them.
     """
-    inputFile = codecs.open(alignmentFile, 'r', 'utf8')
-    goodFile = codecs.open(alignmentFile + '.good', 'w', 'utf8')
-    okuriganaFile = codecs.open(alignmentFile + '.okurigana', 'w', 'utf8')
-    gappingFile = codecs.open(alignmentFile + '.gapping', 'w', 'utf8')
-    badFile = codecs.open(alignmentFile + '.bad', 'w', 'utf8')
+    inputFile = sopen(alignmentFile, 'r')
+    goodFile = sopen(alignmentFile + '.good', 'w')
+    okuriganaFile = sopen(alignmentFile + '.okurigana', 'w')
+    gappingFile = sopen(alignmentFile + '.gapping', 'w')
+    badFile = sopen(alignmentFile + '.bad', 'w')
 
     nGood = 0
     nBad = 0
     """ Detects whether the correct solution contained an okurigana segment.
         These are characterized by mixed script.
     """
-    gSegments, pSegments = segmentation.split('-')
+    gSegments, pSegments = segmentation.split()
     gSegments = gSegments.strip('|').split('|')
     for segment in gSegments:
-        scriptType = kana.scriptType(segment[0])
-        if scriptType != 'kanji':
+        scriptType = scripts.scriptType(segment[0])
+        if scriptType != scripts.Script.Kanji:
             continue
 
         for char in segment[1:]:
-            if kana.scriptType(char) != 'kanji':
+            if scripts.scriptType(char) != scripts.Script.Kanji:
                 return True
     else:
         return False
     """ Determines whether this was a case of grapheme gapping. Tell-tale
         signs: a '<' in the phoneme segment.
     """
-    gSegments, pSegments = segmentation.split('-')
+    gSegments, pSegments = segmentation.split()
     pSegments = pSegments.strip('|').split('|')
     for segment in pSegments:
         if '<' in segment:

File src/tools/stripKanjiDic.py

-#!/usr/bin/python
+#!/usr/bin/env python
 # -*- coding: utf-8 -*-
 #----------------------------------------------------------------------------#
 # stripKanjiDic.py
 import optparse
 import pdb
 
-import codecs
-import kana
+from cjktools.common import sopen
+from cjktools import scripts
 
 #----------------------------------------------------------------------------#
 def transformInput(inputFile, outputFile):
     """ Transforms each kanjidic entry into just the kanji to readings mapping.
     """
-    inputStream = codecs.open(inputFile, 'r', 'utf8')
-    outputStream = codecs.open(outputFile, 'w', 'utf8')
+    inputStream = sopen(inputFile, 'r')
+    outputStream = sopen(outputFile, 'w')
 
     for line in inputStream:
         if line.startswith('#'):
                     # skip the second half
                     item = item.split('.')[0]
 
-                if kana.scriptType(item) == 'kanji':
+                if scripts.scriptType(item) == scripts.Script.Kanji:
                     # have non-kana item, must be ascii
                     continue
                 else:
                     # have kana item, convert to katakana
-                    potentialReadings.append(kana.katakanaForm(item))
+                    potentialReadings.append(scripts.toKatakana(item))
 
         print >> outputStream, kanji, ' '.join(potentialReadings)