Commits

Lars Yencken committed 8aef150

Restructures package into standard style and adds packaging script.

  • Parent commits 29a3e86


Files changed (41)

 logs
 *.pyc
 *.pyo
+build
+dist
+src/__version__.py

File align.py

-#!/usr/bin/python
-# -*- coding: utf-8 -*-
-#----------------------------------------------------------------------------#
-# align.py
-# Lars Yencken <lars.yencken@gmail.com>
-# vim: ts=4 sw=4 noet tw=78:
-# Sat May 14 14:49:45 EST 2005
-#
-#----------------------------------------------------------------------------#
-
-""" This module performs pure segmentation and alignment only.
-"""
-
-#----------------------------------------------------------------------------#
-
-import os, sys
-import optparse
-import pdb
-import codecs
-
-import potentials
-import dictionary
-from alignment import AlignmentModel
-from readingModel import ReadingModel
-from okuriganaModel import OkuriganaModel
-import evaluate
-
-#----------------------------------------------------------------------------#
-
-def performSegmentation(outputFile, options):
-	""" The main method for this module. Performs the entire segmentation run,
-		taking an edict dictionary as input and producing a segmented output
-		for each kanji input row.
-	"""
-	# read in edict dictionary
-	if not options.edict:
-		print 'Reading evaluation entries'
-		entries, numRejected = dictionary.evaluationEntries(
-				'data/eval-alignment.data')
-	else:
-		print 'Reading edict entries'
-		entries, numRejected = dictionary.edictEntries('data/edict.bz2')
-
-	print '--> Found %d entries (rejected %d)' % (len(entries), numRejected)
-
-	print 'Separating long and short entries'
-	shortEntries, longEntries = dictionary.separateEntries(entries,
-			options.longestRun)
-	print '--> %d short, %d long' % (len(shortEntries), len(longEntries))
-
-	alignmentModel = AlignmentModel(outputFile, options)
-
-	if options.useKanjidic:
-		readingModel = ReadingModel()
-		kanjidicOkurigana = readingModel.getOkurigana()
-	else:
-		readingModel = None
-		kanjidicOkurigana = {}
-
-	print 'PASS 1: SHORT ENTRIES'
-	_resolveEntries(alignmentModel, readingModel, shortEntries, options)
-	del shortEntries
-
-	print 'PASS 2: LONG ENTRIES'
-	_resolveEntries(alignmentModel, readingModel, longEntries, options)
-	del longEntries
-
-	del readingModel
-
-	alignmentModel.finish()
-	del alignmentModel
-
-	if not options.edict:
-		evaluate.evaluateAlignment(outputFile, outputFile + '.eval')
-
-	return
-
-#----------------------------------------------------------------------------#
-
-def _resolveEntries(model, readingModel, entries, options):
-	""" Generates potential alignments for the given entries, then
-		resolves them using the kanjidic and statistical models.
-	"""
-	print 'Generating possible alignments'
-	unique, ambiguous = potentials.generateAlignments(entries, options)
-	print '--> %d unique, %d ambiguous' % (len(unique), len(ambiguous))
-	print '--> %d overconstrained' % \
-			(len(entries) - (len(unique) + len(ambiguous)))
-
-	if options.useKanjidic:
-		print 'Disambiguating using kanjidic'
-		moreUnique, ambiguous = readingModel.pruneAlignments(ambiguous)
-		print '--> %d unique, %d ambiguous' % (len(moreUnique), len(ambiguous))
-		unique.extend(moreUnique); del moreUnique
-
-	print 'Disambiguating readings using statistical model'
-	print '--> Processing %d unique entries' % len(unique)
-	model.addResolved(unique); del unique
-	print '--> Beginning statistical disambiguation of %d entries' % \
-			len(ambiguous)
-	model.disambiguate(ambiguous); del ambiguous
-
-	return
-
-#----------------------------------------------------------------------------#
-
-
-#----------------------------------------------------------------------------#
-# COMMAND-LINE INTERFACE
-#
-
-def createOptionParser():
-	""" Creates an option parser instance to handle command-line options.
-	"""
-	usage = \
-"""%prog [options] outputFile
-
-An efficient implementation of the Baldwin-Tanaka automated grapheme-phoneme
-alignment algorithm based on TF-IDF. By default, it uses an evaluation data set
-as input, and prints an accuracy analysis for the score."""
-
-	parser = optparse.OptionParser(usage)
-
-	parser.add_option('--max-per-kanji', action='store', dest='maxPerKanji',
-			type='int', default=5,
-			help='The maximum number of kana aligned to one kanji [5]')
-
-	parser.add_option('--no-kanjidic', action='store_false',
-			dest='useKanjidic', default=True,
-			help='Disables the kanjidic reading model')
-
-	parser.add_option('--idf-only', action='store_false', dest='tfHeuristic',
-			default=True, help='Only uses the idf heuristic [False]')
-
-	parser.add_option('--tf-only', action='store_false', dest='idfHeuristic',
-			default=True, help='Only uses the tf heuristic [False]')
-
-	parser.add_option('--random', action='store_true', dest='random',
-			help='Choose a random entry to disambiguate each time [False]')
-
-	parser.add_option('--longest-run', action='store', dest='longestRun',
-			type='int', default=4,
-			help='The longest kanji run to be handled in the first pass [4]')
-
-	parser.add_option('--edict', action='store_true',
-			dest='edict', help='Use the EDICT dictionary as input [False]')
-
-	parser.add_option('-a', '--alpha', action='store', dest='alpha',
-			default=2.5, type='float',
-			help='The smoothing value to use in tf-idf [2.5]')
-
-	parser.add_option('-s', '--solved', action='store', dest='solved',
-			default=0.07, type='float',
-			help='The weight of solved frequencies in the tf-idf equation [0.07]')
-
-	parser.add_option('-m', '--max-potentials', action='store',
-			dest='maxPotentials', type='int', default=120,
-			help='The maximum number of potential alignments for an entry [120]')
-
-	parser.add_option('--non-iterative', action='store_false',
-			dest='iterative', default=True,
-			help='Disables iterative alignment, instead taking one pass [False]')
-	parser.add_option('-u', '--unsolved', action='store', dest='unsolved',
-			default=0.13, type='float',
-			help='The weight of unsolved frequencies in the tf-idf equation [0.13]')
-
-	parser.add_option('--dump-model', action='store', dest='modelOutput',
-			help="At the end of alignment, dump the model " + \
-			"generated to the given file.")
-
-	parser.add_option('--use-model', action='store', dest='modelInput',
-			help="Instead of generating a model, use this one.")
-
-	return parser
-
-#----------------------------------------------------------------------------#
-
-def main(argv):
-	""" The main method for this module.
-	"""
-	parser = createOptionParser()
-	(options, args) = parser.parse_args(argv)
-
-	if len(args) != 1:
-		parser.print_help()
-		sys.exit(1)
-
-	outputFile = args[0]
-
-	if options.random:
-		options.tfHeuristic = False
-		options.idfHeuristic = False
-
-	performSegmentation(outputFile, options)
-
-	return
-
-#----------------------------------------------------------------------------#
-
-if __name__ == '__main__':
-	try:
-		import psyco
-		psyco.profile()
-	except:
-		pass
-
-	try:
-		main(sys.argv[1:])
-	except KeyboardInterrupt:
-		# we cancel runs often, so do it nicely
-		print >> sys.stderr, '\nAborting run!'
-		sys.exit(1)
-
-#----------------------------------------------------------------------------#
-

File alignment.py

-# -*- coding: utf-8 -*-
-#----------------------------------------------------------------------------#
-# alignment.py
-# Lars Yencken <lars.yencken@gmail.com>
-# vim: ts=4 sw=4 noet tw=78:
-# Mon May 16 11:24:31 EST 2005
-#
-#----------------------------------------------------------------------------#
-
-""" This module implements the iterative TF-IDF method.
-"""
-
-#----------------------------------------------------------------------------#
-
-import potentials
-from frequency import FrequencyMap
-
-from jptools import kana
-from jptools.progressBar import ProgressBar
-
-import math
-from sets import Set
-import codecs
-import random
-import cPickle as pickle
-
-#----------------------------------------------------------------------------#
-
-# epsilon for testing for zero
-eps = 1e-8
-
-#----------------------------------------------------------------------------#
-
-class AlignmentModel:
-	""" This class is responsible for the alignment algorithm, and all its
-		internal data structures.
-	"""
-	#------------------------------------------------------------------------#
-	# PUBLIC METHODS
-	#
-
-	def __init__(self, outputFile, options):
-		""" Creates a new instance using the list of correctly aligned
-			readings.
-		"""
-		print 'Initialising alignment model'
-		if options.modelInput:
-			print '--> Seeding from', `options.modelInput`
-			self._uniqueCounts = pickle.load(open(options.modelInput))
-		else:
-			print '--> Seeding from empty model'
-			self._uniqueCounts = FrequencyMap()
-
-		self._ambiguousCounts = FrequencyMap()
-
-		# possibly a filename to dump our final model into
-		self._modelDumpFile = options.modelOutput
-
-		# whether to align all at once or iteratively
-		self._iterative = options.iterative
-
-		# we write aligned readings as we go, rather than storing them in
-		# memory
-		self._output = codecs.open(outputFile, 'w', 'utf8')
-		self._outputName = outputFile
-
-		# ratios for the tf-idf
-		self._alpha = options.alpha
-		self._solved = options.solved
-		self._unsolved = options.unsolved
-
-		# setting either of these defaults non-zero will prevent calculation
-		# of that heuristic
-		if options.random:
-			self._useRandom = True
-			print '--> Random model selected'
-		else:
-			self._useRandom = False
-
-			# only define these variables in the non-random case to ensure
-			# that they never get used in the random case
-			self._defaultTf = 0
-			self._defaultIdf = 0
-	
-			if not options.tfHeuristic:
-				print '--> Disabling tf heuristic'
-				self._defaultTf = 1
-	
-			elif not options.idfHeuristic:
-				print '--> Disabling idf heuristic'
-				self._defaultIdf = 1
-			
-			else:
-				print '--> Full TF-IDF enabled'
-
-		return
-	
-	#------------------------------------------------------------------------#
-
-	def addResolved(self, resolvedEntries):
-		""" Populates the statistical model with a number of resolved entries. 
-		"""
-		# add all unambiguous readings to our model
-		for entry in resolvedEntries:
-			self._uniqueCounts.addCounts(entry.alignment)
-			print >> self._output, entry.toLine()
-
-		return
-
-	#------------------------------------------------------------------------#
-
-	def disambiguate(self, ambiguous):
-		""" Incorporates and aligns the ambiguous readings based on existing
-			alignments.
-		"""
-		if not ambiguous:
-			return
-
-		self._initialiseEntries(ambiguous)
-		numEntries = len(ambiguous)
-
-		if self._useRandom:
-			# randomly pick the best alignment for each entry
-			self._randomAlignment(ambiguous)
-
-		elif not self._iterative:
-			# perform first and only scoring iteration
-			self._rescore(ambiguous)
-	
-		progressBar = ProgressBar()
-		progressBar.start(100)
-
-		i = 0
-		while i < numEntries:
-			if self._iterative and not self._useRandom:
-				# perform expensive rescoring
-				self._rescore(ambiguous)
-				ambiguous.sort()
-
-			bestEntry = ambiguous.pop()
-			self._disambiguateEntry(bestEntry)
-
-			print >> self._output, bestEntry.toLine()
-
-			i += 1
-			progressBar.fractional(math.sqrt(i)/math.sqrt(numEntries))
-
-		progressBar.finish()
-
-		return
-
-	#------------------------------------------------------------------------#
-	
-	def finish(self):
-		""" Closes the output stream and sorts the output for easier
-			comparison.
-		"""
-		self._output.close()
-
-		if self._modelDumpFile:
-			# dump our final model to the requested file
-			oStream = open(self._modelDumpFile, 'w')
-			pickle.dump(self._uniqueCounts, oStream)
-			oStream.close()
-
-		assert self._ambiguousCounts._gSize == 0
-
-		return
-	
-	#------------------------------------------------------------------------#
-
-	#------------------------------------------------------------------------#
-	# PRIVATE METHODS
-	#
-
-	def _initialiseEntries(self, ambiguousEntries):
-		""" Updates the counts for ambiguous readings and initialises each
-			entry's scores ready for disambiguation.
-		"""
-		for i in xrange(len(ambiguousEntries)):
-			entry = ambiguousEntries[i]
-			alignments = entry.potentials
-
-			assert len(Set(alignments)) == len(alignments), \
-					"Readings are not unique"
-
-			# update our counts
-			for alignment in alignments:
-				self._ambiguousCounts.addCounts(alignment)
-
-			entry.score = 0.0
-			entry.scores = [0.0]*len(alignments)
-
-		return
- 
-	#------------------------------------------------------------------------#
-
-	def _disambiguateEntry(self, entry):
-		""" Modify the entry to remove all the additional ambiguous alignments,
-			and update our internal counts.
-		"""
-		entry.scores = None
-
-		# put this count amongst the unique ones
-		self._uniqueCounts.addCounts(entry.alignment)
-
-		# fill in the rest of this count
-		# eliminate the bad readings from the model
-		for alignment in entry.potentials:
-			self._ambiguousCounts.delCounts(alignment)
-
-		entry.potentials = None
-		entry.aligned = True
-
-		return
-
-	#------------------------------------------------------------------------#
-
-	def _rescore(self, ambiguous):
-		""" Loops over the entire list of ambiguous entries, rescoring each.
-		"""
-		for i in xrange(len(ambiguous)):
-			entry = ambiguous[i]
-
-			entry.scores = map(self._tfidf, entry.potentials)
-			entry.score, entry.alignment = max(zip(entry.scores, \
-					entry.potentials))
-
-		return
-
-	#------------------------------------------------------------------------#
-
-	def _weightedFreqs(self, gSegments, pSegments, index):
-		""" Weight the frequencies from the two models.
-		"""
-		s_gFreq, s_gpFreq, s_gpcFreq = self._uniqueCounts.frequencies(
-				gSegments, pSegments, index)
-		u_gFreq, u_gpFreq, u_gpcFreq = self._ambiguousCounts.frequencies(
-				gSegments, pSegments, index)
-
-		gFreq = self._solved*s_gFreq + self._unsolved*u_gFreq
-		gpFreq = self._solved*s_gpFreq + self._unsolved*u_gpFreq
-		gpcFreq = self._solved*s_gpcFreq + self._unsolved*u_gpcFreq
-
-		return gFreq, gpFreq, gpcFreq
-		
-	#------------------------------------------------------------------------#
-
-	def _explainAlignment(self, entry, alignment):
-		""" Prints a debug trace of the best alignment for an entry,
-			followed by its rival alignments in decreasing score order.
-		"""
-		bestScore, allAlignments = entry
-		print '--->', bestScore,
-		potentials.printAlignment(alignment)
-		allAlignments.sort()
-		allAlignments.reverse()
-		for otherScore, otherAlignment in allAlignments:
-			print '----->', otherScore,
-			potentials.printAlignment(otherAlignment)
-	
-		return
-
-	#------------------------------------------------------------------------#
-
-	def _randomAlignment(self, entries):
-		""" Picks a random alignment for each entry in a list of ambiguous
-			entries. 
-		"""
-		for ambiguousEntry in entries:
-			ambiguousEntry.alignment = random.sample(
-					ambiguousEntry.potentials, 1)[0]
-		return
-
-	#------------------------------------------------------------------------#
-
-	def _tfidf(self, alignment):
-		""" Calculates the tf-idf score of the alignment passed in based on
-			the current model.
-		"""
-		kanjiScript = kana.Script.kanji
-		currentScores = []
-
-		gSegments, pSegments = alignment
-		for i in range(len(gSegments)):
-			if not kana.scriptType(gSegments[i]) == kanjiScript:
-				continue
-
-			gFreq, gpFreq, gpcFreq = self._weightedFreqs(gSegments,
-					pSegments, i)
-
-			tf = self._defaultTf or \
-				(gpFreq + self._alpha - self._unsolved) / gFreq
-
-			idf = self._defaultIdf or \
-				math.log(gpFreq/(gpcFreq + self._alpha - self._unsolved))
-
-			currentScores.append(tf*idf)
- 
-		newScore = sum(currentScores) / float(len(currentScores))
-
-		return newScore
-
-	#------------------------------------------------------------------------#
-
-#----------------------------------------------------------------------------#
-
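Note: the scoring at the heart of _tfidf() and _weightedFreqs() is easier to read outside the class. The sketch below (modern Python, with illustrative frequency values only; the 2.5/0.07/0.13 constants are the defaults from align.py's option parser) mirrors the per-segment computation; an alignment's final score is then the mean over its kanji segments.

    import math

    ALPHA, SOLVED, UNSOLVED = 2.5, 0.07, 0.13   # defaults from align.py

    def weighted_freqs(solved, unsolved):
        # Blend (g, gp, gpc) frequency triples from the solved and
        # ambiguous FrequencyMaps, as in _weightedFreqs().
        return tuple(SOLVED * s + UNSOLVED * u
                     for s, u in zip(solved, unsolved))

    def tfidf_segment(g_freq, gp_freq, gpc_freq):
        # Per-kanji-segment score, as in _tfidf().
        tf = (gp_freq + ALPHA - UNSOLVED) / g_freq
        idf = math.log(gp_freq / (gpc_freq + ALPHA - UNSOLVED))
        return tf * idf

    # Hypothetical frequencies for one grapheme segment:
    g, gp, gpc = weighted_freqs((0.4, 0.2, 0.1), (0.3, 0.1, 0.05))
    print(tfidf_segment(g, gp, gpc))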

File detectOkurigana.py

-#!/usr/bin/python
-# -*- coding: utf-8 -*-
-#----------------------------------------------------------------------------#
-# detectOkurigana.py
-# Lars Yencken <lars.yencken@gmail.com>
-# vim: ts=4 sw=4 noet tw=78:
-# Sat May 14 14:49:45 EST 2005
-#
-#----------------------------------------------------------------------------#
-
-""" This module is an executable script performing okurigana detection and
-	alignment adjustment on the output of an alignment run.
-"""
-
-#----------------------------------------------------------------------------#
-
-import os, sys
-import optparse
-import pdb
-import codecs
-
-import potentials
-import dictionary
-from alignment import AlignmentModel
-from readingModel import ReadingModel
-from okuriganaModel import OkuriganaModel
-import evaluate
-
-#----------------------------------------------------------------------------#
-
-def detectOkurigana(outputFile, options):
-	""" Performs just okurigana detection and alignment alteration.
-	"""
-	okuriganaModel = OkuriganaModel(options)
-
-	inputFile = options.inputFile or 'data/eval-okurigana.data'
-	okuriganaModel.okuriganaAdjustments(inputFile, outputFile)
-
-	if not options.inputFile:
-		evaluate.evaluateOkurigana(outputFile, outputFile + '.eval')
-
-	return
-
-#----------------------------------------------------------------------------#
-
-
-#----------------------------------------------------------------------------#
-# COMMAND-LINE INTERFACE
-#
-
-def createOptionParser():
-	""" Creates an option parser instance to handle command-line options.
-	"""
-	usage = \
-"""%prog [options] outputFile
-
-An efficient implementation of the Baldwin-Tanaka automated grapheme-phoneme
-alignment algorithm based on TF-IDF."""
-
-	parser = optparse.OptionParser(usage)
-
-	parser.add_option('-t', '--threshold', action='store',
-			dest='okuriThreshold', type='int', default=1,
-			help='The threshold used for cooccurrence-based okurigana')
-
-	parser.add_option('--simple', action='store_true',
-			dest='simpleOkurigana', default=False,
-			help='Use a simple okurigana method, ignoring the main model')
-
-	parser.add_option('--no-kanjidic', action='store_false',
-			dest='useKanjidic', default=True,
-			help='Disables the kanjidic reading model')
-
-	parser.add_option('--no-cooccurrence', action='store_false',
-			dest='useCooccurrence', default=True,
-			help='Disables cooccurrence entries from edict')
-
-	parser.add_option('--no-verbs', action='store_false',
-			dest='useVerbs', default=True,
-			help='Disables verb entries from edict')
-
-	parser.add_option('-i', '--input', action='store', dest='inputFile',
-			help="Specify a custom input file to use.")
-
-	return parser
-
-#----------------------------------------------------------------------------#
-
-def main(argv):
-	""" The main method for this module.
-	"""
-	parser = createOptionParser()
-	(options, args) = parser.parse_args(argv)
-
-	if len(args) != 1:
-		parser.print_help()
-		sys.exit(1)
-
-	outputFile = args[0]
-
-	detectOkurigana(outputFile, options)
-
-	return
-
-#----------------------------------------------------------------------------#
-
-if __name__ == '__main__':
-	try:
-		import psyco
-		psyco.profile()
-	except:
-		pass
-
-	try:
-		main(sys.argv[1:])
-	except KeyboardInterrupt:
-		# we cancel runs often, so do it nicely
-		print >> sys.stderr, '\nAborting run!'
-		sys.exit(1)
-
-#----------------------------------------------------------------------------#
-

File dictionary.py

-# -*- coding: utf-8 -*-
-#----------------------------------------------------------------------------#
-# dictionary.py
-# Lars Yencken <lars.yencken@gmail.com>
-# vim: ts=4 sw=4 noet tw=78:
-# Mon May 16 10:50:57 EST 2005
-#
-#----------------------------------------------------------------------------#
-
-""" This module is responsible for parsing input data sets for
-	grapheme/phoneme string pairs to align. Its main methods are
-	edictEntries() and evaluationEntries().
-"""
-
-#----------------------------------------------------------------------------#
-
-import codecs
-from bz2 import BZ2File
-
-from jptools import kana
-
-from entry import Entry
-
-#----------------------------------------------------------------------------#
-# PUBLIC METHODS
-#
-
-def edictEntries(inputFile):
-	""" Determines all the kanji entries available in the input file. The input
-		file is assumed to be in edict format.
-	"""
-	if inputFile.endswith('.bz2'):
-		inputStream = codecs.getreader('utf8')(BZ2File(inputFile, 'r'))
-	else:
-		inputStream = codecs.open(inputFile, 'r', 'utf8')
-
-	rejectionStream = codecs.open('logs/rejected-entries', 'w', 'utf8')
-
-	entries = []
-	numRejected = 0
-	for line in inputStream:
-		lineParts = line.split()
-		gString = lineParts[0]
-		pString = lineParts[1][1:-1]
-		
-		if _validEntry(gString, pString):
-			entries.append(Entry(gString, pString))
-		else:
-			numRejected += 1
-			rejectionStream.write(line)
-
-	return entries, numRejected
-
-#----------------------------------------------------------------------------#
-
-def evaluationEntries(inputFile):
-	""" Get entries from a file formatted like an evaluation type instead of
-		in edict format.
-	"""
-	entries = []
-	inputStream = codecs.open(inputFile, 'r', 'utf8')
-
-	rejectionStream = codecs.open('logs/rejected-entries', 'w', 'utf8')
-
-	numRejected = 0
-	for line in inputStream:
-		gString, pString = line.split(':')[0].split('-')
-		
-		if _validEntry(gString, pString):
-			entries.append(Entry(gString, pString))
-		else:
-			numRejected += 1
-			rejectionStream.write(line)
-
-	return entries, numRejected
-
-#----------------------------------------------------------------------------#
-
-def separateEntries(entries, maxRunLength=3):
-	""" Split out the longest entries for later processing.
-	"""
-	shortEntries = []
-	longEntries = []
-
-	for entry in entries:
-		if _longestKanjiRun(entry.gString) > maxRunLength:
-			longEntries.append(entry)
-		else:
-			shortEntries.append(entry)
-	
-	return shortEntries, longEntries
-
-#----------------------------------------------------------------------------#
-
-#----------------------------------------------------------------------------#
-# PRIVATE METHODS
-#
-
-def _validEntry(gString, pString):
-	""" Returns True if the word is only kanji and kana, False otherwise.
-	"""
-	# throw out any grapheme string which contains ascii
-	if kana.Script.ascii in map(kana.scriptType, gString): 
-		return False
-
-	# throw out any reading which has no kana in it
-	isKana = lambda x: x in (kana.Script.hiragana, kana.Script.katakana)
-
-	hasKana = (filter(isKana, map(kana.scriptType, pString)) != [])
-
-	return hasKana
-	
-#----------------------------------------------------------------------------#
-
-def _longestKanjiRun(gString):
-	""" Works out the longest number of kanji in a row in the grapheme string.
-	"""
-	run = 0
-	longest = 0
-	kanjiScript = kana.Script.kanji
-	for char in gString:
-		if kana.scriptType(char) == kanjiScript:
-			run += 1
-		else:
-			if run > longest:
-				longest = run
-			run = 0
-	else:
-		if run > longest:
-			longest = run
-	
-	return longest
-
-#----------------------------------------------------------------------------#
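Note: _longestKanjiRun() drives the short/long split that align.py processes in two passes. A rough modern equivalent using itertools.groupby, with a simplified kanji test standing in for the jptools kana.scriptType check (the 々 iteration mark is expanded upstream by kana.insertDuplicateKanji in entry.py):

    from itertools import groupby

    def is_kanji(char):
        # Simplified stand-in: the CJK Unified Ideographs block only.
        return u'\u4e00' <= char <= u'\u9fff'

    def longest_kanji_run(g_string):
        # Length of the longest consecutive run of kanji characters.
        runs = (sum(1 for _ in group)
                for kanji, group in groupby(g_string, key=is_kanji) if kanji)
        return max(runs, default=0)

    assert longest_kanji_run(u'お土産') == 2
    assert longest_kanji_run(u'ひらがな') == 0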

File entry.py

-# -*- coding: utf-8 -*-
-#----------------------------------------------------------------------------#
-# entry.py
-# Lars Yencken <lars.yencken@gmail.com>
-# vim: ts=4 sw=4 noet tw=78:
-# Thu Aug 25 15:28:58 EST 2005
-#
-#----------------------------------------------------------------------------#
-
-from jptools import kana
-
-#----------------------------------------------------------------------------#
-
-class Entry:
-	""" A single grapheme-phoneme pair undergoing alignment.
-	"""
-	def __init__(self, gString, pString):
-		""" Creates a new instance.
-
-			@param gString: The grapheme string
-			@param pString: The phoneme string
-			The potential alignments and the current score are stored as
-			attributes, and start out unset.
-		"""
-		self.pString = pString
-		self.gString_original = gString
-
-		# normalise the graphical form
-		if u'々' in gString:
-			gString = kana.insertDuplicateKanji(gString)
-		self.gString = gString
-
-		# have we aligned yet?
-		self.aligned = False
-
-		# best alignment so far and its score
-		self.score = None
-		self.alignment = None
-
-		# potential alignments and their scores
-		self.potentials = None
-		self.scores = None
-
-		return
-
-	def __cmp__(self, otherEntry):
-		return cmp(self.score, otherEntry.score)
-
-	def toString(self):
-		if self.aligned:
-			gSegments, pSegments = self.alignment
-			retStr = 'Entry(%s <-> %s)' % \
-					('|'.join(gSegments), '|'.join(pSegments))
-		elif self.potentials:
-			retStr = 'Entry(%s <-> %s, %d potentials)' % \
-					(self.gString, self.pString, len(self.potentials))
-		else:
-			retStr = 'Entry(%s <-> %s)' % (self.gString, self.pString)
-		return retStr
-
-	def __str__(self):
-		return self.toString()
-	
-	def __repr__(self):
-		return self.toString()
-
-	def toLine(self):
-		""" Prints the final alignment in our desired output format. 
-		"""
-		assert self.aligned
-
-		alignment = '-'.join(map(lambda x: '|'.join(x), self.alignment))
-
-		original = '%s-%s'%(self.gString_original, ''.join(self.alignment[1]))
-	
-		return ':'.join((original, alignment))
-
-	def __hash__(self):
-		if not self.alignment:
-			return hash(self.gString + self.pString)
-		else:
-			return hash(tuple(self.alignment))
-	
-#----------------------------------------------------------------------------#
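Note: the serialisation format produced by toLine() (and parsed back by evaluate.py and errors.py) is easiest to see with a concrete case. The alignment below is hypothetical, but the format matches the code:

    # gSegments and pSegments for a hypothetical aligned entry
    g_segments = (u'日', u'本', u'語')
    p_segments = (u'に', u'ほん', u'ご')

    original = u'%s-%s' % (u''.join(g_segments), u''.join(p_segments))
    aligned = u'-'.join(u'|'.join(segs) for segs in (g_segments, p_segments))

    line = u':'.join((original, aligned))
    assert line == u'日本語-にほんご:日|本|語-に|ほん|ご'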

File errors.py

-#!/usr/bin/python
-# -*- coding: utf-8 -*-
-#----------------------------------------------------------------------------#
-# errors.py
-# Lars Yencken <lars.yencken@gmail.com>
-# vim: ts=4 sw=4 noet tw=78:
-# Wed May 25 23:45:40 EST 2005
-#
-#----------------------------------------------------------------------------#
-
-import codecs
-import string
-from sets import Set
-import pdb
-
-import okuriganaModel
-
-#----------------------------------------------------------------------------#
-
-def separateErrors(baseFile):
-	""" Separates out the errors from the alignments, and tries to classify
-		them.
-	"""
-	newUtfFile = lambda x: codecs.open(baseFile + x, 'w', 'utf8')
-
-	inputFile = codecs.open(baseFile, 'r', 'utf8')
-
-	good = Set()
-	bad = Set()
-	badOkuri = Set()
-	badGapping = Set()
-	badAlign = Set()
-	badConstrain = Set()
-
-	for line in inputFile:
-		original, testCase, correctCase = _parseLine(line)
-
-		if testCase == correctCase:
-			good.add(line)
-			continue
-
-		if testCase == [('???',)]:
-			badConstrain.add(line)
-			bad.add(line)
-			continue
-
-		# the rest of the cases are bad
-		if _detectGapping(correctCase):
-			badGapping.add(line)
-			bad.add(line)
-			continue
-
-		if _badAlignment(testCase, correctCase):
-			badAlign.add(line)
-
-		elif _badOkurigana(correctCase, testCase):
-			badOkuri.add(line)
-
-		bad.add(line)
-	
-	total = len(good.union(bad))
-	badOther = bad.difference(badGapping.union(badAlign).union(badOkuri).union(
-			badConstrain))
-
-	_linesToFile(good, '.good', baseFile)
-	_linesToFile(bad, '.bad', baseFile)
-	_linesToFile(badOkuri, '.bad.okuri', baseFile)
-	_linesToFile(badGapping, '.bad.gapping', baseFile)
-	_linesToFile(badAlign, '.bad.align', baseFile)
-	_linesToFile(badOther, '.bad.other', baseFile)
-	_linesToFile(badConstrain, '.bad.constrain', baseFile)
-
-	nGood, nBad, nBadOkuri, nBadGapping, nBadAlign, nUnknown, nConstrain = \
-			map(
-				len,
-				(good, bad, badOkuri, badGapping, badAlign, badOther,
-				badConstrain)
-			)
-
-	print '%d total alignments' % total
-	print '--> %.2f%% correct (%d)' % ((100*nGood / float(total)),nGood)
-	print '--> %.2f%% in error (%d)' % ((100*nBad / float(total)),nBad)
-	print '----> %.2f%% okurigana (%d)' % ((100*nBadOkuri / float(total)),\
-			nBadOkuri)
-	print '----> %.2f%% gapping (%d)' % ((100*nBadGapping / float(total)),\
-			nBadGapping)
-	print '----> %.2f%% align (%d)' % ((100*nBadAlign / float(total)),\
-			nBadAlign)
-	print '----> %.2f%% overconstrained (%d)' % ((100*nConstrain / \
-			float(total)), nConstrain)
-	print '----> %.2f%% unknown (%d)' % ((100*(nUnknown)/float(total)),\
-			nUnknown)
-
-	return
-
-#----------------------------------------------------------------------------#
-
-def _parseLine(line):
-	lineTuple = line.strip().split(':', 2)
-
-	segment = lambda x: tuple(x.strip('|').split('|'))
-	lineTuple = map(lambda x: map(segment, x.split('-',1)), lineTuple)
-
-	return lineTuple
-
-#----------------------------------------------------------------------------#
-
-def _linesToFile(lineSet, extension, baseName):
-	oStream = codecs.open(baseName + extension, 'w', 'utf8')
-	oStream.writelines(lineSet)
-	oStream.close()
-	return 
-
-#----------------------------------------------------------------------------#
-
-def _badAlignment(testCase, correctCase):
-	""" Determines whether this case is a bad alignment case.
-	"""
-	gSegments, pSegments = testCase
-	cgSegments, cpSegments = correctCase
-
-	if okuriganaModel.alignmentHasOkurigana(cgSegments, cpSegments):
-		testCase = okuriganaModel.removeOkurigana(testCase[0], testCase[1])
-		correctCase = okuriganaModel.removeOkurigana(correctCase[0],
-				correctCase[1])
-
-	return testCase != correctCase
-
-#----------------------------------------------------------------------------#
-
-def _badOkurigana(testCase, correctCase):
-	gSegments, pSegments = testCase
-	cgSegments, cpSegments = correctCase
-
-	if okuriganaModel.alignmentHasOkurigana(cgSegments, cpSegments):
-		if okuriganaModel.alignmentHasOkurigana(gSegments, pSegments):
-			return True
-		else:
-			# we forgot to add okurigana
-			return False
-	else:
-		# have we mistakenly added okurigana?
-		return okuriganaModel.alignmentHasOkurigana(gSegments, pSegments)
-
-#----------------------------------------------------------------------------#
-
-def _detectGapping(correctCase):
-	""" Determines whether this was a case of grapheme gapping. Tell-tale
-		signs: a '<' in the phoneme segment.
-	"""
-	gSegments, pSegments = correctCase
-	for segment in pSegments:
-		if '<' in segment:
-			return True
-	else:
-		return False
-
-#----------------------------------------------------------------------------#
-
-# vim: ts=4 sw=4 noet tw=78:
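Note: _parseLine() expects three colon-separated fields per line (original, test alignment, correct alignment), each holding grapheme and phoneme halves joined by '-' and segmented by '|'. A small sketch of the same parsing, using tuples throughout:

    def parse_line(line):
        # Mirrors _parseLine(): three fields, each a (gSegments,
        # pSegments) pair of segment tuples.
        segment = lambda part: tuple(part.strip('|').split('|'))
        return [tuple(segment(half) for half in field.split('-', 1))
                for field in line.strip().split(':', 2)]

    line = u'日本語-にほんご:日|本|語-に|ほん|ご:日|本|語-に|ほん|ご'
    original, test, correct = parse_line(line)
    assert test == correct == ((u'日', u'本', u'語'), (u'に', u'ほん', u'ご'))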

File evaluate.py

-#!/usr/bin/python
-# -*- coding: utf-8 -*-
-#----------------------------------------------------------------------------#
-# evaluate.py
-# Lars Yencken <lars.yencken@gmail.com>
-# vim: ts=4 sw=4 noet tw=78:
-# Fri Aug 12 11:41:18 EST 2005
-#
-#----------------------------------------------------------------------------#
-
-import os, sys
-import optparse
-import codecs
-
-import errors
-from jptools import functional
-
-#----------------------------------------------------------------------------#
-
-def evaluateAlignment(predictionFile, resultsFile):
-	""" Evaluates the alignments provided in the prediction file, writing the
-		results to the results file.
-	"""
-	validationFile = 'data/eval-alignment.data'
-
-	iStream = codecs.open(predictionFile, 'r', 'utf8')
-	results = {}
-
-	validationCases = _listEntries(validationFile)
-	validationDict = dict(validationCases)
-
-	predictionCases = _listEntries(predictionFile)
-	predictionDict = dict(predictionCases)
-
-	matching = lambda x: x in validationCases
-	good, bad = functional.separate(matching, predictionCases)
-
-	results['good'] = good
-
-	addCorrect = lambda x: x + (validationDict[x[0]],)
-	bad = map(addCorrect, bad)
-
-	results['bad'] = bad
-
-	orFunc = lambda x, y: x or y
-	hasGapping = lambda x: reduce(orFunc, map(lambda y: '<' in y, x[2]))
-	gapping, align = functional.separate(hasGapping, bad)
-
-	results['gapping'] = gapping
-	results['align'] = align
-
-	isMissing = lambda x: not predictionDict.has_key(x[0])
-	missing = filter(isMissing, validationCases)
-	results['missing'] = missing
-
-	_writeResults(results, resultsFile)
-
-	return
-
-#----------------------------------------------------------------------------#
-
-def evaluateOkurigana(predictionFile, resultsFile):
-	""" Evaluates the alignments provided in the prediction file, writing the
-		results to the results file.
-	"""
-	validationFile = 'data/eval-okurigana.data'
-
-	iStream = codecs.open(predictionFile, 'r', 'utf8')
-	results = {}
-
-	validationCases = _listEntries(validationFile)
-	validationDict = dict(validationCases)
-
-	predictionCases = _listEntries(predictionFile)
-	predictionDict = dict(predictionCases)
-
-	matching = lambda x: x in validationCases
-	good, bad = functional.separate(matching, predictionCases)
-
-	results['good'] = good
-
-	addCorrect = lambda x: x + (validationDict[x[0]],)
-	bad = map(addCorrect, bad)
-
-	results['okuri'] = bad
-
-	isMissing = lambda x: not predictionDict.has_key(x[0])
-	missing = filter(isMissing, validationCases)
-	results['missing'] = missing
-
-	results['bad'] = bad + missing
-
-	_writeResults(results, resultsFile)
-
-	return
-
-#----------------------------------------------------------------------------#
-
-def _writeResults(resultsDict, resultsFile):
-	keys = resultsDict.keys()
-	keys.sort()
-
-	summaryStream = open(resultsFile, 'w')
-
-	for key in keys:
-		keyEntries = resultsDict[key]
-		number = len(keyEntries)
-		percent = 100.0*number/5000.0
-		print >> summaryStream, '%s\t%4d\t%6.2f%%' % (key, number, percent)
-		print '%s\t%4d\t%6.2f%%' % (key, number, percent)
-		oStream = codecs.open(resultsFile + '.' + key, 'w', 'utf8')
-		for line in keyEntries:
-			print >> oStream, ':'.join(line)
-		oStream.close()
-
-	return
-
-#----------------------------------------------------------------------------#
-
-
-def _listEntries(filename):
-	entries = []
-	iStream = codecs.open(filename, 'r', 'utf8')
-
-	for line in iStream:
-		key, value = line.strip().split(':', 1)
-		entries.append((key, value))
-
-	iStream.close()
-
-	return entries
-
-#----------------------------------------------------------------------------#
-
-def evaluate(predictionFile, validationFile, validationResults):
-	""" Evaluates the predictions against the validation data, writing the
-		output to a series of files with basename validationResults.
-	"""
-	testEntries = _getEntries(predictionFile)
-	correctEntries = _getEntries(validationFile)
-
-	_compareEntries(testEntries, correctEntries, validationResults)
-
-	# split the errors into a detailed analysis
-	errors.separateErrors(validationResults)
-
-	return
-
-#----------------------------------------------------------------------------#
-
-def _getEntries(filename):
-	""" Creates a dictionary of all the entries in the given file.
-	"""
-	lines = codecs.open(filename, 'r', 'utf8').readlines()
-
-	entries = {}
-	for line in lines:
-		key, value = line.split(':')[:2]
-		entries[key] = value.strip()
-
-	return entries
-
-#----------------------------------------------------------------------------#
-
-def _compareEntries(testEntries, correctEntries, resultFile):
-	""" Compares the entries from the different files.
-	"""
-	nLines = 0
-	nCorrect = 0
-	nMissing = 0
-	oStream = codecs.open(resultFile, 'w', 'utf8')
-	for key, alignment in correctEntries.iteritems():
-		testAlignment = testEntries.get(key, '???')
-
-		if alignment == testAlignment:
-			nCorrect += 1
-
-		if testAlignment == '???':
-			nMissing += 1
-
-		print >> oStream, '%s:%s:%s' % (key, testAlignment, alignment)
-
-		nLines += 1
-	
-	oStream.close()
-
-	print 'Got %.2f%% correct!' % (nCorrect*100.0/nLines)
-	if nMissing > 0:
-		print '   but %d were missing...' % nMissing
-
-	return
-
-#----------------------------------------------------------------------------#
-
-def sortFile(filename):
-	""" Sorts the file in a line-based manner.
-	"""
-	iStream = codecs.open(filename, 'r', 'utf8')
-	lines = iStream.readlines()
-	iStream.close()
-
-	lines.sort()
-
-	oStream = codecs.open(filename, 'w', 'utf8')
-	oStream.writelines(lines)
-	oStream.close()
-
-	return
-
-#----------------------------------------------------------------------------#
-
-def createOptionParser():
-	""" Creates an option parser instance to handle command-line options.
-	"""
-	usage = "%prog [options] rawResults adjustedResults"
-
-	parser = optparse.OptionParser(usage)
-
-	parser.add_option('-e', action='store', dest='correctFile',
-		default='data/evaluation.data', help='The file of correct evaluations')
-
-	return parser
-
-#----------------------------------------------------------------------------#
-
-def main(argv):
-	""" The main method for this module.
-	"""
-	parser = createOptionParser()
-	(options, args) = parser.parse_args(argv)
-
-	try:
-		[testOutputFile, resultsFile] = args
-	except:
-		parser.print_help()
-		sys.exit(1)
-
-	# execute new code here
-	evaluate(testOutputFile, options.correctFile, resultsFile)
-	
-	return
-
-#----------------------------------------------------------------------------#
-
-if __name__ == '__main__':
-	main(sys.argv[1:])
-
-#----------------------------------------------------------------------------#
-
-# vim: ts=4 sw=4 noet tw=78:
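Note: jptools is not part of this diff; functional.separate, used above to split predictions into good and bad cases, is presumably a predicate split along these lines:

    def separate(predicate, items):
        # Split items into (matching, non_matching) by the predicate.
        matching, non_matching = [], []
        for item in items:
            (matching if predicate(item) else non_matching).append(item)
        return matching, non_matching

    good, bad = separate(lambda n: n % 2 == 0, range(6))
    assert good == [0, 2, 4] and bad == [1, 3, 5]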

File formatEval.py

-#!/usr/bin/python
-# -*- coding: utf-8 -*-
-#----------------------------------------------------------------------------#
-# formatEval.py
-# Lars Yencken <lars.yencken@gmail.com>
-# vim: ts=4 sw=4 noet tw=78:
-# Wed Sep  7 16:10:02 EST 2005
-#
-#----------------------------------------------------------------------------#
-
-import os, sys
-import optparse
-import codecs
-
-from entry import Entry
-
-#----------------------------------------------------------------------------#
-
-def formatEvalFile(inputFile, outputFile):
-	entries = _parseEntries(inputFile)
-	oStream = codecs.open(outputFile, 'w', 'utf8')
-
-	for entry in entries:
-		lineA = entry.gString.ljust(10, u' ')
-		lineB = entry.pString.ljust(10, u' ')
-
-		extraA, extraB = _matchAlignments(entry.alignments[0])
-		lineA += extraA.ljust(10, u' ')
-		lineB += extraB.ljust(10, u' ')
-
-		extraA, extraB = _matchAlignments(entry.alignments[1])
-		lineA += extraA.ljust(10, u' ')
-		lineB += extraB.ljust(10, u' ')
-
-		print >> oStream, lineA
-		print >> oStream, lineB
-		print >> oStream
-
-	oStream.close()
-
-	return
-
-#----------------------------------------------------------------------------#
-
-def _matchAlignments(alignment):
-	gSegments, pSegments = map(list, alignment)
-	for i in range(len(gSegments)):
-		lenDiff = len(pSegments[i]) - len(gSegments[i])
-		gSegments[i] = gSegments[i].ljust(len(pSegments[i]), u' ')
-
-	lineA = u'|'.join(gSegments)
-	lineB = u'|'.join(pSegments)
-
-	return lineA, lineB
-
-#----------------------------------------------------------------------------#
-
-def _parseEntries(inputFile):
-	entries = []
-	for line in codecs.open(inputFile, 'r', 'utf8'):
-		base, attempt, actual = line.strip().split(':')
-
-		gString, pString = base.split('-')
-		entry = Entry(gString, pString)
-		fixify = lambda x: map(lambda y: y.strip('|').split('|'), 
-				x.split('-'))
-		attempt = fixify(attempt)
-		actual = fixify(actual)
-
-		entry.alignments=[attempt, actual]
-		
-		entries.append(entry)
-
-	return entries
-
-#----------------------------------------------------------------------------#
-
-def createOptionParser():
-	""" Creates an option parser instance to handle command-line options.
-	"""
-	usage = "%prog [options] inputFile outputFile"
-
-	parser = optparse.OptionParser(usage)
-
-	return parser
-
-#----------------------------------------------------------------------------#
-
-def main(argv):
-	""" The main method for this module.
-	"""
-
-	parser = createOptionParser()
-	(options, args) = parser.parse_args(argv)
-
-	try:
-		[inputFile, outputFile] = args
-	except:
-		parser.print_help()
-		sys.exit(1)
-
-	# execute new code here
-	formatEvalFile(inputFile, outputFile)
-	
-	return
-
-#----------------------------------------------------------------------------#
-
-if __name__ == '__main__':
-	main(sys.argv[1:])
-
-#----------------------------------------------------------------------------#
-
-# vim: ts=4 sw=4 noet tw=78:

File frequency.py

-# -*- coding: utf-8 -*-
-#----------------------------------------------------------------------------#
-# frequency.py
-# Lars Yencken <lars.yencken@gmail.com>
-# vim: ts=4 sw=4 noet tw=78:
-# Thu Aug 11 16:01:52 EST 2005
-#
-#----------------------------------------------------------------------------#
-
-from jptools import kana
-
-#----------------------------------------------------------------------------#
-
-class FrequencyMap:
-	""" The data structure within which frequency counts for tf-idf
-		calculations are stored.
-	"""
-	def __init__(self):
-		self._graphemes = {}
-
-		self._gSize = 0.0
-		self._gpSize = 0.0
-		self._gpcSize = 0.0
-
-		return
-	
-	def addCounts(self, alignment):
-		""" This method updates all the counts associated with the entry.
-		"""
-		kanjiScript = kana.Script.kanji
-		gSegments, pSegments = alignment
-		for i in range(len(gSegments)):
-			if kana.scriptType(gSegments[i]) == kanjiScript:
-				g, gp, gpc = self._getContext(gSegments, pSegments, i)
-
-				if not self._graphemes.has_key(g):
-					# if we don't have g, we can't have gp, gpc
-					self._graphemes[g] = (1, {gp: (1, {gpc: 1})})
-					self._gSize += 1
-					self._gpSize += 1
-					self._gpcSize += 1
-
-				else:
-					gCount, gpDict = self._graphemes[g]
-					gCount += 1
-					if not gpDict.has_key(gp):
-						# without gp, we also can't have gpc
-						gpDict[gp] = (1, {gpc: 1})
-						self._gpSize += 1
-						self._gpcSize += 1
-					else:
-						gpCount, gpcDict = gpDict[gp]
-						gpCount += 1
-						if not gpcDict.has_key(gpc):
-							gpcDict[gpc] = 1
-							self._gpcSize += 1
-						else:
-							gpcDict[gpc] += 1
-						gpDict[gp] = gpCount, gpcDict
-					self._graphemes[g] = gCount, gpDict
-
-		return
-	
-	def delCounts(self, alignment):
-		""" This method updates all the counts associated with the entry.
-		"""
-		kanjiScript = kana.Script.kanji
-		gSegments, pSegments = alignment
-		for i in range(len(gSegments)):
-			if kana.scriptType(gSegments[i]) == kanjiScript:
-				g, gp, gpc = self._getContext(gSegments, pSegments, i)
-				gCount, gpDict = self._graphemes[g]
-				gCount -= 1
-				if gCount < 1:
-					del self._graphemes[g]
-					self._gSize -= 1
-					continue
-
-				gpCount, gpcDict = gpDict[gp]
-				gpCount -= 1
-				if gpCount < 1:
-					del gpDict[gp]
-					self._gpSize -= 1
-					self._graphemes[g] = gCount, gpDict
-					continue
-
-				gpcCount = gpcDict[gpc]
-				gpcCount -= 1
-				if gpcCount < 1:
-					del gpcDict[gpc]
-					self._gpcSize -= 1
-				else:
-					gpcDict[gpc] = gpcCount
-
-				gpDict[gp] = gpCount, gpcDict
-				self._graphemes[g] = gCount, gpDict
-
-		return
-		
-	def _getContext(self, gSegments, pSegments, index):
-		""" Determine the context needed for calculations or for frequency
-			updates.
-		"""
-		grapheme = gSegments[index]
-		phoneme = pSegments[index]
-
-		# determine the left context...
-		if index > 0:
-			leftG = gSegments[index-1]
-			leftP = pSegments[index-1]
-		else:
-			leftG = None
-			leftP = None
-
-		# ...and the right context 
-		if index < len(gSegments) - 1:
-			rightG = gSegments[index+1]
-			rightP = pSegments[index+1]
-		else:
-			rightG = None
-			rightP = None
-
-		return grapheme, phoneme, (leftG, leftP, rightG, rightP)
-	
-	def frequencies(self, gSegments, pSegments, index):
-		""" Calculates the frequencies of occurrence of the segment specified
-			within the alignment.
-		"""
-		g, gp, gpc = self._getContext(gSegments, pSegments, index)
-
-		gFreq, gpDict = self._graphemes.get(g, (0, {}))
-		gpFreq, gpcDict = gpDict.get(gp, (0, {}))
-		gpcFreq = gpcDict.get(gpc, 0)
-
-		gFreq /= self._gSize
-		gpFreq /= self._gpSize
-		gpcFreq /= self._gpcSize
-
-		return gFreq, gpFreq, gpcFreq
-	
-#----------------------------------------------------------------------------#
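Note: the nested tuples that addCounts() maintains are easier to read as a schema. The shape of _graphemes, with illustrative counts for the kanji 本 as seen in 日本語 and at the start of 本棚:

    # { grapheme: (g_count, { phoneme: (gp_count, { context: gpc_count }) }) }
    # where context = (leftG, leftP, rightG, rightP), None at word edges.
    graphemes = {
        u'本': (2, {
            u'ほん': (2, {
                (u'日', u'に', u'語', u'ご'): 1,     # 日|本|語
                (None, None, u'棚', u'だな'): 1,     # 本|棚
            }),
        }),
    }
    g_count, gp_dict = graphemes[u'本']
    gp_count, gpc_dict = gp_dict[u'ほん']
    assert g_count == gp_count == sum(gpc_dict.values()) == 2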

File okuriganaModel.py

-# -*- coding: utf-8 -*-
-#----------------------------------------------------------------------------#
-# okuriganaModel.py
-# Lars Yencken <lars.yencken@gmail.com>
-# vim: ts=4 sw=4 noet tw=78:
-# Thu Sep  1 16:17:52 EST 2005
-#
-#----------------------------------------------------------------------------#
-
-""" This module provides the OkuriganaModel class.
-"""
-
-#----------------------------------------------------------------------------#
-
-from os import path
-import cPickle as pickle
-from bz2 import BZ2File
-import codecs
-import re
-import pdb
-
-from jptools import kana, smartCache, enum, progressBar
-from jptools.functional import *
-
-from sets import Set
-from entry import Entry
-from readingModel import ReadingModel
-
-#----------------------------------------------------------------------------#
-
-OkuriType = enum.Enum('verb', 'kanjidic', 'cooccurrence')
-VerbType = enum.Enum('ichidan', 'godan', 'suru', 'irregular')
-
-#----------------------------------------------------------------------------#
-
-def potentialOkurigana(entry):
-	""" Determines whether this entry has any potential sites for
-		okurigana.
-	"""
-	assert entry.alignment, "How can an empty entry have okurigana?"
-	hiragana = kana.Script.hiragana
-	kanji = kana.Script.kanji
-
-	gSegments = entry.alignment[0]
-
-	lastSegmentType = hiragana
-	for i in range(len(gSegments)):
-		segmentType = kana.scriptType(gSegments[i])
-
-		if segmentType == hiragana and lastSegmentType == kanji:
-			# potential okurigana case
-			return True
-
-		lastSegmentType = segmentType
-	else:
-		# exhausted this entry, no possible okurigana
-		return False
-
-#----------------------------------------------------------------------------#
-
-def alignmentHasOkurigana(gSegments, pSegments):
-	""" Returns True if the given alignment has okurigana in it, False
-		otherwise.
-	"""
-	for seg in gSegments:
-		if len(kana.scriptBoundaries(seg)) > 1:
-			return True
-	else:
-		return False
-
-#----------------------------------------------------------------------------#
-
-def removeOkurigana(gSegments, pSegments):
-	""" Removes all okurigana from the segmentation.
-	"""
-	new_gSegments = ()
-	new_pSegments = ()
-
-	i = 0
-	while i < len(gSegments):
-		boundaries = kana.scriptBoundaries(gSegments[i])
-		if len(boundaries) == 1:
-			new_gSegments += (gSegments[i],)
-			new_pSegments += (pSegments[i],)
-			i += 1
-		elif len(boundaries) == 2:
-			kanjiPart, kanaPart = boundaries
-			if i == len(gSegments)-1 or kana.scriptType(kanaPart) != \
-					kana.scriptType(gSegments[i+1]):
-				# last segment, or differing segments
-				new_gSegments += (kanjiPart, kanaPart)
-				new_pSegments += (pSegments[i][:-len(kanaPart)],
-						pSegments[i][-len(kanaPart):])
-				i += 1
-			else:
-				# more segments, join with the next segment
-				new_gSegments += (kanjiPart, kanaPart + gSegments[i+1])
-	
-				new_pSegments += (pSegments[i][:-len(kanaPart)],
-						pSegments[i][-len(kanaPart):] + pSegments[i+1])
-
-				i += 2
-		else:
-			raise Exception, "Too many scripts per segment in %s" % gSegments
-
-	return new_gSegments, new_pSegments
-
-#----------------------------------------------------------------------------#
-
-class OkuriganaModel:
-	""" This class provides a verb-conjugation model for GP alignment.
-	"""
-	#------------------------------------------------------------------------#
-	# PUBLIC METHODS
-	#
-
-	def __init__(self, options):
-		""" Creates a new instance by parsing edict for verb conjugation
-			entries.
-		"""
-		print "Creating a new okurigana model"
-		cacheFile = 'data/okuriganaModel.cache'
-		edictFile = 'data/edict.bz2'
-
-		print '--> Cooccurrence threshold set to %d' % options.okuriThreshold
-		self._threshold = options.okuriThreshold
-
-		self._okurigana = smartCache.useCache(cacheFile,
-				threshold=self._threshold)
-
-		if self._okurigana is None:
-			assert path.exists(edictFile)
-
-			print "--> Generating new model from edict"
-			self._okurigana = self._parseEdictEntries(edictFile)
-
-			readingModel = ReadingModel()
-			extraOkurigana = readingModel.getOkurigana()
-			self._addKanjidicOkurigana(extraOkurigana)
-			
-			print "--> Caching model for later"
-			smartCache.recache(self._okurigana, cacheFile, 
-					['okuriganaModel.py'],
-					threshold=self._threshold)
-
-		else:
-			print "--> Using cached model"
-
-		self._evaluationRun = not bool(options.inputFile)
-		self._simpleMode = bool(options.simpleOkurigana)
-
-		# switches to change behaviour
-		self._useKanjidic = options.useKanjidic
-		self._useCooccurrence = options.useCooccurrence
-		self._useVerbs = options.useVerbs
-
-		self._nFixed = 0
-		return
-
-	#------------------------------------------------------------------------#
-
-	def okuriganaAdjustments(self, inputFile, outputFile):
-		""" Reparses the entries in the given file and makes okurigana
-			adjustments where necessary.
-		"""
-		if self._evaluationRun:
-			# read the evaluation input (guaranteed correctly aligned)
-			entryIter = self._evaluationInputIter(inputFile)
-		else:
-			# read regular input from the alignment run (may not be correctly
-			# aligned)
-			entryIter = self._resultsInputIter(inputFile)
-
-		oStream = codecs.open(outputFile, 'w', 'utf8')
-
-		for entry in entryIter:
-			origAlignment = '-'.join((entry.gString_original, entry.pString))
-			if potentialOkurigana(entry):
-				# potential site, attempt to solve it
-				if self._simpleMode:
-					self._solveSimply(entry)
-				else:
-					self._solveOkurigana(entry)
-
-			newAlignment = entry.alignment
-			
-			print >> oStream, ':'.join((
-					origAlignment,
-					'-'.join(map(lambda x: '|'.join(x), newAlignment))
-				))
-
-		print '--> %d cases had shifted alignments' % self._nFixed
-
-		oStream.close()
-
-		return
-
-	#------------------------------------------------------------------------#
-
-	#------------------------------------------------------------------------#
-	# PRIVATE METHODS
-	#
-
-	def _parseEdictEntries(self, edictFile):
-		""" Parses a single edict entry for a verb.
-		"""
-		iStream = codecs.getreader('utf8')(BZ2File(edictFile, 'r'))
-
-		okurigana = {}
-		counts = {}
-
-		for line in iStream:
-			if not kana.hasKanji(line):
-				continue
-
-			gString = line.split()[0]
-
-			# update counts for okurigana clustering
-			self._updateCooccurrence(gString, counts)
-
-			# look for verbSpecific okurigana
-			self._parseVerbDetails(gString, line, okurigana)
-
-		iStream.close()
-
-		self._addCooccurrenceOkurigana(counts, okurigana)
-
-		return okurigana
-	
-	#------------------------------------------------------------------------#
-
-	def _parseVerbDetails(self, gString, line, okurigana):
-		""" Determine whether this line defines a verb, and if so grab its
-			details for conjugation.
-		"""
-		verbTag = re.compile('\((.*,)*v(.*)\)')
-		kanjiScript = kana.Script.kanji
-
-		tagsFound = verbTag.search(line)
-		if not tagsFound:
-			return
-
-		tag = tagsFound.group(2)
-
-		if tag.endswith('-s'):
-			# FIXME ignore special cases for now
-			return
-
-		if tag == '1':
-			verbType = VerbType.ichidan
-		elif tag.startswith('5') and len(tag) <= 2:
-			verbType = VerbType.godan
-		elif tag == 's':
-			verbType = VerbType.suru
-		else:
-			return
-
-		for i in range(len(gString)-1, -1, -1):
-			if kana.scriptType(gString[i]) == kanjiScript:
-				lastKanji = gString[i]
-				trailingKana = gString[i+1:]
-				baseEntry = (trailingKana, verbType)
-
-				if not okurigana.has_key(lastKanji):
-					okurigana[lastKanji] = Set()
-
-				okurigana[lastKanji].add((trailingKana, OkuriType.verb, 
-						verbType))
-
-				break
-		else:
-			raise Exception, 'Error parsing grapheme string:' + `gString`
-
-		return
-	#------------------------------------------------------------------------#
-
-	def _addCooccurrenceOkurigana(self, counts, okurigana):
-		""" Add okurigana cases based on cooccurrence, thresholded to some
-			value.
-		"""
-		keptItems = filter(lambda x: x[1] >= self._threshold, counts.items())
-
-		counts = dict(keptItems)
-
-		for gString, pString in counts.iterkeys():
-			key = gString[-1]
-			thisSet = okurigana.setdefault(key, Set())
-			okurigana[key].add((pString, OkuriType.cooccurrence, None))
-
-		return
-
-	#------------------------------------------------------------------------#
-
-	def _addKanjidicOkurigana(self, kanjidicOkurigana):
-		""" Adds okurigana from kanjidic into the full class dictionary of
-			okurigana instances.
-		"""
-		for kanji, okurigana in kanjidicOkurigana.iteritems():
-			possibleOkurigana = self._okurigana.setdefault(kanji, Set())
-			for case in okurigana:
-				possibleOkurigana.add((case, OkuriType.kanjidic, None))
-
-		return
-
-	#------------------------------------------------------------------------#
-
-	def _addEndings(self, item, endings):
-		return map(lambda x: item + x, endings)
-
-	#------------------------------------------------------------------------#
-
-	def _updateCooccurrence(self, gString, counts):
-		""" Updates counts for each okurigana occurrence.
-		"""
-
-		kanjiScript = kana.Script.kanji
-		hiraganaScript = kana.Script.hiragana
-	
-		segments = list(kana.scriptBoundaries(gString))
-		segments.reverse()
-
-		lastSeg = segments.pop()
-		lastSegType = kana.scriptType(lastSeg)
-
-		while segments:
-			thisSeg = segments.pop()
-			thisSegType = kana.scriptType(thisSeg)
-
-			if thisSegType == hiraganaScript and lastSegType == kanjiScript:
-				feature = lastSeg, thisSeg
-
-				counts[feature] = counts.get(feature, 0) + 1
-
-			lastSeg = thisSeg
-			lastSegType = thisSegType
-
-		return
-
-	#------------------------------------------------------------------------#
-
-	def _conjugate(self, kanaEnding, verbType):
-		""" Returns a list of conjugates of the verb given.
-		"""
-		if verbType == VerbType.ichidan:
-			assert kanaEnding.endswith(u'る')
-			base = kanaEnding[:-1]
-
-			conjugates = self._addEndings(
-					base,
-					[u'て', u'る', u'た', u'ない', u'られる', u'られた',
-					u'られない', u'られて']
-				)
-
-			if len(kanaEnding) > 1:
-				conjugates.append(base)
-
-		elif verbType == VerbType.suru:
-			if kanaEnding.endswith(u'する'):
-				kanaEnding = kanaEnding[:-2]
-
-			conjugates = self._addEndings(
-					kanaEnding,
-					[u'する', u'します', u'して', u'した', u'しない']
-				)
-
-		elif verbType == VerbType.godan:
-			lastChar = kanaEnding[-1]
-			realBase = kanaEnding[:-1]
-
-			assert kana.isLine(lastChar, u'う')
-			conjugates = [kanaEnding]
-
-			masuBase = realBase + kana.toLine(lastChar, u'い')
-			conjugates.append(masuBase)
-			conjugates.append(masuBase + u'ます')
-
-			if lastChar in u'いちり':
-				conjugates.extend([realBase + u'って', realBase + u'った'])
-			elif lastChar in u'みび':
-				conjugates.extend([realBase + u'んで', realBase + u'んだ'])
-			elif lastChar == u'き':
-				conjugates.append(realBase + u'いて')
-			elif lastChar == u'ぎ':
-				conjugates.append(realBase + u'いで')
-
-		return conjugates
-
-	#------------------------------------------------------------------------#
-
-	def _evaluationInputIter(self, filename):
-		""" Provide an iterator over the evaluation input.
-		"""
-		iStream = codecs.open(filename, 'r', 'utf8')
-
-		toSegments = lambda x: tuple(x.split('|'))
-
-		for line in iStream:
-			line = line.strip()
-
-			# we don't care about the correct entries at this stage, so just
-			# get the pre-aligned input
-			alignedInput, _correctTarget = line.split(':')[:2]
-
-			gString, pString = alignedInput.split('-')
-			gSegments = toSegments(gString)
-			pSegments = toSegments(pString)
-
-			assert gSegments and pSegments
-
-			newEntry = Entry(gString, pString)
-			newEntry.aligned = True
-			newEntry.alignment = gSegments, pSegments
-
-			yield newEntry
-
-		iStream.close()
-
-		return
-
-	#------------------------------------------------------------------------#
-
-
-	def _resultsInputIter(self, filename):
-		""" Iterates over the entries in a results file (directly output from
-			the alignment script).
-		"""
-		iStream = codecs.open(filename, 'r', 'utf8')
-		entries = []
-
-		toSegments = lambda x: tuple(x.split('|'))
-
-		for line in iStream:
-			line = line.strip()
-
-			# although we also have the unaligned input, ignore it for now
-			_unalignedInput, alignedInput = line.split(':')[:2]
-
-			gString, pString = alignedInput.split('-')
-			gSegments = toSegments(gString)
-			pSegments = toSegments(pString)
-
-			assert gSegments and pSegments
-
-			newEntry = Entry(gString, pString)
-			newEntry.aligned = True
-			newEntry.alignment = gSegments, pSegments
-
-			yield newEntry
-
-		iStream.close()
-
-		return
-
-	#------------------------------------------------------------------------#
-
-	def _solveSimply(self, entry):
-		""" Resolves this case by simply assuming that every site of potential
-			okurigana is okurigana, and just removing all kanji->kana
-			boundaries. 
-		"""
-		hiragana = kana.Script.hiragana
-		kanji = kana.Script.kanji
-
-		gSegments = entry.alignment[0]
-		i = 1
-		while i < len(gSegments):
-			lastSegmentType = kana.scriptType(gSegments[i-1])
-			segmentType = kana.scriptType(gSegments[i])
-
-			if segmentType == hiragana and lastSegmentType == kanji and \
-					gSegments[i] not in (u'の', u'が'):
-				# potential okurigana case; solve it, then advance by a
-				# variable increment
-				i += self._shiftSegments(entry, gSegments[i], i)
-			else:
-				i += 1
-
-			gSegments = entry.alignment[0]
-
-		return
-
-	#------------------------------------------------------------------------#
-
-	def _solveOkurigana(self, entry):
-		""" Resolves this case using our full model.
-		"""
-		hiragana = kana.Script.hiragana
-		kanji = kana.Script.kanji
-
-		gSegments = entry.alignment[0]
-		i = 1
-		while i < len(gSegments):
-			lastSegmentType = kana.scriptType(gSegments[i-1])
-			segmentType = kana.scriptType(gSegments[i])
-
-			if segmentType == hiragana and lastSegmentType == kanji:
-				# potential okurigana case; solve it, then advance by a
-				# variable increment
-				i += self._solveSingleCase(entry, i)
-			else:
-				i += 1
-
-			gSegments = entry.alignment[0]
-
-		return
-
-	#------------------------------------------------------------------------#
-
-	def _solveSingleCase(self, entry, i, default=1):
-		""" A potential case of okurigana. Determine if our verb conjugation
-			model solves this case.
-		"""
-		assert entry.alignment, "We've got an empty alignment Scotty!!!"
-		gSegments, pSegments = entry.alignment
-
-		kanjiIndex = gSegments[i-1][-1]
-
-		if not self._okurigana.has_key(kanjiIndex):
-			return default
-
-		baseOkuriOptions = self._okurigana[kanjiIndex]
-		kanaOptions = Set()
-		for trailingKana, okuriType, subType in baseOkuriOptions:
-			if okuriType == OkuriType.verb and self._useVerbs:
-				# verb okurigana
-				kanaOptions.update(self._conjugate(trailingKana, subType))
-			elif okuriType == OkuriType.kanjidic and self._useKanjidic:
-				# unknown okurigana type from kanjidic
-				kanaOptions.add(trailingKana)
-			elif okuriType == OkuriType.cooccurrence and self._useCooccurrence:
-				# unknown okurigana type from cooccurrence
-				kanaOptions.add(trailingKana)
-			
-		# make a list of all potential matches
-		potentialHits = []
-		for trailingKana in kanaOptions:
-			if gSegments[i].startswith(trailingKana):
-				potentialHits.append((len(trailingKana), trailingKana))
-
-		if potentialHits:
-			# choose the longest match
-			matchedKana = max(potentialHits)[1]
-		elif gSegments[i] in (u'の', u'が'):
-			return default
-		else:
-			# XXX if we can't match, just match the whole thing =)
-			matchedKana = gSegments[i]
-
-		increment = self._shiftSegments(entry, matchedKana, i)
-
-		return increment
-
-	#------------------------------------------------------------------------#
-
-	def _shiftSegments(self, entry, kanaPrefix, i):
-		""" Upon finding a clear case of okurigana, this method is called to
-			modify the entry.
-		"""
-		assert entry.alignment, "Need a valid alignment to start with"
-		gSegments, pSegments = entry.alignment
-		self._nFixed += 1
-
-		sharedSegments = zip(gSegments, pSegments)
-
-		thisSeg = sharedSegments[i]
-		lastSeg = sharedSegments[i-1]
-
-		if len(thisSeg[1]) == len(kanaPrefix):
-			# simply remove this segment boundary
-			newSeg = lastSeg[0] + thisSeg[0], lastSeg[1] + thisSeg[1]
-
-			newSegments = sharedSegments[:i-1] + [newSeg] + sharedSegments[i+1:]
-
-			entry.alignment = map(tuple, unzip(newSegments))
-
-			return 0
-		else:
-			# shift the segment boundary
-			shiftSize = len(kanaPrefix)
-
-			(gSegA, gSegB), (pSegA, pSegB) = unzip((lastSeg,thisSeg))
-			gSegA += gSegB[:shiftSize]
-			gSegB = gSegB[shiftSize:]
-			pSegA += pSegB[:shiftSize]
-			pSegB = pSegB[shiftSize:]
-
-			lastSeg, thisSeg = zip([gSegA, gSegB], [pSegA, pSegB])
-
-			newSegments = sharedSegments[:i-1] + [lastSeg, thisSeg] + \
-					sharedSegments[i+1:]
-
-			entry.alignment = map(tuple, unzip(newSegments))
-
-			return 1
-
-	#------------------------------------------------------------------------#
-
-#----------------------------------------------------------------------------#
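Note: the two-boundary case that removeOkurigana() handles is the subtle one. A sketch with a simplified scriptBoundaries stand-in (jptools supplies the real one), splitting the kana tail off a kanji+kana grapheme segment along with the matching number of phoneme characters:

    def script_boundaries(segment):
        # Simplified stand-in for kana.scriptBoundaries(): split into
        # maximal same-script (kanji vs non-kanji) chunks.
        is_kanji = lambda c: u'\u4e00' <= c <= u'\u9fff'
        chunks = []
        for char in segment:
            if chunks and is_kanji(chunks[-1][0]) == is_kanji(char):
                chunks[-1] += char
            else:
                chunks.append(char)
        return tuple(chunks)

    # A hypothetical segment pair from an alignment:
    g_seg, p_seg = u'思い', u'おもい'
    kanji_part, kana_part = script_boundaries(g_seg)
    new_g = (kanji_part, kana_part)
    new_p = (p_seg[:-len(kana_part)], p_seg[-len(kana_part):])
    assert (new_g, new_p) == ((u'思', u'い'), (u'おも', u'い'))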

File paramSearch-align.py

-#!/usr/bin/python
-# -*- coding: utf-8 -*-
-#----------------------------------------------------------------------------#
-# paramSearch-align.py
-# Lars Yencken <lars.yencken@gmail.com>
-# vim: ts=4 sw=4 noet tw=78:
-# Mon Sep 12 15:09:37 EST 2005
-#
-#----------------------------------------------------------------------------#
-
-import re
-import os, sys
-import csv
-import pdb
-import warnings
-
-warnings.filterwarnings("ignore")
-
-from functional import frange
-import stats
-
-argv = sys.argv[1:]
-
-if len(argv) < 1:
-	print >> sys.stderr, 'Usage: ./paramSearch-align.py output.csv [extra args]'
-	sys.exit(1)
-
-outputFile = argv[0]
-argString = ' '.join(argv[1:])
-
-totalGood = re.compile('good[ \t]+([0-9]+)[ \t]+')
-
-alphaRange = frange(0.1, 2.1, 0.2)
-sRange = frange(0.1, 3.1, 0.2)
-uRange = frange(0.1, 3.1, 0.2)
-
-dataFile = csv.writer(open(outputFile, 'w'))
-tmpDir = os.tempnam('/tmp', 'param')
-os.mkdir(tmpDir)
-outputFile = os.path.join(tmpDir, 'align.out')
-
-header = ('alpha', 's', 'u', 'good')
-dataFile.writerow(header)
-header += ('best',)
-line = '%10s %10s %10s %10s %10s' % header
-print line
-print '-'*(len(line)+4)
-
-best = 0
-for alpha, s, u in stats.combinations([alphaRange, sRange