Jacob Perkins committed 8644b1e Merge

merge


Files changed (6)

classify_corpus.py

 
 source_corpus = load_corpus_reader(args.source_corpus, args.reader)
 
+if not source_corpus:
+	raise ValueError('%s is an unknown corpus' % args.source_corpus)
+
+if args.trace:
+	print 'loaded %s' % args.source_corpus
+
 ########################
 ## text normalization ##
 ########################

nltk_trainer/__init__.py

 	mod = __import__(basepath, globals(), locals(), [name])
 	return getattr(mod, name)
 
-def load_corpus_reader(corpus, reader=None, fileids=None, **kwargs):
+def load_corpus_reader(corpus, reader=None, fileids=None, sent_tokenizer=None, word_tokenizer=None, **kwargs):
 	if corpus == 'timit':
 		return LazyCorpusLoader('timit', NumberedTaggedSentCorpusReader,
 			'.+\.tags', tag_mapping_function=simplify_wsj_tag)
 			except LookupError:
 				raise ValueError('cannot find corpus path for %s' % corpus)
 		
+		if sent_tokenizer and isinstance(sent_tokenizer, basestring):
+			kwargs['sent_tokenizer'] = nltk.data.load(sent_tokenizer)
+		
+		if word_tokenizer and isinstance(word_tokenizer, basestring):
+			kwargs['word_tokenizer'] = import_attr(word_tokenizer)()
+		
 		reader_cls = import_attr(reader)
 		real_corpus = reader_cls(root, fileids, **kwargs)
 	
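
The new sent_tokenizer and word_tokenizer arguments accept string names and resolve them to objects: a sentence tokenizer is loaded from a pickle with nltk.data.load, while a word tokenizer class is imported and instantiated. A minimal usage sketch, assuming a plaintext corpus directory named my_corpus exists under an nltk_data path (the corpus name is illustrative):

from nltk_trainer import load_corpus_reader

# hypothetical call; 'my_corpus' is an assumed corpus name
corpus = load_corpus_reader('my_corpus',
	reader='nltk.corpus.reader.plaintext.PlaintextCorpusReader',
	sent_tokenizer='tokenizers/punkt/english.pickle',
	word_tokenizer='nltk.tokenize.WordPunctTokenizer')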

nltk_trainer/writer/__init__.py

+import codecs, os, os.path
+
+class CorpusWriter(object):
+	def __init__(self, fileids, path='~/nltk_data/corpora', mode='a', encoding='utf-8', trace=1):
+		assert fileids and path and mode
+		self.mode = mode
+		self.encoding = encoding
+		self.trace = trace or 0
+		self.full_path = os.path.expanduser(path)
+		
+		for dirname in set([os.path.dirname(fileid) for fileid in fileids]):
+			dirpath = os.path.join(self.full_path, dirname)
+			
+			if not os.path.exists(dirpath):
+				if trace:
+					print 'making directory %s' % dirpath
+				
+				os.makedirs(dirpath)
+		
+		self.fileids = [os.path.join(self.full_path, fileid) for fileid in fileids]
+		self.files = {}
+	
+	def get_file(self, fileid):
+		if not fileid.startswith(self.full_path):
+			fileid = os.path.join(self.full_path, fileid)
+		
+		if fileid not in self.files:
+			self.files[fileid] = codecs.open(fileid, self.mode, self.encoding)
+		
+		return self.files[fileid]
+	
+	def open(self):
+		for fileid in self.fileids:
+			if self.trace:
+				print 'opening %s' % fileid
+			
+			self.get_file(fileid)
+		
+		return self
+	
+	def close(self, *args, **kwargs):
+		for fileid, f in self.files.items():
+			if self.trace:
+				print 'closing %s' % fileid
+			
+			f.close()
+			del self.files[fileid]
+	
+	__enter__ = open
+	__exit__ = close
+	__del__ = close
+	
+	def write(self, s, fileid=None):
+		if not fileid:
+			fileid = self.fileids[0]
+		
+		self.get_file(fileid).write(s)
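
Because __enter__ is aliased to open and __exit__ to close, a CorpusWriter can manage its files with the with statement. A minimal sketch, assuming a writable ~/nltk_data/corpora directory (the fileid below is illustrative):

from nltk_trainer.writer import CorpusWriter

# 'example/test.txt' is an assumed fileid; missing directories are created on init
with CorpusWriter(['example/test.txt']) as writer:
	writer.write(u'some text\n')  # no fileid given, so the first fileid is used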

nltk_trainer/writer/chunked.py

+from nltk.tag.util import tuple2str
+from nltk_trainer.writer import CorpusWriter
+
+class ChunkedCorpusWriter(CorpusWriter):
+	def chunked_sent_string(self, sent):
+		parts = []
+		
+		for word, tag in sent:
+			try:
+				brack = word in u'[]'
+			except TypeError:
+				# word is not a string, so it cannot be a bracket
+				brack = False
+			
+			if brack:
+				# brackets don't get a tag
+				parts.append(word)
+			else:
+				# make sure no brackets or slashes in tag
+				tag = tag.replace(u'[', u'(').replace(u']', u')').replace(u'/', u'|')
+				parts.append(tuple2str((word, tag)))
+		
+		return ' '.join(parts)
+	
+	def write_sents(self, sents, *args, **kwargs):
+		first = True
+		
+		for sent in sents:
+			if not first:
+				self.write(' ', *args, **kwargs)
+			else:
+				first = False
+			
+			self.write(self.chunked_sent_string(sent), *args, **kwargs)
+	
+	def write_paras(self, paras, *args, **kwargs):
+		first = True
+		
+		for para in paras:
+			if not first:
+				self.write('\n\n', *args, **kwargs)
+			else:
+				first = False
+			
+			self.write_sents(para, *args, **kwargs)
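
For reference, chunked_sent_string flattens a tagged sentence into the word/tag format used by NLTK's chunked corpus readers, leaving chunk brackets untagged. An illustrative call with made-up tokens (note that instantiating the writer creates the target directory):

writer = ChunkedCorpusWriter(['example/chunked.txt'])  # assumed fileid
sent = [(u'[', None), (u'the', u'DT'), (u'cat', u'NN'), (u']', None), (u'sat', u'VBD')]
print writer.chunked_sent_string(sent)
# [ the/DT cat/NN ] sat/VBD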

nltk_trainer/writer/classified.py

+import codecs, os.path
+
+from nltk_trainer.writer import CorpusWriter
+
+class ClassifiedCorpusWriter(CorpusWriter):
+	def __init__(self, path, labels):
+		self.path = path
+		self.labels = labels
+	
+	def __enter__(self):
+		# one append-mode file per label, so the writer works with the with keyword
+		self._files = dict([(l, codecs.open(os.path.join(self.path, l), 'a', 'utf-8')) for l in self.labels])
+		return self
+	
+	def __exit__(self, *args):
+		for f in self._files.values():
+			f.close()
+	
+	def write(self, text, label):
+		self._files[label].write(text + u'\n\n')
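
A sketch of the intended usage, writing one file per label under the target path; the path and labels here are assumed for illustration:

import os

path = '/tmp/example_classified'  # assumed target directory
if not os.path.exists(path):
	os.makedirs(path)

with ClassifiedCorpusWriter(path, ['pos', 'neg']) as writer:  # assumed labels
	writer.write(u'great movie', 'pos')
	writer.write(u'terrible movie', 'neg')
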
+import argparse, os.path
+import cPickle as pickle
+import nltk.data, nltk.tag
+from nltk_trainer import load_corpus_reader
+from nltk_trainer.writer.chunked import ChunkedCorpusWriter
+
+########################################
+## command options & argument parsing ##
+########################################
+
+# TODO: many of the args are shared with analyze_classifier_coverage, so abstract
+
+parser = argparse.ArgumentParser(description='Tag a plaintext corpus and write the result as a chunked corpus')
+# TODO: make sure source_corpus can be a single file
+parser.add_argument('source_corpus', help='corpus name/path relative to an nltk_data directory')
+parser.add_argument('target_corpus', help='corpus name/path relative to an nltk_data directory')
+parser.add_argument('--trace', default=1, type=int,
+	help='How much trace output you want, defaults to 1. 0 is no trace output.')
+parser.add_argument('--tagger', default=nltk.tag._POS_TAGGER,
+	help='''pickled tagger filename/path relative to an nltk_data directory,
+defaults to NLTK's default tagger''')
+
+# TODO: from analyze_tagged_corpus.py
+corpus_group = parser.add_argument_group('Corpus Reader Options')
+corpus_group.add_argument('--reader',
+	default='nltk.corpus.reader.plaintext.PlaintextCorpusReader',
+	help='Full module path to a corpus reader class, defaults to %(default)s.')
+corpus_group.add_argument('--fileids', default=None,
+	help='Specify fileids to load from corpus')
+corpus_group.add_argument('--sent-tokenizer', default='tokenizers/punkt/english.pickle',
+	help='Path to pickled sentence tokenizer')
+corpus_group.add_argument('--word-tokenizer', default='nltk.tokenize.WordPunctTokenizer',
+	help='Full module path to a tokenizer class, defaults to %(default)s.')
+
+args = parser.parse_args()
+
+###################
+## corpus reader ##
+###################
+
+source_corpus = load_corpus_reader(args.source_corpus, reader=args.reader,
+	fileids=args.fileids, encoding='utf-8', sent_tokenizer=args.sent_tokenizer,
+	word_tokenizer=args.word_tokenizer)
+
+if not source_corpus:
+	raise ValueError('%s is an unknown corpus' % args.source_corpus)
+
+if args.trace:
+	print 'loaded %s' % args.source_corpus
+
+############
+## tagger ##
+############
+
+# TODO: from analyze_tagger_coverage.py
+if args.trace:
+	print 'loading tagger %s' % args.tagger
+
+try:
+	tagger = nltk.data.load(args.tagger)
+except LookupError:
+	# not an nltk_data resource, assume a pickled tagger file path
+	tagger = pickle.load(open(os.path.expanduser(args.tagger), 'rb'))
+
+#############
+## tagging ##
+#############
+
+with ChunkedCorpusWriter(fileids=source_corpus.fileids(), path=args.target_corpus) as writer:
+	for fileid in source_corpus.fileids():
+		paras = source_corpus.paras(fileids=[fileid])
+		tagged_paras = ((tagger.tag(sent) for sent in para) for para in paras)
+		writer.write_paras(tagged_paras, fileid=fileid)
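
For context, tagger.tag takes one tokenized sentence and returns a list of (word, tag) pairs, which ChunkedCorpusWriter then serializes paragraph by paragraph. A quick illustrative call using NLTK's default tagger (the output tags are indicative only):

import nltk.data, nltk.tag

tagger = nltk.data.load(nltk.tag._POS_TAGGER)
print tagger.tag(['A', 'simple', 'sentence', '.'])
# e.g. [('A', 'DT'), ('simple', 'JJ'), ('sentence', 'NN'), ('.', '.')]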