Commits

Jacob Perkins committed 3ebabb1

feature value type can be int for word/ngram counts

  • Parent commit: a27b04c


Files changed (3)

File nltk_trainer/classification/args.py

 			
 			if not sparse and args.trace:
 				print 'using dense matrix'
-			# TODO: option for dtype
-			classifier_train = scikitlearn.SklearnClassifier(Pipeline(pipe), dtype=bool, sparse=sparse).train
+			
+			if args.value_type == 'bool':
+				dtype = bool
+			elif args.value_type == 'int':
+				dtype = int
+			else:
+				dtype = float
+			
+			classifier_train = scikitlearn.SklearnClassifier(Pipeline(pipe), dtype=dtype, sparse=sparse).train
 		else:
 			if algo != 'Maxent':
 				classifier_train_kwargs['algorithm'] = algo

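For illustration only (not part of the commit), the dtype selected above changes how SklearnClassifier encodes feature values when it builds its matrix: bool collapses counts to presence/absence, while int keeps them. A minimal numpy sketch, assuming count-valued featuresets like those produced by word_counts():

	import numpy as np

	counts = [3, 1, 0]                     # e.g. values from a word_counts() featureset
	print(np.array(counts, dtype=bool))    # [ True  True False] -- presence/absence only
	print(np.array(counts, dtype=int))     # [3 1 0]             -- counts preserved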
File nltk_trainer/classification/featx.py

 import math
+from nltk import probability
 
 def bag_of_words(words):
 	return dict([(word, True) for word in words])
 def bag_of_words_in_set(words, wordset):
 	return bag_of_words(set(words) & wordset)
 
+def word_counts(words):
+	return dict(probability.FreqDist(words))
+
+def word_counts_in_set(words, wordset):
+	return word_counts((w for w in words if w in wordset))
+
 def train_test_feats(label, instances, featx=bag_of_words, fraction=0.75):
 	labeled_instances = [(featx(i), label) for i in instances]
 	

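As a rough illustration (not part of the diff) of how the new extractors differ from the existing bag_of_words, assuming featx is importable and ignoring dict key order:

	>>> bag_of_words(['great', 'great', 'film'])
	{'great': True, 'film': True}
	>>> word_counts(['great', 'great', 'film'])
	{'great': 2, 'film': 1}
	>>> word_counts_in_set(['great', 'great', 'film'], set(['great']))
	{'great': 2}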
File train_classifier.py

 from nltk.util import ngrams
 from nltk_trainer import dump_object, import_attr, load_corpus_reader
 from nltk_trainer.classification import corpus, scoring
-from nltk_trainer.classification.featx import bag_of_words, bag_of_words_in_set, train_test_feats
+from nltk_trainer.classification.featx import (bag_of_words, bag_of_words_in_set,
+	word_counts, word_counts_in_set, train_test_feats)
 from nltk_trainer.classification.multi import MultiBinaryClassifier
 
 ########################################
 	help='language stopwords to filter, defaults to "no" to keep stopwords')
 feat_group.add_argument('--punctuation', action='store_true', default=False,
 	help="don't strip punctuation")
+feat_group.add_argument('--value-type', default='bool', choices=('bool', 'int'),
+	help='''Data type of values in featuresets. The default is bool, which ignores word counts.
+	Use int to get word and/or ngram counts.''')
 
 score_group = parser.add_argument_group('Feature Scoring',
 	'The default is no scoring, all words are included as features')
 		ws = ws[:args.max_feats]
 	
 	bestwords = set([w for (w, s) in ws])
-	featx = lambda words: bag_of_words_in_set(words, bestwords)
+	
+	if args.value_type == 'bool':
+		if args.trace:
+			print 'using bag of words from known set feature extraction'
+		
+		featx = lambda words: bag_of_words_in_set(words, bestwords)
+	elif args.value_type == 'int':
+		if args.trace:
+			print 'using word counts from known set feature extraction'
+		
+		featx = lambda words: word_counts_in_set(words, bestwords)
 	
 	if args.trace:
 		print '%d words meet min_score and/or max_feats' % len(bestwords)
+elif args.value_type == 'bool':
+	if args.trace:
+		print 'using bag of words feature extraction'
+	
+	featx = bag_of_words
+elif args.value_type == 'int':
+	if args.trace:
+		print 'using word counts feature extraction'
+	
+	featx = word_counts
 else:
-	featx = bag_of_words
+	raise ValueError('unknown value type %s' % args.value_type)
 
 #####################
 ## text extraction ##
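Taken together, the new --value-type option selects the int branches above, so word_counts (or word_counts_in_set when feature scoring is enabled) produces count-valued featuresets and args.py passes dtype=int to SklearnClassifier. A hypothetical invocation (the movie_reviews corpus argument is illustrative, not from this commit):

	python train_classifier.py movie_reviews --value-type int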