Commits

Jacob Perkins committed 58ad943

value type float for tfidf

  • Participants
  • Parent commits 3ebabb1

Comments (0)

Files changed (2)

File nltk_trainer/classification/args.py

 			if not sparse and args.trace:
 				print 'using dense matrix'
 			
-			if args.value_type == 'bool':
+			if args.value_type == 'bool' and not args.tfidf:
 				dtype = bool
-			elif args.value_type == 'int':
+			elif args.value_type == 'int' and not args.tfidf:
 				dtype = int
 			else:
 				dtype = float
 			
+			if args.trace:
+				print 'using dtype %s' % dtype.__name__
+			
 			classifier_train = scikitlearn.SklearnClassifier(Pipeline(pipe), dtype=dtype, sparse=sparse).train
 		else:
 			if algo != 'Maxent':

File train_classifier.py

 
 feat_group = parser.add_argument_group('Feature Extraction',
 	'The default is to lowercase every word, strip punctuation, and use stopwords')
-feat_group.add_argument('--ngrams', action='append', type=int,
+feat_group.add_argument('--ngrams', nargs='+', type=int,
 	help='use n-grams as features.')
 feat_group.add_argument('--no-lowercase', action='store_true', default=False,
 	help="don't lowercase every word")
 	help='language stopwords to filter, defaults to "no" to keep stopwords')
 feat_group.add_argument('--punctuation', action='store_true', default=False,
 	help="don't strip punctuation")
-feat_group.add_argument('--value-type', default='bool', choices=('bool', 'int'),
+feat_group.add_argument('--value-type', default='bool', choices=('bool', 'int', 'float'),
 	help='''Data type of values in featuresets. The default is bool, which ignores word counts.
 	Use int to get word and/or ngram counts.''')
 
 			print 'using bag of words from known set feature extraction'
 		
 		featx = lambda words: bag_of_words_in_set(words, bestwords)
-	elif args.value_type == 'int':
+	else:
 		if args.trace:
 			print 'using word counts from known set feature extraction'
 		
 		print 'using bag of words feature extraction'
 	
 	featx = bag_of_words
-elif args.value_type == 'int':
+else:
 	if args.trace:
 		print 'using word counts feature extraction'
 	
 	featx = word_counts
-else:
-	raise ValueError('unknown value type %s' % args.value_type)
 
 #####################
 ## text extraction ##