Jacob Perkins avatar Jacob Perkins committed b93ee04

tfidf transformer, more classifiers & args cleanup

Comments (0)

Files changed (1)


 classifier_choices = ['NaiveBayes', 'DecisionTree', 'Maxent'] + MaxentClassifier.ALGORITHMS
+dense_classifiers = set(['ExtraTreesClassifier', 'GradientBoostingClassifier',
+		'RandomForestClassifier', 'GaussianNB', 'DecisionTreeClassifier'])
+verbose_classifiers = set(['RandomForestClassifier', 'SVC'])
 	from nltk.classify import scikitlearn
+	from sklearn.feature_extraction.text import TfidfTransformer
 	from sklearn.pipeline import Pipeline
-	from sklearn import linear_model, naive_bayes, neighbors, svm, tree
+	from sklearn import ensemble, feature_selection, linear_model, naive_bayes, neighbors, svm, tree
 	classifiers = [
+		ensemble.ExtraTreesClassifier,
+		ensemble.GradientBoostingClassifier,
+		ensemble.RandomForestClassifier,
 		#linear_model.SGDClassifier, # NOTE: this seems terrible, but could just be the options
-		#naive_bayes.GaussianNB, # TODO: requires a dense matrix
+		naive_bayes.GaussianNB,
 		neighbors.KNeighborsClassifier, # TODO: options for nearest neighbors
-		#tree.DecisionTreeClassifier, # TODO: requires a dense matrix
+		tree.DecisionTreeClassifier,
 	sklearn_classifiers = {}
 	decisiontree_group.add_argument('--support_cutoff', default=10, type=int,
 		help='default is 10')
-sklearn_kwargs = {}
+sklearn_kwargs = {
+	# ensemble
+	'ExtraTreesClassifier': ['criterion', 'max_feats', 'depth_cutoff', 'n_estimators'],
+	'GradientBoostingClassifier': ['learn_rate', 'max_feats', 'depth_cutoff', 'n_estimators'],
+	'RandomForestClassifier': ['criterion', 'max_feats', 'depth_cutoff', 'n_estimators'],
+	# linear_model
+	'LogisticRegression': ['C','penalty'],
+	# naive_bayes
+	'BernoulliNB': ['alpha'],
+	'MultinomialNB': ['alpha'],
+	# svm
+	'LinearSVC': ['C', 'loss', 'penalty'],
+	'NuSVC': ['nu', 'kernel'],
+	'SVC': ['C', 'kernel'],
+	# tree
+	'DecisionTreeClassifier': ['criterion', 'max_feats', 'depth_cutoff'],
 def add_sklearn_args(parser):
 	if not sklearn_classifiers: return
 	sklearn_group = parser.add_argument_group('sklearn Classifiers',
-		'These options are common to many of the sklearn classification algorithms.')
+		'These options are used by one or more sklearn classification algorithms.')
 	sklearn_group.add_argument('--alpha', type=float, default=1.0,
 		help='smoothing parameter for naive bayes classifiers, default is %(default)s')
 	sklearn_group.add_argument('--C', type=float, default=1.0,
 		help='penalty parameter, default is %(default)s')
-	sklearn_group.add_argument('--penalty', choices=['l1', 'l2'],
-		default='l2', help='norm for penalization, default is %(default)s')
 	sklearn_group.add_argument('--kernel', default='rbf',
 		choices=['linear', 'poly', 'rbf', 'sigmoid', 'precomputed'],
 		help='kernel type for support vector machine classifiers, default is %(default)s')
-	sklearn_kwargs['LogisticRegression'] = ['C','penalty']
-	sklearn_kwargs['BernoulliNB'] = ['alpha']
-	sklearn_kwargs['MultinomialNB'] = ['alpha']
-	sklearn_kwargs['SVC'] = ['C', 'kernel']
-	linear_svc_group = parser.add_argument_group('sklearn Linear Support Vector Machine Classifier',
-		'These options only apply when a sklearn.LinearSVC classifier is chosen.')
-	linear_svc_group.add_argument('--loss', choices=['l1', 'l2'],
+	sklearn_group.add_argument('--learn_rate', type=float, default=0.1,
+		help='learning rate, default is %(default)s')
+	sklearn_group.add_argument('--loss', choices=['l1', 'l2'],
 		default='l2', help='loss function, default is %(default)s')
-	sklearn_kwargs['LinearSVC'] = ['C', 'loss', 'penalty']
-	nu_svc_group = parser.add_argument_group('sklearn Nu Support Vector Machine Classifier',
-		'These options only apply when a sklearn.NuSVC classifier is chosen.')
-	nu_svc_group.add_argument('--nu', type=float, default=0.5,
+	sklearn_group.add_argument('--n_estimators', type=int, default=10,
+		help='Number of trees for Decision Tree ensembles, default is %(default)s')
+	sklearn_group.add_argument('--nu', type=float, default=0.5,
 		help='upper bound on fraction of training errors & lower bound on fraction of support vectors, default is %(default)s')
-	sklearn_kwargs['NuSVC'] = ['nu', 'kernel']
+	sklearn_group.add_argument('--penalty', choices=['l1', 'l2'],
+		default='l2', help='norm for penalization, default is %(default)s')
+	sklearn_group.add_argument('--tfidf', default=False, action='store_true',
+		help='Use TfidfTransformer')
+	sklearn_group.add_argument('--criterion', choices=['gini', 'entropy'],
+		default='gini', help='Split quality function, default is %(default)s')
+# for mapping existing args to sklearn args
+sklearn_keys = {
+	'max_feats': 'max_features',
+	'depth_cutoff': 'max_depth'
 def make_sklearn_classifier(algo, args):
 	name = algo.split('.', 1)[1]
 	for key in sklearn_kwargs.get(name, []):
 		val = getattr(args, key)
-		if val is not None: kwargs[key] = val
+		if val: kwargs[sklearn_keys.get(key, key)] = val
 	if args.trace and kwargs:
 		print 'training %s with %s' % (algo, kwargs)
+	if args.trace and name in verbose_classifiers:
+		kwargs['verbose'] = True
 	return sklearn_classifiers[name](**kwargs)
 def make_classifier_builder(args):
 			classifier_train = NaiveBayesClassifier.train
 		elif algo.startswith('sklearn.'):
 			# TODO: support many options for building an estimator pipeline
-			estimator = Pipeline([('classifier', make_sklearn_classifier(algo, args))])
+			pipe = [('classifier', make_sklearn_classifier(algo, args))]
+			if args.tfidf:
+				if args.trace:
+					print 'using tfidf transformer with norm %s' % args.penalty
+				pipe.insert(0, ('tfidf', TfidfTransformer(norm=args.penalty)))
+			sparse = pipe[-1][1].__class__.__name__ not in dense_classifiers
+			if not sparse and args.trace:
+				print 'using dense matrix'
 			# TODO: option for dtype
-			classifier_train = scikitlearn.SklearnClassifier(estimator, dtype=bool).train
+			classifier_train = scikitlearn.SklearnClassifier(Pipeline(pipe), dtype=bool, sparse=sparse).train
 			if algo != 'Maxent':
 				classifier_train_kwargs['algorithm'] = algo
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.