Commits

Jure Žbontar committed 7ce17cf

optimization: build Orange Tables with an explicit Domain (inferring one from numpy is slow); restructure the experiment script and switch per-label averaging to nanmean


Files changed (3)

 import sklearn.linear_model
 import sklearn.svm
 import sklearn.ensemble
 
 from hyperopt import hyperopt
 
-
 data = sys.argv.pop(1)
 
 if data == '1':
     X = np.loadtxt('data/D1-2/100-genes-X.tab')
     Y = np.loadtxt('data/D1-2/100-genes-Y.tab')
 
+
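+# drop near-constant columns (std ~ 0) so standardization never divides by ~0,
+# then z-score the remaining features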
+X = X[:,np.std(X, axis=0) > 1e-7]
+X = (X - np.mean(X, axis=0)) / np.std(X, axis=0)
+
 # remove classes with few positive or negative examples
 s = np.minimum(np.sum(Y, axis=0), np.sum(1 - Y, axis=0))
 Y = Y[:,s >= 10]
 
-data = Orange.data.Table(X, Y)
-if sys.argv[1] == 'lr':
-    params = [
-        {'type': 'c', 'categories': ('l2', 'l1')},  # penalty
-        {'type': 'c', 'categories': (0, 1)},  # dual
-        {'type': 'f', 'trans': lambda i: 2**i},  # C
-        {'type': 'c', 'categories': (1, 0)},  # fit_intercept
-        {'type': 'f', 'trans': lambda i: 3**i},  # intercept_scaling
-        {'type': 'f', 'trans': lambda i: 2**(i - 10)},  # tol
-    ]
-    hyperopt('lr_sk', params)
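+# exactly one of the three numbered experiment branches below runs;
+# flip the `if 0` / `elif 0` guards to switch between them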
+if 0:
+    print(1)
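+    # map positional CLI arguments to typed kwargs according to the param specs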
+    def parse(params, argv):
+        kwargs = {}
+        for p, a in zip(params, argv):
+            if p['type'] == 'f':
+                kwargs[p['name']] = float(a)
+            elif p['type'] == 'i':
+                kwargs[p['name']] = int(a)
+            elif p['type'] == 'c':
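+                # categorical: the type of the first category tells us how to cast the argument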
+                val = p['categories'][0]
+                if isinstance(val, str):
+                    kwargs[p['name']] = a
+                elif isinstance(val, int):
+                    kwargs[p['name']] = int(a)
+                else:
+                    assert False
+            else:
+                assert False
+        return kwargs
 
-    model = mtc.BRsklearnLearner(sklearn.linear_model.LogisticRegression())
-elif sys.argv[1] == 'rf':
-    if sys.argv[2] == 'hyperopt':
+    data = Orange.data.Table(X, Y)
+    if sys.argv[1] == 'lr':
         params = [
-            {'name': 'n_estimators', 'type': 'i', 'trans': lambda i: 2**(i + 6)}, # n_estimators
-            {'type': 'c', 'categories': ('gini', 'entropy')}, # criterion
-            {'type': 'f', 'trans': lambda i: i * 0.02 + 0.1}, # max_features
+            {'type': 'c', 'name': 'penalty', 'categories': ('l2', 'l1')},
+            {'type': 'c', 'name': 'dual', 'categories': (0, 1)},
+            {'type': 'f', 'name': 'C', 'trans': lambda i: 2**i},
+            {'type': 'c', 'name': 'fit_intercept', 'categories': (1, 0)},
+            {'type': 'f', 'name': 'intercept_scaling', 'trans': lambda i: 3**i},
+            {'type': 'f', 'name': 'tol', 'trans': lambda i: 2**(i - 10)},
         ]
-        hyperopt(sys.argv[1], params)
+        if sys.argv[2] == 'hyperopt':
+            hyperopt('lr_sk', params)
+        else:
+            kwargs = parse(params, sys.argv[2:])
+            model = mtc.BRFitter(mtc.SKFitter(sklearn.linear_model.LogisticRegression(**kwargs)))
+    elif sys.argv[1] == 'rf':
+        params = [
+            {'type': 'i', 'name': 'n_estimators', 'trans': lambda i: 2**(i + 6)},
+            {'type': 'c', 'name': 'criterion', 'categories': ('gini', 'entropy')},
+            {'type': 'f', 'name': 'max_features', 'trans': lambda i: i * 0.02 + 0.1},
+        ]
+        if sys.argv[2] == 'hyperopt':
+            hyperopt(sys.argv[1], params)
+        else:
+            kwargs = parse(params, sys.argv[2:])
+            model = mtc.BRFitter(mtc.SKFitter(sklearn.ensemble.RandomForestClassifier(random_state=42, **kwargs)))
+    elif sys.argv[1] == 'gbm':
+        params = [
+            {'type': 'f', 'name': 'learning_rate', 'trans': lambda i: 2**(i - 3)},
+            {'type': 'i', 'name': 'n_estimators', 'trans': lambda i: 2**(i + 8)},
+            {'type': 'f', 'name': 'subsample', 'trans': lambda i: 0.1 * i + 0.5},
+            {'type': 'i', 'name': 'max_depth', 'trans': lambda i: i + 3},
+            {'type': 'f', 'name': 'max_features', 'trans': lambda i: i * 0.02 + 0.1},
+        ]
+        if sys.argv[2] == 'hyperopt':
+            hyperopt(sys.argv[1], params)
+        else:
+            kwargs = parse(params, sys.argv[2:])
+            model = mtc.BRFitter(mtc.SKFitter(sklearn.ensemble.GradientBoostingClassifier(**kwargs)))
+    elif sys.argv[1] == 'svm':
+        params = [
+            {'type': 'f', 'name': 'gamma', 'trans': lambda i: 2**(3 * i - 5)},
+            {'type': 'f', 'name': 'C', 'trans': lambda i: 2**(3 * i + 5)},
+        ]
+        if sys.argv[2] == 'hyperopt':
+            hyperopt(sys.argv[1], params)
+        else:
+            kwargs = parse(params, sys.argv[2:])
+            model = mtc.BRFitter(mtc.SKFitter(sklearn.svm.SVC(probability=True, **kwargs)))
 
-    else:
-        n_estimators = int(sys.argv[2])
-        criterion = sys.argv[3]
-        max_features = float(sys.argv[4])
 
-        model = mtc.BRsklearnLearner(sklearn.ensemble.RandomForestClassifier(
-            n_estimators=n_estimators,
-            criterion=criterion,
-            max_features=max_features,
-        ))
-elif sys.argv[1] == 'gbm':
-    model = mtc.BRsklearnLearner(sklearn.ensemble.GradientBoostingClassifier())
-elif sys.argv[1] == 'svm':
-    pass
-cv = Orange.evaluation.CrossValidation(data, model)
-print(1, mtc.mt_average_score(Orange.evaluation.AUC_binary, data, cv.KFold(5)[1]))
+    np.random.seed(42)
+    print(Orange.evaluation.cross_validation(model, data, mtc.auc_mt, Orange.evaluation.TTVSplit(n_repeats=1)))
 
 
-#import ml_metrics
-##model = sklearn.linear_model.LogisticRegression()
-#model = sklearn.ensemble.RandomForestClassifier(n_estimators=100, criterion='gini', max_features=0.1)
-##model = sklearn.ensemble.GradientBoostingClassifier()
-#P = np.zeros_like(Y)
-#fold = 0
-#for tr, te in sklearn.cross_validation.KFold(X.shape[0], 5):
-#    for j in range(Y.shape[1]):
-#        y = Y[:,j]
-#        model.fit(X[tr], y[tr])
-#        P[te,j] = model.predict_proba(X[te])[:,1]
-#    fold += 1
-#
-#scores = []
-#for j in range(Y.shape[1]):
-#    scores.append(ml_metrics.auc(Y[:,j], P[:,j]))
-#print(np.mean(scores))
+elif 0:
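+    # branch 2 (disabled): per-label scikit-learn baseline on a fixed train/test/validation split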
+    print(2)
+    np.random.seed(42)
+
+    import ml_metrics
+    import sklearn
+    model = sklearn.linear_model.LogisticRegression()
+    #model = sklearn.ensemble.RandomForestClassifier(n_estimators=32, criterion='gini', max_features=0.1, random_state=42)
+    #model = sklearn.ensemble.GradientBoostingClassifier()
+    scores_te, scores_va = [], []
+    p_tr = 0.6
+    p_te = 0.2
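+    # 60/20/20 train/test/validation split by random permutation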
+    for i in range(1):
+        inds = np.random.permutation(Y.shape[0])
+        n_tr = int(p_tr * Y.shape[0])
+        n_te = int(p_te * Y.shape[0])
+        tr = inds[:n_tr]
+        te = inds[n_tr:n_tr + n_te]
+        va = inds[n_tr + n_te:]
+        print(te)
+        print(Y.shape)
+        for j in range(Y.shape[1]):
+            y = Y[:,j]
+            model.fit(X[tr], y[tr])
+
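+            # score only labels with both classes present in the fold; AUC is undefined otherwise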
+            if np.unique(y[te]).size == 2:
+                p = model.predict_proba(X[te])[:,1]
+                scores_te.append(ml_metrics.auc(y[te], p))
+
+            if np.unique(y[va]).size == 2:
+                p = model.predict_proba(X[va])[:,1]
+                scores_va.append(ml_metrics.auc(y[va], p))
+    print(np.mean(scores_te), np.mean(scores_va))
+
+else:
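+    # branch 3 (active): the same split protocol, run through Orange Tables and mtc.SKFitter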
+    print(3)
+    np.random.seed(42)
+
+    import ml_metrics
+    import sklearn
+    import copy
+
+    model = mtc.SKFitter(sklearn.linear_model.LogisticRegression())
+
+    scores_te, scores_va = [], []
+    p_tr = 0.6
+    p_te = 0.2
+
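+    # build the full multi-target domain once; per-label domains below reuse its variables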
+    mt_domain = Orange.data.Domain.from_numpy(X, Y)
+    for i in range(1):
+        inds = np.random.permutation(Y.shape[0])
+        n_tr = int(p_tr * Y.shape[0])
+        n_te = int(p_te * Y.shape[0])
+        tr = inds[:n_tr]
+        te = inds[n_tr:n_tr + n_te]
+        va = inds[n_tr + n_te:]
+        print(te)
+
+        for j in range(Y.shape[1]):
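+            # single-target view of label j; passing an explicit domain keeps Table construction fast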
+            domain = Orange.data.Domain(mt_domain.attributes, mt_domain.class_vars[j])
+            data = Orange.data.Table(domain, X, Y[:,j][:,None])
+            cls = model(data[tr])
+
+            if np.unique(data.Y[te]).size == 2:
+                p = cls(data[te], cls.Probs)[:,1]
+                scores_te.append(ml_metrics.auc(data.Y[te].flat, p))
+
+    print(np.mean(scores_te))
         y_score = np.append(y_score, obj['va'])
 
 
+if 'hyperopt' in sys.argv:
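+    # open the on-disk evaluation cache only for hyperopt runs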
+    eval_cache = shelve.open('pkl/eval_{}.pkl'.format('_'.join(sys.argv[1:])))
+    method_args = sys.argv[1:3]
 
-eval_cache = shelve.open('pkl/eval_{}.pkl'.format('_'.join(sys.argv[1:])))
-method_args = sys.argv[1:3]
 def eval(method, params):
     params_key = repr(params)
     if params_key not in eval_cache:
 
 import numpy as np
 
+import bottleneck
+import sys
+
 def mt_average_score(metric):
     def f(Y, Y_hat):
         scores = []
         for j in range(Y.shape[1]):
-            d = Orange.data.Table(data.X, data.Y[:,j][:,None])
             scores.append(metric(Y[:,j], (Y_hat[0][:,j], Y_hat[1][:,j])))
-        return np.mean(scores)
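+        # nanmean skips labels whose score came out undefined (NaN) on this split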
+        return bottleneck.nanmean(scores)
     return f
 
 ca_mt = mt_average_score(Orange.evaluation.ca)
 
     def fit(self, X, Y, W):
         models = []
+
         for j in range(Y.shape[1]):
             m = copy.deepcopy(self.learner)
-            data = Orange.data.Table(X, Y[:,j][:,None])
+
+            # Optimization -- building a Table from numpy is slow if not given a domain
+            domain = Orange.data.Domain(self.domain.attributes, self.domain.class_vars[j])
+            data = Orange.data.Table(domain, X, Y[:,j][:,None])
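+            # without a domain, the Table constructor would re-infer variable types on every call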
+
             models.append(m(data))
         return BRModel(models)
 
 
 if __name__ == '__main__':
     import sklearn.linear_model
+    import sklearn.svm
     import sklearn.cross_validation
 
     #data = Orange.data.Table('iris')
     #print(Orange.evaluation.cross_validation(model, data, Orange.evaluation.CA, Orange.evaluation.KFold()))
 
     data = Orange.data.Table('emotions')
-    model = BRFitter(SKFitter(sklearn.linear_model.LogisticRegression()))
+    #model = BRFitter(SKFitter(sklearn.linear_model.LogisticRegression()))
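+    # probability=True so the wrapped SVC exposes predict_proba for AUC scoring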
+    model = BRFitter(SKFitter(sklearn.svm.SVC(probability=True)))
     print(Orange.evaluation.cross_validation(model, data, auc_mt, Orange.evaluation.TTVSplit()))
 
-
     model = sklearn.linear_model.LogisticRegression()
     X = data.X
     Y = data.Y