Commits

Jure Žbontar committed 22b3500

test on synthetic dataset

  • Parent commits e5455b4
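
The commit replaces the single-fitter benchmark with a named list of fitter families and evaluates them on synthetic multi-target regression data in the style of the Curds & Whey simulations: correlated predictors X with V[i, j] = r**|i - j|, a coefficient matrix A = C.dot(G.T) built from random bump functions, and correlated noise scaled to a target signal-to-noise ratio. A minimal, self-contained sketch of one such data draw follows; the formulas and the rho/snr grids come from the diff below, while r and the p, q sizes here are only illustrative.

    # Sketch of one synthetic draw, following the generation code in the diff
    # below. r, p and q are illustrative; the script varies them.
    import numpy as np

    np.random.seed(42)
    p, q = 50, 5            # predictor / target counts (illustrative)
    N, N_te = 100, 1000     # training / test rows
    r, rho, snr = 0.7, 0.83, 3.0

    # AR(1)-style predictor covariance V[i, j] = r**|i - j|
    rows, cols = np.indices((p, p))
    V = r ** np.abs(rows - cols)
    X = np.random.multivariate_normal(np.zeros(p), V, N + N_te)

    # Coefficient matrix A = C.dot(G.T) from 10 random bump functions over the
    # predictor index, with target correlation Gamma[i, j] = rho**|i - j|
    j = np.random.uniform(1, 50, size=10)
    l = np.random.uniform(1, 6, size=10)
    G = np.maximum(0, l - np.abs(np.arange(p)[:, None] - j)) ** 2
    G = G / np.sum(G, axis=0)
    rows, cols = np.indices((q, q))
    Gamma = rho ** np.abs(rows - cols)
    C = np.random.multivariate_normal(np.zeros(q), Gamma, 10).T
    A = C.dot(G.T)
    A /= np.std(X.dot(A.T), axis=0)[:, None]   # unit-variance signal per target

    # Noise covariance scaled so the average signal-to-noise ratio is snr
    cov = np.eye(q)
    cov *= np.mean(1. / np.diag(cov)) / snr
    eps = np.random.multivariate_normal(np.zeros(q), cov, N + N_te)
    Y = X.dot(A.T) + eps    # first N rows train a fitter, the rest score it
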


Files changed (1)

File breiman2002.py

 import mtc
 import Orange
 import sklearn.linear_model
+import sklearn.cross_validation
 
 import numpy as np
 import sys
 
+
 fitters = [
-    mtc.SKFitter(sklearn.linear_model.LinearRegression(), supports_multiclass=True),
-#fitter = mtc.BRFitter(mtc.SKFitter(sklearn.linear_model.Ridge()))
-#fitter = mtc.PLSFitter(n_components=4)
-    mtc.CurdsWheyFitter(type='gcv'),
-    mtc.MTStackFitter(mtc.BRFitter(mtc.SKFitter(sklearn.linear_model.Ridge())), mtc.BRFitter(mtc.SKFitter(sklearn.linear_model.Ridge()))),
-#fitter = mtc.ReducedRankFitter(rank=5)
-#fitter = mtc.FICYREGFitter()
+    ('Linear Regression', [mtc.SKFitter(sklearn.linear_model.LinearRegression(), supports_multiclass=True)]),
+    ('BR Ridge Regression', [mtc.BRFitter(mtc.SKFitter(sklearn.linear_model.Ridge(alpha))) for alpha in (0.1, 0.3, 1, 3, 10)]),
+    ('PLS', [mtc.PLSFitter(n_components=n_components) for n_components in (3, 5, 7, 10, 15, 20)]),
+    ('Curds & Whey (gcv)', [mtc.CurdsWheyFitter(type='gcv')]),
+    ('Curds & Whey (population)', [mtc.CurdsWheyFitter(type='population')]),
+    ('BR Ridge Regression Stack', [mtc.MTStackFitter(mtc.BRFitter(mtc.SKFitter(sklearn.linear_model.Ridge(alpha))), mtc.BRFitter(mtc.SKFitter(sklearn.linear_model.LinearRegression()))) for alpha in (0.1, 0.3, 1, 3, 10)]),
+    ('Reduced Rank Regression', [mtc.ReducedRankFitter(rank=rank) for rank in [3, 5, 7, 10, 15, 20]]),
+    ('FICYREG', [mtc.FICYREGFitter()]),
 ]
 
 
 np.random.seed(42)
 
-p = 50
+"""
+p = 70
 N_te = 1000
 scores = []
 for _ in range(10):
     N = 100
-    q = 20
+    q = 2
     X = np.random.normal(size=(N + N_te, p))
     eps = np.random.normal(size=(N + N_te, q))
 
-    #j = np.random.uniform(1, 50, size=10)
-    #l = np.random.uniform(1, 6, size=10)
-    #
-    #G = np.maximum(0, (l - np.abs(np.arange(p)[:,None] - j)))**2
-    #G = G / np.sum(G, axis=0)
-    #
-    #C = np.random.normal(size=(q, 10))
-    #A = C.dot(G.T)
-
     A = np.random.normal(size=(q, p))
-
     A /= np.std(X.dot(A.T), axis=0)[:,None]
 
     y = X.dot(A.T)
         row.append(np.mean((model(data[N:]) - data[N:].Y)**2))
     scores.append(row)
 print(np.mean(scores, axis=0))
+"""
 
-
-"""
 p = 50
 N_te = 1000
 scores = []
             V = r**np.abs(rows - cols)
             X = np.random.multivariate_normal(np.zeros(p), V, N + N_te)
 
-            for s in (1, np.sqrt(1./3)):
-                eps = np.random.normal(0, s, size=(N + N_te, q))
+            for cov_structure in (0, 1):
+                for snr in (1., 3.):
+                    if cov_structure == 0:
+                        cov = np.eye(q)
+                    else:
+                        cov = np.diag(np.arange(1, q + 1).astype(float)**2)
 
-                j = np.random.uniform(1, 50, size=10)
-                l = np.random.uniform(1, 6, size=10)
+                    cov *= np.mean(1. / np.diag(cov)) / snr
+                    eps = np.random.multivariate_normal(np.zeros(q), cov, size=(N + N_te))
 
-                G = np.maximum(0, (l - np.abs(np.arange(p)[:,None] - j)))**2
-                G = G / np.sum(G, axis=0)
+                    # SNR
+                    #print(snr, np.mean(1. / np.var(eps, axis=0)))
 
-                for rho in (0.9, 0.8, 0.7, 0.6, 0.5):
-                    rows, cols = np.indices((q, q))
-                    Gamma = rho**np.abs(rows - cols)
-                    C = np.random.multivariate_normal(np.zeros(q), Gamma, 10).T
-                    A = C.dot(G.T)
+                    j = np.random.uniform(1, 50, size=10)
+                    l = np.random.uniform(1, 6, size=10)
 
-                    A /= np.std(X.dot(A.T), axis=0)[:,None]
+                    G = np.maximum(0, (l - np.abs(np.arange(p)[:,None] - j)))**2
+                    G = G / np.sum(G, axis=0)
 
-                    y = X.dot(A.T)
-                    Y = y + eps
+                    for rho in (0.83, 0.55, 0):
+                        rows, cols = np.indices((q, q))
+                        Gamma = rho**np.abs(rows - cols)
+                        C = np.random.multivariate_normal(np.zeros(q), Gamma, 10).T
+                        A = C.dot(G.T)
 
-                    data = Orange.data.Table(X, Y)
+                        A /= np.std(X.dot(A.T), axis=0)[:,None]
 
+                        y = X.dot(A.T)
+                        Y = y + eps
 
-                    model = fitter(data[:N])
-                    scores.append(np.mean((model(data[N:]) - data[N:].Y)**2))
-print(np.mean(scores))
+                        data = Orange.data.Table(X, Y)
+                        row = []
+                        for _, fitter_list in fitters:
+                            if len(fitter_list) == 1:
+                                fitter = fitter_list[0]
+                            else:
+                                xs = []
+                                for fitter in fitter_list:
+                                    ys = []
+                                    try:
+                                        for tr, te in sklearn.cross_validation.KFold(N, 3):
+                                            model = fitter(data[tr])
+                                            ys.append(np.mean((model(data[te]) - data[te].Y)**2))
+                                        xs.append((np.mean(ys), fitter))
+                                    except np.linalg.linalg.LinAlgError:
+                                        pass
+                                fitter = min(xs, key=lambda x: x[0])[1]
 
-"""
+                            model = fitter(data[:N])
+                            row.append(np.mean((model(data[N:]) - data[N:].Y)**2))
+                        scores.append(row)
+
+scores = np.mean(scores, axis=0)
+for i in np.argsort(scores):
+    print(scores[i], fitters[i][0])
+
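
For fitter families with several candidates (the ridge alphas, PLS component counts, reduced ranks), the new loop picks the candidate with the lowest 3-fold cross-validation error on the N training rows before scoring it on the held-out rows. A standalone sketch of that selection pattern, using plain scikit-learn estimators in place of the mtc/Orange fitters (the data and estimator names here are illustrative, and the current sklearn.model_selection.KFold stands in for the sklearn.cross_validation API used in the script):

    # Pick the best candidate by 3-fold CV on the training rows, then report
    # its test MSE on the held-out rows, as in the evaluation loop above.
    import numpy as np
    from sklearn.linear_model import Ridge
    from sklearn.model_selection import KFold

    def select_and_score(X, Y, N, candidates, n_folds=3):
        """Choose the candidate with the lowest CV MSE on X[:N], score on X[N:]."""
        best = None
        for make_model in candidates:
            fold_mse = []
            for tr, te in KFold(n_splits=n_folds).split(X[:N]):
                model = make_model().fit(X[tr], Y[tr])
                fold_mse.append(np.mean((model.predict(X[te]) - Y[te]) ** 2))
            score = np.mean(fold_mse)
            if best is None or score < best[0]:
                best = (score, make_model)
        model = best[1]().fit(X[:N], Y[:N])
        return np.mean((model.predict(X[N:]) - Y[N:]) ** 2)

    # Toy data: 100 training rows, 1000 test rows, ridge candidates over the
    # same alpha grid as the commit uses for 'BR Ridge Regression'.
    X = np.random.normal(size=(1100, 50))
    Y = X[:, :3] + 0.1 * np.random.normal(size=(1100, 3))
    candidates = [lambda a=a: Ridge(alpha=a) for a in (0.1, 0.3, 1, 3, 10)]
    print(select_and_score(X, Y, 100, candidates))

The script wraps the same idea in a try/except around LinAlgError so that candidates which fail on a given fold are simply skipped rather than aborting the run.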