Commits

Jure Žbontar committed da601f9

kaggle

  • Parent commits db9a835

Files changed (2)

File predavanja/main_19812.txt

+import sys
+import pickle
+import multiprocessing
+import tempfile
+import os
+import shutil
+from subprocess import check_call, PIPE
+import warnings
+
+import numpy as np
+import scipy.sparse as sp
+from scipy.optimize import fmin_l_bfgs_b
+
+import EMC_IO
+
+np.seterr(divide='ignore')
+
+### util ###
+def append_ones(X):
+    if sp.issparse(X):
+        return sp.hstack((np.ones((X.shape[0], 1)), X)).tocsr()
+    else:
+        return np.hstack((np.ones((X.shape[0], 1)), X))
+
+### validate ###
+def logloss(y, P, eps=1e-15):
+    P = P / np.sum(P, axis=1)[:,None]
+    p = P[np.arange(y.size), y]
+    p = np.clip(p, eps, 1 - eps)
+    return -np.mean(np.log(p))
+
+def cv(X, y, fun, submit=False, folds=4):
+    # Parallel k-fold cross-validation: each fold is predicted in a worker
+    # process. With submit=True the model is also fit on all labeled rows,
+    # a submission file is written and the stacked CV + test predictions
+    # are pickled for later stacking.
+    np.random.seed(42)
+    ind = np.random.randint(0, folds, y.size)
+
+    res_cv = []
+    pool = multiprocessing.Pool()
+    for fold in range(folds):
+        tr_ind = np.nonzero(ind != fold)[0]
+        te_ind = np.nonzero(ind == fold)[0]
+
+        r = pool.apply_async(fun, (X[tr_ind], y[tr_ind], X[te_ind]))
+        res_cv.append((r, te_ind))
+
+    if submit:
+        P_submit = fun(X[:y.size], y, X[y.size:])
+        np.savetxt('submission.csv.gz', P_submit, delimiter=',')
+
+    P_cv = np.zeros((y.size, np.max(y) + 1))
+    score = []
+    for r, te_ind in res_cv:
+        P_cv[te_ind] = r.get()
+        score.append(logloss(y[te_ind], P_cv[te_ind]))
+    pool.close()
+    pool.join()
+
+    if submit:
+        fname = 'res/jure_%s.pkl' % '_'.join(sys.argv[1:])
+        pickle.dump(np.vstack((P_cv, P_submit)), open(fname, 'wb'), pickle.HIGHEST_PROTOCOL)
+
+    return P_cv, np.mean(score)
+
+
+### tfidf ### 
+from sklearn.preprocessing import normalize
+
+def tfidf(X, norm='l2', smooth_idf=1, sublinear_tf=True, add_idf=1., remove=1):
+    # tf-idf weighting using base-10 logarithms; columns whose document
+    # frequency is <= `remove` are dropped before weighting.
+    n, m = X.shape
+    df = np.bincount(X.nonzero()[1], minlength=m)
+    if remove:
+        informative = np.nonzero(df > remove)[0]
+        X = X[:, informative]
+        df = df[informative]
+        m = len(informative)
+    df += smooth_idf
+    idf = np.log10(float(n + smooth_idf) / df) + add_idf
+    if sublinear_tf:
+        X = X.copy()
+        np.log10(X.data, X.data)
+        X.data += 1
+    X = X * sp.spdiags(idf, 0, m, m)
+    if norm:
+        normalize(X, norm=norm, copy=False)
+    return X
+
+
+### data ###
+if sys.argv[1] == 'data':
+    dir = '/apps/poletna_sola/skupno/data/'
+
+    X_tr = EMC_IO.EMC_ReadData(dir + 'train_data.csv')
+    X_te = EMC_IO.EMC_ReadData(dir + 'test_data.csv')
+    y = EMC_IO.EMC_ReadLabels(dir + 'train_labels.csv')
+
+    X = sp.vstack((X_tr, X_te)).tocsr()
+
+    pickle.dump((X, y), open('data/data.pkl', 'wb'), pickle.HIGHEST_PROTOCOL)
+
+
+### pca ###
+if sys.argv[1] == 'pca':
+    n_components = int(sys.argv[2])
+
+    X, y = pickle.load(open('data/data.pkl', 'rb'))
+    X = tfidf(X, remove=1)
+    X = X.tocsc()
+
+    dir = tempfile.mkdtemp()
+
+    f = open(os.path.join(dir, 'data.st'), 'w')
+    f.write('%d %d %d\n' % (X.shape[0], X.shape[1], X.data.size))
+    for j in range(X.shape[1]):
+        l, r = X.indptr[j], X.indptr[j + 1]
+        f.write('%d\n' % (r - l))
+        for i, x in zip(X.indices[l:r], X.data[l:r]):
+            f.write('%d %.18e\n' % (i, x))
+    f.close()
+
+    check_call(('svd -d %d -o %s %s' % (n_components, os.path.join(dir, 'm'), os.path.join(dir, 'data.st'))).split())
+
+    V = np.loadtxt(os.path.join(dir, 'm-Vt'), skiprows=1).T
+    pickle.dump(X.dot(V), open('res/pca%d.pkl' % n_components, 'wb'), pickle.HIGHEST_PROTOCOL)
+
+    shutil.rmtree(dir)
+
+
+# "submit": also write submission/stacking files; drop it so the arguments line up
+submit = sys.argv[1] == 'submit'
+if submit:
+    sys.argv.pop(1)
+
+
+### logistic regression ###
+def sigmoid(x):
+    return 1.0 / (1.0 + np.exp(-x))
+
+def cost_grad(theta, X, y, lambda_):
+    m = X.shape[0]
+    sx = sigmoid(X.dot(theta))
+
+    j = -np.log(np.where(y, sx, 1 - sx)).sum()
+    j += lambda_ * theta.dot(theta) / 2.0
+    j /= m
+
+    grad = X.T.dot(sx - y)
+    grad += lambda_ * theta
+    grad /= m
+
+    return j, grad
+
+def logistic_regression(X, y, X_test, lambda_):
+    theta = np.zeros(X.shape[1])
+    theta, j, ret = fmin_l_bfgs_b(cost_grad, theta, args=(X, y, lambda_))
+    if ret['warnflag'] != 0:
+        warnings.warn('L-BFGS failed to converge')
+    return sigmoid(X_test.dot(theta))
+
+
+def lr_all(X, y, X_test):
+    lambda_ = float(sys.argv[2])
+
+    cols = []
+    for i in range(97):
+        cols.append(logistic_regression(X, y == i, X_test, lambda_))
+    return np.column_stack(cols)
+
+if sys.argv[1] == 'lr':
+    X, y = pickle.load(open('data/data.pkl', 'rb'))
+    X = tfidf(X, remove=10)
+    X = sp.hstack((np.ones((X.shape[0], 1)), X)).tocsr()
+
+    print cv(X, y, lr_all, submit)[1]
+
+
+### knn ###
+def knn(X, y, X_test):
+    # Weighted k-nearest-neighbour vote; rows are l2-normalized by tfidf,
+    # so the sparse dot product below is the cosine similarity.
+    k = int(sys.argv[2])
+    num_classes = 97
+
+    # process X_test in blocks so the full similarity matrix X_test.dot(X.T)
+    # never has to be held in memory at once
+    block = 500
+    Y, D = [], []
+    folds = (X_test.shape[0] - 1) / block + 1
+    for fold in range(folds):
+        l, r = fold * block, min(X_test.shape[0], (fold + 1) * block)
+        s = X_test[l:r].dot(X.T)
+        for i in range(s.shape[0]):
+            l, r = s.indptr[i], s.indptr[i + 1]
+            ind = np.argsort(s.data[l:r])[-k:]
+            Y.append(y[s.indices[l:r][ind]])
+            D.append(s.data[l:r][ind])
+
+    P = []
+    for i in range(X_test.shape[0]):
+        p = [1e-3] * num_classes
+        for yy, dd in zip(Y[i], D[i]):
+            p[yy] += dd
+        P.append(p)
+    P = np.array(P)
+
+    return P / np.sum(P, axis=1, dtype=np.float64)[:,None]
+    
+if sys.argv[1] == 'knn':
+    X, y = pickle.load(open('data/data.pkl', 'rb'))
+
+    X = tfidf(X, remove=10)
+    print cv(X, y, knn, submit)[1]
+
+
+
+### softmax ###
+class SoftmaxRegression:
+    def __init__(self, lambda_=1, **fmin_args):
+        self.lambda_ = lambda_
+        self.fmin_args = fmin_args
+
+    def cost_grad(self, Theta_flat):
+        m, n = self.X.shape
+
+        Theta = Theta_flat.reshape((self.num_classes, n))
+
+        P = np.exp(self.X.dot(Theta.T))
+        P /= np.sum(P, axis=1)[:,None]
+
+        j = -np.sum(np.log(P[np.arange(self.y.size), self.y]))
+        j += self.lambda_ * Theta_flat.dot(Theta_flat) / 2.0
+        j /= m
+
+        grad = self.X.T.dot(P - self.Y).T
+        grad += self.lambda_ * Theta
+        grad /= m
+
+        return j, grad.ravel()
+
+    def fit(self, X, y):
+        m, n = X.shape
+        self.num_classes = y.max() + 1
+
+        self.X, self.y = X, y
+        self.Y = np.eye(self.num_classes)[self.y]
+
+        theta = np.zeros(self.num_classes * n)
+        theta, j, ret = fmin_l_bfgs_b(self.cost_grad, theta, **self.fmin_args)
+        if ret['warnflag'] != 0:
+            warnings.warn('L-BFGS failed to converge')
+        self.Theta = theta.reshape((self.num_classes, X.shape[1]))
+
+    def predict(self, X):
+        P = np.exp(X.dot(self.Theta.T))
+        P /= np.sum(P, axis=1)[:,None]
+        return P
+
+def softmax(X, y, X_test):
+    m = SoftmaxRegression(lambda_=lambda_)
+    m.fit(X, y)
+    return m.predict(X_test)
+
+if sys.argv[1] == 'softmax':
+    lambda_ = float(sys.argv[2])
+
+    X, y = pickle.load(open('data/data.pkl', 'rb'))
+
+#    X = append_ones(X)
+    X = tfidf(X, remove=10)
+    print cv(X, y, softmax, submit)[1]
+
+
+### neural network ###
+class NeuralNetwork:
+    def __init__(self, layers, lambda_=1, callback=None, **fmin_args):
+        self.layers = layers
+        self.lambda_ = lambda_
+        self.callback = callback
+        self.fmin_args = fmin_args
+
+    def unfold_params(self, params):
+        i0, i1, i2 = self.layers
+
+        i = (i0 + 1) * i1
+
+        Theta1 = params[:i].reshape((i1, i0 + 1))
+        Theta2 = params[i:].reshape((i2, i1 + 1))
+
+        return Theta1, Theta2
+
+    def cost_grad(self, params):
+        Theta1, Theta2 = self.unfold_params(params)
+
+        if self.callback:
+            self.Theta1 = Theta1
+            self.Theta2 = Theta2
+            self.callback(self)
+
+        # Feedforward Propagation
+        m, n = self.X.shape
+
+        a1 = self.X
+        z2 = a1.dot(Theta1.T)
+        a2 = np.column_stack((np.ones(m), sigmoid(z2)))
+        z3 = a2.dot(Theta2.T)
+        a3 = sigmoid(z3)
+
+        # Cost
+        J = np.sum(-self.y * np.log(a3) - (1 - self.y) * np.log(1 - a3)) / m
+
+        t1 = Theta1.copy()
+        t1[:, 0] = 0
+        t2 = Theta2.copy()
+        t2[:, 0] = 0
+
+        # regularization
+        reg = np.dot(t1.flat, t1.flat)
+        reg += np.dot(t2.flat, t2.flat)
+        J += float(self.lambda_) * reg / (2.0 * m)
+
+        # Grad
+        d3 = a3 - self.y
+        d2 = d3.dot(Theta2)[:, 1:] * sigmoid_gradient(z2)
+
+        D2 = a2.T.dot(d3).T / m
+        D1 = a1.T.dot(d2).T / m
+
+        # regularization
+        D2 += t2 * (float(self.lambda_) / m)
+        D1 += t1 * (float(self.lambda_) / m)
+
+        return J, np.hstack((D1.flat, D2.flat))
+
+    def fit(self, X, y):
+        i0, i1, i2 = self.layers
+
+        m, n = X.shape
+        n_params = i1 * (i0 + 1) + i2 * (i1 + 1)
+        eps = np.sqrt(6) / np.sqrt(i0 + i2)
+        initial_params = np.random.randn(n_params) * 2 * eps - eps
+
+        self.X = self.append_ones(X)
+        self.y = y
+
+        params, _, _ = fmin_l_bfgs_b(self.cost_grad, initial_params, **self.fmin_args)
+
+        self.Theta1, self.Theta2 = self.unfold_params(params)
+
+    def predict(self, X):
+        m, n = X.shape
+        
+        a2 = sigmoid(self.append_ones(X).dot(self.Theta1.T))
+        a3 = sigmoid(np.column_stack((np.ones(m), a2)).dot(self.Theta2.T))
+
+        return a3
+
+    def append_ones(self, X):
+        m, n = X.shape
+        if sp.issparse(X):
+            return sp.hstack((np.ones((m, 1)), X)).tocsr()
+        else:
+            return np.column_stack((np.ones(m), X))
+        
+
+def sigmoid(x):
+    return 1.0 / (1.0 + np.exp(-x))
+
+def sigmoid_gradient(x):
+    sx = sigmoid(x)
+    return sx * (1 - sx)
+
+def nn(X, y, X_test, y_test=None):
+    lambda_ = float(sys.argv[2])
+    hidden = int(sys.argv[3])
+    maxfun = int(sys.argv[4])
+
+    y = np.eye(97)[y]
+
+    def cb(m, cnt=[0]):
+        cnt[0] += 1
+        if cnt[0] % 10 == 0:
+            print cnt[0], logloss(y_test, m.predict(X_test))
+
+    callback = cb if y_test is not None else None
+
+    layers = [X.shape[1], hidden, 97]
+    np.random.seed(42)
+    m = NeuralNetwork(lambda_=lambda_, layers=layers, callback=callback, maxfun=maxfun)
+    m.fit(X, y)
+    return m.predict(X_test)
+
+### stacking ###
+if sys.argv[1] == 'stack':
+    from sklearn.preprocessing import *
+
+    res = [
+        'jure_lr_0.04', 
+        'jure_knn_17',
+        'jure_softmax_0.06',
+        'pca100',
+    ]
+    print res
+    X, y = pickle.load(open('data/data.pkl', 'rb'))
+
+    Xs = []
+    for r in res:
+        X = pickle.load(open('res/%s.pkl' % r, 'rb'))
+        Xs.append(X)
+    X = np.hstack(Xs)
+
+    print cv(X, y, nn, submit)[1]
+
+#    i = int(y.size * 0.7)
+#    nn(X[:i], y[:i], X[i:], y[i:])

File predavanja/ukazi

+0.19812 on Kaggle
+=================
+
+The commands needed to put together the solution that scores 0.19812 on Kaggle.
+
+[main.py](main_19812.txt)
+
+Data
+----
+
+Read the data and pickle it:
+
+	$ time python main.py data
+
+	real    0m24.388s
+	user    0m16.725s
+	sys     0m3.997s
+
+
+Logistic regression
+-------------------
+
+*Description*
+
+We covered logistic regression quite thoroughly in class. I changed
+`tfidf` to compute the base-10 logarithm instead of the natural
+logarithm; it turns out this helps. Logistic regression is only defined
+for binary classes, so the function `lr_all` splits the problem into 97
+binary ones, just as we did in class (see the sketch below).
+
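+A minimal sketch of this one-vs-rest decomposition. `fit_binary` is a
+hypothetical stand-in for `logistic_regression` from main_19812.txt, and the
+final row normalization is what `logloss` does before scoring:
+
+    import numpy as np
+
+    def one_vs_rest(X, y, X_test, fit_binary, num_classes=97):
+        cols = []
+        for c in range(num_classes):
+            # train "class c vs. the rest", keep P(y = c | x) for each test row
+            cols.append(fit_binary(X, (y == c).astype(int), X_test))
+        P = np.column_stack(cols)           # shape (n_test, num_classes)
+        return P / P.sum(axis=1)[:, None]   # rows sum to 1
+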
+*Parameters*
+
+* `lambda_ = 0.04`
+
+*Data*
+
+* `tfidf(remove=10)` on the original data.
+* `append_ones`
+
+*Command*
+
+	$ time python main.py submit lr 0.04
+	0.29813846441952563
+
+	real    23m56.028s
+	user    105m22.864s
+	sys     0m36.734s
+
+
+k-nearest neighbor
+------------------
+
+*Description*
+
+The distance measure is [cosine
+similarity](http://en.wikipedia.org/wiki/Cosine_similarity). I experimented
+a bit with the knn implementation and ended up with the following
+algorithm: say the three nearest (k=3) documents belong to classes 1, 3
+and 1, and their similarities to the test document are 0.9, 0.5 and 0.4.
+The probabilities my algorithm spits out are `[0, 0.9 + 0.4, 0, 0.5, 0, 0, ..., 0]
++ 0.001`. The `eps = 0.001` at the end is a constant that pulls all the
+probabilities towards `1 / 97`; the larger `eps` is, the stronger this correction.
+The row of course has to be normalized so that the probabilities sum to 1.
+
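+Spelled out as a short sketch (the neighbour classes and similarities are
+the made-up numbers from the example above):
+
+    import numpy as np
+
+    eps = 0.001
+    neighbour_classes = np.array([1, 3, 1])     # classes of the 3 nearest docs
+    neighbour_sims = np.array([0.9, 0.5, 0.4])  # their cosine similarities
+
+    p = np.zeros(97) + eps                      # pull everything towards 1/97
+    for c, s in zip(neighbour_classes, neighbour_sims):
+        p[c] += s
+    p /= p.sum()                                # probabilities now sum to 1
+    # p[1] is about 0.686, p[3] about 0.264, every other class about 0.0005
+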
+Technically, the implementation looks fairly involved. The similarities of
+all test examples to all training examples could be obtained simply with
+`X_test.dot(X.T)`, but that matrix is too large, so I had to use a few
+tricks. Since for a given test example we do not actually need to keep the
+similarities to all training examples, only to the `k` nearest, we can first
+compute the similarities of the first 500 test examples to all training
+examples, keep only the `k` best for each of them, discard the rest, and
+repeat the procedure on the next 500 test examples (see the sketch below).
+
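+A condensed sketch of that blocking trick, assuming `X` and `X_test` are
+l2-normalized CSR matrices (a dense argsort per row is used here for
+brevity; main_19812.txt works on the sparse rows directly):
+
+    import numpy as np
+
+    def topk_neighbours(X, y, X_test, k, block=500):
+        classes, sims = [], []
+        for start in range(0, X_test.shape[0], block):
+            S = X_test[start:start + block].dot(X.T).toarray()  # cosine similarities
+            for row in S:
+                ind = np.argsort(row)[-k:]      # keep only the k most similar
+                classes.append(y[ind])
+                sims.append(row[ind])
+        return classes, sims
+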
+*Parameters*
+
+* `k = 17`
+* `eps = 0.001`
+
+*Data*
+
+* `tfidf(remove=10)` on the original data.
+
+*Command*
+
+	$ time python main.py submit knn 17
+	0.3905618941
+
+	real    40m21.891s
+	user    149m4.511s
+	sys     11m37.898s
+
+
+
+Softmax regression
+------------------
+
+*Description*
+
+An extension of logistic regression to more than two classes. [Description
+of the method](http://ufldl.stanford.edu/wiki/index.php/Softmax_Regression).
+
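+A minimal sketch of the prediction step, mirroring `SoftmaxRegression.predict`
+in main_19812.txt (subtracting the row maximum is an addition here, for
+numerical stability; the original code exponentiates the scores directly):
+
+    import numpy as np
+
+    def softmax_predict(X, Theta):
+        Z = X.dot(Theta.T)                      # one score per (example, class)
+        P = np.exp(Z - Z.max(axis=1)[:, None])  # stabilized exponentials
+        return P / P.sum(axis=1)[:, None]       # rows are P(class | example)
+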
+*Parameters*
+
+* `lambda_ = 0.05`
+
+*Data*
+
+* `tfidf(remove=10)` on the original data.
+
+Oops! I forgot to add the column of ones.
+
+*Command*
+
+	$ time python main.py submit softmax 0.05
+	0.305836638647
+
+	real    43m51.695s
+	user    180m1.189s
+	sys     3m17.444s
+
+
+PCA
+---
+
+*Description*
+
+The SVD was computed with the
+[svdlibc](http://tedlab.mit.edu/~dr/SVDLIBC/) program. I did not subtract
+the column means, nor did I scale the columns. PCA can also be computed with
+[scikit-learn](http://scikit-learn.org/dev/modules/generated/sklearn.decomposition.RandomizedPCA.html).
+I went with svdlibc because it has already given me principal components
+I was very happy with on several occasions.
+
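+For reference, a roughly equivalent projection can be sketched with
+scikit-learn's TruncatedSVD, which likewise performs no centering or scaling.
+This is an alternative to the svdlibc call in main_19812.txt, not what the
+submission actually used, and the components will not be bit-identical:
+
+    from sklearn.decomposition import TruncatedSVD
+
+    # X is the tfidf(remove=1) matrix; the result has shape (n, 100)
+    svd = TruncatedSVD(n_components=100, random_state=42)
+    X_pca = svd.fit_transform(X)
+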
+*Parameters*
+
+* `number of principal components = 100`
+
+*Data*
+
+* `tfidf(remove=1)` on the original data.
+
+*Command*
+
+	$ time python main.py pca 100
+	?????
+
+Stacking
+--------
+
+*Parameters*
+
+* `lambda_ = 0.2`
+* `number of perceptrons in the hidden layer = 50`
+* `maximum number of cost_grad calls = 310`
+
+*Data*
+
+* Logistic regression (97 features)
+* Softmax regression (97 features)
+* k-nearest neighbor (97 features)
+* PCA (100 features)
+
+*Description*
+
+I combined the models' predictions with a neural network, using
+[my own implementation](https://bitbucket.org/jzbontar/ml-class). I followed
+the algorithm Andrew presented in the videos we watched in Fara. The
+stacking data was obtained by concatenating the features (predictions) of
+the models listed above, so the training data had `97 + 97 +
+97 + 100 = 391` features. The neural network had 391 perceptrons in the
+input layer, 50 in the hidden layer and 97 in the output layer.
+
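+A condensed view of how the stacking features are assembled; this mirrors the
+`stack` block in main_19812.txt, where each `res/*.pkl` file holds the stacked
+CV and test-set predictions of one base model:
+
+    import pickle
+    import numpy as np
+
+    parts = ['jure_lr_0.04', 'jure_knn_17', 'jure_softmax_0.06', 'pca100']
+    Xs = [pickle.load(open('res/%s.pkl' % r, 'rb')) for r in parts]
+    X_stack = np.hstack(Xs)   # 97 + 97 + 97 + 100 = 391 features per document
+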
+*How to choose the parameters*
+
+With neural networks, three parameter values have to be chosen. Since
+training with cross-validation can take up to 8 hours and occupies 4 cores,
+choosing the parameters is a rather tedious process. Instead of 4-fold
+cross-validation I therefore prefer to split the data into two parts in a
+70:30 ratio: I train on the first part and test on the second. Since this
+uses only one core, I can run neural networks with other parameter settings
+on the remaining three cores.
+
+I wrote the neural network so that it periodically prints the number of
+`cost_grad` calls together with the log loss on the test data, which makes
+choosing that parameter much easier.
+
+See the last lines of the source file, which look like this:
+
+        # this is the ordinary 4-fold cross-validation
+        print cv(X, y, nn, submit)[1]
+
+        # this part of the code is used to choose the parameters
+        i = int(y.size * 0.7)
+        nn(X[:i], y[:i], X[i:], y[i:])
+
+*Command*
+
+	$ time python main.py submit stack 0.2 50 310
+	?????