Commits

Jure Žbontar committed 01016b3

Added LICENCE and README

Comments (0)

Files changed (5)

+Copyright (c) 2012, Jure Zbontar <jure.zbontar@fri.uni-lj.si>
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of the <organization> nor the
+      names of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+Team ULjubljana's solution to the EMC challenge
+===============================================
+
+http://www.kaggle.com/c/emc-data-science
+
+Software Used
+-------------
+
+* Python 2.7.3
+* scipy
+* numpy
+* SVDLIBC 1.4 http://tedlab.mit.edu/~dr/SVDLIBC/
+* Scikit-learn
+
+Usage
+-----
+
+Unpack the training and test sets into the directory `data_orig`. The
+directory `data_orig` should contain three files:
+
+	* `test_data.csv`
+	* `train_data.csv`
+	* `train_labels.csv`
+
+To train the models and generate the solution, run
+`run.sh`. The predictions will be stored in a file named
+`submission.csv.gz`.
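+
+For a quick sanity check of the output, the first few lines of the
+generated predictions can be inspected from Python. This is an
+illustrative sketch only (not part of the pipeline); it assumes
+`submission.csv.gz` is a plain gzipped CSV:
+
+	import gzip
+
+	# peek at the first three lines of the generated predictions
+	f = gzip.open('submission.csv.gz')
+	for i in range(3):
+		print f.readline().rstrip()
+	f.close()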
 
     return P_cv, np.mean(score)
 
-def sample_cv(X, y, fun):
-    i = int(y.size * 0.7)
-    P = fun(X[:i], y[:i], X[i:y.size])
-    return logloss(y[i:], P)
- 
+
 ### tfidf ### 
 from sklearn.preprocessing import normalize
 
 
     X, y = pickle.load(open('data/data.pkl'))
     X = tfidf(X, remove=1)
+    X = X.tocsc()
 
-    X = X.tocsc()
-    f = open('/tmp/data.st', 'w')
+    dir = tempfile.mkdtemp()
+
+    f = open(os.path.join(dir, 'data.st'), 'w')
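+    # SVDLIBC sparse-text (.st) input: a "rows cols nnz" header line,
+    # followed by the non-zero entries of each column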
     f.write('%d %d %d\n' % (X.shape[0], X.shape[1], X.data.size))
     for j in range(X.shape[1]):
         l, r = X.indptr[j], X.indptr[j + 1]
             f.write('%d %.18e\n' % (i, x))
     f.close()
 
-    check_call(('svd -d %d -o /tmp/m /tmp/data.st' % n_components).split())
+    check_call(('svd -d %d -o %s %s' % (n_components, os.path.join(dir, 'm'), os.path.join(dir, 'data.st'))).split())
 
-    V = np.loadtxt('/tmp/m-Vt', skiprows=1).T
-    pickle.dump((X.dot(V), y), open('res/pca%d.pkl' % n_components, 'w'), pickle.HIGHEST_PROTOCOL)
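+    # SVDLIBC writes the right singular vectors to <prefix>-Vt; the first
+    # row of that file is a dimension header, hence skiprows=1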
+    V = np.loadtxt(os.path.join(dir, 'm-Vt'), skiprows=1).T
+    pickle.dump(X.dot(V), open('res/pca%d.pkl' % n_components, 'w'), pickle.HIGHEST_PROTOCOL)
+
+    shutil.rmtree(dir)
 
 
 submit = sys.argv[1] == 'submit'
     X = tfidf(X, remove=10)
     X = sp.hstack((np.ones((X.shape[0], 1)), X)).tocsr()
 
-    cv(X, y, lr_all, submit)
-
-### logistic regression (scikit-learn) ###
-def lr_sk(X, y, X_test):
-    assert np.unique(y).size == 97
-    C = float(sys.argv[2])
-
-    m = LogisticRegression(C=C, dual=True)
-    m.fit(X, y)
-    return m.predict_proba(X_test)
-
-if sys.argv[1] == 'lr_sk':
-    from sklearn.linear_model import LogisticRegression
-
-    X, y = pickle.load(open('data/data.pkl', 'rb'))
-    X = tfidf(X, remove=10)
-    print cv(X, y, lr_sk, submit)
-
-
-### libsvm ###
-def dump_svm(X, y, f):
-    X.sort_indices()
-    for i in range(X.shape[0]):
-        l, r = X.indptr[i], X.indptr[i + 1]
-        row = ' '.join('%d:%.9e' % t for t in zip(X.indices[l:r] + 1, X.data[l:r]))
-        f.write('%d %s\n' % (y[i], row))
-
-def libsvm(X_tr, y_tr, X_te):
-    dir = tempfile.mkdtemp()
-    tr = os.path.join(dir, 'tr')
-    te = os.path.join(dir, 'te')
-    pred = os.path.join(dir, 'pred')
-    model = os.path.join(dir, 'model')
-
-    dump_svm(X_tr, y_tr, open(tr, 'w'))
-    dump_svm(X_te, np.zeros(X_te.shape[0]), open(te, 'w'))
-
-    # liblinear
-    if sys.argv[1] == 'liblinear':
-        check_call(('liblinear-train %s -q %s %s' % (' '.join(sys.argv[2:]), tr, model)).split())
-        check_call(('liblinear-predict -b 1 %s %s %s' % (te, model, pred)).split(), stdout=PIPE)
-
-    # libsvm
-    if sys.argv[1] == 'libsvm':
-        check_call(('svm-train %s -q %s %s' % (' '.join(sys.argv[2:]), tr, model)).split())
-        check_call(('svm-predict -b 1 %s %s %s' % (te, model, pred)).split(), stdout=PIPE)
-
-    ind = np.argsort(map(int, open(pred).readline().split()[1:])) + 1
-    P = np.loadtxt(pred, skiprows=1)[:,ind]
-    shutil.rmtree(dir)
-
-    return P
-
-if sys.argv[1] in {'liblinear', 'libsvm'}:
-    X, y = pickle.load(open('data/data.pkl', 'rb'))
-    X = tfidf(X, remove=10)
-    print cv(X, y, libsvm, submit)
-
-
-### naive bayes ###
-def nb(X, y, X_test):
-    clf = MultinomialNB(0.001)
-    clf.fit(X, y)
-    return clf.predict_proba(X_test)
-
-def discretize_sparse(X, n):
-    Xcsc = X.tocsc()
-    for i in range(X.shape[1]):
-        l, r = Xcsc.indptr[i], Xcsc.indptr[i + 1]
-        order = np.argsort(Xcsc.data[l:r])
-
-        q = float(order.size) / n
-        for j in range(n):
-            l1, r1 = int(q * j), int(q * (j + 1))
-            Xcsc.data[l:r][order[l1:r1]] = (j + 1)
-    return Xcsc.tocsr()
-
-def discretize(X, n):
-    q = float(X.shape[0]) / n
-
-    for i in range(X.shape[1]):
-        order = np.argsort(X[:,i])
-        for j in range(n):
-            l1, r1 = int(q * j), int(q * (j + 1))
-            X[order[l1:r1],i] = j
-    return X.astype(np.int)
-
-
-# def nb_lr_all(X, y, X_test):
-#     lambda_ = float(sys.argv[2])
-# 
-#     cols = []
-#     for i in range(97):
-#         X1 = np.column_stack((np.ones(X.shape[0]), X[:,i]))
-#         cols.append(logistic_regression(X1, y == i, X_test, lambda_))
-#     return np.column_stack(cols)
-
-
-if sys.argv[1] == 'nb':
-    from sklearn.naive_bayes import *
-
-    X, y = pickle.load(open('res/pca200.pkl', 'rb'))
-
-#    X = tfidf(X, remove=50)
-#    X.data[:] = 1
-    X = discretize(X, 2)
-    P, score = cv(X, y, nb, submit)
-    print 'nb score:', score
-
-#    P = np.column_stack((np.ones(P.shape[0]), P))
-#    P, score = cv(P, y, lr_all, submit)
-#    print 'nb+lr score:', score
-
-### nb_jure ###
-def nb_jure(X, y, X_test):
-    Xcsc = X.tocsc()
-
-    y_dist = np.bincount(y)
-    prior = y_dist / float(y_dist.sum())
-
-    cond = []
-    for j in range(Xcsc.shape[1]):
-        l, r = Xcsc.indptr[j], Xcsc.indptr[j + 1]
-
-        count = np.bincount(y[Xcsc.indices[l:r]], minlength=97)
-        P = np.array((y_dist - count, count), dtype=np.float64)
-        P += 1.0
-        P /= y_dist
-        cond.append(P)
-
-    return np.ones((X_test.shape[0], 97))
-
-if sys.argv[1] == 'nb_jure':
-    from sklearn.naive_bayes import *
-
-#    X, y = pickle.load(open('data/data.pkl', 'rb'))
-#    X = tfidf(X, remove=200)
-#    X.data[:] = 1
-#    pickle.dump((X, y), open('/tmp/foo', 'w'), 2)
-
-    X, y = pickle.load(open('/tmp/foo'))
-    
-   
-    print sample_cv(X, y, nb_jure)
-
-
-### softmax ###
-def softmax(X, y, X_test):
-    m = SoftmaxRegressionGD(lambda_=lambda_)
-    m.fit(X, y)
-    return m.predict(X_test)
-
-if sys.argv[1] == 'softmax':
-    from mlclass.logistic_regression import SoftmaxRegressionGD
-
-    lambda_ = float(sys.argv[2])
-
-    X, y = pickle.load(open('data/data.pkl'))
-
-#    X = pickle.load(open('res/jure_lr_0.04.pkl'))
-#    X = append_ones(X)
-
-    X = tfidf(X, remove=10)
-    print cv(X, y, softmax, submit)[1]
-
-#    print sample_cv(X, y, softmax)
+    print cv(X, y, lr_all, submit)[1]
 
 
 ### knn ###
     X = tfidf(X, remove=10)
     print cv(X, y, knn, submit)[1]
 
-### centroids ###
-def centroids(X, y, X_test):
-    num_classes = 97
 
-    cs = []
-    for cls in range(num_classes):
-        ind = np.nonzero(y == cls)[0]
-        cs.append(np.squeeze(np.asarray(X[ind].sum(axis=0))))
-    cs = normalize(np.array(cs))
 
-    P = X_test.dot(cs.T) + 1e-7
-    return P / np.sum(P, axis=1)[:,None]
+### softmax ###
+class SoftmaxRegression:
+    def __init__(self, lambda_=1, **fmin_args):
+        self.lambda_ = lambda_
+        self.fmin_args = fmin_args
 
-if sys.argv[1] == 'centroids':
-    X, y = pickle.load(open('data/data.pkl', 'rb'))
+    def cost_grad(self, Theta_flat):
+        m, n = self.X.shape
 
-    X = tfidf(X, remove=1)
-    print cv(X, y, centroids, submit)[1]
+        Theta = Theta_flat.reshape((self.num_classes, n))
 
-### knn_pca ###
-def knn_pca(X, y, X_test):
-    k = int(sys.argv[2])
-    
-    m = KNeighborsClassifier(k)
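+        # softmax probabilities: P[i, k] proportional to exp(x_i . theta_k),
+        # with each row normalized to sum to 1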
+        P = np.exp(self.X.dot(Theta.T))
+        P /= np.sum(P, axis=1)[:,None]
+
+        j = -np.sum(np.log(P[xrange(self.y.size),self.y]))
+        j += self.lambda_ * Theta_flat.dot(Theta_flat) / 2.0
+        j /= m
+
+        grad = self.X.T.dot(P - self.Y).T
+        grad += self.lambda_ * Theta
+        grad /= m
+
+        return j, grad.ravel()
+
+    def fit(self, X, y):
+        m, n = X.shape
+        self.num_classes = y.max() + 1
+
+        self.X, self.y = X, y
+        self.Y = np.eye(self.num_classes)[self.y]
+
+        theta = np.zeros(self.num_classes * n)
+        theta, j, ret = fmin_l_bfgs_b(self.cost_grad, theta, **self.fmin_args)
+        if ret['warnflag'] != 0:
+            warnings.warn('L-BFGS failed to converge')
+        self.Theta = theta.reshape((self.num_classes, X.shape[1]))
+
+    def predict(self, X):
+        P = np.exp(X.dot(self.Theta.T))
+        P /= np.sum(P, axis=1)[:,None]
+        return P
+
+def softmax(X, y, X_test):
+    m = SoftmaxRegression(lambda_=lambda_)
     m.fit(X, y)
-    return m.predict_proba(X_test)
+    return m.predict(X_test)
 
-if sys.argv[1] == 'knn_pca':
-    from sklearn.neighbors import KNeighborsClassifier
+if sys.argv[1] == 'softmax':
+    lambda_ = float(sys.argv[2])
 
-    X, y = pickle.load(open('data/pca.pkl'))
-    print cv(X, y, knn_pca, submit)[1]
-   
+    X, y = pickle.load(open('data/data.pkl'))
+
+#    X = append_ones(X)
+    X = tfidf(X, remove=10)
+    print cv(X, y, softmax, submit)[1]
+
+
 ### neural network ###
+class NeuralNetwork:
+    def __init__(self, layers, lambda_=1, callback=None, **fmin_args):
+        self.layers = layers
+        self.lambda_ = lambda_
+        self.callback = callback
+        self.fmin_args = fmin_args
+
+    def unfold_params(self, params):
+        i0, i1, i2 = self.layers
+
+        i = (i0 + 1) * i1
+
+        Theta1 = params[:i].reshape((i1, i0 + 1))
+        Theta2 = params[i:].reshape((i2, i1 + 1))
+
+        return Theta1, Theta2
+
+    def cost_grad(self, params):
+        Theta1, Theta2 = self.unfold_params(params)
+
+        if self.callback:
+            self.Theta1 = Theta1
+            self.Theta2 = Theta2
+            self.callback(self)
+
+        # Feedforward Propagation
+        m, n = self.X.shape
+
+        a1 = self.X
+        z2 = a1.dot(Theta1.T)
+        a2 = np.column_stack((np.ones(m), sigmoid(z2)))
+        z3 = a2.dot(Theta2.T)
+        a3 = sigmoid(z3)
+
+        # Cost
+        J = np.sum(-self.y * np.log(a3) - (1 - self.y) * np.log(1 - a3)) / m
+
+        t1 = Theta1.copy()
+        t1[:, 0] = 0
+        t2 = Theta2.copy()
+        t2[:, 0] = 0
+
+        # regularization
+        reg = np.dot(t1.flat, t1.flat)
+        reg += np.dot(t2.flat, t2.flat)
+        J += float(self.lambda_) * reg / (2.0 * m)
+
+        # Grad
+        d3 = a3 - self.y
+        d2 = d3.dot(Theta2)[:, 1:] * sigmoid_gradient(z2)
+
+        D2 = a2.T.dot(d3).T / m
+        D1 = a1.T.dot(d2).T / m
+
+        # regularization
+        D2 += t2 * (float(self.lambda_) / m)
+        D1 += t1 * (float(self.lambda_) / m)
+
+        return J, np.hstack((D1.flat, D2.flat))
+
+    def fit(self, X, y):
+        i0, i1, i2 = self.layers
+
+        m, n = X.shape
+        n_params = i1 * (i0 + 1) + i2 * (i1 + 1)
+        eps = np.sqrt(6) / np.sqrt(i0 + i2)
+        # uniform initialization in [-eps, eps]
+        initial_params = np.random.rand(n_params) * 2 * eps - eps
+
+        self.X = self.append_ones(X)
+        self.y = y
+
+        params, _, _ = fmin_l_bfgs_b(self.cost_grad, initial_params, **self.fmin_args)
+
+        self.Theta1, self.Theta2 = self.unfold_params(params)
+
+    def predict(self, X):
+        m, n = X.shape
+        
+        a2 = sigmoid(self.append_ones(X).dot(self.Theta1.T))
+        a3 = sigmoid(np.column_stack((np.ones(m), a2)).dot(self.Theta2.T))
+
+        return a3
+
+    def append_ones(self, X):
+        m, n = X.shape
+        if scipy.sparse.issparse(X):
+            return scipy.sparse.hstack((np.ones((m, 1)), X)).tocsr()
+        else:
+            return np.column_stack((np.ones(m), X))
+        
+
+def sigmoid(x):
+    return 1.0 / (1.0 + np.exp(-x))
+
+def sigmoid_gradient(x):
+    sx = sigmoid(x)
+    return sx * (1 - sx)
+
 def nn(X, y, X_test, y_test=None):
-    from mlclass.neural_networks import NeuralNetwork
-
     lambda_ = float(sys.argv[2])
     hidden = int(sys.argv[3])
     maxfun = int(sys.argv[4])
         'jure_lr_0.04', 
         'jure_knn_17',
         'jure_softmax_0.06',
-#        'jure_nb',
-
         'pca100',
     ]
     print res
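+    # stacking: load each base model's saved predictions/features and
+    # concatenate them column-wise into a single design matrix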
     Xs = []
     for r in res:
         X = pickle.load(open('res/%s.pkl' % r))
-        if 'pca' in r:
-            X = X[0]
         Xs.append(X)
     X = np.hstack(Xs)
 

report/jzbontar.tex

 component analysis without the preprocessing step of mean centering the
 data.}. Latent semantic indexing was implemented with the help of the
 SVDLIBC library~\footnote{\url{http://tedlab.mit.edu/~dr/SVDLIBC/}}.
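+Concretely, if SVDLIBC factorizes the tf-idf matrix as
+$X \approx U_k \Sigma_k V_k^{T}$ with $k$ retained singular triplets, each
+document is represented by the projection $X V_k$; this corresponds to the
+\texttt{X.dot(V)} step in \texttt{main.py}.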
+Stacking was the most time-consuming component; it needed 7 hours to run.
 
 
 \section{Results}
 
 \subsection{Conclusion}
 In this report we presented our approach to the EMC Israel Data Science
-Challenge. The method required TODO minutes to run and was placed second
+Challenge. The method required 8 hours to run and placed second
 in the competition.
 
 From Table \ref{tab:results} we conclude that stacking has an enormous
 impact on score on this particular dataset. The best score obtained by
-any single method was 0.29837. With stacking, we were able to improve the
-score to our final score of 0.19812.
+any single method was 0.29837. With stacking, we were able to improve
+it to our final score of 0.19812.
 
 \end{document}
 #! /bin/sh -x
 
-#./main.py data
-#time ./main.py submit lr 0.04
-#time ./main.py submit knn 17
-#time ./main.py submit softmax 0.06
-#time ./main.py pca 100
-time ./main.py submit stack 0.2 50 310
+./main.py data
+./main.py submit lr 0.04
+./main.py submit knn 17
+./main.py submit softmax 0.06
+./main.py pca 100
+./main.py submit stack 0.2 50 310