# Commits

committed 1bf6e45

LICENCE

• Participants
• Parent commits 1db0a4e

# File main.py

+#! /usr/bin/python2
+
 import sys
 import pickle
 import multiprocessing

 ### data ###
 if sys.argv[1] == 'data':
-    dir = 'data/'
+    dir = 'data_orig/'

     X_tr = EMC_IO.EMC_ReadData(dir + 'train_data.csv')
     X_te = EMC_IO.EMC_ReadData(dir + 'test_data.csv')
     X = tfidf(X, remove=10)
     X = sp.hstack((np.ones((X.shape[0], 1)), X)).tocsr()

-    print cv(X, y, lr_all, submit)
+    cv(X, y, lr_all, submit)

 ### logistic regression (scikit-learn) ###
 def lr_sk(X, y, X_test):

# File report/jzbontar.tex

 \usepackage{amsfonts}
 \usepackage{amsmath}

+\newcommand{\specialcell}[2][c]{%
+  \begin{tabular}[#1]{@{}l@{}}#2\end{tabular}}
+
 \begin{document}
 \title{EMC Israel Data Science Challenge \\
 Team ULjubljana's Solution}
 \section{Introduction}

 We solved the challenge by training a set of base learners
-and combining them whit stacking.
+and combining them with stacking. Our final score was 0.19812
+which was enough to claim second place in the competition.
+
+\section{Data Preparation}
+
+We found data preparation to be a very important
+component in our solution. At the end, we settled
+on using {\em tf-idf}.  Smoothing (similar to Laplace
+smoothing~\footnote{\url{http://en.wikipedia.org/wiki/Additive_smoothing}})
+and sublinear term frequency scaling (take the logarithm of every term
+count) were also applied. The exact method that we used can be found in
+the function {\tt tfidf} in the file {\tt main.py}.
+
+Prior to training, we used basic feature selection, where we removed
+the features with document frequency less than {\tt n} ({\tt n} was a
+parameter of feature selection). We tried different values of {\tt n}
+for different models.

 \section{Methods}
-\subsection{Logistic Regression}
+\subsection{L2-Regularized Logistic Regression}
+We used our own implementation of L2-regularized logistic regression,
+implemented in Python. We removed features with document frequency
+less than 10 and added an artificial constant feature with value 1.
+In order to extend logistic regression to the multi-class case,
+the binary relevance method was used, i.e., we trained a separate model
+for each of the 97 classes, performing one-vs-all classification on each
+run. Logistic regression took around 15 minutes to run\footnote{
+All timings were performed on an Intel i7-3770 3.4 GHz processor,
+with 16 GB of RAM.}.
+
+\subsection{K-Nearest Neighbour}
+We implemented our own k-nearest neighbour (KNN) algorithm using the
+cosine similarity measure. A slight modification of KNN was used, which
+is best described on an example. Assume that the three ($k=3$)
+closest documents belong to classes 1, 3, and 1 and that their similarity
+with the query document is 0.9, 0.5, and 0.4, respectively. The predicted
+probabilities in this case are $[0, 0.9 + 0.4, 0, 0.5, 0, 0, \ldots, 0]
++ 0.001$ \footnote{Note that the rightmost plus sign is overloaded and
+should be interpreted as adding a constant to every element of the
+vector.}.  $eps = 0.001$ is a smoothing parameter. The bigger the
+$eps$, the more each probability is pulled towards $1/97$. The vector of
+predictions was normalized to sum to 1.  As with logistic regression,
+we removed features with document frequency less than 10. The number of
+neighbours was set to 17. KNN took 25 minutes to run.
+
+\subsection{Softmax Regression}
+Softmax regression is a generalization of logistic
+regression to the multi-class classification setting
+and was implemented following the description on
+\url{http://ufldl.stanford.edu/wiki/index.php/Softmax_Regression}.
+As always, we removed features with document frequency less than 10.
+Softmax regression is memory intensive. It needs around 14GB of RAM and
+30 minutes to run.
+
+
+\section{Stacking}
+We combined the predictions of the described methods with stacking.  5-fold
+cross-validation was used to obtain out-of-sample predictions for each
+of the three base learners on all the training samples. These predictions
+were subsequently used as training data for an artificial neural network,
+which produced a blend of the predictions of the base learners. We
+used our own implementation of artificial neural networks.  We noticed
+that stacking improved if we added 100 components of latent semantic
+indexing~\footnote{Latent semantic indexing is nothing more than principal
+component analysis without the preprocessing step of mean centering the
+data.}. Latent semantic indexing was implemented with the help of the
+SVDLIBC library~\footnote{\url{http://tedlab.mit.edu/~dr/SVDLIBC/}}.
+
+
+\section{Results}
+Table \ref{tab:results} presents the scores of our submissions, from
+our earliest attempts with the score of 0.29837, to our final submission
+with score 0.19812.
+
+\begin{table}[htb]
+\centering
+\begin{tabular}{ll}
+\textbf{Method} & \textbf{Score} \\\hline\hline
+Logistic Regression & 0.29837 \\\hline
+Stack: Logistic Regression & 0.26025 \\\hline
+Stack: Logistic Regression, KNN & 0.22721 \\\hline
+Stack: Logistic Regression, KNN, Softmax & 0.20912 \\\hline
+Stack: Logistic Regression, KNN, Softmax, LSI & 0.19812 \\
+\end{tabular}
+\caption{This table presents scores of our submissions. The word {\em
+stack} means that the outputs were passed to a neural network for stacking
+as described in this report. Note that it also makes sense to send the
+output of a single learner through the stacking procedure, as can be
+seen from the logistic regression example in the table.}
+\label{tab:results}
+\end{table}
+
+\section{How To Generate the Solution}
+
+The source code can be downloaded from an online repository at
+\url{https://bitbucket.org/jzbontar/emc}. First, unpack the training
+and test sets into the directory \texttt{data\_orig}. The directory
+\texttt{data\_orig} should contain three files:
+
+\begin{itemize}
+\item \texttt{test\_data.csv}
+\item \texttt{train\_data.csv}
+\item \texttt{train\_labels.csv}
+\end{itemize}
+
+
+To train the models and generate the solution run the command
+\texttt{run.sh}. The predictions will be stored in a file named
+\texttt{submission.csv.gz}.
+
+
+\section{Conclusion}
+In this report we presented our approach to the EMC Israel Data Science
+Challenge. The method required TODO minutes to run and was placed second
+in the competition.
+
+From Table \ref{tab:results} we conclude that stacking has an enormous
+impact on score on this particular dataset. The best score obtained by
+any single method was 0.29837. With stacking, we were able to improve the
+score to our final score of 0.19812.

 \end{document}

# File run.sh

 #! /bin/sh -x

 #./main.py data
-time ./main.py submit lr 0.04
+#time ./main.py submit lr 0.04
+#time ./main.py submit knn 17
+#time ./main.py submit softmax 0.06
+#time ./main.py pca 100
+time ./main.py submit stack 0.2 50 310