\documentclass[17pt]{beamer}
\usefonttheme[onlymath]{serif}
\usepackage{latexsym,amsmath,url}
\usepackage{pifont}
\usepackage{hyperref}
\usepackage{color}
\usepackage[utf8]{inputenc}
\usepackage{framed}

\DeclareSymbolFont{AMSb}{U}{msb}{m}{n}
\DeclareMathOperator*{\argmax}{argmax}
\newcommand{\tick}{{\color{blue}\ding{51}}}
\newcommand{\cross}{{\color{red}\ding{55}}}


\mode<presentation>{
  \usetheme{Boadilla}
}

\title[Word class learning]{Word class learning}
\subtitle{Computational and cognitive aspects}

\date[CLIN 2012]{CLIN 2012}
\author[Chrupa{\l}a et al.]{Grzegorz Chrupa{\l}a \and Afra Alishahi \\ \and Yevgen Matusevych}

\institute[UdS, UvT]{UdS and UvT}

\begin{document}
\frame{\titlepage}

\begin{frame}\frametitle{Word classes}
  \begin{block}{}\small
    \begin{center}
      \begin{tabular}{c}
        \color{red} go come fit try hang read say take see blow \\
        \color{blue} bricks bits food things medicine cream \\
        \color{olive} the your that this a my his some\\
      \end{tabular}
    \end{center}
  \end{block}

  \begin{block}{}\small
    \begin{center}
      \begin{tabular}{c}
        \color{magenta} Berlin  Bangkok  Tokyo  Warsaw\\
        \color{violet} Sarkozy  Merkel Obama  Berlusconi \\
        \color{orange} Mr  Ms  President  Dr \\
      \end{tabular}
    \end{center}
  \end{block}
  \begin{itemize}
  \item Groups of words sharing syntax/semantics
  \item Useful for generalization and abstraction
  \end{itemize}
\end{frame}

\begin{frame}
  \frametitle{Perspectives on class learning}
  \begin{itemize}
  \item NLP
    \begin{itemize}
    \item Efficiency
    \item Performance on NLP tasks
    \end{itemize}
  \item Cognitive modeling
    \begin{itemize}
    \item Plausible cognitive constraints 
    \item Performance on simulations of human tasks
    \end{itemize}
  \end{itemize}
\end{frame}



\begin{frame}
\frametitle{Goals}
  \begin{itemize}
  \item Bring two perspectives closer together
  \item Analyze and improve 2 algorithms
    \begin{itemize}
    \item {\color{blue}$\Delta$H} - simulate online learning of word
      classes by humans (Chrupa{\l}a and Alishahi 2010)
    \item {\color{blue}Word class LDA} - efficiently learn soft word
      classes for NLP (Chrupa{\l}a 2011)
    \end{itemize}
  \end{itemize}
\end{frame}


\begin{frame}
  \frametitle{Brief comparison}
  \begin{center}
    \begin{tabular}{l|c|c}
      & $\Delta$H     & cLDA \\ \hline
      Token level     & \tick     & \tick \\
      Soft classes    & \tick     & \tick \\\hline
      Bayesian        & \cross    & \tick \\
      \bf Online      & \tick     & \cross \\
      \bf Parameters  & \cross    & \tick \\ 
      Adaptive K      & \tick     & \cross \\
      Fast            & \cross    & \tick \\
    \end{tabular}
  \end{center}
\end{frame}

\begin{frame}
  \frametitle{$\Delta$H}
  \begin{itemize}
  \item Incrementally optimizes a joint entropy criterion:
    \begin{small}
      \begin{equation*}
        H(X,Y) =  {\color{blue} H(X|Y)} + {\color{red} H(Y)} 
      \end{equation*}
    \end{small}
    \begin{itemize}
    \item Small {\color{red}class entropy} - parsimony
    \item Small {\color{blue} conditional feature entropy} -
      informativeness
    \end{itemize}
  \item New classes are created as needed
  \item No free parameters
  \end{itemize}
\end{frame}
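% Hedged sketch, not part of the original deck: one plausible reading of the
% greedy per-token update behind the criterion above.
\begin{frame}
  \frametitle{$\Delta$H: update sketch}
  \small A minimal sketch of the per-token update, assuming greedy
  assignment: each incoming token $x$ goes to the class (existing or new)
  that yields the smallest increase in joint entropy,
  \begin{small}
    \begin{equation*}
      y^{*} = \operatorname*{argmin}_{y \,\in\, Y \cup \{y_{\mathrm{new}}\}} \Delta H(y),
      \qquad
      \Delta H(y) = H_{x \to y}(X,Y) - H(X,Y)
    \end{equation*}
  \end{small}
  where $H_{x \to y}(X,Y)$ is the joint entropy after tentatively adding
  $x$'s features to class $y$.
\end{frame}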

\begin{frame}
  \frametitle{Word class LDA}
  \begin{itemize}
  \item \small Generative model equivalent to LDA for topic models
  \end{itemize}
  \begin{center}
    \includegraphics[scale=0.25]{lda-plate.pdf}
  \end{center}
\end{frame}
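% Hedged sketch, not part of the original deck: the generative story implied
% by the plate diagram, assuming word types play the role of LDA documents
% and context features the role of LDA words.
\begin{frame}
  \frametitle{Word class LDA: generative story (sketch)}
  \small A sketch of the generative story, assuming word types act as
  documents and context features as words:
  \begin{itemize}\small
  \item For each class $z$: draw $\phi_z \sim \mathrm{Dir}(\beta)$ over context features
  \item For each word type $d$: draw $\theta_d \sim \mathrm{Dir}(\alpha)$ over classes
  \item For each token of $d$: draw class $z \sim \theta_d$, then feature $w \sim \phi_z$
  \end{itemize}
\end{frame}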


\begin{frame}
   \frametitle{Word class LDA}
   \begin{itemize}
   \item Number of classes K is specified as a parameter
   \item $\alpha$ and $\beta$ control sparsity of priors
   \item Inference using Gibbs sampler (batch)
   \end{itemize}
\end{frame}


\begin{frame}
  \frametitle{Model evaluation}
  \begin{center}
    \begin{block}{}
      Evaluate \vskip 0.5cm
      \begin{itemize}
      \item {\bf Parametrized} $\Delta$H
      \item {\bf Online} Gibbs sampler for word class LDA
      \end{itemize}\vskip 0.5cm
      on the {\bf same task} and the {\bf same dataset}.
    \end{block}
  \end{center}
\end{frame}

\begin{frame}
  \frametitle{Dataset}
  \begin{itemize}
  \item Manchester portion of CHILDES (mothers)
  \item Discard one-word sentences and punctuation
  \end{itemize}
  \begin{center}
    \begin{tabular}[t]{l r r r}
      \hline
      {\bf Dataset} & {\bf Sessions} & {\bf \#Sent} & {\bf \#Words} \\
      \hline
      Training    & 26--28  & 22,491   & 125,339 \\ 
      Development & 29--30  &  15,193     &  85,361 \\
      \hline
    \end{tabular}
  \end{center}
\end{frame}


\begin{frame}
  \frametitle{Task: word prediction}
  \begin{itemize}
  \item Relevant for cognitive modeling
  \item Used in NLP -- language model evaluation 
  \end{itemize}
\end{frame}

\begin{frame}
  \frametitle{Word prediction}
  \begin{itemize}\small
  \item (Soft)-assign classes from context
  \item Rank words based on predicted class
  \end{itemize}
\begin{block}{Reciprocal rank}
  \small
  \begin{tabular}{c|c|c}
    want{\textvisiblespace}to  
    & \color{gray} put & them{\textvisiblespace}on   \\
    \pause 
    &     $y_{123}$                      &   \\
  \end{tabular}
  \begin{tabular}{l|l|r}
    $y_{123}$ 
    &      make & \\
    &      take & \\
    & \color{red}put& $\textit{rank}^{-1}=\frac{1}{3}$\\
    &      get & \\
    &      sit & \\
    &      eat & \\
    &      let & \\
  \end{tabular}
\end{block}
\end{frame}
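% Standard definition, added for reference; not specific to this deck.
\begin{frame}
  \frametitle{Mean reciprocal rank (MRR)}
  \small The score reported in the results below is the standard mean
  reciprocal rank over the $N$ test tokens:
  \begin{small}
    \begin{equation*}
      \mathrm{MRR} = \frac{1}{N} \sum_{i=1}^{N} \textit{rank}_i^{-1}
    \end{equation*}
  \end{small}
  where $\textit{rank}_i$ is the position of the true word in the ranked
  prediction list for token $i$.
\end{frame}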

\begin{frame}
   \frametitle{Parametrizing $\Delta$H}
   \begin{itemize}
   \item No free parameters in $\Delta$H
     \begin{itemize}
     \item[\tick] No need to optimize them separately
     \item[\cross] Lack of flexibility
     \end{itemize}
   \item If we force parametrization
     \begin{itemize}
     \item Is the algorithm well-behaved?
     \item Can we smoothly control the trade-off?
     \end{itemize}
   \end{itemize}
\end{frame}

\begin{frame}
  \begin{block}{Parametrized $\Delta$H}
    \begin{small}
      \begin{equation*}
        H_{\alpha}(X,Y) =  {\color{blue} \alpha H(X|Y)} + {\color{red} (1-\alpha) H(Y)} 
      \end{equation*}
    \end{small}
  \end{block}
  \begin{itemize}
  \item \small $\alpha = 0.5$ recovers the original criterion up to a
    constant factor: $H_{0.5}(X,Y) = \tfrac{1}{2}\, H(X,Y)$
  \end{itemize}
 \end{frame}

 \begin{frame}
   \frametitle{Results}
   \begin{center}
\vskip -1cm
     \includegraphics[scale=0.45]{delta-h-alpha-mrr.pdf}
   \end{center}
 \end{frame}

 \begin{frame}
   \frametitle{Interpretation}
   \begin{itemize}
   \item K increases with $\alpha$
   \item Word prediction performance changes smoothly with $\alpha$
   \item Values of $\alpha$ slightly $>$ 0.5
     \begin{itemize}
     \item Give best MRR
     \item Best ratio of $K_{\text{test}}/K$
     \end{itemize}
   \item Some degree of trade-off tuning is possible via $\alpha$
   \item Parameterless $\Delta$H close to optimal
   \end{itemize}
 \end{frame}


 \begin{frame}
   \frametitle{Running word class LDA online}
   \begin{itemize}
   \item Common LDA inference algorithm: batch collapsed Gibbs sampling
   \item Online extensions compared by Canini et al.\ (2009) for topic modeling
   \item Only one of them, oLDA, is strictly online
   \item oLDA did not work very well for inferring document topics
   \end{itemize}
 \end{frame}

 \begin{frame}
   \frametitle{Word classes with online LDA (CoLaDA)}
   \begin{itemize}
   \item $d$ - word type
   \item $w$ - context feature
   \item $z$ - class 
   \item { \color{gray} Replicate incoming sentence $j$ times }
     \begin{itemize}

     \item For each $w_i$ in the sentence, sample:
       \begin{equation*}
         P(z_i|\mathbf{z}_{i-1},\mathbf{w}_i, \mathbf{d}_i) \propto 
         \frac{(n_{z,d} + \alpha) \times (n_{z,w} + \beta)}{n_{z,\bullet} + V\beta}
       \end{equation*}
       and update the counts.
     \end{itemize}
   \end{itemize}
 \end{frame}
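% Hedged sketch, not from the paper's code: one way to implement the
% per-token online update on the previous slide in Python. Names and
% hyperparameter values are illustrative only.
\begin{frame}[fragile]
  \frametitle{CoLaDA update: sketch}
  \small A minimal sketch of the count-based update, assuming the sampling
  formula above (names and values are illustrative):
{\tiny
\begin{verbatim}
import random
from collections import defaultdict

K, V = 100, 10000          # classes, feature vocabulary size (illustrative)
alpha, beta = 0.1, 0.01    # Dirichlet hyperparameters (illustrative)
n_zd = defaultdict(int)    # class-by-word-type counts
n_zw = defaultdict(int)    # class-by-context-feature counts
n_z = defaultdict(int)     # per-class totals

def sample_class(d, w):
    """Sample a class for one (word type, feature) pair and update counts."""
    weights = [(n_zd[z, d] + alpha) * (n_zw[z, w] + beta) / (n_z[z] + V * beta)
               for z in range(K)]
    z = random.choices(range(K), weights=weights)[0]
    n_zd[z, d] += 1; n_zw[z, w] += 1; n_z[z] += 1
    return z
\end{verbatim}
}
\end{frame}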


 \begin{frame}
   \frametitle{CoLaDA} 
   \begin{itemize}
   \item oLDA did not work for inferring topics
   \item Key difference: word types $d$ recur
     \pause
     \begin{block}{}
       \begin{itemize}
       \item Classes for common word types
         will be {\bf frequently  resampled}
       \item {\bf Without any special arrangements}
       \end{itemize}
     \end{block}
   \end{itemize}
 \end{frame}

 \begin{frame}
   \frametitle{CoLaDA results}
   \begin{center}
\vskip -0.5cm
     \includegraphics[scale=0.8]{colada-best-mrr-10K.pdf}
   \end{center}
 \end{frame}

 \begin{frame}
   \frametitle{CoLaDA discussion}
   \begin{itemize}
   \item Word prediction for $K \in \{200, 800\}$ similar 
     to $\Delta$H
   \item Multiple passes help a bit
   \item Best parameters
     \begin{itemize}
     \item 1 pass: $K \alpha = 0.1$, $\beta = 0.01$
     \item 20 passes: $K \alpha = 10$, $\beta = 0.1$
     \end{itemize}
   \item Clusters don't always ``look'' as coherent as with batch LDA
   \end{itemize}
 \end{frame}

 \begin{frame}\frametitle{$\Delta$H vs CoLaDA}

   \begin{center}
\vskip -1cm
     \includegraphics[scale=0.5]{side-by-side.pdf}
   \end{center}

 \end{frame}

 \begin{frame}
   \frametitle{Conclusion} Look at models from complementary
   perspectives:
\vskip 0.5cm
   \begin{itemize}
\item Make the cognitive model more flexible
  \begin{itemize}
  \item Learn more about it
  \item  Make it tweakable
  \end{itemize}
\item Impose cognitive plausibility on the practical model
  \begin{itemize}
  \item Improve memory efficiency
  \item Learn from data streams
  \end{itemize}
\end{itemize}
\end{frame}

 \begin{frame}
   \frametitle{Future}
   \begin{itemize}
   \item Nonparametric version of CoLaDA
     \begin{itemize}
     \item  Adaptive K
     \end{itemize}
   \item Other tasks, including large-scale NLP
     \begin{itemize}
     \item Speed up (especially $\Delta$H)
     \end{itemize}
   \end{itemize}
 \end{frame}

\begin{frame}
  \begin{center}
    \large Thank you
  \end{center}
\end{frame}

\begin{frame}
   \frametitle{Word prediction: variants}
   \begin{itemize}
   \item $\Delta H_{\max}$
     \[
     P(w \mid h) = P\bigl(w \,\bigm|\, \argmax_{y_i} \mathrm{R}(y_i \mid h)^{-1}\bigr)
     \]
   \item $\Delta H_\Sigma$
     \[
       P(w \mid h) = \sum_{i=1}^N P(w \mid y_i)\,
       \frac{\mathrm{R}(y_i \mid h)^{-1}}{\sum_{j=1}^N \mathrm{R}(y_j \mid h)^{-1}}
     \]
   \end{itemize}
 \end{frame}
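% Hedged sketch, not from the paper's code: the two ranking variants above
% expressed in Python. p_w_given_y and inv_rank are hypothetical names for
% quantities assumed to be precomputed for the current context h.
\begin{frame}[fragile]
  \frametitle{Prediction variants: sketch}
  \small A minimal sketch, assuming per-class word probabilities and inverse
  ranks for the current context are precomputed:
{\tiny
\begin{verbatim}
# p_w_given_y[w, y] = P(w | y); inv_rank[y] = R(y | h)^-1 (assumed given)

def predict_max(w, classes, p_w_given_y, inv_rank):
    """Delta-H_max: condition on the single best-ranked class."""
    best = max(classes, key=lambda y: inv_rank[y])
    return p_w_given_y[w, best]

def predict_sum(w, classes, p_w_given_y, inv_rank):
    """Delta-H_sigma: mix classes, weighted by normalized inverse rank."""
    total = sum(inv_rank[y] for y in classes)
    return sum(p_w_given_y[w, y] * inv_rank[y] / total for y in classes)
\end{verbatim}
}
\end{frame}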
\end{document}