% delta-H / doc / clin-2012 / slides.tex
\documentclass[17pt]{beamer}
\usefonttheme[onlymath]{serif}
% FIX: T1 output encoding so accented glyphs such as \l render and copy
% correctly (the author names use the Polish l-stroke).
\usepackage[T1]{fontenc}
\usepackage[utf8]{inputenc}
\usepackage{latexsym,amsmath,url}
\usepackage{pifont}   % provides \ding{...} used by \tick and \cross below
\usepackage{hyperref} % NOTE: beamer already loads hyperref/color; kept as in original
\usepackage{color}
\usepackage{framed}

% Blackboard-bold symbol font; declared for math symbols from the msb family.
\DeclareSymbolFont{AMSb}{U}{msb}{m}{n}
% argmax with limits typeset underneath in display math (starred form).
\DeclareMathOperator*{\argmax}{argmax}
% Colored check mark / cross used in the feature-comparison tables.
\newcommand{\tick}{{\color{blue}\ding{51}}}
\newcommand{\cross}{{\color{red}\ding{55}}}


\mode<presentation>{
  \usetheme{Boadilla}
}
\title[Word class learning]{Word class learning}
\subtitle{Computational and cognitive aspects}

\date[CLIN 2012]{CLIN 2012}
% FIX: braced {\l} as elsewhere in the file (Chrupa{\l}a) so the following
% letter is not gobbled by the control word's trailing-space rules.
\author[Chrupa{\l}a...]{Grzegorz Chrupa{\l}a \and Afra Alishahi \\ \and Yevgen Matusevych}

% FIX: \institute takes a mandatory argument; the original commented it out,
% which made \institute grab the next token (\begin) and break compilation.
% Restored the argument that was present in the commented-out lines.
\institute[UdS, UvT]{UdS and UvT}

\begin{document}

% Title slide.
\begin{frame}
  \titlepage
\end{frame}

% Slide: motivating examples of word classes, color-coded by class.
% Top block: classes from child-directed speech; bottom block: adult classes.
\begin{frame}\frametitle{Word classes}
  \begin{block}{}\small
    \begin{center}
      \begin{tabular}{c}
        % verbs
        \color{red} go come fit try hang read say take see blow \\
        % nouns
        \color{blue} bricks bits food things medicine cream \\
        % determiners
        \color{olive} the your that this a my his some\\
      \end{tabular}
    \end{center}
  \end{block}

  \begin{block}{}\small
    \begin{center}
      \begin{tabular}{c}
        % cities
        \color{magenta} Berlin  Bangkok  Tokyo  Warsaw\\
        % politicians
        \color{violet} Sarkozy  Merkel Obama  Berlusconi \\
        % honorifics
        \color{orange} Mr  Ms  President  Dr \\
      \end{tabular}
    \end{center}
  \end{block}
  \begin{itemize}
  \item Groups of words sharing syntax/semantics
  \item Useful for generalization and abstraction
  \end{itemize}
\end{frame}

% Slide: the two communities interested in class learning and the
% (different) criteria each one uses to judge an algorithm.
\begin{frame}
  \frametitle{Perspectives on class learning}
  \begin{itemize}
  \item NLP
    \begin{itemize}
    \item Efficiency
    \item Performance on NLP tasks
    \end{itemize}
  \item Cognitive modeling
    \begin{itemize}
    \item Plausible cognitive constraints 
    \item Performance on simulations of human tasks
    \end{itemize}
  \end{itemize}
\end{frame}



% Slide: goals of the talk — unify the NLP and cognitive perspectives by
% analyzing/improving one algorithm from each camp.
\begin{frame}
\frametitle{Goals}
  \begin{itemize}
  \item Bring two perspectives closer together
  \item Analyze and improve 2 algorithms
    \begin{itemize}
    \item {\color{blue}$\Delta$H} - simulate online learning of word
      classes by humans (Chrupa{\l}a and Alishahi 2010)
    \item {\color{blue}Word class LDA} - efficiently learn soft word
      classes for NLP (Chrupa{\l}a 2011)
    \end{itemize}
  \end{itemize}
\end{frame}


% Slide: feature matrix contrasting Delta-H and word class LDA (cLDA).
% The bold rows are the properties this work adds to each model.
\begin{frame}
  \frametitle{Brief comparison}
  \begin{center}
    \begin{tabular}{l|c|c}
      & $\Delta$H     & cLDA \\ \hline
      Token level     & \tick     & \tick \\
      Soft classes    & \tick     & \tick \\\hline
      Bayesian        & \cross    & \tick \\
      % FIX: obsolete two-letter font command \bf replaced by \textbf
      % (per l2tabu; \bf resets other font attributes and is deprecated).
      \textbf{Online}     & \tick     & \cross \\
      \textbf{Parameters} & \cross    & \tick \\
      Adaptive K      & \tick     & \cross \\
      Fast            & \cross    & \tick \\
    \end{tabular}
  \end{center}
\end{frame}

% Slide: the Delta-H objective. Joint entropy H(X,Y) decomposes into the
% class entropy H(Y) (parsimony) and the conditional feature entropy
% H(X|Y) (informativeness); the algorithm trades the two off online.
\begin{frame}
  \frametitle{$\Delta$H}
  \begin{itemize}
  \item Incrementally optimizes a joint entropy criterion:
    \begin{small}
      \begin{equation*}
        H(X,Y) =  {\color{blue} H(X|Y)} + {\color{red} H(Y)} 
      \end{equation*}
    \end{small}
    \begin{itemize}
    \item Small {\color{red}class entropy} - parsimony
    \item Small {\color{blue} conditional feature entropy} -
      informativeness
    \end{itemize}
  \item New classes are created as needed
  \item No free parameters
  \end{itemize}
\end{frame}

% Slide: word class LDA, shown via its plate diagram.
% NOTE(review): the graphic is referenced with an explicit .pdf extension
% and absolute scale; harmless here, but omitting the extension and sizing
% relative to \linewidth would be the more portable form — confirm before changing.
\begin{frame}
  \frametitle{Word class LDA}
  \begin{itemize}
  \item \small Generative model equivalent to LDA for topic models
  \end{itemize}
  \begin{center}
    \includegraphics[scale=0.25]{lda-plate.pdf}
  \end{center}
\end{frame}


% Slide: word class LDA hyperparameters and inference procedure.
\begin{frame}
   \frametitle{Word class LDA}
   \begin{itemize}
   \item Number of classes K is specified as a parameter
   \item $\alpha$ and $\beta$ control sparsity of priors
   \item Inference using Gibbs sampler (batch)
   \end{itemize}
\end{frame}


% Slide: evaluation setup — each model is given the other's missing
% property (parameters for Delta-H, an online sampler for cLDA) and both
% are compared on the same task and data.
\begin{frame}
  \frametitle{Model evaluation}
  \begin{center}
    \begin{block}{}
      Evaluate \vskip 0.5cm
      \begin{itemize}
      % FIX: obsolete {\bf ...} replaced by \textbf{...} (per l2tabu).
      \item \textbf{Parameterized} $\Delta$H
      \item \textbf{Online} Gibbs sampler for word class LDA
      \end{itemize}\vskip 0.5cm
      on the \textbf{same task} and the \textbf{same dataset}.
    \end{block}
  \end{center}
\end{frame}

% Slide: corpus statistics for the Manchester portion of CHILDES.
\begin{frame}
  \frametitle{Dataset}
  \begin{itemize}
  \item Manchester portion of CHILDES (mothers)
  \item Discard one-word sentences and punctuation
  \end{itemize}
  \begin{center}
    % FIX: tabular's optional argument accepts only t, b, or c;
    % "[!t]" is a float placement specifier and is invalid here.
    \begin{tabular}[t]{l r r r }
      \hline
      % FIX: obsolete {\bf ...} replaced by \textbf{...} (per l2tabu).
      \textbf{Data Set} & \textbf{Sessions} & \textbf{\#Sent} & \textbf{\#Words} \\
      \hline
      Training    & 26--28  & 22,491   & 125,339 \\ 
      Development & 29--30  &  15,193     &  85,361 \\
      \hline
    \end{tabular}
  \end{center}
\end{frame}


% Slide: why word prediction is a suitable shared evaluation task.
\begin{frame}
  \frametitle{Task: word prediction}
  \begin{itemize}
  \item Relevant for cognitive modeling
  \item Used in NLP -- language model evaluation 
  \end{itemize}
\end{frame}

% Slide: word-prediction walkthrough. The context assigns class y_123 to
% the held-out position ("put"); candidate words are ranked within that
% class and scored by reciprocal rank (here: put is 3rd, so 1/3).
\begin{frame}
  \frametitle{Word prediction}
  \begin{itemize}\small
  \item (Soft)-assign classes from context
  \item Rank words based on predicted class
  \end{itemize}
\begin{block}{Reciprocal rank}
  \small
  \begin{tabular}{cc|c|cc}
    want & to  & \color{gray} put & them & on   \\
    % \pause reveals the inferred class on the second overlay
    \pause 
    &     & $y_{123}$                 &      &   \\
  \end{tabular}
  \begin{tabular}{l|l|r}
    $y_{123}$ 
    &      make & \\
    &      take & \\
    % FIX: \textit inside math replaced by \mathit — "rank" is a
    % multi-letter math identifier, not surrounding-text italics.
    & \color{red}put& $\mathit{rank}^{-1}=\frac{1}{3}$\\
    &      get & \\
    &      sit & \\
    &      eat & \\
    &      let & \\
  \end{tabular}
\end{block}
\end{frame}

% Slide: motivation for adding a parameter to Delta-H — trading the
% convenience of a parameter-free model against flexibility.
\begin{frame}
   \frametitle{Parametrizing $\Delta$H}
   \begin{itemize}
   \item No free parameters in $\Delta$H
     \begin{itemize}
     \item[\tick] No need to optimize them separately
     \item[\cross] Lack of flexibility
     \end{itemize}
   \item If we force parameterization
     \begin{itemize}
     \item Is the algorithm well-behaved?
     \item Can we smoothly control the tradeoff?
     \end{itemize}
   \end{itemize}
\end{frame}

% Slide: the parametrized objective. alpha in [0,1] interpolates between
% the conditional feature entropy and the class entropy terms of H(X,Y);
% alpha = 1/2 recovers the original (unweighted) criterion up to scale.
\begin{frame}
  \begin{block}{Parametrized $\Delta$H}
    \begin{small}
      \begin{equation*}
        H_{\alpha}(X,Y) =  {\color{blue} \alpha H(X|Y)} + {\color{red} (1-\alpha) H(Y)} 
      \end{equation*}
    \end{small}
  \end{block}
 \end{frame}

% Slide: results — MRR and number of classes K as functions of alpha.
% NOTE(review): the negative \vskip values are manual fine-tuning to fit
% both plots on one 17pt slide; re-check if the figures are regenerated.
\begin{frame}
   \frametitle{Results}
\vskip -2cm
   \begin{center}
     \includegraphics[scale=0.5]{delta-h-alpha-mrr.pdf}
\vskip -1.7cm
     \includegraphics[scale=0.5]{delta-h-alpha-K.pdf}
   \end{center}
 \end{frame}



% Closing slide.
\begin{frame}
  \begin{center}
    \large Thank you
  \end{center}
\end{frame}

% Backup slide: two ways to turn soft class assignments into a word
% distribution. R(y_i|h) denotes the rank of class y_i given context h,
% so R(.)^{-1} is a reciprocal-rank weight.
\begin{frame}
   \frametitle{Word prediction: variants}
   \begin{itemize}
   \item $\Delta H_{\max}$
     % FIX: use \mathrm{R} here too, matching the second formula.
     \[
     P(w|h) = P(w|\argmax_i \mathrm{R}(y_i|h)^{-1})
     \]
   \item $\Delta H_\Sigma$
     % FIX: the normalizing sum reused index i, shadowing the outer
     % summation index; renamed the inner index to j.
     \[
       P(w | h) = \sum_{i=1}^N P(w | y_i) \frac{\mathrm{R}(y_i|h)^{-1}}{\sum_{j=1}^N \mathrm{R}(y_j | h)^{-1}}
     \]
   \end{itemize}
 \end{frame}
\end{document}