Commits

yanchuan sim committed c35dc6a Merge

Merge branch 'develop'

Comments (0)

Files changed (72)

docs/html/_modules/bagofwords.html

-
-
-<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
-  "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
-
-
-<html xmlns="http://www.w3.org/1999/xhtml">
-  <head>
-    <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
-    
-    <title>bagofwords &mdash; yc-pyutils 0.1 documentation</title>
-    
-    <link rel="stylesheet" href="../_static/sphinxdoc.css" type="text/css" />
-    <link rel="stylesheet" href="../_static/pygments.css" type="text/css" />
-    
-    <script type="text/javascript">
-      var DOCUMENTATION_OPTIONS = {
-        URL_ROOT:    '../',
-        VERSION:     '0.1',
-        COLLAPSE_INDEX: false,
-        FILE_SUFFIX: '.html',
-        HAS_SOURCE:  true
-      };
-    </script>
-    <script type="text/javascript" src="../_static/jquery.js"></script>
-    <script type="text/javascript" src="../_static/underscore.js"></script>
-    <script type="text/javascript" src="../_static/doctools.js"></script>
-    <link rel="top" title="yc-pyutils 0.1 documentation" href="../index.html" />
-    <link rel="up" title="Module code" href="index.html" /> 
-  </head>
-  <body>
-    <div class="related">
-      <h3>Navigation</h3>
-      <ul>
-        <li class="right" style="margin-right: 10px">
-          <a href="../genindex.html" title="General Index"
-             accesskey="I">index</a></li>
-        <li class="right" >
-          <a href="../py-modindex.html" title="Python Module Index"
-             >modules</a> |</li>
-        <li><a href="../index.html">yc-pyutils 0.1 documentation</a> &raquo;</li>
-          <li><a href="index.html" accesskey="U">Module code</a> &raquo;</li> 
-      </ul>
-    </div>
-      <div class="sphinxsidebar">
-        <div class="sphinxsidebarwrapper">
-<div id="searchbox" style="display: none">
-  <h3>Quick search</h3>
-    <form class="search" action="../search.html" method="get">
-      <input type="text" name="q" />
-      <input type="submit" value="Go" />
-      <input type="hidden" name="check_keywords" value="yes" />
-      <input type="hidden" name="area" value="default" />
-    </form>
-    <p class="searchtip" style="font-size: 90%">
-    Enter search terms or a module, class or function name.
-    </p>
-</div>
-<script type="text/javascript">$('#searchbox').show(0);</script>
-        </div>
-      </div>
-
-    <div class="document">
-      <div class="documentwrapper">
-        <div class="bodywrapper">
-          <div class="body">
-            
-  <h1>Source code for bagofwords</h1><div class="highlight"><pre>
-<span class="kn">import</span> <span class="nn">collections</span><span class="o">,</span> <span class="nn">math</span>
-
-<div class="viewcode-block" id="BOW"><a class="viewcode-back" href="../index.html#bagofwords.BOW">[docs]</a><span class="k">class</span> <span class="nc">BOW</span><span class="p">(</span><span class="n">collections</span><span class="o">.</span><span class="n">Counter</span><span class="p">):</span>
-  <span class="sd">&quot;&quot;&quot;The bag of Words class.</span>
-
-<span class="sd">  This class provides the data structures and methods to store, retrieve, output, etc, of bags of words.</span>
-<span class="sd">  &quot;&quot;&quot;</span>
-
-  <span class="c"># def __init__(self, words=[]): self.update(words)</span>
-
-  <span class="k">def</span> <span class="nf">__str__</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
-    <span class="sd">&quot;&quot;&quot;Returns the bag of words in a LDA-style document string.</span>
-
-<span class="sd">    It basically looks like ``word1:count1 word2:count2 ...``&quot;&quot;&quot;</span>
-
-    <span class="k">return</span> <span class="s">&#39; &#39;</span><span class="o">.</span><span class="n">join</span><span class="p">([</span><span class="s">&#39;{}:{}&#39;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">w</span><span class="p">,</span> <span class="n">c</span><span class="p">)</span> <span class="k">for</span> <span class="n">w</span><span class="p">,</span> <span class="n">c</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">iteritems</span><span class="p">()])</span>
-
-<div class="viewcode-block" id="BOW.add_tokens"><a class="viewcode-back" href="../index.html#bagofwords.BOW.add_tokens">[docs]</a>  <span class="k">def</span> <span class="nf">add_tokens</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">tokens</span><span class="p">):</span>
-    <span class="sd">&quot;&quot;&quot;Adds a list of tokenized words to the bag of words collections.&quot;&quot;&quot;</span>
-
-    <span class="n">bow</span><span class="o">.</span><span class="n">update</span><span class="p">(</span><span class="n">tokens</span><span class="p">)</span>
-</div>
-<div class="viewcode-block" id="BOW.add_from_bow_string"><a class="viewcode-back" href="../index.html#bagofwords.BOW.add_from_bow_string">[docs]</a>  <span class="k">def</span> <span class="nf">add_from_bow_string</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">s</span><span class="p">):</span>
-    <span class="sd">&quot;&quot;&quot;Adds words:count from a bag of words ascii formatted string.&quot;&quot;&quot;</span>
-
-    <span class="bp">self</span><span class="o">.</span><span class="n">clear</span><span class="p">()</span>
-    <span class="k">for</span> <span class="n">wc_str</span> <span class="ow">in</span> <span class="n">s</span><span class="o">.</span><span class="n">split</span><span class="p">():</span>
-      <span class="n">w</span><span class="p">,</span> <span class="n">c</span> <span class="o">=</span> <span class="n">wc_str</span><span class="o">.</span><span class="n">split</span><span class="p">(</span><span class="s">&#39;:&#39;</span><span class="p">)</span>
-      <span class="bp">self</span><span class="p">[</span><span class="n">w</span><span class="p">]</span> <span class="o">=</span> <span class="nb">float</span><span class="p">(</span><span class="n">c</span><span class="p">)</span>
-    <span class="c">#end for</span>
-  <span class="c">#end def</span>
-</div>
-  <span class="k">def</span> <span class="nf">__mul__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">):</span>
-    <span class="sd">&quot;&quot;&quot;Returns the dot product of this class with other.</span>
-
-<span class="sd">    See :func:`dot_product` for details.&quot;&quot;&quot;</span>
-
-    <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">dot_product</span><span class="p">(</span><span class="n">other</span><span class="p">)</span>
-  <span class="c">#end for</span>
-
-<div class="viewcode-block" id="BOW.dot_product"><a class="viewcode-back" href="../index.html#bagofwords.BOW.dot_product">[docs]</a>  <span class="k">def</span> <span class="nf">dot_product</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">):</span>
-    <span class="sd">&quot;&quot;&quot;Returns the dot product of this class with other.</span>
-
-<span class="sd">    Iterates through words in the counter and multiplies counts for the same words together.&quot;&quot;&quot;</span>
-
-    <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">==</span> <span class="mi">0</span> <span class="ow">or</span> <span class="nb">len</span><span class="p">(</span><span class="n">other</span><span class="p">)</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span> <span class="k">return</span> <span class="mf">0.0</span>
-
-    <span class="n">dot_prod</span> <span class="o">=</span> <span class="mf">0.0</span>
-    <span class="k">for</span> <span class="n">w</span><span class="p">,</span> <span class="n">c</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">iteritems</span><span class="p">():</span> <span class="n">dot_prod</span> <span class="o">+=</span> <span class="n">c</span> <span class="o">*</span> <span class="n">other</span><span class="p">[</span><span class="n">w</span><span class="p">]</span>
-
-    <span class="k">return</span> <span class="n">dot_prod</span>
-  <span class="c">#def</span>
-</div>
-<div class="viewcode-block" id="BOW.l2_norm"><a class="viewcode-back" href="../index.html#bagofwords.BOW.l2_norm">[docs]</a>  <span class="k">def</span> <span class="nf">l2_norm</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
-    <span class="sd">&quot;&quot;&quot;Returns the L2-norm of the bag of words vector.&quot;&quot;&quot;</span>
-    <span class="k">return</span> <span class="n">math</span><span class="o">.</span><span class="n">sqrt</span><span class="p">(</span><span class="nb">sum</span><span class="p">([</span><span class="n">c</span> <span class="o">*</span> <span class="n">c</span> <span class="k">for</span> <span class="n">c</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">itervalues</span><span class="p">()]))</span>
-</div>
-<div class="viewcode-block" id="BOW.l1_norm"><a class="viewcode-back" href="../index.html#bagofwords.BOW.l1_norm">[docs]</a>  <span class="k">def</span> <span class="nf">l1_norm</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
-    <span class="sd">&quot;&quot;&quot;Returns the L1-norm of the bag of words vector.&quot;&quot;&quot;</span>
-    <span class="k">return</span> <span class="nb">sum</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">itervalues</span><span class="p">())</span>
-<span class="c">#end class</span>
-</div></div>
-<div class="viewcode-block" id="cosine_similarity"><a class="viewcode-back" href="../index.html#bagofwords.cosine_similarity">[docs]</a><span class="k">def</span> <span class="nf">cosine_similarity</span><span class="p">(</span><span class="n">bow1</span><span class="p">,</span> <span class="n">bow2</span><span class="p">):</span>
-  <span class="sd">&quot;&quot;&quot;Returns the cosine similarity between 2 bag of words objects.&quot;&quot;&quot;</span>
-
-  <span class="k">return</span> <span class="p">(</span><span class="n">bow1</span> <span class="o">*</span> <span class="n">bow2</span><span class="p">)</span> <span class="o">/</span> <span class="p">(</span><span class="n">bow1</span><span class="o">.</span><span class="n">l2_norm</span><span class="p">()</span> <span class="o">*</span> <span class="n">bow2</span><span class="o">.</span><span class="n">l2_norm</span><span class="p">())</span></div>
-</pre></div>
-
-          </div>
-        </div>
-      </div>
-      <div class="clearer"></div>
-    </div>
-    <div class="related">
-      <h3>Navigation</h3>
-      <ul>
-        <li class="right" style="margin-right: 10px">
-          <a href="../genindex.html" title="General Index"
-             >index</a></li>
-        <li class="right" >
-          <a href="../py-modindex.html" title="Python Module Index"
-             >modules</a> |</li>
-        <li><a href="../index.html">yc-pyutils 0.1 documentation</a> &raquo;</li>
-          <li><a href="index.html" >Module code</a> &raquo;</li> 
-      </ul>
-    </div>
-    <div class="footer">
-        &copy; Copyright 2012, yanchuan sim.
-      Created using <a href="http://sphinx.pocoo.org/">Sphinx</a> 1.1.3.
-    </div>
-  </body>
-</html>

docs/html/_modules/index.html

           <div class="body">
             
   <h1>All modules for which code is available</h1>
-<ul><li><a href="bagofwords.html">bagofwords</a></li>
-<li><a href="ycutils/bagofwords.html">ycutils.bagofwords</a></li>
+<ul><li><a href="ycutils/bagofwords.html">ycutils.bagofwords</a></li>
 <li><a href="ycutils/bigvocab.html">ycutils.bigvocab</a></li>
+<li><a href="ycutils/bleu.html">ycutils.bleu</a></li>
 <li><a href="ycutils/corpus.html">ycutils.corpus</a></li>
 <li><a href="ycutils/tfidf.html">ycutils.tfidf</a></li>
 <li><a href="ycutils/tokenize.html">ycutils.tokenize</a></li>
 <li><a href="ycutils/tsvio.html">ycutils.tsvio</a></li>
-</ul>
+<li><a href="ycutils/urls.html">ycutils.urls</a>
+<ul><li><a href="ycutils/urls/googlebooks.html">ycutils.urls.googlebooks</a></li>
+<li><a href="ycutils/urls/printable.html">ycutils.urls.printable</a></li>
+<li><a href="ycutils/urls/webpages.html">ycutils.urls.webpages</a></li>
+<li><a href="ycutils/urls/wikipedia.html">ycutils.urls.wikipedia</a></li>
+<li><a href="ycutils/urls/youtube.html">ycutils.urls.youtube</a></li>
+</ul></li></ul>
 
           </div>
         </div>

docs/html/_modules/ycutils/bagofwords.html

           <div class="body">
             
   <h1>Source code for ycutils.bagofwords</h1><div class="highlight"><pre>
-<span class="sd">&quot;&quot;&quot;This module contains classes and methods that are useful when handling bags of words data structure.</span>
-<span class="sd">&quot;&quot;&quot;</span>
-
-<span class="kn">import</span> <span class="nn">collections</span><span class="o">,</span> <span class="nn">math</span>
+<span class="kn">import</span> <span class="nn">collections</span><span class="o">,</span> <span class="nn">math</span><span class="o">,</span> <span class="nn">sys</span><span class="o">,</span> <span class="nn">random</span><span class="o">,</span> <span class="nn">cPickle</span>
 
 <div class="viewcode-block" id="BOW"><a class="viewcode-back" href="../../bagofwords.html#ycutils.bagofwords.BOW">[docs]</a><span class="k">class</span> <span class="nc">BOW</span><span class="p">(</span><span class="n">collections</span><span class="o">.</span><span class="n">Counter</span><span class="p">):</span>
-  <span class="sd">&quot;&quot;&quot;The bag of Words class.</span>
+  <span class="sd">&quot;&quot;&quot;The bag of words class, which is based on the :class:`Counter` class in :mod:`collections`.</span>
 
 <span class="sd">  This class provides the data structures and methods to store, retrieve, output, etc, of bags of words.</span>
-<span class="sd">  &quot;&quot;&quot;</span>
 
-  <span class="c"># def __init__(self, words=[]): self.update(words)</span>
+<span class="sd">  :param tokens: iterable list of tokens to add to the bag of words.</span>
+<span class="sd">  :param wc_string: initialize bag of words with a word:count formatted string.&quot;&quot;&quot;</span>
+
+  <span class="k">def</span> <span class="nf">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">tokens</span><span class="o">=</span><span class="bp">None</span><span class="p">,</span> <span class="n">wc_string</span><span class="o">=</span><span class="bp">None</span><span class="p">):</span>
+    <span class="k">if</span> <span class="n">tokens</span><span class="p">:</span> <span class="bp">self</span><span class="o">.</span><span class="n">add_tokens</span><span class="p">(</span><span class="n">tokens</span><span class="p">)</span>
+    <span class="k">if</span> <span class="n">wc_string</span><span class="p">:</span> <span class="bp">self</span><span class="o">.</span><span class="n">add_wc_string</span><span class="p">(</span><span class="n">wc_string</span><span class="p">)</span>
+  <span class="c">#end def</span>
 
 <div class="viewcode-block" id="BOW.__str__"><a class="viewcode-back" href="../../bagofwords.html#ycutils.bagofwords.BOW.__str__">[docs]</a>  <span class="k">def</span> <span class="nf">__str__</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
-    <span class="sd">&quot;&quot;&quot;Returns the bag of words in a LDA-style document string.</span>
+    <span class="sd">&quot;&quot;&quot;The string representation.</span>
 
-<span class="sd">    It basically looks like ``word1:count1 word2:count2 ...``&quot;&quot;&quot;</span>
+<span class="sd">    See :func:`to_wc_string`.&quot;&quot;&quot;</span>
 
-    <span class="k">return</span> <span class="s">&#39; &#39;</span><span class="o">.</span><span class="n">join</span><span class="p">([</span><span class="s">&#39;{}:{}&#39;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">w</span><span class="p">,</span> <span class="n">c</span><span class="p">)</span> <span class="k">for</span> <span class="n">w</span><span class="p">,</span> <span class="n">c</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">iteritems</span><span class="p">()])</span>
+    <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">to_wc_string</span><span class="p">()</span>
+  <span class="c">#end def</span>
+</div>
+<div class="viewcode-block" id="BOW.__mul__"><a class="viewcode-back" href="../../bagofwords.html#ycutils.bagofwords.BOW.__mul__">[docs]</a>  <span class="k">def</span> <span class="nf">__mul__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">):</span>
+    <span class="sd">&quot;&quot;&quot;Multiplies two :class:`BOW`.</span>
+
+<span class="sd">    :param other: the other BOW object to find the dot product with.</span>
+<span class="sd">    :returns: the dot product of this class with ``other``.</span>
+
+<span class="sd">    See :func:`dot_product`.&quot;&quot;&quot;</span>
+
+    <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">dot_product</span><span class="p">(</span><span class="n">other</span><span class="p">)</span>
+  <span class="c">#end def</span>
 </div>
 <div class="viewcode-block" id="BOW.add_tokens"><a class="viewcode-back" href="../../bagofwords.html#ycutils.bagofwords.BOW.add_tokens">[docs]</a>  <span class="k">def</span> <span class="nf">add_tokens</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">tokens</span><span class="p">):</span>
-    <span class="sd">&quot;&quot;&quot;Adds a list of tokenized words to the bag of words collections.&quot;&quot;&quot;</span>
+    <span class="sd">&quot;&quot;&quot;Adds a list of tokenized words to the bag of words collections.</span>
+
+<span class="sd">    :param tokens: the list of tokens to add to our :class:`BOW`. Should be an iterable.&quot;&quot;&quot;</span>
 
-    <span class="n">bow</span><span class="o">.</span><span class="n">update</span><span class="p">(</span><span class="n">tokens</span><span class="p">)</span>
+    <span class="bp">self</span><span class="o">.</span><span class="n">update</span><span class="p">(</span><span class="n">tokens</span><span class="p">)</span>
+  <span class="c">#end def</span>
 </div>
-<div class="viewcode-block" id="BOW.add_from_bow_string"><a class="viewcode-back" href="../../bagofwords.html#ycutils.bagofwords.BOW.add_from_bow_string">[docs]</a>  <span class="k">def</span> <span class="nf">add_from_bow_string</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">s</span><span class="p">):</span>
-    <span class="sd">&quot;&quot;&quot;Adds words:count from a bag of words ascii formatted string.&quot;&quot;&quot;</span>
+<div class="viewcode-block" id="BOW.add_wc_string"><a class="viewcode-back" href="../../bagofwords.html#ycutils.bagofwords.BOW.add_wc_string">[docs]</a>  <span class="k">def</span> <span class="nf">add_wc_string</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">s</span><span class="p">):</span>
+    <span class="sd">&quot;&quot;&quot;Adds words and their counts from a ``word:count`` formatted string.</span>
 
-    <span class="bp">self</span><span class="o">.</span><span class="n">clear</span><span class="p">()</span>
+<span class="sd">    :param s: a string in ``word:count`` format.&quot;&quot;&quot;</span>
     <span class="k">for</span> <span class="n">wc_str</span> <span class="ow">in</span> <span class="n">s</span><span class="o">.</span><span class="n">split</span><span class="p">():</span>
       <span class="n">w</span><span class="p">,</span> <span class="n">c</span> <span class="o">=</span> <span class="n">wc_str</span><span class="o">.</span><span class="n">split</span><span class="p">(</span><span class="s">&#39;:&#39;</span><span class="p">)</span>
-      <span class="bp">self</span><span class="p">[</span><span class="n">w</span><span class="p">]</span> <span class="o">=</span> <span class="nb">float</span><span class="p">(</span><span class="n">c</span><span class="p">)</span>
+      <span class="bp">self</span><span class="p">[</span><span class="n">w</span><span class="p">]</span> <span class="o">+=</span> <span class="nb">float</span><span class="p">(</span><span class="n">c</span><span class="p">)</span>
     <span class="c">#end for</span>
   <span class="c">#end def</span>
 </div>
-<div class="viewcode-block" id="BOW.__mul__"><a class="viewcode-back" href="../../bagofwords.html#ycutils.bagofwords.BOW.__mul__">[docs]</a>  <span class="k">def</span> <span class="nf">__mul__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">):</span>
-    <span class="sd">&quot;&quot;&quot;Returns the dot product of this class with other.</span>
+<div class="viewcode-block" id="BOW.to_wc_string"><a class="viewcode-back" href="../../bagofwords.html#ycutils.bagofwords.BOW.to_wc_string">[docs]</a>  <span class="k">def</span> <span class="nf">to_wc_string</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
+    <span class="sd">&quot;&quot;&quot;Format the :class:`BOW` object in a ``word:count`` formatted string which looks like ``word1:count1 word2:count2 ...``.</span>
 
-<span class="sd">    See :func:`dot_product` for details.&quot;&quot;&quot;</span>
-
-    <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">dot_product</span><span class="p">(</span><span class="n">other</span><span class="p">)</span>
-  <span class="c">#end for</span>
+<span class="sd">    :returns: the formatted string.&quot;&quot;&quot;</span>
+    <span class="k">return</span> <span class="s">&#39; &#39;</span><span class="o">.</span><span class="n">join</span><span class="p">([</span><span class="s">u&#39;{}:{}&#39;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">w</span><span class="p">,</span> <span class="n">c</span><span class="p">)</span> <span class="k">for</span> <span class="n">w</span><span class="p">,</span> <span class="n">c</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">iteritems</span><span class="p">()])</span>
+  <span class="c">#end def</span>
 </div>
 <div class="viewcode-block" id="BOW.dot_product"><a class="viewcode-back" href="../../bagofwords.html#ycutils.bagofwords.BOW.dot_product">[docs]</a>  <span class="k">def</span> <span class="nf">dot_product</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">):</span>
-    <span class="sd">&quot;&quot;&quot;Returns the dot product of this class with other.</span>
+    <span class="sd">&quot;&quot;&quot;Iterates through words in the counter and multiplies counts for the same words together.</span>
 
-<span class="sd">    Iterates through words in the counter and multiplies counts for the same words together.&quot;&quot;&quot;</span>
+<span class="sd">    :param other: the other BOW object to find the dot product with.</span>
+<span class="sd">    :returns: dot product of this object with ``other``&quot;&quot;&quot;</span>
 
     <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">==</span> <span class="mi">0</span> <span class="ow">or</span> <span class="nb">len</span><span class="p">(</span><span class="n">other</span><span class="p">)</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span> <span class="k">return</span> <span class="mf">0.0</span>
 
     <span class="k">for</span> <span class="n">w</span><span class="p">,</span> <span class="n">c</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">iteritems</span><span class="p">():</span> <span class="n">dot_prod</span> <span class="o">+=</span> <span class="n">c</span> <span class="o">*</span> <span class="n">other</span><span class="p">[</span><span class="n">w</span><span class="p">]</span>
 
     <span class="k">return</span> <span class="n">dot_prod</span>
-  <span class="c">#def</span>
+  <span class="c">#end def</span>
 </div>
 <div class="viewcode-block" id="BOW.l2_norm"><a class="viewcode-back" href="../../bagofwords.html#ycutils.bagofwords.BOW.l2_norm">[docs]</a>  <span class="k">def</span> <span class="nf">l2_norm</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
-    <span class="sd">&quot;&quot;&quot;Returns the L2-norm of the bag of words vector.&quot;&quot;&quot;</span>
+    <span class="sd">&quot;&quot;&quot;:returns: the L2-norm of the bag of words vector.&quot;&quot;&quot;</span>
     <span class="k">return</span> <span class="n">math</span><span class="o">.</span><span class="n">sqrt</span><span class="p">(</span><span class="nb">sum</span><span class="p">([</span><span class="n">c</span> <span class="o">*</span> <span class="n">c</span> <span class="k">for</span> <span class="n">c</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">itervalues</span><span class="p">()]))</span>
+  <span class="c">#end def</span>
 </div>
 <div class="viewcode-block" id="BOW.l1_norm"><a class="viewcode-back" href="../../bagofwords.html#ycutils.bagofwords.BOW.l1_norm">[docs]</a>  <span class="k">def</span> <span class="nf">l1_norm</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
-    <span class="sd">&quot;&quot;&quot;Returns the L1-norm of the bag of words vector.&quot;&quot;&quot;</span>
+    <span class="sd">&quot;&quot;&quot;:returns: the L1-norm of the bag of words vector.&quot;&quot;&quot;</span>
     <span class="k">return</span> <span class="nb">sum</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">itervalues</span><span class="p">())</span>
+  <span class="c">#end def</span>
+<span class="c">#end class</span>
+</div></div>
+<div class="viewcode-block" id="Document"><a class="viewcode-back" href="../../bagofwords.html#ycutils.bagofwords.Document">[docs]</a><span class="k">class</span> <span class="nc">Document</span><span class="p">(</span><span class="n">BOW</span><span class="p">):</span>
+  <span class="sd">&quot;&quot;&quot;A document data structure. Basically a :class:`BOW` object with a :attr:`title` attribute.</span>
+
+<span class="sd">  :param title: title of this document, should be unique.</span>
+<span class="sd">  :param tokens: iterable list of tokens in the document.</span>
+<span class="sd">  :param bow: bag of words object to &#39;convert&#39; to a document object.</span>
+<span class="sd">  :param wc_string: initialize document with a word:count formatted string.</span>
+<span class="sd">  &quot;&quot;&quot;</span>
+
+  <span class="n">title</span> <span class="o">=</span> <span class="bp">None</span>
+  <span class="sd">&quot;&quot;&quot;The title of this document.&quot;&quot;&quot;</span>
+
+  <span class="k">def</span> <span class="nf">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">title</span><span class="o">=</span><span class="s">&#39;&#39;</span><span class="p">,</span> <span class="n">tokens</span><span class="o">=</span><span class="p">[],</span> <span class="n">bow</span><span class="o">=</span><span class="bp">None</span><span class="p">,</span> <span class="n">wc_string</span><span class="o">=</span><span class="bp">None</span><span class="p">):</span>
+    <span class="k">if</span> <span class="n">bow</span><span class="p">:</span> <span class="bp">self</span><span class="o">.</span><span class="n">update</span><span class="p">(</span><span class="n">bow</span><span class="p">)</span>
+    <span class="k">if</span> <span class="n">tokens</span><span class="p">:</span> <span class="bp">self</span><span class="o">.</span><span class="n">add_tokens</span><span class="p">(</span><span class="n">tokens</span><span class="p">)</span>
+    <span class="k">if</span> <span class="n">wc_string</span><span class="p">:</span> <span class="bp">self</span><span class="o">.</span><span class="n">add_wc_string</span><span class="p">(</span><span class="n">wc_string</span><span class="p">)</span>
+
+    <span class="bp">self</span><span class="o">.</span><span class="n">title</span> <span class="o">=</span> <span class="nb">str</span><span class="p">(</span><span class="n">title</span><span class="p">)</span> <span class="k">if</span> <span class="n">title</span> <span class="k">else</span> <span class="n">random_title</span><span class="p">()</span>
+  <span class="c">#end def</span>
+
+<div class="viewcode-block" id="Document.__str__"><a class="viewcode-back" href="../../bagofwords.html#ycutils.bagofwords.Document.__str__">[docs]</a>  <span class="k">def</span> <span class="nf">__str__</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
+    <span class="sd">&quot;&quot;&quot;The string representation.</span>
+
+<span class="sd">    See :func:`to_wc_string`.&quot;&quot;&quot;</span>
+
+    <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">to_wc_string</span><span class="p">()</span>
+  <span class="c">#end def</span>
+</div>
+<div class="viewcode-block" id="Document.to_wc_string"><a class="viewcode-back" href="../../bagofwords.html#ycutils.bagofwords.Document.to_wc_string">[docs]</a>  <span class="k">def</span> <span class="nf">to_wc_string</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
+    <span class="sd">&quot;&quot;&quot;Format the :class:`Document` object in a ``word:count`` formatted string which looks like ``title&lt;tab&gt;word1:count1 word2:count2``.</span>
+
+<span class="sd">    :returns: the formatted string.&quot;&quot;&quot;</span>
+
+    <span class="k">return</span> <span class="s">u&#39;{}</span><span class="se">\t</span><span class="s">{}&#39;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">title</span><span class="p">,</span> <span class="s">&#39; &#39;</span><span class="o">.</span><span class="n">join</span><span class="p">([</span><span class="s">u&#39;{}:{}&#39;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">w</span><span class="p">,</span> <span class="n">c</span><span class="p">)</span> <span class="k">for</span> <span class="n">w</span><span class="p">,</span> <span class="n">c</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">iteritems</span><span class="p">()]))</span>
+  <span class="c">#end def</span>
+</div>
+<div class="viewcode-block" id="Document.add_wc_string"><a class="viewcode-back" href="../../bagofwords.html#ycutils.bagofwords.Document.add_wc_string">[docs]</a>  <span class="k">def</span> <span class="nf">add_wc_string</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">s</span><span class="p">):</span>
+    <span class="sd">&quot;&quot;&quot;Adds tokens and their counts from a ``word:count`` document formatted string.</span>
+
+<span class="sd">    :param s: a string in ``word:count`` format.&quot;&quot;&quot;</span>
+
+    <span class="k">try</span><span class="p">:</span>
+      <span class="n">title</span><span class="p">,</span> <span class="n">wc_str</span> <span class="o">=</span> <span class="n">s</span><span class="o">.</span><span class="n">split</span><span class="p">(</span><span class="s">&#39;</span><span class="se">\t</span><span class="s">&#39;</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span>
+    <span class="k">except</span> <span class="ne">ValueError</span><span class="p">:</span> <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s">&#39;Tab separator for title and word:counts not found.&#39;</span><span class="p">)</span>
+
+    <span class="bp">self</span><span class="o">.</span><span class="n">title</span> <span class="o">=</span> <span class="n">title</span>
+    <span class="nb">super</span><span class="p">(</span><span class="n">Document</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="n">add_wc_string</span><span class="p">(</span><span class="n">wc_str</span><span class="p">)</span>
+  <span class="c">#end def</span>
+</div>
+<div class="viewcode-block" id="Document.bow"><a class="viewcode-back" href="../../bagofwords.html#ycutils.bagofwords.Document.bow">[docs]</a>  <span class="k">def</span> <span class="nf">bow</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
+    <span class="sd">&quot;&quot;&quot;:returns: the :class:`BOW` object of this document (i.e. without the title)&quot;&quot;&quot;</span>
+
+    <span class="k">return</span> <span class="nb">super</span><span class="p">(</span><span class="n">Document</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span>
+  <span class="c">#end def</span>
 <span class="c">#end class</span>
 </div></div>
 <div class="viewcode-block" id="cosine_similarity"><a class="viewcode-back" href="../../bagofwords.html#ycutils.bagofwords.cosine_similarity">[docs]</a><span class="k">def</span> <span class="nf">cosine_similarity</span><span class="p">(</span><span class="n">bow1</span><span class="p">,</span> <span class="n">bow2</span><span class="p">):</span>
-  <span class="sd">&quot;&quot;&quot;Returns the cosine similarity between 2 bag of words objects.&quot;&quot;&quot;</span>
+  <span class="sd">&quot;&quot;&quot;Calculates the cosine similarity of two bag of words vectors (or any :class:`Counter`-like object really).</span>
+
+<span class="sd">  :param bow1: the bag of words vector.</span>
+<span class="sd">  :param bow2: the bag of words vector.</span>
+<span class="sd">  :returns: the cosine similarity.&quot;&quot;&quot;</span>
+  <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">bow1</span><span class="p">)</span> <span class="o">==</span> <span class="mi">0</span> <span class="ow">or</span> <span class="nb">len</span><span class="p">(</span><span class="n">bow2</span><span class="p">)</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span> <span class="k">return</span> <span class="mf">0.0</span>
+
+  <span class="n">dot_prod</span> <span class="o">=</span> <span class="mf">0.0</span>
+  <span class="n">bow1_norm</span> <span class="o">=</span> <span class="mf">0.0</span>
+  <span class="n">bow2_norm</span> <span class="o">=</span> <span class="mf">0.0</span>
+
+  <span class="k">for</span> <span class="n">w</span><span class="p">,</span> <span class="n">c</span> <span class="ow">in</span> <span class="n">bow1</span><span class="o">.</span><span class="n">iteritems</span><span class="p">():</span> 
+    <span class="n">dot_prod</span> <span class="o">+=</span> <span class="n">c</span> <span class="o">*</span> <span class="n">bow2</span><span class="p">[</span><span class="n">w</span><span class="p">]</span>
+    <span class="n">bow1_norm</span> <span class="o">+=</span> <span class="n">c</span> <span class="o">*</span> <span class="n">c</span>
+
+  <span class="k">if</span> <span class="n">dot_prod</span> <span class="o">==</span> <span class="mf">0.0</span><span class="p">:</span> <span class="k">return</span> <span class="mf">0.0</span>
+
+  <span class="k">for</span> <span class="n">c</span> <span class="ow">in</span> <span class="n">bow2</span><span class="o">.</span><span class="n">itervalues</span><span class="p">():</span> <span class="n">bow2_norm</span> <span class="o">+=</span> <span class="n">c</span> <span class="o">*</span> <span class="n">c</span>
+
+  <span class="k">return</span> <span class="n">dot_prod</span> <span class="o">/</span> <span class="n">math</span><span class="o">.</span><span class="n">sqrt</span><span class="p">(</span><span class="n">bow1_norm</span> <span class="o">*</span> <span class="n">bow2_norm</span><span class="p">)</span>
+<span class="c">#end def</span>
+</div>
+<div class="viewcode-block" id="random_title"><a class="viewcode-back" href="../../bagofwords.html#ycutils.bagofwords.random_title">[docs]</a><span class="k">def</span> <span class="nf">random_title</span><span class="p">():</span>
+  <span class="sd">&quot;&quot;&quot;:returns: a randomly generated 16-character hexadecimal string.&quot;&quot;&quot;</span>
 
-  <span class="k">return</span> <span class="p">(</span><span class="n">bow1</span> <span class="o">*</span> <span class="n">bow2</span><span class="p">)</span> <span class="o">/</span> <span class="p">(</span><span class="n">bow1</span><span class="o">.</span><span class="n">l2_norm</span><span class="p">()</span> <span class="o">*</span> <span class="n">bow2</span><span class="o">.</span><span class="n">l2_norm</span><span class="p">())</span></div>
+  <span class="k">return</span> <span class="s">&#39;&#39;</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="n">random</span><span class="o">.</span><span class="n">choice</span><span class="p">(</span><span class="s">&#39;0123456789abcdef&#39;</span><span class="p">)</span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">xrange</span><span class="p">(</span><span class="mi">16</span><span class="p">))</span>
+<span class="c">#end def</span></div>
 </pre></div>
 
           </div>

docs/html/_modules/ycutils/bigvocab.html

   <h1>Source code for ycutils.bigvocab</h1><div class="highlight"><pre>
 <span class="kn">import</span> <span class="nn">ycutils.bagofwords</span><span class="o">,</span> <span class="nn">ycutils.tsvio</span>
 
-<div class="viewcode-block" id="Vocabulary"><a class="viewcode-back" href="../../bigvocab.html#ycutils.bigvocab.Vocabulary">[docs]</a><span class="k">class</span> <span class="nc">Vocabulary</span><span class="p">:</span>
-  <span class="sd">&quot;&quot;&quot;This class handles mappings between integer keys and the words they represent.</span>
+<div class="viewcode-block" id="VocabularyMap"><a class="viewcode-back" href="../../bigvocab.html#ycutils.bigvocab.VocabularyMap">[docs]</a><span class="k">class</span> <span class="nc">VocabularyMap</span><span class="p">:</span>
+  <span class="sd">&quot;&quot;&quot;This class handles mappings between integer keys and the tokens they represent.</span>
 
-<span class="sd">  :param corpus_vocabulary: populate the object with words from a :class:`ycutils.corpus.CorpusVocabulary`.&quot;&quot;&quot;</span>
+<span class="sd">  :param corpus_vocabulary: populate the object with tokens from a :class:`ycutils.corpus.CorpusVocabulary`.</span>
+<span class="sd">  :param create_token: sets the default behaviour when retrieving the index for tokens we have not seen before: either create a new index or return the unknown token (if one exists).</span>
+<span class="sd">  :param unknown_token: use this token as the unknown token. Set to `None` to disable the unknown token (i.e. raise an error on unknown tokens). Otherwise, an unknown token will be created.</span>
 
-  <span class="n">__index</span> <span class="o">=</span> <span class="p">{}</span>
-  <span class="n">__words</span> <span class="o">=</span> <span class="p">[]</span>
+<span class="sd">  .. seealso:: :meth:`get_index` for detailed behaviour when we encounter a token that is not found in the vocabulary.&quot;&quot;&quot;</span>
+
+  <span class="n">__index</span> <span class="o">=</span> <span class="bp">None</span>
+  <span class="n">__tokens</span> <span class="o">=</span> <span class="bp">None</span>
+
+  <span class="n">create_token</span> <span class="o">=</span> <span class="bp">False</span>
+  <span class="sd">&quot;&quot;&quot;Defines the default behaviour when we encounter a never seen before token.&quot;&quot;&quot;</span>
+
+  <span class="n">unknown_token</span> <span class="o">=</span> <span class="s">&#39;__UNK__&#39;</span>
+  <span class="sd">&quot;&quot;&quot;The type for an unknown token, which is ``__UNK__`` by default.&quot;&quot;&quot;</span>
+
+  <span class="k">def</span> <span class="nf">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">from_filename</span><span class="o">=</span><span class="bp">None</span><span class="p">,</span> <span class="n">corpus_vocabulary</span><span class="o">=</span><span class="bp">None</span><span class="p">,</span> <span class="n">create_token</span><span class="o">=</span><span class="bp">False</span><span class="p">,</span> <span class="n">unknown_token</span><span class="o">=</span><span class="s">&#39;__UNK__&#39;</span><span class="p">):</span>
+    <span class="bp">self</span><span class="o">.</span><span class="n">__index</span> <span class="o">=</span> <span class="p">{}</span>
+    <span class="bp">self</span><span class="o">.</span><span class="n">__tokens</span> <span class="o">=</span> <span class="p">[]</span>
 
-  <span class="k">def</span> <span class="nf">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">corpus_vocabulary</span><span class="o">=</span><span class="bp">None</span><span class="p">):</span>
     <span class="k">if</span> <span class="n">corpus_vocabulary</span><span class="p">:</span>
       <span class="k">for</span> <span class="n">w</span> <span class="ow">in</span> <span class="n">corpus_vocabulary</span><span class="p">:</span>
-        <span class="k">if</span> <span class="n">w</span> <span class="o">==</span> <span class="s">&#39;__UNK__&#39;</span><span class="p">:</span> <span class="k">continue</span>
-
         <span class="k">if</span> <span class="n">w</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">__index</span><span class="p">:</span> <span class="k">continue</span>
 
-        <span class="bp">self</span><span class="o">.</span><span class="n">__index</span><span class="p">[</span><span class="n">w</span><span class="p">]</span> <span class="o">=</span> <span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">__words</span><span class="p">)</span>
-        <span class="bp">self</span><span class="o">.</span><span class="n">__words</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">w</span><span class="p">)</span>
+        <span class="bp">self</span><span class="o">.</span><span class="n">__index</span><span class="p">[</span><span class="n">w</span><span class="p">]</span> <span class="o">=</span> <span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">__tokens</span><span class="p">)</span>
+        <span class="bp">self</span><span class="o">.</span><span class="n">__tokens</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">w</span><span class="p">)</span>
       <span class="c">#end for</span>
+
+    <span class="k">elif</span> <span class="n">from_filename</span><span class="p">:</span>
+      <span class="k">with</span> <span class="nb">open</span><span class="p">(</span><span class="n">from_filename</span><span class="p">,</span> <span class="s">&#39;r&#39;</span><span class="p">)</span> <span class="k">as</span> <span class="n">f</span><span class="p">:</span>
+        <span class="bp">self</span><span class="o">.</span><span class="n">from_file</span><span class="p">(</span><span class="n">f</span><span class="p">,</span> <span class="n">unknown_token</span><span class="o">=</span><span class="n">unknown_token</span><span class="p">)</span>
     <span class="c">#end if</span>
+
+    <span class="k">if</span> <span class="n">unknown_token</span> <span class="ow">and</span> <span class="n">unknown_token</span> <span class="ow">not</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">__index</span><span class="p">:</span>
+      <span class="bp">self</span><span class="o">.</span><span class="n">__index</span><span class="p">[</span><span class="n">unknown_token</span><span class="p">]</span> <span class="o">=</span> <span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">__tokens</span><span class="p">)</span>
+      <span class="bp">self</span><span class="o">.</span><span class="n">__tokens</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">unknown_token</span><span class="p">)</span>
+    <span class="c">#end if</span>
+
+    <span class="bp">self</span><span class="o">.</span><span class="n">create_token</span> <span class="o">=</span> <span class="n">create_token</span>
+    <span class="bp">self</span><span class="o">.</span><span class="n">unknown_token</span> <span class="o">=</span> <span class="n">unknown_token</span>
   <span class="c">#end def</span>
 
-<div class="viewcode-block" id="Vocabulary.add_bow"><a class="viewcode-back" href="../../bigvocab.html#ycutils.bigvocab.Vocabulary.add_bow">[docs]</a>  <span class="k">def</span> <span class="nf">add_bow</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">bow</span><span class="p">):</span>
-    <span class="sd">&quot;&quot;&quot;Adds words from a :class:`ycutils.bagofwords.BOW`.</span>
+<div class="viewcode-block" id="VocabularyMap.add_bow"><a class="viewcode-back" href="../../bigvocab.html#ycutils.bigvocab.VocabularyMap.add_bow">[docs]</a>  <span class="k">def</span> <span class="nf">add_bow</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">bow</span><span class="p">):</span>
+    <span class="sd">&quot;&quot;&quot;Adds words from a :class:`ycutils.bagofwords.BOW` (or any :class:`collections.Counter` for that matter).</span>
 
 <span class="sd">    :param bow: bag of words to add to vocabulary, creating new mappings for new words.&quot;&quot;&quot;</span>
-    <span class="bp">self</span><span class="o">.</span><span class="n">add_words</span><span class="p">(</span><span class="n">bow</span><span class="o">.</span><span class="n">iterkeys</span><span class="p">())</span>
+    <span class="bp">self</span><span class="o">.</span><span class="n">add_tokens</span><span class="p">(</span><span class="n">bow</span><span class="o">.</span><span class="n">iterkeys</span><span class="p">())</span>
+  <span class="c">#end def</span>
 </div>
-<div class="viewcode-block" id="Vocabulary.add_words"><a class="viewcode-back" href="../../bigvocab.html#ycutils.bigvocab.Vocabulary.add_words">[docs]</a>  <span class="k">def</span> <span class="nf">add_words</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">words</span><span class="p">):</span>
-    <span class="sd">&quot;&quot;&quot;Adds words from a iterable of string.</span>
+<div class="viewcode-block" id="VocabularyMap.add_tokens"><a class="viewcode-back" href="../../bigvocab.html#ycutils.bigvocab.VocabularyMap.add_tokens">[docs]</a>  <span class="k">def</span> <span class="nf">add_tokens</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">tokens</span><span class="p">):</span>
+    <span class="sd">&quot;&quot;&quot;Adds tokens from an iterable of strings.</span>
 
-<span class="sd">    :param words: iterable of words to add to vocabulary, creating new mappings for new words.&quot;&quot;&quot;</span>
+<span class="sd">    :param tokens: iterable of tokens to add to vocabulary, creating new mappings for new tokens.&quot;&quot;&quot;</span>
 
-    <span class="k">for</span> <span class="n">w</span> <span class="ow">in</span> <span class="n">words</span><span class="p">:</span>
+    <span class="k">for</span> <span class="n">w</span> <span class="ow">in</span> <span class="n">tokens</span><span class="p">:</span>
       <span class="k">if</span> <span class="n">w</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">__index</span><span class="p">:</span> <span class="k">continue</span>
 
-      <span class="bp">self</span><span class="o">.</span><span class="n">__index</span><span class="p">[</span><span class="n">w</span><span class="p">]</span> <span class="o">=</span> <span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">__words</span><span class="p">)</span>
-      <span class="bp">self</span><span class="o">.</span><span class="n">__words</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">w</span><span class="p">)</span>
+      <span class="bp">self</span><span class="o">.</span><span class="n">__index</span><span class="p">[</span><span class="n">w</span><span class="p">]</span> <span class="o">=</span> <span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">__tokens</span><span class="p">)</span>
+      <span class="bp">self</span><span class="o">.</span><span class="n">__tokens</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">w</span><span class="p">)</span>
     <span class="c">#end for</span>
   <span class="c">#end def</span>
 </div>
-<div class="viewcode-block" id="Vocabulary.get_index"><a class="viewcode-back" href="../../bigvocab.html#ycutils.bigvocab.Vocabulary.get_index">[docs]</a>  <span class="k">def</span> <span class="nf">get_index</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">word</span><span class="p">,</span> <span class="n">create</span><span class="o">=</span><span class="bp">False</span><span class="p">):</span>
-    <span class="sd">&quot;&quot;&quot;Finds the index of the word given.</span>
+<div class="viewcode-block" id="VocabularyMap.get_index"><a class="viewcode-back" href="../../bigvocab.html#ycutils.bigvocab.VocabularyMap.get_index">[docs]</a>  <span class="k">def</span> <span class="nf">get_index</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">token</span><span class="p">):</span>
+    <span class="sd">&quot;&quot;&quot;Finds the index of the token given.</span>
+
+<span class="sd">    :param token: the token to find the index for.</span>
+<span class="sd">    :returns: an index to the given token.</span>
+
+<span class="sd">    .. note:: On encountering a never seen before token, if :attr:`create_token` is set to `True`, a new index will be created and returned. Else if :attr:`unknown_token` is defined (not `None`), the index for :attr:`unknown_token` will be returned. If :attr:`unknown_token` is set to `None`, a :exc:`KeyError` will be raised.&quot;&quot;&quot;</span>
 
-<span class="sd">    :param word: the word to find the index for.</span>
-<span class="sd">    :param create: whether to add the word to vocabulary if we have not seen it before (or raise an error).</span>
-<span class="sd">    :returns: an index to the given word.&quot;&quot;&quot;</span>
+    <span class="k">if</span> <span class="n">token</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">__index</span><span class="p">:</span> <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">__index</span><span class="p">[</span><span class="n">token</span><span class="p">]</span>
+
+    <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">create_token</span><span class="p">:</span> <span class="c"># add the token to the vocabulary</span>
+      <span class="bp">self</span><span class="o">.</span><span class="n">add_tokens</span><span class="p">([</span><span class="n">token</span><span class="p">])</span>
+      <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">__index</span><span class="p">[</span><span class="n">token</span><span class="p">]</span>
+    <span class="c">#end if</span>
 
-    <span class="k">if</span> <span class="n">create</span> <span class="ow">and</span> <span class="n">word</span> <span class="ow">not</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">__index</span><span class="p">:</span> <span class="bp">self</span><span class="o">.</span><span class="n">add_words</span><span class="p">([</span><span class="n">word</span><span class="p">])</span>
+    <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">unknown_token</span><span class="p">:</span> <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">__index</span><span class="p">[</span><span class="bp">self</span><span class="o">.</span><span class="n">unknown_token</span><span class="p">]</span>
 
-    <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">__index</span><span class="p">[</span><span class="n">word</span><span class="p">]</span>
+    <span class="k">raise</span> <span class="ne">KeyError</span><span class="p">(</span><span class="s">u&#39;</span><span class="se">\&#39;</span><span class="s">{}</span><span class="se">\&#39;</span><span class="s"> not found in vocabulary.&#39;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">token</span><span class="p">))</span>
   <span class="c">#end def</span>
 </div>
-<div class="viewcode-block" id="Vocabulary.get_indexes"><a class="viewcode-back" href="../../bigvocab.html#ycutils.bigvocab.Vocabulary.get_indexes">[docs]</a>  <span class="k">def</span> <span class="nf">get_indexes</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">words</span><span class="p">,</span> <span class="n">create</span><span class="o">=</span><span class="bp">False</span><span class="p">):</span>
-    <span class="sd">&quot;&quot;&quot;Finds the index of words in a given list.</span>
+<div class="viewcode-block" id="VocabularyMap.get_indexes"><a class="viewcode-back" href="../../bigvocab.html#ycutils.bigvocab.VocabularyMap.get_indexes">[docs]</a>  <span class="k">def</span> <span class="nf">get_indexes</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">tokens</span><span class="p">):</span>
+    <span class="sd">&quot;&quot;&quot;Finds the index of tokens in a given list.</span>
 
-<span class="sd">    :param words: a list of words to find the index for.</span>
-<span class="sd">    :param create: whether to add words to vocabulary if we have not seen them before.</span>
-<span class="sd">    :returns: a list of indexes corresponding to the list of words.&quot;&quot;&quot;</span>
+<span class="sd">    :param tokens: a list of tokens to find the index for.</span>
+<span class="sd">    :returns: a list of indexes corresponding to the list of tokens.</span>
+
+<span class="sd">    .. seealso:: :meth:`get_index` for the behaviour when we encounter a token that is not found in the vocabulary.&quot;&quot;&quot;</span>
+
+    <span class="k">return</span> <span class="nb">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">w</span><span class="p">:</span> <span class="bp">self</span><span class="o">.</span><span class="n">get_index</span><span class="p">(</span><span class="n">w</span><span class="p">),</span> <span class="n">tokens</span><span class="p">)</span>
+  <span class="c">#end def</span>
+</div>
+<div class="viewcode-block" id="VocabularyMap.get_token"><a class="viewcode-back" href="../../bigvocab.html#ycutils.bigvocab.VocabularyMap.get_token">[docs]</a>  <span class="k">def</span> <span class="nf">get_token</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">index</span><span class="p">):</span>
+    <span class="sd">&quot;&quot;&quot;Finds the token at the index position.</span>
 
-    <span class="k">if</span> <span class="n">create</span><span class="p">:</span> <span class="k">return</span> <span class="nb">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">w</span><span class="p">:</span> <span class="bp">self</span><span class="o">.</span><span class="n">get_index</span><span class="p">(</span><span class="n">w</span><span class="p">,</span> <span class="n">create</span><span class="o">=</span><span class="bp">True</span><span class="p">),</span> <span class="n">words</span><span class="p">)</span>
+<span class="sd">    :param index: the index to retrieve the token.</span>
+<span class="sd">    :returns: token with the given index.&quot;&quot;&quot;</span>
 
-    <span class="k">return</span> <span class="nb">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">w</span><span class="p">:</span> <span class="bp">self</span><span class="o">.</span><span class="n">__index</span><span class="p">[</span><span class="n">w</span><span class="p">],</span> <span class="n">words</span><span class="p">)</span>
+    <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">__tokens</span><span class="p">[</span><span class="n">index</span><span class="p">]</span>
   <span class="c">#end def</span>
 </div>
-<div class="viewcode-block" id="Vocabulary.get_word"><a class="viewcode-back" href="../../bigvocab.html#ycutils.bigvocab.Vocabulary.get_word">[docs]</a>  <span class="k">def</span> <span class="nf">get_word</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">index</span><span class="p">):</span>
-    <span class="sd">&quot;&quot;&quot;Finds the word at the index position.</span>
+<div class="viewcode-block" id="VocabularyMap.get_tokens"><a class="viewcode-back" href="../../bigvocab.html#ycutils.bigvocab.VocabularyMap.get_tokens">[docs]</a>  <span class="k">def</span> <span class="nf">get_tokens</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">indexes</span><span class="p">):</span>
+    <span class="sd">&quot;&quot;&quot;Finds the tokens at the given indexes.</span>
 
-<span class="sd">    :param index: the index to retrieve the word.</span>
-<span class="sd">    :returns: word with the given index.&quot;&quot;&quot;</span>
+<span class="sd">    :param indexes: the indexes to retrieve the tokens.</span>
+<span class="sd">    :returns: tokens with the given indexes.&quot;&quot;&quot;</span>
 
-    <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">__words</span><span class="p">[</span><span class="n">index</span><span class="p">]</span>
+    <span class="k">return</span> <span class="nb">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">i</span><span class="p">:</span> <span class="bp">self</span><span class="o">.</span><span class="n">__tokens</span><span class="p">[</span><span class="n">i</span><span class="p">],</span> <span class="n">indexes</span><span class="p">)</span>
   <span class="c">#end def</span>
 </div>
-<div class="viewcode-block" id="Vocabulary.get_words"><a class="viewcode-back" href="../../bigvocab.html#ycutils.bigvocab.Vocabulary.get_words">[docs]</a>  <span class="k">def</span> <span class="nf">get_words</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">indexes</span><span class="p">):</span>
-    <span class="sd">&quot;&quot;&quot;Finds the words at the given indexes.</span>
+<div class="viewcode-block" id="VocabularyMap.keys_to_tokens"><a class="viewcode-back" href="../../bigvocab.html#ycutils.bigvocab.VocabularyMap.keys_to_tokens">[docs]</a>  <span class="k">def</span> <span class="nf">keys_to_tokens</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">d</span><span class="p">):</span>
+    <span class="sd">&quot;&quot;&quot;Takes a dictionary and transforms all its keys from indexes to tokens.</span>
 
-<span class="sd">    :param indexes: the indexes to retrieve the words.</span>
-<span class="sd">    :returns: words with the given indexes.&quot;&quot;&quot;</span>
+<span class="sd">    :param d: dictionary to do the transformation on.&quot;&quot;&quot;</span>
 
-    <span class="k">return</span> <span class="nb">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">i</span><span class="p">:</span> <span class="bp">self</span><span class="o">.</span><span class="n">__words</span><span class="p">[</span><span class="n">i</span><span class="p">],</span> <span class="n">indexes</span><span class="p">)</span>
+    <span class="k">return</span> <span class="nb">dict</span><span class="p">([(</span><span class="bp">self</span><span class="o">.</span><span class="n">get_token</span><span class="p">(</span><span class="n">k</span><span class="p">),</span> <span class="n">v</span><span class="p">)</span> <span class="k">for</span> <span class="n">k</span><span class="p">,</span> <span class="n">v</span> <span class="ow">in</span> <span class="n">d</span><span class="o">.</span><span class="n">iteritems</span><span class="p">()])</span>
   <span class="c">#end def</span>
 </div>
-<div class="viewcode-block" id="Vocabulary.to_file"><a class="viewcode-back" href="../../bigvocab.html#ycutils.bigvocab.Vocabulary.to_file">[docs]</a>  <span class="k">def</span> <span class="nf">to_file</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">f</span><span class="p">):</span>
-    <span class="sd">&quot;&quot;&quot;Writes out the vocabulary mapping (indexes and words) to file descriptor.</span>
+<div class="viewcode-block" id="VocabularyMap.keys_to_indexes"><a class="viewcode-back" href="../../bigvocab.html#ycutils.bigvocab.VocabularyMap.keys_to_indexes">[docs]</a>  <span class="k">def</span> <span class="nf">keys_to_indexes</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">d</span><span class="p">,</span> <span class="n">sum_unknown</span><span class="o">=</span><span class="bp">True</span><span class="p">):</span>
+    <span class="sd">&quot;&quot;&quot;Takes a dictionary and transforms all its keys from tokens to indexes.</span>
 
-<span class="sd">    File format is tab separated values with index and word columns, one word per line.</span>
+<span class="sd">    .. warning:: This assumes :attr:`unknown_token` is defined. If :attr:`sum_unknown` is set to False, all unknown tokens are still placed together under the index for :attr:`unknown_token`, but its corresponding value will not be well defined.</span>
+
+<span class="sd">    .. seealso:: :meth:`get_index` for the behaviour when we encounter a token that is not found in the vocabulary.</span>
+
+<span class="sd">    :param d: dictionary to do the transformation on.</span>
+<span class="sd">    :param sum_unknown: performs a sum over values that are mapped to unknown.&quot;&quot;&quot;</span>
+
+    <span class="n">kv</span> <span class="o">=</span> <span class="p">[(</span><span class="bp">self</span><span class="o">.</span><span class="n">get_index</span><span class="p">(</span><span class="n">k</span><span class="p">),</span> <span class="n">v</span><span class="p">)</span> <span class="k">for</span> <span class="n">k</span><span class="p">,</span> <span class="n">v</span> <span class="ow">in</span> <span class="n">d</span><span class="o">.</span><span class="n">iteritems</span><span class="p">()]</span>
+
+    <span class="n">new_d</span> <span class="o">=</span> <span class="nb">dict</span><span class="p">(</span><span class="n">kv</span><span class="p">)</span>
+    <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">unknown_token</span> <span class="ow">and</span> <span class="n">sum_unknown</span><span class="p">:</span>
+      <span class="n">unknown_i</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">__index</span><span class="p">[</span><span class="bp">self</span><span class="o">.</span><span class="n">unknown_token</span><span class="p">]</span>
+      <span class="n">new_d</span><span class="p">[</span><span class="n">unknown_i</span><span class="p">]</span> <span class="o">=</span> <span class="nb">sum</span><span class="p">([</span><span class="n">v</span> <span class="k">for</span> <span class="n">k</span><span class="p">,</span> <span class="n">v</span> <span class="ow">in</span> <span class="n">kv</span> <span class="k">if</span> <span class="n">k</span> <span class="o">==</span> <span class="n">unknown_i</span><span class="p">])</span>
+    <span class="c">#end if</span>
+
+    <span class="k">return</span> <span class="n">new_d</span>
+  <span class="c">#end def</span>
+</div>
+<div class="viewcode-block" id="VocabularyMap.to_file"><a class="viewcode-back" href="../../bigvocab.html#ycutils.bigvocab.VocabularyMap.to_file">[docs]</a>  <span class="k">def</span> <span class="nf">to_file</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">f</span><span class="p">):</span>
+    <span class="sd">&quot;&quot;&quot;Writes out the vocabulary mapping (indexes and tokens) to file descriptor.</span>
+
+<span class="sd">    File format is tab separated values with index and token columns, one token per line.</span>
 
 <span class="sd">    :param f: file descriptor to write to.&quot;&quot;&quot;</span>
 
     <span class="n">t</span> <span class="o">=</span> <span class="n">ycutils</span><span class="o">.</span><span class="n">tsvio</span><span class="o">.</span><span class="n">TSVFile</span><span class="p">()</span>
 
-    <span class="n">t</span><span class="o">.</span><span class="n">writeline</span><span class="p">(</span><span class="n">f</span><span class="p">,</span> <span class="s">&#39;index</span><span class="se">\t</span><span class="s">word&#39;</span><span class="p">,</span> <span class="n">comment</span><span class="o">=</span><span class="bp">True</span><span class="p">)</span>
-    <span class="k">for</span> <span class="n">i</span><span class="p">,</span> <span class="n">w</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">__words</span><span class="p">):</span> <span class="n">t</span><span class="o">.</span><span class="n">writeline</span><span class="p">(</span><span class="n">f</span><span class="p">,</span> <span class="p">[</span><span class="n">i</span><span class="p">,</span> <span class="n">w</span><span class="p">])</span>
+    <span class="n">t</span><span class="o">.</span><span class="n">writeline</span><span class="p">(</span><span class="n">f</span><span class="p">,</span> <span class="s">&#39;vocab_count={}&#39;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">size</span><span class="p">()),</span> <span class="n">comment</span><span class="o">=</span><span class="bp">True</span><span class="p">)</span>
+    <span class="n">t</span><span class="o">.</span><span class="n">writeline</span><span class="p">(</span><span class="n">f</span><span class="p">,</span> <span class="s">&#39;index</span><span class="se">\t</span><span class="s">token&#39;</span><span class="p">,</span> <span class="n">comment</span><span class="o">=</span><span class="bp">True</span><span class="p">)</span>
+    <span class="k">for</span> <span class="n">i</span><span class="p">,</span> <span class="n">w</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">__tokens</span><span class="p">):</span> <span class="n">t</span><span class="o">.</span><span class="n">writeline</span><span class="p">(</span><span class="n">f</span><span class="p">,</span> <span class="p">[</span><span class="n">i</span><span class="p">,</span> <span class="n">w</span><span class="p">])</span>
   <span class="c">#end def</span>
 </div>
-<div class="viewcode-block" id="Vocabulary.from_file"><a class="viewcode-back" href="../../bigvocab.html#ycutils.bigvocab.Vocabulary.from_file">[docs]</a>  <span class="k">def</span> <span class="nf">from_file</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">f</span><span class="p">):</span>
+<div class="viewcode-block" id="VocabularyMap.from_file"><a class="viewcode-back" href="../../bigvocab.html#ycutils.bigvocab.VocabularyMap.from_file">[docs]</a>  <span class="k">def</span> <span class="nf">from_file</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">f</span><span class="p">,</span> <span class="n">unknown_token</span><span class="o">=</span><span class="s">&#39;__UNK__&#39;</span><span class="p">):</span>
     <span class="sd">&quot;&quot;&quot;Reads from file descriptor a previously saved file using :meth:`to_file`.</span>
 
-<span class="sd">    :param f: file descriptor to read from.&quot;&quot;&quot;</span>
+<span class="sd">    :param f: file descriptor to read from.</span>
+<span class="sd">    :param unknown_token: unknown token that is used in this file.&quot;&quot;&quot;</span>
 
     <span class="bp">self</span><span class="o">.</span><span class="n">__index</span> <span class="o">=</span> <span class="p">{}</span>
-    <span class="bp">self</span><span class="o">.</span><span class="n">__words</span> <span class="o">=</span> <span class="p">[]</span>
+    <span class="bp">self</span><span class="o">.</span><span class="n">__tokens</span> <span class="o">=</span> <span class="p">[]</span>
 
     <span class="n">t</span> <span class="o">=</span> <span class="n">ycutils</span><span class="o">.</span><span class="n">tsvio</span><span class="o">.</span><span class="n">TSVFile</span><span class="p">()</span>
-    <span class="k">for</span> <span class="n">index</span><span class="p">,</span> <span class="n">word</span> <span class="ow">in</span> <span class="n">t</span><span class="o">.</span><span class="n">readlines</span><span class="p">(</span><span class="n">f</span><span class="p">):</span>
+    <span class="k">for</span> <span class="n">index</span><span class="p">,</span> <span class="n">token</span> <span class="ow">in</span> <span class="n">t</span><span class="o">.</span><span class="n">readlines</span><span class="p">(</span><span class="n">f</span><span class="p">):</span>
       <span class="n">index</span> <span class="o">=</span> <span class="nb">int</span><span class="p">(</span><span class="n">index</span><span class="p">)</span>
-      <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">__words</span><span class="p">)</span> <span class="o">==</span> <span class="n">index</span><span class="p">:</span>
-        <span class="bp">self</span><span class="o">.</span><span class="n">__words</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">word</span><span class="p">)</span>
-        <span class="bp">self</span><span class="o">.</span><span class="n">__index</span><span class="p">[</span><span class="n">word</span><span class="p">]</span> <span class="o">=</span> <span class="n">index</span>
+      <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">__tokens</span><span class="p">)</span> <span class="o">==</span> <span class="n">index</span><span class="p">:</span>
+        <span class="bp">self</span><span class="o">.</span><span class="n">__tokens</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">token</span><span class="p">)</span>
+        <span class="bp">self</span><span class="o">.</span><span class="n">__index</span><span class="p">[</span><span class="n">token</span><span class="p">]</span> <span class="o">=</span> <span class="n">index</span>
       <span class="k">else</span><span class="p">:</span>
         <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s">&#39;Vocabulary index must be consecutive. Missing index = {}&#39;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">index</span><span class="p">))</span>
     <span class="c">#end for</span>
   <span class="c">#end def</span>
+</div>
+<div class="viewcode-block" id="VocabularyMap.size"><a class="viewcode-back" href="../../bigvocab.html#ycutils.bigvocab.VocabularyMap.size">[docs]</a>  <span class="k">def</span> <span class="nf">size</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> 
+    <span class="sd">&quot;&quot;&quot;:returns: number of tokens in the vocabulary.&quot;&quot;&quot;</span>
+    <span class="k">return</span> <span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">__tokens</span><span class="p">)</span>
+    </div>
+  <span class="k">def</span> <span class="nf">__len__</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> 
+    <span class="sd">&quot;&quot;&quot;:returns: number of tokens in the vocabulary.&quot;&quot;&quot;</span>
+    <span class="k">return</span> <span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">__tokens</span><span class="p">)</span>
+<span class="c">#end class</span>
+</div>
+<div class="viewcode-block" id="BigBOW"><a class="viewcode-back" href="../../bigvocab.html#ycutils.bigvocab.BigBOW">[docs]</a><span class="k">class</span> <span class="nc">BigBOW</span><span class="p">(</span><span class="n">ycutils</span><span class="o">.</span><span class="n">bagofwords</span><span class="o">.</span><span class="n">BOW</span><span class="p">):</span>
+  <span class="sd">&quot;&quot;&quot;A derived :class:`ycutils.bagofwords.BOW` class which is able to transparently handle tokens and indexes mapping.</span>
+
+<span class="sd">  :param vocabulary: the vocabulary mapping file that we use.</span>
+<span class="sd">  :param tokens: iterable list of tokens to add to the bag of words.</span>
+<span class="sd">  :param wc_string: initialize bag of words with a word:count formatted string.</span>
+
+<span class="sd">  .. note:: If the vocabulary has :attr:`Vocabulary.create_token` set to ``True``, the vocabulary can be created on the fly by adding words to this :class:`BigBOW`.</span>
+
+<span class="sd">  .. note:: Standard dictionary access through :meth:`__getitem__()`, :meth:`__iter__()`, etc will use the indexes as keys. To access using tokens, use the wrapper functions :meth:`itertokens`, :meth:`tokens`, :meth:`set_token_count`, :meth:`get_token_count`, :meth:`inc_token_count` and :meth:`delete_token`.&quot;&quot;&quot;</span>
+
+  <span class="n">__vocabulary</span> <span class="o">=</span> <span class="bp">None</span>
+
+  <span class="k">def</span> <span class="nf">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">vocabulary</span><span class="p">,</span> <span class="n">tokens</span><span class="o">=</span><span class="bp">None</span><span class="p">,</span> <span class="n">wc_string</span><span class="o">=</span><span class="bp">None</span><span class="p">,</span> <span class="n">bow</span><span class="o">=</span><span class="bp">None</span><span class="p">):</span>
+    <span class="bp">self</span><span class="o">.</span><span class="n">__vocabulary</span> <span class="o">=</span> <span class="n">vocabulary</span>
+
+    <span class="k">if</span> <span class="n">tokens</span><span class="p">:</span> <span class="bp">self</span><span class="o">.</span><span class="n">add_tokens</span><span class="p">(</span><span class="n">tokens</span><span class="p">)</span>
+    <span class="k">if</span> <span class="n">wc_string</span><span class="p">:</span> <span class="bp">self</span><span class="o">.</span><span class="n">add_wc_string</span><span class="p">(</span><span class="n">wc_string</span><span class="p">)</span>
+    <span class="k">if</span> <span class="n">bow</span><span class="p">:</span> <span class="bp">self</span><span class="o">.</span><span class="n">add_bow</span><span class="p">(</span><span class="n">bow</span><span class="p">)</span>
+  <span class="c">#end def</span>
+
+<div class="viewcode-block" id="BigBOW.add_bow"><a class="viewcode-back" href="../../bigvocab.html#ycutils.bigvocab.BigBOW.add_bow">[docs]</a>  <span class="k">def</span> <span class="nf">add_bow</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">bow</span><span class="p">):</span>
+    <span class="sd">&quot;&quot;&quot;Adds a bag of word counts to our object, transparently converting the keys.&quot;&quot;&quot;</span>
+
+    <span class="bp">self</span><span class="o">.</span><span class="n">update</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">__vocabulary</span><span class="o">.</span><span class="n">keys_to_indexes</span><span class="p">(</span><span class="n">bow</span><span class="p">))</span>
+  <span class="c">#end def</span>
+</div>
+<div class="viewcode-block" id="BigBOW.add_tokens"><a class="viewcode-back" href="../../bigvocab.html#ycutils.bigvocab.BigBOW.add_tokens">[docs]</a>  <span class="k">def</span> <span class="nf">add_tokens</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">tokens</span><span class="p">):</span>
+    <span class="sd">&quot;&quot;&quot;Converts given list of tokens to indexes and adds them to the object.</span>
+
+<span class="sd">    :param tokens: the list of tokens to add to the :class:`BigBOW`.&quot;&quot;&quot;</span>
+
+    <span class="bp">self</span><span class="o">.</span><span class="n">update</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">__vocabulary</span><span class="o">.</span><span class="n">get_indexes</span><span class="p">(</span><span class="n">tokens</span><span class="p">))</span>
+  <span class="c">#end def</span>
+</div>
+<div class="viewcode-block" id="BigBOW.add_wc_string"><a class="viewcode-back" href="../../bigvocab.html#ycutils.bigvocab.BigBOW.add_wc_string">[docs]</a>  <span class="k">def</span> <span class="nf">add_wc_string</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">s</span><span class="p">):</span>
+    <span class="sd">&quot;&quot;&quot;Adds tokens and their counts from a ``word:count`` formatted string, transparently converting them to indexes.</span>
+
+<span class="sd">    :param s: a string in ``word:count`` format.&quot;&quot;&quot;</span>
+
+    <span class="k">for</span> <span class="n">wc_str</span> <span class="ow">in</span> <span class="n">s</span><span class="o">.</span><span class="n">split</span><span class="p">():</span>
+      <span class="n">w</span><span class="p">,</span> <span class="n">c</span> <span class="o">=</span> <span class="n">wc_str</span><span class="o">.</span><span class="n">split</span><span class="p">(</span><span class="s">&#39;:&#39;</span><span class="p">)</span>
+      <span class="bp">self</span><span class="p">[</span><span class="bp">self</span><span class="o">.</span><span class="n">__vocabulary</span><span class="o">.</span><span class="n">get_index</span><span class="p">(</span><span class="n">w</span><span class="p">)]</span> <span class="o">+=</span> <span class="nb">float</span><span class="p">(</span><span class="n">c</span><span class="p">)</span>
+    <span class="c">#end for</span>
+  <span class="c">#end def</span>
+</div>
+<div class="viewcode-block" id="BigBOW.to_wc_string"><a class="viewcode-back" href="../../bigvocab.html#ycutils.bigvocab.BigBOW.to_wc_string">[docs]</a>  <span class="k">def</span> <span class="nf">to_wc_string</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
+    <span class="sd">&quot;&quot;&quot;Format the :class:`BigBOW` object in a ``word:count`` formatted string which looks like ``word1:count1 word2:count2 ...``.</span>
+
+<span class="sd">    :returns: the formatted string.&quot;&quot;&quot;</span>
+
+    <span class="k">return</span> <span class="s">&#39; &#39;</span><span class="o">.</span><span class="n">join</span><span class="p">([</span><span class="s">u&#39;{}:{}&#39;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">__vocabulary</span><span class="o">.</span><span class="n">get_token</span><span class="p">(</span><span class="n">w</span><span class="p">),</span> <span class="n">c</span><span class="p">)</span> <span class="k">for</span> <span class="n">w</span><span class="p">,</span> <span class="n">c</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">iteritems</span><span class="p">()])</span>
+  <span class="c">#end def</span>
+</div>
+<div class="viewcode-block" id="BigBOW.itertokens"><a class="viewcode-back" href="../../bigvocab.html#ycutils.bigvocab.BigBOW.itertokens">[docs]</a>  <span class="k">def</span> <span class="nf">itertokens</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
+    <span class="sd">&quot;&quot;&quot;:returns: an iterator object for the tokens in this object (not indexes, which is the default :meth:`__iter__()` behaviour).&quot;&quot;&quot;</span>
+    <span class="k">return</span> <span class="n">BigBOW</span><span class="o">.</span><span class="n">TokenIter</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">__vocabulary</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">__iter__</span><span class="p">())</span>
+  <span class="c">#end def</span>
+</div>
+<div class="viewcode-block" id="BigBOW.tokens"><a class="viewcode-back" href="../../bigvocab.html#ycutils.bigvocab.BigBOW.tokens">[docs]</a>  <span class="k">def</span> <span class="nf">tokens</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
+    <span class="sd">&quot;&quot;&quot;:returns: a list of tokens in this object, instead of indexes, as in :meth:`keys()`.&quot;&quot;&quot;</span>
+    <span class="k">return</span> <span class="nb">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">i</span><span class="p">:</span> <span class="bp">self</span><span class="o">.</span><span class="n">__vocabulary</span><span class="o">.</span><span class="n">get_word</span><span class="p">(</span><span class="n">i</span><span class="p">),</span> <span class="bp">self</span><span class="o">.</span><span class="n">iterkeys</span><span class="p">())</span>
+  <span class="c">#end def</span>
+</div>
+<div class="viewcode-block" id="BigBOW.set_token_count"><a class="viewcode-back" href="../../bigvocab.html#ycutils.bigvocab.BigBOW.set_token_count">[docs]</a>  <span class="k">def</span> <span class="nf">set_token_count</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">token</span><span class="p">,</span> <span class="n">freq</span><span class="p">):</span>
+    <span class="sd">&quot;&quot;&quot;Sets the frequency of ``token`` in this object.&quot;&quot;&quot;</span>
+
+    <span class="bp">self</span><span class="p">[</span><span class="bp">self</span><span class="o">.</span><span class="n">__vocabulary</span><span class="o">.</span><span class="n">get_index</span><span class="p">(</span><span class="n">token</span><span class="p">)]</span> <span class="o">=</span> <span class="n">freq</span>
+  <span class="c">#end def</span>
+</div>
+<div class="viewcode-block" id="BigBOW.inc_token_count"><a class="viewcode-back" href="../../bigvocab.html#ycutils.bigvocab.BigBOW.inc_token_count">[docs]</a>  <span class="k">def</span> <span class="nf">inc_token_count</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">token</span><span class="p">,</span> <span class="n">inc</span><span class="p">):</span>
+    <span class="sd">&quot;&quot;&quot;Increment the frequency of ``token`` in this object.&quot;&quot;&quot;</span>
+
+    <span class="bp">self</span><span class="p">[</span><span class="bp">self</span><span class="o">.</span><span class="n">__vocabulary</span><span class="o">.</span><span class="n">get_index</span><span class="p">(</span><span class="n">token</span><span class="p">)]</span> <span class="o">+=</span> <span class="n">inc</span>
+  <span class="c">#end def</span>
+</div>
+<div class="viewcode-block" id="BigBOW.get_token_count"><a class="viewcode-back" href="../../bigvocab.html#ycutils.bigvocab.BigBOW.get_token_count">[docs]</a>  <span class="k">def</span> <span class="nf">get_token_count</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">token</span><span class="p">):</span>
+    <span class="sd">&quot;&quot;&quot;:returns: the frequency of ``token`` in this object.&quot;&quot;&quot;</span>
+    <span class="k">return</span> <span class="bp">self</span><span class="p">[</span><span class="bp">self</span><span class="o">.</span><span class="n">__vocabulary</span><span class="o">.</span><span class="n">get_index</span><span class="p">(</span><span class="n">token</span><span class="p">)]</span>
+  <span class="c">#end def</span>
+</div>
+<div class="viewcode-block" id="BigBOW.delete_token"><a class="viewcode-back" href="../../bigvocab.html#ycutils.bigvocab.BigBOW.delete_token">[docs]</a>  <span class="k">def</span> <span class="nf">delete_token</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">token</span><span class="p">):</span>
+    <span class="sd">&quot;&quot;&quot;Deletes ``token`` from this object.&quot;&quot;&quot;</span>
+    <span class="k">del</span> <span class="bp">self</span><span class="p">[</span><span class="bp">self</span><span class="o">.</span><span class="n">__vocabulary</span><span class="o">.</span><span class="n">get_index</span><span class="p">(</span><span class="n">token</span><span class="p">)]</span>
+  <span class="c">#end def</span>
+</div>
+<div class="viewcode-block" id="BigBOW.TokenIter"><a class="viewcode-back" href="../../bigvocab.html#ycutils.bigvocab.BigBOW.TokenIter">[docs]</a>  <span class="k">class</span> <span class="nc">TokenIter</span><span class="p">:</span>
+    <span class="sd">&quot;&quot;&quot;The :class:`Iterator` object returned by :meth:`itertokens`, which maps indexes to tokens.&quot;&quot;&quot;</span>
+
+    <span class="n">__vocabulary</span> <span class="o">=</span> <span class="bp">None</span>
+    <span class="n">__iter_obj</span> <span class="o">=</span> <span class="bp">None</span>
+
+    <span class="k">def</span> <span class="nf">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">vocabulary</span><span class="p">,</span> <span class="n">iter_obj</span><span class="p">):</span>
+      <span class="bp">self</span><span class="o">.</span><span class="n">__vocabulary</span> <span class="o">=</span> <span class="n">vocabulary</span>
+      <span class="bp">self</span><span class="o">.</span><span class="n">__iter_obj</span> <span class="o">=</span> <span class="n">iter_obj</span>
+    <span class="c">#end def</span>
+
+    <span class="k">def</span> <span class="nf">__iter__</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> <span class="k">return</span> <span class="bp">self</span>
+
+    <span class="k">def</span> <span class="nf">next</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
+      <span class="n">i</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">__iter_obj</span><span class="o">.</span><span class="n">next</span><span class="p">()</span>
+      <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">__vocabulary</span><span class="o">.</span><span class="n">get_token</span><span class="p">(</span><span class="n">i</span><span class="p">)</span>
+    <span class="c">#end def</span>
+  <span class="c">#end class</span>
 <span class="c">#end class</span></div></div>
 </pre></div>
 

docs/html/_modules/ycutils/bleu.html

+
+
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
+  "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+
+
+<html xmlns="http://www.w3.org/1999/xhtml">
+  <head>
+    <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
+    
+    <title>ycutils.bleu &mdash; yc-pyutils 0.1 documentation</title>
+    
+    <link rel="stylesheet" href="../../_static/sphinxdoc.css" type="text/css" />
+    <link rel="stylesheet" href="../../_static/pygments.css" type="text/css" />
+    
+    <script type="text/javascript">
+      var DOCUMENTATION_OPTIONS = {
+        URL_ROOT:    '../../',
+        VERSION:     '0.1',
+        COLLAPSE_INDEX: false,
+        FILE_SUFFIX: '.html',
+        HAS_SOURCE:  true
+      };
+    </script>
+    <script type="text/javascript" src="../../_static/jquery.js"></script>
+    <script type="text/javascript" src="../../_static/underscore.js"></script>
+    <script type="text/javascript" src="../../_static/doctools.js"></script>
+    <link rel="top" title="yc-pyutils 0.1 documentation" href="../../index.html" />
+    <link rel="up" title="Module code" href="../index.html" /> 
+  </head>
+  <body>
+    <div class="related">
+      <h3>Navigation</h3>
+      <ul>
+        <li class="right" style="margin-right: 10px">
+          <a href="../../genindex.html" title="General Index"
+             accesskey="I">index</a></li>
+        <li class="right" >
+          <a href="../../py-modindex.html" title="Python Module Index"
+             >modules</a> |</li>
+        <li><a href="../../index.html">yc-pyutils 0.1 documentation</a> &raquo;</li>
+          <li><a href="../index.html" accesskey="U">Module code</a> &raquo;</li> 
+      </ul>
+    </div>
+      <div class="sphinxsidebar">
+        <div class="sphinxsidebarwrapper">
+<div id="searchbox" style="display: none">
+  <h3>Quick search</h3>
+    <form class="search" action="../../search.html" method="get">
+      <input type="text" name="q" />
+      <input type="submit" value="Go" />
+      <input type="hidden" name="check_keywords" value="yes" />
+      <input type="hidden" name="area" value="default" />
+    </form>
+    <p class="searchtip" style="font-size: 90%">
+    Enter search terms or a module, class or function name.
+    </p>
+</div>
+<script type="text/javascript">$('#searchbox').show(0);</script>
+        </div>
+      </div>
+
+    <div class="document">
+      <div class="documentwrapper">
+        <div class="bodywrapper">
+          <div class="body">
+            
+  <h1>Source code for ycutils.bleu</h1><div class="highlight"><pre>
+<span class="kn">import</span> <span class="nn">collections</span>
+<span class="kn">import</span> <span class="nn">numpy</span> <span class="kn">as</span> <span class="nn">np</span>
+
+<div class="viewcode-block" id="count_ngrams"><a class="viewcode-back" href="../../bleu.html#ycutils.bleu.count_ngrams">[docs]</a><span class="k">def</span> <span class="nf">count_ngrams</span><span class="p">(</span><span class="n">tokens</span><span class="p">,</span> <span class="n">n</span><span class="p">,</span> <span class="n">all_smaller</span><span class="o">=</span><span class="bp">False</span><span class="p">):</span>
+  <span class="sd">&quot;&quot;&quot;Counts the frequency of n-grams in the given list of tokens.</span>
+
+<span class="sd">  :param tokens: list of tokens to compute ngrams for.</span>
+<span class="sd">  :param n: number of grams to count.</span>
+<span class="sd">  :param all_smaller: set to True to include all n-grams from n=1 to n.</span>
+<span class="sd">  &quot;&quot;&quot;</span>
+
+  <span class="n">counts</span> <span class="o">=</span> <span class="n">collections</span><span class="o">.</span><span class="n">Counter</span><span class="p">()</span>
+  <span class="k">for</span> <span class="n">k</span> <span class="ow">in</span> <span class="nb">xrange</span><span class="p">(</span><span class="mi">1</span> <span class="k">if</span> <span class="n">all_smaller</span> <span class="k">else</span> <span class="n">n</span><span class="p">,</span> <span class="n">n</span><span class="o">+</span><span class="mi">1</span><span class="p">):</span>
+    <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">xrange</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">tokens</span><span class="p">)</span><span class="o">-</span><span class="n">k</span><span class="o">+</span><span class="mi">1</span><span class="p">):</span>
+      <span class="n">counts</span><span class="p">[</span><span class="nb">tuple</span><span class="p">(</span><span class="n">tokens</span><span class="p">[</span><span class="n">i</span><span class="p">:</span><span class="n">i</span><span class="o">+</span><span class="n">k</span><span class="p">])]</span> <span class="o">+=</span> <span class="mi">1</span>
+
+  <span class="k">return</span> <span class="n">counts</span>
+<span class="c">#end def</span>
+</div>
+<div class="viewcode-block" id="score"><a class="viewcode-back" href="../../bleu.html#ycutils.bleu.score">[docs]</a><span class="k">def</span> <span class="nf">score</span><span class="p">(</span><span class="n">ref_ngrams</span><span class="p">,</span> <span class="n">ref_len</span><span class="p">,</span> <span class="n">pred_ngrams</span><span class="p">,</span> <span class="n">pred_len</span><span class="p">,</span> <span class="n">n</span><span class="p">):</span>
+  <span class="sd">&quot;&quot;&quot;Calculate the BLEU precision and recall from ngram counts.</span>
+
+<span class="sd">  :param ref_ngrams: reference sentence ngrams.</span>
+<span class="sd">  :param ref_len: reference sentence length.</span>
+<span class="sd">  :param pred_ngrams: predicted sentence ngrams.</span>
+<span class="sd">  :param pred_len: predicted sentence length.</span>
+<span class="sd">  :param n: the maximum ngram length (order) to consider.</span>
+<span class="sd">  &quot;&quot;&quot;</span>
+  
+  <span class="k">if</span> <span class="ow">not</span> <span class="n">ref_len</span> <span class="ow">or</span> <span class="ow">not</span> <span class="n">pred_len</span><span class="p">:</span> <span class="k">return</span> <span class="mf">0.0</span><span class="p">,</span> <span class="mf">0.0</span>
+  <span class="k">if</span> <span class="ow">not</span> <span class="nb">len</span><span class="p">(</span><span class="n">ref_ngrams</span><span class="p">)</span> <span class="ow">or</span> <span class="ow">not</span> <span class="nb">len</span><span class="p">(</span><span class="n">pred_ngrams</span><span class="p">):</span> <span class="k">return</span> <span class="mf">0.0</span><span class="p">,</span> <span class="mf">0.0</span>
+
+  <span class="n">ngram_score</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">zeros</span><span class="p">(</span><span class="n">n</span><span class="p">,</span> <span class="n">dtype</span><span class="o">=</span><span class="n">np</span><span class="o">.</span><span class="n">float32</span><span class="p">)</span> <span class="o">+</span> <span class="mf">0.1</span>
+
+  <span class="c"># compute the ngram intersections</span>
+  <span class="k">for</span> <span class="n">ngram</span><span class="p">,</span> <span class="n">c</span> <span class="ow">in</span> <span class="n">ref_ngrams</span><span class="o">.</span><span class="n">iteritems</span><span class="p">():</span>
+    <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">ngram</span><span class="p">)</span> <span class="o">&gt;</span> <span class="n">n</span><span class="p">:</span> <span class="k">continue</span>
+
+    <span class="n">k</span> <span class="o">=</span> <span class="nb">min</span><span class="p">(</span><span class="n">c</span><span class="p">,</span> <span class="n">pred_ngrams</span><span class="p">[</span><span class="n">ngram</span><span class="p">])</span>
+    <span class="n">ngram_score</span><span class="p">[</span><span class="nb">len</span><span class="p">(</span><span class="n">ngram</span><span class="p">)</span> <span class="o">-</span> <span class="mi">1</span><span class="p">]</span> <span class="o">+=</span> <span class="n">k</span>
+  <span class="c">#end for</span>
+
+  <span class="c"># compute the geometric mean of the ngrams precision/recall</span>
+  <span class="n">precision</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">mean</span><span class="p">(</span><span class="n">np</span><span class="o">.</span><span class="n">log</span><span class="p">(</span><span class="n">ngram_score</span> <span class="o">/</span> <span class="nb">len</span><span class="p">(</span><span class="n">pred_ngrams</span><span class="p">)))</span>
+  <span class="n">recall</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">mean</span><span class="p">(</span><span class="n">np</span><span class="o">.</span><span class="n">log</span><span class="p">(</span><span class="n">ngram_score</span> <span class="o">/</span> <span class="nb">len</span><span class="p">(</span><span class="n">ref_ngrams</span><span class="p">)))</span>
+
+  <span class="c"># apply the brevity penalty</span>
+  <span class="k">if</span> <span class="n">pred_len</span> <span class="o">&lt;=</span> <span class="n">ref_len</span><span class="p">:</span> <span class="n">precision</span> <span class="o">+=</span> <span class="mf">1.0</span> <span class="o">-</span> <span class="p">(</span><span class="nb">float</span><span class="p">(</span><span class="n">ref_len</span><span class="p">)</span> <span class="o">/</span> <span class="n">pred_len</span><span class="p">)</span>
+  <span class="k">if</span> <span class="n">ref_len</span> <span class="o">&lt;=</span> <span class="n">pred_len</span><span class="p">:</span> <span class="n">recall</span> <span class="o">+=</span> <span class="mf">1.0</span> <span class="o">-</span> <span class="p">(</span><span class="nb">float</span><span class="p">(</span><span class="n">pred_len</span><span class="p">)</span> <span class="o">/</span> <span class="n">ref_len</span><span class="p">)</span>
+
+  <span class="n">precision</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">exp</span><span class="p">(</span><span class="n">precision</span><span class="p">)</span>
+  <span class="n">recall</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">exp</span><span class="p">(</span><span class="n">recall</span><span class="p">)</span>
+
+  <span class="k">return</span> <span class="n">precision</span><span class="p">,</span> <span class="n">recall</span>
+<span class="c">#end def</span></div>
+</pre></div>
+
+          </div>
+        </div>
+      </div>
+      <div class="clearer"></div>
+    </div>
+    <div class="related">
+      <h3>Navigation</h3>
+      <ul>
+        <li class="right" style="margin-right: 10px">
+          <a href="../../genindex.html" title="General Index"
+             >index</a></li>
+        <li class="right" >
+          <a href="../../py-modindex.html" title="Python Module Index"
+             >modules</a> |</li>
+        <li><a href="../../index.html">yc-pyutils 0.1 documentation</a> &raquo;</li>
+          <li><a href="../index.html" >Module code</a> &raquo;</li> 
+      </ul>
+    </div>
+    <div class="footer">
+        &copy; Copyright 2012, yanchuan sim.
+      Created using <a href="http://sphinx.pocoo.org/">Sphinx</a> 1.1.3.
+    </div>
+  </body>
+</html>

docs/html/_modules/ycutils/corpus.html

           <div class="body">
             
   <h1>Source code for ycutils.corpus</h1><div class="highlight"><pre>
-<span class="kn">import</span> <span class="nn">collections</span><span class="o">,</span> <span class="nn">math</span><span class="o">,</span> <span class="nn">sys</span><span class="o">,</span> <span class="nn">random</span><span class="o">,</span> <span class="nn">cPickle</span><span class="o">,</span> <span class="nn">operator</span>
+<span class="kn">import</span> <span class="nn">collections</span><span class="o">,</span> <span class="nn">math</span><span class="o">,</span> <span class="nn">sys</span><span class="o">,</span> <span class="nn">random</span><span class="o">,</span> <span class="nn">cPickle</span><span class="o">,</span> <span class="nn">operator</span><span class="o">,</span> <span class="nn">warnings</span><span class="o">,</span> <span class="nn">codecs</span>
 <span class="kn">import</span> <span class="nn">ycutils.bagofwords</span><span class="o">,</span> <span class="nn">ycutils.tsvio</span>
 
 <div class="viewcode-block" id="Corpus"><a class="viewcode-back" href="../../corpus.html#ycutils.corpus.Corpus">[docs]</a><span class="k">class</span> <span class="nc">Corpus</span><span class="p">(</span><span class="nb">dict</span><span class="p">):</span>
-  <span class="sd">&quot;&quot;&quot;A dictionary collection of :class:`Document` objects.&quot;&quot;&quot;</span>
+  <span class="sd">&quot;&quot;&quot;A dictionary collection of :class:`ycutils.bagofwords.Document` objects.&quot;&quot;&quot;</span>
 
+  <span class="c">#: .. note:: A large setting will lead to a more uniform weightage over words.</span>
   <span class="c">#: .. seealso:: :meth:`inverse_document_frequency`</span>
   <span class="n">IDF_LAPLACE_SMOOTHING</span> <span class="o">=</span> <span class="o">.</span><span class="mo">001</span>
 
 </div>
 <div class="viewcode-block" id="Corpus.add_bow"><a class="viewcode-back" href="../../corpus.html#ycutils.corpus.Corpus.add_bow">[docs]</a>  <span class="k">def</span> <span class="nf">add_bow</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">bow</span><span class="p">,</span> <span class="n">title</span><span class="o">=</span><span class="bp">None</span><span class="p">):</span>
     <span class="sd">&quot;&quot;&quot;Adds a :class:`BOW` to the corpus.</span>
+
 <span class="sd">    :param title: title of document (defaults to a random hexadecimal string).</span>
 <span class="sd">    :param bow: bag of words object.&quot;&quot;&quot;</span>
 
-    <span class="k">if</span> <span class="ow">not</span> <span class="n">title</span><span class="p">:</span>
-      <span class="n">title</span> <span class="o">=</span> <span class="nb">hex</span><span class="p">(</span><span class="n">random</span><span class="o">.</span><span class="n">randint</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="n">sys</span><span class="o">.</span><span class="n">maxint</span><span class="p">))</span>
-      <span class="k">while</span> <span class="n">title</span> <span class="ow">in</span> <span class="bp">self</span><span class="p">:</span> <span class="n">title</span> <span class="o">=</span> <span class="nb">hex</span><span class="p">(</span><span class="n">random</span><span class="o">.</span><span class="n">randint</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="n">sys</span><span class="o">.</span><span class="n">maxint</span><span class="p">))</span> <span class="c"># generate a random title string</span>
-    <span class="c">#end if</span>
+    <span class="k">if</span> <span class="ow">not</span> <span class="n">title</span><span class="p">:</span> <span class="n">title</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">unique_title</span><span class="p">()</span>
 
     <span class="bp">self</span><span class="p">[</span><span class="n">title</span><span class="p">]</span> <span class="o">=</span> <span class="n">bow</span><span class="o">.</span><span class="n">bow</span><span class="p">()</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">bow</span><span class="p">,</span> <span class="n">ycutils</span><span class="o">.</span><span class="n">bagofwords</span><span class="o">.</span><span class="n">Document</span><span class="p">)</span> <span class="k">else</span> <span class="n">bow</span>
   <span class="c">#end def</span>
 </div>
+<div class="viewcode-block" id="Corpus.unique_title"><a class="viewcode-back" href="../../corpus.html#ycutils.corpus.Corpus.unique_title">[docs]</a>  <span class="k">def</span> <span class="nf">unique_title</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
+    <span class="sd">&quot;&quot;&quot;:returns: a title that is unique to this corpus.&quot;&quot;&quot;</span>
+    <span class="n">title</span> <span class="o">=</span> <span class="n">ycutils</span><span class="o">.</span><span class="n">bagofwords</span><span class="o">.</span><span class="n">random_title</span><span class="p">()</span>
+    <span class="k">while</span> <span class="n">title</span> <span class="ow">in</span> <span class="bp">self</span><span class="p">:</span>
+      <span class="n">title</span> <span class="o">=</span> <span class="n">ycutils</span><span class="o">.</span><span class="n">bagofwords</span><span class="o">.</span><span class="n">random_title</span><span class="p">()</span>
+
+    <span class="k">return</span> <span class="n">title</span>
+  <span class="c">#end def</span>
+</div>
 <div class="viewcode-block" id="Corpus.to_file"><a class="viewcode-back" href="../../corpus.html#ycutils.corpus.Corpus.to_file">[docs]</a>  <span class="k">def</span> <span class="nf">to_file</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">f</span><span class="p">,</span> <span class="n">sort</span><span class="o">=</span><span class="bp">True</span><span class="p">):</span>
     <span class="sd">&quot;&quot;&quot;Writes out corpus to a file descriptor in textual format.</span>
 
 <span class="sd">    :param f: file descriptor to write to.</span>
 <span class="sd">    :param sort: whether to write the documents sorted by their titles&quot;&quot;&quot;</span>
 
+    <span class="n">tsv_corpus</span> <span class="o">=</span> <span class="n">ycutils</span><span class="o">.</span><span class="n">tsvio</span><span class="o">.</span><span class="n">TSVFile</span><span class="p">()</span>
+    <span class="n">tsv_corpus</span><span class="o">.</span><span class="n">writeline</span><span class="p">(</span><span class="n">f</span><span class="p">,</span> <span class="s">&#39;IDF_LAPLACE_SMOOTHING={}&#39;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">IDF_LAPLACE_SMOOTHING</span><span class="p">),</span> <span class="n">comment</span><span class="o">=</span><span class="bp">True</span><span class="p">)</span>
+    <span class="n">tsv_corpus</span><span class="o">.</span><span class="n">writeline</span><span class="p">(</span><span class="n">f</span><span class="p">,</span> <span class="s">&#39;doc_count={}&#39;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="p">)),</span> <span class="n">comment</span><span class="o">=</span><span class="bp">True</span><span class="p">)</span>
+    <span class="n">tsv_corpus</span><span class="o">.</span><span class="n">writeline</span><span class="p">(</span><span class="n">f</span><span class="p">,</span> <span class="s">&#39;title</span><span class="se">\t</span><span class="s">wc_string&#39;</span><span class="p">,</span> <span class="n">comment</span><span class="o">=</span><span class="bp">True</span><span class="p">)</span>
+
     <span class="n">title_list</span> <span class="o">=</span> <span class="nb">sorted</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">iterkeys</span><span class="p">())</span> <span class="k">if</span> <span class="n">sort</span> <span class="k">else</span> <span class="bp">self</span><span class="o">.</span><span class="n">iterkeys</span><span class="p">()</span>
-    <span class="k">for</span> <span class="n">title</span> <span class="ow">in</span> <span class="n">title_list</span><span class="p">:</span> <span class="n">f</span><span class="o">.</span><span class="n">write</span><span class="p">(</span><span class="s">&#39;{}</span><span class="se">\t</span><span class="s">{}</span><span class="se">\n</span><span class="s">&#39;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">title</span><span class="p">,</span> <span class="bp">self</span><span class="p">[</span><span class="n">title</span><span class="p">]</span><span class="o">.</span><span class="n">to_wc_string</span><span class="p">()))</span>
+    <span class="k">for</span> <span class="n">title</span> <span class="ow">in</span> <span class="n">title_list</span><span class="p">:</span> <span class="n">tsv_corpus</span><span class="o">.</span><span class="n">writeline</span><span class="p">(</span><span class="n">f</span><span class="p">,</span> <span class="p">(</span><span class="n">title</span><span class="p">,</span> <span class="bp">self</span><span class="p">[</span><span class="n">title</span><span class="p">]</span><span class="o">.</span><span class="n">to_wc_string</span><span class="p">()))</span>
   <span class="c">#end def</span>
 </div>
 <div class="viewcode-block" id="Corpus.from_file"><a class="viewcode-back" href="../../corpus.html#ycutils.corpus.Corpus.from_file">[docs]</a>  <span class="k">def</span> <span class="nf">from_file</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">f</span><span class="p">):</span>
-    <span class="sd">&quot;&quot;&quot;Reads a :class:`Corpus` object from a file.</span>
+    <span class="sd">&quot;&quot;&quot;Reads a :class:`Corpus` object from a text file.</span>
 
 <span class="sd">    :param f: file to read from.&quot;&quot;&quot;</span>
 
     <span class="k">for</span> <span class="n">line</span> <span class="ow">in</span> <span class="n">f</span><span class="p">:</span>
       <span class="n">line</span> <span class="o">=</span> <span class="n">line</span><span class="o">.</span><span class="n">strip</span><span class="p">()</span>
       <span class="k">if</span> <span class="ow">not</span> <span class="n">line</span><span class="p">:</span> <span class="k">continue</span>
-      <span class="n">doc</span> <span class="o">=</span> <span class="n">ycutils</span><span class="o">.</span><span class="n">bagofwords</span><span class="o">.</span><span class="n">Document</span><span class="p">()</span>
-      <span class="n">doc</span><span class="o">.</span><span class="n">add_wc_string</span><span class="p">(</span><span class="n">line</span><span class="p">)</span>
-      <span class="bp">self</span><span class="o">.</span><span class="n">add_document</span><span class="p">(</span><span class="n">doc</span><span class="p">)</span>
+      <span class="bp">self</span><span class="o">.</span><span class="n">add_document</span><span class="p">(</span><span class="n">ycutils</span><span class="o">.</span><span class="n">bagofwords</span><span class="o">.</span><span class="n">Document</span><span class="p">(</span><span class="n">wc_string</span><span class="o">=</span><span class="n">line</span><span class="p">))</span>
     <span class="c">#end for</span>
   <span class="c">#end def</span>
 </div>
     <span class="k">return</span> <span class="n">V</span>
   <span class="c">#end def</span>
 </div>
-<div class="viewcode-block" id="Corpus.inverse_document_frequency"><a class="viewcode-back" href="../../corpus.html#ycutils.corpus.Corpus.inverse_document_frequency">[docs]</a>  <span class="k">def</span> <span class="nf">inverse_document_frequency</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
+<div class="viewcode-block" id="Corpus.inverse_document_frequency"><a class="viewcode-back" href="../../corpus.html#ycutils.corpus.Corpus.inverse_document_frequency">[docs]</a>  <span class="k">def</span> <span class="nf">inverse_document_frequency</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">df</span><span class="o">=</span><span class="bp">None</span><span class="p">):</span>
     <span class="sd">&quot;&quot;&quot;Builds a :class:`Counter` of vocabulary and their inverse document frequency.</span>
 
+<span class="sd">    :param df: document frequency in the form of a :class:`Counter`, returned by :meth:`document_frequency`.</span>
 <span class="sd">    :returns: vocabulary and their idf values.</span>
 <span class="sd">    :rtype: :class:`Counter`</span>
 
 <span class="sd">    .. note:: A small constant, :attr:`IDF_LAPLACE_SMOOTHING`, is added to each term&#39;s document frequency to avoid divide by zero errors.</span>
 <span class="sd">    &quot;&quot;&quot;</span>
 
-    <span class="n">idf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">document_frequency</span><span class="p">()</span>
+    <span class="n">idf</span> <span class="o">=</span> <span class="n">df</span> <span class="k">if</span> <span class="n">df</span> <span class="k">else</span> <span class="bp">self</span><span class="o">.</span><span class="n">document_frequency</span><span class="p">()</span>
     <span class="n">log_D</span> <span class="o">=</span> <span class="n">math</span><span class="o">.</span><span class="n">log</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">+</span> <span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">idf</span><span class="p">)</span> <span class="o">*</span> <span class="bp">self</span><span class="o">.</span><span class="n">IDF_LAPLACE_SMOOTHING</span><span class="p">))</span>
     <span class="k">for</span> <span class="n">w</span><span class="p">,</span> <span class="n">c</span> <span class="ow">in</span> <span class="n">idf</span><span class="o">.</span><span class="n">iteritems</span><span class="p">():</span> <span class="n">idf</span><span class="p">[</span><span class="n">w</span><span class="p">]</span> <span class="o">=</span> <span class="n">log_D</span> <span class="o">-</span> <span class="n">math</span><span class="o">.</span><span class="n">log</span><span class="p">(</span><span class="n">c</span> <span class="o">+</span> <span class="bp">self</span><span class="o">.</span><span class="n">IDF_LAPLACE_SMOOTHING</span><span class="p">)</span>
 
 <span class="c">#end class</span>
 </div></div>
 <div class="viewcode-block" id="CorpusVocabulary"><a class="viewcode-back" href="../../corpus.html#ycutils.corpus.CorpusVocabulary">[docs]</a><span class="k">class</span> <span class="nc">CorpusVocabulary</span><span class="p">:</span>
-  <span class="sd">&quot;&quot;&quot;A class that handles vocabulary information related to a corpus, like term frequencies, idf, etc.&quot;&quot;&quot;</span>
+  <span class="sd">&quot;&quot;&quot;A class that handles vocabulary information related to a corpus, like term frequencies, idf, etc.</span>
+
+<span class="sd">  :param corpus: :class:`Corpus` object to build :class:`CorpusVocabulary` from.</span>
+<span class="sd">  :param from_filename: path to text file where we can load vocabulary information from.</span>
+<span class="sd">  :param unknown_token: the type to use for an unknown token. If set to `None`, an error will be raised whenever an unknown token is encountered.&quot;&quot;&quot;</span>
+
+  <span class="n">__vocab</span> <span class="o">=</span> <span class="p">{}</span>
+
+  <span class="n">unknown_token</span> <span class="o">=</span> <span class="s">&#39;__UNK__&#39;</span>
+  <span class="sd">&quot;&quot;&quot;The type for an unknown token, which is ``__UNK__`` by default.&quot;&quot;&quot;</span>
+
+  <span class="k">def</span> <span class="nf">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">corpus</span><span class="o">=</span><span class="bp">None</span><span class="p">,</span> <span class="n">from_filename</span><span class="o">=</span><span class="bp">None</span><span class="p">,</span> <span class="n">unknown_token</span><span class="o">=</span><span class="s">&#39;__UNK__&#39;</span><span class="p">):</span>
+    <span class="bp">self</span><span class="o">.</span><span class="n">unknown_token</span> <span class="o">=</span> <span class="n">unknown_token</span>
+
+    <span class="k">if</span> <span class="n">corpus</span><span class="p">:</span>
+      <span class="bp">self</span><span class="o">.</span><span class="n">from_corpus</span><span class="p">(</span><span class="n">corpus</span><span class="p">)</span>
+
+    <span class="k">elif</span> <span class="n">from_filename</span><span class="p">:</span>
+      <span class="k">with</span> <span class="n">codecs</span><span class="o">.</span><span class="n">open</span><span class="p">(</span><span class="n">from_filename</span><span class="p">,</span> <span class="s">&#39;r&#39;</span><span class="p">,</span> <span class="s">&#39;utf-8&#39;</span><span class="p">)</span> <span class="k">as</span> <span class="n">f</span><span class="p">:</span>
+        <span class="bp">self</span><span class="o">.</span><span class="n">from_file</span><span class="p">(</span><span class="n">f</span><span class="p">)</span>
+
+    <span class="k">else</span><span class="p">:</span>
+      <span class="n">warnings</span><span class="o">.</span><span class="n">warn</span><span class="p">(</span><span class="s">&#39;Creating CorpusVocabulary from empty corpus!&#39;</span><span class="p">)</span>
+  <span class="c">#end def</span>
+
+<div class="viewcode-block" id="CorpusVocabulary.__getitem__"><a class="viewcode-back" href="../../corpus.html#ycutils.corpus.CorpusVocabulary.__getitem__">[docs]</a>  <span class="k">def</span> <span class="nf">__getitem__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">token</span><span class="p">):</span>
+    <span class="sd">&quot;&quot;&quot;Returns statistics about the given token in the corpus.</span>
+
+<span class="sd">    :param token: token to get statistics for.</span>
+<span class="sd">    :returns: a tuple `(frequency, document frequency, inverse document frequency)`.</span>
+
+<span class="sd">    .. note:: An error will be raised if the token is not found in the vocabulary.&quot;&quot;&quot;</span>
+
+    <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">__vocab</span><span class="p">[</span><span class="n">token</span><span class="p">]</span>
+  <span class="c">#end def</span>
+</div>
+  <span class="k">def</span> <span class="nf">__iter__</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">__vocab</span><span class="o">.</span><span class="n">__iter__</span><span class="p">()</span>
+
+  <span class="k">def</span> <span class="nf">__contains__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">item</span><span class="p">):</span> <span class="k">return</span> <span class="n">item</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">__vocab</span>
+  
+<div class="viewcode-block" id="CorpusVocabulary.__len__"><a class="viewcode-back" href="../../corpus.html#ycutils.corpus.CorpusVocabulary.__len__">[docs]</a>  <span class="k">def</span> <span class="nf">__len__</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> 
+    <span class="sd">&quot;&quot;&quot;Returns the number of terms in the vocabulary.&quot;&quot;&quot;</span>
+    <span class="k">return</span> <span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">__vocab</span><span class="p">)</span>
+</div>
+<div class="viewcode-block" id="CorpusVocabulary.find_token"><a class="viewcode-back" href="../../corpus.html#ycutils.corpus.CorpusVocabulary.find_token">[docs]</a>  <span class="k">def</span> <span class="nf">find_token</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">token</span><span class="p">):</span>
+    <span class="sd">&quot;&quot;&quot;Retrieves statistics for the given token, and default statistics if the token is not found.</span>
+
+<span class="sd">    :param token: token to get statistics for.</span>
+<span class="sd">    :returns: a tuple `(frequency, document frequency, inverse document frequency)`. If token is not found, statistics for :attr:`unknown_token` are returned.&quot;&quot;&quot;</span>
+
+    <span class="k">if</span> <span class="n">token</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">__vocab</span><span class="p">:</span> <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">__vocab</span><span class="p">[</span><span class="n">token</span><span class="p">]</span>
 
-  <span class="n">__vocab</span> <span class="o">=</span> <span class="bp">None</span>
+    <span class="k">if</span> <span class="ow">not</span> <span class="bp">self</span><span class="o">.</span><span class="n">unknown_token</span><span class="p">:</span> <span class="k">raise</span> <span class="ne">KeyError</span><span class="p">(</span><span class="s">&#39;</span><span class="se">\&#39;</span><span class="s">{}</span><span class="se">\&#39;</span><span class="s"> not found in corpus vocabulary.&#39;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">token</span><span class="p">))</span>
 
-<div class="viewcode-block" id="CorpusVocabulary.__init__"><a class="viewcode-back" href="../../corpus.html#ycutils.corpus.CorpusVocabulary.__init__">[docs]</a>  <span class="k">def</span> <span class="nf">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">corpus</span><span class="o">=</span><span class="bp">None</span><span class="p">):</span>
-    <span class="sd">&quot;&quot;&quot;See :meth:`from_corpus`.&quot;&quot;&quot;</span>
-    <span class="k">if</span> <span class="n">corpus</span><span class="p">:</span> <span class="bp">self</span><span class="o">.</span><span class="n">from_corpus</span><span class="p">(</span><span class="n">corpus</span><span class="p">)</span>
+    <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">__vocab</span><span class="p">[</span><span class="bp">self</span><span class="o">.</span><span class="n">unknown_token</span><span class="p">]</span>
   <span class="c">#end def</span>
 </div>
 <div class="viewcode-block" id="CorpusVocabulary.from_corpus"><a class="viewcode-back" href="../../corpus.html#ycutils.corpus.CorpusVocabulary.from_corpus">[docs]</a>  <span class="k">def</span> <span class="nf">from_corpus</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">corpus</span><span class="p">):</span>
 
     <span class="bp">self</span><span class="o">.</span><span class="n">__vocab</span> <span class="o">=</span> <span class="p">{}</span>
 
-    <span class="n">log_D</span> <span class="o">=</span> <span class="n">math</span><span class="o">.</span><span class="n">log</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">corpus</span><span class="p">)</span> <span class="o">+</span> <span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">V_df</span><span class="p">)</span> <span class="o">*</span> <span class="n">corpus</span><span class="o">.</span><span class="n">IDF_LAPLACE_SMOOTHING</span><span class="p">))</span>
+    <span class="n">log_D</span> <span class="o">=</span> <span class="n">math</span><span class="o">.</span><span class="n">log</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">corpus</span><span class="p">)</span> <span class="o">+</span> <span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">V_df</span><span class="p">)</span> <span class="o">*</span> <span class="n">corpus</span><span class="o">.</span><span class="n">IDF_LAPLACE_SMOOTHING</span><span class="p">)</span> <span class="o">+</span> <span class="n">corpus</span><span class="o">.</span><span class="n">IDF_LAPLACE_SMOOTHING</span><span class="p">)</span> <span class="c"># one more for the unknown token</span>
     <span class="k">for</span> <span class="n">w</span><span class="p">,</span> <span class="n">c</span> <span class="ow">in</span> <span class="n">V_tf</span><span class="o">.</span><span class="n">iteritems</span><span class="p">():</span>
       <span class="n">df</span> <span class="o">=</span> <span class="n">V_df</span><span class="p">[</span><span class="n">w</span><span class="p">]</span>
       <span class="bp">self</span><span class="o">.</span><span class="n">__vocab</span><span class="p">[</span><span class="n">w</span><span class="p">]</span> <span class="o">=</span> <span class="p">(</span><span class="n">c</span><span class="p">,</span> <span class="n">df</span><span class="p">,</span> <span class="n">log_D</span> <span class="o">-</span> <span class="n">math</span><span class="o">.</span><span class="n">log</span><span class="p">(</span><span class="n">df</span> <span class="o">+</span> <span class="n">corpus</span><span class="o">.</span><span class="n">IDF_LAPLACE_SMOOTHING</span><span class="p">))</span>
     <span class="c">#end for</span>
+
+    <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">unknown_token</span><span class="p">:</span> <span class="bp">self</span><span class="o">.</span><span class="n">__vocab</span><span class="o">.</span><span class="n">setdefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">unknown_token</span><span class="p">,</span> <span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="n">log_D</span> <span class="o">-</span> <span class="n">math</span><span class="o">.</span><span class="n">log</span><span class="p">(</span><span class="n">corpus</span><span class="o">.</span><span class="n">IDF_LAPLACE_SMOOTHING</span><span class="p">)))</span> <span class="c"># unknown token appears minimal times</span>
   <span class="c">#end def</span>
 </div>
-<div class="viewcode-block" id="CorpusVocabulary.to_file"><a class="viewcode-back" href="../../corpus.html#ycutils.corpus.CorpusVocabulary.to_file">[docs]</a>  <span class="k">def</span> <span class="nf">to_file</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">f</span><span class="p">,</span> <span class="n">sort_key</span><span class="o">=</span><span class="p">[</span><span class="s">&#39;idf&#39;</span><span class="p">,</span> <span class="s">&#39;freq&#39;</span><span class="p">]):</span>
+<div class="viewcode-block" id="CorpusVocabulary.to_file"><a class="viewcode-back" href="../../corpus.html#ycutils.corpus.CorpusVocabulary.to_file">[docs]</a>  <span class="k">def</span> <span class="nf">to_file</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">f</span><span class="p">,</span> <span class="n">sort_key</span><span class="o">=</span><span class="p">[</span><span class="s">&#39;idf+&#39;</span><span class="p">,</span> <span class="s">&#39;freq+&#39;</span><span class="p">],</span> <span class="n">save_unknown</span><span class="o">=</span><span class="bp">False</span><span class="p">):</span>
     <span class="sd">&quot;&quot;&quot;Writes out corpus vocabulary to a file descriptor in textual format.</span>
 
+<span class="sd">    File format is a tab separated values file (see :class:`tsvio.TSVFile`), and the columns are: tokens, frequency in corpus, no. of documents containing token and inverse document frequency.</span>
+
 <span class="sd">    :param f: file descriptor to write to.</span>
-<span class="sd">    :param sort_key: sort order to use for vocabulary. Possible options are `idf` and `freq`. A `+` at the back denotes largest first (descending order) and `-` for ascending order. Defaults to descending order.&quot;&quot;&quot;</span>
+<span class="sd">    :param sort_key: sort order to use for vocabulary. Possible options are `idf` and `freq`. A `+` at the back denotes largest first (descending order) and `-` for ascending order. Defaults to descending order.</span>
+<span class="sd">    :param save_unknown: saves the default statistics for :attr:`unknown_token` to file.&quot;&quot;&quot;</span>
 
     <span class="k">for</span> <span class="n">i</span><span class="p">,</span> <span class="n">key</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">sort_key</span><span class="p">):</span>
       <span class="k">if</span> <span class="n">key</span> <span class="o">==</span> <span class="s">&#39;idf&#39;</span><span class="p">:</span> <span class="n">sort_key</span><span class="p">[</span><span class="n">i</span><span class="p">]</span> <span class="o">=</span> <span class="s">&#39;idf+&#39;</span>
 
     <span class="n">sort_list</span> <span class="o">=</span> <span class="p">[]</span>
     <span class="k">for</span> <span class="p">(</span><span class="n">w</span><span class="p">,</span> <span class="p">(</span><span class="n">c</span><span class="p">,</span> <span class="n">df</span><span class="p">,</span> <span class="n">idf</span><span class="p">))</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">__vocab</span><span class="o">.</span><span class="n">iteritems</span><span class="p">():</span>
+      <span class="k">if</span> <span class="ow">not</span> <span class="n">save_unknown</span> <span class="ow">and</span> <span class="n">w</span> <span class="o">==</span> <span class="bp">self</span><span class="o">.</span><span class="n">unknown_token</span><span class="p">:</span> <span class="k">continue</span>
+
       <span class="n">row</span> <span class="o">=</span> <span class="p">[]</span>
       <span class="k">for</span> <span class="n">key</span> <span class="ow">in</span> <span class="n">sort_key</span><span class="p">:</span>
         <span class="k">if</span> <span class="n">key</span> <span class="o">==</span> <span class="s">&#39;idf+&#39;</span><span class="p">:</span> <span class="n">row</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="o">-</span><span class="n">idf</span><span class="p">)</span>
     <span class="c">#end for</span>
 
     <span class="n">tsv_vocab</span> <span class="o">=</span> <span class="n">ycutils</span><span class="o">.</span><span class="n">tsvio</span><span class="o">.</span><span class="n">TSVFile</span><span class="p">()</span>
-    <span class="n">tsv_vocab</span><span class="o">.</span><span class="n">writeline</span><span class="p">(</span><span class="n">f</span><span class="p">,</span> <span class="s">&#39;word</span><span class="se">\t</span><span class="s">freq</span><span class="se">\t</span><span class="s">df</span><span class="se">\t</span><span class="s">idf&#39;</span><span class="p">,</span> <span class="n">comment</span><span class="o">=</span><span class="bp">True</span><span class="p">)</span>
+
+    <span class="n">tsv_vocab</span><span class="o">.</span><span class="n">writeline</span><span class="p">(</span><span class="n">f</span><span class="p">,</span> <span class="s">&#39;vocab_size={}&#39;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">sort_list</span><span class="p">)),</span> <span class="n">comment</span><span class="o">=</span><span class="bp">True</span><span class="p">)</span>
     <span class="n">tsv_vocab</span><span class="o">.</span><span class="n">writeline</span><span class="p">(</span><span class="n">f</span><span class="p">,</span> <span class="s">&#39;sort_key={}&#39;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">sort_key</span><span class="p">),</span> <span class="n">comment</span><span class="o">=</span><span class="bp">True</span><span class="p">)</span>
+    <span class="n">tsv_vocab</span><span class="o">.</span><span class="n">writeline</span><span class="p">(</span><span class="n">f</span><span class="p">,</span> <span class="s">&#39;unknown_token={}&#39;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">unknown_token</span><span class="p">),</span> <span class="n">comment</span><span class="o">=</span><span class="bp">True</span><span class="p">)</span>
+    <span class="n">tsv_vocab</span><span class="o">.</span><span class="n">writeline</span><span class="p">(</span><span class="n">f</span><span class="p">,</span> <span class="s">&#39;token</span><span class="se">\t</span><span class="s">freq</span><span class="se">\t</span><span class="s">df</span><span class="se">\t</span><span class="s">idf&#39;</span><span class="p">,</span> <span class="n">comment</span><span class="o">=</span><span class="bp">True</span><span class="p">)</span>
+    <span class="c"># tsv_vocab.writeline(f, &#39;IDF_LAPLACE_SMOOTHING={}&#39;.format(corpus.IDF_LAPLACE_SMOOTHING), comment=True)</span>
+
     <span class="k">for</span> <span class="n">row</span> <span class="ow">in</span> <span class="nb">sorted</span><span class="p">(</span><span class="n">sort_list</span><span class="p">):</span>
       <span class="n">w</span> <span class="o">=</span> <span class="n">row</span><span class="p">[</span><span class="o">-</span><span class="mi">1</span><span class="p">]</span>
       <span class="p">(</span><span class="n">c</span><span class="p">,</span> <span class="n">df</span><span class="p">,</span> <span class="n">idf</span><span class="p">)</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">__vocab</span><span class="p">[</span><span class="n">w</span><span class="p">]</span>
 <div class="viewcode-block" id="CorpusVocabulary.from_file"><a class="viewcode-back" href="../../corpus.html#ycutils.corpus.CorpusVocabulary.from_file">[docs]</a>  <span class="k">def</span> <span class="nf">from_file</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">f</span><span class="p">):</span>
     <span class="sd">&quot;&quot;&quot;Reads a :class:`CorpusVocabulary` object from a file.</span>
 
+<span class="sd">    See :meth:`to_file` for information on file format.</span>
+
 <span class="sd">    :param f: file to read from.&quot;&quot;&quot;</span>
 
     <span class="bp">self</span><span class="o">.</span><span class="n">__vocab</span> <span class="o">=</span> <span class="p">{}</span>
     <span class="n">tsv_vocab</span> <span class="o">=</span> <span class="n">ycutils</span><span class="o">.</span><span class="n">tsvio</span><span class="o">.</span><span class="n">TSVFile</span><span class="p">()</span>
-    <span class="k">for</span> <span class="p">(</span><span class="n">w</span><span class="p">,</span> <span class="n">c</span><span class="p">,</span> <span class="n">df</span><span class="p">,</span> <span class="n">idf</span><span class="p">)</span> <span class="ow">in</span> <span class="n">tsv_vocab</span><span class="o">.</span><span class="n">readlines</span><span class="p">(</span><span class="n">f</span><span class="p">):</span>
-      <span class="bp">self</span><span class="o">.</span><span class="n">__vocab</span><span class="p">[</span><span class="n">w</span><span class="p">]</span> <span class="o">=</span> <span class="p">(</span><span class="nb">float</span><span class="p">(</span><span class="n">c</span><span class="p">),</span> <span class="nb">float</span><span class="p">(</span><span class="n">df</span><span class="p">),</span> <span class="nb">float</span><span class="p">(</span><span class="n">idf</span><span class="p">))</span>
+    <span class="k">for</span> <span class="p">(</span><span class="n">w</span><span class="p">,</span> <span class="n">c</span><span class="p">,</span> <span class="n">df</span><span class="p">,</span> <span class="n">idf</span><span class="p">)</span> <span class="ow">in</span> <span class="n">tsv_vocab</span><span class="o">.</span><span class="n">readlines</span><span class="p">(</span><span class="n">f</span><span class="p">):</span> <span class="bp">self</span><span class="o">.</span><span class="n">__vocab</span><span class="p">[</span><span class="n">w</span><span class="p">]</span> <span class="o">=</span> <span class="p">(</span><span class="nb">float</span><span class="p">(</span><span class="n">c</span><span class="p">),</span> <span class="nb">float</span><span class="p">(</span><span class="n">df</span><span class="p">),</span> <span class="nb">float</span><span class="p">(</span><span class="n">idf</span><span class="p">))</span>
+
+    <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">unknown_token</span><span class="p">:</span> <span class="bp">self</span><span class="o">.</span><span class="n">__vocab</span><span class="o">.</span><span class="n">setdefault</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">unknown_token</span><span class="p">,</span> <span class="p">(</span><span class="mf">0.0</span><span class="p">,</span> <span class="mf">0.0</span><span class="p">,</span> <span class="mf">0.0</span><span class="p">))</span>
+  <span class="c">#end def</span>
+</div>
+<div class="viewcode-block" id="CorpusVocabulary.filter"><a class="viewcode-back" href="../../corpus.html#ycutils.corpus.CorpusVocabulary.filter">[docs]</a>  <span class="k">def</span> <span class="nf">filter</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">minimums</span><span class="o">=</span><span class="p">(</span><span class="bp">None</span><span class="p">,</span> <span class="bp">None</span><span class="p">,</span> <span class="bp">None</span><span class="p">),</span> <span class="n">maximums</span><span class="o">=</span><span class="p">(</span><span class="bp">None</span><span class="p">,</span> <span class="bp">None</span><span class="p">,</span> <span class="bp">None</span><span class="p">)):</span>
+    <span class="sd">&quot;&quot;&quot;Remove words that are not in the range between ``minimums`` and ``maximums``.</span>
+
+<span class="sd">    :param minimums: a 3-tuple containing the minimum (word frequency, document frequency, IDF). Use ``None`` if no minimums are defined.</span>
+<span class="sd">    :param maximums: a 3-tuple containing the maximum (word frequency, document frequency, IDF). Use ``None`` if no maximums are defined.&quot;&quot;&quot;</span>
+
+    <span class="n">min_c</span><span class="p">,</span> <span class="n">min_df</span><span class="p">,</span> <span class="n">min_idf</span> <span class="o">=</span> <span class="n">minimums</span>
+    <span class="n">max_c</span><span class="p">,</span> <span class="n">max_df</span><span class="p">,</span> <span class="n">max_idf</span> <span class="o">=</span> <span class="n">maximums</span>
+
+    <span class="n">delete</span> <span class="o">=</span> <span class="nb">set</span><span class="p">()</span>
+    <span class="k">for</span> <span class="n">w</span><span class="p">,</span> <span class="p">(</span><span class="n">c</span><span class="p">,</span> <span class="n">df</span><span class="p">,</span> <span class="n">idf</span><span class="p">)</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">__vocab</span><span class="o">.</span><span class="n">iteritems</span><span class="p">():</span>
+      <span class="k">if</span> <span class="n">min_c</span> <span class="ow">and</span> <span class="n">c</span> <span class="o">&lt;</span> <span class="n">min_c</span><span class="p">:</span> <span class="n">delete</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">w</span><span class="p">)</span>
+      <span class="k">elif</span> <span class="n">min_df</span> <span class="ow">and</span> <span class="n">df</span> <span class="o">&lt;</span> <span class="n">min_df</span><span class="p">:</span> <span class="n">delete</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">w</span><span class="p">)</span>
+      <span class="k">elif</span> <span class="n">min_idf</span> <span class="ow">and</span> <span class="n">idf</span> <span class="o">&lt;</span> <span class="n">min_idf</span><span class="p">:</span> <span class="n">delete</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">w</span><span class="p">)</span>
+
+      <span class="k">elif</span> <span class="n">max_c</span> <span class="ow">and</span> <span class="n">c</span> <span class="o">&gt;</span> <span class="n">max_c</span><span class="p">:</span> <span class="n">delete</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">w</span><span class="p">)</span>
+      <span class="k">elif</span> <span class="n">max_df</span> <span class="ow">and</span> <span class="n">df</span> <span class="o">&gt;</span> <span class="n">max_df</span><span class="p">:</span> <span class="n">delete</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">w</span><span class="p">)</span>
+      <span class="k">elif</span> <span class="n">max_idf</span> <span class="ow">and</span> <span class="n">idf</span> <span class="o">&gt;</span> <span class="n">max_idf</span><span class="p">:</span> <span class="n">delete</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">w</span><span class="p">)</span>
     <span class="c">#end for</span>
+
+    <span class="k">for</span> <span class="n">w</span> <span class="ow">in</span> <span class="n">delete</span><span class="p">:</span> <span class="k">del</span> <span class="bp">self</span><span class="o">.</span><span class="n">__vocab</span><span class="p">[</span><span class="n">w</span><span class="p">]</span>
+  <span class="c">#end def</span>
+</div>
+<div class="viewcode-block" id="CorpusVocabulary.to_bow"><a class="viewcode-back" href="../../corpus.html#ycutils.corpus.CorpusVocabulary.to_bow">[docs]</a>  <span class="k">def</span> <span class="nf">to_bow</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">use_counts</span><span class="o">=</span><span class="s">&#39;freq&#39;</span><span class="p">):</span>
+    <span class="sd">&quot;&quot;&quot;Creates a :class:`bagofwords.BOW` object from this vocabulary.</span>
+
+<span class="sd">    .. note:: __UNK__ tokens will be ignored.</span>
+
+<span class="sd">    :param use_counts: type of frequency to use for bag of words counts, can be ``freq`` (default), ``df`` or ``idf``.</span>
+
+<span class="sd">    :returns: a :class:`bagofwords.BOW` object containing words and counts from this vocabulary.</span>
+<span class="sd">    &quot;&quot;&quot;</span>
+
+    <span class="n">bow</span> <span class="o">=</span> <span class="n">ycutils</span><span class="o">.</span><span class="n">bagofwords</span><span class="o">.</span><span class="n">BOW</span><span class="p">()</span>
+    <span class="k">for</span> <span class="n">w</span><span class="p">,</span> <span class="p">(</span><span class="n">c</span><span class="p">,</span> <span class="n">df</span><span class="p">,</span> <span class="n">idf</span><span class="p">)</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">__vocab</span><span class="o">.</span><span class="n">iteritems</span><span class="p">():</span>
+      <span class="k">if</span> <span class="n">w</span> <span class="o">==</span> <span class="bp">self</span><span class="o">.</span><span class="n">unknown_token</span> <span class="ow">or</span> <span class="n">c</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span> <span class="k">continue</span>
+      <span class="k">if</span> <span class="n">use_counts</span> <span class="o">==</span> <span class="s">&#39;idf&#39;</span><span class="p">:</span> <span class="n">bow</span><span class="p">[</span><span class="n">w</span><span class="p">]</span> <span class="o">=</span> <span class="n">idf</span>
+      <span class="k">elif</span> <span class="n">use_counts</span> <span class="o">==</span> <span class="s">&#39;df&#39;</span><span class="p">:</span> <span class="n">bow</span><span class="p">[</span><span class="n">w</span><span class="p">]</span> <span class="o">=</span> <span class="n">df</span>
+      <span class="k">else</span><span class="p">:</span> <span class="n">bow</span><span class="p">[</span><span class="n">w</span><span class="p">]</span> <span class="o">=</span> <span class="n">c</span>
+    <span class="c">#end for</span>
+
+    <span class="k">return</span> <span class="n">bow</span>
   <span class="c">#end def</span>
 <span class="c">#end class</span></div></div>
 </pre></div>

docs/html/_modules/ycutils/tfidf.html

 <span class="kn">import</span> <span class="nn">ycutils.corpus</span>
 
 <div class="viewcode-block" id="TFIDF"><a class="viewcode-back" href="../../tfidf.html#ycutils.tfidf.TFIDF">[docs]</a><span class="k">class</span> <span class="nc">TFIDF</span><span class="p">:</span>
-  <span class="sd">&quot;&quot;&quot;This class is a wrapper class for computing TF-IDF values of :class:`BOW` or :class:`Document`.</span>
+  <span class="sd">&quot;&quot;&quot;This class is a wrapper class for computing TF-IDF values of :class:`ycutils.bagofwords.BOW` or :class:`ycutils.bagofwords.Document`.</span>
 
-<span class="sd">  :param corpus: corpus to initialize with, we will calculate the tf-idf statistics we need by passing it through the :class:`CorpusVocabulary` class.</span>
-<span class="sd">  :param corpus_vocabulary: use this :class:`CorpusVocabulary` for out IDF values.</span>
+<span class="sd">  :param corpus: corpus to initialize with, we will calculate the tf-idf statistics we need by passing it through the :class:`ycutils.corpus.CorpusVocabulary` class.</span>
+<span class="sd">  :param corpus_vocabulary: use this :class:`ycutils.corpus.CorpusVocabulary` for IDF values.</span>
+<span class="sd">  </span>
+<span class="sd">  .. warning:: Changes to the underlying corpus_vocabulary will affect future tfidf calculations.</span>
 <span class="sd">  &quot;&quot;&quot;</span>
 
   <span class="n">corpus_vocabulary</span> <span class="o">=</span> <span class="bp">None</span>
 <div class="viewcode-block" id="TFIDF.transform"><a class="viewcode-back" href="../../tfidf.html#ycutils.tfidf.TFIDF.transform">[docs]</a>  <span class="k">def</span> <span class="nf">transform</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">bow</span><span class="p">):</span>
     <span class="sd">&quot;&quot;&quot;Performs TF-IDF transform on the given bag of words vector.</span>
 
-<span class="sd">    .. note:: The vector is modified in place.</span>
+<span class="sd">    :param bow: bag of words vector (a :class:`Counter` class).</span>
+
+<span class="sd">    .. note:: The vector is modified in place.&quot;&quot;&quot;</span>
 
-<span class="sd">    :param bow: bag of words vector (a :class:`Counter` class).&quot;&quot;&quot;</span>
     <span class="k">for</span> <span class="n">w</span> <span class="ow">in</span> <span class="n">bow</span><span class="o">.</span><span class="n">iterkeys</span><span class="p">():</span>
-      <span class="p">(</span><span class="n">freq</span><span class="p">,</span> <span class="n">df</span><span class="p">,</span> <span class="n">idf</span><span class="p">)</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">corpus_vocabulary</span><span class="o">.</span><span class="n">find_word</span><span class="p">(</span><span class="n">w</span><span class="p">)</span>
+      <span class="p">(</span><span class="n">freq</span><span class="p">,</span> <span class="n">df</span><span class="p">,</span> <span class="n">idf</span><span class="p">)</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">corpus_vocabulary</span><span class="o">.</span><span class="n">find_token</span><span class="p">(</span><span class="n">w</span><span class="p">)</span>
       <span class="n">bow</span><span class="p">[</span><span class="n">w</span><span class="p">]</span> <span class="o">*=</span> <span class="n">idf</span>
     <span class="c">#end for</span>
 
 <div class="viewcode-block" id="TFIDF.untransform"><a class="viewcode-back" href="../../tfidf.html#ycutils.tfidf.TFIDF.untransform">[docs]</a>  <span class="k">def</span> <span class="nf">untransform</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">bow</span><span class="p">):</span>
     <span class="sd">&quot;&quot;&quot;Reverses the TF-IDF transform on the given bag of words vector.</span>
 
-<span class="sd">    .. note:: The vector is modified in place.</span>
+<span class="sd">    :param bow: bag of words vector (a :class:`Counter` class).</span>
+
+<span class="sd">    .. note:: The vector is modified in place.&quot;&quot;&quot;</span>
 
-<span class="sd">    :param bow: bag of words vector (a :class:`Counter` class).&quot;&quot;&quot;</span>
     <span class="k">for</span> <span class="n">w</span> <span class="ow">in</span> <span class="n">bow</span><span class="o">.</span><span class="n">iterkeys</span><span class="p">():</span>
-      <span class="p">(</span><span class="n">freq</span><span class="p">,</span> <span class="n">df</span><span class="p">,</span> <span class="n">idf</span><span class="p">)</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">corpus_vocabulary</span><span class="o">.</span><span class="n">find_word</span><span class="p">(</span><span class="n">w</span><span class="p">)</span>
+      <span class="p">(</span><span class="n">freq</span><span class="p">,</span> <span class="n">df</span><span class="p">,</span> <span class="n">idf</span><span class="p">)</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">corpus_vocabulary</span><span class="o">.</span><span class="n">find_token</span><span class="p">(</span><span class="n">w</span><span class="p">)</span>
       <span class="n">bow</span><span class="p">[</span><span class="n">w</span><span class="p">]</span> <span class="o">/=</span> <span class="n">idf</span>
     <span class="c">#end for</span>
 

docs/html/_modules/ycutils/tokenize.html

           <div class="body">
             
   <h1>Source code for ycutils.tokenize</h1><div class="highlight"><pre>
-<span class="kn">import</span> <span class="nn">os</span><span class="o">,</span> <span class="nn">string</span><span class="o">,</span> <span class="nn">collections</span><span class="o">,</span> <span class="nn">re</span><span class="o">,</span> <span class="nn">unicodedata</span>
-<span class="sd">&quot;&quot;&quot;</span>
-<span class="sd">This module contains function to split a piece of text into individual sentences (using Splitta &lt;code.google.com/p/splitta/&gt;) and into individual words (using many regexes).</span>
+<span class="kn">import</span> <span class="nn">os</span><span class="o">,</span> <span class="nn">string</span><span class="o">,</span> <span class="nn">collections</span><span class="o">,</span> <span class="nn">unicodedata</span><span class="o">,</span> <span class="nn">itertools</span>
+<span class="kn">import</span> <span class="nn">regex</span> <span class="kn">as</span> <span class="nn">re</span> <span class="c"># required</span>
 
-<span class="sd">Example code::</span>
-<span class="sd">.. highlight:: python</span>
-<span class="sd">   :linenothreshold: 5</span>
+<span class="n">__MONTHS__</span> <span class="o">=</span> <span class="nb">set</span><span class="p">(</span><span class="nb">map</span><span class="p">(</span><span class="nb">unicode</span><span class="p">,</span> <span class="p">[</span><span class="s">&#39;feb&#39;</span><span class="p">,</span> <span class="s">&#39;oct&#39;</span><span class="p">,</span> <span class="s">&#39;mar&#39;</span><span class="p">,</span> <span class="s">&#39;aug&#39;</span><span class="p">,</span> <span class="s">&#39;sep&#39;</span><span class="p">,</span> <span class="s">&#39;jan&#39;</span><span class="p">,</span> <span class="s">&#39;jun&#39;</span><span class="p">,</span> <span class="s">&#39;apr&#39;</span><span class="p">,</span> <span class="s">&#39;dec&#39;</span><span class="p">,</span> <span class="s">&#39;jul&#39;</span><span class="p">,</span> <span class="s">&#39;nov&#39;</span><span class="p">,</span> <span class="s">&#39;january&#39;</span><span class="p">,</span> <span class="s">&#39;february&#39;</span><span class="p">,</span> <span class="s">&#39;march&#39;</span><span class="p">,</span> <span class="s">&#39;april&#39;</span><span class="p">,</span> <span class="s">&#39;may&#39;</span><span class="p">,</span> <span class="s">&#39;june&#39;</span><span class="p">,</span> <span class="s">&#39;july&#39;</span><span class="p">,</span> <span class="s">&#39;august&#39;</span><span class="p">,</span> <span class="s">&#39;september&#39;</span><span class="p">,</span> <span class="s">&#39;october&#39;</span><span class="p">,</span> <span class="s">&#39;november&#39;</span><span class="p">,</span> <span class="s">&#39;december&#39;</span><span class="p">]))</span>
 
-<span class="sd">   import ycutils.tokenize</span>
-<span class="sd">  </span>
-<span class="sd">  text = &#39;At 2PM today, In fairness, the national networks did show 74,730st other images, especially shots of Lake Pontchartrain spilling out onto Lakeshore Drive or traffic signs twisting in the wind, but there was no information about when the images were recorded. Is this what the wind is like now? Three hours ago? Or as far back as yesterday? I can watch the same sports highlights on ESPN over and over again and never be bored by the repetition. The difference here, though, is that I know that Lebron&#39;s blocked shot, Drew&#39;s pinpoint touchdown pass, Tiger&#39;s escape from the bunker isn&#39;t still happening. But watching the Weather Channel hour after hour - and what else does one concerned about the city do but that? - one has a hard time separating the past from the present, distinguishing the weather that was from the weather that is. You&#39;d also have a hard time separating flooding that&#39;s perilous from the flooding that&#39;s inconvenient. What do these reporters mean when they say it&#39;s flooding? Is it water up to the headlights or water into raised houses? Water that people cross with rubber boots? Or water requiring a pirogue? Is it an amount we see every few years or an amount we see every Wednesday? So long as it wasn&#39;t an amount caused by busted levees I knew the city would survive. Jarvis DeBerry can be reached at jdeberry@timespicayune.com or (504) 826.3355. Follow him at http://connect.nola.com/user/jdeberry/index.html and at twitter.com/jerry.&#39;</span>
-<span class="sd">  sents = ycutils.tokenize.sentences(text)</span>
-<span class="sd">  tokens = ycutils.tokenize.words_in_sentences(sents)</span>
-
-<span class="sd">&quot;&quot;&quot;</span>
-
-<span class="n">__MONTHS__</span> <span class="o">=</span> <span class="nb">set</span><span class="p">([</span><span class="s">&#39;feb&#39;</span><span class="p">,</span> <span class="s">&#39;oct&#39;</span><span class="p">,</span> <span class="s">&#39;mar&#39;</span><span class="p">,</span> <span class="s">&#39;aug&#39;</span><span class="p">,</span> <span class="s">&#39;sep&#39;</span><span class="p">,</span> <span class="s">&#39;jan&#39;</span><span class="p">,</span> <span class="s">&#39;jun&#39;</span><span class="p">,</span> <span class="s">&#39;apr&#39;</span><span class="p">,</span> <span class="s">&#39;dec&#39;</span><span class="p">,</span> <span class="s">&#39;jul&#39;</span><span class="p">,</span> <span class="s">&#39;nov&#39;</span><span class="p">,</span> <span class="s">&#39;january&#39;</span><span class="p">,</span> <span class="s">&#39;february&#39;</span><span class="p">,</span> <span class="s">&#39;march&#39;</span><span class="p">,</span> <span class="s">&#39;april&#39;</span><span class="p">,</span> <span class="s">&#39;may&#39;</span><span class="p">,</span> <span class="s">&#39;june&#39;</span><span class="p">,</span> <span class="s">&#39;july&#39;</span><span class="p">,</span> <span class="s">&#39;august&#39;</span><span class="p">,</span> <span class="s">&#39;september&#39;</span><span class="p">,</span> <span class="s">&#39;october&#39;</span><span class="p">,</span> <span class="s">&#39;november&#39;</span><span class="p">,</span> <span class="s">&#39;december&#39;</span><span class="p">])</span>
-
-<span class="n">__STOPWORDS__</span> <span class="o">=</span> <span class="nb">set</span><span class="p">([</span><span class="s">&#39;a</span><span class="se">\&#39;</span><span class="s">s&#39;</span><span class="p">,</span> <span class="s">&#39;able&#39;</span><span class="p">,</span> <span class="s">&#39;about&#39;</span><span class="p">,</span> <span class="s">&#39;above&#39;</span><span class="p">,</span> <span class="s">&#39;according&#39;</span><span class="p">,</span> <span class="s">&#39;accordingly&#39;</span><span class="p">,</span> <span class="s">&#39;across&#39;</span><span class="p">,</span> <span class="s">&#39;actually&#39;</span><span class="p">,</span> <span class="s">&#39;after&#39;</span><span class="p">,</span> <span class="s">&#39;afterwards&#39;</span><span class="p">,</span> <span class="s">&#39;again&#39;</span><span class="p">,</span> <span class="s">&#39;against&#39;</span><span class="p">,</span> <span class="s">&#39;ain</span><span class="se">\&#39;</span><span class="s">t&#39;</span><span class="p">,</span> <span class="s">&#39;all&#39;</span><span class="p">,</span> <span class="s">&#39;allow&#39;</span><span class="p">,</span> <span class="s">&#39;allows&#39;</span><span class="p">,</span> <span class="s">&#39;almost&#39;</span><span class="p">,</span> <span class="s">&#39;alone&#39;</span><span class="p">,</span> <span class="s">&#39;along&#39;</span><span class="p">,</span> <span class="s">&#39;already&#39;</span><span class="p">,</span> <span class="s">&#39;also&#39;</span><span class="p">,</span> <span class="s">&#39;although&#39;</span><span class="p">,</span> <span class="s">&#39;always&#39;</span><span class="p">,</span> <span class="s">&#39;am&#39;</span><span class="p">,</span> <span class="s">&#39;among&#39;</span><span class="p">,</span> <span class="s">&#39;amongst&#39;</span><span class="p">,</span> <span class="s">&#39;an&#39;</span><span class="p">,</span> <span class="s">&#39;and&#39;</span><span class="p">,</span> 
<span class="s">&#39;another&#39;</span><span class="p">,</span> <span class="s">&#39;any&#39;</span><span class="p">,</span> <span class="s">&#39;anybody&#39;</span><span class="p">,</span> <span class="s">&#39;anyhow&#39;</span><span class="p">,</span> <span class="s">&#39;anyone&#39;</span><span class="p">,</span> <span class="s">&#39;anything&#39;</span><span class="p">,</span> <span class="s">&#39;anyway&#39;</span><span class="p">,</span> <span class="s">&#39;anyways&#39;</span><span class="p">,</span> <span class="s">&#39;anywhere&#39;</span><span class="p">,</span> <span class="s">&#39;apart&#39;</span><span class="p">,</span> <span class="s">&#39;appear&#39;</span><span class="p">,</span> <span class="s">&#39;appreciate&#39;</span><span class="p">,</span> <span class="s">&#39;appropriate&#39;</span><span class="p">,</span> <span class="s">&#39;are&#39;</span><span class="p">,</span> <span class="s">&#39;aren</span><span class="se">\&#39;</span><span class="s">t&#39;</span><span class="p">,</span> <span class="s">&#39;around&#39;</span><span class="p">,</span> <span class="s">&#39;as&#39;</span><span class="p">,</span> <span class="s">&#39;aside&#39;</span><span class="p">,</span> <span class="s">&#39;ask&#39;</span><span class="p">,</span> <span class="s">&#39;asking&#39;</span><span class="p">,</span> <span class="s">&#39;associated&#39;</span><span class="p">,</span> <span class="s">&#39;at&#39;</span><span class="p">,</span> <span class="s">&#39;available&#39;</span><span class="p">,</span> <span class="s">&#39;away&#39;</span><span class="p">,</span> <span class="s">&#39;awfully&#39;</span><span class="p">,</span> <span class="s">&#39;be&#39;</span><span class="p">,</span> <span class="s">&#39;became&#39;</span><span class="p">,</span> <span class="s">&#39;because&#39;</span><span class="p">,</span> <span class="s">&#39;become&#39;</span><span class="p">,</span> <span class="s">&#39;becomes&#39;</span><span class="p">,</span> <span 
class="s">&#39;becoming&#39;</span><span class="p">,</span> <span class="s">&#39;been&#39;</span><span class="p">,</span> <span class="s">&#39;before&#39;</span><span class="p">,</span> <span class="s">&#39;beforehand&#39;</span><span class="p">,</span> <span class="s">&#39;behind&#39;</span><span class="p">,</span> <span class="s">&#39;being&#39;</span><span class="p">,</span> <span class="s">&#39;believe&#39;</span><span class="p">,</span> <span class="s">&#39;below&#39;</span><span class="p">,</span> <span class="s">&#39;beside&#39;</span><span class="p">,</span> <span class="s">&#39;besides&#39;</span><span class="p">,</span> <span class="s">&#39;best&#39;</span><span class="p">,</span> <span class="s">&#39;better&#39;</span><span class="p">,</span> <span class="s">&#39;between&#39;</span><span class="p">,</span> <span class="s">&#39;beyond&#39;</span><span class="p">,</span> <span class="s">&#39;both&#39;</span><span class="p">,</span> <span class="s">&#39;brief&#39;</span><span class="p">,</span> <span class="s">&#39;but&#39;</span><span class="p">,</span> <span class="s">&#39;by&#39;</span><span class="p">,</span> <span class="s">&#39;c</span><span class="se">\&#39;</span><span class="s">mon&#39;</span><span class="p">,</span> <span class="s">&#39;c</span><span class="se">\&#39;</span><span class="s">s&#39;</span><span class="p">,</span> <span class="s">&#39;came&#39;</span><span class="p">,</span> <span class="s">&#39;can&#39;</span><span class="p">,</span> <span class="s">&#39;can</span><span class="se">\&#39;</span><span class="s">t&#39;</span><span class="p">,</span> <span class="s">&#39;cannot&#39;</span><span class="p">,</span> <span class="s">&#39;cant&#39;</span><span class="p">,</span> <span class="s">&#39;cause&#39;</span><span class="p">,</span> <span class="s">&#39;causes&#39;</span><span class="p">,</span> <span class="s">&#39;certain&#39;</span><span class="p">,</span> <span class="s">&#39;certainly&#39;</span><span class="p">,</span> <span 
class="s">&#39;changes&#39;</span><span class="p">,</span> <span class="s">&#39;clearly&#39;</span><span class="p">,</span> <span class="s">&#39;co&#39;</span><span class="p">,</span> <span class="s">&#39;com&#39;</span><span class="p">,</span> <span class="s">&#39;come&#39;</span><span class="p">,</span> <span class="s">&#39;comes&#39;</span><span class="p">,</span> <span class="s">&#39;concerning&#39;</span><span class="p">,</span> <span class="s">&#39;consequently&#39;</span><span class="p">,</span> <span class="s">&#39;consider&#39;</span><span class="p">,</span> <span class="s">&#39;considering&#39;</span><span class="p">,</span> <span class="s">&#39;contain&#39;</span><span class="p">,</span> <span class="s">&#39;containing&#39;</span><span class="p">,</span> <span class="s">&#39;contains&#39;</span><span class="p">,</span> <span class="s">&#39;corresponding&#39;</span><span class="p">,</span> <span class="s">&#39;could&#39;</span><span class="p">,</span> <span class="s">&#39;couldn</span><span class="se">\&#39;</span><span class="s">t&#39;</span><span class="p">,</span> <span class="s">&#39;course&#39;</span><span class="p">,</span> <span class="s">&#39;currently&#39;</span><span class="p">,</span> <span class="s">&#39;definitely&#39;</span><span class="p">,</span> <span class="s">&#39;described&#39;</span><span class="p">,</span> <span class="s">&#39;despite&#39;</span><span class="p">,</span> <span class="s">&#39;did&#39;</span><span class="p">,</span> <span class="s">&#39;didn</span><span class="se">\&#39;</span><span class="s">t&#39;</span><span class="p">,</span> <span class="s">&#39;different&#39;</span><span class="p">,</span> <span class="s">&#39;do&#39;</span><span class="p">,</span> <span class="s">&#39;does&#39;</span><span class="p">,</span> <span class="s">&#39;doesn</span><span class="se">\&#39;</span><span class="s">t&#39;</span><span class="p">,</span> <span class="s">&#39;doing&#39;</span><span class="p">,</span> <span 
class="s">&#39;don</span><span class="se">\&#39;</span><span class="s">t&#39;</span><span class="p">,</span> <span class="s">&#39;done&#39;</span><span class="p">,</span> <span class="s">&#39;down&#39;</span><span class="p">,</span> <span class="s">&#39;downwards&#39;</span><span class="p">,</span> <span class="s">&#39;during&#39;</span><span class="p">,</span> <span class="s">&#39;each&#39;</span><span class="p">,</span> <span class="s">&#39;edu&#39;</span><span class="p">,</span> <span class="s">&#39;eg&#39;</span><span class="p">,</span> <span class="s">&#39;eight&#39;</span><span class="p">,</span> <span class="s">&#39;either&#39;</span><span class="p">,</span> <span class="s">&#39;else&#39;</span><span class="p">,</span> <span class="s">&#39;elsewhere&#39;</span><span class="p">,</span> <span class="s">&#39;enough&#39;</span><span class="p">,</span> <span class="s">&#39;entirely&#39;</span><span class="p">,</span> <span class="s">&#39;especially&#39;</span><span class="p">,</span> <span class="s">&#39;et&#39;</span><span class="p">,</span> <span class="s">&#39;etc&#39;</span><span class="p">,</span> <span class="s">&#39;even&#39;</span><span class="p">,</span> <span class="s">&#39;ever&#39;</span><span class="p">,</span> <span class="s">&#39;every&#39;</span><span class="p">,</span> <span class="s">&#39;everybody&#39;</span><span class="p">,</span> <span class="s">&#39;everyone&#39;</span><span class="p">,</span> <span class="s">&#39;everything&#39;</span><span class="p">,</span> <span class="s">&#39;everywhere&#39;</span><span class="p">,</span> <span class="s">&#39;ex&#39;</span><span class="p">,</span> <span class="s">&#39;exactly&#39;</span><span class="p">,</span> <span class="s">&#39;example&#39;</span><span class="p">,</span> <span class="s">&#39;except&#39;</span><span class="p">,</span> <span class="s">&#39;far&#39;</span><span class="p">,</span> <span class="s">&#39;few&#39;</span><span class="p">,</span> <span class="s">&#39;fifth&#39;</span><span 
class="p">,</span> <span class="s">&#39;first&#39;</span><span class="p">,</span> <span class="s">&#39;five&#39;</span><span class="p">,</span> <span class="s">&#39;followed&#39;</span><span class="p">,</span> <span class="s">&#39;following&#39;</span><span class="p">,</span> <span class="s">&#39;follows&#39;</span><span class="p">,</span> <span class="s">&#39;for&#39;</span><span class="p">,</span> <span class="s">&#39;former&#39;</span><span class="p">,</span> <span class="s">&#39;formerly&#39;</span><span class="p">,</span> <span class="s">&#39;forth&#39;</span><span class="p">,</span> <span class="s">&#39;four&#39;</span><span class="p">,</span> <span class="s">&#39;from&#39;</span><span class="p">,</span> <span class="s">&#39;further&#39;</span><span class="p">,</span> <span class="s">&#39;furthermore&#39;</span><span class="p">,</span> <span class="s">&#39;get&#39;</span><span class="p">,</span> <span class="s">&#39;gets&#39;</span><span class="p">,</span> <span class="s">&#39;getting&#39;</span><span class="p">,</span> <span class="s">&#39;given&#39;</span><span class="p">,</span> <span class="s">&#39;gives&#39;</span><span class="p">,</span> <span class="s">&#39;go&#39;</span><span class="p">,</span> <span class="s">&#39;goes&#39;</span><span class="p">,</span> <span class="s">&#39;going&#39;</span><span class="p">,</span> <span class="s">&#39;gone&#39;</span><span class="p">,</span> <span class="s">&#39;got&#39;</span><span class="p">,</span> <span class="s">&#39;gotten&#39;</span><span class="p">,</span> <span class="s">&#39;greetings&#39;</span><span class="p">,</span> <span class="s">&#39;had&#39;</span><span class="p">,</span> <span class="s">&#39;hadn</span><span class="se">\&#39;</span><span class="s">t&#39;</span><span class="p">,</span> <span class="s">&#39;happens&#39;</span><span class="p">,</span> <span class="s">&#39;hardly&#39;</span><span class="p">,</span> <span class="s">&#39;has&#39;</span><span class="p">,</span> <span 
class="s">&#39;hasn</span><span class="se">\&#39;</span><span class="s">t&#39;</span><span class="p">,</span> <span class="s">&#39;have&#39;</span><span class="p">,</span> <span class="s">&#39;haven</span><span class="se">\&#39;</span><span class="s">t&#39;</span><span class="p">,</span> <span class="s">&#39;having&#39;</span><span class="p">,</span> <span class="s">&#39;he&#39;</span><span class="p">,</span> <span class="s">&#39;he</span><span class="se">\&#39;</span><span class="s">s&#39;</span><span class="p">,</span> <span class="s">&#39;hello&#39;</span><span class="p">,</span> <span class="s">&#39;help&#39;</span><span class="p">,</span> <span class="s">&#39;hence&#39;</span><span class="p">,</span> <span class="s">&#39;her&#39;</span><span class="p">,</span> <span class="s">&#39;here&#39;</span><span class="p">,</span> <span class="s">&#39;here</span><span class="se">\&#39;</span><span class="s">s&#39;</span><span class="p">,</span> <span class="s">&#39;hereafter&#39;</span><span class="p">,</span> <span class="s">&#39;hereby&#39;</span><span class="p">,</span> <span class="s">&#39;herein&#39;</span><span class="p">,</span> <span class="s">&#39;hereupon&#39;</span><span class="p">,</span> <span class="s">&#39;hers&#39;</span><span class="p">,</span> <span class="s">&#39;herself&#39;</span><span class="p">,</span> <span class="s">&#39;hi&#39;</span><span class="p">,</span> <span class="s">&#39;him&#39;</span><span class="p">,</span> <span class="s">&#39;himself&#39;</span><span class="p">,</span> <span class="s">&#39;his&#39;</span><span class="p">,</span> <span class="s">&#39;hither&#39;</span><span class="p">,</span> <span class="s">&#39;hopefully&#39;</span><span class="p">,</span> <span class="s">&#39;how&#39;</span><span class="p">,</span> <span class="s">&#39;howbeit&#39;</span><span class="p">,</span> <span class="s">&#39;however&#39;</span><span class="p">,</span> <span class="s">&#39;i</span><span class="se">\&#39;</span><span 
class="s">d&#39;</span><span class="p">,</span> <span class="s">&#39;i</span><span class="se">\&#39;</span><span class="s">ll&#39;</span><span class="p">,</span> <span class="s">&#39;i</span><span class="se">\&#39;</span><span class="s">m&#39;</span><span class="p">,</span> <span class="s">&#39;i</span><span class="se">\&#39;</span><span class="s">ve&#39;</span><span class="p">,</span> <span class="s">&#39;ie&#39;</span><span class="p">,</span> <span class="s">&#39;if&#39;</span><span class="p">,</span> <span class="s">&#39;ignored&#39;</span><span class="p">,</span> <span class="s">&#39;immediate&#39;</span><span class="p">,</span> <span class="s">&#39;in&#39;</span><span class="p">,</span> <span class="s">&#39;inasmuch&#39;</span><span class="p">,</span> <span class="s">&#39;inc&#39;</span><span class="p">,</span> <span class="s">&#39;indeed&#39;</span><span class="p">,</span> <span class="s">&#39;indicate&#39;</span><span class="p">,</span> <span class="s">&#39;indicated&#39;</span><span class="p">,</span> <span class="s">&#39;indicates&#39;</span><span class="p">,</span> <span class="s">&#39;inner&#39;</span><span class="p">,</span> <span class="s">&#39;insofar&#39;</span><span class="p">,</span> <span class="s">&#39;instead&#39;</span><span class="p">,</span> <span class="s">&#39;into&#39;</span><span class="p">,</span> <span class="s">&#39;inward&#39;</span><span class="p">,</span> <span class="s">&#39;is&#39;</span><span class="p">,</span> <span class="s">&#39;isn</span><span class="se">\&#39;</span><span class="s">t&#39;</span><span class="p">,</span> <span class="s">&#39;it&#39;</span><span class="p">,</span> <span class="s">&#39;it</span><span class="se">\&#39;</span><span class="s">d&#39;</span><span class="p">,</span> <span class="s">&#39;it</span><span class="se">\&#39;</span><span class="s">ll&#39;</span><span class="p">,</span> <span class="s">&#39;it</span><span class="se">\&#39;</span><span class="s">s&#39;</span><span class="p">,</span> <span 
class="s">&#39;its&#39;</span><span class="p">,</span> <span class="s">&#39;itself&#39;</span><span class="p">,</span> <span class="s">&#39;just&#39;</span><span class="p">,</span> <span class="s">&#39;keep&#39;</span><span class="p">,</span> <span class="s">&#39;keeps&#39;</span><span class="p">,</span> <span class="s">&#39;kept&#39;</span><span class="p">,</span> <span class="s">&#39;know&#39;</span><span class="p">,</span> <span class="s">&#39;known&#39;</span><span class="p">,</span> <span class="s">&#39;knows&#39;</span><span class="p">,</span> <span class="s">&#39;last&#39;</span><span class="p">,</span> <span class="s">&#39;lately&#39;</span><span class="p">,</span> <span class="s">&#39;later&#39;</span><span class="p">,</span> <span class="s">&#39;latter&#39;</span><span class="p">,</span> <span class="s">&#39;latterly&#39;</span><span class="p">,</span> <span class="s">&#39;least&#39;</span><span class="p">,</span> <span class="s">&#39;less&#39;</span><span class="p">,</span> <span class="s">&#39;lest&#39;</span><span class="p">,</span> <span class="s">&#39;let&#39;</span><span class="p">,</span> <span class="s">&#39;let</span><span class="se">\&#39;</span><span class="s">s&#39;</span><span class="p">,</span> <span class="s">&#39;like&#39;</span><span class="p">,</span> <span class="s">&#39;liked&#39;</span><span class="p">,</span> <span class="s">&#39;likely&#39;</span><span class="p">,</span> <span class="s">&#39;little&#39;</span><span class="p">,</span> <span class="s">&#39;look&#39;</span><span class="p">,</span> <span class="s">&#39;looking&#39;</span><span class="p">,</span> <span class="s">&#39;looks&#39;</span><span class="p">,</span> <span class="s">&#39;ltd&#39;</span><span class="p">,</span> <span class="s">&#39;mainly&#39;</span><span class="p">,</span> <span class="s">&#39;many&#39;</span><span class="p">,</span> <span class="s">&#39;may&#39;</span><span class="p">,</span> <span class="s">&#39;maybe&#39;</span><span class="p">,</span> <span 
class="s">&#39;me&#39;</span><span class="p">,</span> <span class="s">&#39;mean&#39;</span><span class="p">,</span> <span class="s">&#39;meanwhile&#39;</span><span class="p">,</span> <span class="s">&#39;merely&#39;</span><span class="p">,</span> <span class="s">&#39;might&#39;</span><span class="p">,</span> <span class="s">&#39;more&#39;</span><span class="p">,</span> <span class="s">&#39;moreover&#39;</span><span class="p">,</span> <span class="s">&#39;most&#39;</span><span class="p">,</span> <span class="s">&#39;mostly&#39;</span><span class="p">,</span> <span class="s">&#39;much&#39;</span><span class="p">,</span> <span class="s">&#39;must&#39;</span><span class="p">,</span> <span class="s">&#39;my&#39;</span><span class="p">,</span> <span class="s">&#39;myself&#39;</span><span class="p">,</span> <span class="s">&#39;name&#39;</span><span class="p">,</span> <span class="s">&#39;namely&#39;</span><span class="p">,</span> <span class="s">&#39;nd&#39;</span><span class="p">,</span> <span class="s">&#39;near&#39;</span><span class="p">,</span> <span class="s">&#39;nearly&#39;</span><span class="p">,</span> <span class="s">&#39;necessary&#39;</span><span class="p">,</span> <span class="s">&#39;need&#39;</span><span class="p">,</span> <span class="s">&#39;needs&#39;</span><span class="p">,</span> <span class="s">&#39;neither&#39;</span><span class="p">,</span> <span class="s">&#39;never&#39;</span><span class="p">,</span> <span class="s">&#39;nevertheless&#39;</span><span class="p">,</span> <span class="s">&#39;new&#39;</span><span class="p">,</span> <span class="s">&#39;next&#39;</span><span class="p">,</span> <span class="s">&#39;nine&#39;</span><span class="p">,</span> <span class="s">&#39;no&#39;</span><span class="p">,</span> <span class="s">&#39;nobody&#39;</span><span class="p">,</span> <span class="s">&#39;non&#39;</span><span class="p">,</span> <span class="s">&#39;none&#39;</span><span class="p">,</span> <span class="s">&#39;noone&#39;</span><span 
class="p">,</span> <span class="s">&#39;nor&#39;</span><span class="p">,</span> <span class="s">&#39;normally&#39;</span><span class="p">,</span> <span class="s">&#39;not&#39;</span><span class="p">,</span> <span class="s">&#39;nothing&#39;</span><span class="p">,</span> <span class="s">&#39;novel&#39;</span><span class="p">,</span> <span class="s">&#39;now&#39;</span><span class="p">,</span> <span class="s">&#39;nowhere&#39;</span><span class="p">,</span> <span class="s">&#39;obviously&#39;</span><span class="p">,</span> <span class="s">&#39;of&#39;</span><span class="p">,</span> <span class="s">&#39;off&#39;</span><span class="p">,</span> <span class="s">&#39;often&#39;</span><span class="p">,</span> <span class="s">&#39;oh&#39;</span><span class="p">,</span> <span class="s">&#39;ok&#39;</span><span class="p">,</span> <span class="s">&#39;okay&#39;</span><span class="p">,</span> <span class="s">&#39;old&#39;</span><span class="p">,</span> <span class="s">&#39;on&#39;</span><span class="p">,</span> <span class="s">&#39;once&#39;</span><span class="p">,</span> <span class="s">&#39;one&#39;</span><span class="p">,</span> <span class="s">&#39;ones&#39;</span><span class="p">,</span> <span class="s">&#39;only&#39;</span><span class="p">,</span> <span class="s">&#39;onto&#39;</span><span class="p">,</span> <span class="s">&#39;or&#39;</span><span class="p">,</span> <span class="s">&#39;other&#39;</span><span class="p">,</span> <span class="s">&#39;others&#39;</span><span class="p">,</span> <span class="s">&#39;otherwise&#39;</span><span class="p">,</span> <span class="s">&#39;ought&#39;</span><span class="p">,</span> <span class="s">&#39;our&#39;</span><span class="p">,</span> <span class="s">&#39;ours&#39;</span><span class="p">,</span> <span class="s">&#39;ourselves&#39;</span><span class="p">,</span> <span class="s">&#39;out&#39;</span><span class="p">,</span> <span class="s">&#39;outside&#39;</span><span class="p">,</span> <span class="s">&#39;over&#39;</span><span 
class="p">,</span> <span class="s">&#39;overall&#39;</span><span class="p">,</span> <span class="s">&#39;own&#39;</span><span class="p">,</span> <span class="s">&#39;particular&#39;</span><span class="p">,</span> <span class="s">&#39;particularly&#39;</span><span class="p">,</span> <span class="s">&#39;per&#39;</span><span class="p">,</span> <span class="s">&#39;perhaps&#39;</span><span class="p">,</span> <span class="s">&#39;placed&#39;</span><span class="p">,</span> <span class="s">&#39;please&#39;</span><span class="p">,</span> <span class="s">&#39;plus&#39;</span><span class="p">,</span> <span class="s">&#39;possible&#39;</span><span class="p">,</span> <span class="s">&#39;presumably&#39;</span><span class="p">,</span> <span class="s">&#39;probably&#39;</span><span class="p">,</span> <span class="s">&#39;provides&#39;</span><span class="p">,</span> <span class="s">&#39;que&#39;</span><span class="p">,</span> <span class="s">&#39;quite&#39;</span><span class="p">,</span> <span class="s">&#39;qv&#39;</span><span class="p">,</span> <span class="s">&#39;rather&#39;</span><span class="p">,</span> <span class="s">&#39;rd&#39;</span><span class="p">,</span> <span class="s">&#39;re&#39;</span><span class="p">,</span> <span class="s">&#39;really&#39;</span><span class="p">,</span> <span class="s">&#39;reasonably&#39;</span><span class="p">,</span> <span class="s">&#39;regarding&#39;</span><span class="p">,</span> <span class="s">&#39;regardless&#39;</span><span class="p">,</span> <span class="s">&#39;regards&#39;</span><span class="p">,</span> <span class="s">&#39;relatively&#39;</span><span class="p">,</span> <span class="s">&#39;respectively&#39;</span><span class="p">,</span> <span class="s">&#39;right&#39;</span><span class="p">,</span> <span class="s">&#39;said&#39;</span><span class="p">,</span> <span class="s">&#39;same&#39;</span><span class="p">,</span> <span class="s">&#39;saw&#39;</span><span class="p">,</span> <span class="s">&#39;say&#39;</span><span 
class="p">,</span> <span class="s">&#39;saying&#39;</span><span class="p">,</span> <span class="s">&#39;says&#39;</span><span class="p">,</span> <span class="s">&#39;second&#39;</span><span class="p">,</span> <span class="s">&#39;secondly&#39;</span><span class="p">,</span> <span class="s">&#39;see&#39;</span><span class="p">,</span> <span class="s">&#39;seeing&#39;</span><span class="p">,</span> <span class="s">&#39;seem&#39;</span><span class="p">,</span> <span class="s">&#39;seemed&#39;</span><span class="p">,</span> <span class="s">&#39;seeming&#39;</span><span class="p">,</span> <span class="s">&#39;seems&#39;</span><span class="p">,</span> <span class="s">&#39;seen&#39;</span><span class="p">,</span> <span class="s">&#39;self&#39;</span><span class="p">,</span> <span class="s">&#39;selves&#39;</span><span class="p">,</span> <span class="s">&#39;sensible&#39;</span><span class="p">,</span> <span class="s">&#39;sent&#39;</span><span class="p">,</span> <span class="s">&#39;serious&#39;</span><span class="p">,</span> <span class="s">&#39;seriously&#39;</span><span class="p">,</span> <span class="s">&#39;seven&#39;</span><span class="p">,</span> <span class="s">&#39;several&#39;</span><span class="p">,</span> <span class="s">&#39;shall&#39;</span><span class="p">,</span> <span class="s">&#39;she&#39;</span><span class="p">,</span> <span class="s">&#39;should&#39;</span><span class="p">,</span> <span class="s">&#39;shouldn</span><span class="se">\&#39;</span><span class="s">t&#39;</span><span class="p">,</span> <span class="s">&#39;since&#39;</span><span class="p">,</span> <span class="s">&#39;six&#39;</span><span class="p">,</span> <span class="s">&#39;so&#39;</span><span class="p">,</span> <span class="s">&#39;some&#39;</span><span class="p">,</span> <span class="s">&#39;somebody&#39;</span><span class="p">,</span> <span class="s">&#39;somehow&#39;</span><span class="p">,</span> <span class="s">&#39;someone&#39;</span><span class="p">,</span> <span 
class="s">&#39;something&#39;</span><span class="p">,</span> <span class="s">&#39;sometime&#39;</span><span class="p">,</span> <span class="s">&#39;sometimes&#39;</span><span class="p">,</span> <span class="s">&#39;somewhat&#39;</span><span class="p">,</span> <span class="s">&#39;somewhere&#39;</span><span class="p">,</span> <span class="s">&#39;soon&#39;</span><span class="p">,</span> <span class="s">&#39;sorry&#39;</span><span class="p">,</span> <span class="s">&#39;specified&#39;</span><span class="p">,</span> <span class="s">&#39;specify&#39;</span><span class="p">,</span> <span class="s">&#39;specifying&#39;</span><span class="p">,</span> <span class="s">&#39;still&#39;</span><span class="p">,</span> <span class="s">&#39;sub&#39;</span><span class="p">,</span> <span class="s">&#39;such&#39;</span><span class="p">,</span> <span class="s">&#39;sup&#39;</span><span class="p">,</span> <span class="s">&#39;sure&#39;</span><span class="p">,</span> <span class="s">&#39;t</span><span class="se">\&#39;</span><span class="s">s&#39;</span><span class="p">,</span> <span class="s">&#39;take&#39;</span><span class="p">,</span> <span class="s">&#39;taken&#39;</span><span class="p">,</span> <span class="s">&#39;tell&#39;</span><span class="p">,</span> <span class="s">&#39;tends&#39;</span><span class="p">,</span> <span class="s">&#39;th&#39;</span><span class="p">,</span> <span class="s">&#39;than&#39;</span><span class="p">,</span> <span class="s">&#39;thank&#39;</span><span class="p">,</span> <span class="s">&#39;thanks&#39;</span><span class="p">,</span> <span class="s">&#39;thanx&#39;</span><span class="p">,</span> <span class="s">&#39;that&#39;</span><span class="p">,</span> <span class="s">&#39;that</span><span class="se">\&#39;</span><span class="s">s&#39;</span><span class="p">,</span> <span class="s">&#39;thats&#39;</span><span class="p">,</span> <span class="s">&#39;the&#39;</span><span class="p">,</span> <span class="s">&#39;their&#39;</span><span 
class="p">,</span> <span class="s">&#39;theirs&#39;</span><span class="p">,</span> <span class="s">&#39;them&#39;</span><span class="p">,</span> <span class="s">&#39;themselves&#39;</span><span class="p">,</span> <span class="s">&#39;then&#39;</span><span class="p">,</span> <span class="s">&#39;thence&#39;</span><span class="p">,</span> <span class="s">&#39;there&#39;</span><span class="p">,</span> <span class="s">&#39;there</span><span class="se">\&#39;</span><span class="s">s&#39;</span><span class="p">,</span> <span class="s">&#39;thereafter&#39;</span><span class="p">,</span> <span class="s">&#39;thereby&#39;</span><span class="p">,</span> <span class="s">&#39;therefore&#39;</span><span class="p">,</span> <span class="s">&#39;therein&#39;</span><span class="p">,</span> <span class="s">&#39;theres&#39;</span><span class="p">,</span> <span class="s">&#39;thereupon&#39;</span><span class="p">,</span> <span class="s">&#39;these&#39;</span><span class="p">,</span> <span class="s">&#39;they&#39;</span><span class="p">,</span> <span class="s">&#39;they</span><span class="se">\&#39;</span><span class="s">d&#39;</span><span class="p">,</span> <span class="s">&#39;they</span><span class="se">\&#39;</span><span class="s">ll&#39;</span><span class="p">,</span> <span class="s">&#39;they</span><span class="se">\&#39;</span><span class="s">re&#39;</span><span class="p">,</span> <span class="s">&#39;they</span><span class="se">\&#39;</span><span class="s">ve&#39;</span><span class="p">,</span> <span class="s">&#39;think&#39;</span><span class="p">,</span> <span class="s">&#39;third&#39;</span><span class="p">,</span> <span class="s">&#39;this&#39;</span><span class="p">,</span> <span class="s">&#39;thorough&#39;</span><span class="p">,</span> <span class="s">&#39;thoroughly&#39;</span><span class="p">,</span> <span class="s">&#39;those&#39;</span><span class="p">,</span> <span class="s">&#39;though&#39;</span><span class="p">,</span> <span class="s">&#39;three&#39;</span><span 
class="p">,</span> <span class="s">&#39;through&#39;</span><span class="p">,</span> <span class="s">&#39;throughout&#39;</span><span class="p">,</span> <span class="s">&#39;thru&#39;</span><span class="p">,</span> <span class="s">&#39;thus&#39;</span><span class="p">,</span> <span class="s">&#39;to&#39;</span><span class="p">,</span> <span class="s">&#39;together&#39;</span><span class="p">,</span> <span class="s">&#39;too&#39;</span><span class="p">,</span> <span class="s">&#39;took&#39;</span><span class="p">,</span> <span class="s">&#39;toward&#39;</span><span class="p">,</span> <span class="s">&#39;towards&#39;</span><span class="p">,</span> <span class="s">&#39;tried&#39;</span><span class="p">,</span> <span class="s">&#39;tries&#39;</span><span class="p">,</span> <span class="s">&#39;truly&#39;</span><span class="p">,</span> <span class="s">&#39;try&#39;</span><span class="p">,</span> <span class="s">&#39;trying&#39;</span><span class="p">,</span> <span class="s">&#39;twice&#39;</span><span class="p">,</span> <span class="s">&#39;two&#39;</span><span class="p">,</span> <span class="s">&#39;un&#39;</span><span class="p">,</span> <span class="s">&#39;under&#39;</span><span class="p">,</span> <span class="s">&#39;unfortunately&#39;</span><span class="p">,</span> <span class="s">&#39;unless&#39;</span><span class="p">,</span> <span class="s">&#39;unlikely&#39;</span><span class="p">,</span> <span class="s">&#39;until&#39;</span><span class="p">,</span> <span class="s">&#39;unto&#39;</span><span class="p">,</span> <span class="s">&#39;up&#39;</span><span class="p">,</span> <span class="s">&#39;upon&#39;</span><span class="p">,</span> <span class="s">&#39;us&#39;</span><span class="p">,</span> <span class="s">&#39;use&#39;</span><span class="p">,</span> <span class="s">&#39;used&#39;</span><span class="p">,</span> <span class="s">&#39;useful&#39;</span><span class="p">,</span> <span class="s">&#39;uses&#39;</span><span class="p">,</span> <span 
class="s">&#39;using&#39;</span><span class="p">,</span> <span class="s">&#39;usually&#39;</span><span class="p">,</span> <span class="s">&#39;value&#39;</span><span class="p">,</span> <span class="s">&#39;various&#39;</span><span class="p">,</span> <span class="s">&#39;very&#39;</span><span class="p">,</span> <span class="s">&#39;via&#39;</span><span class="p">,</span> <span class="s">&#39;viz&#39;</span><span class="p">,</span> <span class="s">&#39;vs&#39;</span><span class="p">,</span> <span class="s">&#39;want&#39;</span><span class="p">,</span> <span class="s">&#39;wants&#39;</span><span class="p">,</span> <span class="s">&#39;was&#39;</span><span class="p">,</span> <span class="s">&#39;wasn</span><span class="se">\&#39;</span><span class="s">t&#39;</span><span class="p">,</span> <span class="s">&#39;way&#39;</span><span class="p">,</span> <span class="s">&#39;we&#39;</span><span class="p">,</span> <span class="s">&#39;we</span><span class="se">\&#39;</span><span class="s">d&#39;</span><span class="p">,</span> <span class="s">&#39;we</span><span class="se">\&#39;</span><span class="s">ll&#39;</span><span class="p">,</span> <span class="s">&#39;we</span><span class="se">\&#39;</span><span class="s">re&#39;</span><span class="p">,</span> <span class="s">&#39;we</span><span class="se">\&#39;</span><span class="s">ve&#39;</span><span class="p">,</span> <span class="s">&#39;welcome&#39;</span><span class="p">,</span> <span class="s">&#39;well&#39;</span><span class="p">,</span> <span class="s">&#39;went&#39;</span><span class="p">,</span> <span class="s">&#39;were&#39;</span><span class="p">,</span> <span class="s">&#39;weren</span><span class="se">\&#39;</span><span class="s">t&#39;</span><span class="p">,</span> <span class="s">&#39;what&#39;</span><span class="p">,</span> <span class="s">&#39;what</span><span class="se">\&#39;</span><span class="s">s&#39;</span><span class="p">,</span> <span class="s">&#39;whatever&#39;</span><span class="p">,</span> <span 
class="s">&#39;when&#39;</span><span class="p">,</span> <span class="s">&#39;whence&#39;</span><span class="p">,</span> <span class="s">&#39;whenever&#39;</span><span class="p">,</span> <span class="s">&#39;where&#39;</span><span class="p">,</span> <span class="s">&#39;where</span><span class="se">\&#39;</span><span class="s">s&#39;</span><span class="p">,</span> <span class="s">&#39;whereafter&#39;</span><span class="p">,</span> <span class="s">&#39;whereas&#39;</span><span class="p">,</span> <span class="s">&#39;whereby&#39;</span><span class="p">,</span> <span class="s">&#39;wherein&#39;</span><span class="p">,</span> <span class="s">&#39;whereupon&#39;</span><span class="p">,</span> <span class="s">&#39;wherever&#39;</span><span class="p">,</span> <span class="s">&#39;whether&#39;</span><span class="p">,</span> <span class="s">&#39;which&#39;</span><span class="p">,</span> <span class="s">&#39;while&#39;</span><span class="p">,</span> <span class="s">&#39;whither&#39;</span><span class="p">,</span> <span class="s">&#39;who&#39;</span><span class="p">,</span> <span class="s">&#39;who</span><span class="se">\&#39;</span><span class="s">s&#39;</span><span class="p">,</span> <span class="s">&#39;whoever&#39;</span><span class="p">,</span> <span class="s">&#39;whole&#39;</span><span class="p">,</span> <span class="s">&#39;whom&#39;</span><span class="p">,</span> <span class="s">&#39;whose&#39;</span><span class="p">,</span> <span class="s">&#39;why&#39;</span><span class="p">,</span> <span class="s">&#39;will&#39;</span><span class="p">,</span> <span class="s">&#39;willing&#39;</span><span class="p">,</span> <span class="s">&#39;wish&#39;</span><span class="p">,</span> <span class="s">&#39;with&#39;</span><span class="p">,</span> <span class="s">&#39;within&#39;</span><span class="p">,</span> <span class="s">&#39;without&#39;</span><span class="p">,</span> <span class="s">&#39;won</span><span class="se">\&#39;</span><span class="s">t&#39;</span><span 
class="p">,</span> <span class="s">&#39;wonder&#39;</span><span class="p">,</span> <span class="s">&#39;would&#39;</span><span class="p">,</span> <span class="s">&#39;wouldn</span><span class="se">\&#39;</span><span class="s">t&#39;</span><span class="p">,</span> <span class="s">&#39;yes&#39;</span><span class="p">,</span> <span class="s">&#39;yet&#39;</span><span class="p">,</span> <span class="s">&#39;you&#39;</span><span class="p">,</span> <span class="s">&#39;you</span><span class="se">\&#39;</span><span class="s">d&#39;</span><span class="p">,</span> <span class="s">&#39;you</span><span class="se">\&#39;</span><span class="s">ll&#39;</span><span class="p">,</span> <span class="s">&#39;you</span><span class="se">\&#39;</span><span class="s">re&#39;</span><span class="p">,</span> <span class="s">&#39;you</span><span class="se">\&#39;</span><span class="s">ve&#39;</span><span class="p">,</span> <span class="s">&#39;your&#39;</span><span class="p">,</span> <span class="s">&#39;yours&#39;</span><span class="p">,</span> <span class="s">&#39;yourself&#39;</span><span class="p">,</span> <span class="s">&#39;yourselves&#39;</span><span class="p">,</span> <span class="s">&#39;zero&#39;</span><span class="p">])</span>
-<span class="sd">&quot;&quot;&quot;Stopwords based on MySQL `http://dev.mysql.com/doc/refman/5.5/en/fulltext-stopwords.html`&quot;&quot;&quot;</span>
+<span class="n">STOPWORDS</span> <span class="o">=</span> <span class="nb">set</span><span class="p">(</span><span class="nb">map</span><span class="p">(</span><span class="nb">unicode</span><span class="p">,</span> <span class="p">[</span><span class="s">&#39;a&#39;</span><span class="p">,</span> <span class="s">&#39;a</span><span class="se">\&#39;</span><span class="s">s&#39;</span><span class="p">,</span> <span class="s">&#39;able&#39;</span><span class="p">,</span> <span class="s">&#39;about&#39;</span><span class="p">,</span> <span class="s">&#39;above&#39;</span><span class="p">,</span> <span class="s">&#39;according&#39;</span><span class="p">,</span> <span class="s">&#39;accordingly&#39;</span><span class="p">,</span> <span class="s">&#39;across&#39;</span><span class="p">,</span> <span class="s">&#39;actually&#39;</span><span class="p">,</span> <span class="s">&#39;after&#39;</span><span class="p">,</span> <span class="s">&#39;afterwards&#39;</span><span class="p">,</span> <span class="s">&#39;again&#39;</span><span class="p">,</span> <span class="s">&#39;against&#39;</span><span class="p">,</span> <span class="s">&#39;ain</span><span class="se">\&#39;</span><span class="s">t&#39;</span><span class="p">,</span> <span class="s">&#39;all&#39;</span><span class="p">,</span> <span class="s">&#39;allow&#39;</span><span class="p">,</span> <span class="s">&#39;allows&#39;</span><span class="p">,</span> <span class="s">&#39;almost&#39;</span><span class="p">,</span> <span class="s">&#39;alone&#39;</span><span class="p">,</span> <span class="s">&#39;along&#39;</span><span class="p">,</span> <span class="s">&#39;already&#39;</span><span class="p">,</span> <span class="s">&#39;also&#39;</span><span class="p">,</span> <span class="s">&#39;although&#39;</span><span class="p">,</span> <span class="s">&#39;always&#39;</span><span class="p">,</span> <span class="s">&#39;am&#39;</span><span class="p">,</span> <span class="s">&#39;among&#39;</span><span class="p">,</span> 
<span class="s">&#39;amongst&#39;</span><span class="p">,</span> <span class="s">&#39;an&#39;</span><span class="p">,</span> <span class="s">&#39;and&#39;</span><span class="p">,</span> <span class="s">&#39;another&#39;</span><span class="p">,</span> <span class="s">&#39;any&#39;</span><span class="p">,</span> <span class="s">&#39;anybody&#39;</span><span class="p">,</span> <span class="s">&#39;anyhow&#39;</span><span class="p">,</span> <span class="s">&#39;anyone&#39;</span><span class="p">,</span> <span class="s">&#39;anything&#39;</span><span class="p">,</span> <span class="s">&#39;anyway&#39;</span><span class="p">,</span> <span class="s">&#39;anyways&#39;</span><span class="p">,</span> <span class="s">&#39;anywhere&#39;</span><span class="p">,</span> <span class="s">&#39;apart&#39;</span><span class="p">,</span> <span class="s">&#39;appear&#39;</span><span class="p">,</span> <span class="s">&#39;appreciate&#39;</span><span class="p">,</span> <span class="s">&#39;appropriate&#39;</span><span class="p">,</span> <span class="s">&#39;are&#39;</span><span class="p">,</span> <span class="s">&#39;aren</span><span class="se">\&#39;</span><span class="s">t&#39;</span><span class="p">,</span> <span class="s">&#39;around&#39;</span><span class="p">,</span> <span class="s">&#39;as&#39;</span><span class="p">,</span> <span class="s">&#39;aside&#39;</span><span class="p">,</span> <span class="s">&#39;ask&#39;</span><span class="p">,</span> <span class="s">&#39;asking&#39;</span><span class="p">,</span> <span class="s">&#39;associated&#39;</span><span class="p">,</span> <span class="s">&#39;at&#39;</span><span class="p">,</span> <span class="s">&#39;available&#39;</span><span class="p">,</span> <span class="s">&#39;away&#39;</span><span class="p">,</span> <span class="s">&#39;awfully&#39;</span><span class="p">,</span> <span class="s">&#39;be&#39;</span><span class="p">,</span> <span class="s">&#39;became&#39;</span><span class="p">,</span> <span 
class="s">&#39;because&#39;</span><span class="p">,</span> <span class="s">&#39;become&#39;</span><span class="p">,</span> <span class="s">&#39;becomes&#39;</span><span class="p">,</span> <span class="s">&#39;becoming&#39;</span><span class="p">,</span> <span class="s">&#39;been&#39;</span><span class="p">,</span> <span class="s">&#39;before&#39;</span><span class="p">,</span> <span class="s">&#39;beforehand&#39;</span><span class="p">,</span> <span class="s">&#39;behind&#39;</span><span class="p">,</span> <span class="s">&#39;being&#39;</span><span class="p">,</span> <span class="s">&#39;believe&#39;</span><span class="p">,</span> <span class="s">&#39;below&#39;</span><span class="p">,</span> <span class="s">&#39;beside&#39;</span><span class="p">,</span> <span class="s">&#39;besides&#39;</span><span class="p">,</span> <span class="s">&#39;best&#39;</span><span class="p">,</span> <span class="s">&#39;better&#39;</span><span class="p">,</span> <span class="s">&#39;between&#39;</span><span class="p">,</span> <span class="s">&#39;beyond&#39;</span><span class="p">,</span> <span class="s">&#39;both&#39;</span><span class="p">,</span> <span class="s">&#39;brief&#39;</span><span class="p">,</span> <span class="s">&#39;but&#39;</span><span class="p">,</span> <span class="s">&#39;by&#39;</span><span class="p">,</span> <span class="s">&#39;c</span><span class="se">\&#39;</span><span class="s">mon&#39;</span><span class="p">,</span> <span class="s">&#39;c</span><span class="se">\&#39;</span><span class="s">s&#39;</span><span class="p">,</span> <span class="s">&#39;came&#39;</span><span class="p">,</span> <span class="s">&#39;can&#39;</span><span class="p">,</span> <span class="s">&#39;can</span><span class="se">\&#39;</span><span class="s">t&#39;</span><span class="p">,</span> <span class="s">&#39;cannot&#39;</span><span class="p">,</span> <span class="s">&#39;cant&#39;</span><span class="p">,</span> <span class="s">&#39;cause&#39;</span><span class="p">,</span> <span 
class="s">&#39;causes&#39;</span><span class="p">,</span> <span class="s">&#39;certain&#39;</span><span class="p">,</span> <span class="s">&#39;certainly&#39;</span><span class="p">,</span> <span class="s">&#39;changes&#39;</span><span class="p">,</span> <span class="s">&#39;clearly&#39;</span><span class="p">,</span> <span class="s">&#39;co&#39;</span><span class="p">,</span> <span class="s">&#39;com&#39;</span><span class="p">,</span> <span class="s">&#39;come&#39;</span><span class="p">,</span> <span class="s">&#39;comes&#39;</span><span class="p">,</span> <span class="s">&#39;concerning&#39;</span><span class="p">,</span> <span class="s">&#39;consequently&#39;</span><span class="p">,</span> <span class="s">&#39;consider&#39;</span><span class="p">,</span> <span class="s">&#39;considering&#39;</span><span class="p">,</span> <span class="s">&#39;contain&#39;</span><span class="p">,</span> <span class="s">&#39;containing&#39;</span><span class="p">,</span> <span class="s">&#39;contains&#39;</span><span class="p">,</span> <span class="s">&#39;corresponding&#39;</span><span class="p">,</span> <span class="s">&#39;could&#39;</span><span class="p">,</span> <span class="s">&#39;couldn</span><span class="se">\&#39;</span><span class="s">t&#39;</span><span class="p">,</span> <span class="s">&#39;course&#39;</span><span class="p">,</span> <span class="s">&#39;currently&#39;</span><span class="p">,</span> <span class="s">&#39;definitely&#39;</span><span class="p">,</span> <span class="s">&#39;described&#39;</span><span class="p">,</span> <span class="s">&#39;despite&#39;</span><span class="p">,</span> <span class="s">&#39;did&#39;</span><span class="p">,</span> <span class="s">&#39;didn</span><span class="se">\&#39;</span><span class="s">t&#39;</span><span class="p">,</span> <span class="s">&#39;different&#39;</span><span class="p">,</span> <span class="s">&#39;do&#39;</span><span class="p">,</span> <span class="s">&#39;does&#39;</span><span class="p">,</span> <span 
class="s">&#39;doesn</span><span class="se">\&#39;</span><span class="s">t&#39;</span><span class="p">,</span> <span class="s">&#39;doing&#39;</span><span class="p">,</span> <span class="s">&#39;don</span><span class="se">\&#39;</span><span class="s">t&#39;</span><span class="p">,</span> <span class="s">&#39;done&#39;</span><span class="p">,</span> <span class="s">&#39;down&#39;</span><span class="p">,</span> <span class="s">&#39;downwards&#39;</span><span class="p">,</span> <span class="s">&#39;during&#39;</span><span class="p">,</span> <span class="s">&#39;each&#39;</span><span class="p">,</span> <span class="s">&#39;edu&#39;</span><span class="p">,</span> <span class="s">&#39;eg&#39;</span><span class="p">,</span> <span class="s">&#39;eight&#39;</span><span class="p">,</span> <span class="s">&#39;either&#39;</span><span class="p">,</span> <span class="s">&#39;else&#39;</span><span class="p">,</span> <span class="s">&#39;elsewhere&#39;</span><span class="p">,</span> <span class="s">&#39;enough&#39;</span><span class="p">,</span> <span class="s">&#39;entirely&#39;</span><span class="p">,</span> <span class="s">&#39;especially&#39;</span><span class="p">,</span> <span class="s">&#39;et&#39;</span><span class="p">,</span> <span class="s">&#39;etc&#39;</span><span class="p">,</span> <span class="s">&#39;even&#39;</span><span class="p">,</span> <span class="s">&#39;ever&#39;</span><span class="p">,</span> <span class="s">&#39;every&#39;</span><span class="p">,</span> <span class="s">&#39;everybody&#39;</span><span class="p">,</span> <span class="s">&#39;everyone&#39;</span><span class="p">,</span> <span class="s">&#39;everything&#39;</span><span class="p">,</span> <span class="s">&#39;everywhere&#39;</span><span class="p">,</span> <span class="s">&#39;ex&#39;</span><span class="p">,</span> <span class="s">&#39;exactly&#39;</span><span class="p">,</span> <span class="s">&#39;example&#39;</span><span class="p">,</span> <span class="s">&#39;except&#39;</span><span 
class="p">,</span> <span class="s">&#39;far&#39;</span><span class="p">,</span> <span class="s">&#39;few&#39;</span><span class="p">,</span> <span class="s">&#39;fifth&#39;</span><span class="p">,</span> <span class="s">&#39;first&#39;</span><span class="p">,</span> <span class="s">&#39;five&#39;</span><span class="p">,</span> <span class="s">&#39;followed&#39;</span><span class="p">,</span> <span class="s">&#39;following&#39;</span><span class="p">,</span> <span class="s">&#39;follows&#39;</span><span class="p">,</span> <span class="s">&#39;for&#39;</span><span class="p">,</span> <span class="s">&#39;former&#39;</span><span class="p">,</span> <span class="s">&#39;formerly&#39;</span><span class="p">,</span> <span class="s">&#39;forth&#39;</span><span class="p">,</span> <span class="s">&#39;four&#39;</span><span class="p">,</span> <span class="s">&#39;from&#39;</span><span class="p">,</span> <span class="s">&#39;further&#39;</span><span class="p">,</span> <span class="s">&#39;furthermore&#39;</span><span class="p">,</span> <span class="s">&#39;get&#39;</span><span class="p">,</span> <span class="s">&#39;gets&#39;</span><span class="p">,</span> <span class="s">&#39;getting&#39;</span><span class="p">,</span> <span class="s">&#39;given&#39;</span><span class="p">,</span> <span class="s">&#39;gives&#39;</span><span class="p">,</span> <span class="s">&#39;go&#39;</span><span class="p">,</span> <span class="s">&#39;goes&#39;</span><span class="p">,</span> <span class="s">&#39;going&#39;</span><span class="p">,</span> <span class="s">&#39;gone&#39;</span><span class="p">,</span> <span class="s">&#39;got&#39;</span><span class="p">,</span> <span class="s">&#39;gotten&#39;</span><span class="p">,</span> <span class="s">&#39;greetings&#39;</span><span class="p">,</span> <span class="s">&#39;had&#39;</span><span class="p">,</span> <span class="s">&#39;hadn</span><span class="se">\&#39;</span><span class="s">t&#39;</span><span class="p">,</span> <span 
class="s">&#39;happens&#39;</span><span class="p">,</span> <span class="s">&#39;hardly&#39;</span><span class="p">,</span> <span class="s">&#39;has&#39;</span><span class="p">,</span> <span class="s">&#39;hasn</span><span class="se">\&#39;</span><span class="s">t&#39;</span><span class="p">,</span> <span class="s">&#39;have&#39;</span><span class="p">,</span> <span class="s">&#39;haven</span><span class="se">\&#39;</span><span class="s">t&#39;</span><span class="p">,</span> <span class="s">&#39;having&#39;</span><span class="p">,</span> <span class="s">&#39;he&#39;</span><span class="p">,</span> <span class="s">&#39;he</span><span class="se">\&#39;</span><span class="s">s&#39;</span><span class="p">,</span> <span class="s">&#39;hello&#39;</span><span class="p">,</span> <span class="s">&#39;help&#39;</span><span class="p">,</span> <span class="s">&#39;hence&#39;</span><span class="p">,</span> <span class="s">&#39;her&#39;</span><span class="p">,</span> <span class="s">&#39;here&#39;</span><span class="p">,</span> <span class="s">&#39;here</span><span class="se">\&#39;</span><span class="s">s&#39;</span><span class="p">,</span> <span class="s">&#39;hereafter&#39;</span><span class="p">,</span> <span class="s">&#39;hereby&#39;</span><span class="p">,</span> <span class="s">&#39;herein&#39;</span><span class="p">,</span> <span class="s">&#39;hereupon&#39;</span><span class="p">,</span> <span class="s">&#39;hers&#39;</span><span class="p">,</span> <span class="s">&#39;herself&#39;</span><span class="p">,</span> <span class="s">&#39;hi&#39;</span><span class="p">,</span> <span class="s">&#39;him&#39;</span><span class="p">,</span> <span class="s">&#39;himself&#39;</span><span class="p">,</span> <span class="s">&#39;his&#39;</span><span class="p">,</span> <span class="s">&#39;hither&#39;</span><span class="p">,</span> <span class="s">&#39;hopefully&#39;</span><span class="p">,</span> <span class="s">&#39;how&#39;</span><span class="p">,</span> <span 
class="s">&#39;howbeit&#39;</span><span class="p">,</span> <span class="s">&#39;however&#39;</span><span class="p">,</span> <span class="s">&#39;i</span><span class="se">\&#39;</span><span class="s">d&#39;</span><span class="p">,</span> <span class="s">&#39;i</span><span class="se">\&#39;</span><span class="s">ll&#39;</span><span class="p">,</span> <span class="s">&#39;i</span><span class="se">\&#39;</span><span class="s">m&#39;</span><span class="p">,</span> <span class="s">&#39;i</span><span class="se">\&#39;</span><span class="s">ve&#39;</span><span class="p">,</span> <span class="s">&#39;ie&#39;</span><span class="p">,</span> <span class="s">&#39;if&#39;</span><span class="p">,</span> <span class="s">&#39;ignored&#39;</span><span class="p">,</span> <span class="s">&#39;immediate&#39;</span><span class="p">,</span> <span class="s">&#39;in&#39;</span><span class="p">,</span> <span class="s">&#39;inasmuch&#39;</span><span class="p">,</span> <span class="s">&#39;inc&#39;</span><span class="p">,</span> <span class="s">&#39;indeed&#39;</span><span class="p">,</span> <span class="s">&#39;indicate&#39;</span><span class="p">,</span> <span class="s">&#39;indicated&#39;</span><span class="p">,</span> <span class="s">&#39;indicates&#39;</span><span class="p">,</span> <span class="s">&#39;inner&#39;</span><span class="p">,</span> <span class="s">&#39;insofar&#39;</span><span class="p">,</span> <span class="s">&#39;instead&#39;</span><span class="p">,</span> <span class="s">&#39;into&#39;</span><span class="p">,</span> <span class="s">&#39;inward&#39;</span><span class="p">,</span> <span class="s">&#39;is&#39;</span><span class="p">,</span> <span class="s">&#39;isn</span><span class="se">\&#39;</span><span class="s">t&#39;</span><span class="p">,</span> <span class="s">&#39;it&#39;</span><span class="p">,</span> <span class="s">&#39;it</span><span class="se">\&#39;</span><span class="s">d&#39;</span><span class="p">,</span> <span class="s">&#39;it</span><span 
class="se">\&#39;</span><span class="s">ll&#39;</span><span class="p">,</span> <span class="s">&#39;it</span><span class="se">\&#39;</span><span class="s">s&#39;</span><span class="p">,</span> <span class="s">&#39;its&#39;</span><span class="p">,</span> <span class="s">&#39;itself&#39;</span><span class="p">,</span> <span class="s">&#39;just&#39;</span><span class="p">,</span> <span class="s">&#39;keep&#39;</span><span class="p">,</span> <span class="s">&#39;keeps&#39;</span><span class="p">,</span> <span class="s">&#39;kept&#39;</span><span class="p">,</span> <span class="s">&#39;know&#39;</span><span class="p">,</span> <span class="s">&#39;known&#39;</span><span class="p">,</span> <span class="s">&#39;knows&#39;</span><span class="p">,</span> <span class="s">&#39;last&#39;</span><span class="p">,</span> <span class="s">&#39;lately&#39;</span><span class="p">,</span> <span class="s">&#39;later&#39;</span><span class="p">,</span> <span class="s">&#39;latter&#39;</span><span class="p">,</span> <span class="s">&#39;latterly&#39;</span><span class="p">,</span> <span class="s">&#39;least&#39;</span><span class="p">,</span> <span class="s">&#39;less&#39;</span><span class="p">,</span> <span class="s">&#39;lest&#39;</span><span class="p">,</span> <span class="s">&#39;let&#39;</span><span class="p">,</span> <span class="s">&#39;let</span><span class="se">\&#39;</span><span class="s">s&#39;</span><span class="p">,</span> <span class="s">&#39;like&#39;</span><span class="p">,</span> <span class="s">&#39;liked&#39;</span><span class="p">,</span> <span class="s">&#39;likely&#39;</span><span class="p">,</span> <span class="s">&#39;little&#39;</span><span class="p">,</span> <span class="s">&#39;look&#39;</span><span class="p">,</span> <span class="s">&#39;looking&#39;</span><span class="p">,</span> <span class="s">&#39;looks&#39;</span><span class="p">,</span> <span class="s">&#39;ltd&#39;</span><span class="p">,</span> <span class="s">&#39;mainly&#39;</span><span 
class="p">,</span> <span class="s">&#39;many&#39;</span><span class="p">,</span> <span class="s">&#39;may&#39;</span><span class="p">,</span> <span class="s">&#39;maybe&#39;</span><span class="p">,</span> <span class="s">&#39;me&#39;</span><span class="p">,</span> <span class="s">&#39;mean&#39;</span><span class="p">,</span> <span class="s">&#39;meanwhile&#39;</span><span class="p">,</span> <span class="s">&#39;merely&#39;</span><span class="p">,</span> <span class="s">&#39;might&#39;</span><span class="p">,</span> <span class="s">&#39;more&#39;</span><span class="p">,</span> <span class="s">&#39;moreover&#39;</span><span class="p">,</span> <span class="s">&#39;most&#39;</span><span class="p">,</span> <span class="s">&#39;mostly&#39;</span><span class="p">,</span> <span class="s">&#39;much&#39;</span><span class="p">,</span> <span class="s">&#39;must&#39;</span><span class="p">,</span> <span class="s">&#39;my&#39;</span><span class="p">,</span> <span class="s">&#39;myself&#39;</span><span class="p">,</span> <span class="s">&#39;name&#39;</span><span class="p">,</span> <span class="s">&#39;namely&#39;</span><span class="p">,</span> <span class="s">&#39;nd&#39;</span><span class="p">,</span> <span class="s">&#39;near&#39;</span><span class="p">,</span> <span class="s">&#39;nearly&#39;</span><span class="p">,</span> <span class="s">&#39;necessary&#39;</span><span class="p">,</span> <span class="s">&#39;need&#39;</span><span class="p">,</span> <span class="s">&#39;needs&#39;</span><span class="p">,</span> <span class="s">&#39;neither&#39;</span><span class="p">,</span> <span class="s">&#39;never&#39;</span><span class="p">,</span> <span class="s">&#39;nevertheless&#39;</span><span class="p">,</span> <span class="s">&#39;new&#39;</span><span class="p">,</span> <span class="s">&#39;next&#39;</span><span class="p">,</span> <span class="s">&#39;nine&#39;</span><span class="p">,</span> <span class="s">&#39;no&#39;</span><span class="p">,</span> <span 
class="s">&#39;nobody&#39;</span><span class="p">,</span> <span class="s">&#39;non&#39;</span><span class="p">,</span> <span class="s">&#39;none&#39;</span><span class="p">,</span> <span class="s">&#39;noone&#39;</span><span class="p">,</span> <span class="s">&#39;nor&#39;</span><span class="p">,</span> <span class="s">&#39;normally&#39;</span><span class="p">,</span> <span class="s">&#39;not&#39;</span><span class="p">,</span> <span class="s">&#39;nothing&#39;</span><span class="p">,</span> <span class="s">&#39;novel&#39;</span><span class="p">,</span> <span class="s">&#39;now&#39;</span><span class="p">,</span> <span class="s">&#39;nowhere&#39;</span><span class="p">,</span> <span class="s">&#39;obviously&#39;</span><span class="p">,</span> <span class="s">&#39;of&#39;</span><span class="p">,</span> <span class="s">&#39;off&#39;</span><span class="p">,</span> <span class="s">&#39;often&#39;</span><span class="p">,</span> <span class="s">&#39;oh&#39;</span><span class="p">,</span> <span class="s">&#39;ok&#39;</span><span class="p">,</span> <span class="s">&#39;okay&#39;</span><span class="p">,</span> <span class="s">&#39;old&#39;</span><span class="p">,</span> <span class="s">&#39;on&#39;</span><span class="p">,</span> <span class="s">&#39;once&#39;</span><span class="p">,</span> <span class="s">&#39;one&#39;</span><span class="p">,</span> <span class="s">&#39;ones&#39;</span><span class="p">,</span> <span class="s">&#39;only&#39;</span><span class="p">,</span> <span class="s">&#39;onto&#39;</span><span class="p">,</span> <span class="s">&#39;or&#39;</span><span class="p">,</span> <span class="s">&#39;other&#39;</span><span class="p">,</span> <span class="s">&#39;others&#39;</span><span class="p">,</span> <span class="s">&#39;otherwise&#39;</span><span class="p">,</span> <span class="s">&#39;ought&#39;</span><span class="p">,</span> <span class="s">&#39;our&#39;</span><span class="p">,</span> <span class="s">&#39;ours&#39;</span><span class="p">,</span> <span 
class="s">&#39;ourselves&#39;</span><span class="p">,</span> <span class="s">&#39;out&#39;</span><span class="p">,</span> <span class="s">&#39;outside&#39;</span><span class="p">,</span> <span class="s">&#39;over&#39;</span><span class="p">,</span> <span class="s">&#39;overall&#39;</span><span class="p">,</span> <span class="s">&#39;own&#39;</span><span class="p">,</span> <span class="s">&#39;particular&#39;</span><span class="p">,</span> <span class="s">&#39;particularly&#39;</span><span class="p">,</span> <span class="s">&#39;per&#39;</span><span class="p">,</span> <span class="s">&#39;perhaps&#39;</span><span class="p">,</span> <span class="s">&#39;placed&#39;</span><span class="p">,</span> <span class="s">&#39;please&#39;</span><span class="p">,</span> <span class="s">&#39;plus&#39;</span><span class="p">,</span> <span class="s">&#39;possible&#39;</span><span class="p">,</span> <span class="s">&#39;presumably&#39;</span><span class="p">,</span> <span class="s">&#39;probably&#39;</span><span class="p">,</span> <span class="s">&#39;provides&#39;</span><span class="p">,</span> <span class="s">&#39;que&#39;</span><span class="p">,</span> <span class="s">&#39;quite&#39;</span><span class="p">,</span> <span class="s">&#39;qv&#39;</span><span class="p">,</span> <span class="s">&#39;rather&#39;</span><span class="p">,</span> <span class="s">&#39;rd&#39;</span><span class="p">,</span> <span class="s">&#39;re&#39;</span><span class="p">,</span> <span class="s">&#39;really&#39;</span><span class="p">,</span> <span class="s">&#39;reasonably&#39;</span><span class="p">,</span> <span class="s">&#39;regarding&#39;</span><span class="p">,</span> <span class="s">&#39;regardless&#39;</span><span class="p">,</span> <span class="s">&#39;regards&#39;</span><span class="p">,</span> <span class="s">&#39;relatively&#39;</span><span class="p">,</span> <span class="s">&#39;respectively&#39;</span><span class="p">,</span> <span class="s">&#39;right&#3