Commits

yanchuan sim committed fa2654c

added stemmer to documentation

Files changed (10)

docs/html/_modules/ycutils/tokenize.html

   <span class="k">return</span> <span class="n">sents</span>
 <span class="c">#end def</span>
 </div>
-<span class="k">def</span> <span class="nf">filter_stopwords</span><span class="p">(</span><span class="n">tokens</span><span class="p">,</span> <span class="n">my_stopwords</span><span class="o">=</span><span class="bp">None</span><span class="p">):</span>
+<div class="viewcode-block" id="filter_stopwords"><a class="viewcode-back" href="../../tokenize.html#ycutils.tokenize.filter_stopwords">[docs]</a><span class="k">def</span> <span class="nf">filter_stopwords</span><span class="p">(</span><span class="n">tokens</span><span class="p">,</span> <span class="n">my_stopwords</span><span class="o">=</span><span class="bp">None</span><span class="p">):</span>
   <span class="sd">&quot;&quot;&quot;Filter stopwords from a list of tokens.</span>
 
 <span class="sd">  :param tokens: a list of tokens.</span>
 
   <span class="k">return</span> <span class="nb">filter</span><span class="p">(</span><span class="k">lambda</span> <span class="n">w</span><span class="p">:</span> <span class="n">w</span> <span class="ow">not</span> <span class="ow">in</span> <span class="n">my_stopwords</span><span class="p">,</span> <span class="n">tokens</span><span class="p">)</span>
 <span class="c">#end def</span>
-
-<span class="k">def</span> <span class="nf">ngram_tokens</span><span class="p">(</span><span class="n">tokens</span><span class="p">,</span> <span class="n">n</span><span class="p">,</span> <span class="n">sep_char</span><span class="o">=</span><span class="s">&#39;_&#39;</span><span class="p">,</span> <span class="n">multiplier</span><span class="o">=</span><span class="bp">None</span><span class="p">):</span>
+</div>
+<div class="viewcode-block" id="ngram_tokens"><a class="viewcode-back" href="../../tokenize.html#ycutils.tokenize.ngram_tokens">[docs]</a><span class="k">def</span> <span class="nf">ngram_tokens</span><span class="p">(</span><span class="n">tokens</span><span class="p">,</span> <span class="n">n</span><span class="p">,</span> <span class="n">sep_char</span><span class="o">=</span><span class="s">&#39;_&#39;</span><span class="p">,</span> <span class="n">multiplier</span><span class="o">=</span><span class="bp">None</span><span class="p">):</span>
   <span class="sd">&quot;&quot;&quot;Generate list of n-grams from a sequence of tokens.</span>
 
 <span class="sd">  :param tokens: a sequence of tokens.</span>
 
   <span class="k">return</span> <span class="n">ngrams</span>
 <span class="c">#end def</span>
-</pre></div>
+</pre></div></div>
 
           </div>
         </div>
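
The viewcode markup above shows only the wrappers around the two functions. For orientation, here is a minimal sketch of the documented behaviour, not the library's actual implementation: the filter() call for filter_stopwords is copied from the source above, while the ngram_tokens body assumes n-grams are runs of n consecutive tokens joined by sep_char.

def filter_stopwords_sketch(tokens, my_stopwords):
  # keep only tokens that are not in the stopword list (mirrors the filter() call above)
  return filter(lambda w: w not in my_stopwords, tokens)

def ngram_tokens_sketch(tokens, n, sep_char='_'):
  # assumed behaviour: join every run of n consecutive tokens with sep_char
  return [sep_char.join(tokens[i:i + n]) for i in xrange(len(tokens) - n + 1)]

print ngram_tokens_sketch(['new', 'york', 'city'], 2)   # ['new_york', 'york_city']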

docs/html/_sources/tokenize.txt

 Module methods
 ==============
 .. automodule:: ycutils.tokenize
-  :members: to_ascii, words, sentences, words_in_sentences, tag_tokens, TAG_EMPTY, TAG_WORD, TAG_NUM, TAG_PUNCT, TAG_TIME, TAG_PHONE, TAG_EMAIL, TAG_URL
+  :members: to_ascii, words, sentences, words_in_sentences, tag_tokens, filter_stopwords, ngram_tokens, stem_tokens, TAG_EMPTY, TAG_WORD, TAG_NUM, TAG_PUNCT, TAG_TIME, TAG_PHONE, TAG_EMAIL, TAG_URL
   :special-members: __DEFAULT_NORMALIZE__, __DEFAULT_TAG_LIST__

docs/html/bagofwords.html

 </dd></dl>
 
 <dl class="method">
+<dt id="ycutils.bagofwords.BOW.to_sentence">
+<tt class="descname">to_sentence</tt><big>(</big><em>sort=False</em><big>)</big><a class="headerlink" href="#ycutils.bagofwords.BOW.to_sentence" title="Permalink to this definition">¶</a></dt>
+<dd><p>Returns a sentence whose content is equivalent to the <a class="reference internal" href="#ycutils.bagofwords.BOW" title="ycutils.bagofwords.BOW"><tt class="xref py py-class docutils literal"><span class="pre">BOW</span></tt></a> object, assuming counts are non-negative integers.</p>
+<table class="docutils field-list" frame="void" rules="none">
+<col class="field-name" />
+<col class="field-body" />
+<tbody valign="top">
+<tr class="field-odd field"><th class="field-name">Returns:</th><td class="field-body">a sentence string.</td>
+</tr>
+</tbody>
+</table>
+</dd></dl>
+
+<dl class="method">
 <dt id="ycutils.bagofwords.BOW.to_wc_string">
 <tt class="descname">to_wc_string</tt><big>(</big><big>)</big><a class="reference internal" href="_modules/ycutils/bagofwords.html#BOW.to_wc_string"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#ycutils.bagofwords.BOW.to_wc_string" title="Permalink to this definition">¶</a></dt>
 <dd><p>Format the <a class="reference internal" href="#ycutils.bagofwords.BOW" title="ycutils.bagofwords.BOW"><tt class="xref py py-class docutils literal"><span class="pre">BOW</span></tt></a> object in a <tt class="docutils literal"><span class="pre">word:count</span></tt> formatted string which looks like <tt class="docutils literal"><span class="pre">word1:count1</span> <span class="pre">word2:count2</span> <span class="pre">...</span></tt>.</p>
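
As a hedged illustration of the new method: the sketch below assumes BOW() can be constructed without arguments and that to_sentence() repeats each token according to its count, per the equivalence note above; the exact output format is not shown in this commit.

from ycutils.bagofwords import BOW

bow = BOW()
bow.add_tokens(['hello', 'world', 'hello'])
print bow.to_sentence(sort=True)   # presumably "hello hello world"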

docs/html/genindex.html

   </dt>
 
       
+  <dt><a href="tokenize.html#ycutils.tokenize.filter_stopwords">filter_stopwords() (in module ycutils.tokenize)</a>
+  </dt>
+
+      
   <dt><a href="urls/printable.html#ycutils.urls.printable.PrintableURL.find">find() (ycutils.urls.printable.PrintableURL method)</a>
   </dt>
 
 <table style="width: 100%" class="indextable genindextable"><tr>
   <td style="width: 33%" valign="top"><dl>
       
+  <dt><a href="tokenize.html#ycutils.tokenize.ngram_tokens">ngram_tokens() (in module ycutils.tokenize)</a>
+  </dt>
+
+  </dl></td>
+  <td style="width: 33%" valign="top"><dl>
+      
   <dt><a href="bagofwords.html#ycutils.bagofwords.BOW.normalize">normalize() (ycutils.bagofwords.BOW method)</a>
   </dt>
 
   <dt><a href="urls.html#ycutils.urls.sort_key">sort_key() (in module ycutils.urls)</a>
   </dt>
 
+      
+  <dt><a href="tokenize.html#ycutils.tokenize.stem_tokens">stem_tokens() (in module ycutils.tokenize)</a>
+  </dt>
+
   </dl></td>
 </tr></table>
 
   <dt><a href="tokenize.html#ycutils.tokenize.TAG_WORD">TAG_WORD (in module ycutils.tokenize)</a>
   </dt>
 
-  </dl></td>
-  <td style="width: 33%" valign="top"><dl>
       
   <dt><a href="tfidf.html#ycutils.tfidf.TFIDF">TFIDF (class in ycutils.tfidf)</a>
   </dt>
 
+  </dl></td>
+  <td style="width: 33%" valign="top"><dl>
       
   <dt><a href="bagofwords.html#ycutils.bagofwords.Document.title">title (ycutils.bagofwords.Document attribute)</a>
   </dt>
 
       </dl></dd>
       
+  <dt><a href="bagofwords.html#ycutils.bagofwords.BOW.to_sentence">to_sentence() (ycutils.bagofwords.BOW method)</a>
+  </dt>
+
+      
   <dt><a href="bagofwords.html#ycutils.bagofwords.BOW.to_wc_string">to_wc_string() (ycutils.bagofwords.BOW method)</a>
   </dt>
 

docs/html/objects.inv

Binary file modified.

docs/html/searchindex.js

Regenerated minified search index (adds entries for the new tokenize functions and BOW.to_sentence); full diff omitted.

docs/html/tokenize.html

 
 <dl class="function">
 <dt id="ycutils.tokenize.words">
-<tt class="descclassname">ycutils.tokenize.</tt><tt class="descname">words</tt><big>(</big><em>text, strip_unicode=False, normalize=['case', 'consecutive', 'phone', 'time', 'url', 'email', 'number', 'money', 'punct-del', 'hyphen-split', 'clitics-del', 'neg-clitics-keep'], tag_list=['phone', 'time', 'url', 'email', 'number', 'money'], filter_stopwords=False, not_punctuations='', return_tags=False, process_token=None</em><big>)</big><a class="reference internal" href="_modules/ycutils/tokenize.html#words"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#ycutils.tokenize.words" title="Permalink to this definition">¶</a></dt>
+<tt class="descclassname">ycutils.tokenize.</tt><tt class="descname">words</tt><big>(</big><em>text, strip_unicode=False, normalize=['case', 'consecutive', 'phone', 'time', 'month', 'url', 'email', 'number', 'money', 'punct-del', 'hyphen-split', 'clitics-del', 'neg-clitics-keep'], tag_list=['phone', 'time', 'month', 'url', 'email', 'number', 'money'], filter_stopwords=False, not_punctuations='', return_tags=False, process_token=None</em><big>)</big><a class="reference internal" href="_modules/ycutils/tokenize.html#words"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#ycutils.tokenize.words" title="Permalink to this definition">¶</a></dt>
 <dd><p>Tokenize a given string into individual words. This tokenizer is based on a couple of regexes. It first splits a sentence by whitespace into tokens, and tries to identify (tag) each token with a category (through <a class="reference internal" href="#ycutils.tokenize.tag_tokens" title="ycutils.tokenize.tag_tokens"><tt class="xref py py-meth docutils literal"><span class="pre">tag_tokens()</span></tt></a>), for example url, email, number, etc. One can configure the types of tags to check for by specifying them in the <cite>tag_list</cite> argument, and the categories to normalize in the <cite>normalize</cite> argument.</p>
 <p>Besides normalizing for categories, <tt class="xref py py-attr docutils literal"><span class="pre">normalize</span></tt> contains options that specify how we deal with punctuation, clitics, negation clitics and case.</p>
 <p id="normalization-options">Below is a summary of available normalization options:</p>
 <li><tt class="docutils literal"><span class="pre">email</span></tt>: Replace email addresses with <tt class="docutils literal"><span class="pre">__EMAIL__</span></tt>.</li>
 <li><tt class="docutils literal"><span class="pre">number</span></tt>: Replace numbers with <tt class="docutils literal"><span class="pre">__NUM__</span></tt>.</li>
 <li><tt class="docutils literal"><span class="pre">money</span></tt>: Replace currency with <tt class="docutils literal"><span class="pre">__MONEY__</span></tt>.</li>
+<li><tt class="docutils literal"><span class="pre">month</span></tt>: Replace month names with <tt class="docutils literal"><span class="pre">__MONTH__</span></tt>.</li>
 <li><tt class="docutils literal"><span class="pre">punct-split</span></tt>: Separate punctuations into their own tokens. However, consecutive punctuations will be considered as a single token.</li>
 <li><tt class="docutils literal"><span class="pre">punct-del</span></tt>: Remove all punctuations except hyphens, dashes, single quotes and those specified in <tt class="xref py py-attr docutils literal"><span class="pre">not_punctuations</span></tt>.</li>
 <li><tt class="docutils literal"><span class="pre">hyphen-split</span></tt>: Separate hyphenated tokens into individual tokens. Hyphens are removed in the process.</li>
 
 <dl class="function">
 <dt id="ycutils.tokenize.words_in_sentences">
-<tt class="descclassname">ycutils.tokenize.</tt><tt class="descname">words_in_sentences</tt><big>(</big><em>sents, strip_unicode=False, normalize=['case', 'consecutive', 'phone', 'time', 'url', 'email', 'number', 'money', 'punct-del', 'hyphen-split', 'clitics-del', 'neg-clitics-keep'], tag_list=['phone', 'time', 'url', 'email', 'number', 'money'], filter_stopwords=False, not_punctuations='', return_tags=False, process_token=None</em><big>)</big><a class="reference internal" href="_modules/ycutils/tokenize.html#words_in_sentences"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#ycutils.tokenize.words_in_sentences" title="Permalink to this definition">¶</a></dt>
+<tt class="descclassname">ycutils.tokenize.</tt><tt class="descname">words_in_sentences</tt><big>(</big><em>sents, strip_unicode=False, normalize=['case', 'consecutive', 'phone', 'time', 'month', 'url', 'email', 'number', 'money', 'punct-del', 'hyphen-split', 'clitics-del', 'neg-clitics-keep'], tag_list=['phone', 'time', 'month', 'url', 'email', 'number', 'money'], filter_stopwords=False, not_punctuations='', return_tags=False, process_token=None</em><big>)</big><a class="reference internal" href="_modules/ycutils/tokenize.html#words_in_sentences"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#ycutils.tokenize.words_in_sentences" title="Permalink to this definition">¶</a></dt>
 <dd><p>Tokenize a list of sentences into words. Empty sentences are removed.</p>
 <table class="docutils field-list" frame="void" rules="none">
 <col class="field-name" />
 
 <dl class="function">
 <dt id="ycutils.tokenize.tag_tokens">
-<tt class="descclassname">ycutils.tokenize.</tt><tt class="descname">tag_tokens</tt><big>(</big><em>tokens, tag_list=['phone', 'time', 'url', 'email', 'number', 'money']</em><big>)</big><a class="reference internal" href="_modules/ycutils/tokenize.html#tag_tokens"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#ycutils.tokenize.tag_tokens" title="Permalink to this definition">¶</a></dt>
+<tt class="descclassname">ycutils.tokenize.</tt><tt class="descname">tag_tokens</tt><big>(</big><em>tokens, tag_list=['phone', 'time', 'month', 'url', 'email', 'number', 'money']</em><big>)</big><a class="reference internal" href="_modules/ycutils/tokenize.html#tag_tokens"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#ycutils.tokenize.tag_tokens" title="Permalink to this definition">¶</a></dt>
 <dd><p>Given a list of tokens, try to assign a category tag to each of them using prespecified regular expressions.</p>
 <table class="docutils field-list" frame="void" rules="none">
 <col class="field-name" />
 </table>
 </dd></dl>
 
+<dl class="function">
+<dt id="ycutils.tokenize.filter_stopwords">
+<tt class="descclassname">ycutils.tokenize.</tt><tt class="descname">filter_stopwords</tt><big>(</big><em>tokens</em>, <em>my_stopwords=None</em><big>)</big><a class="reference internal" href="_modules/ycutils/tokenize.html#filter_stopwords"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#ycutils.tokenize.filter_stopwords" title="Permalink to this definition">¶</a></dt>
+<dd><p>Filter stopwords from a list of tokens.</p>
+<table class="docutils field-list" frame="void" rules="none">
+<col class="field-name" />
+<col class="field-body" />
+<tbody valign="top">
+<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
+<li><strong>tokens</strong> &#8211; a list of tokens.</li>
+<li><strong>my_stopwords</strong> &#8211; <p>optional custom stopword list. Defaults to the MySQL stopword list found at <a class="reference external" href="http://dev.mysql.com/doc/refman/5.5/en/fulltext-stopwords.html">http://dev.mysql.com/doc/refman/5.5/en/fulltext-stopwords.html</a>.</p>
+</li>
+</ul>
+</td>
+</tr>
+<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first">a list of tokens with stopwords removed.</p>
+</td>
+</tr>
+<tr class="field-odd field"><th class="field-name">Return type:</th><td class="field-body"><p class="first last">list</p>
+</td>
+</tr>
+</tbody>
+</table>
+</dd></dl>
+
+<dl class="function">
+<dt id="ycutils.tokenize.ngram_tokens">
+<tt class="descclassname">ycutils.tokenize.</tt><tt class="descname">ngram_tokens</tt><big>(</big><em>tokens</em>, <em>n</em>, <em>sep_char='_'</em><big>)</big><a class="reference internal" href="_modules/ycutils/tokenize.html#ngram_tokens"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#ycutils.tokenize.ngram_tokens" title="Permalink to this definition">¶</a></dt>
+<dd><p>Generate list of n-grams from a sequence of tokens.</p>
+<table class="docutils field-list" frame="void" rules="none">
+<col class="field-name" />
+<col class="field-body" />
+<tbody valign="top">
+<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
+<li><strong>tokens</strong> &#8211; a sequence of tokens.</li>
+<li><strong>n</strong> &#8211; the number of tokens to conjoin. If given a list, n-grams are computed for each <cite>n</cite> value in turn.</li>
+<li><strong>sep_char</strong> &#8211; separation character to use between consecutive tokens.</li>
+</ul>
+</td>
+</tr>
+<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first">a list of n-gram tokens.</p>
+</td>
+</tr>
+<tr class="field-odd field"><th class="field-name">Return type:</th><td class="field-body"><p class="first last">list</p>
+</td>
+</tr>
+</tbody>
+</table>
+</dd></dl>
+
+<dl class="function">
+<dt id="ycutils.tokenize.stem_tokens">
+<tt class="descclassname">ycutils.tokenize.</tt><tt class="descname">stem_tokens</tt><big>(</big><em>tokens</em><big>)</big><a class="headerlink" href="#ycutils.tokenize.stem_tokens" title="Permalink to this definition">¶</a></dt>
+<dd><p>Apply the Porter stemmer to a list of tokens.</p>
+<p>Uses the Python implementation of the Porter stemmer by Vivake Gupta (<a class="reference external" href="http://tartarus.org/martin/PorterStemmer/python.txt">http://tartarus.org/martin/PorterStemmer/python.txt</a>).</p>
+<table class="docutils field-list" frame="void" rules="none">
+<col class="field-name" />
+<col class="field-body" />
+<tbody valign="top">
+<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>tokens</strong> &#8211; a sequence of tokens.</td>
+</tr>
+<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">a corresponding list of Porter steemed tokens.</td>
+</tr>
+<tr class="field-odd field"><th class="field-name">Return type:</th><td class="field-body">list</td>
+</tr>
+</tbody>
+</table>
+</dd></dl>
+
 <dl class="data">
 <dt id="ycutils.tokenize.TAG_EMPTY">
 <tt class="descclassname">ycutils.tokenize.</tt><tt class="descname">TAG_EMPTY</tt><em class="property"> = 0</em><a class="headerlink" href="#ycutils.tokenize.TAG_EMPTY" title="Permalink to this definition">¶</a></dt>

docs/yc-pyutils.pdf

Binary file modified.

sphinx/tokenize.rst

 Module methods
 ==============
 .. automodule:: ycutils.tokenize
-  :members: to_ascii, words, sentences, words_in_sentences, tag_tokens, TAG_EMPTY, TAG_WORD, TAG_NUM, TAG_PUNCT, TAG_TIME, TAG_PHONE, TAG_EMAIL, TAG_URL
+  :members: to_ascii, words, sentences, words_in_sentences, tag_tokens, filter_stopwords, ngram_tokens, stem_tokens, TAG_EMPTY, TAG_WORD, TAG_NUM, TAG_PUNCT, TAG_TIME, TAG_PHONE, TAG_EMAIL, TAG_URL
   :special-members: __DEFAULT_NORMALIZE__, __DEFAULT_TAG_LIST__

ycutils/tokenize.py

 
 stemmer = PorterStemmer()
 def stem_tokens(tokens):
+  """Apply the Porter stemmer to a list of tokens.
+
+  Uses the Python implementation of the Porter stemmer by Vivake Gupta: `http://tartarus.org/martin/PorterStemmer/python.txt <http://tartarus.org/martin/PorterStemmer/python.txt>`_.
+
+  :param tokens: a sequence of tokens.
+  :returns: a corresponding list of Porter-stemmed tokens.
+  :rtype: list"""
+
   return map(lambda w: stemmer.stem(w, 0, len(w)-1), tokens)
+#end def
+
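
The Tartarus Porter stemmer's stem(word, i, j) stems the substring word[i..j] inclusive, which is why each whole token is passed as stemmer.stem(w, 0, len(w)-1) above. A quick interactive check (the outputs shown are the standard Porter results and are illustrative):

>>> from ycutils.tokenize import stem_tokens
>>> stem_tokens(['running', 'flies', 'easily'])
['run', 'fli', 'easili']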