Commits

Ruben Martinez-Cantin committed dd96879

Initial commit

Files changed (8)

File beautifyj.css

+body {
+	margin: 0;
+	padding: 0;
+	font-family: arial;
+	background-color: #F6F3E5;
+}
+.as {
+	font-size: 12px;
+	color: #900;
+}
+.ts {
+	font-weight: bold;
+	font-size: 14px;
+}
+.tt {
+	color: #009;
+	font-size: 13px;
+}
+h1 {
+	font-size: 20px;
+	padding: 0;
+	margin: 0;
+}
+#titdiv {
+	width: 100%;
+	height: 90px;
+	background-color: #840000;
+	color: white;
+
+	padding-top: 20px;
+	padding-left: 20px;
+
+	border-bottom: 1px solid #540000;
+}
+
+#maindiv {
+	width: 970px;
+	padding: 15px;
+	margin-left: auto;
+	margin-right: auto;
+
+	border-left: solid 1px #D6D3C5;
+	border-right: solid 1px #D6D3C5;
+
+	background-color: white;
+}
+
+.apaper {
+	margin-top: 25px;
+	min-height: 300px;
+}
+
+.paperdesc {
+	float: left;
+}
+
+.dllinks {
+	float: right;
+	text-align: right;
+}
+
+#titdiv a:link{ color: white; }
+#titdiv a:visited{ color: white; }
+
+#maindiv a:link{ color: #666; }
+#maindiv a:visited{ color: #600; }
+
+.t0 { color: #000;}
+.t1 { color: #C00;}
+.t2 { color: #0C0;}
+.t3 { color: #00C;}
+.t4 { color: #AA0;}
+.t5 { color: #C0C;}
+.t6 { color: #0CC;}
+
+.topicchoice {
+	border: 2px solid black;
+	border-radius: 10px;
+	padding: 4px;
+	cursor: pointer;
+	text-decoration: underline;
+}
+
+#explanation {
+	background-color: #CFC;
+	border-radius: 5px;
+	color: black;
+	padding: 5px;
+	text-align: center;
+}
+
+#sortoptions {
+	text-align: center;
+	padding: 10px;
+}
+
+.sim {
+	cursor: pointer;
+	text-decoration: underline;
+}

File generatenicelda.py

+# creates the nice .html page
+# assumes that pdftowordcloud.py, pdftothumbs.py and scrape.py were already run
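+#
+# Expected inputs, as produced by the other scripts in this commit:
+#   papers.p   - paper id -> (title, authors), dumped by scrape.py
+#   topwords.p - paper id -> top 100 (word, count) pairs, dumped by pdftowordcloud.py
+#   ldaphi.p   - (K, topic-word matrix phi, vocabulary list), dumped by the
+#                bundled LDA script after running it on allpapers.txt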
+
+import cPickle as pickle
+from numpy import argmax, zeros, ones
+from math import log
+
+def generate_page(pdflinkbase=None, bibtexlinkbase=None,
+		  titlepage=None, urlold=None):
+    use_images = False
+
+    # load the pickle of papers scraped from the HTML page (result of
+    # scrape.py)
+    paperdict = pickle.load(open( "papers.p", "rb" ))
+    print "Loaded %d papers from papers.p" % (len(paperdict), )
+
+    # load the top word frequencies (result of pdftowordcloud.py)
+    topdict = pickle.load(open("topwords.p", "rb"))
+    print "Loaded %d entries from topwords.p" % (len(topdict), )
+
+    # load LDA words and invert their dictionary list
+    (ldak, phi, voca) = pickle.load(open("ldaphi.p", "rb"))
+    wtoid = {}
+    for i,w in enumerate(voca):
+	wtoid[w] = i
+
+    # compute pairwise distances between papers based on top words
+    # using something similar to tfidf, but simpler. No vectors
+    # will be normalized or otherwise harmed during this computation.
+    # first compute inverse document frequency (idf)
+    N = len(paperdict) # number of documents
+    idf = {}
+    for pid,p in enumerate(paperdict):
+	tw = topdict.get(p, []) # top 100 words
+	ts = [x[0] for x in tw]
+	for t in ts:
+	    idf[t] = idf.get(t, 0.0) + 1.0
+    for t in idf:
+	idf[t] = log(N/idf[t], 2)
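+    # (i.e. idf[t] = log2(N / number of top-100 lists that contain t))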
+
+    # now compute weighted intersection
+    ds = zeros((N, N))
+    for pid,p in enumerate(paperdict):
+	tw = topdict.get(p, [])
+	w = set([x[0] for x in tw]) # just the words
+	accum = 0.0
+
+	for pid2, p2 in enumerate(paperdict):
+	    if pid2<pid: continue
+	    tw2= topdict.get(p2, [])
+	    w2 = set([x[0] for x in tw2]) # just the words
+
+	    # tw and tw2 are top 100 words as (word, count) in both
+	    # papers. Compute the intersection!
+	    winter = w.intersection(w2)
+	    score = sum([idf[x] for x in winter])
+	    ds[pid, pid2] = score
+	    ds[pid2, pid] = score
+
+    # build up the string for html
+    html = open("nipsnice_template.html", "r").read()
+    s = ""
+    js = "ldadist=["
+    js2 = "pairdists=["
+    for pid, p in enumerate(paperdict):
+
+	# get title, author
+	title, author = paperdict[p]
+
+	# create the tags string
+	topwords = topdict.get(p, [])
+	# some top100 words may not have been computed during LDA so
+	# exclude them if they aren't found in wtoid
+	t = [x[0] for x in topwords if x[0] in wtoid]
+	# assign each word to class
+	tid = [int(argmax(phi[:, wtoid[x]])) for x in t] 
+	tcat = ""
+	for k in range(ldak):
+	    ws = [x for i,x in enumerate(t) if tid[i]==k]
+	    tcat += '[<span class="t'+ `k` + '">' + ", ".join(ws) + '</span>] '
+	
+	# count up the complete distribution for the entire document
+	# and build up a javascript vector storing all this
+	svec = zeros(ldak)
+	for w in t: 
+	    svec += phi[:, wtoid[w]]
+	if svec.sum() == 0: 
+	    svec = ones(ldak)/ldak;
+	else: 
+	    svec = svec / svec.sum() # normalize
+	nums = [0 for k in range(ldak)]
+	for k in range(ldak): 
+	    nums[k] = "%.2f" % (float(svec[k]), )
+	
+	js += "[" + ",".join(nums) + "]"
+	if not pid == len(paperdict)-1: js += ","
+
+	# dump similarities of this document to others
+	scores = ["%.2f" % (float(ds[pid, i]),) for i in range(N)]
+	js2 += "[" + ",".join(scores) + "]"
+	if not pid == len(paperdict)-1: js2 += ","
+
+	# get path to thumbnails for this paper
+	thumbpath = "thumbs/%s.pdf.jpg" % (p, )
+
+	# get links to the PDF and bibtex pages on the JMLR servers
+#	if pdflinkbase: pdflink = pdflinkbase+ "%s/%s.pdf" % (p,p)
+#	if bibtexlinkbase:bibtexlink = bibtexlinkbase + "%s.html" % (p, )
+
+	bibtexlink = bibtexlinkbase % (p, )
+	pdflink = pdflinkbase % (p,p)
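+	# (bibtexlinkbase and pdflinkbase are printf-style patterns; see the
+	# JMLR URLs passed in under __main__ at the bottom of this file)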
+
+	s += """
+
+	<div class="apaper" id="pid%d">
+	<div class="paperdesc">
+		<span class="ts">%s</span><br />
+		<span class="as">%s</span><br /><br />
+	</div>
+	<div class="dllinks">
+		<a href="%s">[pdf] </a>
+		<a href="%s">[bibtex] </a>
+		<span class="sim" id="sim%d">[rank by tf-idf similarity to this]</span>
+	</div>
+	<img src = "%s"><br />
+	<span class="tt">%s</span>
+	</div>
+
+	""" % (pid, title, author, pdflink, bibtexlink, pid, thumbpath, tcat)
+
+    newhtml = html.replace("TITLEPAGE", titlepage)
+    newhtml = newhtml.replace("URL_MAIN", urlold)
+    newhtml = newhtml.replace("RESULTTABLE", s)
+
+    js += "]"
+    newhtml = newhtml.replace("LOADDISTS", js)
+
+    js2 += "]"
+    newhtml = newhtml.replace("PAIRDISTS", js2)
+
+    with open("nipsnice.html", "w") as f:
+        f.write(newhtml)
+
+if __name__ == "__main__":
+    generate_page(bibtexlinkbase="http://www.jmlr.org/papers/v13/%s.html",
+		  pdflinkbase="http://www.jmlr.org/papers/volume13/%s/%s.pdf",
+		  titlepage="JMLR Vol 13",
+		  urlold="http://jmlr.csail.mit.edu/papers/v13/")

File scrape.py

+from BeautifulSoup import BeautifulSoup
+import urllib
+import cPickle as pickle
+
+class JournalParser:
+    def __init__(self, url):
+        self.url = url
+        html_page = urllib.urlopen(url)
+        self.soup = BeautifulSoup(html_page)
+
+    def parse_titles_jmlr(self):
+        titles = [title.find(text=True).strip()
+                  for title in self.soup.findAll('dt')]
+        authors = [auth.find(text=True).strip()
+                   for auth in self.soup.findAll('dd')]
+        self.titles2dict(titles,authors)
+        
+    def parse_titles_rss(self):
+        titles = [link.find(text=True) for link in self.soup.findAll('a')
+                  if link.get('href').strip('phtml.').isdigit()]
+
+        authors = [name.find(text=True) for name in self.soup.findAll('i')]
+        self.titles2dict(titles,authors)
+
+    def titles2dict(self,titles,authors):
+        papers = zip(titles, authors)
+        linkpdfs = [link.get('href') for link in self.soup.findAll('a')
+                         if link.get('href').endswith(".pdf")]
+
+        idpdfs = [link.split('/')[-1][:-4] for link in linkpdfs]
+        self.linkids = zip(linkpdfs, idpdfs)
+        self.dictpapers = dict(zip(idpdfs,papers))
+
+
+    def get_pdf_files(self):
+        for link,id_paper in self.linkids:
+            savename = 'pdfs/' + id_paper + '.pdf'
+            print "Downloading:", link, "in", savename
+            urllib.urlretrieve(link,savename)
+
+    def dump_author_data(self):
+        for key in self.dictpapers:
+            print key, "=", self.dictpapers[key]
+        
+        # dump a dictionary indexed by paper id that points to (title,
+        # authors) tuple
+        pickle.dump(self.dictpapers, open("papers.p", "wb"))
+
+
+if __name__ == "__main__":
+    parser = JournalParser("http://jmlr.csail.mit.edu/papers/v13/")
+    parser.parse_titles_jmlr()
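+    # note: parser.get_pdf_files() would download the papers into pdfs/ (which
+    # pdftowordcloud.py expects); it is not called here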
+    parser.dump_author_data()
+

File lda.py

+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+# Latent Dirichlet Allocation + collapsed Gibbs sampling
+# This code is available under the MIT License.
+# (c)2010-2011 Nakatani Shuyo / Cybozu Labs Inc.
+
+import numpy
+import cPickle as pickle
+
+class LDA:
+    def __init__(self, K, alpha, beta, docs, V, smartinit=True):
+        self.K = K
+        self.alpha = alpha # parameter of topics prior
+        self.beta = beta   # parameter of words prior
+        self.docs = docs
+        self.V = V
+
+        self.z_m_n = [] # topics of words of documents
+        self.n_m_z = numpy.zeros((len(self.docs), K)) + alpha     # word count of each document and topic
+        self.n_z_t = numpy.zeros((K, V)) + beta # word count of each topic and vocabulary
+        self.n_z = numpy.zeros(K) + V * beta    # word count of each topic
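+        # the Dirichlet priors alpha and beta are folded into these count
+        # arrays, so the collapsed Gibbs conditional used below is simply
+        # n_z_t[:, t] * n_m_z[m] / n_z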
+
+        self.N = 0
+        for m, doc in enumerate(docs):
+            self.N += len(doc)
+            z_n = []
+            for t in doc:
+                if smartinit:
+                    p_z = self.n_z_t[:, t] * self.n_m_z[m] / self.n_z
+                    z = numpy.random.multinomial(1, p_z / p_z.sum()).argmax()
+                else:
+                    z = numpy.random.randint(0, K)
+                z_n.append(z)
+                self.n_m_z[m, z] += 1
+                self.n_z_t[z, t] += 1
+                self.n_z[z] += 1
+            self.z_m_n.append(numpy.array(z_n))
+
+    def inference(self):
+        """learning once iteration"""
+        for m, doc in enumerate(self.docs):
+            z_n = self.z_m_n[m]
+            n_m_z = self.n_m_z[m]
+            for n, t in enumerate(doc):
+                # discount for n-th word t with topic z
+                z = z_n[n]
+                n_m_z[z] -= 1
+                self.n_z_t[z, t] -= 1
+                self.n_z[z] -= 1
+
+                # sampling topic new_z for t
+                p_z = self.n_z_t[:, t] * n_m_z / self.n_z
+                new_z = numpy.random.multinomial(1, p_z / p_z.sum()).argmax()
+
+                # set z the new topic and increment counters
+                z_n[n] = new_z
+                n_m_z[new_z] += 1
+                self.n_z_t[new_z, t] += 1
+                self.n_z[new_z] += 1
+
+    def worddist(self):
+        """get topic-word distribution"""
+        return self.n_z_t / self.n_z[:, numpy.newaxis]
+
+    def perplexity(self, docs=None):
+        if docs is None: docs = self.docs
+        phi = self.worddist()
+        log_per = 0
+        N = 0
+        Kalpha = self.K * self.alpha
+        for m, doc in enumerate(docs):
+            theta = self.n_m_z[m] / (len(self.docs[m]) + Kalpha)
+            for w in doc:
+                log_per -= numpy.log(numpy.inner(phi[:,w], theta))
+            N += len(doc)
+        return numpy.exp(log_per / N)
+
+def lda_learning(lda, iteration, voca):
+    pre_perp = lda.perplexity()
+    print "initial perplexity=%f" % pre_perp
+    for i in range(iteration):
+        lda.inference()
+        perp = lda.perplexity()
+        print "-%d p=%f" % (i + 1, perp)
+        if pre_perp:
+            if pre_perp < perp:
+                output_word_topic_dist(lda, voca)
+                pre_perp = None
+            else:
+                pre_perp = perp
+    output_word_topic_dist(lda, voca)
+
+def output_word_topic_dist(lda, voca):
+    zcount = numpy.zeros(lda.K, dtype=int)
+    wordcount = [dict() for k in xrange(lda.K)]
+    for xlist, zlist in zip(lda.docs, lda.z_m_n):
+        for x, z in zip(xlist, zlist):
+            zcount[z] += 1
+            if x in wordcount[z]:
+                wordcount[z][x] += 1
+            else:
+                wordcount[z][x] = 1
+
+    phi = lda.worddist()
+    for k in xrange(lda.K):
+        print "\n-- topic: %d (%d words)" % (k, zcount[k])
+        for w in numpy.argsort(-phi[k])[:20]:
+            print "%s: %f (%d)" % (voca[w], phi[k,w], wordcount[k].get(w,0))
+
+    pickle.dump((lda.K, phi, voca.vocas), open("ldaphi.p", "wb"))
+
+def main():
+    import optparse
+    import vocabulary
+    parser = optparse.OptionParser()
+    parser.add_option("-f", dest="filename", help="corpus filename")
+    parser.add_option("-c", dest="corpus", help="using range of Brown corpus' files(start:end)")
+    parser.add_option("--alpha", dest="alpha", type="float", help="parameter alpha", default=0.5)
+    parser.add_option("--beta", dest="beta", type="float", help="parameter beta", default=0.5)
+    parser.add_option("-k", dest="K", type="int", help="number of topics", default=20)
+    parser.add_option("-i", dest="iteration", type="int", help="iteration count", default=100)
+    parser.add_option("-s", dest="smartinit", action="store_true", help="smart initialize of parameters", default=False)
+    parser.add_option("--stopwords", dest="stopwords", help="exclude stop words", action="store_true", default=False)
+    parser.add_option("--seed", dest="seed", type="int", help="random seed")
+    parser.add_option("--df", dest="df", type="int", help="threshold of document frequency to cut words", default=0)
+    (options, args) = parser.parse_args()
+    if not (options.filename or options.corpus): parser.error("need corpus filename(-f) or corpus range(-c)")
+
+    if options.filename:
+        corpus = vocabulary.load_file(options.filename)
+    else:
+        corpus = vocabulary.load_corpus(options.corpus)
+        if not corpus: parser.error("corpus range(-c) must be of the form 'start:end'")
+    if options.seed is not None:
+        numpy.random.seed(options.seed)
+
+    voca = vocabulary.Vocabulary(options.stopwords)
+    docs = [voca.doc_to_ids(doc) for doc in corpus]
+    if options.df > 0: docs = voca.cut_low_freq(docs, options.df)
+
+    lda = LDA(options.K, options.alpha, options.beta, docs, voca.size(), options.smartinit)
+    print "corpus=%d, words=%d, K=%d, a=%f, b=%f" % (len(corpus), len(voca.vocas), options.K, options.alpha, options.beta)
+
+    #import cProfile
+    #cProfile.runctx('lda_learning(lda, options.iteration, voca)', globals(), locals(), 'lda.profile')
+    lda_learning(lda, options.iteration, voca)
+
+if __name__ == "__main__":
+    main()

File nipsnice_template.html

+<html>
+<head>
+<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
+<title>TITLEPAGE</title>
+
+<link rel="stylesheet" type="text/css" href="beautifyj.css">
+<script src="jquery-1.8.3.min.js"></script>
+<script>
+
+// the line below gets filled in with the LDA topic distributions of the top
+// words for every paper
+LOADDISTS
+
+// this will be filled with pairwise scores between papers
+PAIRDISTS
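+
+// ldadist[i] holds the (assumed length-7) normalized topic histogram for paper i;
+// pairdists[i][j] is the symmetric tf-idf-weighted top-word overlap score between papers i and j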
+
+var choices = [0, 0, 0, 1, 1, 0, 0]; // default choices, random...
+var similarityMode = 0; // is the user currently looking at papers similar to some one paper?
+var similarTo = 0; // the index of query paper
+
+// given choices of topics to sort by, handle user interface stuff (i.e. show selection)
+function colorChoices() {
+	for(var i=0;i<choices.length;i++) {
+		if(choices[i] == 1) {
+			$("#tc"+i).css("background-color", "#EFE");
+			$("#tc"+i).css("border-color", "#575");
+		} else {
+			$("#tc"+i).css("background-color", "#FFF");
+			$("#tc"+i).css("border-color", "#FFF");
+		}
+	}
+}
+
+// this permutes the divs (one paper each) using a custom sorting function:
+// either by similarity to a query paper, or by a chi-squared distance between
+// each paper's topic histogram and the choices[] array
+// here we are guaranteed ldadist[] already sums to 1 for every paper
+function arrangeDivs() {
+	var rtable = $("#rtable");
+	var paperdivs = rtable.children(".apaper");
+
+	// normalize choices to sum to 1
+	var nn = choices.slice(0); // copy the array
+	var ss = 0.0;
+	for(var j=0;j<choices.length;j++) { ss += choices[j]; }
+	for(var j=0;j<choices.length;j++) { nn[j] = nn[j]/ss; }
+
+	paperdivs.detach().sort(function(a,b) {
+		var ixa = parseInt($(a).attr('id').substring(3));
+		var ixb = parseInt($(b).attr('id').substring(3));
+
+		if(similarityMode === 1) {
+			return pairdists[ixa][similarTo] < pairdists[ixb][similarTo] ? 1 : -1;
+		}
+
+		if(similarityMode === 0) {
+
+			// chi-squared kernel for the two histograms
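+			// (note: the raw choices[] vector is used here; the normalized
+			// copy nn[] above is only referenced by the commented-out alternatives)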
+			var accuma = 0;
+			var accumb = 0;
+			for(var i=0;i<7;i++) {
+				var ai= ldadist[ixa][i];
+				var bi= ldadist[ixb][i];
+				var ci= choices[i];
+				accuma += (ai-ci)*(ai-ci)/(ai+ci);
+				accumb += (bi-ci)*(bi-ci)/(bi+ci);
+			}		
+			return accuma > accumb ? 1 : -1;
+
+			/*
+			// vector distance. These are histograms... but let's pretend they aren't
+			var accuma = 0;
+			var accumb = 0;
+			for(var i=0;i<7;i++) {
+				var ai= ldadist[ixa][i];
+				var bi= ldadist[ixb][i];
+				var ci= nn[i];
+				accuma += (ai-ci)*(ai-ci);
+				accumb += (bi-ci)*(bi-ci);
+			}
+			return accuma > accumb ? 1 : -1;
+			*/
+
+			/*
+			// inner product distance
+			var accuma = 0;
+			var accumb = 0;
+			for(var i=0;i<7;i++) {
+				accuma += ldadist[ixa][i] * choices[i];
+				accumb += ldadist[ixb][i] * choices[i];
+			}
+			return accuma < accumb ? 1 : -1;
+			*/
+		}
+
+	});
+	rtable.append(paperdivs);
+}
+
+// when page loads...
+$(document).ready(function(){
+	
+	arrangeDivs();
+	colorChoices();
+
+	// user clicks on one of the Topic buttons
+	$(".topicchoice").click(function() {
+		similarityMode = 0; // make sure this is off
+		var tcid = parseInt($(this).attr('id').substring(2));
+		choices[tcid] = 1 - choices[tcid]; // toggle!
+		
+		colorChoices();
+		arrangeDivs();
+	});
+
+	// user clicks on "rank by tf-idf similarity to this" button for some paper
+	$(".sim").click(function() {
+		similarityMode = 1; // turn on similarity mode
+		for(var i=0;i<choices.length;i++) { choices[i] = 0; } // zero out choices
+		similarTo = parseInt($(this).attr('id').substring(3)); // store id of the paper clicked
+
+		colorChoices();
+		arrangeDivs();
+
+		// also scroll to top
+		$('html, body').animate({ scrollTop: 0 }, 'fast');
+	});
+});
+
+</script>
+
+</head>
+
+<body>
+
+<div id ="titdiv">
+<h1>TITLEPAGE</h1>
+created by <a href="http://webdiis.unizar.es/~rmcantin">rmcantin</a>
+as a remake of <a href="URL_MAIN">this</a>.<br />
+Based on an idea by <a href="https://twitter.com/karpathy">@karpathy</a><br/>
+source code on <a href="">bitbucket</a>
+</div>
+
+<div id="maindiv">
+<div id="explanation">Below every paper are the top 100 most-occurring words in that paper, colored according to an LDA topic model with k = 7.<br />
+	<div style="font-size: 12px;">(It looks like 0 = theory, 1 = reinforcement learning, 2 = graphical models, 3 = deep learning/vision, 4 = optimization, 5 = neuroscience, 6 = embeddings etc.) </div>
+</div>
+<div id="sortoptions">
+Toggle LDA topics to sort by:
+<span class="topicchoice t0" id="tc0">TOPIC0</span>
+<span class="topicchoice t1" id="tc1">TOPIC1</span>
+<span class="topicchoice t2" id="tc2">TOPIC2</span>
+<span class="topicchoice t3" id="tc3">TOPIC3</span>
+<span class="topicchoice t4" id="tc4">TOPIC4</span>
+<span class="topicchoice t5" id="tc5">TOPIC5</span>
+<span class="topicchoice t6" id="tc6">TOPIC6</span>
+</div>
+
+<!-- the keyword below will be replaced by content from the python script generatenicelda.py -->
+<div id="rtable">
+RESULTTABLE
+</div>
+
+</div>
+
+<br /><br /><br /><br /><br /><br />
+</body>
+
+</html>

File pdftowordcloud.py

+# go over all PDFs in pdfs/, get all the words from each, discard stop
+# words, count the frequencies of all remaining words, retain the top 100
+# for each PDF and dump a pickle of the results into topwords.p
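+# (also writes allpapers.txt, one line of filtered words per paper, as input
+# for the LDA step, and page-montage thumbnails into thumbs/)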
+
+import os
+import re
+import cPickle as pickle
+from string import punctuation
+from operator import itemgetter
+from pyPdf import PdfFileReader
+
+class PDFextractor:
+    def __init__(self):
+	self.N= 100 # how many top words to retain
+		
+	# load in stopwords (i.e. boring words, these we will ignore)
+	stopwords = open("stopwords.txt", "r").read().split()
+	self.stopwords = [x.strip(punctuation) for x in stopwords if len(x)>2]
+        
+    def get_files_local(self):
+	# get list of all PDFs in the local pdfs/ folder
+	self.relpath = "pdfs/"
+	allFiles = os.listdir(self.relpath)
+	self.pdfs = [x for x in allFiles if x.endswith(".pdf")]
+
+    def get_files_network(self):
+        pass
+
+    def get_words(self):
+	# go over every PDF, use pdftotext to get all words, discard
+	# boring ones, and count frequencies
+	topdict = {} # dict of paperid -> [(word, frequency),...]
+	with open("allpapers.txt", "w") as outf:
+            for i,f in enumerate(self.pdfs):
+		paperid = f[:-4]
+		fullpath = self.relpath + f
+
+		print "processing %s, %d/%d" % (paperid, i, len(self.pdfs))
+
+		# create text file
+		cmd = "pdftotext %s %s" % (fullpath, "out.txt")
+		print "EXEC: " + cmd
+		os.system(cmd)
+	    
+		# get all words in a giant list
+		txtlst = open("out.txt").read().split() 
+		# take only alphanumerics
+		words = [x.lower() for x in txtlst
+			 if re.match('^[\w-]+$', x) is not None] 
+		# remove stop words
+		words = [x for x in words
+			 if len(x)>2 and (not x in self.stopwords)]
+	    
+
+		# count up frequencies of all words
+		wcount = {} 
+		for w in words: wcount[w] = wcount.get(w, 0) + 1
+		top = sorted(wcount.iteritems(), key=itemgetter(1),
+			     reverse=True)[:self.N] # sort and take top N
+
+		topdict[paperid] = top # save to our dict
+
+		# For LDA: only keep words that occur at least 3 times (for
+		# efficiency)
+		words = [x for x in words if wcount[x] >= 3]
+
+		outf.write(" ".join(words))
+		outf.write("\n")
+
+	# dump to pickle
+	pickle.dump(topdict, open("topwords.p", "wb"))
+
+    def get_thumbs(self):
+        for i,f in enumerate(self.pdfs):
+            paperid = f[:-4]
+            fullpath = self.relpath + f
+
+            inputpdf = PdfFileReader(file(fullpath, "rb"))
+            numpages = inputpdf.getNumPages()
+            if numpages > 8:
+		l1 = numpages-6
+		l2 = numpages-2
+            else:
+		l1 = 2
+		l2 = 6
+            print "processing %s, %d/%d" % (paperid, i, len(self.pdfs))
+
+            # take the first two pages of the pdf ([0-1]) plus a few pages
+            # near the end (skipping the last ones, which are usually the
+            # references), tile them horizontally, use JPEG quality 80, and
+            # trim the borders of each image
+	
+            cmd = "montage %s[0-1] %s[%d-%d] -mode Concatenate -tile x1 -quality 80 -resize x230 -trim %s" % (fullpath, fullpath, l1,l2, "thumbs/" + f + ".jpg")
+            print "EXEC: " + cmd
+            os.system(cmd)
+
+
+if __name__ == "__main__":
+    converter = PDFextractor()
+    converter.get_files_local()
+    converter.get_words()
+    converter.get_thumbs()

File stopwords.txt

+a
+able
+about
+above
+abst
+accordance
+according
+accordingly
+across
+act
+actually
+added
+adj
+affected
+affecting
+affects
+after
+afterwards
+again
+against
+ah
+all
+almost
+alone
+along
+already
+also
+although
+always
+am
+among
+amongst
+an
+and
+announce
+another
+any
+anybody
+anyhow
+anymore
+anyone
+anything
+anyway
+anyways
+anywhere
+apparently
+approximately
+are
+aren
+arent
+arise
+around
+as
+aside
+ask
+asking
+at
+auth
+available
+away
+awfully
+b
+back
+be
+became
+because
+become
+becomes
+becoming
+been
+before
+beforehand
+begin
+beginning
+beginnings
+begins
+behind
+being
+believe
+below
+beside
+besides
+between
+beyond
+biol
+both
+brief
+briefly
+but
+by
+c
+ca
+came
+can
+cannot
+can't
+cause
+causes
+certain
+certainly
+co
+com
+come
+comes
+contain
+containing
+contains
+could
+couldnt
+d
+date
+did
+didn't
+different
+do
+does
+doesn't
+doing
+done
+don't
+down
+downwards
+due
+during
+e
+each
+ed
+edu
+effect
+eg
+eight
+eighty
+either
+else
+elsewhere
+end
+ending
+enough
+especially
+et
+et-al
+etc
+even
+ever
+every
+everybody
+everyone
+everything
+everywhere
+ex
+except
+f
+far
+few
+ff
+fifth
+first
+five
+fix
+followed
+following
+follows
+for
+former
+formerly
+forth
+found
+four
+from
+further
+furthermore
+g
+gave
+get
+gets
+getting
+give
+given
+gives
+giving
+go
+goes
+gone
+got
+gotten
+h
+had
+happens
+hardly
+has
+hasn't
+have
+haven't
+having
+he
+hed
+hence
+her
+here
+hereafter
+hereby
+herein
+heres
+hereupon
+hers
+herself
+hes
+hi
+hid
+him
+himself
+his
+hither
+home
+how
+howbeit
+however
+hundred
+i
+id
+ie
+if
+i'll
+im
+immediate
+immediately
+importance
+important
+in
+inc
+indeed
+index
+information
+instead
+into
+invention
+inward
+is
+isn't
+it
+itd
+it'll
+its
+itself
+i've
+j
+just
+k
+keep
+keeps
+kept
+kg
+km
+know
+known
+knows
+l
+largely
+last
+lately
+later
+latter
+latterly
+least
+less
+lest
+let
+lets
+like
+liked
+likely
+line
+little
+'ll
+look
+looking
+looks
+ltd
+m
+made
+mainly
+make
+makes
+many
+may
+maybe
+me
+mean
+means
+meantime
+meanwhile
+merely
+mg
+might
+million
+miss
+ml
+more
+moreover
+most
+mostly
+mr
+mrs
+much
+mug
+must
+my
+myself
+n
+na
+name
+namely
+nay
+nd
+near
+nearly
+necessarily
+necessary
+need
+needs
+neither
+never
+nevertheless
+new
+next
+nine
+ninety
+no
+nobody
+non
+none
+nonetheless
+noone
+nor
+normally
+nos
+not
+noted
+nothing
+now
+nowhere
+o
+obtain
+obtained
+obviously
+of
+off
+often
+oh
+ok
+okay
+old
+omitted
+on
+once
+one
+ones
+only
+onto
+or
+ord
+other
+others
+otherwise
+ought
+our
+ours
+ourselves
+out
+outside
+over
+overall
+owing
+own
+p
+page
+pages
+part
+particular
+particularly
+past
+per
+perhaps
+placed
+please
+plus
+poorly
+possible
+possibly
+potentially
+pp
+predominantly
+present
+previously
+primarily
+probably
+promptly
+proud
+provides
+put
+q
+que
+quickly
+quite
+qv
+r
+ran
+rather
+rd
+re
+readily
+really
+recent
+recently
+ref
+refs
+regarding
+regardless
+regards
+related
+relatively
+research
+respectively
+resulted
+resulting
+results
+right
+run
+s
+said
+same
+saw
+say
+saying
+says
+sec
+section
+see
+seeing
+seem
+seemed
+seeming
+seems
+seen
+self
+selves
+sent
+seven
+several
+shall
+she
+shed
+she'll
+shes
+should
+shouldn't
+show
+showed
+shown
+showns
+shows
+significant
+significantly
+similar
+similarly
+since
+six
+slightly
+so
+some
+somebody
+somehow
+someone
+somethan
+something
+sometime
+sometimes
+somewhat
+somewhere
+soon
+sorry
+specifically
+specified
+specify
+specifying
+still
+stop
+strongly
+sub
+substantially
+successfully
+such
+sufficiently
+suggest
+sup
+sure
+t
+take
+taken
+taking
+tell
+tends
+th
+than
+thank
+thanks
+thanx
+that
+that'll
+thats
+that've
+the
+their
+theirs
+them
+themselves
+then
+thence
+there
+thereafter
+thereby
+thered
+therefore
+therein
+there'll
+thereof
+therere
+theres
+thereto
+thereupon
+there've
+these
+they
+theyd
+they'll
+theyre
+they've
+think
+this
+those
+thou
+though
+thoughh
+thousand
+throug
+through
+throughout
+thru
+thus
+til
+tip
+to
+together
+too
+took
+toward
+towards
+tried
+tries
+truly
+try
+trying
+ts
+twice
+two
+u
+un
+under
+unfortunately
+unless
+unlike
+unlikely
+until
+unto
+up
+upon
+ups
+us
+use
+used
+useful
+usefully
+usefulness
+uses
+using
+usually
+v
+value
+various
+'ve
+very
+via
+viz
+vol
+vols
+vs
+w
+want
+wants
+was
+wasn't
+way
+we
+wed
+welcome
+we'll
+went
+were
+weren't
+we've
+what
+whatever
+what'll
+whats
+when
+whence
+whenever
+where
+whereafter
+whereas
+whereby
+wherein
+wheres
+whereupon
+wherever
+whether
+which
+while
+whim
+whither
+who
+whod
+whoever
+whole
+who'll
+whom
+whomever
+whos
+whose
+why
+widely
+willing
+wish
+with
+within
+without
+won't
+words
+world
+would
+wouldn't
+www
+x
+y
+yes
+yet
+you
+youd
+you'll
+your
+youre
+yours
+yourself
+yourselves
+you've
+z
+zero
+will

File vocabulary.py

+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+# This code is available under the MIT License.
+# (c)2010-2011 Nakatani Shuyo / Cybozu Labs Inc.
+
+import nltk, re
+
+def load_corpus(range):
+    m = re.match(r'(\d+):(\d+)$', range)
+    if m:
+        start = int(m.group(1))
+        end = int(m.group(2))
+        from nltk.corpus import brown as corpus
+        return [corpus.words(fileid) for fileid in corpus.fileids()[start:end]]
+
+def load_file(filename):
+    corpus = []
+    f = open(filename, 'r')
+    for line in f:
+        doc = re.findall(r'\w+(?:\'\w+)?',line)
+        if len(doc)>0:
+            corpus.append(doc)
+    f.close()
+    return corpus
+
+#stopwords_list = nltk.corpus.stopwords.words('english')
+stopwords_list = "a,s,able,about,above,according,accordingly,across,actually,after,afterwards,again,against,ain,t,all,allow,allows,almost,alone,along,already,also,although,always,am,among,amongst,an,and,another,any,anybody,anyhow,anyone,anything,anyway,anyways,anywhere,apart,appear,appreciate,appropriate,are,aren,t,around,as,aside,ask,asking,associated,at,available,away,awfully,be,became,because,become,becomes,becoming,been,before,beforehand,behind,being,believe,below,beside,besides,best,better,between,beyond,both,brief,but,by,c,mon,c,s,came,can,can,t,cannot,cant,cause,causes,certain,certainly,changes,clearly,co,com,come,comes,concerning,consequently,consider,considering,contain,containing,contains,corresponding,could,couldn,t,course,currently,definitely,described,despite,did,didn,t,different,do,does,doesn,t,doing,don,t,done,down,downwards,during,each,edu,eg,eight,either,else,elsewhere,enough,entirely,especially,et,etc,even,ever,every,everybody,everyone,everything,everywhere,ex,exactly,example,except,far,few,fifth,first,five,followed,following,follows,for,former,formerly,forth,four,from,further,furthermore,get,gets,getting,given,gives,go,goes,going,gone,got,gotten,greetings,had,hadn,t,happens,hardly,has,hasn,t,have,haven,t,having,he,he,s,hello,help,hence,her,here,here,s,hereafter,hereby,herein,hereupon,hers,herself,hi,him,himself,his,hither,hopefully,how,howbeit,however,i,d,i,ll,i,m,i,ve,ie,if,ignored,immediate,in,inasmuch,inc,indeed,indicate,indicated,indicates,inner,insofar,instead,into,inward,is,isn,t,it,it,d,it,ll,it,s,its,itself,just,keep,keeps,kept,know,knows,known,last,lately,later,latter,latterly,least,less,lest,let,let,s,like,liked,likely,little,look,looking,looks,ltd,mainly,many,may,maybe,me,mean,meanwhile,merely,might,more,moreover,most,mostly,much,must,my,myself,name,namely,nd,near,nearly,necessary,need,needs,neither,never,nevertheless,new,next,nine,no,nobody,non,none,noone,nor,normally,not,nothing,novel,now,nowhere,obviously,of,off,often,oh,ok,okay,old,on,once,one,ones,only,onto,or,other,others,otherwise,ought,our,ours,ourselves,out,outside,over,overall,own,particular,particularly,per,perhaps,placed,please,plus,possible,presumably,probably,provides,que,quite,qv,rather,rd,re,really,reasonably,regarding,regardless,regards,relatively,respectively,right,said,same,saw,say,saying,says,second,secondly,see,seeing,seem,seemed,seeming,seems,seen,self,selves,sensible,sent,serious,seriously,seven,several,shall,she,should,shouldn,t,since,six,so,some,somebody,somehow,someone,something,sometime,sometimes,somewhat,somewhere,soon,sorry,specified,specify,specifying,still,sub,such,sup,sure,t,s,take,taken,tell,tends,th,than,thank,thanks,thanx,that,that,s,thats,the,their,theirs,them,themselves,then,thence,there,there,s,thereafter,thereby,therefore,therein,theres,thereupon,these,they,they,d,they,ll,they,re,they,ve,think,third,this,thorough,thoroughly,those,though,three,through,throughout,thru,thus,to,together,too,took,toward,towards,tried,tries,truly,try,trying,twice,two,un,under,unfortunately,unless,unlikely,until,unto,up,upon,us,use,used,useful,uses,using,usually,value,various,very,via,viz,vs,want,wants,was,wasn,t,way,we,we,d,we,ll,we,re,we,ve,welcome,well,went,were,weren,t,what,what,s,whatever,when,whence,whenever,where,where,s,whereafter,whereas,whereby,wherein,whereupon,wherever,whether,which,while,whither,who,who,s,whoever,whole,whom,whose,why,will,willing,wish,with,within,without,won,t,wonder,would,would,wouldn,t,yes,yet,you,you,d,you,ll,you,re,you,ve,your,yours,yourself,yourselves,zero".split(',')
+recover_list = {"wa":"was", "ha":"has"}
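+# (the WordNet lemmatizer turns "was" into "wa" and "has" into "ha";
+# recover_list maps them back)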
+wl = nltk.WordNetLemmatizer()
+
+def is_stopword(w):
+    return w in stopwords_list
+def lemmatize(w0):
+    w = wl.lemmatize(w0.lower())
+    #if w=='de': print w0, w
+    if w in recover_list: return recover_list[w]
+    return w
+
+class Vocabulary:
+    def __init__(self, excluds_stopwords=False):
+        self.vocas = []        # id to word
+        self.vocas_id = dict() # word to id
+        self.docfreq = []      # id to document frequency
+        self.excluds_stopwords = excluds_stopwords
+
+    def term_to_id(self, term0):
+        term = lemmatize(term0)
+        if not re.match(r'[a-z]+$', term): return None
+        if self.excluds_stopwords and is_stopword(term): return None
+        if term not in self.vocas_id:
+            voca_id = len(self.vocas)
+            self.vocas_id[term] = voca_id
+            self.vocas.append(term)
+            self.docfreq.append(0)
+        else:
+            voca_id = self.vocas_id[term]
+        return voca_id
+
+    def doc_to_ids(self, doc):
+        #print ' '.join(doc)
+        list = []
+        words = dict()
+        for term in doc:
+            id = self.term_to_id(term)
+            if id is not None:
+                list.append(id)
+                if not words.has_key(id):
+                    words[id] = 1
+                    self.docfreq[id] += 1
+        if "close" in dir(doc): doc.close()
+        return list
+
+    def cut_low_freq(self, corpus, threshold=1):
+        new_vocas = []
+        new_docfreq = []
+        self.vocas_id = dict()
+        conv_map = dict()
+        for id, term in enumerate(self.vocas):
+            freq = self.docfreq[id]
+            if freq > threshold:
+                new_id = len(new_vocas)
+                self.vocas_id[term] = new_id
+                new_vocas.append(term)
+                new_docfreq.append(freq)
+                conv_map[id] = new_id
+        self.vocas = new_vocas
+        self.docfreq = new_docfreq
+
+        def conv(doc):
+            new_doc = []
+            for id in doc:
+                if id in conv_map: new_doc.append(conv_map[id])
+            return new_doc
+        return [conv(doc) for doc in corpus]
+
+    def __getitem__(self, v):
+        return self.vocas[v]
+
+    def size(self):
+        return len(self.vocas)
+
+    def is_stopword_id(self, id):
+        return self.vocas[id] in stopwords_list