
trololo / lorparser.py
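
A small script that downloads every comment a given nick has posted on
linux.org.ru (via show-comments.jsp), caches the fetched pages in a shelve
database, and renders the accumulated comment text as a tag-cloud PNG with
pytagcloud. Python 2 throughout (urllib2, mechanize, BeautifulSoup 3).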

#!/usr/bin/env python
# -*- coding: utf-8 -*-
from urllib2 import HTTPError
import mechanize
from BeautifulSoup import BeautifulSoup
import re
from pytagcloud import create_tag_image, make_tags
from pytagcloud.lang.counter import get_tag_counts
from StringIO import StringIO
import shelve

BASE_URL = 'http://www.linux.org.ru/'
re_quote = re.compile(r'^>')
re_link  = re.compile(r'jump-message\.jsp\?msgid=(\d+)&cid=(\d+)')
re_tag   = re.compile(r'&\w+;')

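# Fetch a single comment given its jump-message.jsp link; returns the
# comment's msg_body div as unicode together with its cid, or None if the
# link is malformed or the HTTP request fails.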
def commentfetch(lnk):
    m = re_link.match(lnk)
    if m:
        msgid = m.group(1)
        cid = m.group(2)
    else:
        print 'invalid link %s' % lnk
        return None
    url = BASE_URL + lnk
    try:
        comments = mechanize.urlopen(url).read()
    except HTTPError:
        return None

    bs_comments = BeautifulSoup(comments)
    comment = bs_comments.find('div', {'class' : 'msg', 'id' : 'comment-%d' % int(cid) })
    comment_div = comment.find('div', {'class': 'msg_body message-w-userpic'})
    return (unicode(comment_div), int(cid))

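# Reduce a fetched comment to plain text: drop the title, the datetime link,
# nested <div> blocks and quoted lines (starting with '>'), and return
# (comment text, signature text).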
def commentparse(comment):
    bs_comment = BeautifulSoup(comment)
    comment_div = bs_comment.find('div', {'class': 'msg_body message-w-userpic'})
    content_div = comment_div.findAll('div')
    comment_sign = bs_comment.find('div', {'class': 'sign'})
    comment_sign_a = comment_sign.find('a')
    # remove the datetime link from the signature
    comment_sign_a.extract()
    comment_title = bs_comment.find('h2')
    # remove title string
    if comment_title:
        comment_title.extract()
    # remove every nested <div> (quotes, code blocks) together with its content
    for x in content_div:
        x.extract()
    f = StringIO()
    for item in comment_div.recursiveChildGenerator():
        if isinstance(item, unicode) and not re_quote.match(item):
            print >>f, item.rstrip()
    text_content = f.getvalue()
    f.close()
    return (text_content.rstrip(), comment_sign.text.strip())

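# Walk the paginated show-comments.jsp listing for a nick (50 links per page,
# tracked via the offset parameter) and store every comment in the shelve
# cache; already-cached URLs are skipped.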
def parse_comment_links(nick, cache):
    url = BASE_URL+'show-comments.jsp?nick='+nick
    deleted = 0
    count = 0
    offset = 0
    br = mechanize.Browser()
    while True:
        if offset:
            url_offset = url + '&offset=%d' % offset
        else:
            url_offset = url
        try:
            comments = br.open(url_offset).read()
        except HTTPError:
            print 'nick %s invalid' % nick
            return
        soup_comments = BeautifulSoup(comments)
        table = soup_comments.table
        table_body = table.tbody
        comment_links = table_body.findAll('a')

        for cl in comment_links:
            count += 1
            key = cl['href'].encode('utf8')
            if key not in cache:
                try:
                    cache[key] = commentfetch(cl['href'])
                except Exception:
                    cache[key] = ''
                    print 'error parsing %s' % cl['href']
                print '%d:%s' % (count, cl['href'])
            else:
                # URL already cached, skip refetching
                print '%d:%s exists' % (count, cl['href'])

        table_footer = table.tfoot
        navigate_links = table_footer.findAll('a')
        if offset == 0 and len(navigate_links) == 1:    # first page: "next" link only
            offset += 50
            continue
        elif len(navigate_links) == 0:                  # single page, no navigation
            break
        elif offset != 0 and len(navigate_links) == 1:  # last page: "prev" link only
            break
        elif len(navigate_links) == 2:                  # middle page: "prev" and "next"
            offset += 50
        else:
            raise AssertionError('strange page: %s' % url_offset)

    print 'parsed %d comments' % count

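# Concatenate the text of all cached comments, strip leftover HTML entities,
# and render the 200 most frequent words as a tag cloud in <imgname>.png.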
def create_image(cache, imgname):
    f = StringIO()
    for k in cache.keys():
        if cache[k]:
            content, cid = cache[k]
            content_text, _sign = commentparse(content)
            print >>f, content_text
    content = f.getvalue()
    content = re_tag.sub('', content)
    counts = get_tag_counts(content)
    f.close()
    tags = make_tags(counts)
    create_tag_image(
            tags[:200],
            '%s.png' % imgname,
            fontname='Ubuntu',
            background=(0, 0, 0, 255),
            size=(1024, 900),
            crop=False
            )

if __name__ == '__main__':
    import sys
    from os.path import exists as fexists
    if len(sys.argv) != 3:
        print 'usage: %s <nick> <imgname>' % sys.argv[0]
        sys.exit(1)
    nick = sys.argv[1]
    cachename = '%s.cache2' % nick
    imgname = sys.argv[2]
    if fexists(cachename):
        print 'found %s cache' % nick
    cache = shelve.open(cachename)
    parse_comment_links(nick, cache)
    create_image(cache, imgname)
    cache.close()
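
Example run (with a placeholder nick): python lorparser.py somenick somenick_cloud
writes somenick_cloud.png (the .png extension is appended automatically) and keeps
the fetched comments in somenick.cache2, so a second run only downloads comments
that appeared since.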