1. hizel
2. trololo

Source

trololo / lorparser.py

The script below fetches a user's topics and comments from linux.org.ru, caches them in a shelve database, and renders word-cloud images with pytagcloud.

#!/usr/bin/env python
# -*- coding: utf-8 -*-
from urllib2 import HTTPError
from mechanize import urlopen, Browser
from BeautifulSoup import BeautifulSoup
from pytagcloud import create_tag_image, make_tags
from pytagcloud.lang.counter import get_tag_counts
from StringIO import StringIO

import shelve
import codecs
import re

BASE_URL = 'http://www.linux.org.ru/'
re_quote = re.compile(r'^>')
re_link  = re.compile(r'jump-message\.jsp\?msgid=(\d+)&cid=(\d+)')
re_tag   = re.compile(r'&\w+;')

def topicsfetch(nick, cache):
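    """Walk nick's topic pages on the site (20 per page) and cache each
    topic's raw HTML, keyed by the topic div's id, as (html, 0)."""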
    url = BASE_URL + 'people/%s/' % nick
    offset = 0
    count = 0 
    br = Browser()
    while True:
        if offset:
            url_offset = url + '?offset=%d' % offset
        else:
            url_offset = url

        try:
            topics_page = br.open(url_offset).read()
        except HTTPError:
            if url_offset == url:
                print 'nick %s invalid' % nick
            return

        soup_topics_page = BeautifulSoup(topics_page)

        soup_topics = soup_topics_page.findAll('div', {'class': 'news'})

        for topic in soup_topics:
            # pull the id attribute out of the topic div
            topic_id = None
            for attr in topic.attrs:
                if attr[0] == u'id':
                    topic_id = attr[1].encode('utf8')
            if topic_id:
                count += 1
                if topic_id not in cache:
                    print '%d:%s' % (count, topic_id)
                    # cache the raw topic markup; cid 0 marks it as a topic
                    cache[topic_id] = (unicode(topic), 0)
                else:
                    print '%d:%s exists' % (count, topic_id)

        soup_nav = soup_topics_page.findAll('table', {'class': 'nav'})
        soup_nav_hrefs = soup_nav[1].findAll('a')
        if len(soup_nav_hrefs) == 1 and offset != 0:
            break

        offset += 20

def topicparse(topic):
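    """Return (plain text of the topic title and body, signature div)
    extracted from a cached topic's HTML."""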
    bs_topic = BeautifulSoup(topic)
    topic_title = bs_topic.find('h2')
    topic_msg = bs_topic.find('div', {'class': 'msg'})
    topic_sign = bs_topic.find('div', {'class': 'sign'})
    # drop the author/date link from the signature
    topic_sign.find('a').extract()
    # drop nested <div> blocks so only the topic's own text remains
    topic_content_divs = topic_msg.findAll('div')
    for div in topic_content_divs:
        div.extract()
    f = StringIO()

    for item in topic_title.recursiveChildGenerator():
        if isinstance(item, unicode):
            print >>f, item.rstrip()

    for item in topic_msg.recursiveChildGenerator():
        if isinstance(item, unicode):
            print >>f, item.rstrip()
    text_content = f.getvalue()
    f.close()
    return (text_content.rstrip(), topic_sign)

def commentfetch(lnk):
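    """Fetch a single comment by its jump-message.jsp link and return
    (raw HTML of the comment div, cid), or None on HTTP error."""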
    m = re_link.match(lnk)
    if m:
        msgid = m.group(1)
        cid = m.group(2)
    else:
        print 'invalid link %s' % lnk
        return None
    url = BASE_URL + lnk
    try:
        comments = urlopen(url).read()
    except HTTPError:
        return None

    bs_comments = BeautifulSoup(comments)
    comment = bs_comments.find('div', {'class' : 'msg', 'id' : 'comment-%d' % int(cid) })
    comment_div = comment.find('div', {'class': 'msg_body message-w-userpic'})
    return (unicode(comment_div), int(cid))

def commentparse(comment):
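    """Return (plain comment text without quoted '>' lines, signature text)
    extracted from a cached comment's HTML."""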
    bs_comment = BeautifulSoup(comment)
    comment_div = bs_comment.find('div', {'class': 'msg_body message-w-userpic'})
    content_divs = comment_div.findAll('div')
    comment_sign = bs_comment.find('div', {'class': 'sign'})
    if comment_sign:
        # remove the date/time link from the signature
        comment_sign.find('a').extract()
    comment_title = bs_comment.find('h2')
    # remove the title string
    if comment_title:
        comment_title.extract()
    # remove all nested <div> blocks together with their contents,
    # keeping only the comment's own text
    for div in content_divs:
        div.extract()
    f = StringIO()
    for item in comment_div.recursiveChildGenerator():
        if isinstance(item, unicode) and not re_quote.match(item):
            print >>f, item.rstrip()
    text_content = f.getvalue()
    text_sign = ''
    if comment_sign:
        text_sign = comment_sign.text
    f.close()
    return (text_content.rstrip(), text_sign)

def parse_comment_links(nick, cache):
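    """Walk nick's comment listing (50 links per page) and cache every
    comment, keyed by its jump-message.jsp href."""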
    url = BASE_URL+'show-comments.jsp?nick='+nick
    deleted = 0
    count = 0
    offset = 0
    br = Browser()
    while True:
        if offset:
            url_offset = url + '&offset=%d' % offset
        else:
            url_offset = url
        try:
            comments = br.open(url_offset).read()
        except HTTPError:
            print 'nick %s invalid' % nick
            return
        soup_comments = BeautifulSoup(comments)
        table = soup_comments.table
        table_body = table.tbody
        comment_links = table_body.findAll('a')

        for cl in comment_links:
            count += 1
            key = cl['href'].encode('utf8')
            if key not in cache:
                try:
                    cache[key] = commentfetch(cl['href'])
                except Exception:
                    cache[key] = ''
                    print 'error parsing %s' % cl['href']
                print '%d:%s' % (count, cl['href'])
            else:
                # already cached, skip refetching this comment
                print '%d:%s exists' % (count, cl['href'])

        table_footer = table.tfoot
        navigate_links = table_footer.findAll('a')
        if offset == 0 and len(navigate_links) == 1: # first page
            offset += 50
            continue
        elif len(navigate_links) == 0: # single page
            break
        elif offset != 0 and len(navigate_links) == 1: # last page
            break
        elif len(navigate_links) == 2: # middle page
            offset += 50
        else:
            raise AssertionError('strange page: unexpected navigation links')

    print 'parsed %d comments' % count

def create_image(cache, imgname):
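    """Build word frequencies from the cached comments and topics, print the
    top 30 tags of each set, and render <imgname>.comments.png,
    <imgname>.topics.png and <imgname>.all.png."""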
    f_comments = StringIO()
    f_topics = StringIO()

    for k in cache.keys():
        if cache[k]:
            content, cid = cache[k]
            if cid:
                # non-zero cid: cached comment
                content_text, sign = commentparse(content)
                print >>f_comments, content_text
            else:
                # cid 0: cached topic
                content_text, sign = topicparse(content)
                print >>f_topics, content_text

    content_comments = re_tag.sub('', f_comments.getvalue())
    content_topics = re_tag.sub('', f_topics.getvalue())

    counts_comments = get_tag_counts(content_comments)
    counts_topics = get_tag_counts(content_topics)
    counts_all = get_tag_counts(content_comments + ' ' + content_topics)

    f_comments.close()
    f_topics.close()

    tags_comments = make_tags(counts_comments)
    tags_topics = make_tags(counts_topics)
    tags_all = make_tags(counts_all)

    print '#top 30 comments#'
    print ''

    for tag in tags_comments[:30]:
        print '%s:%s' % ( tag['tag'].encode('utf8'), tag['size'])
    print '#top 30 topics#'
    print ''

    for tag in tags_topics[:30]:
        print '%s:%s' % ( tag['tag'].encode('utf8'), tag['size'])
    print '#top 30 topics+comments#'
    print ''

    for tag in tags_all[:30]:
        print '%s:%s' % ( tag['tag'].encode('utf8'), tag['size'])

    create_tag_image(
            tags_comments[:200], 
            '%s.comments.png' % imgname,
            fontname='Ubuntu',
            background=(0, 0, 0, 255),
            size=(1024,900),
            crop=False
            )
    create_tag_image(
            tags_topics[:200], 
            '%s.topics.png' % imgname,
            fontname='Ubuntu',
            background=(0, 0, 0, 255),
            size=(1024,900),
            crop=False
            )
    create_tag_image(
            tags_all[:200], 
            '%s.all.png' % imgname,
            fontname='Ubuntu',
            background=(0, 0, 0, 255),
            size=(1024,900),
            crop=False
            )

if __name__ == '__main__':
    import sys
    from os.path import exists as fexists
    if len(sys.argv) != 3:
        print "usage: %s <nick> <imgname>" % sys.argv[0]
        sys.exit(1)
    nick = sys.argv[1]
    cachename = '%s.cache2' % nick
    imgname = sys.argv[2]
    if fexists(cachename):
        print 'found cache for %s' % nick
    cache = shelve.open(cachename)
    parse_comment_links(nick, cache)
    topicsfetch(nick, cache)
    create_image(cache, imgname)
    cache.close()
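
For reference, a minimal sketch of how the <nick>.cache2 shelve written by the script can be inspected afterwards. The helper below is not part of lorparser.py; it only assumes the (html, cid) tuples that topicsfetch and commentfetch store, with cid 0 marking a topic and an empty or None value marking a failed fetch.

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Hypothetical helper: count what ended up in the <nick>.cache2 shelve.
import shelve
import sys

nick = sys.argv[1]                       # same nick that was passed to lorparser.py
cache = shelve.open('%s.cache2' % nick)

topics = comments = failed = 0
for value in cache.values():
    if not value:                        # '' or None means the fetch failed
        failed += 1
    elif value[1]:                       # non-zero cid: a cached comment
        comments += 1
    else:                                # cid 0: a cached topic
        topics += 1

print 'topics: %d, comments: %d, failed: %d' % (topics, comments, failed)
cache.close()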