Source: trololo / lorparser.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
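"""Build tag-cloud images from a linux.org.ru user's topics and comments.

The script walks www.linux.org.ru with mechanize and BeautifulSoup, caches
the raw HTML of the user's topics and comments in a shelve database, and
renders PNG tag clouds with pytagcloud.

Usage: lorparser.py <nick> <imgname>
produces <imgname>.comments.png, <imgname>.topics.png and <imgname>.all.png.
"""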
from urllib2 import HTTPError
from mechanize import urlopen, Browser
from BeautifulSoup import BeautifulSoup
from pytagcloud import create_tag_image, LAYOUTS, make_tags
from pytagcloud.lang.counter import get_tag_counts
from StringIO import StringIO

from logging import debug, info, error, warning
from os.path import exists

import shelve
import codecs
import re

BASE_URL = 'http://www.linux.org.ru/'
re_quote = re.compile(r'^\s*>')
re_link  = re.compile(r'jump-message\.jsp\?msgid=(\d+)&cid=(\d+)')
re_tag   = re.compile(r'&\w+;')
re_topicurl = re.compile(r'http://www\.linux\.org\.ru/\w+/\w+/(\d+)')
re_topicid = re.compile(r'topic-\d+')
re_commentid = re.compile(r'comment-\d+')

def topicsfetch(nick, cache):
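    """Fetch every topic started by *nick* and cache its raw HTML.

    Walks the paginated people/<nick>/ listing 20 topics per page and stores
    each topic under its <div> element id as an (html, 0) tuple; the trailing
    0 marks the entry as a topic rather than a comment.
    """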
    url = BASE_URL + 'people/%s/' % nick
    offset = 0
    count = 0 
    br = Browser()
    while True:
        if offset:
            url_offset = url + '?offset=%d' % offset
        else:
            url_offset = url

        try:
            topics_page = br.open(url_offset).read()
        except HTTPError:
            if url_offset == url:
                warning('nick %s invalid' % nick)
            return

        soup_topics_page = BeautifulSoup(topics_page)

        soup_topics = soup_topics_page.findAll('div', {'class': 'news'})

        for topic in soup_topics:
            id = None
            for attr in topic.attrs:
                if attr[0] == u'id':
                    id = attr[1].encode('utf8')
            if id:
                count += 1
                if not cache.has_key(id):
                    info('%d:%s' % (count, id))
                    cache[id] = (unicode(topic), 0)
                else:
                    info('%d:%s exists' % (count, id))

        soup_nav = soup_topics_page.findAll('table', {'class': 'nav'})
        soup_nav_hrefs = soup_nav[1].findAll('a')
        if len(soup_nav_hrefs) == 1 and offset != 0:
            break

        offset += 20

def topicparse(topic):
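    """Extract plain text from a cached topic.

    Drops nested <div> blocks from the message body and returns a
    (text, sign) tuple: the topic title plus body text, and the topic's
    'sign' block.
    """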
    bs_topic = BeautifulSoup(topic)
    topic_title = bs_topic.find('h2')
    topic_msg = bs_topic.find('div', {'class': 'msg'})
    topic_sign = bs_topic.find('div', {'class': 'sign'})
    topic_sign_a = topic_sign.find('a')
    topic_sign_a.extract() 
    # strip nested <div> blocks from the message body before extracting text
    topic_content_divs = topic_msg.findAll('div')
    for div in topic_content_divs:
        div.extract()
    f = StringIO()

    for item in topic_title.recursiveChildGenerator():
        if isinstance(item, unicode):
            print >>f, item.rstrip()

    for item in topic_msg.recursiveChildGenerator():
        if isinstance(item, unicode):
            print >>f, item.rstrip()
    text_content = f.getvalue()
    f.close()
    return (text_content.rstrip(), topic_sign)

def commentfetch(lnk):
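    """Fetch a single comment from a relative jump-message.jsp link.

    Returns (html, cid) for the comment's message <div>, or None if the link
    is invalid or the page cannot be fetched.
    """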
    m = re_link.match(lnk)
    if m:
        msgid = m.group(1)
        cid = m.group(2)
    else:
        warning('invalid comment link: %s' % lnk)
        return
    url = BASE_URL + lnk
    try:
        comments = urlopen(url).read()
    except HTTPError:
        return None

    bs_comments = BeautifulSoup(comments)
    comment = bs_comments.find('div', {'class' : 'msg', 'id' : 'comment-%d' % int(cid) })
    comment_div = comment.find('div', {'class': 'msg_body message-w-userpic'})
    return (unicode(comment_div), int(cid))

def commentparse(comment):
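    """Extract plain text from a cached comment.

    Removes the title, the first link inside the signature, nested <div>
    blocks and quoted lines (starting with '>'), then returns a
    (text, sign_text) tuple.
    """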
    bs_comment = BeautifulSoup(comment)    
    comment_div = bs_comment.find('div', {'class': 'msg_body message-w-userpic'})
    content_divs = comment_div.findAll('div')
    comment_sign = bs_comment.find('div', {'class': 'sign'})
    comment_sign_a = comment_sign.find('a')
    # remove the date/time string from the signature
    if comment_sign_a:
        comment_sign_a.extract() 
    comment_title = bs_comment.find('h2')
    # remove title string
    if comment_title:
        comment_title.extract()
    # remove every nested <div> together with its content
    for div in content_divs:
        div.extract()
    f = StringIO()
    for item in comment_div.recursiveChildGenerator():
        if isinstance(item, unicode) and not re_quote.match(item):  
            print >>f, item.rstrip()
    text_content = f.getvalue()
    text_sign = ''
    if comment_sign:
        text_sign = comment_sign.text
    f.close()
    return (text_content.rstrip(), text_sign)

def parse_comment_links(nick, cache):
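    """Fetch every comment left by *nick* and store its raw HTML in *cache*.

    Walks the paginated show-comments.jsp listing 50 links per page; each
    comment is cached under a 'comment-<topicid>-<commentid>' key with the
    (html, cid) tuple returned by commentfetch().
    """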
    url = BASE_URL+'show-comments.jsp?nick='+nick
    deleted = 0
    count = 0
    offset = 0
    br = Browser()
    while True:
        if offset:
            url_offset = url + '&offset=%d' % offset
        else:
            url_offset = url
        try:
            comments = br.open(url_offset).read()
        except HTTPError:
            error('nick %s invalid' % nick)
            return
        soup_comments = BeautifulSoup(comments)
        table = soup_comments.table
        table_body = table.tbody
        comment_links = table_body.findAll('a')

        for cl in comment_links:
            count += 1
            m = re_link.match(cl['href'])
            if m:
                topic_id = int(m.group(1))
                comment_id = int(m.group(2))
                key = 'comment-%d-%d' % (topic_id, comment_id)
            else:
                warning('invalid link')
                continue

            if not cache.has_key(key):
                try:
                    cache[key] = commentfetch(cl['href'])
                except Exception:
                    cache[key] = ''
                    warning('failed to fetch %s' % cl['href'])
                info('%d: topic:%d comment:%d' % (count, topic_id, comment_id))
            else:
                info('%d: topic:%d comment:%d exists' % (count, topic_id,
                        comment_id))

        table_footer = table.tfoot
        navigate_links = table_footer.findAll('a')
        if offset == 0 and len(navigate_links) == 1: # first page
            offset += 50
            continue
        elif len(navigate_links) == 0: # only one page
            break
        elif offset != 0 and len(navigate_links) == 1: # last page
            break
        elif len(navigate_links) == 2: # middle page
            offset += 50
        else:
            assert False, 'unexpected navigation layout'

    info('parsed %d comment links' % count)

def create_image(cache, imgname, size=(1024,900), count=200, crop=False):
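    """Render tag-cloud PNGs for everything stored in *cache*.

    Builds up to three images from the *count* most frequent words:
    <imgname>.comments.png, <imgname>.topics.png and <imgname>.all.png.
    Cache entries whose stored id is 0 are treated as topics, all others as
    comments.
    """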
    f_comments = StringIO()
    f_topics = StringIO()

    for k in cache.keys():
        if cache[k]:
            content, cid = cache[k]
            if cid:
                content_text, _sign = commentparse(content)
                print >>f_comments, content_text
            else:
                content_text, _sign = topicparse(content)
                print >>f_topics, content_text

    content_comments = re_tag.sub('', f_comments.getvalue())
    content_topics = re_tag.sub('', f_topics.getvalue())

    try_comments = True
    try_topics = True
    try_all = True

    counts_comments = get_tag_counts(content_comments)
    counts_topics = get_tag_counts(content_topics)
    counts_all = get_tag_counts(content_comments + ' ' + content_topics)

    f_comments.close()
    f_topics.close()

    if len(counts_comments) < 5:
        warning('not enough comment words for a cloud')
        try_comments = False
    if len(counts_topics) < 5:
        warning('not enough topic words for a cloud')
        try_topics = False
    if len(counts_all) < 5:
        warning('not enough words for a combined cloud')
        try_all = False


    if try_comments:
        info('#top 30 comments#')
        tags_comments = make_tags(counts_comments)
        for tag in tags_comments[:30]:
            info('%s:%s' % ( tag['tag'].encode('utf8'), tag['size']))
        create_tag_image(
                tags_comments[:count], 
                '%s.comments.png' % imgname,
                fontname='Ubuntu',
                background=(0, 0, 0, 255),
                size=size,
                crop=crop
                )

    if try_topics:
        info('#top 30 topics#')
        tags_topics = make_tags(counts_topics)
        for tag in tags_topics[:30]:
            info('%s:%s' % ( tag['tag'].encode('utf8'), tag['size']))
        create_tag_image(
                tags_topics[:count], 
                '%s.topics.png' % imgname,
                fontname='Ubuntu',
                background=(0, 0, 0, 255),
                size=size,
                crop=crop
                )

    if try_all:
        info('#top 30 topics+comments#')
        tags_all = make_tags(counts_all)
        for tag in tags_all[:30]:
            info('%s:%s' % ( tag['tag'].encode('utf8'), tag['size']))

        create_tag_image(
                tags_all[:count], 
                '%s.all.png' % imgname,
                fontname='Ubuntu',
                background=(0, 0, 0, 255),
                size=size,
                crop=crop
                )

def fixoldcache(cache):
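    """Migrate old cache entries keyed by jump-message.jsp links to the
    current 'comment-<topicid>-<commentid>' key format."""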
    for k in cache.keys():
        key = k.decode('utf8')
        m = re_link.match(key)
        if m:
            newkey = 'comment-%d-%d' % (int(m.group(1)), int(m.group(2)))
            cache[newkey] = cache[k]
            del cache[k]

def drawonetopic(url, imgname, size=(1024,900), count=200, crop=False):
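    """Render a tag-cloud PNG for a single topic fetched earlier with
    fetchonetopic(); comments are read from topic-<id>.db."""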
    m = re_topicurl.match(url)
    if not m:
        error('invalid topic url')
        return

    topic_id = int(m.group(1))
    db_name = 'topic-%d.db' % topic_id
    if not exists(db_name):
        error('topic %d has not been fetched yet; run fetchonetopic() first' % topic_id)
        return
    db = shelve.open(db_name)

    f_comments = StringIO()

    for k in db.keys():
        if db[k]:
            content = db[k]
            content_text, _sign = commentparse(content)
            print >>f_comments, content_text

    content_comments = re_tag.sub('', f_comments.getvalue())

    try_comments = True

    counts_comments = get_tag_counts(content_comments)

    f_comments.close()

    if len(counts_comments) < 5:
        warning('zero comments')
        try_comments = False

    if try_comments:
        info('#top 30 comments#')
        tags_comments = make_tags(counts_comments)
        for tag in tags_comments[:30]:
            info('%s:%s' % ( tag['tag'].encode('utf8'), tag['size']))
        create_tag_image(
                tags_comments[:count], 
                '%s-topic-%d.png' % (imgname,topic_id),
                fontname='Ubuntu',
                background=(0, 0, 0, 255),
                size=size,
                crop=crop
                )

def fetchonetopic(url):
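    """Fetch all comments of a single topic given its full URL.

    Stores the raw HTML of every 'comment-<id>' message <div> in a shelve
    database named topic-<id>.db and returns that file name.
    """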
    m = re_topicurl.match(url)
    if not m:
        error('invalid topic url')
        return

    topic_id = int(m.group(1))
    db_name = 'topic-%d.db' % topic_id
    db = shelve.open(db_name)

    resp = urlopen(url)

    bs_resp = BeautifulSoup(resp.read())

    # links to the remaining pages of the topic; empty for a single-page topic
    pages = []
    tag_nav = bs_resp.find('div', {'class':'nav'}).findNext('div', {'class':'nav'})
    if tag_nav:
        pages = [a['href'] for a in tag_nav.findAll('a')]

    count = 0

    # The first page is already loaded, so parse it before following the
    # navigation links; this way every page of the topic is parsed exactly once.
    for page in [None] + pages:
        if page is not None:
            resp = urlopen(BASE_URL + page)
            bs_resp = BeautifulSoup(resp.read())
        tags_msg = bs_resp.findAll('div', {'class':'msg'})
        for tag in tags_msg:
            count += 1
            id = tag['id'].encode('utf8')
            if re_commentid.match(id):
                if not db.has_key(id):
                    db[id] = unicode(tag)
                    info('%d:%s add' % (count, id))
                else:
                    info('%d:%s exists' % (count, id))
    db.close()
    return db_name

if __name__ == '__main__':
    import sys
    if len(sys.argv) != 3:
        print "usage: %s <nick> <imgname>" % sys.argv[0]
        sys.exit(1)
    nick = sys.argv[1]    
    cachename = '%s.cache2' % nick
    imgname = sys.argv[2]
    if exists(cachename):
        print 'found existing cache for %s' % nick
    cache = shelve.open(cachename)
    fixoldcache(cache)
    parse_comment_links(nick, cache)
    topicsfetch(nick, cache)
    create_image(cache, imgname)
    cache.close()