Commits

hizel committed 07503c5

logging и fetchonetopic - будем строить облачка для отдельного топика

Comments (0)

Files changed (3)

 *.txt
 *.cache
 *.cache2
+*.db
 build/*
 dist/*
 LorCloud.egg-info/*
 from pytagcloud.lang.counter import get_tag_counts
 from StringIO import StringIO
 
+from logging import debug, info, error, warning
+
 import shelve
 import codecs
 import re
 re_quote = re.compile(r'^>')
 re_link  = re.compile('jump-message.jsp\?msgid=(\d+)&cid=(\d+)')
 re_tag   = re.compile(r'\&\w+\;')
+re_topicurl = re.compile('http://www.linux.org.ru/\w+/\w+/(\d+)')
+re_topicid = re.compile('topic-\d+')
+re_commentid = re.compile('comment-\d+')
 
 def topicsfetch(nick, cache):
     url = BASE_URL + 'people/%s/' % nick
             topics_page = br.open(url_offset).read()
         except HTTPError:
             if url_offset == url:
-                print 'nick %s invalid' % nick
+                warning('nick %s invalid' % nick)
             return
 
         soup_topics_page = BeautifulSoup(topics_page)
             if id:
                 count += 1
                 if not cache.has_key(id):
-                    print '%d:%s' % (count, id)
+                    info('%d:%s' % (count, id))
                     cache[id] = (unicode(topic), 0)
                 else:
-                    print '%d:%s exists' % (count, id)
+                    info('%d:%s exists' % (count, id))
 
         soup_nav = soup_topics_page.findAll('table', {'class': 'nav'})
         soup_nav_hrefs = soup_nav[1].findAll('a')
         msgid = m.group(1)
         cid = m.group(2)
     else:
-        print 'invalid lnk'
+        warning('invalid lnk')
         return
     url = BASE_URL+lnk    
     try:
         try:
             comments = br.open(url_offset).read()
         except HTTPError:
-            print 'nick %s invalid' % nick
+            error('nick %s invalid' % nick)
             return
         soup_comments = BeautifulSoup(comments)
         table = soup_comments.table
                 comment_id = int(m.group(2))
                 key = 'comment-%d-%d' % (topic_id, comment_id)
             else:
-                print 'invalid link'
+                warning('invalid link')
                 continue
 
             if not cache.has_key(key):
                     cache[key] = commentfetch(cl['href'])
                 except Exception:
                     cache[key] = ''
-                    print 'error parse %s' % cl['href']
-                print '%d: topic:%d comment:%d' % (count, topic_id, comment_id)
+                    warning('error parse %s' % cl['href'])
+                info('%d: topic:%d comment:%d' % (count, topic_id, comment_id))
             else:
-                print '%d: topic:%d comment:%d exists' % (count, topic_id, comment_id)
+                info('%d: topic:%d comment:%d exists' % (count, topic_id,
+                        comment_id))
 
         table_footer = table.tfoot
         navigate_links = table_footer.findAll('a')
         else:
             assert('strange page')
 
-    print 'parse %d comments' % count
+    info('parse %d comments' % count)
 
 def create_image(cache, imgname, size=(1024,900), count=200, crop=False):
     f_comments = StringIO()
     f_topics.close()
 
     if len(counts_comments) < 5:
-        print 'zero comments'
+        warning('zero comments')
         try_comments = False
     if len(counts_topics) < 5:
-        print 'zero topics'
+        warning('zero topics')
         try_topics = False
     if len(counts_all) < 5:
-        print 'zero'
+        warning('zero')
         try_all = False
 
 
     if try_comments:
-        print '#top 30 comments#'
-        print ''
+        info('#top 30 comments#')
         tags_comments = make_tags(counts_comments)
         for tag in tags_comments[:30]:
-            print '%s:%s' % ( tag['tag'].encode('utf8'), tag['size'])
+            info('%s:%s' % ( tag['tag'].encode('utf8'), tag['size']))
         create_tag_image(
                 tags_comments[:count], 
                 '%s.comments.png' % imgname,
                 )
 
     if try_topics:
-        print '#top 30 topics#'
-        print ''
+        info('#top 30 topics#')
         tags_topics = make_tags(counts_topics)
         for tag in tags_topics[:30]:
-            print '%s:%s' % ( tag['tag'].encode('utf8'), tag['size'])
+            info('%s:%s' % ( tag['tag'].encode('utf8'), tag['size']))
         create_tag_image(
                 tags_topics[:count], 
                 '%s.topics.png' % imgname,
                 )
 
     if try_all:
-        print '#top 30 topics+comments#'
-        print ''
+        info('#top 30 topics+comments#')
         tags_all = make_tags(counts_all)
         for tag in tags_all[:30]:
-            print '%s:%s' % ( tag['tag'].encode('utf8'), tag['size'])
+            info('%s:%s' % ( tag['tag'].encode('utf8'), tag['size']))
 
         create_tag_image(
                 tags_all[:count], 
             cache[newkey] = cache[k]
             del cache[k]
 
def fetchonetopic(url):
    """Fetch every comment of a single linux.org.ru topic into a shelve db.

    url must look like http://www.linux.org.ru/<section>/<group>/<id>
    (see re_topicurl). Comment divs (id matching re_commentid) are stored
    as unicode HTML under their div id.

    Returns the db filename ('topic-<id>.db'), or None when the url does
    not match re_topicurl.
    """
    m = re_topicurl.match(url)
    if not m:
        error('invalid topic url')
        return

    topic_id = int(m.group(1))
    db_name = 'topic-%d.db' % topic_id
    db = shelve.open(db_name)

    resp = urlopen(url)

    bs_resp = BeautifulSoup(resp.read())

    # Second div.nav on the page is the comment pager (the first is the
    # site navigation) — presumably; verify against current page markup.
    tag_nav = bs_resp.find('div', {'class':'nav'}).findNext('div', {'class':'nav'})

    # BUG FIX: default to a single page. The original only assigned
    # `pages` inside the `if tag_nav:` branch, so a topic without a pager
    # crashed with NameError at the loop below. The prototype version of
    # this function used pages = [''] for exactly this case.
    pages = ['']

    if tag_nav:
        tag_nav_a = tag_nav.findAll('a')
        pages = [a['href'] for a in tag_nav_a]

    count = 0

    # The page fetched before the loop is parsed on the first iteration;
    # each iteration then pre-fetches the next page at its end.
    for page in pages:
        tags_msg = bs_resp.findAll('div', {'class':'msg'})
        for tag in tags_msg:
            count += 1
            id = tag['id'].encode('utf8')
            if re_commentid.match(id):
                if not db.has_key(id):
                    db[id] = unicode(tag)
                    info('%d:%s add' % (count, id))
                else:
                    info('%d:%s exists' % (count, id))
        next_url = BASE_URL + page
        resp = urlopen(next_url)
        bs_resp = BeautifulSoup(resp.read())
    db.close()
    return db_name
+
 if __name__ == '__main__':
     import sys
     from os.path import exists as fexists
 #resp = urlopen(url)
 
 
+#import shelve
+#re_url = re.compile(r'jump-message.jsp\?msgid=(\d+)&cid=(\d+)')
+#cache = shelve.open('hizel.cache3')
+#for k in cache.keys():
+#    key = k.decode('utf8')
+#    m = re_url.match(key)
+#    if m:
+#        newkey = 'comment-%d-%d' % (int(m.group(1)), int(m.group(2)))
+#        cache[newkey] = cache[k]
+#        del cache[k]
+#        print '%s -> %s' % (k,newkey)
+
+from mechanize import urlopen
+from BeautifulSoup import BeautifulSoup
 import shelve
-re_url = re.compile(r'jump-message.jsp\?msgid=(\d+)&cid=(\d+)')
-cache = shelve.open('hizel.cache3')
-for k in cache.keys():
-    key = k.decode('utf8')
-    m = re_url.match(key)
-    if m:
-        newkey = 'comment-%d-%d' % (int(m.group(1)), int(m.group(2)))
-        cache[newkey] = cache[k]
-        del cache[k]
-        print '%s -> %s' % (k,newkey)
+
+url = 'http://www.linux.org.ru/news/games/5967940'
+url2 = 'http://www.linux.org.ru/forum/talks/6056345'
+url3 = 'http://www.linux.org.ru/forum/talks/6056282'
+re_topicurl = re.compile('http://www.linux.org.ru/\w+/\w+/(\d+)')
+re_topicid = re.compile('topic-\d+')
+re_commentid = re.compile('comment-\d+')
+
+
+def fetchonetopic(url):
+    m = re_topicurl.match(url)
+    if not m:
+        print 'invalid topic url'
+        return
+
+    topic_id = int(m.group(1))
+    db_name = 'topic-%d.db' % topic_id
+    db = shelve.open(db_name)
+    print topic_id
+
+    resp = urlopen(url)
+
+    bs_resp = BeautifulSoup(resp.read())
+
+    tag_nav = bs_resp.find('div', {'class':'nav'}).findNext('div', {'class':'nav'})
+
+    pages = ['']
+
+    if tag_nav:
+        tag_nav_a = tag_nav.findAll('a')
+        pages = [a['href'] for a in tag_nav_a]
+
+    print pages
+
+    base_url = 'http://www.linux.org.ru'
+
+    count = 0
+
+    for page in pages:
+        tags_msg = bs_resp.findAll('div', {'class':'msg'})
+        for tag in tags_msg:
+            count += 1
+            id = tag['id'].encode('utf8')
+            if re_commentid.match(id):
+                if not db.has_key(id):
+                    db[id] = unicode(tag)
+                    print '%d:%s add' % (count, id)
+                else:
+                    print '%d:%s exists' % (count, id)
+        next_url = base_url + page
+        resp = urlopen(next_url)
+        bs_resp = BeautifulSoup(resp.read())
+    db.close()
+    return db_name
+
+
# Ad-hoc driver: performs a live network fetch of one topic when this
# script runs. url/url2 left as alternative test topics.
#fetchonetopic(url)
#fetchonetopic(url2)
fetchonetopic(url3)