Commits

Anonymous committed 590166c

пора домой

  • Participants
  • Parent commits 416bc98

Comments (0)

Files changed (3)

File lorparser.py

 BASE_URL = 'http://www.linux.org.ru/'
 re_quote = re.compile('>')
 
+def commentfetch(lnk):
+    """Download one comment and return the markup of its body.
+
+    ``lnk`` is a site-relative link of the form
+    ``jump-message.jsp?msgid=<digits>&cid=<digits>``; it is appended to
+    BASE_URL to build the URL that is requested.
+
+    Returns the comment's ``msg_body message-w-userpic`` <div> converted
+    to unicode, or None when the link does not match the expected form,
+    the request raises HTTPError, or the page does not contain the
+    expected comment markup.
+    """
+    m = re.match(r'jump-message.jsp\?msgid=(\d+)&cid=(\d+)', lnk)
+    if not m:
+        print 'invalid lnk'
+        return None
+    cid = m.group(2)
+
+    try:
+        page = mechanize.urlopen(BASE_URL + lnk).read()
+    except HTTPError:
+        return None
+
+    bs_page = BeautifulSoup(page)
+    comment = bs_page.find(
+        'div', {'class': 'msg', 'id': 'comment-%d' % int(cid)})
+    if comment is None:
+        # The page has no <div> for this comment id.
+        return None
+
+    comment_div = comment.find('div', {'class': 'msg_body message-w-userpic'})
+    if comment_div is None:
+        # unicode(None) would hand the caller the literal text u'None'.
+        return None
+    return unicode(comment_div)
+
+def commentparse(comment):
+    """Split comment markup into its text and its signature.
+
+    ``comment`` is markup containing a ``msg_body message-w-userpic``
+    <div>, such as the value returned by commentfetch().
+
+    Returns a ``(text, signature)`` tuple:
+
+    * ``text`` -- the string nodes left in the body once every nested
+      <div> has been removed, one per line with trailing whitespace
+      stripped; nodes that re_quote matches at their start are skipped.
+    * ``signature`` -- the text of the ``sign`` <div> after its first
+      <a> element has been removed, stripped of surrounding whitespace.
+
+    A part that cannot be found (including an empty or None ``comment``)
+    is returned as an empty string instead of raising.
+    """
+    if not comment:
+        return (u'', u'')
+
+    bs_comment = BeautifulSoup(comment)
+    comment_div = bs_comment.find(
+        'div', {'class': 'msg_body message-w-userpic'})
+
+    # Look the signature up before the nested <div>s are detached below,
+    # while it is still reachable from the document root.
+    comment_sign = bs_comment.find('div', {'class': 'sign'})
+    if comment_sign is not None:
+        comment_sign_a = comment_sign.find('a')
+        if comment_sign_a is not None:
+            comment_sign_a.extract()
+
+    lines = []
+    if comment_div is not None:
+        # Drop every nested <div> together with its content so that only
+        # the comment's own text remains in the body.
+        for nested in comment_div.findAll('div'):
+            nested.extract()
+        for item in comment_div.recursiveChildGenerator():
+            if isinstance(item, unicode) and not re_quote.match(item):
+                lines.append(item.rstrip())
+    text_content = u'\n'.join(lines).rstrip()
+
+    # Read the signature text last: the <div> removal above also detaches
+    # any <div> nested inside the signature itself.
+    signature = u''
+    if comment_sign is not None:
+        signature = comment_sign.text.strip()
+    return (text_content, signature)
+
+
+
 def parse_comment(lnk):
     pat = re.compile('jump-message.jsp\?msgid=(\d+)&cid=(\d+)')
     m = pat.match(lnk)

File parsetest.py

 #!/usr/bin/env python
 
-from lorparser import parse_comment
+from lorparser import commentfetch, commentparse
 from StringIO import StringIO
 import re
 
 url_quote = 'jump-message.jsp?msgid=6032690&cid=6033193'
 url_code = 'jump-message.jsp?msgid=6019454&cid=6024269'
 url_quote_gt = 'jump-message.jsp?msgid=6019329&cid=6021135'
+url_title = 'jump-message.jsp?msgid=6044815&cid=6050597'
 
-url = url_code
+url = url_title
 
-text_content = parse_comment(url)
-
-print text_content.encode('utf8')
+comment_html = commentfetch(url)
+comment_parsed = commentparse(comment_html)
+print comment_parsed[0].encode('utf8'), '<->', comment_parsed[1].encode('utf8')

File pytagcloud/lang/stopwords.py

 import os
 import codecs
 
-ACTIVE_LISTS = ('russian, english')
+ACTIVE_LISTS = ('russian', 'english')
 
 class StopWords(object):
     
     def is_stop_word(self, word):
         if not self.language:
             raise LookupError("No language loaded")
-        return word in self.stop_words_lists[self.language]
+        for lang in ACTIVE_LISTS:
+            if word in self.stop_words_lists[lang]:
+                return True
+        return False
     
     def guess(self, words):
         currentWinner = None;