hizel committed a744c00

regexp в порядке и исключение quote только если ">" в начале строки

Comments (0)

Files changed (3)

 *.dot
 *.txt
 *.cache
+*.cache2
 build/*
 dist/*
 LorCloud.egg-info/*
 import shelve
 
 BASE_URL = 'http://www.linux.org.ru/'
-re_quote = re.compile('>')
+re_quote = re.compile(r'^>')
+re_link  = re.compile('jump-message.jsp\?msgid=(\d+)&cid=(\d+)')
+re_tag   = re.compile(r'\&\w+\;')
 
 def commentfetch(lnk):
-    pat = re.compile('jump-message.jsp\?msgid=(\d+)&cid=(\d+)')
-    m = pat.match(lnk)
+    m = re_link.match(lnk)
     if m:
         msgid = m.group(1)
         cid = m.group(2)
     f.close()
     return (text_content.rstrip(), comment_sign.text.lstrip().rstrip())
 
-
-
-def parse_comment(lnk):
-    pat = re.compile('jump-message.jsp\?msgid=(\d+)&cid=(\d+)')
-    m = pat.match(lnk)
-    if m:
-        msgid = m.group(1)
-        cid = m.group(2)
-    else:
-        print 'invalid lnk'
-        return
-    url = BASE_URL+lnk    
-    try:
-        comments = mechanize.urlopen(url).read()
-    except HTTPError:
-        return None
-
-    bs_comments = BeautifulSoup(comments)
-    comment = bs_comments.find('div', {'class' : 'msg', 'id' : 'comment-%d' % int(cid) })
-    comment_div = comment.find('div', {'class': 'msg_body message-w-userpic'})
-    content = comment_div
-    content_div = content.findAll('div')
-    # magic remove all <div>content</div> with content ^_^
-    [x.extract() for x in content_div] 
-    f = StringIO()
-    for item in content.recursiveChildGenerator():
-        if isinstance(item, unicode) and not re_quote.match(item):
-            print >>f, item.rstrip()
-    text_content = f.getvalue()
-    f.close()
-    return text_content.rstrip()
-
 def parse_comment_links(nick, cache):
     url = BASE_URL+'show-comments.jsp?nick='+nick
     deleted = 0
     print 'parse %d comments' % count
 
 def create_image(cache, imgname):
-    rep =re.compile(r'\&\w+\;')
     f = StringIO()
     for k in cache.keys():
         if cache[k]:
             content_text, time = commentparse(content)
             print >>f, content_text
     content = f.getvalue()
-    content = rep.sub('', content)
+    content = re_tag.sub('', content)
     counts = get_tag_counts(content)
     f.close()
     tags = make_tags(counts)
 url_quote_gt = 'jump-message.jsp?msgid=6019329&cid=6021135'
 url_title = 'jump-message.jsp?msgid=6044815&cid=6050597'
 
-url = url_code
+url = url_quote_gt
 
-comment_html = commentfetch(url)
+comment_html, cid = commentfetch(url)
 print comment_html.encode('utf8')
 comment_parsed = commentparse(comment_html)
 print comment_parsed[0].encode('utf8'), '<->', comment_parsed[1].encode('utf8')
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.