hizel avatar hizel committed 9ef4d65

stop по всем языкам, улучшен парсер комментов, теперь с датой, изменен кэш, сохраняется вся разметка

Comments (0)

Files changed (4)

     bs_comments = BeautifulSoup(comments)
     comment = bs_comments.find('div', {'class' : 'msg', 'id' : 'comment-%d' % int(cid) })
     comment_div = comment.find('div', {'class': 'msg_body message-w-userpic'})
-    return unicode(comment_div)
+    return (unicode(comment_div), int(cid))
 
 def commentparse(comment):
     bs_comment = BeautifulSoup(comment)    
     content_div = comment_div.findAll('div')
     comment_sign = bs_comment.find('div', {'class': 'sign'})
     comment_sign_a = comment_sign.find('a')
-    comment_sign_a.extract()
+    # remove datetime string
+    comment_sign_a.extract() 
+    comment_title = bs_comment.find('h2')
+    # remove title string
+    if comment_title:
+        comment_title.extract()
     # strip every <div> collected in content_div, together with its content
     [x.extract() for x in content_div] 
     f = StringIO()
             key = cl['href'].encode('utf8')
             if not cache.has_key(key):
                 try:
-                    cache[key] = parse_comment(cl['href'])
+                    cache[key] = commentfetch(cl['href'])
                 except Exception:
                     cache[key] = ''
                     print 'error parse %s' % cl['href']
     rep =re.compile(r'\&\w+\;')
     f = StringIO()
     for k in cache.keys():
-        print >>f, cache[k]
+        if cache[k]:
+            content, cid = cache[k]
+            content_text, time = commentparse(content)
+            print >>f, content_text
     content = f.getvalue()
     content = rep.sub('', content)
     counts = get_tag_counts(content)
         print "usage:%s <nick> <imgname>.png" % (sys.argv[0]) 
         sys.exit(1)
     nick = sys.argv[1]    
-    cachename = '%s.cache' % nick
+    cachename = '%s.cache2' % nick
     imgname = sys.argv[2]
     if fexists(cachename):
         print 'found %s cache' % nick
 url_quote_gt = 'jump-message.jsp?msgid=6019329&cid=6021135'
 url_title = 'jump-message.jsp?msgid=6044815&cid=6050597'
 
-url = url_title
+url = url_code
 
 comment_html = commentfetch(url)
+print comment_html.encode('utf8')
 comment_parsed = commentparse(comment_html)
 print comment_parsed[0].encode('utf8'), '<->', comment_parsed[1].encode('utf8')

pytagcloud/lang/counter.py

     words = map(lambda x:x.group(0).lower(), 
             re.finditer(WORD, text, re.UNICODE))
     
-    s = StopWords()     
-    s.load_language(s.guess(words))
+    s = StopWords( ('russian', 'english') )     
     
     counted = {}
     

pytagcloud/lang/stopwords.py

 import os
 import codecs
 
-ACTIVE_LISTS = ('russian', 'english')
-
 class StopWords(object):
     
-    def __init__(self):
+    def __init__(self, active_list):
         
-        self.stop_words_lists = {}
+        self.stop_words = {}
         self.language = None
+        self.active_list = active_list
         
         stop_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'stop')
         
         for root, dirs, files in os.walk(stop_dir):
             for file in files:
-                if not file in ACTIVE_LISTS:
+                if not file in active_list:
                     continue
                 stop_file = codecs.open(os.path.join(os.path.dirname(os.path.abspath(__file__)), 'stop/', file), encoding='utf-8')
-                self.stop_words_lists[file] = []                
                 for stop_word in stop_file:
-                    self.stop_words_lists[file].append(stop_word.strip().lower())
+                    self.stop_words[stop_word.rstrip()] = 1
                 stop_file.close()
 
-    def load_language(self, language):
-        self.language = language
-                    
     def is_stop_word(self, word):
-        if not self.language:
-            raise LookupError("No language loaded")
-        for lang in ACTIVE_LISTS:
-            if word in self.stop_words_lists[lang]:
-                return True
-        return False
+        return word in self.stop_words;
     
-    def guess(self, words):
-        currentWinner = None;
-        currentMax = 0;
-        
-        for language, stop_word_list in self.stop_words_lists.items():
-            count = 0
-            for word in words:
-                if word in stop_word_list:
-                    count += 1
-                    
-            if count > currentMax:
-                currentWinner = language
-                currentMax = count
-        
-        return currentWinner
-    
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.