Commits

David Gero committed f009e37 Draft

[a] WordsCounter

Comments (0)

Files changed (1)

jozsefattilafrommek.py

 from BeautifulSoup import BeautifulSoup
 from lxml import etree
 import re
+import nltk
 
 class Tisztitas(object):
 
         def getTitle(self, text):
                 return text.find('a', attrs={'name' : True}).string
         
-        def getNumberOfChars(self,line):
-                chars_number = len(line)
-                return str(chars_number)
-        
-        def getNumberOfCharsWithoutBlank(self, line):
-                without_blank = re.sub(r'\s', '', line)
-                chars_number = len(without_blank)
-                return str(chars_number)
         
         def getWords(self, words):
                 return words.split(" ")
                 
-                                
+        
+class CharsCounter(object):
+        """
+        Count the characters of a piece of text.
+
+        Both counters return the count as a string, not an int; the caller
+        visible in this file assigns the results straight to `attrib` entries.
+        """
+
+        def __init__(self, chars):
+                """
+                chars -- the text to measure (must work with len() and re.sub()).
+                """
+                self.chars = chars
+
+        def getNumberOfChars(self):
+                """Return the full length of the text, whitespace included, as a string."""
+                return str(len(self.chars))
+
+        def getNumberOfCharsWithoutBlank(self):
+                """Return the length of the text with all whitespace removed, as a string."""
+                # \s removes every whitespace character (tabs, newlines, ...), not only ' '.
+                without_blank = re.sub(r'\s', '', self.chars)
+                return str(len(without_blank))
+
+class SentenceSplitter(object):
+        """
+        Split a sentence into word tokens with nltk.word_tokenize.
+
+        The sentence is tokenized once in the constructor and the resulting
+        list is kept in self.word_list for the query methods below.
+        """
+
+        def __init__(self, sentence):
+                """
+                sentence -- the text to tokenize.
+                """
+                # TODO: validate that sentence is a string (isinstance check).
+                self.sentence = sentence
+                self.word_list = self.tokenizeSentence()
+
+        def getWordTokenizeSentence(self):
+                """Tokenize the sentence again and return the fresh token list."""
+                # Delegate so the tokenizing strategy is defined in one place only.
+                return self.tokenizeSentence()
+
+        def tokenizeSentence(self):
+                """Return the tokens nltk.word_tokenize produces for the sentence."""
+                return nltk.word_tokenize(self.sentence)
+
+        def getWordsList(self):
+                """Return the token list stored at construction time."""
+                return self.word_list
+
+        def getWordsWithCount(self):
+                """
+                Return a WordsWithCount pairing every stored token with its
+                whitespace-free character count (a string, from CharsCounter).
+                """
+                words_with_count = WordsWithCount()
+                for word in self.getWordsList():
+                        count = CharsCounter(word).getNumberOfCharsWithoutBlank()
+                        words_with_count.addNewItem(word, count)
+                return words_with_count
+
+        def getWordsCount(self):
+                """Return the number of stored tokens as a string."""
+                return str(len(self.word_list))
+                        
+class WordsWithCount(object):
+        """
+        Container of (word, count) pairs kept in two parallel lists.
+
+        Pairs are appended with addNewItem(). They can be read back either
+        non-destructively by iterating the object, or destructively with
+        getNext(), which removes the most recently added pair.
+        """
+
+        def __init__(self):
+                self._word = list()
+                self._count = list()
+
+        def addNewItem(self, word, count):
+                """Append word and its count as a new pair."""
+                self._word.append(word)
+                self._count.append(count)
+
+        def _checkLenEqual(self):
+                """Return True when both parallel lists hold the same number of items."""
+                # Fixed: the original read `self_word`, an undefined name (NameError).
+                return len(self._word) == len(self._count)
+
+        def getNext(self):
+                """
+                Remove and return the last stored pair as a (word, count) tuple.
+
+                Raises IndexError (from list.pop) when the container is empty.
+                """
+                return self._popWord(), self._popCount()
+
+        def _popWord(self):
+                """Remove and return the last stored word."""
+                return self._word.pop()
+
+        def _popCount(self):
+                """Remove and return the last stored count."""
+                return self._count.pop()
+
+        def __iter__(self):
+                """Return an iterator yielding (word, count) pairs in insertion order."""
+                return _WordsWithCountIterator(self._word, self._count)
+        
+class _WordsWithCountIterator(object):
+        """
+        Iterator over two parallel lists, yielding (word, count) tuples.
+
+        The lists are referenced, not copied, so items appended while the
+        iteration is still running are yielded as well.
+        """
+
+        def __init__(self, word, count):
+                """
+                word  -- list of words.
+                count -- list of counts, index-aligned with word.
+                """
+                self._word = word
+                self._count = count
+                self._cur_item = 0
+
+        def __iter__(self):
+                return self
+
+        def next(self):
+                """
+                Return the next (word, count) tuple; raise StopIteration at the end.
+
+                The end is decided by the length of the word list alone, so a
+                shorter count list raises IndexError instead.
+                """
+                if self._cur_item >= len(self._word):
+                        raise StopIteration
+                word = self._word[self._cur_item]
+                count = self._count[self._cur_item]
+                self._cur_item += 1
+                return word, count
+
+        # PEP 3114 renamed iterator.next() to iterator.__next__(); providing
+        # both names lets the iterator work under either protocol.
+        __next__ = next
         
 class Stemmer():
         """
                                                 pass
                                         else:
                                                 text = re.sub("&amp;nbsp;", "", parag)
-                                                #line = self.tb.addElement(lg,"l")
-                                                #self.tb.addWords(line, self.par.getWords())
-                                                line = self.tb.addElementWithText(lg,"l", text)
-                                                line.attrib['count_with_blank'] = self.par.getNumberOfChars(text)
-                                                line.attrib['count_without_blank'] = self.par.getNumberOfCharsWithoutBlank(text)
+                                                line = self.tb.addElement(lg,"l")
+                                                for word, count in SentenceSplitter(text).getWordsWithCount():
+                                                        w = self.tb.addElementWithText(line, "w", word)
+                                                        w.attrib['count_without_blank'] = count
+                                                line.attrib['count_with_blank'] = CharsCounter(text).getNumberOfChars()
+                                                line.attrib['count_without_blank'] = CharsCounter(text).getNumberOfCharsWithoutBlank()
+                                                line.attrib['words_count'] = SentenceSplitter(text).getWordsCount()
                                               
                 self.tb.showResult()