David Gero avatar David Gero committed 57ca0a2 Draft

add w tag

Comments (0)

Files changed (1)

jozsefattilafrommek.py

         def setAttrib(self, attr, value):
                 """Store ``value`` under key ``attr`` on the element from getBeforeElement().

                 The element returned by ``self.getBeforeElement()`` is also kept
                 on ``self.element``.
                 """
                 target = self.getBeforeElement()
                 self.element = target
                 target[attr] = value
+                
+        def addWords(self, line, words_list):
+                """Add one "w" child element to ``line`` for every non-empty word.
+
+                Each non-empty string in ``words_list`` is passed to
+                ``self.addElementWithText(line, "w", word)``. Afterwards the number
+                of words added is stored, as a string, in
+                ``line.attrib['words_count']``.
+                """
+                # The counter must exist before the loop; it also has to be 0
+                # when words_list is empty or contains only empty strings.
+                n = 0
+                for word in words_list:
+                        if word != "":
+                                n += 1
+                                self.addElementWithText(line, "w", word)
+                                # TODO: set a per-word 'count_without_blank' attribute on the
+                                # new "w" element; this needs access to the parser object
+                                # (its getNumberOfCharsWithoutBlank), which is not available here.
+                line.attrib['words_count'] = str(n)
+                
         
         def showResult(self):
                 print(self.et.tostring(self.getRoot(), encoding='utf-8',  pretty_print=True))
                 chars_number = len(without_blank)
                 return str(chars_number)
         
-
+        def getWords(self, words):
+                """Split ``words`` on single space characters and return the list of pieces.
+
+                Consecutive spaces yield empty strings in the result.
+                """
+                separator = " "
+                pieces = words.split(separator)
+                return pieces
+                
+                                
+        
+class Stemmer():
+        """
+        Snowball stemmer wrapper configured for Hungarian.
+
+        TODO: attach the stem to the corresponding "w" tag.
+        Needs ``from nltk import SnowballStemmer`` to be in scope.
+        """
+        def __init__(self, word=None):
+                """Remember ``word`` and build the Snowball stemmer.
+
+                ``word`` is optional so that a bare ``Stemmer()`` call keeps working.
+                """
+                # NOTE(review): SnowballStemmer is not imported in the visible part of
+                # this file (the import is commented out) -- confirm it is imported
+                # at module level, otherwise this raises NameError.
+                #from nltk import SnowballStemmer
+                self.word = word
+                self.language = 'hungarian'
+                self.stemmer = SnowballStemmer(self.language)
+
+        def getSnowBallStemmer(self):
+                """Return the stemmer object created in the constructor."""
+                return self.stemmer
+
+        def getStem(self, word):
+                """Return the result of calling the stemmer's ``stem`` on ``word``."""
+                return self.getSnowBallStemmer().stem(word)
+        
+class HunMorph():
+        """
+        The HunMorph morphological analyzer. It is both a stemmer and a tagger,
+        so it could replace Snowball, but it is terribly slow!
+        """
+        def __init__(self):
+                """Placeholder constructor; performs no initialisation yet."""
+        
 class FromMEK():
         def __init__(self, url, root_element="versek"):
                 self.url = url
                                                 pass
                                         else:
                                                 text = re.sub(" ", "", parag)
-                                                line = self.tb.addElementWithText(lg, "l", text)
+                                                #line = self.tb.addElement(lg,"l")
+                                                #self.tb.addWords(line, self.par.getWords())
+                                                line = self.tb.addElementWithText(lg,"l", text)
                                                 line.attrib['count_with_blank'] = self.par.getNumberOfChars(text)
                                                 line.attrib['count_without_blank'] = self.par.getNumberOfCharsWithoutBlank(text)
-                        
+                                              
                 self.tb.showResult()       
                         
 
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.