1. Mikhail Korobov
  2. pymorphy2

Commits

Mikhail Korobov  committed 5ec3f91

experiment: remove normal_form

  • Participants
  • Parent commits 6cfef84
  • Branches no-normal-forms

Comments (0)

Files changed (2)

File pymorphy2/analyzer.py

View file
  • Ignore whitespace
 
 logger = logging.getLogger(__name__)
 
-_Parse = collections.namedtuple('Parse', 'word, tag, normal_form, para_id, idx, estimate')
+_Parse = collections.namedtuple('Parse', 'word, tag, para_id, idx, estimate')
 
 class Parse(_Parse):
     """
     _morph = None
     _dict = None
 
+    _normal_form = None
+
     def inflect(self, required_grammemes):
         res = self._morph._inflect(self, required_grammemes)
         return None if not res else res[0]
             return self
 
         tag = self._dict.build_tag_info(self.para_id, 0)
-        return self.__class__(self.normal_form, tag, self.normal_form,
-                              self.para_id, 0, self.estimate)
+        return self.__class__(self.normal_form, tag, self.para_id, 0, self.estimate)
+
+    @property
+    def normal_form(self):
+        """ Word normal form (as text) """
+        if self.idx == 0:
+            return self.word
+
+        if self._normal_form is not None:
+            return self._normal_form
+
+        normal_form = self._dict.build_normal_form(self.para_id, self.idx, self.word)
+        self._normal_form = normal_form
+        return normal_form
 
     @property
     def paradigm(self):
         return self._dict.build_paradigm_info(self.para_id)
 
+    # def __repr__(self):
+    #     'Return a nicely formatted representation string'
+    #     return self.__class__.__name__ + '({repr_fmt})' % self
+
 
 class Dictionary(object):
     """
         Parse a word using this dictionary.
         """
         res = []
-        para_normal_forms = {}
+        #para_normal_forms = {}
         para_data = self.words.similar_items(word, self.ee)
 
         for fixed_word, parses in para_data:
             # `fixed_word` is a word with proper ё letters
             for para_id, idx in parses:
 
-                if para_id not in para_normal_forms:
-                    normal_form = self.build_normal_form(para_id, idx, fixed_word)
-                    para_normal_forms[para_id] = normal_form
-                else:
-                    normal_form = para_normal_forms[para_id]
+                # if para_id not in para_normal_forms:
+                #     normal_form = self.build_normal_form(para_id, idx, fixed_word)
+                #     para_normal_forms[para_id] = normal_form
+                # else:
+                #     normal_form = para_normal_forms[para_id]
 
                 tag = self.build_tag_info(para_id, idx)
 
                 res.append(
-                    (fixed_word, tag, normal_form, para_id, idx, 1.0)
+                    (fixed_word, tag, para_id, idx, 1.0)
                 )
 
         return res
         seen_paradigms = set()
         result = []
 
-        for fixed_word, tag, normal_form, para_id, idx, estimate in word_parses:
+        for fixed_word, tag, para_id, idx, estimate in word_parses:
             if para_id in seen_paradigms:
                 continue
             seen_paradigms.add(para_id)
                 # XXX: what to do with estimate?
                 # XXX: do we need all info?
                 result.append(
-                    (word, _tag, normal_form, para_id, index, estimate)
+                    (word, _tag, para_id, index, estimate)
                 )
 
         return result
         """
         for word, (para_id, idx) in self.words.iteritems(prefix):
             tag = self.build_tag_info(para_id, idx)
-            normal_form = self.build_normal_form(para_id, idx, word)
-            yield (word, tag, normal_form, para_id, idx, 1.0)
+            #normal_form = self.build_normal_form(para_id, idx, word)
+            yield (word, tag, para_id, idx, 1.0)
 
 
 
         """
         seen = set()
         result = []
-        for fixed_word, tag, normal_form, para_id, idx, estimate in self.parse(word):
+        for fixed_word, tag, para_id, idx, estimate in self.parse(word):
+            normal_form = self.dictionary.build_normal_form(para_id, idx, fixed_word)
             if normal_form not in seen:
                 result.append(normal_form)
                 seen.add(normal_form)
 
         def weigth(parse):
             # order by (probability, index in lexeme)
-            return -parse[5], parse[4]
+            return -parse[4], parse[3]
 
         result = []
         seen = set()

File pymorphy2/predictors.py

View file
  • Ignore whitespace
             if len(unprefixed_word) < self.MIN_REMINDER_LENGTH:
                 continue
 
-            for fixed_word, tag, normal_form, para_id, idx, estimate in self.morph.parse(unprefixed_word):
+            for fixed_word, tag, para_id, idx, estimate in self.morph.parse(unprefixed_word):
 
                 if not tag.is_productive():
                     continue
 
-                parse = (prefix+fixed_word, tag, prefix+normal_form, para_id, idx, estimate*self.ESTIMATE_DECAY)
+                parse = (prefix+fixed_word, tag, para_id, idx, estimate*self.ESTIMATE_DECAY)
                 _add_parse_if_not_seen(parse, result, seen_parses)
 
         return result
     def parse(self, word, seen_parses):
         result = []
         for prefix, unprefixed_word in word_splits(word):
-            for fixed_word, tag, normal_form, para_id, idx, estimate in self.dict.parse(unprefixed_word):
+            for fixed_word, tag, para_id, idx, estimate in self.dict.parse(unprefixed_word):
 
                 if not tag.is_productive():
                     continue
 
-                parse = (prefix+fixed_word, tag, prefix+normal_form, para_id, idx, estimate*self.ESTIMATE_DECAY)
+                parse = (prefix+fixed_word, tag, para_id, idx, estimate*self.ESTIMATE_DECAY)
                 _add_parse_if_not_seen(parse, result, seen_parses)
 
         return result
                         total_counts[prefix_id] += cnt
 
                         fixed_word = word[:-i] + fixed_suffix
-                        normal_form = self.dict.build_normal_form(para_id, idx, fixed_word)
+                        # normal_form = self.dict.build_normal_form(para_id, idx, fixed_word)
 
-                        parse = (cnt, fixed_word, tag, normal_form, para_id, idx, prefix_id)
+                        parse = (cnt, fixed_word, tag, para_id, idx, prefix_id)
                         reduced_parse = parse[1:4]
                         if reduced_parse in seen_parses:
                             continue
                     break
 
         result = [
-            (fixed_word, tag, normal_form, para_id, idx, cnt/total_counts[prefix_id] * self.ESTIMATE_DECAY)
-            for (cnt, fixed_word, tag, normal_form, para_id, idx, prefix_id) in result
+            (fixed_word, tag, para_id, idx, cnt/total_counts[prefix_id] * self.ESTIMATE_DECAY)
+            for (cnt, fixed_word, tag, para_id, idx, prefix_id) in result
         ]
-        result.sort(key=operator.itemgetter(5), reverse=True)
+        result.sort(key=operator.itemgetter(4), reverse=True)
         return result
 
 
             if not unsuffixed_word:
                 continue
 
-            for fixed_word, tag, normal_form, para_id, idx, estimate in self.morph.parse(unsuffixed_word):
-                parse = (fixed_word, tag, normal_form, para_id, idx, estimate*self.ESTIMATE_DECAY)
+            for fixed_word, tag, para_id, idx, estimate in self.morph.parse(unsuffixed_word):
+                parse = (fixed_word, tag, para_id, idx, estimate*self.ESTIMATE_DECAY)
                 _add_parse_if_not_seen(parse, result, seen_parses)
 
             # If a word ends with one of the particles,