Commits

Mikhail Korobov committed fbdc1b0

return P(t|w) instead of old estimate; sort results according to P(t|w); bump requirements

  • Participants
  • Parent commit 74ab4e2

Comments (0)

Files changed (13)

 deps =
     pytest
     psutil
-    pymorphy2-dicts >=2.2, <3.0
+    pymorphy2-dicts >=2.4, <3.0
 
 [testenv]
 deps=
-    dawg >= 0.5
+    dawg > 0.7
     {[base]deps}
 
 ;setenv =
 ;    PYMORPHY2_DICT_PATH = ../pymorphy2-dicts/pymorphy2_dicts/data
 
 commands=
+    python setup.py install
     pymorphy dict mem_usage
     python benchmarks/bench.py run []
 
 [testenv:py26]
 deps=
     futures
-    dawg >= 0.5
+    dawg > 0.7
     {[base]deps}
 
 [testenv:py27]
 deps=
     futures
-    dawg >= 0.5
+    dawg > 0.7
     {[base]deps}
 
 [testenv:pypy]
 deps=
     futures
-    dawg-python >= 0.5
+    dawg-python >= 0.7
     {[base]deps}
 
 commands=
 basepython=python2.7
 deps=
     futures
-    dawg-python >= 0.5
+    dawg-python >= 0.7
     {[base]deps}
 
 
 [testenv:py27_no_compiler]
 basepython = python2.7
 deps=
-    dawg-python >= 0.5
+    dawg-python >= 0.7
     {[base]deps}
 
 [testenv:py33_no_compiler]
 basepython = python3.3
 deps=
-    dawg-python >= 0.5
+    dawg-python >= 0.7
     {[base]deps}

pymorphy2/analyzer.py

 import collections
 import logging
 import threading
+import operator
 
 from pymorphy2 import opencorpora_dict
 from pymorphy2 import units
+from pymorphy2.dawg import ConditionalProbDistDAWG
 
 logger = logging.getLogger(__name__)
 
-_Parse = collections.namedtuple('Parse', 'word, tag, normal_form, estimate, methods_stack')
+_Parse = collections.namedtuple('Parse', 'word, tag, normal_form, score, methods_stack')
+
+_score_getter = operator.itemgetter(3)
 
 class Parse(_Parse):
     """
     @property
     def is_known(self):
         """ True if this form is a known dictionary form. """
-        # return self.estimate == 1?
         return self._dict.word_is_known(self.word, strict_ee=True)
 
     @property
     #     return self._dict.build_paradigm_info(self.para_id)
 
 
+class SingleTagProbabilityEstimator(object):
+    def __init__(self, dict_path):
+        cpd_path = os.path.join(dict_path, 'p_t_given_w.intdawg')
+        self.p_t_given_w = ConditionalProbDistDAWG().load(cpd_path)
+
+    def apply_to_parses(self, word, word_lower, parses):
+        if not parses:
+            return parses
+
+        probs = [self.p_t_given_w.prob(word_lower, tag)
+                for (word, tag, normal_form, score, methods_stack) in parses]
+
+        if sum(probs) == 0:
+            # no P(t|w) information is available; return normalized estimate
+            k = 1.0 / sum(map(_score_getter, parses))
+            return [
+                (word, tag, normal_form, score*k, methods_stack)
+                for (word, tag, normal_form, score, methods_stack) in parses
+            ]
+
+        # replace score with P(t|w) probability
+        return sorted([
+            (word, tag, normal_form, prob, methods_stack)
+            for (word, tag, normal_form, score, methods_stack), prob
+            in zip(parses, probs)
+        ], key=_score_getter, reverse=True)
+
+    def apply_to_tags(self, word, word_lower, tags):
+        if not tags:
+            return tags
+        return sorted(tags,
+            key=lambda tag: self.p_t_given_w.prob(word_lower, tag),
+            reverse=True
+        )
+
+
+class DummySingleTagProbabilityEstimator(object):
+    def __init__(self, dict_path):
+        pass
+
+    def apply_to_parses(self, word, word_lower, parses):
+        return parses
+
+    def apply_to_tags(self, word, word_lower, tags):
+        return tags
+
 
 class MorphAnalyzer(object):
     """
         units.KnownSuffixAnalyzer,
     ]
 
-    def __init__(self, path=None, result_type=Parse, units=None):
+    def __init__(self, path=None, result_type=Parse, units=None,
+                 probability_estimator_cls=SingleTagProbabilityEstimator):
         path = self.choose_dictionary_path(path)
         with threading.RLock():
             self.dictionary = opencorpora_dict.Dictionary(path)
+            if probability_estimator_cls is None:
+                probability_estimator_cls = DummySingleTagProbabilityEstimator
+            self.prob_estimator = probability_estimator_cls(path)
 
             if result_type is not None:
                 # create a subclass with the same name,
             self._unit_classes = units
             self._units = [cls(self) for cls in units]
 
-
     @classmethod
     def choose_dictionary_path(cls, path=None):
         if path is not None:
                    "or set %s environment variable.") % cls.ENV_VARIABLE
             raise ValueError(msg)
 
-
     def parse(self, word):
         """
         Analyze the word and return a list of :class:`pymorphy2.analyzer.Parse`
         namedtuples:
 
-            Parse(word, tag, normal_form, para_id, idx, _estimate)
+            Parse(word, tag, normal_form, para_id, idx, _score)
 
         (or plain tuples if ``result_type=None`` was used in constructor).
         """
             if res and analyzer.terminal:
                 break
 
+        res = self.prob_estimator.apply_to_parses(word, word_lower, res)
+
         if self._result_type is None:
             return res
 
         return [self._result_type(*p) for p in res]
 
-
     def tag(self, word):
         res = []
         seen = set()
             if res and analyzer.terminal:
                 break
 
-        return res
-
+        return self.prob_estimator.apply_to_tags(word, word_lower, res)
 
     def normal_forms(self, word):
         """
             return result
         return [self._result_type(*p) for p in result]
 
-
     def _inflect(self, form, required_grammemes):
         possible_results = [f for f in self.get_lexeme(form)
                             if required_grammemes <= f[1].grammemes]
 
         return heapq.nlargest(1, possible_results, key=similarity)
 
-
     # ====== misc =========
 
     def iter_known_word_parses(self, prefix=""):
             else:
                 yield self._result_type(*parse)
 
-
     def word_is_known(self, word, strict_ee=False):
         """
         Check if a ``word`` is in the dictionary.
         """
         return self.dictionary.word_is_known(word.lower(), strict_ee)
 
-
     @property
     def TagClass(self):
         """
         """
         return self.dictionary.Tag
 
-
     def __reduce__(self):
         args = (self.dictionary.path, self._result_type_orig, self._unit_classes)
         return self.__class__, args, None
+
+
 
 def estimate_tag_cpd(corpus_filename, out_path, min_word_freq, update_meta=True):
     from pymorphy2.opencorpora_dict.probability import (estimate_conditional_tag_probability, build_cpd_dawg)
-    m = pymorphy2.MorphAnalyzer(out_path)
+    m = pymorphy2.MorphAnalyzer(out_path, probability_estimator_cls=None)
 
     logger.info("Estimating P(t|w) from %s" % (corpus_filename))
     cpd, cfd = estimate_conditional_tag_probability(m, corpus_filename)

pymorphy2/dawg.py

 from __future__ import absolute_import, division
 
 try:
-    from dawg import DAWG, RecordDAWG, IntDAWG
+    from dawg import DAWG, RecordDAWG, IntCompletionDAWG
     CAN_CREATE = True
 
 except ImportError:
-    from dawg_python import DAWG, RecordDAWG, IntDAWG
+    from dawg_python import DAWG, RecordDAWG, IntCompletionDAWG
     CAN_CREATE = False
 
 def assert_can_create():
     DATA_FORMAT = str(">HHH")
 
 
-class ConditionalProbDistDAWG(IntDAWG):
+class ConditionalProbDistDAWG(IntCompletionDAWG):
 
     MULTIPLIER = 1000000
 

pymorphy2/opencorpora_dict/storage.py

 import itertools
 import array
 import struct
-import threading
 
 try:
     izip = itertools.izip
 
 logger = logging.getLogger(__name__)
 
-CURRENT_FORMAT_VERSION = '2.3'
+CURRENT_FORMAT_VERSION = '2.4'
 
 LoadedDictionary = collections.namedtuple('LoadedDictionary', [
     'meta', 'gramtab', 'suffixes', 'paradigms', 'words',
     prediction_suffixes_dawgs = []
     for prefix_id in range(len(paradigm_prefixes)):
         fn = _f('prediction-suffixes-%s.dawg' % prefix_id)
-
         assert os.path.exists(fn)
-
         prediction_suffixes_dawgs.append(dawg.PredictionSuffixesDAWG().load(fn))
 
     return LoadedDictionary(meta, gramtab, suffixes, paradigms, words,

pymorphy2/opencorpora_dict/wrapper.py

         tag_id = paradigm[tag_info_offset + idx]
         return self.gramtab[tag_id]
 
-
     def build_paradigm_info(self, para_id):
         """
         Return a list of
             )
         return res
 
-
     def build_normal_form(self, para_id, idx, fixed_word):
         """
         Build a normal form.
 
         return normal_prefix + stem + normal_suffix
 
-
     def build_stem(self, paradigm, idx, fixed_word):
         """
         Return word stem (given a word, paradigm and the word index).
         else:
             return fixed_word[len(prefix):]
 
-
     def word_is_known(self, word, strict_ee=False):
         """
         Check if a ``word`` is in the dictionary.
         else:
             return bool(self.words.similar_keys(word, self.ee))
 
-
     def iter_known_words(self, prefix=""):
         """
         Return an iterator over ``(word, tag, normal_form, para_id, idx)``

pymorphy2/units/by_analogy.py

                                    without_fixed_prefix, with_prefix)
 from pymorphy2.utils import word_splits
 
-
+_cnt_getter = operator.itemgetter(3)
 
 class _PrefixAnalyzer(AnalogyAnalizerUnit):
 
             (fixed_word, tag, normal_form, cnt/total_counts[prefix_id] * self.ESTIMATE_DECAY, methods_stack)
             for (cnt, fixed_word, tag, normal_form, prefix_id, methods_stack) in result
         ]
-        result.sort(key=operator.itemgetter(3), reverse=True)
+        result.sort(key=_cnt_getter, reverse=True)
         return result
 
 

pymorphy2/units/by_lookup.py

 logger = logging.getLogger(__name__)
 
 
-
 class DictionaryAnalyzer(BaseAnalyzerUnit):
     """
     Analyzer unit that analyzes word using dictionary.
 
                 tag = self.dict.build_tag_info(para_id, idx)
                 method = ((self, fixed_word, para_id, idx),)
-
                 res.append((fixed_word, tag, normal_form, 1.0, method))
 
         # res.sort(key=lambda p: len(p[1]))  #  prefer simple parses
         """
         Return a lexeme (given a parsed word).
         """
-        fixed_word, tag, normal_form, estimate, methods_stack = form
+        fixed_word, tag, normal_form, score, methods_stack = form
         _, para_id, idx = self._extract_para_info(methods_stack)
 
         _para = self.dict.paradigms[para_id]
         for index, (_prefix, _tag, _suffix) in enumerate(paradigm):
             word = _prefix + stem + _suffix
             new_methods_stack = self._fix_stack(methods_stack, word, para_id, index)
-            parse = (word, _tag, normal_form, estimate, new_methods_stack)
+            parse = (word, _tag, normal_form, 1.0, new_methods_stack)
             result.append(parse)
 
         return result
 
     def normalized(self, form):
-        fixed_word, tag, normal_form, estimate, methods_stack = form
+        fixed_word, tag, normal_form, score, methods_stack = form
         original_word, para_id, idx = self._extract_para_info(methods_stack)
 
         if idx == 0:
         tag = self.dict.build_tag_info(para_id, 0)
         new_methods_stack = self._fix_stack(methods_stack, normal_form, para_id, 0)
 
-        return (normal_form, tag, normal_form, estimate, new_methods_stack)
+        return (normal_form, tag, normal_form, 1.0, new_methods_stack)
 
     def _extract_para_info(self, methods_stack):
         # This method assumes that DictionaryAnalyzer is the first
 dawg-python >= 0.7
-pymorphy2-dicts >= 2.2, < 3.0
+pymorphy2-dicts >= 2.4, < 3.0
         'pymorphy2.opencorpora_dict',
     ],
     scripts=['bin/pymorphy'],
-    requires=['dawg_python (>= 0.7)', 'pymorphy2_dicts (>2.0, <3.0)'],
+    requires=['dawg_python (>= 0.7)', 'pymorphy2_dicts (>=2.4, <3.0)'],
 
 #    cmdclass = {'build_ext': build_ext},
 #    ext_modules = [Extension("pymorphy2.analyzer", ["pymorphy2/analyzer.py"])],

tests/test_inflection.py

 
 
 @with_test_data([
-    ('валенок', ['gent'], 'валенка'),
-    ('валенок', ['gen2'], 'валенка'),  # there is no gen2
+    ('валенок', ['gent', 'sing'], 'валенка'),
+    ('валенок', ['gen2', 'sing'], 'валенка'),  # there is no gen2
     ('велосипед', ['loct'], 'велосипеде'), # о велосипеде
     ('велосипед', ['loc2'], 'велосипеде'), # а тут второго предложного нет, в велосипеде
     ('хомяк', ['voct'], 'хомяк'),        # there is not voct, nomn should be used

tests/test_opencorpora_dict.py

                              prediction_options=options)
 
         # use it
-        morph = pymorphy2.MorphAnalyzer(out_path)
+        morph = pymorphy2.MorphAnalyzer(out_path, probability_estimator_cls=None)
         assert morph.tag('ёжиться') == [morph.TagClass('INFN,impf,intr')]
 
     def test_test_suite_generator(self, tmpdir):
 deps =
     pytest
     psutil
-    pymorphy2-dicts >=2.2, <3.0
+    pymorphy2-dicts >=2.4, <3.0
 
 [testenv]
 deps=
-    dawg >= 0.5
+    dawg > 0.7
     {[base]deps}
 
 ;setenv =
 [testenv:py26]
 deps=
     futures
-    dawg >= 0.5
+    dawg > 0.7
     {[base]deps}
 
 [testenv:py27]
 deps=
     futures
-    dawg >= 0.5
+    dawg > 0.7
     {[base]deps}
 
 [testenv:pypy]
 deps=
     futures
-    dawg-python >= 0.5
+    dawg-python >= 0.7
     {[base]deps}
 
 [testenv:no_compiler]
 basepython=python2.7
 deps=
     futures
-    dawg-python >= 0.5
+    dawg-python >= 0.7
     {[base]deps}