Commits

Mikhail Korobov committed 0d2db62

word declension

Comments (0)

Files changed (10)

     psutil
 
 commands=
-    pymorphy dict mem_usage
     python benchmarks/bench.py run []
 
 [testenv:pypy]

pymorphy2/opencorpora_dict.py

 import struct
 
 try:
-    import cPickle as pickle
-except ImportError:
-    import pickle
-
-try:
     izip = itertools.izip
 except AttributeError:
     izip = zip
             grammeme = elem.text
             parent = elem.get('parent')
             grammemes.append((grammeme, parent))
+            _clear(elem)
 
         if elem.tag == 'dictionary':
             version = elem.get('version')
         if elem.tag == 'lemma':
             lemma_id, lemma_forms = _lemma_forms_from_xml_elem(elem)
             lemmas[lemma_id] = lemma_forms
-
             _clear(elem)
 
         elif elem.tag == 'link':
 def _longest_common_substring(data):
     """
     Returns a longest common substring of a list of strings.
-    See http://stackoverflow.com/questions/2892931/longest-common-substring-from-more-than-two-strings-python
+    See http://stackoverflow.com/questions/2892931/
     """
     substr = ''
     if len(data) > 1 and len(data[0]) > 0:
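
Note: the body of _longest_common_substring is cut off in this hunk; the docstring only points at the StackOverflow recipe. A minimal sketch of that brute-force approach (illustrative, not the committed code):

    def longest_common_substring(data):
        # try every substring of the first string; keep the longest one
        # that occurs in every other string
        substr = ''
        if len(data) > 1 and len(data[0]) > 0:
            for i in range(len(data[0])):
                for j in range(len(data[0]) - i + 1):
                    candidate = data[0][i:i + j]
                    if j > len(substr) and all(candidate in item for item in data):
                        substr = candidate
        return substr

    # longest_common_substring(['СУСЛИКА', 'СУСЛИКУ', 'СУСЛИКОМ'])  ->  'СУСЛИК'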

pymorphy2/tagger.py

 # -*- coding: utf-8 -*-
 from __future__ import print_function, unicode_literals, division
 import os
+import heapq
 import collections
 from pymorphy2 import opencorpora_dict
 from pymorphy2.constants import LEMMA_PREFIXES, NON_PRODUCTIVE_CLASSES
 
     def parse(self, word):
         """
-        Returns a list of (fixed_word, tag, normal_form, _estimate) tuples.
+        Returns a list of (fixed_word, tag, normal_form, _para_id, _idx, _estimate)
+        tuples.
         """
         res = self._parse_as_known(word)
         if not res:
 
         return res
 
-
     def _tag_as_word_with_unknown_prefix(self, word, _seen_tags=None):
         if _seen_tags is None:
             _seen_tags = set()
 
         return res
 
-
     def _tag_as_word_with_known_suffix(self, word, _seen_tags=None):
         if _seen_tags is None:
             _seen_tags = set()
 
     # ==== inflection ========
 
-    def decline(self, word, required_tags=None):
+    def inflect(self, word, required_grammemes):
+        """
+        Returns a list of parsed words that are closest to ``word`` and
+        have all ``required_grammemes``.
+        """
+        required_grammemes = set(required_grammemes)
+        parses = self.parse(word)
+
+        def weight(parse):
+            # order by (probability, index in lemma)
+            return -parse[5], parse[4]
+
+        result = []
+        seen = set()
+        for form in sorted(parses, key=weight):
+            for inflected in self._inflect(form, required_grammemes):
+                if inflected in seen:
+                    continue
+                seen.add(inflected)
+                result.append(inflected)
+
+        return result
+
+    def _inflect(self, form, required_grammemes):
+        grammemes = form[1].updated_grammemes(required_grammemes)
+
+        possible_results = [f for f in self._decline([form])
+                            if required_grammemes.issubset(f[1].grammemes)]
+
+        def similarity(form):
+            tag = form[1]
+            return len(grammemes & tag.grammemes)
+
+        return heapq.nlargest(1, possible_results, key=similarity)
+
+    def decline(self, word):
         """
         Returns parses for all possible word forms.
+        """
+        return self._decline(self.parse(word))
 
-        XXX: performance is not good.
+    def _decline(self, word_parses):
         """
-
-        required_tags = set(required_tags or [])
+        Returns parses for all possible word forms (given a list of
+        possible word parses).
+        """
 
         paradigms = self._dictionary.paradigms
         seen_paradigms = set()
         result = []
 
-        for fixed_word, tag, normal_form, para_id, idx, estimate in self.parse(word):
+        for fixed_word, tag, normal_form, para_id, idx, estimate in word_parses:
             if para_id in seen_paradigms:
                 continue
             seen_paradigms.add(para_id)
             for index, (_prefix, _tag, _suffix) in enumerate(self._build_paradigm_info(para_id)):
                 word = _prefix + stem + _suffix
 
-                tag_parts = set(_tag.parts())
-                if tag_parts.issuperset(required_tags):
-                    # XXX: what to do with estimate?
-                    # XXX: do we need all info?
-                    result.append(
-                        (word, _tag, normal_form, para_id, index, estimate)
-                    )
+                # XXX: what to do with estimate?
+                # XXX: do we need all info?
+                result.append(
+                    (word, _tag, normal_form, para_id, index, estimate)
+                )
 
         return result
 
 
         return normal_prefix + stem + normal_suffix
 
-
     def _build_stem(self, paradigm, idx, fixed_word):
         """
         Returns word stem (given a word, paradigm and the word index).
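
A usage sketch of the new inflect() API added in this file (word and grammeme data mirror tests/test_inflection.py; exact results depend on the loaded dictionary):

    from pymorphy2 import tagger

    morph = tagger.Morph.load()

    # parse() returns (fixed_word, tag, normal_form, _para_id, _idx, _estimate)
    # tuples; inflect() picks, for each parse, the form that has all required
    # grammemes and is closest to the original tag.
    variants = morph.inflect('СУСЛИК', ['datv', 'plur'])
    print(variants[0][0])   # expected 'СУСЛИКАМ', as in tests/test_inflection.py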

pymorphy2/tagset.py

 # -*- coding: utf-8 -*-
 """
 Utils for working with grammatical tags.
-
 """
 from __future__ import absolute_import, unicode_literals
 import collections
 
 try:
     from sys import intern
-except ImportError: # python 2.x has builtin ``intern`` function
+except ImportError:
+    # Python 2.x has a builtin ``intern`` function
     pass
-    #intern = lambda x: x
 
-# Design note: Tag objects should be immutable.
+
+# Design notes: Tag objects should be immutable.
 class OpencorporaTag(object):
 
-    __slots__ = ['grammemes', '_lemma_grammemes', '_grammemes_set_cache', '_str']
+    __slots__ = ['_grammemes_tuple', '_lemma_grammemes', '_grammemes_cache', '_str']
 
     FORMAT = 'opencorpora-int'
     NON_PRODUCTIVE_CLASSES = set(['NUMR', 'NPRO', 'PRED', 'PREP', 'CONJ', 'PRCL', 'INTJ'])
     GRAMMEME_INDICES = collections.defaultdict(lambda: 0)
     GRAMMEME_INCOMPATIBLE = collections.defaultdict(set)
 
-    def __init__(self, tag=None, grammemes=None, lemma_grammemes=1):
-        if tag is not None:
-            lemma_grammemes = tag.split(' ')[0].count(',') + 1
-            grammemes = tag.replace(' ', ',', 1).split(',')
-            self._str = tag
-        else:
-            grammemes = sorted(grammemes, key=lambda g: self.GRAMMEME_INDICES[g])
-            self._str = None
+    def __init__(self, tag=None):
+        self._str = tag
 
-        self._lemma_grammemes = lemma_grammemes # number of lemma grammemes
-        self._grammemes_set_cache = None # cache
+        # XXX: we lose information about which grammemes
+        # belong to the lemma and which belong to the form
+        # (this information seems useless for pymorphy2).
 
-        # hacks for better memory usage (they save 1M..3M):
+        # Hacks for better memory usage:
+        # - store grammemes in a tuple and build a set only when needed;
         # - use byte strings for grammemes under Python 2.x;
         # - grammemes are interned.
-        self.grammemes = tuple([intern(str(g)) for g in grammemes])
-
+        grammemes = tag.replace(' ', ',', 1).split(',')
+        self._grammemes_tuple = tuple([intern(str(g)) for g in grammemes])
+        self._grammemes_cache = None
 
     @property
-    def _grammemes_set(self):
+    def grammemes(self):
         """
         Tag grammemes as frozenset.
         """
-        if self._grammemes_set_cache is None:
-            self._grammemes_set_cache = frozenset(self.grammemes)
-        return self._grammemes_set_cache
+        if self._grammemes_cache is None:
+            self._grammemes_cache = frozenset(self._grammemes_tuple)
+        return self._grammemes_cache
 
     @property
     def cls(self):
         """
         Word class (as string).
         """
-        return self.grammemes[0]
+        return self._grammemes_tuple[0]
 
     def is_productive(self):
         return not self.cls in self.NON_PRODUCTIVE_CLASSES
 
-    def _updated(self, add):
+    def updated_grammemes(self, required):
         """
-        Returns a new OpencorporaTag with grammemes from ``add`` added
+        Returns a new set of grammemes with ``required`` grammemes added
         and incompatible grammemes removed.
         """
-        new_grammemes = self._grammemes_set | set(add)
-        for grammeme in add:
+        new_grammemes = self.grammemes | required
+        for grammeme in required:
+            if grammeme not in self.GRAMMEME_INDICES:
+                raise ValueError("Unknown grammeme: %s" % grammeme)
             new_grammemes -= self.GRAMMEME_INCOMPATIBLE[grammeme]
-
-        # XXX: lemma_grammemes would be incorrect, but this shouldn't matter
-        # because tags constructed with "_updated" method should be for
-        # internal use only.
-        return OpencorporaTag(grammemes=new_grammemes)
+        return new_grammemes
 
     # FIXME: __repr__ and __str__ always return unicode,
     # but they should return a byte string under Python 2.x.
     def __str__(self):
-        if self._str is None:
-            lemma_tags = ",".join(self.grammemes[:self._lemma_grammemes])
-            form_tags = ",".join(self.grammemes[self._lemma_grammemes:])
-            if not form_tags:
-                self._str = lemma_tags
-            else:
-                self._str = lemma_tags + " " + form_tags
         return self._str
 
     def __repr__(self):
 
 
     def __eq__(self, other):
-        return self.grammemes == other.grammemes
+        return self._grammemes_tuple == other._grammemes_tuple
 
     def __ne__(self, other):
-        return self.grammemes != other.grammemes
+        return self._grammemes_tuple != other._grammemes_tuple
 
     def __lt__(self, other):
-        return self.grammemes < other.grammemes
+        return self._grammemes_tuple < other._grammemes_tuple
 
     def __gt__(self, other):
-        return self.grammemes > other.grammemes
+        return self._grammemes_tuple > other._grammemes_tuple
 
     def __hash__(self):
-        return hash(self.grammemes)
+        return hash(self._grammemes_tuple)
 
     @classmethod
-    def _init_restrictions(cls, grammemes):
+    def _init_restrictions(cls, dict_grammemes):
         """
         Fills ``OpencorporaTag.GRAMMEME_INDICES`` and
         ``OpencorporaTag.GRAMMEME_INCOMPATIBLE`` class attributes.
         """
 
         # figure out parents & children
-        gr = dict(grammemes)
+        gr = dict(dict_grammemes)
         children = collections.defaultdict(set)
 
-        for index, (name, parent) in enumerate(grammemes):
+        for index, (name, parent) in enumerate(dict_grammemes):
             if parent:
                 children[parent].add(name)
             if gr.get(parent, None): # parent's parent
                 g_set.update(children[g])
 
         # fill GRAMMEME_INDICES and GRAMMEME_INCOMPATIBLE
-        for index, (name, parent) in enumerate(grammemes):
+        for index, (name, parent) in enumerate(dict_grammemes):
             cls.GRAMMEME_INDICES[name] = index
             incompatible = cls.EXTRA_INCOMPATIBLE.get(name, set())
             incompatible = (incompatible | children[parent]) - set([name])
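
To illustrate the reworked tag API (the tag string and expected grammemes mirror tests/test_tagset.py; morph.tag_class() is taken from tests/test_fuzzy.py and assumes a loaded dictionary, so grammeme restrictions are already initialised):

    from pymorphy2 import tagger

    morph = tagger.Morph.load()
    Tag = morph.tag_class()

    # lemma grammemes come before the space, form grammemes after it;
    # the constructor flattens them into one interned tuple
    tag = Tag('VERB,impf,tran sing,3per,pres,indc')
    print(tag.cls)   # 'VERB'

    # required grammemes are added, incompatible ones ('3per') are dropped
    new_grammemes = tag.updated_grammemes(required=set(['1per']))
    # expected: set('VERB,sing,impf,tran,1per,pres,indc'.split(','))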

tests/test_fuzzy.py

 import os
 import pytest
 
-from pymorphy2 import tagger
+from .utils import morph
 
 SUITE_PATH = os.path.join(
     os.path.dirname(__file__),
     'suite.txt'
 )
 
-morph = tagger.Morph.load()
 Tag = morph.tag_class()
 
 def iter_suite(path):

tests/test_inflection.py

+# -*- coding: utf-8 -*-
+from __future__ import absolute_import, unicode_literals
+import pytest
+
+from .utils import morph
+
+def with_test_data(data):
+    return pytest.mark.parametrize(
+        ("word", "grammemes", "result"),
+        data
+    )
+
+def assert_first_inflected_variant(word, grammemes, result):
+    inflected_variants = morph.inflect(word, grammemes)
+    assert len(inflected_variants)
+
+    inflected = inflected_variants[0]
+    assert inflected[0] == result
+
+
+@with_test_data([
+    # nouns: "СУСЛИК" (gopher) and "БУТЯВКА" (a nonce word)
+    ("СУСЛИК", ["datv"], "СУСЛИКУ"),
+    ("СУСЛИКИ", ["datv"], "СУСЛИКАМ"),
+    ("СУСЛИКОВ", ["datv"], "СУСЛИКАМ"),
+    ("СУСЛИКА", ["datv"], "СУСЛИКУ"),
+    ("СУСЛИК", ["datv", "plur"], "СУСЛИКАМ"),
+
+    ("БУТЯВКА", ["datv"], "БУТЯВКЕ"),
+    ("БУТЯВОК", ["datv"], "БУТЯВКАМ"),
+
+    # verbs, participles, adverbial participles
+    ("ГУЛЯЮ", ["past"], "ГУЛЯЛ"),
+    ("ГУЛЯЛ", ["pres"], "ГУЛЯЮ"),
+    ("ГУЛЯЛ", ["INFN"], "ГУЛЯТЬ"),
+    ("ГУЛЯЛ", ["GRND"], "ГУЛЯВ"),
+    ("ГУЛЯЛ", ["PRTF"], "ГУЛЯВШИЙ"),
+    ("ГУЛЯЛА", ["PRTF"], "ГУЛЯВШАЯ"),
+    ("ГУЛЯЮ", ["PRTF", "datv"], "ГУЛЯЮЩЕМУ"),
+    ("ГУЛЯВШИЙ", ["VERB"], "ГУЛЯЛ"),
+    ("ГУЛЯВШИЙ", ["VERB", "femn"], "ГУЛЯЛА"),
+    ("ИДУ", ["2per"], "ИДЁШЬ"),
+    ("ИДУ", ["2per", "plur"], "ИДЁТЕ"),
+    ("ИДУ", ["3per"], "ИДЁТ"),
+    ("ИДУ", ["3per", "plur"], "ИДУТ"),
+    ("ИДУ", ["impr", "excl"], "ИДИ"),
+
+    # a bug from pymorphy
+    ('КИЕВ', ['loct'], 'КИЕВЕ'),
+
+    # animacy
+    ('СЛАБЫЙ', ['accs', 'inan'], 'СЛАБЫЙ'),
+    ('СЛАБЫЙ', ['accs', 'anim'], 'СЛАБОГО'),
+
+    # comparative degrees of adjectives
+    ('БЫСТРЫЙ', ['COMP'], 'БЫСТРЕЕ'),
+    ('ХОРОШАЯ', ['COMP'], 'ЛУЧШЕ'),
+])
+def test_first_inflected_value(word, grammemes, result):
+    assert_first_inflected_variant(word, grammemes, result)
+
+
+@pytest.mark.xfail
+@with_test_data([
+    # additional cases, fixme
+    ('ЛЕС', ['loct'], 'ЛЕСЕ'),   # "о лесе" (about the forest)
+    ('ЛЕС', ['loc2'], 'ЛЕСУ'),   # "в лесу" (in the forest)
+    ('ВЕЛОСИПЕД', ['loct'], 'ВЕЛОСИПЕДЕ'), # "о велосипеде" (about the bicycle)
+    ('ВЕЛОСИПЕД', ['loc2'], 'ВЕЛОСИПЕДЕ'), # no separate second locative here: "в велосипеде"
+])
+def test_loc2(word, grammemes, result):
+    assert_first_inflected_variant(word, grammemes, result)
+
+@pytest.mark.xfail
+def test_orel():
+    assert_first_inflected_variant('ОРЕЛ', ['gent'], 'ОРЛА')
+
+@pytest.mark.xfail
+def test_best_guess():
+    assert_first_inflected_variant('ОСТРОВА', ['datv'], 'ОСТРОВАМ')

tests/test_tagger.py

 # -*- coding: utf-8 -*-
 from __future__ import absolute_import, unicode_literals
 import pytest
-from pymorphy2 import tagger
 
-morph = tagger.Morph.load()
+from .utils import morph
 
 TEST_DATA = [
     ('КОШКА', ['КОШКА']),

tests/test_tagset.py

 
     def test_number(self):
         tag = OpencorporaTag('NOUN,sing,masc')
-        assert OpencorporaTag('NOUN,plur') == tag._updated(add=['plur'])
+        grammemes = tag.updated_grammemes(required=set(['plur']))
+        assert grammemes == set(['NOUN', 'plur'])
 
     def test_order(self):
         tag = OpencorporaTag('VERB,impf,tran sing,3per,pres,indc')
-        assert str(tag._updated(['1per'])) == 'VERB sing,impf,tran,1per,pres,indc'
+        grammemes = tag.updated_grammemes(required=set(['1per']))
+        assert grammemes == set('VERB,sing,impf,tran,1per,pres,indc'.split(','))

tests/utils.py

+# -*- coding: utf-8 -*-
+from __future__ import absolute_import
+
+from pymorphy2 import tagger
+morph = tagger.Morph.load()
     psutil
 
 commands=
+    pymorphy dict mem_usage
     py.test tests []
 
 
 deps=
     dawg-python >= 0.5
     pytest
+    psutil