Commits

Mikhail Korobov committed 7f2b915

Cyrillic tags, second attempt: cyrillic representations can only be returned as strings.

Comments (0)

Files changed (6)

docs/user/guide.rst

 Для того, чтоб проверить, есть ли в данном теге отдельная граммема
 (или все граммемы из указанного множества), используйте оператор in::
 
+    >>> p.tag
+    OpencorporaTag('VERB,perf,intr plur,past,indc')
     >>> 'NOUN' in p.tag  # то же самое, что и {'NOUN'} in p.tag
     False
     >>> 'VERB' in p.tag
 Кроме того, у каждого тега есть атрибуты, через которые можно получить
 часть речи, число и другие характеристики::
 
+    >>> p.tag
+    OpencorporaTag('VERB,perf,intr plur,past,indc')
     >>> p.tag.POS           # Part of Speech, часть речи
     'VERB'
     >>> p.tag.animacy       # одушевленность
     ...
     ValueError: 'plur' is not a valid grammeme for this attribute.
 
+
+Русские названия тегов и граммем
+--------------------------------
+
+Теги и граммемы в pymorphy2 записываются латиницей (например, ``NOUN``).
+Но часто удобнее использовать кириллические названия граммем (например,
+``СУЩ`` вместо ``NOUN``). Чтобы получить тег в виде строки,
+записанной кириллицей, используйте свойство :attr:`OpencorporaTag.cyr_repr`::
+
+    >>> p.tag
+    OpencorporaTag('VERB,perf,intr plur,past,indc')
+    >>> p.tag.cyr_repr
+    'ГЛ,сов,неперех мн,прош,изъяв'
+
+Для преобразования произвольных строк с тегами/граммемами между
+кириллицей и латиницей используйте методы :meth:`MorphAnalyzer.cyr2lat`
+и :meth:`MorphAnalyzer.lat2cyr`::
+
+    >>> morph.lat2cyr('NOUN,anim,masc plur,ablt')
+    'СУЩ,од,мр мн,тв'
+    >>> morph.cyr2lat('СУЩ,од,мр мн,тв')
+    'NOUN,anim,masc plur,ablt'
+
 Склонение слов
 --------------
 
 
 Посмотрим, что сделает pymorphy2 в этом примере:
 
-    >>> m.parse('думающему')[0].normal_form
+    >>> morph.parse('думающему')[0].normal_form
     'думать'
 
 pymorphy2 сейчас использует алгоритм нахождения нормальной формы,
 Если требуется нормализовывать слова иначе, можно воспользоваться
 методом :meth:`Parse.inflect`::
 
-    >>> m.parse('думающему')[0].inflect({'sing', 'nomn'}).word
+    >>> morph.parse('думающему')[0].inflect({'sing', 'nomn'}).word
     'думающий'
 
 Согласование слов с числительными
 
 У каждого разбора есть параметр score::
 
-    >>> m.parse('на')
+    >>> morph.parse('на')
     [Parse(word='на', tag=OpencorporaTag('PREP'), normal_form='на', score=0.999628, methods_stack=((<DictionaryAnalyzer>, 'на', 23, 0),)),
      Parse(word='на', tag=OpencorporaTag('INTJ'), normal_form='на', score=0.000318, methods_stack=((<DictionaryAnalyzer>, 'на', 20, 0),)),
      Parse(word='на', tag=OpencorporaTag('PRCL'), normal_form='на', score=5.3e-05, methods_stack=((<DictionaryAnalyzer>, 'на', 21, 0),))]

pymorphy2/analyzer.py

         """
         return self.dictionary.Tag
 
+    def cyr2lat(self, tag_or_grammeme):
+        """ Return Latin representation for ``tag_or_grammeme`` string """
+        return self.TagClass.cyr2lat(tag_or_grammeme)
+
+    def lat2cyr(self, tag_or_grammeme):
+        """ Return Cyrillic representation for ``tag_or_grammeme`` string """
+        return self.TagClass.lat2cyr(tag_or_grammeme)
+
     def __reduce__(self):
         args = (self.dictionary.path, self._result_type_orig, self._unit_classes)
         return self.__class__, args, None

pymorphy2/tagset.py

     Descriptor object for accessing grammemes of certain classes
     (e.g. number or voice).
     """
-    def __init__(self, grammeme_set, lat2cyr):
+    def __init__(self, grammeme_set):
         self.grammeme_set = grammeme_set
-
         # ... are descriptors not magical enough?
 
         # In order to fight typos, raise an exception
                 if other is None:
                     return False
                 if other not in grammeme_set:
-                    raise ValueError("'%s' is not a valid grammeme for this attribute." % other)
+                    known_grammemes = ", ".join(grammeme_set)
+                    raise ValueError("'%s' is not a valid grammeme for this attribute. Valid grammemes: %s" % (other, known_grammemes))
                 return _str.__eq__(self, other)
 
             def __ne__(self, other):
             def __hash__(self):
                 return _str.__hash__(self)
 
-            @property
-            def cyr(self):
-                """ Cyrillic representation of this grammeme """
-                return lat2cyr[self]
-
         self.TypedGrammeme = TypedGrammeme
 
     def __get__(self, instance, owner):
 
         >>> from pymorphy2 import MorphAnalyzer
         >>> morph = MorphAnalyzer()
-        >>> Tag = morph.TagClass # get an initialzed Tag class
+        >>> Tag = morph.TagClass  # get an initialized Tag class
         >>> tag = Tag('VERB,perf,tran plur,impr,excl')
         >>> tag
         OpencorporaTag('VERB,perf,tran plur,impr,excl')
         >>> tag.POS == 'plur'
         Traceback (most recent call last):
         ...
-        ValueError: 'plur' is not a valid grammeme for this attribute.
+        ValueError: 'plur' is not a valid grammeme for this attribute. Valid grammemes: ...
 
     """
 
     }
     _GRAMMEME_INDICES = collections.defaultdict(int)
     _GRAMMEME_INCOMPATIBLE = collections.defaultdict(set)
-    LAT2CYR = dict()
+    _LAT2CYR = None
+    _CYR2LAT = None
     KNOWN_GRAMMEMES = set()
 
     _NUMERAL_AGREEMENT_GRAMMEMES = (
         self._cyr_grammemes_cache = None
         self._cyr = None
 
+    # attributes for grammeme categories
+    POS = _select_grammeme_from(PARTS_OF_SPEECH)
+    animacy = _select_grammeme_from(ANIMACY)
+    aspect = _select_grammeme_from(ASPECTS)
+    case = _select_grammeme_from(CASES)
+    gender = _select_grammeme_from(GENDERS)
+    involvement = _select_grammeme_from(INVOLVEMENT)
+    mood = _select_grammeme_from(MOODS)
+    number = _select_grammeme_from(NUMBERS)
+    person = _select_grammeme_from(PERSONS)
+    tense = _select_grammeme_from(TENSES)
+    transitivity = _select_grammeme_from(TRANSITIVITY)
+    voice = _select_grammeme_from(VOICES)
+
     @property
     def grammemes(self):
         """ A frozenset with grammemes for this tag. """
     def grammemes_cyr(self):
         """ A frozenset with Cyrillic grammemes for this tag. """
         if self._cyr_grammemes_cache is None:
-            cyr_grammemes = [self.LAT2CYR[g] for g in self._grammemes_tuple]
+            cyr_grammemes = [self._LAT2CYR[g] for g in self._grammemes_tuple]
             self._cyr_grammemes_cache = frozenset(cyr_grammemes)
         return self._cyr_grammemes_cache
 
     @property
-    def cyr(self):
-        """ Cyrillic version representation of this tag """
+    def cyr_repr(self):
+        """ Cyrillic representation of this tag """
         if self._cyr is None:
-            #cyr = CyrillicOpencorporaTag._from_internal_tag(self._str)
-            cyr = self._str
-            for name, alias in self.LAT2CYR.items():
-                if alias:
-                    cyr = cyr.replace(name, alias)
-            self._cyr = cyr
+            self._cyr = self.lat2cyr(self)
         return self._cyr
 
+    @classmethod
+    def cyr2lat(cls, tag_or_grammeme):
+        """ Return Latin representation for ``tag_or_grammeme`` string """
+        return _translate_tag(tag_or_grammeme, cls._CYR2LAT)
 
-    # attributes for grammeme categories
-    POS = _select_grammeme_from(PARTS_OF_SPEECH, LAT2CYR)
-    animacy = _select_grammeme_from(ANIMACY, LAT2CYR)
-    aspect = _select_grammeme_from(ASPECTS, LAT2CYR)
-    case = _select_grammeme_from(CASES, LAT2CYR)
-    gender = _select_grammeme_from(GENDERS, LAT2CYR)
-    involvement = _select_grammeme_from(INVOLVEMENT, LAT2CYR)
-    mood = _select_grammeme_from(MOODS, LAT2CYR)
-    number = _select_grammeme_from(NUMBERS, LAT2CYR)
-    person = _select_grammeme_from(PERSONS, LAT2CYR)
-    tense = _select_grammeme_from(TENSES, LAT2CYR)
-    transitivity = _select_grammeme_from(TRANSITIVITY, LAT2CYR)
-    voice = _select_grammeme_from(VOICES, LAT2CYR)
+    @classmethod
+    def lat2cyr(cls, tag_or_grammeme):
+        """ Return Cyrillic representation for ``tag_or_grammeme`` string """
+        return _translate_tag(tag_or_grammeme, cls._LAT2CYR)
 
     def __contains__(self, grammeme):
 
         """
         Replace rare cases (loc2/voct/...) with common ones (loct/nomn/...).
         """
-        return frozenset(cls.RARE_CASES.get(g,g) for g in grammemes)
+        return frozenset(cls.RARE_CASES.get(g, g) for g in grammemes)
+
+    @classmethod
+    def add_grammemes_to_known(cls, lat, cyr):
+        cls.KNOWN_GRAMMEMES.add(lat)
+        cls._LAT2CYR[lat] = cyr
+        cls._CYR2LAT[cyr] = lat
 
     @classmethod
     def _init_grammemes(cls, dict_grammemes):
             ]
 
         """
-        for name, parent, alias, description in dict_grammemes:
-            cls.LAT2CYR[name] = alias
+        with threading.RLock():
+            cls.KNOWN_GRAMMEMES = set()
+            cls._CYR2LAT = {}
+            cls._LAT2CYR = {}
+            for name, parent, alias, description in dict_grammemes:
+                cls.add_grammemes_to_known(name, alias)
 
-        gr = dict((name, parent) for (name, parent, alias, description) in dict_grammemes)
+            gr = dict((name, parent) for (name, parent, alias, description) in dict_grammemes)
 
-        # figure out parents & children
-        children = collections.defaultdict(set)
-        for index, (name, parent, alias, description) in enumerate(dict_grammemes):
-            if parent:
-                children[parent].add(name)
-            if gr.get(parent, None): # parent's parent
-                children[gr[parent]].add(name)
-
-        with threading.RLock():
-            cls.KNOWN_GRAMMEMES = set(gr.keys())
+            # figure out parents & children
+            children = collections.defaultdict(set)
+            for index, (name, parent, alias, description) in enumerate(dict_grammemes):
+                if parent:
+                    children[parent].add(name)
+                if gr.get(parent, None):  # parent's parent
+                    children[gr[parent]].add(name)
 
             # expand EXTRA_INCOMPATIBLE
             for grammeme, g_set in cls._EXTRA_INCOMPATIBLE.items():
         for name, parent, alias, description in dict_grammemes:
             cls._GRAMMEME_ALIAS_MAP[name] = alias
 
+
+def _translate_tag(tag, mapping):
+    """
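+    Translate ``tag`` string according to ``mapping``, assuming grammemes
+    are separated by commas or whitespace. Comma positions are preserved;
+    each run of whitespace is normalized to a single space, and leading or
+    trailing whitespace is dropped. Unknown grammemes are left unchanged.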
+    """
+    if isinstance(tag, OpencorporaTag):
+        tag = str(tag)
+    return " ".join([
+        _translate_comma_separated(whitespace_separated_part, mapping)
+        for whitespace_separated_part in tag.split()
+    ])
+
+
+def _translate_comma_separated(tag_part, mapping):
+    grammemes = [mapping.get(tok, tok) for tok in tag_part.split(',')]
+    return ",".join(grammemes)
+
+
 registry = dict()
 
 for tag_type in [CyrillicOpencorporaTag, OpencorporaTag]:

pymorphy2/units/by_shape.py

 from pymorphy2.units.base import BaseAnalyzerUnit
 from pymorphy2.shapes import is_latin, is_punctuation, is_roman_number
 
+
 class _ShapeAnalyzer(BaseAnalyzerUnit):
 
     terminal = True
 
     def __init__(self, morph):
         super(_ShapeAnalyzer, self).__init__(morph)
-        self.morph.TagClass.KNOWN_GRAMMEMES.update(self.EXTRA_GRAMMEMES)
-        aliases = dict(zip(self.EXTRA_GRAMMEMES, self.EXTRA_GRAMMEMES_CYR))
-        self.morph.TagClass.LAT2CYR.update(aliases)
+
+        for lat, cyr in zip(self.EXTRA_GRAMMEMES, self.EXTRA_GRAMMEMES_CYR):
+            self.morph.TagClass.add_grammemes_to_known(lat, cyr)
 
     def parse(self, word, word_lower, seen_parses):
         shape = self.check_shape(word, word_lower)
     Example: "," -> PNCT
     """
     TAG_STR = 'PNCT'
-    TAG_STR_CYR = 'ЗПР'
+    TAG_STR_CYR = 'ЗПР'  # aot.ru uses this name
 
     def check_shape(self, word, word_lower):
         return is_punctuation(word)

tests/test_parsing.py

 import pymorphy2
 from .utils import morph
 
+
 def _to_test_data(text):
     """
     Lines should be of this format: <word> <normal_form> <tag>.
         parse = morph.parse(word)
         assert_parse_is_correct(parse, word, normal_form, tag)
 
+
 def _check_new_analyzer(parses):
     morph = pymorphy2.MorphAnalyzer()
     for word, normal_form, tag in parses:
         parse = morph.parse(word)
         assert_parse_is_correct(parse, word, normal_form, tag)
 
+
 def _create_morph_analyzer(i):
     morph = pymorphy2.MorphAnalyzer()
     word, normal_form, tag = random.choice(PARSES)

tests/test_tagset.py

 from .utils import morph
 Tag = morph.TagClass
 
+
 def test_hashing():
     tag1 = Tag('NOUN')
     tag2 = Tag('NOUN')
 
 
 class TestCyrillic:
-    def test_cyr(self):
+    def test_cyr_repr(self):
         tag = Tag('VERB,perf,tran plur,impr,excl')
-        assert tag.cyr == 'ГЛ,сов,перех мн,повел,выкл'
-
-    def test_cyr_grammeme(self):
-        tag = Tag('VERB,perf,tran plur,impr,excl')
-        assert tag.POS.cyr == 'ГЛ'
+        assert tag.cyr_repr == 'ГЛ,сов,перех мн,повел,выкл'
 
     def test_grammemes_cyr(self):
         tag = Tag('VERB,perf,tran plur,impr,excl')
 
     def test_cyr_extra_grammemes(self):
         tag = Tag('ROMN')
-        assert tag.cyr == 'РИМ'
+        assert tag.cyr_repr == 'РИМ'
 
+    @pytest.mark.parametrize(('lat', 'cyr'), [
+        ('VERB,perf,tran plur,impr,excl', 'ГЛ,сов,перех мн,повел,выкл'),
+        ('ROMN', 'РИМ'),
+        ('ROMN,unknown_grammeme', 'РИМ,unknown_grammeme'),
+        ('plur', 'мн'),
+    ])
+    def test_lat2cyr(self, lat, cyr):
+        assert Tag.lat2cyr(lat) == cyr
+        assert Tag.cyr2lat(cyr) == lat