Commits

Mikhail Korobov  committed b162794

поддержка иерархии граммем (нужен словарь OpenCorpora 0.82)

  • Participants
  • Parent commits 5606ae5

Comments (0)

Files changed (5)

File pymorphy2/opencorpora_dict.py

 import itertools
 import array
 import struct
-import re
 
 try:
     import cPickle as pickle
     """
     Parses OpenCorpora dict XML and returns a tuple
 
-        (lemmas_list, links, version, revision)
+        (lemmas_list, links, grammemes, version, revision)
 
     """
     from lxml import etree
 
     links = []
     lemmas = {}
+    grammemes = []
     version, revision = None, None
 
     def _clear(elem):
 
     for ev, elem in etree.iterparse(filename):
 
+        if elem.tag == 'grammeme':
+            grammeme = elem.text
+            parent = elem.get('parent')
+            grammemes.append((grammeme, parent))
+
         if elem.tag == 'dictionary':
             version = elem.get('version')
             revision = elem.get('revision')
             links.append(link_tuple)
             _clear(elem)
 
-    return lemmas, links, version, revision
+    return lemmas, links, grammemes, version, revision
 
 def _lemma_forms_from_xml_elem(elem):
     """
             return
 
     # load & compile dictionary
-    lemmas, links, version, revision = _load_json_or_xml_dict(opencorpora_dict_path)
+    lemmas, links, grammemes, version, revision = _load_json_or_xml_dict(opencorpora_dict_path)
     gramtab, suffixes, paradigms, words_dawg, prediction_suffixes_dawg = _gram_structures(
         lemmas, links, prediction_options=prediction_options
     )
     with codecs.open(_f('suffixes.json'), 'w', 'utf8') as f:
         json.dump(suffixes, f, ensure_ascii=False)
 
+    with codecs.open(_f('grammemes.json'), 'w', 'utf8') as f:
+        json.dump(grammemes, f, ensure_ascii=False)
+
     with open(_f('paradigms.array'), 'wb') as f:
         f.write(struct.pack(str("<H"), len(paradigms)))
         for para in paradigms:
 
 
 
-
 DictTuple = collections.namedtuple('DictTuple', 'meta gramtab suffixes paradigms words prediction_prefixes prediction_suffixes Tag')
 
 def load(path):
 
     Tag = tagset.registry[gramtab_format]
 
+    with open(_f('grammemes.json'), 'r') as f:
+        grammemes = json.load(f, encoding='utf8')
+        Tag._init_restrictions(grammemes)
+
     with open(_f('gramtab.json'), 'r') as f:
-        gramtab = [Tag(tag_str) for tag_str in json.load(f)]
+        gramtab = [Tag(tag_str) for tag_str in json.load(f, encoding='utf8')]
 
     with open(_f('suffixes.json'), 'r') as f:
         suffixes = json.load(f)

File pymorphy2/tagset.py

 # -*- coding: utf-8 -*-
 """
 Utils for working with grammatical tags.
+
 """
 from __future__ import absolute_import, unicode_literals
+import collections
 
+try:
+    from sys import intern
+except ImportError: # python 2.x has builtin ``intern`` function
+    pass
+    #intern = lambda x: x
+
+# Design note: Tag objects should be immutable.
 class OpencorporaTag(object):
 
-    __slots__ = ['_tag', '_tags_tuple']
+    __slots__ = ['grammemes', '_lemma_grammemes', '_grammemes_set_cache', '_str']
 
     FORMAT = 'opencorpora-int'
     NON_PRODUCTIVE_CLASSES = set(['NUMR', 'NPRO', 'PRED', 'PREP', 'CONJ', 'PRCL', 'INTJ'])
 
-    def __init__(self, tag):
-        self._tag = tag
-        self._tags_tuple = tuple(tag.replace(' ', ',', 1).split(','))
+    # XXX: is it a good idea to have these rules?
+    EXTRA_INCOMPATIBLE = {
+        'plur': set(['GNdr']),
 
-    def get_class(self):
-        return self.parts()[0]
+        # XXX: how to use rules from OpenCorpora
+        # (they have "lemma/form" separation)?
+
+#        'anim': set(['femn', 'neut']),
+#        'inan': set(['femn', 'neut']),
+#        'ADJF': set(['voct', 'gen2', 'acc2', 'loc2']),
+#        'PRTF': set(['voct', 'gen2', 'acc2', 'loc2']),
+#        'GRND': set(['PErs', 'futr', 'GNdr']),
+#        'Impe': set(['PErs', 'tran', 'Mult', 'impr', 'plur', 'masc', 'femn']),
+#        'impf': set(['futr', 'incl']),
+#        'perf': set(['pres', 'Mult']),
+#        'Sgtm': set(['plur']),
+#        'Pltm': set(['sing']),
+#        'pssv': set(['intr']),
+
+#        'past': set(['PErs']),
+#        'futr': set(['PErs', 'GNdr']),
+#        'pres': set(['GNdr']),
+    }
+
+    GRAMMEME_INDICES = collections.defaultdict(lambda: 0)
+    GRAMMEME_INCOMPATIBLE = collections.defaultdict(set)
+
+    def __init__(self, tag=None, grammemes=None, lemma_grammemes=1):
+        if tag is not None:
+            lemma_grammemes = tag.split(' ')[0].count(',') + 1
+            grammemes = tag.replace(' ', ',', 1).split(',')
+            self._str = tag
+        else:
+            grammemes = sorted(grammemes, key=lambda g: self.GRAMMEME_INDICES[g])
+            self._str = None
+
+        self._lemma_grammemes = lemma_grammemes # number of lemma grammemes
+        self._grammemes_set_cache = None # cache
+
+        # hacks for better memory usage (they save 1M..3M):
+        # - use byte strings for grammemes under Python 2.x;
+        # - grammemes are interned.
+        self.grammemes = tuple([intern(str(g)) for g in grammemes])
+
+
+    @property
+    def _grammemes_set(self):
+        """
+        Tag grammemes as frozenset.
+        """
+        if self._grammemes_set_cache is None:
+            self._grammemes_set_cache = frozenset(self.grammemes)
+        return self._grammemes_set_cache
+
+    @property
+    def cls(self):
+        """
+        Word class (as string).
+        """
+        return self.grammemes[0]
 
     def is_productive(self):
-        return not self.get_class() in self.NON_PRODUCTIVE_CLASSES
+        return not self.cls in self.NON_PRODUCTIVE_CLASSES
 
-    def parts(self):
-        return self._tags_tuple
+    def _updated(self, add):
+        """
+        Returns a new OpencorporaTag with grammemes from ``add`` added
+        and incompatible grammemes removed.
+        """
+        new_grammemes = self._grammemes_set | set(add)
+        for grammeme in add:
+            new_grammemes -= self.GRAMMEME_INCOMPATIBLE[grammeme]
+
+        # XXX: lemma_grammemes would be incorrect, but this shouldn't matter
+        # because tags constructed with "_updated" method should be for
+        # internal use only.
+        return OpencorporaTag(grammemes=new_grammemes)
+
+    # FIXME: __repr__ and __str__ always return unicode,
+    # but they should return a byte string under Python 2.x.
+    def __str__(self):
+        if self._str is None:
+            lemma_tags = ",".join(self.grammemes[:self._lemma_grammemes])
+            form_tags = ",".join(self.grammemes[self._lemma_grammemes:])
+            if not form_tags:
+                self._str = lemma_tags
+            else:
+                self._str = lemma_tags + " " + form_tags
+        return self._str
 
     def __repr__(self):
-        # FIXME: this method always returns unicode,
-        # but it should return a byte string under Python 2.x.
-        return "OpencorporaTag('%s')" % self._tag
+        return "OpencorporaTag('%s')" % self
+
 
     def __eq__(self, other):
-        return self._tags_tuple == other._tags_tuple
+        return self.grammemes == other.grammemes
 
     def __ne__(self, other):
-        return self._tags_tuple != other._tags_tuple
+        return self.grammemes != other.grammemes
+
+    def __lt__(self, other):
+        return self.grammemes < other.grammemes
 
     def __gt__(self, other):
-        return self._tags_tuple > other._tags_tuple
-
-    def __lt__(self, other):
-        return self._tags_tuple < other._tags_tuple
+        return self.grammemes > other.grammemes
 
     def __hash__(self):
-        return hash(self._tag)
+        return hash(self.grammemes)
+
+    @classmethod
+    def _init_restrictions(cls, grammemes):
+        """
+        Fills ``OpencorporaTag.GRAMMEME_INDICES`` and
+        ``OpencorporaTag.GRAMMEME_INCOMPATIBLE`` class attributes.
+        """
+
+        # figure out parents & children
+        gr = dict(grammemes)
+        children = collections.defaultdict(set)
+
+        for index, (name, parent) in enumerate(grammemes):
+            if parent:
+                children[parent].add(name)
+            if gr.get(parent, None): # parent's parent
+                children[gr[parent]].add(name)
+
+        # expand EXTRA_INCOMPATIBLE
+        for grammeme, g_set in cls.EXTRA_INCOMPATIBLE.items():
+            for g in g_set.copy():
+                g_set.update(children[g])
+
+        # fill GRAMMEME_INDICES and GRAMMEME_INCOMPATIBLE
+        for index, (name, parent) in enumerate(grammemes):
+            cls.GRAMMEME_INDICES[name] = index
+            incompatible = cls.EXTRA_INCOMPATIBLE.get(name, set())
+            incompatible = (incompatible | children[parent]) - set([name])
+
+            cls.GRAMMEME_INCOMPATIBLE[name] = frozenset(incompatible)
 
 
 registry = dict()

File pymorphy2/test_suite_generator.py

     ``word_limit`` words for each distinct gram. tag) and saves it to a file.
     """
     logger.debug('loading dictionary to memory...')
-    lemmas, links, version, revision = _load_json_or_xml_dict(opencorpora_dict_path)
+    lemmas, links, grammemes, version, revision = _load_json_or_xml_dict(opencorpora_dict_path)
 
     logger.debug('preparing...')
     parses = _get_word_parses(lemmas)

File tests/test_tagger.py

 class TestTagMethod(object):
 
     def _tagged_as(self, parse, cls):
-        return any(tag.get_class()==cls for tag in parse)
+        return any(tag.cls==cls for tag in parse)
 
     def assertNotTaggedAs(self, word, cls):
         parse = morph.tag(word)
 class TestParse(object):
 
     def _parsed_as(self, parse, cls):
-        return any(p[1].get_class()==cls for p in parse)
+        return any(p[1].cls==cls for p in parse)
 
     def _parse_cls_first_index(self, parse, cls):
         for idx, p in enumerate(parse):
-            if p[1].get_class() == cls:
+            if p[1].cls == cls:
                 return idx
 
     def assertNotParsedAs(self, word, cls):

File tests/test_tagset.py

 def test_hashing():
     tag1 = OpencorporaTag('NOUN')
     tag2 = OpencorporaTag('NOUN')
+    tag3 = OpencorporaTag('VERB')
 
+    assert tag1 == tag2
+    assert tag1 != tag3
     assert set([tag1]) == set([tag2])
+    assert set([tag3]) != set([tag1])
 
 
 @pytest.mark.parametrize(("tag", "cls"), [
         ['NOUN sing', 'NOUN'],
     ])
 def test_cls(tag, cls):
-    assert OpencorporaTag(tag).get_class() == cls
+    assert OpencorporaTag(tag).cls == cls
 
+def test_repr():
+    assert repr(OpencorporaTag('NOUN anim,plur')) == "OpencorporaTag('NOUN anim,plur')"
 
+
+class TestUpdated(object):
+
+    def test_number(self):
+        tag = OpencorporaTag('NOUN,sing,masc')
+        assert OpencorporaTag('NOUN,plur') == tag._updated(add=['plur'])
+
+    def test_order(self):
+        tag = OpencorporaTag('VERB,impf,tran sing,3per,pres,indc')
+        assert str(tag._updated(['1per'])) == 'VERB sing,impf,tran,1per,pres,indc'