Commits

Mikhail Korobov  committed d996be9

more tests & better conversion

  • Participants
  • Parent commits 284eaeb

Comments (0)

Files changed (2)

File russian_tagsets/aot.py

 # -*- coding: utf-8 -*-
 """
 Conversion from aot.ru tags to positional tags.
-AOT tags should be tuple(POS, gram_info), e.g.::
-
-    (u'П', u'мр,ед,им,од,но')
-
-POS->AOT conversion is lossy.
 """
 
 from __future__ import absolute_import, unicode_literals
 from russian_tagsets import positional
 
 def to_positional(aot_tag):
-    pos, info = aot_tag
-    info = set(info.split(','))
+    if ',' in aot_tag:
+        pos, info = aot_tag.split(',', 1)
+        info = set(info.split(','))
+    else:
+        pos = aot_tag
+        info = set()
+
     tag = positional.Tag()
 
-    # ==== POS tag ====
+    # ==== 1,2: POS tag ====
     if pos == 'С':
         tag.POS = 'NN'
 
     elif pos == 'П':
         tag.POS = 'AA'
         # lossy: possessive adjectives (мамин, овечью) should be AU
+        # some comparative adjectives should be adverbs
 
     elif pos == 'Г':
         tag.POS = 'VB'
         # and degrees of comparison (vverxu, vnizu, potom)
         tag.POS = 'Db'
 
+#        if 'указат' in info or 'вопр' in info:
+#            tag.POS = 'Db'
+#        else:
+#            tag.POS = 'Dg'
+
         # fixme: adverb forming negation and degrees of
         # comparison (vysoko, daleko). Idea: pymorphy.get_graminfo
         # returns 'КР_ПРИЛ' and 'Н' variants for such adverbs.
         # e.g. конечно
         tag.POS = 'Db' # ?
 
-    # ==== gender ====
+    elif pos == 'ФРАЗ':
+        # e.g. несмотря
+        tag.POS = 'RF'
+
+    # ==== 3: gender ====
 
     if 'мр' in info:
         tag.gender = 'M'
     elif 'мр-жр' in info:
         tag.gender = 'X' # fixme?
 
-    # ==== animacy ====
+    # ==== 4: animacy ====
     # FIXME
     # X should be used: except for nouns, in all
     # forms except accusative masculine singular and accusative
     else:
         tag.animacy = '-' # fixme?
 
-    # ======== number =======
+    # ======== 5: number =======
     if 'ед' in info:
         tag.number = 'S'
     elif 'мн' in info:
     # 3rd person possessive pronouns:
     # tag.number = 'X'
 
-    # ======== case ==========
+    # ======== 6: case ==========
     CASES = {
         'им': '1',
         'рд': '2',
         tag.case = '1'
         tag.variant = '1'
 
-    # TODO: possessor's gender
-    # TODO: possessor's number
+    # TODO: 7: possessor's gender
+    # TODO: 8: possessor's number
 
-    # ======= person ========
+    # ======= 9: person ========
     # fixme: it should be X for non-declinable verbs
     PERSONS = {'1л': '1', '2л': '2', '3л': '3'}
     for person in PERSONS:
             tag.person = PERSONS[person]
             break
 
-    # ======= reflexivity ========
+    # ======= 10: reflexivity ========
     if tag.POS in ['AG', 'PP', 'P5', 'PS'] or tag.mainPOS == 'V':
         tag.reflexivity = 'I'
         # fixme: lossy! e.g. for "получиться" it should be 'R'.
 
-    # ======= tense =========
-    TENSES = {
-        'нст': 'P',
-        'прш': 'R',
-        'буд': 'F',
-    }
-    for tense in TENSES:
-        if tense in info:
-            tag.tense = TENSES[tense]
-            break
+    # ========= 15: voice ===========
+    # XXX: this is out of order because tense needs it
+    if tag.POS in ['AG', 'Ac']: # ??
+        if 'дст' in info:
+            tag.voice = 'A'
+        elif 'стр' in info:
+            tag.voice = 'P'
 
-    # TODO: verbal_aspect
+    # ======= 11: tense =========
+    if tag.POS in ['AG', 'VB', 'Vp']: # ?
+        TENSES = {
+            'нст': 'P',
+            'прш': 'R',
+            'буд': 'F',
+        }
+        for tense in TENSES:
+            if tense in info:
+                tag.tense = TENSES[tense]
+                break
+        # passive long participles
+        if tag.POS == 'AG' and tag.tense == '-' and tag.voice == 'P':
+            tag.tense = 'X'
 
-    # ========== degree of comparison =========
+
+    # TODO: 12: verbal_aspect
+
+    # ========== 13: degree of comparison =========
     if tag.POS in ['AA', 'Dg']:
         if 'сравн' in info:
             tag.degree_of_comparison = '2'
         else:
             tag.degree_of_comparison = '1'
 
-    # ========= negation =========
+    # ========= 14: negation =========
     if tag.mainPOS in ['N', 'A'] or tag.POS == 'Dg':
         tag.negation = 'A'
-        # fixme: lossy! e.g. for ""
-
+        # fixme: lossy!
 
     return tag
 

File russian_tagsets/tests/test_aot.py

 from russian_tagsets import converters
 from russian_tagsets import positional
 
+def _remove_unsupported(tag):
+    if not isinstance(tag, positional.Tag):
+        tag = positional.Tag(tag)
+
+    if tag.POS == 'AU':
+        tag.POS = 'AA'
+
+    tag.animacy = '-'
+    tag.possessors_gender = '-'
+    tag.possessors_number = '-'
+    tag.reflexivity = '-'
+    tag.verbal_aspect = '-'
+
+    if tag.degree_of_comparison == '1':
+        tag.degree_of_comparison = '-'
+
+    if tag.POS == 'Dg' and tag.degree_of_comparison != '-':
+        tag.POS = 'AA'
+
+    # prepositions
+    if tag.POS in ['RR', 'RV']:
+        tag.POS = 'R-'
+        tag.case = '-'
+
+    # conjunctions
+    if tag.POS in ['J,', 'J^']:
+        tag.POS = 'J-'
+
+
+    return str(tag)
 
 class TestConversion(object):
 
-    TAGS = [
-        [('МС', '3л,мр,ед,им'), 'PPM-S1--3I------'], # он
-        [('Г', 'дст,прш,мр,ед'), 'VBM-S----IR-----'], # купил
+    TAGS = [ # word, positional tag, pymorphy tag
+        # example sentence
+        ['он',          'PPM-S1--3I------', 'МС,3л,мр,ед,им'],
+        ['купил',       'VBM-S----IR-----', 'Г,дст,прш,мр,ед'],
+       #['нашу',        'PSFIS4-P1I------', 'МС-П,жр,ед,вн,од,но'], # not enough info
+        ['нашу',        'PSFIS4-P-I------', 'МС-П,жр,ед,вн,од,но'],
+        ['старую',      'AAFXS4------1A--', 'П,жр,ед,вн,од,но'],
+        ['фотографию',  'NNF-S4-------A--', 'С,жр,ед,вн'],
 
-        #[('МС-П', 'жр,ед,вн,од,но'), 'PSFIS4-P1I------'], # нашу:
-        [('МС-П', 'жр,ед,вн,од,но'), 'PSFXS4---I------'], # not enough info
+        # nouns
+        ['голос',       'NNMIS4-------A--', 'С,мр,ед,вн'], # online example tag was incorrect
 
-        [('П', 'жр,ед,вн,од,но'), 'AAFXS4------1A--'], # старую
-        [('С', 'жр,ед,вн'), 'NNF-S4-------A--'], # фотографию
+        # adjectives
+        ['тяжелый',     'AAMIS4------1A--', 'П,мр,ед,вн,од,но'],
+        ['красив',      'ACM-S--------A--', 'КР_ПРИЛ,мр,ед,од,но'],
+        ['читающий',    'AGMXS1---IPI-AA-', 'ПРИЧАСТИЕ,од,но,нст,дст,ед,мр,им'],
+        ['читаемый',    'AGMXS1---IXI-AP-', 'ПРИЧАСТИЕ,од,но,стр,ед,мр,им'],
+        ['мужнин',      'AUMXS4M------A--', 'П,мр,ед,вн,но'],
+        ['прочитан',    'AcM-S----I-P-AP-', 'КР_ПРИЧАСТИЕ,од,но,прш,стр,ед,мр'],
+
+        # verbs
+        ['отрываешь',   'VB--S---2IPI----', 'Г,дст,нст,2л,ед'],
+        ['читал',       'VBM-S----IRI----', 'Г,дст,прш,мр,ед'],
+        ['грозя',       'Ve-------I-I----', 'ДЕЕПРИЧАСТИЕ,дст,нст'],
+        ['написав',     'Ve-------I-P----', 'ДЕЕПРИЧАСТИЕ,дст,прш'],
+        ['спать',       'Vf-------I-I----', 'ИНФИНИТИВ,дст'],
+        ['работай',     'Vi--S---2I-I----', 'Г,дст,пвл,2л,ед'],
+
+        # adverbs
+        ['там',         'Db--------------', 'Н,указат'],
+        ['сильнее',     'Dg----------2A--', 'П,сравн,од,но'], # online example was incorrect
+
+        # prepositions
+        ['над',         'RR---7----------', 'ПРЕДЛ'],
+        ['надо',        'RV---7----------', 'ПРЕДЛ'],
+        ['несмотря',    'RF--------------', 'ФРАЗ'],
+
+        # conjunctions, particles, interjections
+        ['и',           'J^--------------', 'СОЮЗ'],
+        ['что',         'J,--------------', 'СОЮЗ'],
+        ['нет',         'TT--------------', 'ЧАСТ'],
+        ['ой',          'II--------------', 'МЕЖД'],
     ]
 
-    @pytest.mark.parametrize(("aot_tag", "pos_tag_txt"), TAGS)
-    def test_from_positional(self, aot_tag, pos_tag_txt):
-        converted = converters.convert(
-            positional.Tag(pos_tag_txt),
-            'positional', 'aot'
-        )
-        assert converted == aot_tag
+    def assertPositionalEqual(self, converted, gold):
+        assert _remove_unsupported(converted) == _remove_unsupported(gold)
 
-    @pytest.mark.parametrize(("aot_tag", "pos_tag_txt"), TAGS)
-    def test_to_positional(self, aot_tag, pos_tag_txt):
+#    @pytest.mark.parametrize(("word", "pos_tag_txt", "aot_tag"), TAGS)
+#    def test_from_positional(self, word, pos_tag_txt, aot_tag):
+#        converted = converters.convert(
+#            positional.Tag(pos_tag_txt),
+#            'positional', 'aot'
+#        )
+#        assert converted == aot_tag
+
+    @pytest.mark.parametrize(("word", "pos_tag_txt", "aot_tag"), TAGS)
+    def test_to_positional(self, word, pos_tag_txt, aot_tag):
         converted = converters.convert(aot_tag, 'aot', 'positional')
-        assert str(converted) == pos_tag_txt
+        self.assertPositionalEqual(converted, pos_tag_txt)