Commits

Mikhail Korobov  committed ac41b6e

HyphenatedWordsPredictor draft (inflection is not implemented)

  • Participants
  • Parent commits 1bb2ccd

Comments (0)

Files changed (3)

File pymorphy2/analyzer.py

         predictors.PunctuationPredictor,
         predictors.LatinPredictor,
         predictors.HyphenSeparatedParticlePredictor,
+        predictors.HyphenatedWordsPredictor,
         predictors.KnownPrefixPredictor,
         predictors.UnknownPrefixPredictor,
         predictors.KnownSuffixPredictor,

File pymorphy2/predictors.py

     "UnknownPrefixPredictor",
     "KnownSuffixPredictor",
     "HyphenSeparatedParticlePredictor",
+    "HyphenatedWordsPredictor",
     "PunctuationPredictor",
     "LatinPredictor",
 ]
         return result
 
 
+class HyphenatedWordsPredictor(BasePredictor):
+    """
+    Parse the word by parsing its hyphen-separated parts.
+    """
+
+    # NOTE(review): `terminal` is not read inside this class; presumably the
+    # analyzer uses it to decide whether later predictors still run -- confirm.
+    terminal = True
+    # Every parse produced here gets its source estimate multiplied by this.
+    ESTIMATE_DECAY = 0.75
+
+    def _similarity_features(self, tag):
+        """
+        @type tag: pymorphy2.tagset.OpencorporaTag
+        """
+        # Two parts count as "parsed the same way" in parse() when all of
+        # these tag attributes are equal.
+        return (tag.POS, tag.number, tag.case, tag.person, tag.tense)
+
+    def parse(self, word, seen_parses):
+        # Returns a list of 7-item parse tuples
+        # (word, tag, normal_form, para_id, idx, estimate, methods) built from
+        # the parses of the two parts of `word`. Candidates are handed to
+        # _add_parse_if_not_seen together with `result` and `seen_parses`.
+        # Words without a hyphen produce an empty list.
+        if '-' not in word:
+            return []
+
+        result = []
+
+        # If there are more than 2 parts, the rest would be parsed
+        # by recursion.
+        # Only the first hyphen is split on, so `right` may still contain
+        # hyphens and `left` or `right` may be an empty string.
+        left, right = word.split('-', 1)
+
+        left_parses = self.morph.parse(left)
+        right_parses = self.morph.parse(right)
+
+        # Step 1: Assume that the left part is an uninflected prefix.
+        # Examples: интернет-магазин, воздушно-капельный
+        method1 = (self, right)
+        # Similarity features of each right parse, in the same order as
+        # `right_parses`; reused by step 2 below.
+        right_features = []
+
+        for fixed_word, tag, normal_form, para_id, idx, estimate, methods in right_parses:
+            # The left part is prepended unchanged to both the word and the
+            # normal form; everything else comes from the right parse.
+            parse = (
+                '-'.join([left, fixed_word]), tag, '-'.join([left, normal_form]),
+                para_id, idx, estimate*self.ESTIMATE_DECAY,
+                methods+(method1,)
+            )
+            _add_parse_if_not_seen(parse, result, seen_parses)
+            right_features.append(self._similarity_features(tag))
+
+        # Step 2: if left and right can be parsed the same way,
+        # then it may be the case that both parts should be inflected.
+        # Examples: человек-гора, команд-участниц, компания-производитель
+
+        method2 = (self, word)
+
+        # FIXME: quadratic algorithm
+        for left_parse in left_parses:
+
+            left_feat = self._similarity_features(left_parse[1])
+
+            for parse_index, right_parse in enumerate(right_parses):
+                right_feat = right_features[parse_index]
+
+                # Only pair parses whose similarity features are identical.
+                if left_feat != right_feat:
+                    continue
+
+                # Combined parse: word and normal form join both parts; tag,
+                # para_id, idx, estimate and method stack come from the left
+                # parse only (the right parse's estimate is not used).
+                parse = (
+                    '-'.join([left_parse[0], right_parse[0]]), # word
+                    left_parse[1], # tag is from the left part
+                    '-'.join([left_parse[2], right_parse[2]]),  # normal form
+                    left_parse[3], left_parse[4], # para_id, idx?
+                    left_parse[5]*self.ESTIMATE_DECAY,
+                    left_parse[6]+(method2,)
+                )
+                _add_parse_if_not_seen(parse, result, seen_parses)
+
+        return result
+
+    def tag(self, word, seen_tags):
+        # Collects the tag (item 1) of every parse of `word`, passing each to
+        # _add_tag_if_not_seen with `result` and `seen_tags`. parse() is
+        # called with a fresh empty set as its seen_parses argument.
+        result = []
+        for p in self.parse(word, set()):
+            _add_tag_if_not_seen(p[1], result, seen_tags)
+        return result
+
+
+
+
 class _ShapeAnalyzer(BasePredictor):
     ESTIMATE = 0.5
     EXTRA_GRAMMEMES = []

File tests/test_analyzer.py

 
     ('кюди', ['кюдить', 'кюдь', 'кюди']), # и никаких "человек"
 
-    ("ей-то", ["она-то"]),
-    ("скажи-ка", ["сказать-ка"]),
-    ('измохратился-таки', ['измохратиться-таки']),
 ]
 
+# Each entry is (word, expected normal form, expected tag string); the test
+# passes when at least one parse of the word matches both expectations.
+HYPHEN_TEST_DATA = [
+    # particles
+    ("ей-то", "она-то", 'NPRO,femn,3per sing,datv'),
+    ("скажи-ка", "сказать-ка", "VERB,perf,tran sing,impr,excl"),
+    ('измохратился-таки', 'измохратиться-таки', "VERB,perf,intr masc,sing,past,indc"),
+
+    # compound words with immutable left
+    ('интернет-магазина', 'интернет-магазин', 'NOUN,inan,masc sing,gent'),
+    ('pdf-документов', 'pdf-документ', 'NOUN,inan,masc plur,gent'),
+    ('аммиачно-селитрового', 'аммиачно-селитровый', 'ADJF,Qual masc,sing,gent'),
+    ('быстро-быстро', 'быстро-быстро', 'ADVB'),
+
+    # compound words with mutable left
+    ('команд-участниц', 'команда-участница', 'NOUN,inan,femn plur,gent'),
+    ('бегает-прыгает', 'бегать-прыгать', 'VERB,impf,intr sing,3per,pres,indc'),
+    ('дул-надувался', 'дуть-надуваться', 'VERB,impf,tran masc,sing,past,indc'),
+
+    # words starting with "по" (Russian "ПО-")
+    ('почтово-банковский', 'почтово-банковский', 'ADJF masc,sing,nomn'),
+    ('по-прежнему', 'по-прежнему', 'ADVB'),
+
+    # old bugs
+    ('поездов-экспрессов', 'поезд-экспресс', 'NOUN,inan,masc plur,gent'),
+    ('подростками-практикантами', 'подросток-практикант', 'NOUN,anim,masc plur,ablt'),
+    ('подводников-североморцев', 'подводник-североморец', 'NOUN,anim,masc plur,gent'),
+
+    # cities
+    ('санкт-петербурга', 'санкт-петербург', 'NOUN,inan,masc,Geox sing,gent'),
+    ('ростове-на-дону', 'ростов-на-дону', 'NOUN,inan,masc,Sgtm,Geox sing,loct'),
+]
+
+# Same tuple layout as HYPHEN_TEST_DATA; these cases feed the test that is
+# marked with pytest.mark.xfail.
+HYPHEN_TEST_DATA_XFAIL = [
+    ('по-воробьиному', 'по-воробьиному', 'ADVB'),
+]
+
+
 NON_PRODUCTIVE_BUGS_DATA = [
     ('бякобы', 'PRCL'),
     ('бякобы', 'CONJ'),
     ('псевдоякобы', 'CONJ'),
 ]
 
+
 def with_test_data(data, second_param_name='parse_result'):
     # Returns a pytest.mark.parametrize decorator over `data` whose two
     # argument names are "word" and `second_param_name`.
     return pytest.mark.parametrize(
         ("word", second_param_name),
         data
     )
 
+
 class TestNormalForms:
 
     @with_test_data(TEST_DATA)
         assert self._parse_cls_first_index(parse, 'NOUN') < self._parse_cls_first_index(parse, 'ADVB')
 
 
+class TestHyphen:
+
+    # Passes when any parse of `word` has both the expected normal form and
+    # the expected tag (compared via str(p.tag)); otherwise fails with the
+    # full list of parses as the assertion message.
+    def assertHasParse(self, word, normal_form, tag):
+        for p in morph.parse(word):
+            if p.normal_form == normal_form and str(p.tag) == tag:
+                return
+
+        assert False, morph.parse(word)
+
+
+    # One test case per (word, normal_form, tag) entry of HYPHEN_TEST_DATA.
+    @pytest.mark.parametrize(("word", "normal_form", "tag"), HYPHEN_TEST_DATA)
+    def test_hyphenated_words(self, word, normal_form, tag):
+        self.assertHasParse(word, normal_form, tag)
+
+    # Same check over HYPHEN_TEST_DATA_XFAIL; marked xfail, i.e. these cases
+    # are expected to fail.
+    @pytest.mark.xfail
+    @pytest.mark.parametrize(("word", "normal_form", "tag"), HYPHEN_TEST_DATA_XFAIL)
+    def test_hyphenated_words_xfail(self, word, normal_form, tag):
+        self.assertHasParse(word, normal_form, tag)
+
+
 class TestTagWithPrefix:
 
     def test_tag_with_unknown_prefix(self):