Mikhail Korobov committed 93ce947

LatinPredictor

Comments (0)

Files changed (4)

pymorphy2/analyzer.py

     @property
     def normalized(self):
         """ A :class:`Parse` instance for :attr:`self.normal_form`. """
-        if self.idx == 0:
-            return self
-
-        tag = self._dict.build_tag_info(self.para_id, 0)
-        return self.__class__(self.normal_form, tag, self.normal_form,
-                              self.para_id, 0, self.estimate, self.methods)
+        return self.__class__(*self.methods[-1][0].normalized(self))
 
     @property
     def paradigm(self):
 
         return result
 
+    def normalized(self, form):
+        fixed_word, tag, normal_form, para_id, idx, estimate, methods = form
+
+        if idx == 0:
+            return form
+
+        tag = self.build_tag_info(para_id, 0)
+        return (normal_form, tag, normal_form,
+                para_id, 0, estimate, methods)
+
+
 
     # ===== misc =======
 
 
     ENV_VARIABLE = 'PYMORPHY2_DICT_PATH'
     DEFAULT_PREDICTORS = [
+        predictors.LatinPredictor,
         predictors.HyphenSeparatedParticlePredictor,
         predictors.KnownPrefixPredictor,
         predictors.UnknownPrefixPredictor,
                 seen.add(normal_form)
         return result
 
-
     # ==== inflection ========
 
     def get_lexeme(self, form):
             return result
         return [self._result_type(*p) for p in result]
 
-
     def _inflect(self, form, required_grammemes):
         grammemes = form[1].updated_grammemes(required_grammemes)
 

pymorphy2/predictors.py

 import logging
 
 from .utils import word_splits
+from .shapes import is_latin
 
 logger = logging.getLogger(__name__)
 
     "UnknownPrefixPredictor",
     "KnownSuffixPredictor",
     "HyphenSeparatedParticlePredictor",
+    "LatinPredictor",
 ]
 
 class BasePredictor(object):
         previous_predictor = methods[-2][0]
         return previous_predictor.get_lexeme(form, methods[:-1])
 
+    def normalized(self, form):
+        return self.dict.normalized(form)
+
     def __repr__(self):
         return str("<%s>") % self.__class__.__name__
 
         return result
 
 
+class LatinPredictor(BasePredictor):
+    """
+    This predictor marks latin words with "LATN" tag.
+    """
+    terminal = True
+    ESTIMATE = 0.5
+    EXTRA_GRAMMEMES = ['LATN']
+
+    def __init__(self, morph):
+        super(LatinPredictor, self).__init__(morph)
+        self.morph.TagClass.KNOWN_GRAMMEMES.update(self.EXTRA_GRAMMEMES)
+        self._tag = self.morph.TagClass('LATN')
+
+    def parse(self, word, seen_parses):
+        if not is_latin(word):
+            return []
+
+        return [(
+            word, self._tag, word,
+            None, None, self.ESTIMATE,
+            [(self, )],
+        )]
+
+    def tag(self, word, seen_tags):
+        if not is_latin(word):
+            return []
+        return [self._tag]
+
+    def get_lexeme(self, form, methods):
+        return [form]
+
+    def normalized(self, form):
+        return form
+
+
 def _add_parse_if_not_seen(parse, result_list, seen_parses):
     reduced_parse = parse[:3]
     if reduced_parse in seen_parses:

pymorphy2/shapes.py

+# -*- coding: utf-8 -*-
+from __future__ import absolute_import
+import unicodedata
+
+_latin_letters_cache={}
+def is_latin_letter(uchr):
+    try:
+        return _latin_letters_cache[uchr]
+    except KeyError:
+        if isinstance(uchr, bytes):
+            uchr = uchr.decode('ascii')
+        is_latin = 'LATIN' in unicodedata.name(uchr)
+        return _latin_letters_cache.setdefault(uchr, is_latin)
+
+def is_latin(word):
+    """
+    Return True if all word letters are latin and there is at
+    least one latin letter in a word::
+
+        >>> is_latin('foo')
+        True
+        >>> is_latin('123-FOO')
+        True
+        >>> is_latin('123')
+        False
+        >>> is_latin(':)')
+        False
+
+    """
+    return (
+        any(ch.isalpha() for ch in word) and
+        all(is_latin_letter(ch) for ch in word if ch.isalpha())
+    )

tests/test_analyzer.py

         self.assertAllTuples(morph_plain.parse('кот'))
         # self.assertAllTuples(morph_plain.inflect('кот', set(['plur'])))
         # self.assertAllTuples(morph_plain.decline('кот'))
+
+class TestLatinPredictor:
+
+    def test_tag(self):
+        tags = morph.tag('Maßstab')
+        assert len(tags) == 1
+        assert 'LATN' in tags[0]
+
+    def test_parse(self):
+        parses = morph.parse('Maßstab')
+        assert len(parses) == 1
+        assert 'LATN' in parses[0].tag
+
+    def test_lexeme(self):
+        p = morph.parse('Maßstab')[0]
+        assert p.lexeme == [p]
+
+    def test_normalized(self):
+        p = morph.parse('Maßstab')[0]
+        assert p.normalized == p
+
+    def test_normal_forms(self):
+        assert morph.normal_forms('Maßstab') == ['Maßstab']
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use the ↑ and ↓ arrow keys to navigate, and press Return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.