Mikhail Korobov avatar Mikhail Korobov committed 6310b2c

PunctuationPredictor

Comments (0)

Files changed (4)

pymorphy2/analyzer.py

 
     ENV_VARIABLE = 'PYMORPHY2_DICT_PATH'
     DEFAULT_PREDICTORS = [
+        predictors.PunctuationPredictor,
         predictors.LatinPredictor,
         predictors.HyphenSeparatedParticlePredictor,
         predictors.KnownPrefixPredictor,

pymorphy2/predictors.py

 import logging
 
 from .utils import word_splits
-from .shapes import is_latin
+from .shapes import is_latin, is_punctuation
 
 logger = logging.getLogger(__name__)
 
     "UnknownPrefixPredictor",
     "KnownSuffixPredictor",
     "HyphenSeparatedParticlePredictor",
+    "PunctuationPredictor",
     "LatinPredictor",
 ]
 
 
         return result
 
+class _ShapeAnalyzer(BasePredictor):
+    """
+    Base class for predictors that tag a token by its surface shape.
+
+    Subclasses implement ``_check_shape`` (does the token match?) and
+    ``_get_tag`` (which tag a matching token gets); ``parse`` and ``tag``
+    are built on top of these two hooks.
+    """
+    # Value placed in the sixth position of every tuple built by ``parse``.
+    ESTIMATE = 0.5
+    # Grammemes added to ``TagClass.KNOWN_GRAMMEMES`` when an instance
+    # is constructed.
+    EXTRA_GRAMMEMES = []
 
-class LatinPredictor(BasePredictor):
+    def __init__(self, morph):
+        super(_ShapeAnalyzer, self).__init__(morph)
+        # NOTE(review): presumably tags can only be built from known
+        # grammemes, hence the registration here - confirm in TagClass.
+        self.morph.TagClass.KNOWN_GRAMMEMES.update(self.EXTRA_GRAMMEMES)
+
+    def _check_shape(self, word):
+        """
+        Hook: return a truthy value if ``word`` has the shape this
+        predictor handles. The value is passed on to ``_get_tag``.
+        """
+        raise NotImplementedError()
+
+    def _get_tag(self, word, shape):
+        """
+        Hook: return the tag for ``word``; ``shape`` is the truthy
+        result of ``_check_shape``.
+        """
+        raise NotImplementedError()
+
+    def parse(self, word, seen_parses):
+        """
+        Return a one-element list with a parse tuple for ``word`` if its
+        shape matches, or an empty list otherwise. ``seen_parses`` is
+        not used.
+        """
+        shape = self._check_shape(word)
+        if not shape:
+            return []
+
+        # ``word`` fills both the first and the third slot
+        # (NOTE(review): presumably word and normal form - confirm).
+        return [(
+            word, self._get_tag(word, shape), word,
+            None, None, self.ESTIMATE,
+            [(self, )],
+        )]
+
+    def tag(self, word, seen_tags):
+        """
+        Return a one-element list with the tag for ``word`` if its shape
+        matches, or an empty list otherwise. ``seen_tags`` is not used.
+        """
+        shape = self._check_shape(word)
+        if not shape:
+            return []
+        return [self._get_tag(word, shape)]
+
+    def get_lexeme(self, form, methods):
+        """Return a list containing only ``form`` itself."""
+        return [form]
+
+    def normalized(self, form):
+        """Return ``form`` unchanged."""
+        return form
+
+
+class PunctuationPredictor(_ShapeAnalyzer):
+    """
+    This predictor tags punctuation marks as "PNCT".
+
+    A token is handled when ``is_punctuation`` returns True for it.
+    """
+    # NOTE(review): presumably a terminal predictor stops later predictors
+    # from running once it has produced a result - confirm in the analyzer.
+    terminal = True
+    # Same value as the one inherited from _ShapeAnalyzer.
+    ESTIMATE = 0.5
+    EXTRA_GRAMMEMES = ['PNCT']
+
+    def __init__(self, morph):
+        super(PunctuationPredictor, self).__init__(morph)
+        # Build the tag once; every matching token gets this same object.
+        self._tag = self.morph.TagClass('PNCT')
+
+    def _get_tag(self, word, shape):
+        """Return the shared "PNCT" tag; ``word`` and ``shape`` are ignored."""
+        return self._tag
+
+    def _check_shape(self, word):
+        """Return True if ``word`` is a punctuation token."""
+        return is_punctuation(word)
+
+
+class LatinPredictor(_ShapeAnalyzer):
     """
     This predictor marks latin words with "LATN" tag.
     """
 
     def __init__(self, morph):
         super(LatinPredictor, self).__init__(morph)
-        self.morph.TagClass.KNOWN_GRAMMEMES.update(self.EXTRA_GRAMMEMES)
         self._tag = self.morph.TagClass('LATN')
 
-    def parse(self, word, seen_parses):
-        if not is_latin(word):
-            return []
+    def _get_tag(self, word, shape):
+        # The tag is built once in __init__ and shared by every match;
+        # ``word`` and ``shape`` are ignored.
+        return self._tag
 
-        return [(
-            word, self._tag, word,
-            None, None, self.ESTIMATE,
-            [(self, )],
-        )]
-
-    def tag(self, word, seen_tags):
-        if not is_latin(word):
-            return []
-        return [self._tag]
-
-    def get_lexeme(self, form, methods):
-        return [form]
-
-    def normalized(self, form):
-        return form
+    def _check_shape(self, word):
+        # parse() and tag() are inherited from _ShapeAnalyzer and call
+        # this hook to decide whether the token is handled.
+        return is_latin(word)
 
 
 def _add_parse_if_not_seen(parse, result_list, seen_parses):

pymorphy2/shapes.py

 from __future__ import absolute_import
 import unicodedata
 
-_latin_letters_cache={}
-def is_latin_letter(uchr):
-    try:
-        return _latin_letters_cache[uchr]
-    except KeyError:
-        if isinstance(uchr, bytes):
-            uchr = uchr.decode('ascii')
-        is_latin = 'LATIN' in unicodedata.name(uchr)
-        return _latin_letters_cache.setdefault(uchr, is_latin)
-
-def is_latin(word):
+def is_latin(token):
     """
-    Return True if all word letters are latin and there is at
-    least one latin letter in a word::
+    Return True if all token letters are latin and there is at
+    least one latin letter in the token::
 
         >>> is_latin('foo')
         True
 
     """
+    # Only alphabetic characters are inspected, so digits, spaces and
+    # punctuation inside the token do not affect the result.
     return (
-        any(ch.isalpha() for ch in word) and
-        all(is_latin_letter(ch) for ch in word if ch.isalpha())
+        any(ch.isalpha() for ch in token) and
+        all(is_latin_char(ch) for ch in token if ch.isalpha())
     )
+
+def is_punctuation(token):
+    """
+    Return True if a token contains only spaces and punctuation marks
+    and there is at least one punctuation mark::
+
+        >>> is_punctuation(', ')
+        True
+        >>> is_punctuation('..!')
+        True
+        >>> is_punctuation('x')
+        False
+        >>> is_punctuation(' ')
+        False
+        >>> is_punctuation('')
+        False
+
+    A character counts as a punctuation mark when its Unicode general
+    category starts with "P".
+    """
+    if isinstance(token, bytes): # python 2.x ascii str
+        token = token.decode('ascii')
+
+    # An empty or whitespace-only token contains no punctuation mark;
+    # otherwise every non-space character must be in a "P*" category.
+    return (
+        bool(token) and
+        not token.isspace() and
+        all(unicodedata.category(ch)[0] == 'P' for ch in token if not ch.isspace())
+    )
+
+_latin_letters_cache = {}
+def is_latin_char(uchr):
+    """
+    Return True if the Unicode name of ``uchr`` contains "LATIN".
+
+    ``uchr`` is a single character; an ASCII byte string is decoded
+    first. Results are memoized in ``_latin_letters_cache``. Characters
+    that have no Unicode name are treated as non-latin.
+    """
+    try:
+        return _latin_letters_cache[uchr]
+    except KeyError:
+        if isinstance(uchr, bytes):
+            uchr = uchr.decode('ascii')
+        # unicodedata.name() raises ValueError for unnamed code points
+        # unless a default is given; such characters are not latin.
+        is_latin = 'LATIN' in unicodedata.name(uchr, '')
+        return _latin_letters_cache.setdefault(uchr, is_latin)

tests/test_analyzer.py

 
     def test_normal_forms(self):
         assert morph.normal_forms('Maßstab') == ['Maßstab']
+
+
+class TestPunctuationPredictor:
+    # The class name must start with "Test": with pytest's default
+    # discovery rules a class named "Tets..." is never collected.
+    def test_tag(self):
+        # A lone ellipsis must get exactly one tag, and it must be PNCT.
+        tags = morph.tag('…')
+        assert len(tags) == 1
+        assert 'PNCT' in tags[0]
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.