Mikhail Korobov avatar Mikhail Korobov committed bde01ee

Specialized analyzers for names, surnames, patronymic names, geographical names and organization names
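
The new units predict tags for out-of-vocabulary proper names by matching word
endings against per-grammeme suffix DAWGs (Name, Surn, Patr, Geox, Orgn)
compiled from dictionary words that carry those grammemes. A minimal usage
sketch of the intended effect, assuming a dictionary compiled in the new
format is available (the word is hypothetical and the exact parses depend on
the dictionary):

    import pymorphy2

    morph = pymorphy2.MorphAnalyzer()

    # 'Шмидхубер' is presumably not in the dictionary; SurnameAnalyzer
    # should contribute 'Surn' parses predicted from dictionary surnames
    # with similar endings.
    print(morph.tag('Шмидхубер'))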

Files changed (8)

pymorphy2/analyzer.py

         units.HyphenatedWordsAnalyzer,
         units.KnownPrefixAnalyzer,
         units.UnknownPrefixAnalyzer,
+
+        units.NameAnalyzer,
+        units.SurnameAnalyzer,
+        units.PatronymicAnalyzer,
+        units.GeoAnalyzer,
+        units.OrganizationAnalyzer,
+
         units.KnownSuffixAnalyzer,
     ]
 

pymorphy2/opencorpora_dict/compile.py

 
 from pymorphy2 import dawg
 from pymorphy2.constants import PARADIGM_PREFIXES, PREDICTION_PREFIXES
-from pymorphy2.utils import longest_common_substring, largest_group
+from pymorphy2.utils import longest_common_substring, largest_elements
 
 logger = logging.getLogger(__name__)
 
 
 CompiledDictionary = collections.namedtuple(
     'CompiledDictionary',
-    'gramtab suffixes paradigms words_dawg prediction_suffixes_dawgs parsed_dict prediction_options'
+    'gramtab suffixes paradigms words_dawg prediction_suffixes_dawgs extra_prediction_dawgs parsed_dict prediction_options'
 )
 
+EXTRA_GRAMMEMES_FOR_PREDICTION = ['Name', 'Surn', 'Patr', 'Geox', 'Orgn']
 
 def convert_to_pymorphy2(opencorpora_dict_path, out_path, overwrite=False,
                          prediction_options=None):
     paradigms = (fix_strings(para) for para in paradigms)
     paradigms = [_linearized_paradigm(paradigm) for paradigm in paradigms]
 
-    logger.debug('calculating prediction data..')
+    logger.debug('calculating main prediction data..')
     suffixes_dawgs_data = _suffixes_prediction_data(
-        words, paradigm_popularity, gramtab, paradigms, suffixes, **_prediction_options
+        words, paradigm_popularity, gramtab, paradigms, suffixes,
+        _POS_tags(gramtab),
+        **_prediction_options
     )
 
+    aux_dawgs_data = {}
+    aux_prediction_options = dict(
+        min_ending_freq=2,
+        min_paradigm_popularity=1,
+        max_suffix_length=_prediction_options['max_suffix_length'],
+        max_parses_per_grammeme=4,
+    )
+    for grammeme in EXTRA_GRAMMEMES_FOR_PREDICTION:
+        logger.debug('calculating auxiliary prediction data for %s..', grammeme)
+
+        aux_dawgs_data[grammeme] = _suffixes_prediction_data(
+            words, paradigm_popularity, gramtab, paradigms, suffixes,
+            set([grammeme]),
+            **aux_prediction_options
+        )
+
+
     logger.debug('building word DAWG..')
     words_dawg = dawg.WordsDawg(words)
 
     del words
 
+    logger.debug('building prediction_suffixes DAWGs..')
+    prediction_suffixes_dawgs = [dawg.PredictionSuffixesDAWG(d) for d in suffixes_dawgs_data]
 
-    prediction_suffixes_dawgs = []
-    for prefix_id, dawg_data in enumerate(suffixes_dawgs_data):
-        logger.debug('building prediction_suffixes DAWGs #%d..' % prefix_id)
-        prediction_suffixes_dawgs.append(dawg.PredictionSuffixesDAWG(dawg_data))
+    logger.debug('building extra prediction DAWGs..')
+
+    extra_prediction_dawgs = {}
+    for grammeme, data in aux_dawgs_data.items():
+        extra_prediction_dawgs[grammeme] = [dawg.PredictionSuffixesDAWG(d) for d in data]
 
     return CompiledDictionary(tuple(gramtab), suffixes, paradigms,
-                              words_dawg, prediction_suffixes_dawgs, parsed_dict,
-                              _prediction_options)
+                              words_dawg, prediction_suffixes_dawgs,
+                              extra_prediction_dawgs,
+                              parsed_dict, _prediction_options)
 
 
 def _join_lexemes(lexemes, links):
     return stem, tuple(zip(suffixes, tags, prefixes))
 
 
-def _suffixes_prediction_data(words, paradigm_popularity, gramtab, paradigms, suffixes,
-                              min_ending_freq, min_paradigm_popularity, max_suffix_length):
+def _suffixes_prediction_data(words, paradigm_popularity, gramtab, paradigms, suffixes, grammemes,
+                              min_ending_freq, min_paradigm_popularity, max_suffix_length,
+                              max_parses_per_grammeme=1):
 
-    logger.debug('calculating prediction data: removing non-productive paradigms..')
-    productive_paradigms = set(
-        para_id
-        for (para_id, count) in paradigm_popularity.items()
-        if count >= min_paradigm_popularity
-    )
+    productive_paradigms = _popular_paradigms(paradigm_popularity, min_paradigm_popularity)
 
-    # ["suffix"] => number of occurrences
-    # this is for removing non-productive suffixes
-    ending_counts = collections.defaultdict(int)
+    def iter_words():
+        for word, (para_id, idx) in _show_progress(words, 1e6):
+            if para_id not in productive_paradigms:
+                continue
+            yield word, (para_id, idx)
 
-    # [form_prefix_id]["suffix"]["POS"][(para_id, idx)] => number or occurrences
-    # this is for selecting most popular parses
+    logger.debug('collecting statistics for word suffixes..')
+    words_info = _iter_words_info(iter_words(), paradigms, gramtab, suffixes)
+    ending_counts, endings = _ending_stats(words_info, max_suffix_length, grammemes)
+
+    # logger.debug('preparing data for DAWGs building..')  # it is fast
+    dawgs_data = []
+    for form_prefix_id in sorted(endings.keys()):
+        _endings = endings[form_prefix_id]
+
+        counted_suffixes_dawg_data = []
+
+        for word_end in _endings:
+            if ending_counts[word_end] < min_ending_freq:
+                continue
+
+            for grammeme in _endings[word_end]:
+                common_endings = largest_elements(
+                    _endings[word_end][grammeme].items(),
+                    operator.itemgetter(1),
+                    max_parses_per_grammeme
+                )
+
+                for form, cnt in common_endings:
+                    record = word_end, (cnt,) + form
+                    counted_suffixes_dawg_data.append(record)
+
+        dawgs_data.append(counted_suffixes_dawg_data)
+
+    return dawgs_data
+
+
+def _ending_stats(words_info, max_suffix_length, interesting_grammemes):
+    """
+    Return (ending_counts, endings) tuple.
+
+    ending_counts: ["suffix"] => number of occurrences;
+        used for removing non-productive suffixes
+
+    endings: [form_prefix_id]["suffix"]["grammeme"][(para_id, idx)] => number of occurrences;
+        used for selecting the most popular parses
+
+    """
     endings = {}
     for form_prefix_id in range(len(PARADIGM_PREFIXES)):
         endings[form_prefix_id] = collections.defaultdict(
                                     lambda: collections.defaultdict(
                                         lambda: collections.defaultdict(int)))
+    ending_counts = collections.defaultdict(int)
+    interesting_grammemes = set(interesting_grammemes)
 
-    logger.debug('calculating prediction data: checking word endings..')
+    for word, tag, form_prefix, form_suffix, form_prefix_id, para_id, idx in words_info:
+        grammemes = set(_to_grammemes(tag)) & interesting_grammemes
+        if not grammemes:
+            continue
+
+        _endings = endings[form_prefix_id]
+
+        for word_end in _iter_prediction_suffixes(word, form_suffix, max_suffix_length):
+            ending_counts[word_end] += 1
+            for grammeme in grammemes:
+                _endings[word_end][grammeme][(para_id, idx)] += 1
+
+    return ending_counts, endings
+
+
+def _popular_paradigms(paradigm_popularity, min_count):
+    return set(
+        para_id
+        for (para_id, count) in paradigm_popularity.items()
+        if count >= min_count
+    )
+
+
+def _iter_words_info(words, paradigms, gramtab, suffixes):
     for word, (para_id, idx) in words:
 
-        if para_id not in productive_paradigms:
-            continue
-
         paradigm = paradigms[para_id]
-
         form_count = len(paradigm) // 3
-
         tag = gramtab[paradigm[form_count + idx]]
         form_prefix_id = paradigm[2*form_count + idx]
         form_prefix = PARADIGM_PREFIXES[form_prefix_id]
         form_suffix = suffixes[paradigm[idx]]
         assert word.endswith(form_suffix), word
 
         if len(word) == len(form_prefix)+len(form_suffix):
-            # pseudo-paradigm
+            # pseudo-paradigms are useless for prediction
             continue
 
-        POS = tuple(tag.replace(' ', ',', 1).split(','))[0]
+        yield word, tag, form_prefix, form_suffix, form_prefix_id, para_id, idx
 
-        for i in range(max(len(form_suffix), 1), max_suffix_length+1): #was: 1,2,3,4,5
-            word_end = word[-i:]
 
-            ending_counts[word_end] += 1
-            endings[form_prefix_id][word_end][POS][(para_id, idx)] += 1
+def _to_grammemes(tag):
+    return tag.replace(' ', ',', 1).split(',')
 
-    dawgs_data = []
 
-    for form_prefix_id in sorted(endings.keys()):
-
-        logger.debug('calculating prediction data: preparing DAWGs data #%d..' % form_prefix_id)
-
-        counted_suffixes_dawg_data = []
-
-        endings_with_prefix = endings[form_prefix_id]
-        for suff in endings_with_prefix:
-
-            if ending_counts[suff] < min_ending_freq:
-                continue
-
-            for POS in endings_with_prefix[suff]:
-
-                common_endings = largest_group(
-                    endings_with_prefix[suff][POS].items(),
-                    operator.itemgetter(1)
-                )
-
-                for form, cnt in common_endings:
-                    counted_suffixes_dawg_data.append(
-                        (suff, (cnt,)+ form)
-                    )
-
-        dawgs_data.append(counted_suffixes_dawg_data)
-
-    return dawgs_data
+def _POS_tags(gramtab):
+    return set(_to_grammemes(tag)[0] for tag in gramtab)
 
 
 def _linearized_paradigm(paradigm):
             return False
     return True
 
+
+def _iter_prediction_suffixes(word, form_suffix, max_suffix_length):
+    min_length = max(len(form_suffix), 1)
+    for i in range(min_length, max_suffix_length+1):
+        yield word[-i:]
+
+
+def _show_progress(iterator, print_every):
+    """ Print "NUM done" message every ``print_every`` iteration. """
+    for index, el in enumerate(iterator):
+        if not (index % int(print_every)):
+            logger.debug("%d done", index)
+        yield el
+
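
The suffix windows that feed these statistics are easy to inspect in
isolation. A standalone sketch of the _iter_prediction_suffixes logic above
(the sample word and the length limit are illustrative):

    def iter_prediction_suffixes(word, form_suffix, max_suffix_length):
        # Windows are never shorter than the paradigm's own form suffix
        # and never longer than max_suffix_length.
        min_length = max(len(form_suffix), 1)
        for i in range(min_length, max_suffix_length + 1):
            yield word[-i:]

    # For 'бегемотами' with form suffix 'ами' and a limit of 5, the
    # counted endings are 'ами', 'тами' and 'отами'.
    print(list(iter_prediction_suffixes('бегемотами', 'ами', 5)))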

pymorphy2/opencorpora_dict/storage.py

 
 logger = logging.getLogger(__name__)
 
-CURRENT_FORMAT_VERSION = '2.1'
+CURRENT_FORMAT_VERSION = '2.2'
 
 LoadedDictionary = collections.namedtuple(
     'LoadedDictionary',
-    'meta, gramtab, suffixes, paradigms, words, prediction_prefixes, prediction_suffixes_dawgs, Tag, paradigm_prefixes'
+    'meta, gramtab, suffixes, paradigms, words, prediction_prefixes, prediction_suffixes_dawgs, extra_prediction_dawgs, Tag, paradigm_prefixes'
 )
 
 
     prediction_suffixes_dawgs = []
     for prefix_id in range(len(paradigm_prefixes)):
         fn = _f('prediction-suffixes-%s.dawg' % prefix_id)
+        assert os.path.exists(fn)
+        prediction_suffixes_dawgs.append(dawg.PredictionSuffixesDAWG().load(fn))
 
-        assert os.path.exists(fn)
+    extra_prediction_dawgs = {}
+    for grammeme in meta.get('extra_prediction_dawg_lengths', []):
+        extra_prediction_dawgs[grammeme] = []
+        for prefix_id in range(len(paradigm_prefixes)):
+            fn = _f('%s-prediction-suffixes-%s.dawg' % (grammeme, prefix_id))
+            assert os.path.exists(fn)
+            d = dawg.PredictionSuffixesDAWG().load(fn)
+            extra_prediction_dawgs[grammeme].append(d)
 
-        prediction_suffixes_dawgs.append(dawg.PredictionSuffixesDAWG().load(fn))
 
     return LoadedDictionary(meta, gramtab, suffixes, paradigms, words,
                             prediction_prefixes, prediction_suffixes_dawgs,
-                            Tag, paradigm_prefixes)
+                            extra_prediction_dawgs, Tag, paradigm_prefixes)
 
 
 def save_compiled_dict(compiled_dict, out_path):
     json_write(_f('grammemes.json'), compiled_dict.parsed_dict.grammemes)
 
     gramtab_formats = {}
-    for format, Tag in tagset.registry.items():
+    for format_, Tag in tagset.registry.items():
         Tag._init_grammemes(compiled_dict.parsed_dict.grammemes)
         new_gramtab = [Tag._from_internal_tag(tag) for tag in compiled_dict.gramtab]
 
-        gramtab_name = "gramtab-%s.json" % format
-        gramtab_formats[format] = gramtab_name
+        gramtab_name = "gramtab-%s.json" % format_
+        gramtab_formats[format_] = gramtab_name
 
         json_write(_f(gramtab_name), new_gramtab)
 
     for prefix_id, prediction_suffixes_dawg in enumerate(compiled_dict.prediction_suffixes_dawgs):
         prediction_suffixes_dawg.save(_f('prediction-suffixes-%s.dawg' % prefix_id))
 
+    for grammeme, dawgs in compiled_dict.extra_prediction_dawgs.items():
+        for prefix_id, prediction_suffixes_dawg in enumerate(dawgs):
+            # NB: load_dict() rebuilds this file name from the meta keys,
+            # which keep the grammeme's original case.
+            fname = '%s-prediction-suffixes-%s.dawg' % (grammeme, prefix_id)
+            prediction_suffixes_dawg.save(_f(fname))
+
 
     dawg.DAWG(PREDICTION_PREFIXES).save(_f('prediction-prefixes.dawg'))
     json_write(_f('paradigm-prefixes.json'), PARADIGM_PREFIXES)
     for prediction_suffixes_dawg in compiled_dict.prediction_suffixes_dawgs:
         prediction_suffixes_dawg_lenghts.append(_dawg_len(prediction_suffixes_dawg))
 
+    logger.debug('  extra_prediction_dawg_lengths')
+    extra_dawg_lengths = dict(
+        (grammeme, _dawg_len(d[0]))
+        for grammeme, d in compiled_dict.extra_prediction_dawgs.items()
+    )
+
     meta = [
         ['format_version', CURRENT_FORMAT_VERSION],
         ['pymorphy2_version', pymorphy2.__version__],
         ['prediction_suffixes_dawg_lengths', prediction_suffixes_dawg_lenghts],
         ['prediction_prefixes_dawg_length', len(PREDICTION_PREFIXES)],
         ['paradigm_prefixes_length', len(PARADIGM_PREFIXES)],
+        ['extra_prediction_dawg_lengths', extra_dawg_lengths],
     ]
 
     json_write(_f('meta.json'), meta, indent=4)
 
     Tag = tagset.registry[gramtab_format]
 
-    # FIXME: clone the class
+    # FIXME: clone the class?
     # Tag = type(Tag.__name__, (Tag,), {
     #     'KNOWN_GRAMMEMES': Tag.KNOWN_GRAMMEMES.copy(),
     # })
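
For reference, a sketch of what meta.json might contain after compilation
(all lengths are illustrative, not taken from a real run):

    [
        ["format_version", "2.2"],
        ["pymorphy2_version", "..."],
        ["prediction_suffixes_dawg_lengths", [3, 3, 3]],
        ["prediction_prefixes_dawg_length", 52],
        ["paradigm_prefixes_length", 3],
        ["extra_prediction_dawg_lengths",
         {"Name": 3, "Surn": 3, "Patr": 3, "Geox": 3, "Orgn": 3}]
    ]

load_dict() discovers the per-grammeme DAWG files from the keys of
extra_prediction_dawg_lengths, so the file names written here must match the
'%s-prediction-suffixes-%s.dawg' pattern it builds from those keys.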

pymorphy2/opencorpora_dict/wrapper.py

         self.words = self._data.words
         self.prediction_prefixes = self._data.prediction_prefixes
         self.prediction_suffixes_dawgs = self._data.prediction_suffixes_dawgs
+        self.extra_prediction_dawgs = self._data.extra_prediction_dawgs
         self.meta = self._data.meta
         self.Tag = self._data.Tag
 

pymorphy2/units/__init__.py

 
 from .by_lookup import DictionaryAnalyzer
 
-from .by_analogy import (KnownPrefixAnalyzer, KnownSuffixAnalyzer,
-                         UnknownPrefixAnalyzer)
+from .by_analogy import (
+    KnownPrefixAnalyzer, KnownSuffixAnalyzer, UnknownPrefixAnalyzer,
+    NameAnalyzer, SurnameAnalyzer, PatronymicAnalyzer,
+    OrganizationAnalyzer, GeoAnalyzer)
 
 from .by_hyphen import (HyphenatedWordsAnalyzer, HyphenAdverbAnalyzer,
                         HyphenSeparatedParticleAnalyzer)
 
 from .by_shape import (LatinAnalyzer, PunctuationAnalyzer, NumberAnalyzer,
-                       RomanNumberAnalyzer)
+                       RomanNumberAnalyzer)

pymorphy2/units/by_analogy.py

         # or maybe use a proper discounting?
         total_counts = [1] * len(self._paradigm_prefixes)
 
-        for prefix_id, prefix, suffixes_dawg in self._possible_prefixes(word_lower):
+        for prefix_id, prefix, suffixes_dawg in self._possible_prefixes(word, word_lower):
 
             for i in self._prediction_splits:
 
                         tag = self.dict.build_tag_info(para_id, idx)
 
                         # skip non-productive tags
+                        # XXX: move this check to dictionary compilation step?
                         if not tag.is_productive():
                             continue
                         total_counts[prefix_id] += cnt
         # ``self.parse(...)``.
 
         result = []
-        for prefix_id, prefix, suffixes_dawg in self._possible_prefixes(word_lower):
+        for prefix_id, prefix, suffixes_dawg in self._possible_prefixes(word, word_lower):
 
             for i in self._prediction_splits:
 
 
                 for fixed_suffix, parses in para_data:
                     for cnt, para_id, idx in parses:
-
                         tag = self.dict.build_tag_info(para_id, idx)
 
+                        # XXX: move this check to dictionary compilation step?
                         if not tag.is_productive():
                             continue
 
         result.sort(reverse=True)
         return [tag for cnt, tag in result]
 
-    def _possible_prefixes(self, word):
+    def _possible_prefixes(self, word, word_lower):
         for prefix_id, prefix in self._paradigm_prefixes:
-            if not word.startswith(prefix):
+            if not word_lower.startswith(prefix):
                 continue
 
             suffixes_dawg = self.dict.prediction_suffixes_dawgs[prefix_id]
             yield prefix_id, prefix, suffixes_dawg
+
+
+class _SpecialKnownSuffixAnalyzer(KnownSuffixAnalyzer):
+    """
+    Parse the word by checking how words with similar suffixes
+    are parsed, assuming the word has a predefined GRAMMEME
+    (which must be set in subclasses).
+
+    This class makes it possible to create specialized predictors
+    for individual grammemes (e.g. for surnames or geographical names).
+
+    Prediction data for the grammeme must be available in the dictionary.
+    """
+    GRAMMEME = None
+    ESTIMATE_DECAY = 0.6
+
+    def _possible_prefixes(self, word, word_lower):
+        if not word or not word[0].isupper():
+            # only run for words that start with an uppercase letter
+            return []
+
+        suffixes_dawg = self.dict.extra_prediction_dawgs[self.GRAMMEME][0]
+        return [(0, '', suffixes_dawg)]
+
+
+class NameAnalyzer(_SpecialKnownSuffixAnalyzer):
+    GRAMMEME = 'Name'
+
+class SurnameAnalyzer(_SpecialKnownSuffixAnalyzer):
+    GRAMMEME = 'Surn'
+
+class PatronymicAnalyzer(_SpecialKnownSuffixAnalyzer):
+    GRAMMEME = 'Patr'
+
+class GeoAnalyzer(_SpecialKnownSuffixAnalyzer):
+    GRAMMEME = 'Geox'
+
+class OrganizationAnalyzer(_SpecialKnownSuffixAnalyzer):
+    GRAMMEME = 'Orgn'
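
Adding a predictor for another grammeme is then a two-line subclass, provided
that prediction data for it has been compiled into the dictionary (i.e. the
grammeme is listed in EXTRA_GRAMMEMES_FOR_PREDICTION in compile.py) and that
the new unit is registered in the analyzer's unit list. A hypothetical
example for the OpenCorpora 'Trad' (trademark) grammeme:

    class TrademarkAnalyzer(_SpecialKnownSuffixAnalyzer):
        GRAMMEME = 'Trad'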

pymorphy2/utils.py

 import itertools
 import codecs
 import json
+import heapq
 
 try:
     from urllib.request import urlopen
         return json.load(f, **json_options)
 
 
-def largest_group(iterable, key):
+def largest_elements(iterable, key, n=1):
     """
-    Find a group of largest elements (according to ``key``).
+    Return a list of elements whose key values are among the ``n``
+    largest distinct values of ``key``; ties are all kept.
 
     >>> s = [-4, 3, 5, 7, 4, -7]
-    >>> largest_group(s, abs)
+    >>> largest_elements(s, abs, 1)
     [7, -7]
+    >>> largest_elements(s, abs, 2)
+    [5, 7, -7]
+    >>> largest_elements(s, abs, 3)
+    [-4, 5, 7, 4, -7]
 
     """
     it1, it2 = itertools.tee(iterable)
-    max_key = max(map(key, it1))
-    return [el for el in it2 if key(el) == max_key]
+    top_keys = set(heapq.nlargest(n, set(map(key, it1))))
+    return [el for el in it2 if key(el) in top_keys]
 
 
 def word_splits(word, min_reminder=3, max_prefix_length=5):
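
Note that largest_elements keeps every element whose key ties with one of the
``n`` largest distinct key values, so it may return more than ``n`` items; in
compile.py this means max_parses_per_grammeme=4 bounds the number of distinct
frequency values, not the number of parses. A small sketch with made-up
(parse, count) pairs:

    import operator

    pairs = [(('para1', 0), 10), (('para2', 1), 10), (('para3', 0), 3)]
    # Both count-10 parses survive n=1 because ties are never dropped.
    print(largest_elements(pairs, operator.itemgetter(1), 1))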

tests/test_opencorpora_dict.py

 from __future__ import absolute_import, unicode_literals
 
 import os
+import pytest
+
 import pymorphy2
-from pymorphy2.opencorpora_dict.compile import (_to_paradigm,
-                                                convert_to_pymorphy2)
+from pymorphy2.opencorpora_dict.compile import (_to_paradigm, convert_to_pymorphy2)
 from pymorphy2.opencorpora_dict.parse import parse_opencorpora_xml
 from pymorphy2.dawg import assert_can_create
 from pymorphy2.test_suite_generator import make_test_suite
 
-import pytest
-
 
 class TestToyDictionary:
 
         # use it
         morph = pymorphy2.MorphAnalyzer(out_path)
         assert morph.tag('ёжиться') == [morph.TagClass('INFN,impf,intr')]
+        assert morph.tag('корёжиться') == [morph.TagClass('INFN,impf,intr')]
+        assert morph.tag('коржиться') == [morph.TagClass('INFN,impf,intr')]
 
     def test_test_suite_generator(self, tmpdir):
         # just make sure it doesn't raise an exception