Commits

Mikhail Korobov committed daabf00

small code improvements (mostly PEP 8 and comments)

Comments (0)

Files changed (8)

 logger = logging.getLogger('pymorphy2')
 logger.addHandler(logging.StreamHandler())
 
+XML_BZ2_URL = "http://opencorpora.org/files/export/dict/dict.opcorpora.xml.bz2"
+
 # ============================ Commands ===========================
 
 def compile_dict(in_filename, out_folder=None, overwrite=False, prediction_options=None):
     """
-    Makes a Pymorphy2 dictionary from OpenCorpora .xml dictionary.
+    Make a Pymorphy2 dictionary from OpenCorpora .xml dictionary.
     """
     if out_folder is None:
         out_folder = 'dict'
 
 def xml_to_json(in_filename, out_filename):
     """
-    Parses XML and caches result to json.
+    Parse XML and cache the result to JSON.
     """
     opencorpora_dict.xml_dict_to_json(in_filename, out_filename)
 
 
 def show_dict_mem_usage(dict_path, verbose=False):
     """
-    Shows dictionary memory usage.
+    Show dictionary memory usage.
     """
     initial_mem = get_mem_usage()
     initial_time = time.time()
             from guppy import hpy; hp=hpy()
             logger.debug(hp.heap())
         except ImportError:
-            logger.warning('guppy is not installed, detailed info is not available')
+            logger.warning('guppy is not installed, '
+                           'detailed info is not available')
+
 
 def show_dict_meta(dict_path):
     dct = opencorpora_dict.load(dict_path)
 
 def make_test_suite(dict_filename, out_filename, word_limit=100):
     """
-    Makes a test suite from (unparsed) OpenCorpora dictionary.
+    Make a test suite from (unparsed) OpenCorpora dictionary.
     """
     return test_suite_generator.make_test_suite(
         dict_filename, out_filename, word_limit=int(word_limit))
 
 
-XML_BZ2_URL = "http://opencorpora.org/files/export/dict/dict.opcorpora.xml.bz2"
-
 def download_xml(out_filename, verbose):
     """
-    Downloads an updated XML from OpenCorpora
+    Download an updated XML from OpenCorpora.
     """
     def on_chunk():
         if verbose:
 
     logger.info('\nDone.')
 
+
 def _parse(dict_path, in_filename, out_filename):
     from pymorphy2 import tagger
     morph = pymorphy2.tagger.Morph.load(dict_path)
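
For orientation, the commands in this management script chain together roughly like this (a hypothetical session; the file names and flags are illustrative, not part of this commit):

    download_xml('dict.opcorpora.xml', verbose=True)    # fetches XML_BZ2_URL
    compile_dict('dict.opcorpora.xml', out_folder='dict')
    show_dict_mem_usage('dict', verbose=True)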

pymorphy2/constants.py

     "УЛЬТРА",
     "ЭКСТРА"
 ]
-
-NON_PRODUCTIVE_CLASSES = {
-    'opencorpora-int': set(['NUMR', 'NPRO', 'PRED', 'PREP', 'CONJ', 'PRCL', 'INTJ'])
-}

pymorphy2/dawg.py

 
 def assert_can_create():
     if not CAN_CREATE:
-        raise NotImplementedError("Creating of DAWGs with DAWG-Python is not supported; install 'dawg' package. ")
+        msg = ("Creating of DAWGs with DAWG-Python is "
+               "not supported; install 'dawg' package.")
+        raise NotImplementedError(msg)
 
 
 class WordsDawg(RecordDAWG):
     """
     DAWG for storing prediction data.
     """
+
     # We are storing 3 unsigned short ints as values:
     # count, the paradigm ID and the form index (inside paradigm).
     # Byte order is big-endian (this makes word forms properly sorted).
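
In struct terms, the value layout described by this comment corresponds to something like the following (a sketch; the actual format constant is outside this hunk):

    import struct

    # big-endian (">"), three unsigned shorts: count, paradigm ID, form index
    packed = struct.pack(str(">3H"), 12, 345, 6)
    assert struct.unpack(str(">3H"), packed) == (12, 345, 6)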

pymorphy2/opencorpora_dict.py

 
 def _parse_opencorpora_xml(filename):
     """
-    Parses OpenCorpora dict XML and returns a tuple
+    Parse OpenCorpora dict XML and return a tuple
 
         (lemmas_list, links, grammemes, version, revision)
 
         while elem.getprevious() is not None:
             del elem.getparent()[0]
 
-
     for ev, elem in etree.iterparse(filename):
 
         if elem.tag == 'grammeme':
 
     return lemmas, links, grammemes, version, revision
 
+
 def _lemma_forms_from_xml_elem(elem):
     """
-    Returns a list of (word, tags) pairs given an XML element with lemma.
+    Return a list of (word, tags) pairs given an XML element with a lemma.
     """
     def _tags(elem):
         return ",".join(g.get('v') for g in elem.findall('g'))
     lemma = []
     lemma_id = elem.get('id')
 
-    if len(elem) == 0: # deleted lemma
+    if len(elem) == 0:  # deleted lemma
         return lemma_id, lemma
 
     base_info = elem.findall('l')
 
     return lemma_id, lemma
 
+
 def _longest_common_substring(data):
     """
-    Returns a longest common substring of a list of strings.
+    Return the longest common substring of a list of strings.
     See http://stackoverflow.com/questions/2892931/
     """
     substr = ''
                     substr = data[0][i:i+j]
     return substr
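
For illustration, with hypothetical inputs the helper picks the longest substring shared by all the strings:

    # three word forms sharing the stem 'стал'
    assert _longest_common_substring(['стали', 'стала', 'стало']) == 'стал'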
 
+
 def _to_paradigm(lemma):
     """
-    Extracts (stem, paradigm) pair from lemma list.
-    Paradigm is a list of suffixes with associated gram. tags and prefixes.
+    Extract (stem, paradigm) pair from lemma list.
+    A paradigm is a list of suffixes with associated tags and prefixes.
     """
     forms, tags = list(zip(*lemma))
     prefixes = [''] * len(tags)
 
 
 def xml_dict_to_json(xml_filename, json_filename):
+    """
+    Convert XML dictionary to JSON for faster loading.
+    It may be useful while developing dictionary preparation routines.
+    """
     logger.info('parsing xml...')
     parsed_dct = _parse_opencorpora_xml(xml_filename)
 
     with codecs.open(json_filename, 'w', 'utf8') as f:
         json.dump(parsed_dct, f, ensure_ascii=False)
 
+
 def _load_json_dict(filename):
     with codecs.open(filename, 'r', 'utf8') as f:
         return json.load(f)
 
 
 def _linearized_paradigm(paradigm):
+    """
+    Convert ``paradigm`` (a list of tuples with numbers)
+    to 1-dimensional array.array (for reduced memory usage).
+    """
     return array.array(str("H"), list(itertools.chain(*zip(*paradigm))))
 
+
 def _load_json_or_xml_dict(filename):
+    """
+    Load (parse) raw OpenCorpora dictionary either from XML or from JSON
+    (depending on file extension).
+    """
     if filename.endswith(".json"):
         logger.info('loading json...')
         return _load_json_dict(filename)
         logger.info('parsing xml...')
         return _parse_opencorpora_xml(filename)
 
-def _suffixes_prediction_data(words, popularity, gramtab, paradigms,
-                              min_ending_freq=2, min_paradigm_popularity=3,
-                              max_forms_per_class=1):
-
-    # XXX: this uses approach different from pymorphy 0.5.6;
-    # what are the implications on prediction quality?
-
-    productive_paradigms = set(
-        para_id
-        for (para_id, count) in popularity.items()
-        if count >= min_paradigm_popularity
-    )
-
-    ending_counts = collections.Counter()
-
-    endings = collections.defaultdict(lambda: collections.defaultdict(collections.Counter))
-
-    for word, (para_id, idx) in words:
-
-        if para_id not in productive_paradigms:
-            continue
-
-        paradigm = paradigms[para_id]
-        tag = gramtab[paradigm[len(paradigm) // 3 + idx]]
-        cls = tuple(tag.replace(' ', ',', 1).split(','))[0]
-
-        for i in 1,2,3,4,5:
-            word_end = word[-i:]
-            ending_counts[word_end] += 1
-            endings[word_end][cls][(para_id, idx)] += 1
-
-    counted_suffixes_dawg_data = []
-    for suff in endings:
-
-        if ending_counts[suff] < min_ending_freq:
-            continue
-
-        for cls in endings[suff]:
-            for form, cnt in endings[suff][cls].most_common(max_forms_per_class):
-                counted_suffixes_dawg_data.append(
-                    (suff, (cnt,)+ form)
-                )
-
-    return counted_suffixes_dawg_data
-
-
 
 def _gram_structures(lemmas, links, prediction_options=None):
     """
-    Returns compacted dictionary data.
+    Return compacted dictionary data.
     """
     prediction_options = prediction_options or {}
     gramtab = []
     logger.debug('building prediction_suffixes DAWG..')
     prediction_suffixes_dawg = dawg.PredictionSuffixesDAWG(suffixes_dawg_data)
 
-    return tuple(gramtab), suffixes, paradigms, words_dawg, prediction_suffixes_dawg
+    return (tuple(gramtab), suffixes, paradigms,
+            words_dawg, prediction_suffixes_dawg)
+
+
+def _suffixes_prediction_data(words, popularity, gramtab, paradigms,
+                              min_ending_freq=2, min_paradigm_popularity=3,
+                              max_forms_per_class=1):
+
+    # XXX: this uses approach different from pymorphy 0.5.6;
+    # what are the implications on prediction quality?
+
+    productive_paradigms = set(
+        para_id
+        for (para_id, count) in popularity.items()
+        if count >= min_paradigm_popularity
+    )
+
+    ending_counts = collections.Counter()
+
+    endings = collections.defaultdict(
+        lambda: collections.defaultdict(collections.Counter))
+
+    for word, (para_id, idx) in words:
+
+        if para_id not in productive_paradigms:
+            continue
+
+        paradigm = paradigms[para_id]
+        tag = gramtab[paradigm[len(paradigm) // 3 + idx]]
+        cls = tuple(tag.replace(' ', ',', 1).split(','))[0]
+
+        for i in (1, 2, 3, 4, 5):
+            word_end = word[-i:]
+            ending_counts[word_end] += 1
+            endings[word_end][cls][(para_id, idx)] += 1
+
+    counted_suffixes_dawg_data = []
+    for suff in endings:
+
+        if ending_counts[suff] < min_ending_freq:
+            continue
+
+        for cls in endings[suff]:
+            for form, cnt in endings[suff][cls].most_common(max_forms_per_class):
+                counted_suffixes_dawg_data.append(
+                    (suff, (cnt,) + form)
+                )
+
+    return counted_suffixes_dawg_data
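
The nested ``endings`` structure above maps ending -> tag class -> (para_id, idx) -> count. A minimal standalone sketch of the same idiom (the values are made up):

    import collections

    endings = collections.defaultdict(
        lambda: collections.defaultdict(collections.Counter))
    endings['ами']['NOUN'][(1234, 7)] += 2
    assert endings['ами']['NOUN'].most_common(1) == [((1234, 7), 2)]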
 
 
 def to_pymorphy2_format(opencorpora_dict_path, out_path, overwrite=False, prediction_options=None):
         json.dump(meta, f, ensure_ascii=False, indent=4)
 
 
+# ------ Routines for loading compiled dictionaries --------
 
-DictTuple = collections.namedtuple('DictTuple', 'meta gramtab suffixes paradigms words prediction_prefixes prediction_suffixes Tag')
+DictTuple = collections.namedtuple(
+    'DictTuple',
+    'meta gramtab suffixes paradigms words prediction_prefixes prediction_suffixes Tag'
+)
+
 
 def load(path, gramtab_format='opencorpora-int'):
     """
-    Loads Pymorphy2 dictionary.
+    Load Pymorphy2 dictionary.
     ``path`` is a folder name where dictionary data reside.
     """
     #meta, gramtab, suffixes, paradigms, words = [None]*5
 
     _f = lambda p: os.path.join(path, p)
 
-    with open(_f('meta.json'), 'r') as f:
+    meta = _load_meta(_f('meta.json'))
+    _assert_format_is_compatible(meta)
+
+    Tag = _load_tag_class(gramtab_format, _f('grammemes.json'))
+    gramtab = [Tag(tag_str) for tag_str in _load_gramtab(meta, gramtab_format, path)]
+
+    suffixes = _load_suffixes(_f('suffixes.json'))
+    paradigms = _load_paradigms(_f('paradigms.array'))
+    words = dawg.WordsDawg().load(_f('words.dawg'))
+
+    prediction_suffixes = dawg.PredictionSuffixesDAWG().load(_f('prediction-suffixes.dawg'))
+    prediction_prefixes = dawg.DAWG().load(_f('prediction-prefixes.dawg'))
+
+    return DictTuple(meta, gramtab, suffixes, paradigms, words,
+                     prediction_prefixes, prediction_suffixes, Tag)
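
A hedged usage sketch of the refactored loader (the path is illustrative):

    d = load('dict')
    print(d.meta['format_version'])    # 1 is the only version load() accepts
    print(len(d.gramtab), 'tags')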
+
+
+def _load_meta(filename):
+    """ Load metadata. """
+    with open(filename, 'r') as f:
         meta = json.load(f)
         if hasattr(collections, 'OrderedDict'):
-            meta = collections.OrderedDict(meta)
-        else:
-            meta = dict(meta)
+            return collections.OrderedDict(meta)
+        return dict(meta)
 
-    format_version = meta.get('format_version', None)
-    if format_version != 1:
-        raise ValueError("This dictionary format ('%s') is not supported." % format_version)
 
+def _load_tag_class(gramtab_format, grammemes_filename):
+    """ Load and initialize Tag class (according to ``gramtab_format``). """
     if gramtab_format not in tagset.registry:
         raise ValueError("This gramtab format ('%s') is unsupported." % gramtab_format)
+
     Tag = tagset.registry[gramtab_format]
 
-    with open(_f('grammemes.json'), 'r') as f:
+    with open(grammemes_filename, 'r') as f:
         grammemes = json.load(f, encoding='utf8')
         Tag._init_restrictions(grammemes)
 
+    return Tag
+
+
+def _load_gramtab(meta, gramtab_format, path):
+    """ Load gramtab (a list of tags) """
 
     gramtab_formats = meta.get('gramtab_formats', {})
     if gramtab_format not in gramtab_formats:
         raise ValueError("This gramtab format (%s) is unavailable; available formats: %s" % (gramtab_format, gramtab_formats.keys()))
 
-    with open(_f(gramtab_formats[gramtab_format]), 'r') as f:
-        gramtab = [Tag(tag_str) for tag_str in json.load(f, encoding='utf8')]
+    gramtab_filename = os.path.join(path, gramtab_formats[gramtab_format])
+    with open(gramtab_filename, 'r') as f:
+        return json.load(f, encoding='utf8')
 
-    with open(_f('suffixes.json'), 'r') as f:
-        suffixes = json.load(f)
 
+def _load_suffixes(filename):
+    """ Load a list of possible word suffixes """
+    with open(filename, 'r') as f:
+        return json.load(f)
+
+
+def _load_paradigms(filename):
+    """ Load paradigms data """
     paradigms = []
-    with open(_f('paradigms.array'), 'rb') as f:
+    with open(filename, 'rb') as f:
         paradigms_count = struct.unpack(str("<H"), f.read(2))[0]
 
         for x in range(paradigms_count):
             paradigm_len = struct.unpack(str("<H"), f.read(2))[0]
+
             para = array.array(str("H"))
             para.fromfile(f, paradigm_len)
+
             paradigms.append(para)
+    return paradigms
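
The binary layout this loader expects can be sketched as follows (little-endian uint16 throughout; the values are illustrative):

    import struct

    data = struct.pack(str("<H"), 1)             # paradigms_count == 1
    data += struct.pack(str("<H"), 3)            # the paradigm holds 3 numbers
    data += struct.pack(str("<3H"), 10, 1, 0)    # the paradigm data itself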
 
-    words = dawg.WordsDawg().load(_f('words.dawg'))
-    prediction_suffixes = dawg.PredictionSuffixesDAWG().load(_f('prediction-suffixes.dawg'))
-    prediction_prefixes = dawg.DAWG().load(_f('prediction-prefixes.dawg'))
-    return DictTuple(meta, gramtab, suffixes, paradigms, words, prediction_prefixes, prediction_suffixes, Tag)
+
+def _assert_format_is_compatible(meta):
+    """ Raise an exception if dictionary format is not compatible """
+    format_version = meta.get('format_version', None)
+    if format_version != 1:
+        raise ValueError("This dictionary format ('%s') is not supported." % format_version)
+

pymorphy2/os_utils.py

 
 CHUNK_SIZE = 256*1024
 
-def download_bz2(url, out_fp, chunk_size=CHUNK_SIZE, on_chunk=lambda:None):
+
+def download_bz2(url, out_fp, chunk_size=CHUNK_SIZE, on_chunk=lambda: None):
+    """
+    Download a bz2-encoded file from ``url`` and write it to ``out_fp``.
+    """
     decompressor = bz2.BZ2Decompressor()
     fp = urlopen(url, timeout=30)
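
A hypothetical call, reusing XML_BZ2_URL from the management script above (the ``sys`` import and the output file name are illustrative):

    import sys

    with open('dict.opcorpora.xml', 'wb') as out:
        download_bz2(XML_BZ2_URL, out, on_chunk=lambda: sys.stdout.write('.'))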
 

pymorphy2/tagger.py

 from __future__ import print_function, unicode_literals, division
 import os
 import heapq
-import collections
 from pymorphy2 import opencorpora_dict
-from pymorphy2.constants import LEMMA_PREFIXES, NON_PRODUCTIVE_CLASSES
+from pymorphy2.constants import LEMMA_PREFIXES
 
-#ParseResult = collections.namedtuple('ParseResult', 'fixed_word tag normal_form para_id form_idx estimate')
+#ParseResult = collections.namedtuple('ParseResult',
+#           'fixed_word tag normal_form para_id form_idx estimate')
 
 class Morph(object):
 
     @classmethod
     def load(cls, path=None):
         """
-        Creates a Morph object using dictionaries at ``path``.
+        Create a Morph object using dictionaries at ``path``.
 
         If ``path`` is None then the path is obtained from
         ``PYMORPHY2_DICT_PATH`` environment variable.
         """
         if path is None:
             if cls.env_variable not in os.environ:
-                raise ValueError("Please pass a path to dictionaries or set %s environment variable" % cls.env_variable)
+                raise ValueError("Please pass a path to dictionaries or set "
+                                 "%s environment variable" % cls.env_variable)
             path = os.environ[cls.env_variable]
 
         dct = opencorpora_dict.load(path)
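
A hedged usage sketch combining ``load`` with ``parse`` (documented just below); the dictionary path and the word are illustrative:

    morph = Morph.load('dict')
    for fixed_word, tag, normal_form, _, _, _ in morph.parse('стали'):
        print(fixed_word, tag, normal_form)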
 
     def parse(self, word):
         """
-        Returns a list of (fixed_word, tag, normal_form, _para_id, _idx, _estimate)
+        Return a list of
+
+            (fixed_word, tag, normal_form, _para_id, _idx, _estimate)
+
         tuples.
         """
         res = self._parse_as_known(word)
 
     def _parse_as_known(self, word):
         """
-        Parses the word using a dictionary.
+        Parse the word using a dictionary.
         """
         res = []
         para_normal_forms = {}
         para_data = self._dictionary.words.similar_items(word, self._ee)
 
-        for fixed_word, parses in para_data: # fixed_word is a word with proper Ё letters
+        for fixed_word, parses in para_data:
+            # `fixed_word` is a word with proper Ё letters
             for para_id, idx in parses:
 
                 if para_id not in para_normal_forms:
 
     def _parse_as_word_with_known_prefix(self, word):
         """
-        Parses the word by checking if it starts with a known prefix
+        Parse the word by checking if it starts with a known prefix
        and parsing the remainder.
         """
         res = []
 
     def _parse_as_word_with_unknown_prefix(self, word, _seen_parses=None):
         """
-        Parses the word by parsing only the word suffix
+        Parse the word by parsing only the word suffix
         (with restrictions on prefix & suffix lengths).
         """
         if _seen_parses is None:
 
     def _parse_as_word_with_known_suffix(self, word, _seen_parses=None):
         """
-        Parses the word by checking how the words with similar suffixes
+        Parse the word by checking how the words with similar suffixes
         are parsed.
         """
         if _seen_parses is None:
 
     def normal_forms(self, word):
         """
-        Returns a list of word normal forms.
+        Return a list of word normal forms.
         """
         seen = set()
         result = []
 
     def inflect(self, word, required_grammemes):
         """
-        Returns a list of parsed words that are closest to ``word`` and
+        Return a list of parsed words that are closest to ``word`` and
         have all ``required_grammemes``.
         """
         required_grammemes = set(required_grammemes)
 
     def decline(self, word):
         """
-        Returns parses for all possible word forms.
+        Return parses for all possible word forms.
         """
         return self._decline(self.parse(word))
 
     def _decline(self, word_parses):
         """
-        Returns parses for all possible word forms (given a list of
+        Return parses for all possible word forms (given a list of
         possible word parses).
         """
-
         paradigms = self._dictionary.paradigms
         seen_paradigms = set()
         result = []
 
     def _build_tag_info(self, para_id, idx):
         """
-        Returns gram. tag as a string.
+        Return gram. tag as a string.
         """
         paradigm = self._dictionary.paradigms[para_id]
         tag_info_offset = len(paradigm) // 3
 
     def _build_paradigm_info(self, para_id):
         """
-        Returns a list of
+        Return a list of
 
             (prefix, tag, suffix)
 
 
     def _build_normal_form(self, para_id, idx, fixed_word):
         """
-        Builds a normal form.
+        Build a normal form.
         """
 
         if idx == 0: # a shortcut: normal form is a word itself
 
     def _build_stem(self, paradigm, idx, fixed_word):
         """
-        Returns word stem (given a word, paradigm and the word index).
+        Return word stem (given a word, paradigm and the word index).
         """
         paradigm_len = len(paradigm) // 3
 
             return fixed_word[len(prefix):]
 
 
-
     # ====== misc =========
 
     def meta(self):
 
 def _split_word(word, min_reminder=3, max_prefix_length=5):
     """
-    Returns all splits of a word (taking in account min_reminder and
+    Return all splits of a word (taking into account min_reminder and
     max_prefix_length).
     """
     max_split = min(max_prefix_length, len(word)-min_reminder)
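
Given the ``max_split`` bound above, the splits presumably enumerate prefix/remainder pairs along these lines (a sketch of the assumed behavior, since the rest of the body is not shown here):

    word = 'псевдокот'
    max_split = min(5, len(word) - 3)    # max_prefix_length=5, min_reminder=3
    splits = [(word[:i], word[i:]) for i in range(1, max_split + 1)]
    # -> [('п', 'севдокот'), ('пс', 'евдокот'), ..., ('псевд', 'окот')]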

pymorphy2/tagset.py

 # Design notes: Tag objects should be immutable.
 class InternalOpencorporaTag(object):
 
-    __slots__ = ['_grammemes_tuple', '_lemma_grammemes', '_grammemes_cache', '_str']
+    __slots__ = ['_grammemes_tuple', '_lemma_grammemes',
+                 '_grammemes_cache', '_str']
 
     FORMAT = 'opencorpora-int'
-    NON_PRODUCTIVE_CLASSES = set(['NUMR', 'NPRO', 'PRED', 'PREP', 'CONJ', 'PRCL', 'INTJ'])
+    NON_PRODUCTIVE_CLASSES = set(['NUMR', 'NPRO', 'PRED', 'PREP',
+                                  'CONJ', 'PRCL', 'INTJ'])
 
     # XXX: is it a good idea to have these rules?
     EXTRA_INCOMPATIBLE = {
 
     @classmethod
     def _from_internal_tag(cls, tag):
-        """ Returns tag string given internal tag string """
+        """ Return tag string given internal tag string """
         return tag
 
     @classmethod
     @classmethod
     def _init_restrictions(cls, dict_grammemes):
         """
-        Fills ``OpencorporaTag.GRAMMEME_INDICES`` and
+        Fill ``OpencorporaTag.GRAMMEME_INDICES`` and
         ``OpencorporaTag.GRAMMEME_INCOMPATIBLE`` class attributes.
         """
 
     @classmethod
     def _init_restrictions(cls, dict_grammemes):
         """
-        Fills ``OpencorporaTag.GRAMMEME_INDICES`` and
+        Fill ``OpencorporaTag.GRAMMEME_INDICES`` and
         ``OpencorporaTag.GRAMMEME_INCOMPATIBLE`` class attributes.
         """
         cls._init_alias_map(dict_grammemes)
             cls.GRAMMEME_ALIAS_MAP[name] = alias
 
 
-
 registry = dict()
 
 for tag_type in [OpencorporaTag, InternalOpencorporaTag]:

pymorphy2/test_suite_generator.py

 
 def _get_test_suite(word_parses, word_limit=100):
     """
-    Limits word_parses to ``word_limit`` words per tag.
+    Limit word_parses to ``word_limit`` words per tag.
     """
     gramtab = collections.Counter() # tagset -> number of stored items
     result = list()
 
 def make_test_suite(opencorpora_dict_path, out_path, word_limit=100):
     """
-    Extracts test data from OpenCorpora .xml dictionary (at least
-    ``word_limit`` words for each distinct gram. tag) and saves it to a file.
+    Extract test data from OpenCorpora .xml dictionary (at least
+    ``word_limit`` words for each distinct gram. tag) and save it to a file.
     """
     logger.debug('loading dictionary to memory...')
     lemmas, links, grammemes, version, revision = _load_json_or_xml_dict(opencorpora_dict_path)
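
A minimal sketch of the per-tag limiting idea behind ``_get_test_suite`` (the ``keep`` helper is hypothetical):

    import collections

    gramtab = collections.Counter()    # tagset -> number of stored items

    def keep(tagset, word_limit=100):
        if gramtab[tagset] >= word_limit:
            return False
        gramtab[tagset] += 1
        return True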