Mikhail Korobov committed 8975a92

пропускаем fuzzy-тесты, если версия тестовых данных не соответствует версии словаря

Comments (0)

Files changed (2)


 import itertools
 import copy
 import re
+import codecs
 from pymorphy2.opencorpora_dict import _load_json_or_xml_dict
 logger = logging.getLogger(__name__)
-def _get_word_parses(filename):
+def _get_word_parses(lemmas):
     word_parses = collections.defaultdict(list) # word -> possible tags
-    lemmas, links, version, revision = _load_json_or_xml_dict(filename)
-    logger.debug("%10s %20s", "lemma #", "result size")
     for index, lemma in enumerate(lemmas):
         for word, tag in lemma:
-        if not index % 10000:
-            logger.debug('%10s %20s', index, len(word_parses))
     return word_parses
     return result
-def _save_test_suite(path, suite):
-    with open(path, 'wb') as f:
+def _save_test_suite(path, suite, revision):
+    with codecs.open(path, 'w', 'utf8') as f:
+        f.write("%s\n" % revision)
         for word, parses in suite:
             txt = "|".join([word]+parses) +'\n'
-            f.write(txt.encode('utf8'))
+            f.write(txt)
 def make_test_suite(opencorpora_dict_path, out_path, word_limit=100):
     ``word_limit`` words for each distinct gram. tag) and saves it to a file.
     logger.debug('loading dictionary to memory...')
-    parses = _get_word_parses(opencorpora_dict_path)
+    lemmas, links, version, revision = _load_json_or_xml_dict(opencorpora_dict_path)
+    logger.debug('preparing...')
+    parses = _get_word_parses(lemmas)
     logger.debug('dictionary size: %d', len(parses))
     logger.debug('handling umlauts...')
     parses = _add_ee_parses(parses)
     logger.debug('dictionary size: %d', len(parses))
     logger.debug('test suite size: %d', len(suite))
-    _save_test_suite(out_path, suite)
+    _save_test_suite(out_path, suite, revision)


     loads test suite
     with codecs.open(path, 'r', 'utf8') as f:
-        for line in f:
-            parts = line.strip('\n').split('|')
+        for index, line in enumerate(f):
+            line = line.strip("\n")
+            if index == 0: # revision
+                yield line
+                continue
+            # test data
+            parts = line.split('|')
             word, tags = parts[0], [Tag(tag) for tag in parts[1:]]
             yield word, tags
 def load_suite(path):
-    return list(iter_suite(path))
+    suite = list(iter_suite(path))
+    return suite[0], suite[1:]
-suite70k = load_suite(SUITE_PATH)
+suite_revision, suite70k = load_suite(SUITE_PATH)
 def test_tagger_fuzzy():
+    dict_revision = morph.meta()['source_revision']
+    if suite_revision != dict_revision:
+        msg = """
+        Test suite revision (%s) doesn't match dictionary revision (%s).
+        Regenerate test suite with the following command:
+            pymorphy dict make_test_suite dict.xml dev_data/suite.txt -v
+        """  % (suite_revision, dict_revision)
+        pytest.skip(msg)
     for word, tags in suite70k:
         parse_result = set(morph.tag(word))
         assert parse_result == set(tags)
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.