Commits

Mikhail Korobov committed 284eaeb

conversion engine; aot -> positional conversion barely works; some tests; py3k

  • Participants
  • Parent commits 40b0bbb

Comments (0)

Files changed (16)

 ^build
-\.pyc$
+\.pyc$
+^\.tox
+^\.coverage$
+^MANIFEST$
+Copyright (c) Mikhail Korobov <kmike84@gmail.com>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is furnished
+to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
+INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR
+A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
+CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE
+OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+include README.rst
+include tox.ini

File russian_tagsets/__init__.py

+from __future__ import absolute_import
+__version__ = '0.1'
+
+from . import aot, positional

File russian_tagsets/aot.py

+# -*- coding: utf-8 -*-
+"""
+Conversion from aot.ru tags to positional tags.
+AOT tags should be tuple(POS, gram_info), e.g.::
+
+    (u'П', u'мр,ед,им,од,но')
+
+POS->AOT conversion is lossy.
+"""
+
+from __future__ import absolute_import, unicode_literals
+from russian_tagsets import converters
+from russian_tagsets import positional
+
+def to_positional(aot_tag):
+    pos, info = aot_tag
+    info = set(info.split(','))
+    tag = positional.Tag()
+
+    # ==== POS tag ====
+    if pos == 'С':
+        tag.POS = 'NN'
+
+    elif pos == 'П':
+        tag.POS = 'AA'
+        # lossy: possessive adjective (мамин, овечью) should be AU
+
+    elif pos == 'Г':
+        tag.POS = 'VB'
+        if 'пвл' in info:
+            tag.POS = 'Vi'
+
+    elif pos == 'ПРИЧАСТИЕ':
+        # Participle, active or long passive
+        # (читающий, читавший, прочитавший, читаемый;
+        # but not прочитанный (AA), прочитан (Ac)
+        tag.POS = 'AG'
+
+    elif pos == 'КР_ПРИЧАСТИЕ':
+        tag.POS = 'Ac'
+
+    elif pos == 'КР_ПРИЛ':
+        tag.POS = 'AC'
+
+    elif pos == 'ДЕЕПРИЧАСТИЕ':
+        tag.POS = 'Ve'
+
+    elif pos == 'ИНФИНИТИВ':
+        tag.POS = 'Vf'
+
+    elif pos == 'МС':
+        # Personal pronoun (ja, my, ty, vy, on, ona, ono, oni, sebja)
+        tag.POS = 'PP'
+
+    elif pos == 'МС-ПРЕДК':
+        # e.g. нечего
+        tag.POS = 'Db' # lossy
+
+    elif pos == 'МС-П':
+        # e.g. всякий
+
+        # Relative/interrogative pronoun with adjectival
+        # declension (kakoj, kotoryj, cej, ...)
+        tag.POS = 'Pq'
+
+        # fixme/hack: Possessive pronoun (moj, ego, svoj, ..)
+        if 'од' in info:
+            tag.POS = 'PS'
+
+    elif pos == 'Н':
+        # Adverb without a possibility to form negation
+        # and degrees of comparison (vverxu, vnizu, potom)
+        tag.POS = 'Db'
+
+        # fixme: adverb forming negation and degrees of
+        # comparison (vysoko, daleko). Idea: pymorphy.get_graminfo
+        # returns 'КР_ПРИЛ' and 'Н' variants for such adverbs.
+        # tag.POS == 'Dg'
+
+        # fixme: Multiplicative numeral (dvaždy, triždy) are adverbs
+
+    elif pos == 'ПРЕДК':
+        # e.g. интересно
+        tag.POS = 'Dg' # ?
+
+    elif pos == 'ЧИСЛ':
+        # Cardinal numeral (odin, tri, sorok)
+        tag.POS = 'Cn'
+
+        # fixme: Generic/collective numeral (dvoje, četvero)
+        # tag.POS = 'Cj'
+
+    elif pos == 'ЧИСЛ-П':
+        tag.POS = 'Cr'
+
+    elif pos == 'ПРЕДЛ':
+        # Nonvocalized preposition (ob, pered, s, v, ...)
+        tag.POS = 'RR'
+
+        # fixme: Part of a preposition; never appears isolated (nesmotrja)
+        # tag.POS = 'PF'
+
+        # fixme: Vocalized preposition (obo, peredo, so, vo, ...)
+        # tag.POS = 'RV'
+
+    elif pos == 'СОЮЗ':
+        # Non-subordinate conjunction (i, a, xotja, pricem)
+        tag.POS = 'J^'
+        # fixme: Subordinate conjunction (esli, čto, kotoryj)
+        # tag.POS = 'J,'
+
+    elif pos == 'МЕЖД':
+        tag.POS = 'II'
+
+    elif pos == 'ЧАСТ':
+        tag.POS = 'TT'
+
+    elif pos == 'ВВОДН':
+        # e.g. конечно
+        tag.POS = 'Db' # ?
+
+    # ==== gender ====
+
+    if 'мр' in info:
+        tag.gender = 'M'
+    elif 'жр' in info:
+        tag.gender = 'F'
+    elif 'ср' in info:
+        tag.gender = 'N'
+    elif 'мр-жр' in info:
+        tag.gender = 'X' # fixme?
+
+    # ==== animacy ====
+    # FIXME
+    # X should be used: except for nouns, in all
+    # forms except accusative masculine singular and accusative
+    # plural of all genders. Non-declinable words in all forms.
+    if info.issuperset(['од', 'но']):
+        tag.animacy = 'X' # fixme
+    elif 'од' in info:
+        tag.animacy = 'A'
+    elif 'но' in info:
+        tag.animacy = 'I'
+    else:
+        tag.animacy = '-' # fixme?
+
+    # ======== number =======
+    if 'ед' in info:
+        tag.number = 'S'
+    elif 'мн' in info:
+        tag.number = 'P'
+    # FIXME
+    # non-declinable nouns, adjectives and verbs,
+    # 3rd person possessive pronouns:
+    # tag.number = 'X'
+
+    # ======== case ==========
+    CASES = {
+        'им': '1',
+        'рд': '2',
+        'дт': '3',
+        'вн': '4',
+        'тв': '7',
+        'пр': '6',
+    }
+    for case in CASES:
+        if case in info:
+            tag.case = CASES[case]
+        if '2' in info:
+            tag.variant = '1'
+    if 'зв' in CASES:
+        tag.case = '1'
+        tag.variant = '1'
+
+    # TODO: possessor's gender
+    # TODO: possessor's number
+
+    # ======= person ========
+    # fixme: it should be X for non-declinable verbs
+    PERSONS = {'1л': '1', '2л': '2', '3л': '3'}
+    for person in PERSONS:
+        if person in info:
+            tag.person = PERSONS[person]
+            break
+
+    # ======= reflexivity ========
+    if tag.POS in ['AG', 'PP', 'P5', 'PS'] or tag.mainPOS == 'V':
+        tag.reflexivity = 'I'
+        # fixme: lossy! e.g. for "получиться" it should be 'R'.
+
+    # ======= tense =========
+    TENSES = {
+        'нст': 'P',
+        'прш': 'R',
+        'буд': 'F',
+    }
+    for tense in TENSES:
+        if tense in info:
+            tag.tense = TENSES[tense]
+            break
+
+    # TODO: verbal_aspect
+
+    # ========== degree of comparison =========
+    if tag.POS in ['AA', 'Dg']:
+        if 'сравн' in info:
+            tag.degree_of_comparison = '2'
+        elif 'прев' in info:
+            tag.degree_of_comparison = '3'
+        else:
+            tag.degree_of_comparison = '1'
+
+    # ========= negation =========
+    if tag.mainPOS in ['N', 'A'] or tag.POS == 'Dg':
+        tag.negation = 'A'
+        # fixme: lossy! e.g. for ""
+
+
+    return tag
+
+def from_positional(positional_tag):
+    # Stub: positional -> aot conversion is not implemented yet, so this
+    # always returns None.
+    pass
+
+converters.add('positional', 'aot', from_positional)
+converters.add('aot', 'positional', to_positional)

File russian_tagsets/convert_engine.py

+# -*- coding: utf-8 -*-
+from __future__ import absolute_import
+
+class NoConvertPath(Exception):
+    """Raised by ``Registry.path`` when no chain of conversions exists."""
+    pass
+
+def bigrams(iterable):
+    _first = object()
+    prev = _first
+    for current in iterable:
+        if prev is not _first:
+            yield prev, current
+        prev = current
+
+def find_shortest_path(graph, start, end, cost=lambda path: len(path), path=[]):
+    # adopted from http://www.python.org/doc/essays/graphs/
+    path = path + [start]
+    if start == end:
+        return path
+    if start not in graph:
+        return None
+    shortest = None
+    for node in graph[start]:
+        if node not in path:
+            newpath = find_shortest_path(graph, node, end, cost, path)
+            if newpath:
+                if not shortest or cost(newpath) < cost(shortest):
+                    shortest = newpath
+    return shortest
+
+class Registry(object):
+
+    def __init__(self):
+        # Directed graph of all possible transformations.
+        # _registry['from']['to'] -> transformation function
+        self._registry = {}
+
+    def add(self, type_from, type_to, method):
+        """
+        Registers :param:``method`` as conversion method from
+        :param:``type_from`` to :param:``type_to``.
+
+        :param:``method`` signature should receive object of type ``type_from``
+        and return an object of type ``type_to``.
+        """
+        if type_from not in self._registry:
+            self._registry[type_from] = {}
+
+        self._registry[type_from][type_to] = method
+
+    def path(self, type_from, type_to):
+        """
+        Returns a list of conversion steps.
+        """
+        _path = find_shortest_path(self._registry, type_from, type_to)
+        if _path is None:
+            raise NoConvertPath()
+        return _path
+
+    def steps(self, type_from, type_to):
+        """
+        Returns a list of conversion functions that should be applied to
+        translate from ``type_from`` to ``type_to``.
+        """
+        for from_, to_ in bigrams(self.path(type_from, type_to)):
+            yield self._registry[from_][to_]
+
+    def convert(self, obj, type_from, type_to):
+        """
+        Converts object from ``type_from`` to ``type_to``.
+        """
+        for func in self.steps(type_from, type_to):
+            obj = func(obj)
+        return obj

File russian_tagsets/converters.py

+# -*- coding: utf-8 -*-
+from __future__ import absolute_import
+from russian_tagsets.convert_engine import Registry
+
+_registry = Registry()
+
+class ConversionError(Exception):
+    # NOTE(review): nothing in this module raises this exception yet;
+    # failed path lookups surface as convert_engine.NoConvertPath.
+    pass
+
+def add(type_from, type_to, method):
+    """
+    Registers :param:``method`` as conversion method from
+    :param:``type_from`` to :param:``type_to`` in the module-level registry.
+
+    :param:``method`` signature should receive object of type ``type_from``
+    and return an object of type ``type_to``.
+    """
+    _registry.add(type_from, type_to, method)
+
+def steps(type_from, type_to):
+    """
+    Returns an iterable of conversion functions that should be applied
+    in order to convert from ``type_from`` to ``type_to``.
+
+    If conversion is not possible, ``convert_engine.NoConvertPath`` (not
+    ``ConversionError``) is raised; because ``Registry.steps`` is a
+    generator, this happens when the result is iterated, not at call time.
+    """
+    return _registry.steps(type_from, type_to)
+
+def convert(obj, type_from, type_to):
+    """
+    Converts object from ``type_from`` to ``type_to``.
+    """
+    for func in steps(type_from, type_to):
+        obj = func(obj)
+    return obj
+

File russian_tagsets/positional/__init__.py

 Association. pp. 1277-1284.)
 
 """
+from __future__ import absolute_import, unicode_literals, print_function
+import collections
+import array
+
+TagInfo = collections.namedtuple('TagInfo', 'position letter name full_name values')
+
+TAGS_POSITIONS = [
+    TagInfo(1, 'p', 'POS', 'Part of Speech', {
+        'N': 'Noun',
+        'A': 'Adjective',
+        'P': 'Pronoun',
+        'C': 'Numeral',
+        'V': 'Verb',
+        'D': 'Adverb',
+        'R': 'Preposition',
+        'J': 'Conjunction',
+        'I': 'Interjection',
+        'T': 'Particle',
+        'Z': 'Punctuation',
+        'X': 'Unknown, special use',
+    }),
+
+    TagInfo(2, 's', 'subPOS', 'SubPOS (Detailed Part of Speech)', {
+        '#': 'Z: Sentence boundary',
+        ',': 'J: Subordinate conjunction (esli, čto, kotoryj)',
+        '0': 'X: Part of a multiword foreign phrase',
+        '5': 'P: 3rd person pronoun in prepositional forms (nego, nej, ...)',
+        ':': 'Z: Punctuation',
+        '=': 'C: Number written using digits',
+        'A': 'A: Adjective (long, non-participle) (xorosij, ploxoj)',
+        'B': 'V: Verb in present, past or rarely future form (čitaju, splju, pišum, spal, ždal)',
+        'C': 'A: Short adjective (non-participle) (surov, krasiv)',
+        'D': 'P: Pronoun demonstrative (ètot, tot, sej, takoj, èkij, ... )',
+        'F': 'R: Part of a preposition; never appears isolated (nesmotrja)',
+        'G': 'A: Participle, active or long passive (čitajuscij, čitavsij, pročitavšij, čitaemyj; but not pročitannyj (AA), pročitan (Ac)',
+        'I': 'I: Interjection (oj, aga, m-da)',
+        'N': 'N: Noun',
+        'P': 'P: Personal pronoun (ja, my, ty, vy, on, ona, ono, oni, sebja)',
+        'Q': 'P: Relative/interrogative pronoun with nominal declension (kto, čto)',
+        'R': 'R: Nonvocalized preposition (ob, pered, s, v, ...)',
+        'S': 'P: Possessive pronoun (moj, ego, svoj, ..)',
+        'T': 'T: Particle (li)',
+        'U': "A: Possessive adjective (mamin, oveč'ju)",
+        'V': 'R: Vocalized preposition (obo, peredo, so, vo, ...)',
+        'W': 'P: Negative pronoun with nominal declension (nicto, nikto)',
+        'X': 'X: Unknown, special use',
+        'Z': "P: Indefinite pronoun with nominal declension (kto-to, kto-nibud', cto-to, ...)",
+        '^': 'J: Non-subordinate conjunction (i, a, xotja, pricem)',
+        'a': "C: Indefinite numeral (mnogo, neskol'ko)",
+        'b': 'D: Adverb without a possibility to form negation and degrees of comparison (vverxu, vnizu, potom)',
+        'c': 'A: Short passive participle (procitan)',
+        'e': 'V: Gerund (delaja; pridja, otpisav)',
+        'f': "V: Infinitive (delat', spat')",
+        'g': 'D: Adverb forming negation and degrees of comparison (vysoko, daleko)',
+        'i': 'V: Imperative (spi, sdelaj, pročti)',
+        'j': 'C: Generic/collective numeral (dvoje, četvero)',
+        'n': 'C: Cardinal numeral (odin, tri, sorok)',
+        'q': 'P: Relative/interrogative pronoun with adjectival declension (kakoj, kotoryj, cej, ...)',
+        'r': 'C: Ordinal numeral (pervyj, tretij)',
+        'u': "C: Interrogative numeral (skol'ko)",
+        'v': 'C: Multiplicative numeral (dvaždy, triždy)',
+        'w': 'P: Negative pronoun with adjectival declension (nikakoj, nicej)',
+        'z': "P: Indefinite pronoun with adjectival declension (samyj, ves', ...)",
+        '}': 'C: Number written using Roman numerals (XIV)',
+    }),
+
+    TagInfo(3, 'g', 'gender', 'Gender', {
+        'F': 'Feminine',
+        'M': 'Masculine',
+        'N': 'Neuter',
+        'X': 'Any gender',
+    }),
+    TagInfo(4, 'y', 'animacy', 'Animacy', {
+        'A': 'Animate',
+        'I': 'Inanimate',
+        'X': 'Either',
+    }),
+    TagInfo(5, 'n', 'number', 'Number', {
+        'P': 'Plural',
+        'S': 'Singular',
+        'X': 'Any number'
+    }),
+    TagInfo(6, 'c', 'case', 'Case', {
+        '1': 'Nominative',
+        '2': 'Genitive',
+        '3': 'Dative',
+        '4': 'Accusative',
+        '6': 'Locative',
+        '7': 'Instrumental',
+        'X': 'Any case'
+    }),
+    TagInfo(7, 'f', 'possessors_gender', "Possessor's Gender", {
+        'F': 'Feminine possessor',
+        'M': 'Masculine possessor',
+        'N': 'Neuter possessor',
+        'X': 'Possessor of any gender',
+    }),
+    TagInfo(8, 'm', 'possessors_number', "Possessor's Number", {
+        'S': 'Singular possessor',
+        'P': 'Plural possessor',
+    }),
+    TagInfo(9, 'e', 'person', 'Person', {
+        '1': '1st person',
+        '2': '2nd person',
+        '3': '3rd person',
+        'X': 'Any person',
+    }),
+    TagInfo(10, 'r', 'reflexivity', 'Reflexivity', {
+        'I': 'Irreflexive',
+        'R': 'Reflexive',
+    }),
+    TagInfo(11, 't', 'tense', 'Tense', {
+        'F': 'Future',
+        'P': 'Present',
+        'R': 'Past',
+        'X': 'Any (Past, Present, or Future)',
+    }),
+    TagInfo(12, 'b', 'verbal_aspect', 'Verbal aspect', {
+        'P': 'perfective',
+        'I': 'imperfective',
+        'X': 'either aspect',
+    }),
+    TagInfo(13, 'd', 'degree_of_comparison', 'Degree of comparison', {
+        '1': 'Positive',
+        '2': 'Comparative',
+        '3': 'Superlative',
+    }),
+    TagInfo(14, 'a', 'negation', 'Negation', {
+        'A': 'Affirmative (not negated)',
+        'N': 'Negated',
+    }),
+    TagInfo(15, 'v', 'voice', 'Voice', {
+        'A': 'Active',
+        'P': 'Passive',
+    }),
+    TagInfo(16, 'i', 'variant', 'Variant, Abbreviation', {
+        '1': 'Variant (generally less frequent)',
+        '2': 'Variant (generally rarely used, bookish, or archaic)',
+        '3': 'Variant (very archaic)',
+        '5': 'Variant (colloquial)',
+        '6': 'Variant (colloquial, generally less frequent)',
+        '7': 'Variant (colloquial, generally less frequent)',
+        '8': 'Abbreviations'
+    }),
+]
+
+def _fget(ind):
+    def fget(self):
+        return self._data[ind]
+    return fget
+
+def _fset(ind):
+    def fset(self, value):
+        if value != '-' and value not in TAGS_POSITIONS[ind].values:
+            raise ValueError('Invalid value %s' % value)
+        self._data[ind] = value
+    return fset
+
+def _prop(ind):
+    # (getter, setter) pair for position ``ind``; meant to be unpacked
+    # into ``property(...)``.
+    return _fget(ind), _fset(ind)
+
+
+class Tag(object):
+    def __init__(self, txt='-'*16):
+        self._data = array.array(str('u'), txt)
+
+    mainPOS = property(*_prop(0))
+    subPOS = property(*_prop(1))
+    gender = property(*_prop(2))
+    animacy = property(*_prop(3))
+    number = property(*_prop(4))
+    case = property(*_prop(5))
+    possessors_gender = property(*_prop(6))
+    possessors_number = property(*_prop(7))
+    person = property(*_prop(8))
+    reflexivity = property(*_prop(9))
+    tense = property(*_prop(10))
+    verbal_aspect = property(*_prop(11))
+    degree_of_comparison = property(*_prop(12))
+    negation = property(*_prop(13))
+    voice = property(*_prop(14))
+    variant = property(*_prop(15))
+
+    # 2-letter POS
+    def _get_pos(self):
+        return self._data[0:2].tounicode()
+    def _set_pos(self, txt):
+        self.mainPOS, self.subPOS = txt[0], txt[1]
+    POS = property(_get_pos, _set_pos)
+
+
+    def is_valid(self):
+        if len(self._data) != 16:
+            return False
+
+        for index, c in enumerate(self):
+            if c == '-':
+                continue
+            if c not in TAGS_POSITIONS[index].values:
+                return False
+
+        return True
+
+    def verbose_info(self):
+        return dict((
+            (TAGS_POSITIONS[index].name, TAGS_POSITIONS[index].values[tag])
+            for index, tag in enumerate(self) if tag != '-'
+        ))
+
+    def __iter__(self):
+        return iter(self._data)
+
+    def __str__(self):
+        return self._data.tounicode() # this is not correct under python 2.x
+
+
+if __name__ == '__main__':
+    print(Tag("NNFIS7-------A--").verbose_info())

File russian_tagsets/positional/spec.py

 # -*- coding: utf-8 -*-
 from __future__ import absolute_import
+
+# THIS IS UNUSED AND DOESN'T WORK
+
 from fnmatch import fnmatchcase
 
 _RESTRICTIONS = [

File russian_tagsets/positional/tagset.py

-# -*- coding: utf-8 -*-
-from __future__ import absolute_import
-import collections
-
-
-TagInfo = collections.namedtuple('TagInfo', 'position letter name full_name values')
-
-TAGS_POSITIONS = [
-    TagInfo(1, 'p', 'POS', 'Part of Speech', {
-        'N': 'Noun',
-        'A': 'Adjective',
-        'P': 'Pronoun',
-        'C': 'Numeral',
-        'V': 'Verb',
-        'D': 'Adverb',
-        'R': 'Preposition',
-        'J': 'Conjunction',
-        'I': 'Interjection',
-        'T': 'Particle',
-        'Z': 'Punctuation',
-        'X': 'Unknown, special use',
-    }),
-
-    TagInfo(2, 's', 'subPOS', 'SubPOS (Detailed Part of Speech)', {
-        '#': u'Z: Sentence boundary',
-        ',': u'J: Subordinate conjunction (esli, čto, kotoryj)',
-        '0': u'X: Part of a multiword foreign phrase',
-        '5': u'P: 3rd person pronoun in prepositional forms (nego, nej, ...)',
-        ':': u'Z: Punctuation',
-        '=': u'C: Number written using digits',
-        'A': u'A: Adjective (long, non-participle) (xorosij, ploxoj)',
-        'B': u'V: Verb in present, past or rarely future form (čitaju, splju, pišum, spal, ždal)',
-        'C': u'A: Short adjective (non-participle) (surov, krasiv)',
-        'D': u'P: Pronoun demonstrative (ètot, tot, sej, takoj, èkij, ... )',
-        'F': u'R: Part of a preposition; never appears isolated (nesmotrja)',
-        'G': u'A: Participle, active or long passive (čitajuscij, čitavsij, pročitavšij, čitaemyj; but not pročitannyj (AA), pročitan (Ac)',
-        'I': u'I: Interjection (oj, aga, m-da)',
-        'N': u'N: Noun',
-        'P': u'P: Personal pronoun (ja, my, ty, vy, on, ona, ono, oni, sebja)',
-        'Q': u'P: Relative/interrogative pronoun with nominal declension (kto, čto)',
-        'R': u'R: Nonvocalized preposition (ob, pered, s, v, ...)',
-        'S': u'P: Possessive pronoun (moj, ego, svoj, ..)',
-        'T': u'T: Particle (li)',
-        'U': u"A: Possessive adjective (mamin, oveč'ju)",
-        'V': u'R: Vocalized preposition (obo, peredo, so, vo, ...)',
-        'W': u'P: Negative pronoun with nominal declension (nicto, nikto)',
-        'X': u'X: Unknown, special use',
-        'Z': u"P: Indefinite pronoun with nominal declension (kto-to, kto-nibud', cto-to, ...)",
-        '^': u'J: Non-subordinate conjunction (i, a, xotja, pricem)',
-        'a': u"C: Indefinite numeral (mnogo, neskol'ko)",
-        'b': u'D: Adverb without a possibility to form negation and degrees of comparison (vverxu, vnizu, potom)',
-        'c': u'A: Short passive participle (procitan)',
-        'e': u'V: Gerund (delaja; pridja, otpisav)',
-        'f': u"V: Infinitive (delat', spat')",
-        'g': u'D: Adverb forming negation and degrees of comparison (vysoko, daleko)',
-        'i': u'V: Imperative (spi, sdelaj, pročti)',
-        'j': u'C: Generic/collective numeral (dvoje, četvero)',
-        'n': u'C: Cardinal numeral (odin, tri, sorok)',
-        'q': u'P: Relative/interrogative pronoun with adjectival declension (kakoj, kotoryj, cej, ...)',
-        'r': u'C: Ordinal numeral (pervyj, tretij)',
-        'u': u"C: Interrogative numeral (skol'ko)",
-        'v': u'C: Multiplicative numeral (dvaždy, triždy)',
-        'w': u'P: Negative pronoun with adjectival declension (nikakoj, nicej)',
-        'z': u"P: Indefinite pronoun with adjectival declension (samyj, ves', ...)",
-        '}': u'C: Number written using Roman numerals (XIV)',
-    }),
-
-    TagInfo(3, 'g', 'gender', 'Gender', {
-        'F': 'Feminine',
-        'M': 'Masculine',
-        'N': 'Neuter',
-        'X': 'Any gender',
-    }),
-    TagInfo(4, 'y', 'animacy', 'Animacy', {
-        'A': 'Animate',
-        'I': 'Inanimate',
-        'X': 'Either',
-    }),
-    TagInfo(5, 'n', 'number', 'Number', {
-        'P': 'Plural',
-        'S': 'Singular',
-        'X': 'Any number'
-    }),
-    TagInfo(6, 'c', 'case', 'Case', {
-        '1': 'Nominative',
-        '2': 'Genitive',
-        '3': 'Dative',
-        '4': 'Accusative',
-        '6': 'Locative',
-        '7': 'Instrumental',
-        'X': 'Any case'
-    }),
-    TagInfo(7, 'f', 'possessors_gender', 'Possessor\'s Gender', {
-        'F': 'Feminine possessor',
-        'M': 'Masculine possessor',
-        'N': 'Neuter possessor',
-        'X': 'Possessor of any gender',
-    }),
-    TagInfo(8, 'm', 'possessors_number', 'Possessor\'s Number', {
-        'S': 'Singular possessor',
-        'P': 'Plural possessor',
-    }),
-    TagInfo(9, 'e', 'person', 'Person', {
-        '1': '1st person',
-        '2': '2nd person',
-        '3': '3rd person',
-        'X': 'Any person',
-    }),
-    TagInfo(10, 'r', 'reflexivity', 'Reflexivity', {
-        'I': 'Irreflexive',
-        'R': 'Reflexive',
-    }),
-    TagInfo(11, 't', 'tense', 'Tense', {
-        'F': 'Future',
-        'P': 'Present',
-        'R': 'Past',
-        'X': 'Any (Past, Present, or Future)',
-    }),
-    TagInfo(12, 'b', 'verbal_aspect', 'Verbal aspect', {
-        'P': 'perfective',
-        'I': 'imperfective',
-        'X': 'either aspect',
-    }),
-    TagInfo(13, 'd', 'degree_of_comparison', 'Degree of comparison', {
-        '1': 'Positive',
-        '2': 'Comparative',
-        '3': 'Superlative',
-    }),
-    TagInfo(14, 'a', 'negation', 'Negation', {
-        'A': 'Affirmative (not negated)',
-        'N': 'Negated',
-    }),
-    TagInfo(15, 'v', 'voice', 'Voice', {
-        'A': 'Active',
-        'P': 'Passive',
-    }),
-    TagInfo(16, 'i', 'variant', 'Variant, Abbreviation', {
-        '1': 'Variant (generally less frequent)',
-        '2': 'Variant (generally rarely used, bookish, or archaic)',
-        '3': 'Variant (very archaic)',
-        '5': 'Variant (colloquial)',
-        '6': 'Variant (colloquial, generally less frequent)',
-        '7': 'Variant (colloquial, generally less frequent)',
-        '8': 'Abbreviations'
-    }),
-]
-
-
-_PositionalTag = collections.namedtuple(
-    'PositionalTag', [pos.name for pos in TAGS_POSITIONS]
-)
-
-
-class PositionalTag(_PositionalTag):
-
-    @classmethod
-    def fromstr(cls, string):
-        assert len(string) == 16
-        return cls(*string)
-
-
-    def is_valid(self):
-        if len(self) != 16:
-            return False
-
-        for index, c in enumerate(self):
-            if c == '-':
-                continue
-            if c not in TAGS_POSITIONS[index].values:
-                return False
-
-        return True
-
-
-    def verbose_info(self):
-        return [
-            [TAGS_POSITIONS[index].name, TAGS_POSITIONS[index].values[tag]]
-            for index, tag in enumerate(self) if tag != '-'
-        ]
-
-
-
-if __name__ == '__main__':
-    print PositionalTag.fromstr("NNFIS7-------A--").verbose_info()

File russian_tagsets/tests/__init__.py

+# -*- coding: utf-8 -*-
+from __future__ import absolute_import

File russian_tagsets/tests/test_aot.py

+# -*- coding: utf-8 -*-
+from __future__ import absolute_import, unicode_literals, print_function
+import pytest
+
+from russian_tagsets import converters
+from russian_tagsets import positional
+
+
+class TestConversion(object):
+    """Checks aot <-> positional conversion against known tag pairs."""
+
+    # [aot tag, expected positional tag text]
+    TAGS = [
+        [('МС', '3л,мр,ед,им'), 'PPM-S1--3I------'], # он
+        [('Г', 'дст,прш,мр,ед'), 'VBM-S----IR-----'], # купил
+
+        #[('МС-П', 'жр,ед,вн,од,но'), 'PSFIS4-P1I------'], # нашу:
+        [('МС-П', 'жр,ед,вн,од,но'), 'PSFXS4---I------'], # not enough info
+
+        [('П', 'жр,ед,вн,од,но'), 'AAFXS4------1A--'], # старую
+        [('С', 'жр,ед,вн'), 'NNF-S4-------A--'], # фотографию
+    ]
+
+    # NOTE(review): aot.from_positional is currently a stub that returns
+    # None, so these cases can't pass until it is implemented.
+    @pytest.mark.parametrize(("aot_tag", "pos_tag_txt"), TAGS)
+    def test_from_positional(self, aot_tag, pos_tag_txt):
+        converted = converters.convert(
+            positional.Tag(pos_tag_txt),
+            'positional', 'aot'
+        )
+        assert converted == aot_tag
+
+    @pytest.mark.parametrize(("aot_tag", "pos_tag_txt"), TAGS)
+    def test_to_positional(self, aot_tag, pos_tag_txt):
+        converted = converters.convert(aot_tag, 'aot', 'positional')
+        assert str(converted) == pos_tag_txt

File russian_tagsets/tests/test_engine.py

+# -*- coding: utf-8 -*-
+from __future__ import absolute_import
+import unittest
+from russian_tagsets import convert_engine
+
+class TransformTest(unittest.TestCase):
+    """Tests for path finding and conversion in convert_engine.Registry."""
+
+    def test_transform_path(self):
+        # graph: A->B, A->C, B->A, C->B, C->D (D has no outgoing edges)
+        reg = convert_engine.Registry()
+
+        ident = lambda obj: obj
+        reg.add('A', 'B', ident)
+        reg.add('C', 'B', ident)
+        reg.add('A', 'C', ident)
+        reg.add('B', 'A', ident)
+        reg.add('C', 'D', ident)
+
+        self.assertEqual(reg.path('A', 'B'), ['A', 'B'])
+        self.assertEqual(reg.path('A', 'C'), ['A', 'C'])
+        self.assertEqual(reg.path('B', 'C'), ['B', 'A', 'C'])
+        self.assertEqual(reg.path('A', 'A'), ['A'])
+        self.assertEqual(reg.path('C', 'A'), ['C', 'B', 'A'])
+        self.assertEqual(reg.path('B', 'D'), ['B', 'A', 'C', 'D'])
+        self.assertRaises(convert_engine.NoConvertPath, reg.path, 'D', 'A')
+
+    def test_transform(self):
+        reg = convert_engine.Registry()
+        reg.add('lower', 'upper', lambda s: s.upper())
+        reg.add('upper', 'none', lambda s: None)
+        reg.add('upper', 'lower', lambda s: s.lower() if s.isupper() else 'Error')
+
+        self.assertEqual(reg.convert('foo', 'lower', 'upper'), 'FOO')
+        # same source and target type: no conversion step is applied
+        self.assertEqual(reg.convert('foo', 'lower', 'lower'), 'foo')
+        self.assertEqual(reg.convert('FOO', 'upper', 'lower'), 'foo')
+        # two chained steps: lower -> upper -> none
+        self.assertEqual(reg.convert('foo', 'lower', 'none'), None)

File russian_tagsets/tests/test_positional.py

+# -*- coding: utf-8 -*-
+from __future__ import absolute_import, print_function, unicode_literals
+import unittest
+from russian_tagsets import positional
+
+class TagTest(unittest.TestCase):
+    """Tests for positional.Tag: validation, str() and attribute access."""
+
+    def test_validate(self):
+        self.assertFalse(positional.Tag('sdgdhsgdhdgavbhk').is_valid())
+
+        tag = positional.Tag('NNFIS7-------A--')
+        self.assertTrue(tag.is_valid())
+
+    def test_str(self):
+        tag_txt = 'NNFIS7-------A--'
+        tag = positional.Tag(tag_txt)
+        self.assertEqual(str(tag), tag_txt)
+
+    def test_attributes(self):
+        tag = positional.Tag('NNFIS7-------A--')
+        self.assertEqual(tag.POS, 'NN')
+        # assigning POS rewrites the first two positions only
+        tag.POS = 'VV'
+        self.assertEqual(str(tag), 'VVFIS7-------A--')
+
     author_email='kmike84@gmail.com',
 
     url = 'https://bitbucket.org/kmike/russian-tagsets/',
-    download_url = 'https://bitbucket.org/kmike/russian-tagsets/get/v%s.zip' % __version__,
+    test_requires = ['pytest'],
 
     packages = ['russian_tagsets', 'russian_tagsets.positional'],
 
           'Programming Language :: Python :: 2',
           'Programming Language :: Python :: 2.6',
           'Programming Language :: Python :: 2.7',
+          'Programming Language :: Python :: 3',
+          'Programming Language :: Python :: 3.2',
           'Topic :: Software Development :: Libraries :: Python Modules',
           'Topic :: Scientific/Engineering :: Information Analysis',
           'Topic :: Text Processing :: Linguistic',
+[tox]
+envlist = py26,py27,py32,pypy
+
+[testenv]
+deps =
+    pytest
+    pytest-cov
+    coverage
+commands=
+    py.test --cov-report term-missing --cov russian_tagsets []