Commits

Mikhail Korobov committed 3050511

initial commit: base structure, tags info, restrictions parser-matcher

Comments (0)

Files changed (7)

.hgignore

+^build
+\.pyc$

README.rst

Empty file added.

russian_tagsets/__init__.py

Empty file added.

russian_tagsets/positional/__init__.py

+# -*- coding: utf-8 -*-
+"""
+Python implementation of http://ufal.mff.cuni.cz/~hana/morph/rutags.html
+
+(Jirka Hana and Anna Feldman (2010). A Positional Tagset for Russian.
+In: Proceedings of the 7th International Conference on Language Resources
+and Evaluation (LREC 2010). European Language Resources
+Association. pp. 1277-1284.)
+
+"""

russian_tagsets/positional/spec.py

+# -*- coding: utf-8 -*-
+from __future__ import absolute_import
+from fnmatch import fnmatchcase
+
+# Raw tag restrictions, one rule per string, written as
+#     '<condition> => <conclusion> [<conclusion> ...]'
+# A tag that matches the condition must match at least one conclusion
+# (see ``validate``).  Patterns use a small regex-like notation that
+# ``prepare_for_fnmatch`` rewrites for ``fnmatch``: '.' is any single
+# character, '[^...]' is a negated set and 'X' is rewritten to "any
+# character except '-'".
+# A condition of the form '<n> <value>' is shorthand for "<value> after
+# <n> leading characters" (see ``normalize_condition``).
+_RESTRICTIONS = [
+    # only nouns distinguish gender in plural
+    # NOTE(review): the condition has no leading wildcards, so [FNM] is
+    # tested against the first slot (POS) rather than the gender slot --
+    # confirm this is intended.
+    '[FNM].P.* => N.*',
+
+    # for non-nouns, animacy is specified only for masc.sg.acc and pl.acc.
+    '[^N]..[AI].* => ..M.S4.* ....P4.*',
+
+    # AG (long participle) => tv \in {PA,RA,XP}
+    'AG.* => AG....---.P.-.A- AG....---.R.-.A- AG....---.X.-.P-',
+
+    # PP (personal pronoun)
+    'PP--..--.I------ => PP--..--[12]I------',
+    'PP......3.* => .....[^6].*',   # locative only after preposition
+    'P5.* => .....[^1].*',   # nominative only without preposition
+
+    # 3rd person possessive pronouns
+    'PS......3.* => PSXXXX[MFN]S3.* PSXXXXXP3.*',
+
+    # imperative only in 2nd person
+    # NOTE(review): '.{8}' is a regex-style repeat (eight single-character
+    # wildcards); fnmatch has no such quantifier, so it has to be expanded
+    # before matching.
+    'Vi.* => .{8}2.*',
+
+    # past tense
+    '10 R => AG.* VB.*',
+
+    # X gender
+    #   no agr gender in plural
+    #   plural tantum noun
+    #   foreign/abbr
+    #   3rd person pers. pronoun
+    #   "CnXP.*",    # !! TODO check
+    '2 X => A.X.P.* P[8DSqwz]X.P.* PSXXXX..3.* CnX.P.* CrX.P.* VBX.P.*    NNX.P.*    NNXXXX.* AAXXXX.*    P[P5]X-P.--3.*',
+
+    # X animacy. TODO what else. Plural agreement?
+    # (rule below is disabled)
+    #3 X => AAXXXX.* NNXXXX.* PSXXXX..3.*
+
+    # X number - foreign/abbr  PP
+    '4 X => NN..XX.* AAXXXX.* VB--X---XP..*   PSXXXX..3.*',
+
+    # X case - foreign/abbr PS
+    '5 X => NN..[PSX]X.* AAXXXX.*     PSXXXX..3.*',
+
+    # .......X.......
+    # X possessor's gender
+    '6 X => PSXXXXXP3.*',
+
+    # person .......X....... - # foreign
+    '8 X => VB--X---X.P.*',
+
+    # tense # TODO check voice
+    '10 X => AG.*',
+]
+
+def split_parts(restriction):
+    condition, conclusions = restriction.split(' => ')
+    return condition.strip(), conclusions.strip().split()
+
+def normalize_condition(condition):
+    if condition[0].isdigit():
+        num, value = condition.split()
+        condition = '.'*(int(num)) + value + '*'
+    return condition
+
+def prepare_for_fnmatch(pattern):
+    return pattern.replace('X', '[!-]').replace('^', '!').replace('.', '?')
+
+RESTRICTIONS = [
+    [
+        prepare_for_fnmatch(normalize_condition(condition)),
+        map(prepare_for_fnmatch, conclusions)
+    ]
+    for condition, conclusions in map(split_parts, _RESTRICTIONS)
+]
+
+class TagValidationError(Exception):
+    pass
+
+
+def validate(tag_string):
+    if len(tag_string) == 15:
+        tag_string += '-'
+
+    if len(tag_string) != 16:
+        raise TagValidationError('length is incorrect')
+
+    for condition, conclusions in RESTRICTIONS:
+        print "Matching ", condition
+        if fnmatchcase(tag_string, condition):
+            print "Matched, checking condition ..."
+
+            if not any([fnmatchcase(tag_string, conclusion) for conclusion in conclusions]):
+                raise TagValidationError("%s is not matched by %s" % (tag_string, conclusions))
+
+            print 'passed'
+
+
+# Raw text of the tagset specification: the allowed values for each of the
+# 16 slots, followed by tag templates and a list of abbreviation tags.
+# It is kept verbatim; nothing in the visible code reads or parses it.
+SPEC = """
+// =============================================================================
+// Specification of the (new) Russian tagset (16 slots)
+// =============================================================================
+
+// =============================================================================
+// Note that the variant slot is ignored except abbrs.
+
+// =============================================================================
+// values for individual slots;
+// =============================================================================
+// slots with <null> are fully specified by templates below
+// psgyncfmertbdavi
+
+null
+null
+FMNX
+AIX
+PSX
+123467X
+FMNX
+PS
+123X
+RI
+FPRX
+PIX
+123
+AN
+AP
+null
+}
+
+
+
+// =============================================================================
+// Templates
+// =============================================================================
+// ? are replace by all values satisfying restrictions
+
+
+NN????-------?--
+AA????------??--
+AC?-?--------?--
+AG????---???-??-
+AU?????------?--
+Ac?-?--------?P-
+PP--??--?I------
+PP?-??--3I------
+PP---?---R------
+P5?-??--3I------
+PD????----------
+PW---?----------
+Pw????----------
+PS????-??I------
+PSXXXX??3I------
+PS????---R------
+PQ---?----------
+Pq????----------
+PZ---?----------
+Pz????----------
+C=--------------
+C}--------------
+Cn????----------
+Cn??-?----------
+Cn-?-?----------
+Cn--??----------
+Cr????----------
+Cj-?-?----------
+Cu---?----------
+Ca---?----------
+Ca????----------
+Cv--------------
+VB--?---????----
+VB?-?----?R?----
+Ve-------?-?----
+Vf-------?-?----
+Vi--?---??-?----
+Db--------------
+Dg----------??--
+RR---?----------
+RV---?----------
+RF--------------
+J^--------------
+J,--------------
+TT--------------
+II--------------
+Z#--------------
+Z:--------------
+X0--------------
+XX--------------
+
+
+// abbreviations
+AAXXXX------1A-8
+Db-------------8
+J^-------------8
+NNFAPX-------A-8
+NNFIPX-------A-8
+NNFASX-------A-8
+NNFISX-------A-8
+NNFAXX-------A-8
+NNFIXX-------A-8
+NNMIPX-------A-8
+NNMAPX-------A-8
+NNMISX-------A-8
+NNMASX-------A-8
+NNMIXX-------A-8
+NNMAXX-------A-8
+NNNIPX-------A-8
+NNNAPX-------A-8
+NNNISX-------A-8
+NNNASX-------A-8
+NNNIXX-------A-8
+NNNAXX-------A-8
+NNXXXX-------A-8
+}
+"""

russian_tagsets/positional/tagset.py

+# -*- coding: utf-8 -*-
+from __future__ import absolute_import
+import collections
+
+
+# Description of one tag slot: its 1-based position, one-letter code,
+# short name, human-readable name and a {value: description} mapping.
+TagInfo = collections.namedtuple('TagInfo', 'position letter name full_name values')
+
+# One TagInfo per slot, in slot order; the list index is ``position - 1``.
+TAGS_POSITIONS = [
+    TagInfo(1, 'p', 'POS', 'Part of Speech', {
+        'N': 'Noun',
+        'A': 'Adjective',
+        'P': 'Pronoun',
+        'C': 'Numeral',
+        'V': 'Verb',
+        'D': 'Adverb',
+        'R': 'Preposition',
+        'J': 'Conjunction',
+        'I': 'Interjection',
+        'T': 'Particle',
+        'Z': 'Punctuation',
+        'X': 'Unknown, special use',
+    }),
+
+    # Each description starts with the POS letter the sub-POS belongs to.
+    TagInfo(2, 's', 'subPOS', 'SubPOS (Detailed Part of Speech)', {
+        '#': u'Z: Sentence boundary',
+        ',': u'J: Subordinate conjunction (esli, čto, kotoryj)',
+        '0': u'X: Part of a multiword foreign phrase',
+        '5': u'P: 3rd person pronoun in prepositional forms (nego, nej, ...)',
+        ':': u'Z: Punctuation',
+        '=': u'C: Number written using digits',
+        'A': u'A: Adjective (long, non-participle) (xorosij, ploxoj)',
+        'B': u'V: Verb in present, past or rarely future form (čitaju, splju, pišum, spal, ždal)',
+        'C': u'A: Short adjective (non-participle) (surov, krasiv)',
+        'D': u'P: Pronoun demonstrative (ètot, tot, sej, takoj, èkij, ... )',
+        'F': u'R: Part of a preposition; never appears isolated (nesmotrja)',
+        'G': u'A: Participle, active or long passive (čitajuscij, čitavsij, pročitavšij, čitaemyj; but not pročitannyj (AA), pročitan (Ac)',
+        'I': u'I: Interjection (oj, aga, m-da)',
+        'N': u'N: Noun',
+        'P': u'P: Personal pronoun (ja, my, ty, vy, on, ona, ono, oni, sebja)',
+        'Q': u'P: Relative/interrogative pronoun with nominal declension (kto, čto)',
+        'R': u'R: Nonvocalized preposition (ob, pered, s, v, ...)',
+        'S': u'P: Possessive pronoun (moj, ego, svoj, ..)',
+        'T': u'T: Particle (li)',
+        'U': u"A: Possessive adjective (mamin, oveč'ju)",
+        'V': u'R: Vocalized preposition (obo, peredo, so, vo, ...)',
+        'W': u'P: Negative pronoun with nominal declension (nicto, nikto)',
+        'X': u'X: Unknown, special use',
+        'Z': u"P: Indefinite pronoun with nominal declension (kto-to, kto-nibud', cto-to, ...)",
+        '^': u'J: Non-subordinate conjunction (i, a, xotja, pricem)',
+        'a': u"C: Indefinite numeral (mnogo, neskol'ko)",
+        'b': u'D: Adverb without a possibility to form negation and degrees of comparison (vverxu, vnizu, potom)',
+        'c': u'A: Short passive participle (procitan)',
+        'e': u'V: Gerund (delaja; pridja, otpisav)',
+        'f': u"V: Infinitive (delat', spat')",
+        'g': u'D: Adverb forming negation and degrees of comparison (vysoko, daleko)',
+        'i': u'V: Imperative (spi, sdelaj, pročti)',
+        'j': u'C: Generic/collective numeral (dvoje, četvero)',
+        'n': u'C: Cardinal numeral (odin, tri, sorok)',
+        'q': u'P: Relative/interrogative pronoun with adjectival declension (kakoj, kotoryj, cej, ...)',
+        'r': u'C: Ordinal numeral (pervyj, tretij)',
+        'u': u"C: Interrogative numeral (skol'ko)",
+        'v': u'C: Multiplicative numeral (dvaždy, triždy)',
+        'w': u'P: Negative pronoun with adjectival declension (nikakoj, nicej)',
+        'z': u"P: Indefinite pronoun with adjectival declension (samyj, ves', ...)",
+        '}': u'C: Number written using Roman numerals (XIV)',
+    }),
+
+    TagInfo(3, 'g', 'gender', 'Gender', {
+        'F': 'Feminine',
+        'M': 'Masculine',
+        'N': 'Neuter',
+        'X': 'Any gender',
+    }),
+    TagInfo(4, 'y', 'animacy', 'Animacy', {
+        'A': 'Animate',
+        'I': 'Inanimate',
+        'X': 'Either',
+    }),
+    TagInfo(5, 'n', 'number', 'Number', {
+        'P': 'Plural',
+        'S': 'Singular',
+        'X': 'Any number'
+    }),
+    TagInfo(6, 'c', 'case', 'Case', {
+        '1': 'Nominative',
+        '2': 'Genitive',
+        '3': 'Dative',
+        '4': 'Accusative',
+        '6': 'Locative',
+        '7': 'Instrumental',
+        'X': 'Any case'
+    }),
+    TagInfo(7, 'f', 'possessors_gender', 'Possessor\'s Gender', {
+        'F': 'Feminine possessor',
+        'M': 'Masculine possessor',
+        'N': 'Neuter possessor',
+        'X': 'Possessor of any gender',
+    }),
+    TagInfo(8, 'm', 'possessors_number', 'Possessor\'s Number', {
+        'S': 'Singular possessor',
+        'P': 'Plural possessor',
+    }),
+    TagInfo(9, 'e', 'person', 'Person', {
+        '1': '1st person',
+        '2': '2nd person',
+        '3': '3rd person',
+        'X': 'Any person',
+    }),
+    TagInfo(10, 'r', 'reflexivity', 'Reflexivity', {
+        'I': 'Irreflexive',
+        'R': 'Reflexive',
+    }),
+    TagInfo(11, 't', 'tense', 'Tense', {
+        'F': 'Future',
+        'P': 'Present',
+        'R': 'Past',
+        'X': 'Any (Past, Present, or Future)',
+    }),
+    TagInfo(12, 'b', 'verbal_aspect', 'Verbal aspect', {
+        'P': 'perfective',
+        'I': 'imperfective',
+        'X': 'either aspect',
+    }),
+    TagInfo(13, 'd', 'degree_of_comparison', 'Degree of comparison', {
+        '1': 'Positive',
+        '2': 'Comparative',
+        '3': 'Superlative',
+    }),
+    TagInfo(14, 'a', 'negation', 'Negation', {
+        'A': 'Affirmative (not negated)',
+        'N': 'Negated',
+    }),
+    TagInfo(15, 'v', 'voice', 'Voice', {
+        'A': 'Active',
+        'P': 'Passive',
+    }),
+    TagInfo(16, 'i', 'variant', 'Variant, Abbreviation', {
+        '1': 'Variant (generally less frequent)',
+        '2': 'Variant (generally rarely used, bookish, or archaic)',
+        '3': 'Variant (very archaic)',
+        '5': 'Variant (colloquial)',
+        '6': 'Variant (colloquial, generally less frequent)',
+        # NOTE(review): same description as '6' -- confirm against the
+        # tagset definition.
+        '7': 'Variant (colloquial, generally less frequent)',
+        '8': 'Abbreviations'
+    }),
+]
+
+
+# Base tuple type with one field per slot, named after TagInfo.name.
+_PositionalTag = collections.namedtuple(
+    'PositionalTag', [pos.name for pos in TAGS_POSITIONS]
+)
+
+
+class PositionalTag(_PositionalTag):
+
+    @classmethod
+    def fromstr(cls, string):
+        assert len(string) == 16
+        return cls(*string)
+
+
+    def is_valid(self):
+        if len(self) != 16:
+            return False
+
+        for index, c in enumerate(self):
+            if c == '-':
+                continue
+            if c not in TAGS_POSITIONS[index].values:
+                return False
+
+        return True
+
+
+    def verbose_info(self):
+        return [
+            [TAGS_POSITIONS[index].name, TAGS_POSITIONS[index].values[tag]]
+            for index, tag in enumerate(self) if tag != '-'
+        ]
+
+
+
+if __name__ == '__main__':
+    print PositionalTag.fromstr("NNFIS7-------A--").verbose_info()

setup.py

+#! /usr/bin/env python
+from distutils.core import setup
+
+__version__ = '0.1'
+
+setup(
+    name="russian-tagsets",
+    version=__version__,
+    description="Russian tagset converters library",
+    long_description = open('README.rst').read(),
+    license = 'MIT license',
+    author='Mikhail Korobov',
+    author_email='kmike84@gmail.com',
+
+    url = 'https://bitbucket.org/kmike/russian-tagsets/',
+    download_url = 'https://bitbucket.org/kmike/russian-tagsets/get/v%s.zip' % __version__,
+
+    packages = ['russian_tagsets'],
+
+    classifiers=[
+          'Development Status :: 1 - Planning',
+          'Intended Audience :: Developers',
+          'Intended Audience :: Science/Research',
+          'License :: OSI Approved :: MIT License',
+          'Natural Language :: Russian',
+          'Programming Language :: Python',
+          'Programming Language :: Python :: 2',
+          'Programming Language :: Python :: 2.6',
+          'Programming Language :: Python :: 2.7',
+          'Topic :: Software Development :: Libraries :: Python Modules',
+          'Topic :: Scientific/Engineering :: Information Analysis',
+          'Topic :: Text Processing :: Linguistic',
+    ],
+
+)
+
+
+
+