Commits

Mikhail Korobov committed 355fa2a

Initial import

  • Participants
  • Tags 0.1

Comments (0)

Files changed (8)

File .hgignore

+\.settings
+\.project
+\.pydevproject
+\.cache/*
+\.idea/*
+
+#temp files
+\.pyc$
+\.orig$
+~$
+
+#os files
+\.DS_Store
+Thumbs.db$
+
+#setup
+^build
+^dist
+^MANIFEST$
+\.egg-info$
+\.coverage$
+
+#project-specific files
+\.tox
+^stuff
+^shuffled_rnc
+\.ipynb

File LICENSE.txt

+Copyright (c) 2013 Mikhail Korobov
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.

File MANIFEST.in

+include *.rst
+include tox.ini

File README.rst

+================
+ruscorpora-tools
+================
+
+This package provides a Python interface to a free corpus subset
+available at http://ruscorpora.ru.
+
+Installation
+============
+
+::
+
+    pip install ruscorpora-tools
+
+Usage
+=====
+
+Obtaining corpora
+-----------------
+
+Download and unpack the archive with XML files from
+http://www.ruscorpora.ru/corpora-usage.html
+
+Using corpora
+-------------
+
+The ``ruscorpora.parse_xml`` function parses a single XML file and returns
+an iterator over sentences; each sentence is a list of ``ruscorpora.Token``
+instances, each annotated with a list of ``ruscorpora.Annotation`` instances.
+
+``ruscorpora.simplify`` simplifies the result of ``ruscorpora.parse_xml`` by
+removing ambiguous annotations, joining split tokens and removing accent
+information.
+
+::
+
+    >>> import ruscorpora as rc
+    >>> for sent in rc.simplify(rc.parse_xml('fiction.xml')):
+    ...     print(sent)
+
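+Each ``Token`` exposes ``text`` and ``annotations``, and each ``Annotation``
+carries ``lex``, ``gr`` and ``joined`` fields, so individual tokens can be
+inspected as well (a rough sketch; ``fiction.xml`` is just an example file
+name)::
+
+    >>> for sent in rc.simplify(rc.parse_xml('fiction.xml')):
+    ...     for token in sent:
+    ...         print(token.text, [a.gr for a in token.annotations])
+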
+Development
+===========
+
+Development happens on GitHub and Bitbucket:
+
+* https://github.com/kmike/ruscorpora-tools
+* https://bitbucket.org/kmike/ruscorpora-tools
+
+The issue tracker is on GitHub: https://github.com/kmike/ruscorpora-tools/issues
+
+Feel free to submit ideas, bug reports, pull requests (git or hg) or regular patches.
+
+Running tests
+-------------
+
+Make sure `tox <http://tox.testrun.org>`_ is installed and run
+
+::
+
+    $ tox
+
+from the source checkout. Tests should pass under Python 2.6-3.3
+and PyPy > 1.8.

File ruscorpora/__init__.py

+# -*- coding: utf-8 -*-
+from __future__ import absolute_import, unicode_literals, print_function
+try:
+    from xml.etree import cElementTree as ElementTree
+except ImportError:
+    from xml.etree import ElementTree
+
+import warnings
+import functools
+from collections import namedtuple
+
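+# Token.text is the word form or raw punctuation; Token.annotations is a list
+# of Annotation instances (or None for punctuation tokens built by parse_xml).
+# Annotation mirrors the attributes of an <ana> element: the lemma (lex), the
+# grammatical tags (gr) and the optional joined="together"/"hyphen" marker.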
+Token = namedtuple('Token', 'text annotations')
+Annotation = namedtuple('Annotation', 'lex gr joined')
+
+def parse_xml(source):
+    """
+    Parse XML file ``source`` (which can be obtained from ruscorpora.ru);
+    return an iterator of sentences. Each sentence is a list of Token
+    instances.
+    """
+    xml = ElementTree.parse(source)
+
+    def punct_tokens(txt):
+        # text between XML elements (punctuation and whitespace) becomes
+        # unannotated tokens, one per non-empty line
+        if not txt:
+            return []
+
+        return [Token(tok, None) for tok in txt.split('\n') if tok]
+
+    for se in xml.findall('se'):
+        sent = []
+        sent.extend(punct_tokens(se.text))
+
+        for w in se.findall('w'):
+            ana_elems = w.findall('ana')
+
+            # text after the last annotation is a word
+            word = ana_elems[-1].tail or ''
+
+            annotations = [
+                Annotation(a.get('lex'), a.get('gr'), a.get('joined'))
+                for a in ana_elems
+            ]
+            sent.append(Token(word, annotations))
+            sent.extend(punct_tokens(w.tail))
+
+        sent.extend(punct_tokens(se.tail))
+        yield [t for t in sent if t.text.strip()]
+
+
+def simplify(sents, remove_accents=True, join_split=True,
+             join_hyphenated=True, punct_tag='PNCT'):
+    """
+    Simplify the sentences in ``sents`` (as produced by ``parse_xml``):
+
+    * keep only a single annotation per word part;
+    * annotate punctuation with ``punct_tag``;
+    * join split words into a single token (if ``join_split==True``);
+    * join hyphenated words to a single token (if ``join_hyphenated==True``);
+    * remove accents (if ``remove_accents==True``).
+    """
+
+    def remove_extra_annotations(token):
+        """ force token annotations to be a single-element list """
+        if token.annotations is None:
+            return (token.text, [None])
+        return (token.text, [token.annotations[-1]])
+
+    def _combine_tokens(tokens):
+        text = "".join(t[0] for t in tokens)
+        annotations = [ann for t in tokens for ann in t[1] if ann]
+        return (text, annotations)
+
+    def _join_tokens(sent, accum_size, should_accumulate):
+        accum = []
+        for text, annotations in sent:
+            ann = annotations[0]
+            if should_accumulate(text, ann, accum):
+                accum.append((text, annotations))
+
+                if len(accum) == accum_size:
+                    yield _combine_tokens(accum)
+                    accum = []
+            else:
+                if accum:
+                    warnings.warn("unconsumed tokens: %s" % accum)
+                    for tok in accum:
+                        yield tok
+                    accum = []
+                yield text, annotations
+
+    # a word split across two <w> elements is marked joined="together"
+    join_split_tokens = functools.partial(
+        _join_tokens,
+        accum_size=2,
+        should_accumulate=lambda text, ann, accum: ann and ann.joined == 'together'
+    )
+
+    # a hyphenated word arrives as three tokens: part, unannotated "-", part;
+    # both word parts are marked joined="hyphen"
+    join_hyphenated_tokens = functools.partial(
+        _join_tokens,
+        accum_size=3,
+        should_accumulate=lambda text, ann, accum: (
+            (ann and ann.joined == 'hyphen') or (accum and text.strip() == '-')
+        )
+    )
+
+    def fix_punct_tags(sent):
+        for text, annotations in sent:
+            new_annotations = []
+            for ann in annotations:
+                if ann is None:
+                    ann = Annotation(text, punct_tag, None)
+                new_annotations.append(ann)
+
+            yield text, new_annotations
+
+
+    for sent in sents:
+        sent = map(remove_extra_annotations, sent)
+
+        if remove_accents:
+            sent = [(t[0].replace('`', ''), t[1]) for t in sent]
+
+        if join_split:
+            sent = join_split_tokens(sent)
+
+        if join_hyphenated:
+            sent = join_hyphenated_tokens(sent)
+
+        sent = fix_punct_tags(sent)
+
+        yield [Token(*t) for t in sent]
+
+
+if __name__ == '__main__':
+    import sys
+    for sent in simplify(parse_xml(sys.argv[1])):
+        for tok in sent:
+            print(tok)
+        print("\n")

File setup.py

+#!/usr/bin/env python
+from distutils.core import setup
+
+__version__ = '0.1'
+
+setup(
+    name = 'ruscorpora-tools',
+    version = __version__,
+    author = 'Mikhail Korobov',
+    author_email = 'kmike84@gmail.com',
+    url = 'https://github.com/kmike/ruscorpora-tools/',
+
+    description = 'Python interface to a free corpus subset from ruscorpora.ru',
+    long_description = open('README.rst').read(),
+
+    license = 'MIT license',
+    packages = ['ruscorpora'],
+
+    classifiers=[
+        'Development Status :: 3 - Alpha',
+        'Intended Audience :: Developers',
+        'Intended Audience :: Science/Research',
+        'License :: OSI Approved :: MIT License',
+        'Natural Language :: Russian',
+        'Programming Language :: Python',
+        'Programming Language :: Python :: 2',
+        'Programming Language :: Python :: 2.6',
+        'Programming Language :: Python :: 2.7',
+        'Programming Language :: Python :: 3',
+        'Programming Language :: Python :: 3.2',
+        'Programming Language :: Python :: 3.3',
+        'Programming Language :: Python :: Implementation :: CPython',
+        'Programming Language :: Python :: Implementation :: PyPy',
+        'Topic :: Software Development :: Libraries :: Python Modules',
+        'Topic :: Scientific/Engineering :: Information Analysis',
+        'Topic :: Text Processing :: Linguistic',
+    ],
+)

File tests/test_reader.py

+# -*- coding: utf-8 -*-
+from __future__ import absolute_import, unicode_literals
+import io
+import ruscorpora as rc
+
+def _parse(corpus_xml):
+    corpus = '<?xml version="1.0" encoding="utf-8" ?>\n<corpus>\n%s\n</corpus>' % corpus_xml
+    fp = io.BytesIO(corpus.encode('utf8'))
+    return list(rc.simplify(rc.parse_xml(fp)))
+
+
+def test_simple():
+    corpus = """
+    <se>«
+    <w><ana lex="школа" gr="S,f,inan=sg,nom"></ana>Шк`ола</w>
+     <w><ana lex="злословие" gr="S,n,inan=sg,gen"></ana>злосл`овия</w> » ,-
+    <w><ana lex="сми" gr="S,0=sg,nom"></ana>СМИ</w> !</se>"""
+
+    assert _parse(corpus) == [
+        [
+            ('«', [rc.Annotation(lex='«', gr='PNCT', joined=None)]),
+            ('Школа', [rc.Annotation(lex='школа', gr='S,f,inan=sg,nom', joined=None)]),
+            ('злословия', [rc.Annotation(lex='злословие', gr='S,n,inan=sg,gen', joined=None)]),
+            (' » ,-', [rc.Annotation(lex=' » ,-', gr='PNCT', joined=None)]),
+            ('СМИ', [rc.Annotation(lex='сми', gr='S,0=sg,nom', joined=None)]),
+            (' !', [rc.Annotation(lex=' !', gr='PNCT', joined=None)])
+        ]
+    ]
+
+
+def test_joined_hyphen():
+    corpus = """
+    <se>
+    <w><ana lex="Сегодня" gr="ADV" joined="hyphen"></ana>Сег`одня</w>-<w><ana lex="завтра" gr="ADV" joined="hyphen"></ana>з`автра</w>
+    <w><ana lex="школа" gr="S,f,inan=sg,nom"></ana>шк`ола</w></se>
+    """
+    assert _parse(corpus) == [
+        [
+            ('Сегодня-завтра', [
+                rc.Annotation(lex='Сегодня', gr='ADV', joined='hyphen'),
+                rc.Annotation(lex='завтра', gr='ADV', joined='hyphen')]),
+            ('школа', [rc.Annotation(lex='школа', gr='S,f,inan=sg,nom', joined=None)])
+        ]
+    ]
+
+
+def test_joined_together():
+    corpus = """
+    <se>
+    <w><ana lex="злословие" gr="S,n,inan=sg,gen"></ana>Злосл`овия</w> -
+    <w><ana lex="пол" gr="NUM" joined="together"></ana>пол</w><w><ana lex="дюжина" gr="S,f,inan=sg,gen" joined="together"></ana>дюжины</w>
+    </se>
+    """
+    assert _parse(corpus) == [
+        [
+            ('Злословия', [rc.Annotation(lex='злословие', gr='S,n,inan=sg,gen', joined=None)]),
+            (' -', [rc.Annotation(lex=' -', gr='PNCT', joined=None)]),
+            ('полдюжины', [
+                rc.Annotation(lex='пол', gr='NUM', joined='together'),
+                rc.Annotation(lex='дюжина', gr='S,f,inan=sg,gen', joined='together')])
+        ]
+    ]

File tox.ini

+[tox]
+envlist = py26,py27,py32,py33
+
+[testenv]
+deps=
+    pytest
+
+commands=
+    py.test []