Commits

Mikhail Korobov committed c03e7a2

initial import

Comments (0)

Files changed (9)

+#projects
+\.idea
+
+#temp files
+\.pyc
+\.orig
+
+#os files
+\.DS_Store
+Thumbs\.db
+
+#project-specific files
+\.tox
+MANIFEST$
+^build
+^dist
+include README.rst
+include _dump.pl
+include src/text_unidecode/data.bin
+include test_unidecode.py
+Text-Unidecode
+==============
+
+text-unidecode is the most basic port of the
+`Text::Unidecode <http://search.cpan.org/~sburke/Text-Unidecode-0.04/lib/Text/Unidecode.pm>`_
+Perl library.
+
+There are other Python ports of Text::Unidecode (unidecode_
+and isounidecode_). unidecode_ is GPL; isounidecode_ doesn't support
+Python 3 and uses too much memory.
+
+This port is licensed under `Artistic License`_ and supports both
+Python 2.x and 3.x. If you're OK with GPL, use unidecode_ (it has
+better memory usage and better transliteration quality).
+
+.. _unidecode: http://pypi.python.org/pypi/Unidecode/
+.. _isounidecode: http://pypi.python.org/pypi/isounidecode/
+.. _Artistic License: http://opensource.org/licenses/Artistic-Perl-1.0
+
+Installation
+------------
+
+::
+
+    pip install text-unidecode
+
+Usage
+-----
+
+::
+
+    >>> from text_unidecode import unidecode
+    >>> unidecode(u'какой-то текст')
+    u'kakoi-to tiekst'
+
+#!/usr/bin/env perl
+# Dump Text::Unidecode's transliteration of every code point from 1 to 65534
+# to stdout, UTF-8 encoded, with a NUL byte terminating each entry.
+use Text::Unidecode;
+use Encode;
+
+for ($c=1; $c<65535; $c++){ # limit ourselves to narrow python builds
+    $trans = unidecode(chr($c));
+    print encode("utf8", "$trans\x00");
+}
+
+# usage: perl _dump.pl > src/text_unidecode/data.bin
+#! /usr/bin/env python
+from distutils.core import setup
+
+__version__ = '0.1'
+
+setup(
+    name="text-unidecode",
+    version=__version__,
+    description="The most basic Text::Unidecode port",
+    long_description = open('README.rst').read(),
+    license = 'Artistic License',
+    author='Mikhail Korobov',
+    author_email='kmike84@gmail.com',
+
+    url = 'https://github.com/kmike/text-unidecode/',
+
+    package_dir = {'': 'src'},
+    packages = ['text_unidecode'],
+    package_data = {'text_unidecode': ['data.bin']},
+
+    classifiers=[
+        'Development Status :: 3 - Alpha',
+        'Intended Audience :: Developers',
+        'License :: OSI Approved :: Artistic License',
+        'Programming Language :: Python',
+        'Programming Language :: Python :: 2',
+        'Programming Language :: Python :: 2.6',
+        'Programming Language :: Python :: 2.7',
+        'Programming Language :: Python :: 3',
+        'Programming Language :: Python :: 3.2',
+        'Programming Language :: Python :: 3.3',
+        'Programming Language :: Python :: Implementation :: CPython',
+        'Programming Language :: Python :: Implementation :: PyPy',
+        'Topic :: Software Development :: Libraries :: Python Modules',
+        'Topic :: Text Processing :: Linguistic',
+    ],
+)

src/text_unidecode/__init__.py

+# -*- coding: utf-8 -*-
+from __future__ import absolute_import, unicode_literals
+import os
+
+# Load the replacement table once at import time. data.bin is read as bytes,
+# decoded as UTF-8 and split on NUL, giving a list of replacement strings
+# that unidecode() indexes with ``codepoint - 1``.
+_data_path = os.path.join(os.path.dirname(__file__), 'data.bin')
+with open(_data_path, 'rb') as f:
+    _replaces = f.read().decode('utf8').split('\x00')
+
+def unidecode(txt):
+    chars = []
+    for ch in txt:
+        codepoint = ord(ch)
+
+        if not codepoint:
+            chars.append('\x00')
+            continue
+
+        try:
+            chars.append(_replaces[codepoint-1])
+        except IndexError:
+            pass
+    return "".join(chars)

src/text_unidecode/data.bin

Binary file added.

test_unidecode.py

+# -*- coding: utf-8 -*-
+from __future__ import absolute_import, unicode_literals
+from text_unidecode import unidecode
+import pytest
+
+# Sample non-ASCII inputs (French, Russian, Slovene, Chinese) paired with the
+# exact ASCII transliteration unidecode() is expected to return.
+@pytest.mark.parametrize(("text", "result"), [
+    ("Programmes de publicité - Solutions d'entreprise", "Programmes de publicite - Solutions d'entreprise"),
+    ("Транслитерирует и русский", "Translitieriruiet i russkii"),
+    ("kožušček", "kozuscek"),
+    ("北亰", "Bei Jing "),
+])
+def test_transliterate(text, result):
+    assert unidecode(text) == result
+
+
+# Every single 7-bit character (code points 0-127) must come back unchanged.
+@pytest.mark.parametrize("code", range(128))
+def test_7bit_purity(code):
+    ch = chr(code)
+    assert unidecode(ch) == ch
+
+def test_7bit_text_purity():
+    # A string containing all 128 7-bit characters at once must also be
+    # returned unchanged.
+    txt = "".join([chr(x) for x in range(128)])
+    assert unidecode(txt) == txt
+[tox]
+envlist = py26,py27,py32,py33,pypy
+
+[testenv]
+deps =
+    pytest
+commands=
+    py.test []