Commits

Jonathan Eunice  committed 1a5e1fe

reorganized

  • Participants
  • Parent commits 7ad42cd

Comments (0)

Files changed (15)

+Named HTML entities are much neater and much
+easier to comprehend than numeric entities. And because they
+fall within the ASCII range, they're 
+much safer
+to use in multiple contexts
+than Unicode and its various encodings (UTF-8 and such).
 
-Installation
-============
-
-::
-
-    pip install namedentities
-        
-(You may need to prefix this with "sudo " to authorize installation.)
-
-**NOTA BENE** Code runs successfully under Python 3, but packaging
-seemingly doesn't work as yet.
+This
+module helps convert from numerical HTML entities and Unicode characters that
+fall outside the normal ASCII range into named entities.
 
 Usage
 =====
     u = u'both em\u2014and&#x2013;dashes&hellip;'
     print named_entities(u)
     
+yields::
+
+    both em&mdash;and&ndash;dashes&hellip;
+    
 Python 3::
 
     from namedentities import named_entities
     
     u = 'both em\u2014and&#x2013;dashes&hellip;'
     print(named_entities(u))
+    # same result
 
+Or using the `six <http://pypi.python.org/pypi/six>`_ cross-version compatibility
+library, either one::
+
+    from namedentities import named_entities
+    import six
     
-History
-=======
+    u = six.u('both em\u2014and&#x2013;dashes&hellip;')
+    six.print_(named_entities(u))
+    # same result
 
- * 1.0.8 No longer attempts to encode ``&lt;``, ``&gt;``, or 
-   ``&amp;`` (or thier numerical equivalents) to avoid
+Recent Changes
+==============
+ 
+ * Now
+   successfully packaged for, and tested against, Python 2.5, 2.6, 2.7, 3.2, and 3.3.
+   
+ * Commenced automated multi-version testing with
+   `pytest <http://pypi.python.org/pypi/pytest>`_
+   and `tox <http://pypi.python.org/pypi/tox>`_.
+   
+Notes
+=====
+   
+ * Doesn't attempt to encode ``&lt;``, ``&gt;``, or 
+   ``&amp;`` (or their numerical equivalents) to avoid
    interfering with HTML escaping.
 
- * This is basically a packaging of Ian Beck's work
-   (described in http://beckism.com/2009/03/named_entities_python/)
+ * This is basically a packaging of `Ian Beck's work
+   <http://beckism.com/2009/03/named_entities_python/>`_. Thank you, Ian!
 
-   Thank you, Ian!
+Installation
+============
 
+::
 
+    pip install namedentities
+
+To ``easy_install`` under a specific Python version (3.3 in this example)::
+
+    python3.3 -m easy_install namedentities
+    
+(You may need to prefix these with "sudo " to authorize installation.)

File namedentities.py

+"""
+Named HTML entities are much easier to comprehend than numeric entities. This
+module helps convert between the more typical numerical entities and the more
+attractive named entities.
+"""
+
+# Primarily a packaging of Ian Beck's work from
+# http://beckism.com/2009/03/named_entities_python/
+
+# There are too many little differences in Python 2 and Python 3 string handling
+# syntax and semantics to easily have just one implementation. So there are two
+# (very similar) parallel implementations, multiplexed here.
+
+import sys
+if sys.version_info[0] >= 3:
+    from namedentities3 import named_entities, encode_ampersands
+else:
+    from namedentities2 import named_entities, encode_ampersands

File namedentities/__init__.py

-"""Named HTML entities are much easier to comprehend than numeric entities. This
-module helps convert between the more typical numerical entiies and the more
-attractive named entities. """
-
-# Primarily a packaging of Ian Beck's work from
-# http://beckism.com/2009/03/named_entities_python/
-
-# There are too many little differences in Python 2 and Python 3 string
-# handling syntax and symantics to have just one implementation. So there are
-# two parallel implementations, multiplexed here.
-
-import sys
-if sys.version >= '3':
-    from ne3 import named_entities, encode_ampersands
-else:
-    from ne2 import named_entities, encode_ampersands
-
-
-def test_named_entities():
-    """Give it a run."""
-    
-    num_html   = " this &#x2014;is&#8212;an&mdash; ok?"
-    named_html = " this &mdash;is&mdash;an&mdash; ok?"
-   
-    assert named_html == named_entities(num_html)
-  
-  
-if __name__ == '__main__':
-    test_named_entities()

File namedentities/e2.py

-from ne2 import named_entities
- 
-u = u'both em\u2014and&#x2013;dashes&hellip;'
-print named_entities(u)

File namedentities/e3.py

-from ne2 import named_entities
-
-u = 'both em\u2014and&#x2013;dashes&hellip;'
-print(named_entities(u))

File namedentities/e3a.py

-from ne3 import named_entities
-
-u = 'both em\u2014and&#x2013;dashes&hellip;'
-print(named_entities(u))

File namedentities/example2.py

-from namedentities import named_entities
- 
-u = u'both em\u2014and&#x2013;dashes&hellip;'
-print named_entities(u)

File namedentities/example3.py

-from namedentities import named_entities
-
-u = 'both em\u2014and&#x2013;dashes&hellip;'
-print(named_entities(u))
-
-both em\u2014and&ndash;dashes&hellip;

File namedentities/ne2.py

-"""Namedentities workhorse for Python 2."""
-
-from htmlentitydefs import codepoint2name, name2codepoint
-import re
-import codecs
-
-def unescape(text):
-    """Convert from HTML entities (named or numeric) to Unicode characters."""
-    
-    def fixup(m):
-        """Given an HTML entity (named or numeric), return its Unicode
-        equivalent. Does not, however, unescape &lt; &gt; and &amp; (decimal 60,
-        62, and 38). Those are 'special' in that they are often escaped for very
-        important, specific reasons (e.g. to describe HTML within HTML). Any
-        messing with them is likely to break things badly."""
-        
-        text = m.group(0)
-        if text[:2] == "&#":            # numeric entity
-            try:
-                codepoint = int(text[3:-1], 16) if text[:3] == "&#x" else int(text[2:-1])
-                if codepoint != 38 and codepoint != 60 and codepoint != 62:
-                    return unichr(codepoint)
-            except ValueError:
-                pass
-        else:                           # named entity
-            try:
-                codepoint = name2codepoint[text[1:-1]]
-                if codepoint != 38 and codepoint != 60 and codepoint != 62:
-                    return unichr(codepoint)
-            except KeyError:
-                pass
-        return text # leave as is
-    return re.sub("&#?\w+;", fixup, text)
-    
-    
-def named_entities_codec(text):
-    """Encode codec that converts Unicode characters into named entities (where
-    the names are known), or failing that, numerical entities."""
-    
-    if isinstance(text, (UnicodeEncodeError, UnicodeTranslateError)):
-        s = []
-        for c in text.object[text.start:text.end]:
-            if ord(c) in codepoint2name:
-                s.append(u'&%s;' % codepoint2name[ord(c)])
-            else:
-                s.append(u'&#%s;' % ord(c))
-        return ''.join(s), text.end
-    else:
-        raise TypeError("Can't handle %s" % text.__name__)
-
-
-codecs.register_error('named_entities', named_entities_codec)
-    
-
-def named_entities(text):
-    """Given a string, convert its numerical HTML entities to named HTML
-    entities. Works by converting the entire string to Unicode characters, then
-    re-encoding Unicode characters into named entities (where the names are
-    known), or failing that, numerical entities."""
-    
-    unescaped_text = unescape(text)
-    return unescaped_text.encode('ascii', 'named_entities')
-    
-    
-def encode_ampersands(text):
-    """Encode ampersands into &amp;"""
-    
-    text = re.sub('&(?!([a-zA-Z0-9]+|#[0-9]+|#x[0-9a-fA-F]+);)', '&amp;', text)
-    return text
-

File namedentities/ne3.py

-"""Namedentities workhorse for Python 3."""
-
-from html.entities import codepoint2name, name2codepoint
-import re
-import codecs
-
-def unescape(text):
-    """Convert from HTML entities (named or numeric) to Unicode characters."""
-    
-    def fixup(m):
-        """Given an HTML entity (named or numeric), return its Unicode
-        equivalent. Does not, however, unescape &lt; &gt; and &amp; (decimal 60,
-        62, and 38). Those are 'special' in that they are often escaped for very
-        important, specific reasons (e.g. to describe HTML within HTML). Any
-        messing with them is likely to break things badly."""
-        
-        text = m.group(0)
-        if text[:2] == "&#":            # numeric entity
-            try:
-                codepoint = int(text[3:-1], 16) if text[:3] == "&#x" else int(text[2:-1])
-                if codepoint != 38 and codepoint != 60 and codepoint != 62:
-                    return chr(codepoint)
-            except ValueError:
-                pass
-        else:                           # named entity
-            try:
-                codepoint = name2codepoint[text[1:-1]]
-                if codepoint != 38 and codepoint != 60 and codepoint != 62:
-                    return chr(codepoint)
-            except KeyError:
-                pass
-        return text # leave as is
-    return re.sub("&#?\w+;", fixup, text)
-    
-    
-def named_entities_codec(text):
-    """Encode codec that converts Unicode characters into named entities (where
-    the names are known), or failing that, numerical entities."""
-    
-    if isinstance(text, (UnicodeEncodeError, UnicodeTranslateError)):
-        s = []
-        for c in text.object[text.start:text.end]:
-            if ord(c) in codepoint2name:
-                s.append('&{};'.format(codepoint2name[ord(c)]))
-            else:
-                s.append('&#{};'.format(ord(c)))
-        return ''.join(s), text.end
-    else:
-        raise TypeError("Can't handle {}".format(text.__name__))
-
-
-codecs.register_error('named_entities', named_entities_codec)
-    
-
-def named_entities(text):
-    """Given a string, convert its numerical HTML entities to named HTML
-    entities. Works by converting the entire string to Unicode characters, then
-    re-encoding Unicode characters into named entities (where the names are
-    known), or failing that, numerical entities."""
-    
-    unescaped_text = unescape(text)
-    entities_text = unescaped_text.encode('ascii', 'named_entities')
-    return entities_text.decode("ascii", "strict")
-    
-    
-def encode_ampersands(text):
-    """Encode ampersands into &amp;"""
-    
-    text = re.sub('&(?!([a-zA-Z0-9]+|#[0-9]+|#x[0-9a-fA-F]+);)', '&amp;', text)
-    return text
-

File namedentities2.py

+"""Namedentities workhorse for Python 2."""
+
+from htmlentitydefs import codepoint2name, name2codepoint
+import re
+import codecs
+
+def unescape(text):
+    """
+    Convert from HTML entities (named or numeric) to Unicode characters.
+    """
+    
+    def fixup(m):
+        """
+        Given an HTML entity (named or numeric), return its Unicode
+        equivalent. Does not, however, unescape &lt; &gt; and &amp; (decimal 60,
+        62, and 38). Those are 'special' in that they are often escaped for very
+        important, specific reasons (e.g. to describe HTML within HTML). Any
+        messing with them is likely to break things badly.
+        """
+        
+        text = m.group(0)
+        if text[:2] == "&#":            # numeric entity
+            try:
+                codepoint = int(text[3:-1], 16) if text[:3] == "&#x" else int(text[2:-1])
+                if codepoint != 38 and codepoint != 60 and codepoint != 62:
+                    return unichr(codepoint)
+            except ValueError:
+                pass
+        else:                           # named entity
+            try:
+                codepoint = name2codepoint[text[1:-1]]
+                if codepoint != 38 and codepoint != 60 and codepoint != 62:
+                    return unichr(codepoint)
+            except KeyError:
+                pass
+        return text # leave as is
+    return re.sub("&#?\w+;", fixup, text)
+    
+    
+def named_entities_codec(text):
+    """
+    Encode codec that converts Unicode characters into named entities (where
+    the names are known), or failing that, numerical entities.
+    """
+    
+    if isinstance(text, (UnicodeEncodeError, UnicodeTranslateError)):
+        s = []
+        for c in text.object[text.start:text.end]:
+            if ord(c) in codepoint2name:
+                s.append(u'&%s;' % codepoint2name[ord(c)])
+            else:
+                s.append(u'&#%s;' % ord(c))
+        return ''.join(s), text.end
+    else:
+        raise TypeError("Can't handle %s" % text.__name__)
+
+
+codecs.register_error('named_entities', named_entities_codec)
+    
+
+def named_entities(text):
+    """
+    Given a string, convert its numerical HTML entities to named HTML
+    entities. Works by converting the entire string to Unicode characters, then
+    re-encoding Unicode characters into named entities (where the names are
+    known), or failing that, numerical entities.
+    """
+    
+    unescaped_text = unescape(text)
+    return unescaped_text.encode('ascii', 'named_entities')
+    
+    
+def encode_ampersands(text):
+    """Encode ampersands into &amp;"""
+    
+    text = re.sub('&(?!([a-zA-Z0-9]+|#[0-9]+|#x[0-9a-fA-F]+);)', '&amp;', text)
+    return text
+

File namedentities3.py

+"""Namedentities workhorse for Python 3."""
+
+from html.entities import codepoint2name, name2codepoint
+import re
+import codecs
+
+def unescape(text):
+    """
+    Convert from HTML entities (named or numeric) to Unicode characters.
+    """
+    
+    def fixup(m):
+        """
+        Given an HTML entity (named or numeric), return its Unicode
+        equivalent. Does not, however, unescape &lt; &gt; and &amp; (decimal 60,
+        62, and 38). Those are 'special' in that they are often escaped for very
+        important, specific reasons (e.g. to describe HTML within HTML). Any
+        messing with them is likely to break things badly.
+        """
+        
+        text = m.group(0)
+        if text[:2] == "&#":            # numeric entity
+            try:
+                codepoint = int(text[3:-1], 16) if text[:3] == "&#x" else int(text[2:-1])
+                if codepoint != 38 and codepoint != 60 and codepoint != 62:
+                    return chr(codepoint)
+            except ValueError:
+                pass
+        else:                           # named entity
+            try:
+                codepoint = name2codepoint[text[1:-1]]
+                if codepoint != 38 and codepoint != 60 and codepoint != 62:
+                    return chr(codepoint)
+            except KeyError:
+                pass
+        return text # leave as is
+    return re.sub("&#?\w+;", fixup, text)
+    
+    
+def named_entities_codec(text):
+    """
+    Encode codec that converts Unicode characters into named entities (where
+    the names are known), or failing that, numerical entities.
+    """
+    
+    if isinstance(text, (UnicodeEncodeError, UnicodeTranslateError)):
+        s = []
+        for c in text.object[text.start:text.end]:
+            if ord(c) in codepoint2name:
+                s.append('&{};'.format(codepoint2name[ord(c)]))
+            else:
+                s.append('&#{};'.format(ord(c)))
+        return ''.join(s), text.end
+    else:
+        raise TypeError("Can't handle {}".format(text.__name__))
+
+
+codecs.register_error('named_entities', named_entities_codec)
+    
+
+def named_entities(text):
+    """
+    Given a string, convert its numerical HTML entities to named HTML
+    entities. Works by converting the entire string to Unicode characters, then
+    re-encoding Unicode characters into named entities (where the names are
+    known), or failing that, numerical entities.
+    """
+    
+    unescaped_text = unescape(text)
+    entities_text = unescaped_text.encode('ascii', 'named_entities')
+    return entities_text.decode("ascii", "strict")
+    
+    
+def encode_ampersands(text):
+    """
+    Encode ampersands into &amp;
+    """
+    
+    text = re.sub('&(?!([a-zA-Z0-9]+|#[0-9]+|#x[0-9a-fA-F]+);)', '&amp;', text)
+    return text
+
 
 setup(
     name='namedentities',
-    version='1.0.9',
+    version='1.2',
     author='Jonathan Eunice',
     author_email='jonathan.eunice@gmail.com',
     description='Simple way to convert numeric HTML entities to far more readable named entities.',
     long_description=open('README.rst').read(),
     url='http://bitbucket.org/jeunice/namedentities',
-    packages = find_packages(),
+    py_modules=['namedentities', 'namedentities2', 'namedentities3'],
     install_requires=[],
     tests_require = ['tox', 'pytest','six'],
     zip_safe = True,
         'Intended Audience :: Developers',
         'Environment :: Web Environment',
         'Programming Language :: Python',
+        'Programming Language :: Python :: 2.6',
+        'Programming Language :: Python :: 2.5',
         'Programming Language :: Python :: 2.7',
+        'Programming Language :: Python :: 3.2',
+        'Programming Language :: Python :: 3.3',
         'Topic :: Text Processing :: Filters',
         'Topic :: Text Processing :: Markup :: HTML'
     ]
-)
+)

File test/test.py

-
 
 import six
 from namedentities import named_entities
+import sys
 
-def test_ne():
+
+def _print(*args, **kwargs):
+    """
+    Python 2 and 3 compatible print function, similar to Python 3 arg handling.
+    """
+    sep = kwargs.get('sep', ' ')
+    end = kwargs.get('end', '\n')
+    f   = kwargs.get('file', sys.stdout)
+    parts = [str(item) for item in args ]
+    parts.append(end)
+    f.write(sep.join(parts))
+    
+def test_unicode():
+    u = six.u('both em\u2014and')
+    assert named_entities(u) == six.u("both em&mdash;and")
+
+def test_numeric_entity():
+    u = six.u('and&#x2013;dashes')
+    assert named_entities(u) == six.u("and&ndash;dashes")
+    
+def test_unicode_and_numeric():
     u = six.u('both em\u2014and&#x2013;dashes&hellip;')
-    assert named_entities(u) == six.u("both em\u2014and&ndash;dashes&hellip;")
+    assert named_entities(u) == six.u("both em&mdash;and&ndash;dashes&hellip;")
+
+def test_six_print_example(capsys):
+    u = six.u('both em\u2014and&#x2013;dashes&hellip;')
+    six.print_(named_entities(u))
+    out, err = capsys.readouterr()
+    assert out.startswith("both em&mdash;and&ndash;dashes&hellip;")
 [tox]
-envlist = py26, py27, py32, py33
+envlist = py25, py26, py27, py32, py33
 
 [testenv]
 changedir=test