Jonathan Eunice avatar Jonathan Eunice committed 6a41f20

initial commit

Comments (0)

Files changed (5)

+
+Installation
+============
+
+::
+
+    pip install namedentities
+    
+Failing that, fall back to the older approach::
+
+    easy_install namedentities
+    
+(You may need to begin these with "sudo " to authorize installation.)
+
+Usage
+=====
+
+Python 2::
+  
+    from namedentities import named_entities
+    
+    u = u'both em\u2014and–dashes…'
+    print named_entities(u)
+    
+Python 3::
+
+    from namedentities import named_entities
+    
+    u = 'both em\u2014and–dashes…'
+    print(named_entities(u))
+    
+Credits
+=======
+
+This is basically a packaging of Ian Beck's work
+(described in http://beckism.com/2009/03/named_entities_python/)
+
+Thank you, Ian!
+
+"""Named HTML entities are much easier to comprehend than numeric entities. This
+module helps convert between the more typical numerical entities and the more
+attractive named entities. """
+
# Primarily a packaging of Ian Beck's work from
# http://beckism.com/2009/03/named_entities_python/

# There are too many little differences in Python 2 and Python 3 string
# handling syntax and semantics to have just one implementation. So there are
# two parallel implementations, multiplexed here.

import sys

# Dispatch on the numeric major version. Comparing the version *string*
# (``sys.version >= '3'``) is lexicographic, so a multi-digit major version
# such as '10' would sort before '3' and select the Python 2 module.
if sys.version_info[0] >= 3:
    from namedentities3 import named_entities, encode_ampersands
else:
    from namedentities2 import named_entities, encode_ampersands
+
+
def test_named_entities():
    """Check that numeric entities come back as their named equivalents.

    The input spells an em dash as the decimal reference ``&#8212;``; the
    expected output spells the same character as ``&mdash;``.
    """
    num_html = " this &#8212;is&#8212;an&#8212; ok?"
    named_html = " this &mdash;is&mdash;an&mdash; ok?"

    assert named_html == named_entities(num_html)


if __name__ == '__main__':
    test_named_entities()

namedentities2.py

+"""Namedentities workhorse for Python 2."""
+
+
+from htmlentitydefs import codepoint2name, name2codepoint
+import re
+import codecs
+
def unescape(text):
    """Convert from HTML entities (named or numeric) to Unicode characters.

    Entities that cannot be resolved (an unknown name, or a numeric reference
    that is malformed or that ``unichr()`` rejects) are left in the text
    unchanged.
    """

    def fixup(m):
        """Given a matched entity, return its Unicode equivalent.  NB this maps
        existing named entities as well."""

        entity = m.group(0)
        if entity[:2] == "&#":
            # character reference
            try:
                if entity[:3] == "&#x":
                    return unichr(int(entity[3:-1], 16))
                return unichr(int(entity[2:-1]))
            except (ValueError, OverflowError):
                # ValueError: the digits do not parse, or the code point is
                # outside the range unichr() supports.
                # OverflowError: the number is too large for unichr() to
                # accept at all; without catching it one bogus reference
                # would abort the whole substitution.
                pass
        else:
            # named entity
            try:
                return unichr(name2codepoint[entity[1:-1]])
            except KeyError:
                pass
        return entity  # leave as is

    return re.sub(r"&#?\w+;", fixup, text)
+    
+    
def named_entities_codec(text):
    """Encode codec that converts Unicode characters into named entities (where
    the names are known), or failing that, numerical entities.

    Registered just below as the ``'named_entities'`` codec error handler.
    Despite its name, ``text`` is the encoding error being handled: the
    characters to replace are ``text.object[text.start:text.end]``.

    Returns a ``(replacement, position)`` tuple, where ``position`` is
    ``text.end``.

    Raises TypeError if ``text`` is not a UnicodeEncodeError or
    UnicodeTranslateError.
    """
    if not isinstance(text, (UnicodeEncodeError, UnicodeTranslateError)):
        # ``text`` is an instance here, and instances have no ``__name__``;
        # the class name must be read from its type.
        raise TypeError("Can't handle %s" % type(text).__name__)

    replacements = []
    for char in text.object[text.start:text.end]:
        codepoint = ord(char)
        if codepoint in codepoint2name:
            replacements.append(u'&%s;' % codepoint2name[codepoint])
        else:
            replacements.append(u'&#%s;' % codepoint)
    return u''.join(replacements), text.end


codecs.register_error('named_entities', named_entities_codec)
+    
+
def named_entities(text):
    """Return ``text`` with its HTML entities re-expressed via the
    'named_entities' codec error handler.

    Two passes: ``unescape`` first resolves the entities in the input to
    Unicode characters, then the result is encoded to ASCII with the
    ``'named_entities'`` error handler dealing with every character ASCII
    cannot represent.
    """
    return unescape(text).encode('ascii', 'named_entities')
+    
+    
def encode_ampersands(text):
    """Encode bare ampersands into ``&amp;``.

    An ampersand that already starts an entity (a named entity, a decimal
    reference such as ``&#38;``, or a hex reference such as ``&#x26;``) is
    left alone, so already-encoded text is not double-encoded.
    """
    # The negative lookahead skips any '&' that is followed by a complete
    # entity body and its terminating ';'.
    return re.sub(r'&(?!([a-zA-Z0-9]+|#[0-9]+|#x[0-9a-fA-F]+);)', '&amp;', text)
+

namedentities3.py

+"""Namedentities workhorse for Python 3."""
+
+
+from html.entities import codepoint2name, name2codepoint
+import re
+import codecs
+
def unescape(text):
    """Convert from HTML entities (named or numeric) to Unicode characters.

    Entities that cannot be resolved (an unknown name, or a numeric reference
    that is malformed or that ``chr()`` rejects) are left in the text
    unchanged.
    """

    def fixup(m):
        """Given a matched entity, return its Unicode equivalent.  NB this maps
        existing named entities as well."""

        entity = m.group(0)
        if entity[:2] == "&#":
            # character reference
            try:
                if entity[:3] == "&#x":
                    return chr(int(entity[3:-1], 16))
                return chr(int(entity[2:-1]))
            except (ValueError, OverflowError):
                # ValueError: the digits do not parse, or the code point is
                # outside the range chr() supports.
                # OverflowError: the number is too large for chr() to accept
                # at all; without catching it one bogus reference would abort
                # the whole substitution.
                pass
        else:
            # named entity
            try:
                return chr(name2codepoint[entity[1:-1]])
            except KeyError:
                pass
        return entity  # leave as is

    return re.sub(r"&#?\w+;", fixup, text)
+    
+    
def named_entities_codec(text):
    """Encode codec that converts Unicode characters into named entities (where
    the names are known), or failing that, numerical entities.

    Registered just below as the ``'named_entities'`` codec error handler.
    Despite its name, ``text`` is the encoding error being handled: the
    characters to replace are ``text.object[text.start:text.end]``.

    Returns a ``(replacement, position)`` tuple, where ``position`` is
    ``text.end``.

    Raises TypeError if ``text`` is not a UnicodeEncodeError or
    UnicodeTranslateError.
    """
    if not isinstance(text, (UnicodeEncodeError, UnicodeTranslateError)):
        # ``text`` is an instance here, and instances have no ``__name__``;
        # the class name must be read from its type.
        raise TypeError("Can't handle {}".format(type(text).__name__))

    replacements = []
    for char in text.object[text.start:text.end]:
        codepoint = ord(char)
        if codepoint in codepoint2name:
            replacements.append('&{};'.format(codepoint2name[codepoint]))
        else:
            replacements.append('&#{};'.format(codepoint))
    return ''.join(replacements), text.end


codecs.register_error('named_entities', named_entities_codec)
+    
+
def named_entities(text):
    """Given a string, convert its numerical HTML entities to named HTML
    entities. Works by converting the entire string to Unicode characters, then
    re-encoding Unicode characters into named entities (where the names are
    known), or failing that, numerical entities.

    Returns a ``str``.
    """
    unescaped_text = unescape(text)
    encoded = unescaped_text.encode('ascii', 'named_entities')
    # On Python 3 encode() produces bytes. Decode back to text so the result
    # compares equal to, and prints like, an ordinary string.
    return encoded.decode('ascii')
+    
+    
def encode_ampersands(text):
    """Encode bare ampersands into ``&amp;``.

    An ampersand that already starts an entity (a named entity, a decimal
    reference such as ``&#38;``, or a hex reference such as ``&#x26;``) is
    left alone, so already-encoded text is not double-encoded.
    """
    # The negative lookahead skips any '&' that is followed by a complete
    # entity body and its terminating ';'.
    return re.sub(r'&(?!([a-zA-Z0-9]+|#[0-9]+|#x[0-9a-fA-F]+);)', '&amp;', text)
+
+#! /usr/bin/env python
+
+from setuptools import setup
+
# The README doubles as the package's long description. The context manager
# closes the file even if the read fails.
with open('README.txt', 'r') as readme:
    README_TEXT = readme.read()

setup(
    name='namedentities',
    version='1.0',
    author='Jonathan Eunice',
    author_email='jonathan.eunice@gmail.com',
    description='Simple way to convert numeric HTML entities to far more readable named entities.',
    long_description=README_TEXT,
    url='http://bitbucket.org/jeunice/namedentities',
    # namedentities imports namedentities2 or namedentities3 depending on the
    # running Python version, so both implementation modules must be shipped.
    py_modules=['namedentities', 'namedentities2', 'namedentities3'],
    install_requires=[],
    classifiers=[
        'Development Status :: 4 - Beta',
        'Operating System :: OS Independent',
        'License :: OSI Approved :: BSD License',
        'Intended Audience :: Developers',
        'Environment :: Web Environment',
        'Programming Language :: Python',
        'Programming Language :: Python :: 2',
        'Programming Language :: Python :: 3',
        'Topic :: Text Processing :: Filters',
        'Topic :: Text Processing :: Markup :: HTML'
    ]
)
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.