Source

VPlayer / htmldecoder.py

# -*- coding: utf-8 -*-

import re

try:
    import htmlentitydefs
except ImportError:  # Python 3 renamed the module
    import html.entities as htmlentitydefs

def decode_entities(data, encoding=None):
    """Replace HTML character references in ``data`` with their characters.

    Handles named references (``&amp;amp;``, ``&amp;Eacute;``, ``&amp;frac12;``),
    decimal references (``&amp;#233;``) and hexadecimal references
    (``&amp;#xE9;``).  All references are resolved in a single pass, so the
    output of one replacement is never decoded a second time.

    Args:
        data: Text to decode, either a unicode string or a byte string.
        encoding: Only meaningful for byte-string input.  When given, the
            byte string is processed as-is and every replacement character
            is encoded with ``encoding`` before being spliced in.  When
            omitted, a byte string is first decoded as UTF-8.

    Returns:
        A unicode string for unicode input and for byte-string input without
        ``encoding``; a byte string for byte-string input with ``encoding``.
        References that are unknown, out of range, or not representable in
        ``encoding`` are left in the output unchanged.  If a byte string
        without ``encoding`` is not valid UTF-8, a message is printed and
        ``''`` is returned.
    """
    try:
        to_char = unichr  # Python 2
    except NameError:
        to_char = chr  # Python 3 has no unichr

    # On Python 2 ``bytes`` is an alias of ``str``, so this is the same
    # check the function has always made there.
    as_bytes = isinstance(data, bytes)
    if as_bytes and encoding is None:
        try:
            data = data.decode('utf-8')
        except UnicodeDecodeError:
            print('data encoding is not unicode neither utf-8')
            return ''
        as_bytes = False

    def replace_reference(match):
        original = match.group(0)
        decimal, hexadecimal, name = match.groups()
        try:
            if decimal is not None:
                codepoint = int(decimal)
            elif hexadecimal is not None:
                codepoint = int(hexadecimal, 16)
            else:
                if as_bytes:
                    # The pattern only admits ASCII letters and digits here.
                    name = name.decode('ascii')
                codepoint = htmlentitydefs.name2codepoint[name]
            char = to_char(codepoint)
        except (KeyError, ValueError, OverflowError):
            # Unknown name or code point outside the supported range:
            # keep the reference exactly as it was written.
            return original
        if not as_bytes:
            return char
        try:
            return char.encode(encoding)
        except UnicodeEncodeError:
            # The character does not exist in the target encoding.
            return original

    pattern = u'&(?:#([0-9]+)|#[xX]([0-9A-Fa-f]+)|([A-Za-z][A-Za-z0-9]*));'
    if as_bytes:
        # Pattern and subject must be of the same string type.
        pattern = pattern.encode('ascii')
    return re.sub(pattern, replace_reference, data)