# contentbrowser / src /

# -*- coding: utf-8 -*-
import re
import urlparse

# Matches an opening or closing HTML tag. Attribute values are consumed as
# quoted units so that a ">" inside a quoted value does not end the tag early.
strip_tags_re = re.compile(
    r'</?\S([^=]*=(\s*"[^"]*"|\s*\'[^\']*\'|\S*)|[^>])*?>', re.IGNORECASE)

# Captures the charset declared by a <meta> tag. Both forms are supported:
#   <meta charset="utf-8">                               (HTML5)
#   <meta http-equiv=... content="text/html; charset=x"> (HTML4)
# `[^>]*?` keeps the search inside a single tag and, unlike `.`, also spans
# newlines. The optional quote after "=" allows quoted values, and the
# captured name stops at whitespace, quotes, "/", ">" or ";" so that
# unquoted values do not swallow the rest of the document.
meta_encoding_re = re.compile(
    r'<meta\b[^>]*?\bcharset\s*=\s*["\']?\s*([^\s"\'/>;]+)', re.IGNORECASE)

def strip_tags(content):
    """Return `content` with every HTML tag replaced by a single space.

    Tags are located with the module-level `strip_tags_re` pattern; the
    text between tags is left untouched.
    """
    without_tags = re.sub(strip_tags_re, ' ', content)
    return without_tags

def guess_encoding(response):
    """Return the best-guess encoding of an HTML `response` (requests).

    The encoding announced by the response is trusted unless it is
    'ISO-8859-1'. In that case the HTML body is searched for a charset
    declared in a <meta> tag, and `response.apparent_encoding` is used as
    the last resort when no such declaration exists.
    """
    declared = response.encoding
    if declared != 'ISO-8859-1':
        return declared

    # ISO-8859-1 is the default fallback for the text/html content-type,
    # so it may not reflect the real encoding: look at the HTML's own
    # dedicated meta declaration first.
    # NOTE(review): the original comment warned that response.text MUST be
    # re-evaluated -- presumably after the encoding is corrected; confirm
    # against callers.
    declaration = meta_encoding_re.search(response.text)
    if declaration is not None:
        return declaration.group(1)

    # Nothing declared in the markup: guess from the content itself.
    return response.apparent_encoding

def is_valid_url(url):
    """Tell whether `url` uses an accepted scheme (http or https).

    Only the scheme is inspected; the rest of the URL is not validated.
    """
    scheme = urlparse.urlparse(url).scheme
    return scheme == 'http' or scheme == 'https'

def get_hostname(url):
    """Return the network location part of `url`.

    This is the parsed `netloc`, so it includes any port or credentials
    present in the URL, and is an empty string when the URL has none.
    """
    parsed = urlparse.urlparse(url)
    return parsed.netloc

def highlights(hit, column_name):
    """Return the highlighted extract of `column_name` for a search `hit`.

    Delegates to the hit's own `highlights` method.
    """
    extract = hit.highlights(column_name)
    return extract