# Source
#
# superwiki / superwiki / html_diff.py
#
# Full commit
import re
import difflib
from copy import copy

try:
    from BeautifulSoup import BeautifulSoup
    use_beautiful_soup = True
except ImportError:
    use_beautiful_soup = False

split_html_regex = re.compile(r"""
    # Regular expression to split on tags, entities, words, numbers and
    # punctuation. The whole thing is wrapped in a capturing group so that
    # re.split keeps the matched tokens in its result instead of dropping
    # them.
    (
        <.*?>                 # html open or close tag
        |&\#?[a-zA-Z0-9]+;    # html entity (named, decimal or hexadecimal)
        |[a-zA-Z]+            # a word
        |\d+                  # a number
        |[^\w\d\s<&]+         # punctuation
    )
""", re.VERBOSE)

# Every spelling of a non-breaking-space reference: the named entity plus the
# decimal (&#160;) and hexadecimal (&#xA0;) forms, in either letter case and
# with optional leading zeros.
nbsp_regex = re.compile(r'&(?:nbsp|#0*160|#[xX]0*[aA]0);')


def split_html(html):
    """
    Split a string of html into a flat list of tokens suitable for diffing.

    Tokens are whole tags, entities, words, numbers and runs of punctuation.
    Anything the tokenizer does not recognise (whitespace, underscores, a
    stray ``<`` or ``&``) is kept as its own list item, so joining the
    result reproduces the input apart from the two normalisations below:

    * non-breaking-space references are replaced by a plain space;
    * every item made up only of whitespace is collapsed to a single space.

    Returns a list of non-empty strings (an empty list for empty input).

    >>> split_html('<p>Hello, world!</p>')
    ['<p>', 'Hello', ',', ' ', 'world', '!', '</p>']
    >>> split_html('a&#xa0;b')
    ['a', ' ', 'b']
    """
    # Treat non-breaking spaces as ordinary whitespace so that swapping one
    # for the other does not show up as a text change.
    html = nbsp_regex.sub(' ', html)

    # re.split yields empty strings between adjacent tokens; drop them.
    tokens = [s for s in split_html_regex.split(html) if s]

    # Normalize whitespace: any whitespace-only item becomes one space.
    return [' ' if s.isspace() else s for s in tokens]


def html_diff(old_text, new_text):
    """
    Build an inline diff of two html strings.

    Both inputs are tokenized with split_html and compared with
    difflib.SequenceMatcher. Added sections that contain text are wrapped
    in <ins>...</ins> and removed sections that contain text are wrapped in
    <del>...</del>; changed sections without any text (markup only) are
    emitted as they are. For a replacement the new section is emitted
    before the old one.

    When BeautifulSoup could be imported, both inputs and the final result
    are passed through its prettify().

    Returns a tuple ``(diff_html, has_text_changes)`` where the second item
    is True if at least one inserted or deleted section contained text.
    """
    if use_beautiful_soup:
        old_text = BeautifulSoup(old_text).prettify()
        new_text = BeautifulSoup(new_text).prettify()

    old_tokens = split_html(old_text)
    new_tokens = split_html(new_text)
    matcher = difflib.SequenceMatcher(None, old_tokens, new_tokens)

    pieces = []
    text_changed = False

    for op, old_lo, old_hi, new_lo, new_hi in matcher.get_opcodes():
        if op == 'equal':
            pieces.append(''.join(old_tokens[old_lo:old_hi]))
            continue

        # Collect the sections to mark up for this opcode, insertion first
        # so that a 'replace' shows the new text ahead of the old text.
        marked = []
        if op in ('replace', 'insert'):
            marked.append((''.join(new_tokens[new_lo:new_hi]), '<ins>', '</ins>'))
        if op in ('replace', 'delete'):
            marked.append((''.join(old_tokens[old_lo:old_hi]), '<del>', '</del>'))

        for section, opening, closing in marked:
            if has_text(section):
                text_changed = True
                pieces.append(wrap_html(section, opening, closing))
            else:
                # Markup-only change: pass it through without marking it.
                pieces.append(section)

    result = '\n'.join(pieces)
    if use_beautiful_soup:
        result = BeautifulSoup(result).prettify()
    return result, text_changed


# Any tag; the capturing group makes re.split keep the tags in its result.
tag_regex = re.compile(r'(<.*?>)')
# An opening tag: does not start with "</" and does not end with "/>".
# Slashes elsewhere in the tag (e.g. in an href value) are allowed.
open_tag_regex = re.compile(r'<(?!/)[^>]*(?<!/)>')
close_tag_regex = re.compile(r'</.*?>')
self_closing_tag_regex = re.compile(r'<.*?/>')
def wrap_html(html_section, start_tag, end_tag):
    """
    Wrap a section of html in a tag, respecting structure.

    The wrapper is closed before every opening or closing tag found in
    html_section and reopened after it, so the wrapper never straddles an
    element boundary. Self-closing tags do not interrupt the wrapper, and
    no empty wrapper (start_tag directly followed by end_tag) is emitted.

    Returns the wrapped html as a string.

    >>> wrap_html('<h1>simple</h1>', '<div>', '</div>')
    '<h1><div>simple</div></h1>'
    >>> wrap_html('outside<p>inside', '<b>', '</b>')
    '<b>outside</b><p><b>inside</b>'
    >>> wrap_html('inside</p>outside', '<b>', '</b>')
    '<b>inside</b></p><b>outside</b>'
    >>> wrap_html('one<em>two</em>three<em>four', '<b>', '</b>')
    '<b>one</b><em><b>two</b></em><b>three</b><em><b>four</b>'
    >>> wrap_html('line 1<br/>line 2', '<b>', '</b>')
    '<b>line 1<br/>line 2</b>'
    >>> wrap_html('<p>test', '<b>', '</b>')
    '<p><b>test</b>'
    >>> wrap_html('one<div>two<p>three', '<b>', '</b>')
    '<b>one</b><div><b>two</b><p><b>three</b>'
    >>> wrap_html('one</li><li>two</li><li>three', '<del>', '</del>')
    '<del>one</del></li><li><del>two</del></li><li><del>three</del>'
    >>> wrap_html('one<a href="/x">two</a>three', '<b>', '</b>')
    '<b>one</b><a href="/x"><b>two</b></a><b>three</b>'

    Tests for regexes
    >>> open_tag = '<li>'
    >>> close_tag = '</li>'
    >>> self_close_tag = '<br />'
    >>> tags = [open_tag, close_tag, self_close_tag]
    >>> [bool(tag_regex.match(t)) for t in tags]
    [True, True, True]
    >>> [bool(open_tag_regex.match(t)) for t in tags]
    [True, False, False]
    >>> [bool(close_tag_regex.match(t)) for t in tags]
    [False, True, False]
    >>> [bool(self_closing_tag_regex.match(t)) for t in tags]
    [False, False, True]
    >>> bool(open_tag_regex.match('<a href="/x/">'))
    True
    """
    result_list = []
    pending = []  # text and self-closing tags seen since the last boundary

    def flush():
        # Emit the pending run inside the wrapper, skipping empty runs so
        # that no empty start_tag/end_tag pair ends up in the output.
        run = ''.join(pending)
        del pending[:]
        if run:
            result_list.append(start_tag + run + end_tag)

    for item in tag_regex.split(html_section):
        if open_tag_regex.match(item) or close_tag_regex.match(item):
            # Structural boundary: close the wrapper before the tag and
            # start a fresh run after it.
            flush()
            result_list.append(item)
        else:
            pending.append(item)
    flush()  # end of section

    return ''.join(result_list)


def has_text(html):
    """
    Tell whether a piece of html contains anything besides tags.

    Every tag is removed first; the result is True when something other
    than whitespace is left over.

    >>> has_text('<br />')
    False
    >>> has_text('<li>test</li>')
    True
    """
    without_tags = tag_regex.sub('', html)
    remainder = without_tags.strip()
    return len(remainder) > 0


if __name__ == '__main__':
    # Executing this file directly runs the doctest examples embedded in
    # this module's docstrings.
    from doctest import testmod
    testmod()