Commits

Alexandre Macabies committed 41ed8e1

We now have a real HTML compressor!

Comments (0)

Files changed (3)

common/htmlcompressor.py

+#!/usr/bin/env python
+# -*- coding: utf8 -*-
+
+###
+# Copyright (c) 2011, Alexandre `Zopieux` Macabies
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+#   * Redistributions of source code must retain the above copyright notice,
+#     this list of conditions, and the following disclaimer.
+#   * Redistributions in binary form must reproduce the above copyright notice,
+#     this list of conditions, and the following disclaimer in the
+#     documentation and/or other materials provided with the distribution.
+#   * Neither the name of the author of this software nor the name of
+#     contributors to this software may be used to endorse or promote products
+#     derived from this software without specific prior written consent.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+###
+
+# This module has been mostly inspired by http://code.google.com/p/htmlcompressor/
+
+import re
+import copy
+from itertools import count
+
+# Placeholder templates. Each preserved block is swapped for one of these,
+# with {0} replaced by the block's index, while the surrounding HTML is
+# compressed. tempUserBlock takes two fields: the index of the user pattern
+# and the index of the match for that pattern.
+tempCondCommentBlock = "<%%%COMPRESS~COND~{0}%%%>"
+tempPreBlock = "%%%COMPRESS~PRE~{0}%%%"
+tempTextAreaBlock = "%%%COMPRESS~TEXTAREA~{0}%%%"
+tempScriptBlock = "%%%COMPRESS~SCRIPT~{0}%%%"
+tempStyleBlock = "%%%COMPRESS~STYLE~{0}%%%"
+tempEventBlock = "%%%COMPRESS~EVENT~{0}%%%"
+tempLineBreakBlock = "%%%COMPRESS~LT~{0}%%%"
+tempSkipBlock = "<%%%COMPRESS~SKIP~{0}%%%>"
+tempUserBlock = "%%%COMPRESS~USER{0}~{1}%%%"
+
+# Patterns used to locate the blocks to preserve and to clean up the markup.
+emptyPattern = re.compile("\\s")
+skipPattern = re.compile("<!--\\s*\\{\\{\\{\\s*-->(.*?)<!--\\s*\\}\\}\\}\\s*-->", re.S | re.I)
+condCommentPattern = re.compile("(<!(?:--)?\\[[^\\]]+?]>)(.*?)(<!\\[[^\\]]+]-->)", re.S | re.I)
+commentPattern = re.compile("<!--[^\\[].*?-->", re.S | re.I)
+intertagPattern = re.compile(">\\s+<", re.S | re.I)
+multispacePattern = re.compile("\\s+", re.S | re.I)
+tagEndSpacePattern = re.compile("(<(?:[^>]+?))(?:\\s+?)(/?>)", re.S | re.I)
+tagQuotePattern = re.compile("\\s*=\\s*([\"'])([a-z0-9-_]+?)\\1(/?)(?=[^<]*?>)", re.I)
+prePattern = re.compile("(<pre[^>]*?>)(.*?)(</pre>)", re.S | re.I)
+taPattern = re.compile("(<textarea[^>]*?>)(.*?)(</textarea>)", re.S | re.I)
+scriptPattern = re.compile("(<script[^>]*?>)(.*?)(</script>)", re.S | re.I)
+stylePattern = re.compile("(<style[^>]*?>)(.*?)(</style>)", re.S | re.I)
+tagPropertyPattern = re.compile("(\\s\\w+)\\s*=\\s*(?=[^<]*?>)", re.I)
+cdataPattern = re.compile("\\s*<!\\[CDATA\\[(.*?)\\]\\]>\\s*", re.S | re.I)
+doctypePattern = re.compile("<!DOCTYPE[^>]*>", re.S | re.I)
+jsTypeAttrPattern = re.compile("(<script[^>]*)type\\s*=\\s*([\"']*)(?:text|application)/javascript\\2([^>]*>)", re.S | re.I)
+jsLangAttrPattern = re.compile("(<script[^>]*)language\\s*=\\s*([\"']*)javascript\\2([^>]*>)", re.S | re.I)
+jsJqueryTmplTypePattern = re.compile("<script[^>]*type\\s*=\\s*([\"']*)text/x-jquery-tmpl\\1[^>]*>", re.S | re.I)
+styleTypeAttrPattern = re.compile("(<style[^>]*)type\\s*=\\s*([\"']*)text/style\\2([^>]*>)", re.S | re.I)
+linkTypeAttrPattern = re.compile("(<link[^>]*)type\\s*=\\s*([\"']*)text/(?:css|plain)\\2([^>]*>)", re.S | re.I)
+linkRelAttrPattern = re.compile("<link(?:[^>]*)rel\\s*=\\s*([\"']*)(?:alternate\\s+)?stylesheet\\1(?:[^>]*)>", re.S | re.I)
+formMethodAttrPattern = re.compile("(<form[^>]*)method\\s*=\\s*([\"']*)get\\2([^>]*>)", re.S | re.I)
+inputTypeAttrPattern = re.compile("(<input[^>]*)type\\s*=\\s*([\"']*)text\\2([^>]*>)", re.S | re.I)
+booleanAttrPattern = re.compile("(<\\w+[^>]*)(checked|selected|disabled|readonly)\\s*=\\s*([\"']*)\\w*\\3([^>]*>)", re.S | re.I)
+eventJsProtocolPattern = re.compile("^javascript:\\s*(.+)", re.S | re.I)
+httpProtocolPattern = re.compile("(<[^>]+?(?:href|src|cite|action)\\s*=\\s*['\"])http:(//[^>]+?>)", re.S | re.I)
+httpsProtocolPattern = re.compile("(<[^>]+?(?:href|src|cite|action)\\s*=\\s*['\"])https:(//[^>]+?>)", re.S | re.I)
+eventPattern1 = re.compile("(\\son[a-z]+\\s*=\\s*\")([^\"\\\\\\r\\n]*(?:\\\\.[^\"\\\\\\r\\n]*)*)(\")", re.I) # unmasked: \son[a-z]+\s*=\s*"[^"\\\r\n]*(?:\\.[^"\\\r\n]*)*"
+eventPattern2 = re.compile("(\\son[a-z]+\\s*=\\s*')([^'\\\\\\r\\n]*(?:\\\\.[^'\\\\\\r\\n]*)*)(')", re.I)
+# Python's re module has no \p{Blank} class (it is Java regex syntax), so the
+# horizontal blanks around a line break are spelled out as space or tab.
+lineBreakPattern = re.compile("(?:[ \\t]*(\\r?\\n)[ \\t]*)+")
+
+# Patterns matching the placeholders above; group 1 is the block index.
+tempCondCommentPattern = re.compile("<%%%COMPRESS~COND~(\\d+?)%%%>")
+tempPrePattern = re.compile("%%%COMPRESS~PRE~(\\d+?)%%%")
+tempTextAreaPattern = re.compile("%%%COMPRESS~TEXTAREA~(\\d+?)%%%")
+tempScriptPattern = re.compile("%%%COMPRESS~SCRIPT~(\\d+?)%%%")
+tempStylePattern = re.compile("%%%COMPRESS~STYLE~(\\d+?)%%%")
+tempEventPattern = re.compile("%%%COMPRESS~EVENT~(\\d+?)%%%")
+tempSkipPattern = re.compile("<%%%COMPRESS~SKIP~(\\d+?)%%%>")
+tempLineBreakPattern = re.compile("%%%COMPRESS~LT~(\\d+?)%%%")
+
+class HTMLCompressor:
+    """Regex-based HTML minifier.
+
+    compress() works in three passes: preserveBlocks() swaps content that must
+    not be altered (user patterns, skip blocks, conditional comments, inline
+    event handlers, <pre>, <script>, <style>, <textarea>) for numbered
+    placeholders, processHtml() applies the enabled clean-ups to what is left,
+    and returnBlocks() puts the saved content back.
+
+    Behaviour is configured through the boolean attributes set in __init__.
+    preservePatterns is a list of compiled regular expressions whose non-blank
+    matches are protected from any modification.
+    """
+
+    def __init__(self):
+        self.removeComments = True
+        self.removeMultiSpaces = True
+
+        # optional settings
+        # NOTE: compressJavaScript, compressCss and removeJavaScriptProtocol
+        # are stored here but not read by any method of this class.
+        self.removeIntertagSpaces = True
+        self.removeQuotes = False
+        self.compressJavaScript = False
+        self.compressCss = False
+        self.simpleDoctype = False
+        self.removeScriptAttributes = False
+        self.removeStyleAttributes = False
+        self.removeLinkAttributes = False
+        self.removeFormAttributes = False
+        self.removeInputAttributes = False
+        self.simpleBooleanAttributes = False
+        self.removeJavaScriptProtocol = False
+        self.removeHttpProtocol = False
+        self.removeHttpsProtocol = False
+        self.preserveLineBreaks = False
+
+        self.preservePatterns = []
+
+    def compress(self, html):
+        """Return the compressed form of ``html``.
+
+        A falsy ``html`` (``None``, empty string) is returned unchanged.
+        """
+        if not html:
+            return html
+
+        condCommentBlocks = []
+        preBlocks = []
+        taBlocks = []
+        scriptBlocks = []
+        styleBlocks = []
+        eventBlocks = []
+        skipBlocks = []
+        lineBreakBlocks = []
+        userBlocks = []
+
+        html = self.preserveBlocks(html, preBlocks, taBlocks, scriptBlocks, styleBlocks, eventBlocks, condCommentBlocks, skipBlocks, lineBreakBlocks, userBlocks)
+        html = self.processHtml(html)
+        html = self.returnBlocks(html, preBlocks, taBlocks, scriptBlocks, styleBlocks, eventBlocks, condCommentBlocks, skipBlocks, lineBreakBlocks, userBlocks)
+        return html
+
+    def _preserveTagContent(self, html, pattern, blocks, template, skip=None):
+        """Replace group 2 of every ``pattern`` match with a placeholder.
+
+        ``pattern`` must expose three groups (opening part, content, closing
+        part). Non-blank content is appended to ``blocks`` and replaced by
+        ``template`` formatted with its position in ``blocks``. Matches with
+        blank content, or for which ``skip(match)`` is true, are left as is.
+        """
+        def repl(m):
+            content = m.group(2)
+            if content.strip() == '' or (skip is not None and skip(m)):
+                return m.group(0)
+            blocks.append(content)
+            # Numbering by list position keeps placeholders unique even when
+            # several patterns feed the same list (both event patterns do).
+            return m.group(1) + template.format(len(blocks) - 1) + m.group(3)
+
+        return pattern.sub(repl, html)
+
+    def preserveBlocks(self, html, preBlocks, taBlocks, scriptBlocks, styleBlocks, eventBlocks, condCommentBlocks, skipBlocks, lineBreakBlocks, userBlocks):
+        """Swap every block that must stay untouched for a placeholder.
+
+        The list arguments are filled in place with the removed content, in
+        placeholder order; ``userBlocks`` receives one list per entry of
+        ``self.preservePatterns``. Returns ``html`` with the placeholders.
+        """
+        # preserve user blocks
+        for p, pattern in enumerate(self.preservePatterns):
+            userBlock = []
+
+            def repl(match):
+                g = match.group(0)
+                if g.strip() != '':
+                    userBlock.append(g)
+                    return tempUserBlock.format(p, len(userBlock) - 1)
+                return g
+
+            html = pattern.sub(repl, html)
+            userBlocks.append(userBlock)
+
+        # preserve <!-- {{{ -->...<!-- }}} --> skip blocks
+        def repl(match):
+            g = match.group(1)
+            if g.strip() != '':
+                skipBlocks.append(g)
+                return tempSkipBlock.format(len(skipBlocks) - 1)
+            return match.group(0)
+
+        html = skipPattern.sub(repl, html)
+
+        # preserve conditional comments; their content is compressed by a
+        # shallow copy of this compressor before being stored
+        selfClone = copy.copy(self)
+
+        def repl(m):
+            if m.group(2).strip() != '':
+                condCommentBlocks.append(m.group(1) + selfClone.compress(m.group(2)) + m.group(3))
+                return tempCondCommentBlock.format(len(condCommentBlocks) - 1)
+            return m.group(0)
+
+        html = condCommentPattern.sub(repl, html)
+
+        # preserve inline events (double-quoted, then single-quoted); both
+        # share eventBlocks, so the numbering must continue across the two
+        html = self._preserveTagContent(html, eventPattern1, eventBlocks, tempEventBlock)
+        html = self._preserveTagContent(html, eventPattern2, eventBlocks, tempEventBlock)
+
+        # preserve PRE tags
+        html = self._preserveTagContent(html, prePattern, preBlocks, tempPreBlock)
+
+        # preserve SCRIPT tags, except text/x-jquery-tmpl ones which are
+        # left in place
+        html = self._preserveTagContent(
+            html, scriptPattern, scriptBlocks, tempScriptBlock,
+            skip=lambda m: jsJqueryTmplTypePattern.match(m.group(1)))
+
+        # preserve STYLE tags
+        html = self._preserveTagContent(html, stylePattern, styleBlocks, tempStyleBlock)
+
+        # preserve TEXTAREA tags
+        html = self._preserveTagContent(html, taPattern, taBlocks, tempTextAreaBlock)
+
+        # preserve line breaks
+        if self.preserveLineBreaks:
+            def repl(m):
+                lineBreakBlocks.append(m.group(1))
+                return tempLineBreakBlock.format(len(lineBreakBlocks) - 1)
+
+            html = lineBreakPattern.sub(repl, html)
+
+        return html
+
+    def processHtml(self, html):
+        """Apply the enabled clean-ups to ``html`` and return it stripped.
+
+        Spaces around ``=`` in tag attributes and before a tag's closing
+        ``>`` are always removed; every other step depends on its flag.
+        """
+        if self.removeComments:
+            html = commentPattern.sub("", html)
+
+        if self.simpleDoctype:
+            html = doctypePattern.sub(u"<!DOCTYPE html>", html)
+
+        if self.removeScriptAttributes:
+            html = jsTypeAttrPattern.sub(r"\1\3", html)
+            html = jsLangAttrPattern.sub(r"\1\3", html)
+
+        if self.removeStyleAttributes:
+            html = styleTypeAttrPattern.sub(r"\1\3", html)
+
+        if self.removeLinkAttributes:
+            # the type attribute is only dropped from stylesheet links
+            def repl(m):
+                if linkRelAttrPattern.match(m.group(0)):
+                    return m.group(1) + m.group(3)
+                else:
+                    return m.group(0)
+            html = linkTypeAttrPattern.sub(repl, html)
+
+        if self.removeFormAttributes:
+            html = formMethodAttrPattern.sub(r"\1\3", html)
+
+        if self.removeInputAttributes:
+            html = inputTypeAttrPattern.sub(r"\1\3", html)
+
+        if self.simpleBooleanAttributes:
+            html = booleanAttrPattern.sub(r"\1\2\4", html)
+
+        if self.removeHttpProtocol:
+            html = httpProtocolPattern.sub(r"\1\2", html)
+
+        if self.removeHttpsProtocol:
+            html = httpsProtocolPattern.sub(r"\1\2", html)
+
+        if self.removeIntertagSpaces:
+            html = intertagPattern.sub(u"><", html)
+
+        if self.removeMultiSpaces:
+            html = multispacePattern.sub(u" ", html)
+
+        html = tagPropertyPattern.sub(r"\1=", html)
+        html = tagEndSpacePattern.sub(r"\1\2", html)
+
+        if self.removeQuotes:
+            def repl(m):
+                if m.group(3).strip() == '':
+                    return u"=%s" % m.group(2)
+                else:
+                    # keep a space so the value does not swallow the "/"
+                    return u"=%s %s" % (m.group(2), m.group(3))
+            html = tagQuotePattern.sub(repl, html)
+
+        return html.strip()
+
+    def _restoreBlocks(self, html, pattern, blocks):
+        """Replace each placeholder matched by ``pattern`` with its block.
+
+        Group 1 of ``pattern`` is the index into ``blocks``. A placeholder
+        whose index is out of range is left untouched. The stored text is
+        returned from a replacement function, so it is inserted literally
+        and needs no escaping.
+        """
+        def repl(m):
+            i = int(m.group(1))
+            if i < len(blocks):
+                return blocks[i]
+            return m.group(0)
+
+        return pattern.sub(repl, html)
+
+    def returnBlocks(self, html, preBlocks, taBlocks, scriptBlocks, styleBlocks, eventBlocks, condCommentBlocks, skipBlocks, lineBreakBlocks, userBlocks):
+        """Put the content saved by preserveBlocks() back into ``html``.
+
+        Blocks are restored in the reverse of the order in which they were
+        preserved, so placeholders nested inside a restored block are
+        resolved by a later step. Returns the final HTML.
+        """
+        # put line breaks back
+        if self.preserveLineBreaks:
+            html = self._restoreBlocks(html, tempLineBreakPattern, lineBreakBlocks)
+
+        # put TEXTAREA, STYLE, SCRIPT, PRE, event, conditional comment and
+        # skip blocks back, in that order
+        for pattern, blocks in (
+            (tempTextAreaPattern, taBlocks),
+            (tempStylePattern, styleBlocks),
+            (tempScriptPattern, scriptBlocks),
+            (tempPrePattern, preBlocks),
+            (tempEventPattern, eventBlocks),
+            (tempCondCommentPattern, condCommentBlocks),
+            (tempSkipPattern, skipBlocks),
+        ):
+            html = self._restoreBlocks(html, pattern, blocks)
+
+        # put user blocks back, last pattern first
+        for p in range(len(self.preservePatterns) - 1, -1, -1):
+            tempUserPattern = re.compile("%%%COMPRESS~USER" + str(p) + "~(\\d+?)%%%")
+            blocks = userBlocks[p] if p < len(userBlocks) else []
+            html = self._restoreBlocks(html, tempUserPattern, blocks)
+
+        return html

common/middleware.py

 from django.utils.encoding import force_unicode
 from django.conf import settings
 
+from htmlcompressor import HTMLCompressor
+
 RE_MULTISPACE = re.compile(r'\s{2,}')
 RE_NEWLINE = re.compile(r'\n')
 RE_DJANGOSTRIP = re.compile(r'>\s+<')
 
-class MinifyHTMLMiddleware(object):
+class SharpMinifyHTMLMiddleware(object):
     """
-        Provides a simple middleware to compress HTML output on-the-fly.
+        Provides a simple (but severe!) middleware to compress HTML output on-the-fly.
         It does NOT keep important new lines, such as in <pre>, <code> or <textarea>.
     """
     def process_response(self, request, response):
             response.content = RE_MULTISPACE.sub(' ', response.content)
             response.content = RE_NEWLINE.sub('', response.content)
         return response
+
+class MinifyHTMLMiddleware(object):
+    """Middleware that compresses HTML responses with HTMLCompressor.
+
+    Only responses whose Content-Type contains 'text/html' are touched, and
+    only when settings.COMPRESS_HTML is on and settings.DEBUG is off.
+    """
+    def process_response(self, request, response):
+        is_html = 'text/html' in response['Content-Type']
+        if is_html and settings.COMPRESS_HTML and not settings.DEBUG:
+            markup = force_unicode(response.content)
+            response.content = HTMLCompressor().compress(markup)
+        return response

settings.py.sample

 # COMPRESS = True
 
 ### Sdzmoar config
-# DON'T USE THIS SETTING. THE MIDDLEWARE IS ACTUALLY DISABLED.
-# IT STRIPS TOO MUCH \n THUS CAUSING CODE BLOCKS RENDERED BADLY
-# COMPRESS_HTML = False
+COMPRESS_HTML = True
 
 # Pages to show before and after the current page in the paginator
 PAGER_SIBLING_COUNT = 3