Source

Mango / utils.py

#!/usr/bin/python
# -*- coding: utf-8 -*-

import datetime
import os
import re

import markdown
import pytz

from django.conf import settings
from django.core.cache import cache
from django.template import Context, loader
import mango.settings
from mango.settings import *

# Pattern templates; `%s` is filled in with an (escaped) line prefix below.
#   block - one or more consecutive lines that each begin with the prefix,
#           where the prefix must be followed by a space or a newline
#   match - the prefix at the start of a line plus one optional following
#           space (used with re.sub to strip the prefix from each line)
block = r'(?m)^(%s(?=[ \n])[^\n]*(\n|$))+'
match = r'(?m)^%s(?=[ \n]) ?'

# Compiled regular expressions used by the functions in this module,
# keyed by a descriptive name.
RE = {
    # a carriage return optionally followed by a line feed
    # (parse_markdown replaces each occurrence with '\n')
    '\r\n?':         re.compile(r'\r\n?'),

    # (pattern, replacement) pairs; `(?<!\\)` skips occurrences that are
    # preceded by a backslash
    'replacements': (
        (re.compile(r'(?<!\\)\.\.\.(?!\.)'), u'\u2026'),     # ... -> ellipsis
        (re.compile(r' -- '), u'\u2009\u2014\u2009'),        # [space][hyphen][hyphen][space] -> [thin space][em dash][thin space]
        (re.compile(r'(?<!\\)&lt;&lt;(?!&lt;)'), u'\u00AB'), # << -> « (matched in its HTML-escaped form)
        (re.compile(r'(?<!\\)&gt;&gt;(?!&gt;)'), u'\u00BB'), # >> -> » (matched in its HTML-escaped form)
    ),
    # an <h1>..<h6> element: `title` captures its contents and `html`
    # captures everything after the closing tag
    'heading':       re.compile(r'(?m)\s*<(h[1-6])[^>]*>(?P<title>.+?)</\1>$(?P<html>[\s\S]*)'),

    # excerpts
    # 'hand-crafted': one or more "key: value" lines, then at least two
    # newlines, then a run of "|"-prefixed lines captured as `excerpt`
    'hand-crafted':  re.compile(r' {,3}\S+:.*(\n[ \t]*\S+:.*)*\n{2,}(?P<excerpt>(\|(?=[ \n])[^\n]*\n)+)'),
    'excerpt':       re.compile(block % r'\|'),
    'excerpt_pipes': re.compile(match % r'\|'),

    # updates (same shape as excerpts, but with a "||" line prefix)
    'update':        re.compile(block % r'\|\|'),
    'update_pipes':  re.compile(match % r'\|\|'),

    # {{ filesize }} following internal links
    # 'ref-style' captures the reference `id` of a [id] link; 'inline'
    # captures the `path` (without its leading slash) of a [text](/path) link
    'ref-style':     re.compile(r'(\[(?P<id>[^\]]+)\]\s*){{\s*filesize\s*}}'),
    'inline':        re.compile(r'(\[[^\]]+\]\(/(?P<path>\S+?)\)\s*){{\s*filesize\s*}}'),

    # "alias=>canon" file and directory names: the "alias=>" part is optional
    # and leading zeros are left out of the captured alias
    'alias=>canon':  re.compile(r'^(0*(?P<alias>.*?)=>)?(?P<canon>.+)$'),
}

def id_to_path(identifier, text):
    """
    Finds the path after the [X] identifier in the text (used for filesize in our Markdown files)

    `identifier` is the literal text between the square brackets of a
    reference-style link definition such as `[1]: /static/downloads/package.zip`.
    The definition may be indented by up to three spaces and its target must
    begin with a slash.

    Returns the target joined onto PATH_TO_STATIC with its first path
    component dropped, or False if `text` contains no matching definition.

    >>> path = id_to_path('1', '[1]: /static/downloads/package.zip')
    >>> path.endswith('/downloads/package.zip')
    True
    >>> id_to_path('2', '[1]: /static/downloads/package.zip')
    False
    """
    # The identifier is literal text, not a pattern: escape it so that
    # characters such as "." or "(" can neither match unrelated identifiers
    # nor make the expression invalid.
    pattern = r'(?m)^ {,3}\[' + re.escape(identifier) + r'\]:\s+/(\S+)'
    m = re.search(pattern, text)
    if m is None:
        return False

    # drop the first component of the URL path and resolve the remainder
    # against the static files directory
    return os.path.join(PATH_TO_STATIC, *m.group(1).split('/')[1:])

def print_filesize(path_to_file, plaintext=False):
    """
    Returns the filesize of the file specified by `path_to_file` as a
    parenthesised string (wrapped in an HTML span unless `plaintext` is True)

    Sizes of up to KILOBYTE_SIZE bytes are given exactly, in bytes. Larger
    sizes are rounded to one decimal place in kB, MB, GB or TB and prefixed
    with an "approximately equal" sign; anything too large for the other
    units is reported in TB.

    An empty string is returned if `path_to_file` is falsy (for example the
    False returned by `id_to_path`) or if the size of the file cannot be read.

    >>> head, tail = os.path.split(__file__)
    >>> path_to_file = os.path.join(head, 'examples', '1=>my-first-post.text')
    >>> print_filesize(path_to_file)
    u'<span class="filesize">(258\u2009bytes)</span>'
    >>> print_filesize(path_to_file, True)
    u'(258\u2009bytes)'
    >>> print_filesize('DOES_NOT_EXIST')
    u''
    """
    if not path_to_file:
        return u''
    try:
        filesize = os.path.getsize(path_to_file)
    except (OSError, TypeError, ValueError):
        return u'' # fail silently: a missing file must not break the page

    kb_size = KILOBYTE_SIZE
    units = (
        ('bytes', 1),
        ('kB', kb_size**1),
        ('MB', kb_size**2),
        ('GB', kb_size**3),
        ('TB', kb_size**4),
    )
    # Pick the first unit whose range covers the size. If none does, the
    # loop ends with `unit` and `factor` bound to the largest unit, which is
    # then used instead of returning nothing.
    for unit, factor in units:
        if filesize <= factor * kb_size:
            break

    if unit == 'bytes':
        text = u'%s\u2009bytes' % filesize
    else:
        text = u'≈%s\u2009%s' % (round(float(filesize) / factor, 1), unit)

    before, after = '(', ')'
    if not plaintext:
        before = '<span class="filesize">' + before
        after += '</span>'

    return u''.join([before, text, after])

def parse_markdown(text, plaintext=False):
    """
    Parses the given Markdown text

    Line endings are normalised and every `{{ filesize }}` placeholder that
    follows an internal link is replaced by the size of the linked file.

    If `plaintext` is True the text is returned as a string at this point.
    Otherwise a dictionary with three keys is returned:

    meta    -- the Markdown metadata, plus a UTC `datetime` when both `date`
               and `time` are given (None if they cannot be parsed) and a
               `title` taken from the leading heading when none is set
    excerpt -- the HTML of the excerpt (the full HTML if there is no excerpt)
    html    -- the HTML of the whole text

    >>> text = get_contents('mango/examples/1=>my-first-post.text')
    >>> parsed = parse_markdown(text)
    >>> parsed['html']
    u"\\n<p>Welcome to Mango. ... <strong>Congratulations!</strong></p>"
    >>> parsed['meta']['title']
    u'My First Post'
    >>> parsed['excerpt']
    u"\\n<p>Welcome to Mango. ... <strong>Congratulations!</strong></p>"
    >>> parse_markdown(text, True)
    u"date:\\t13 April ... **Congratulations!**\\n\\n"
    """
    text = re.sub(RE['\r\n?'], '\n', text) + '\n' # keep regular expressions as simple as possible
    text = re.sub(RE['ref-style'],
                    lambda m: m.group(1) + print_filesize(id_to_path(m.group('id'), text), plaintext=plaintext),
                    text) # e.g. Download the [tiny calendar icon set][1] {{ filesize }}.
    text = re.sub(RE['inline'],
                    lambda m: m.group(1) + print_filesize(m.group('path'), plaintext=plaintext),
                    text) # e.g. Download the [tiny calendar icon set](/downloads/tiny-calendar-icon-set.zip) {{ filesize }}.

    if plaintext:
        return text

    excerpt = ''
    match = re.match(RE['hand-crafted'], text)
    if match:
        # a hand-crafted excerpt directly follows the metadata and is
        # removed from the body of the post
        capture = match.group('excerpt')
        excerpt = parse_markdown(re.sub(RE['excerpt_pipes'], '', capture))['html'] + '\n'
        text = text.replace(capture, '')
    else:
        # every other excerpt block stays in the body, rendered in place
        for match in re.finditer(RE['excerpt'], text):
            capture = match.group(0)
            snippet = parse_markdown(re.sub(RE['excerpt_pipes'], '', capture))['html']
            excerpt += snippet + '\n'
            text = text.replace(capture, snippet)

    t = loader.get_template('update.dhtml')

    # each update block is parsed on its own and rendered through the template
    for match in re.finditer(RE['update'], text):
        capture = match.group(0)
        c = Context(parse_markdown(re.sub(RE['update_pipes'], '', capture)))
        text = text.replace(capture, t.render(c))

    md = markdown.Markdown(extensions = ('meta',) + MARKDOWN_EXTENSIONS)
    html = md.convert(text)

    if REPLACEMENTS: # perform replacements on HTML so that code snippets are not affected
        fragments = re.split(r'(?s)(<code>.*?</code>|<pre>.*?</pre>)', html)
        processed = []
        for fragment in fragments:
            if not re.match(r'(?s)^<(code|pre)>.*?</\1>$', fragment):
                # iterate over the pairs directly so that the substitutions
                # are always applied in the order in which they are declared
                for pattern, replacement in RE['replacements']:
                    fragment = re.sub(pattern, replacement, fragment)
            processed.append(fragment)
        html = ''.join(processed)

    meta = {}
    if hasattr(md, 'Meta'): # this needs to be checked as this doesn't exist if the file was empty
        for key, value in md.Meta.items(): # note: every item in md.Meta.items() is a list
            meta[key] = value
            if len(value) == 1:
                if key in META_LISTS:
                    meta[key] = value[0].split(', ')
                else:
                    meta[key] = value[0]

    if 'date' in meta and 'time' in meta:
        tz = pytz.timezone(settings.TIME_ZONE)
        dt_format = ' '.join([MARKDOWN_DATE_FORMAT, MARKDOWN_TIME_FORMAT])
        try:
            meta['datetime'] = tz.localize(datetime.datetime.strptime(' '.join([meta['date'], meta['time']]), dt_format)).astimezone(pytz.utc)
        except ValueError: # date and/or time incorrectly formatted
            meta['datetime'] = None

    # Changed to process the HTML as this will also detect if the Markdown has an HTML header tag (in legal Markdown format).
    # It also allows the setting of the title in meta in which case nothing is altered.
    if 'title' not in meta:
        m = re.match(RE['heading'], html)
        if m:
            meta['title'] = m.group('title')
            html = m.group('html')

    return {'meta': meta, 'excerpt': excerpt if excerpt else html, 'html': html}

def get_contents(filepath):
    """
    Returns the contents of the file, decoded from UTF-8, as a unicode string

    The file is read as bytes and decoded in one step; line endings are
    returned exactly as they are stored in the file.

    >>> get_contents('mango/examples/1=>my-first-post.text')
    u"date:\\t13 April ... **Congratulations!**\\n"
    """
    # the context manager closes the file even if reading or decoding fails
    with open(filepath, 'rb') as f:
        return f.read().decode('utf-8')

def absolute_path_to_posts():
    """
    Returns the Unix style absolute path to the posts directory

    A PATH_TO_POSTS setting that begins with a slash is returned unchanged.
    Any other value is taken to be relative to the directory that contains
    the `mango` package; the result is rebuilt with forward slashes and has
    no trailing slash.
    """
    path_to_posts = mango.settings.PATH_TO_POSTS
    if not path_to_posts.startswith('/'):
        # __file__ may itself be relative, so make it absolute first
        path_to_this = os.path.split(os.path.abspath(__file__))[0] # strip /utils.py
        project_path = os.path.split(path_to_this)[0] # strip /mango
        # Skip empty components: a trailing slash ("posts/") would otherwise
        # leave the joined path ending in a separator, the first split below
        # would yield an empty tail and an empty string would be returned.
        components = [part for part in path_to_posts.split(u'/') if part]
        path_to_posts = os.path.join(project_path, *components)

        # collect the components from the last to the first ...
        fragments = []
        head, tail = os.path.split(path_to_posts)
        while tail:
            fragments.append(tail)
            head, tail = os.path.split(head)
        # ... then restore their order; the empty string provides the leading slash
        fragments.append(u'')
        fragments.reverse()
        path_to_posts = u'/'.join(fragments)
    return path_to_posts

def post_urls(filepath):
    """
    Returns a post's short and canonical URLs

    Every component of the absolute path to the post may be of the form
    "alias=>canon". The short URL is built from the aliases (falling back on
    the canonical name where there is no alias) and the canonical URL from
    the canonical names. The extension of the file itself is dropped. The
    path to the posts directory is replaced by SHORT_URL_BASE in the short
    URL and removed from the canonical URL.

    >>> path_to_posts = mango.utils.absolute_path_to_posts()
    >>> setattr(mango.settings, 'SHORT_URL_BASE', 'http://✪df.ws/')
    >>> post_urls(os.path.join(path_to_posts, '01=>my-first-post.text'))
    (u'http://\u272adf.ws/1/', u'/my-first-post/')
    >>> setattr(mango.settings, 'SHORT_URL_BASE', '')
    >>> post_urls(os.path.join(path_to_posts, '01=>my-first-post.text'))
    (u'/1/', u'/my-first-post/')
    >>> post_urls(os.path.join(path_to_posts,
    ...         'js=>javascript', 'libs=>libraries', 'prototype.js', '$.text'))
    (u'/js/libs/prototype.js/$/', u'/javascript/libraries/prototype.js/$/')
    """
    # the empty strings provide the leading and the trailing slash once joined
    canon_fragments = [u'', u'']
    alias_fragments = [u'', u'']

    head, tail = os.path.split(os.path.abspath(filepath))
    match = re.match(RE['alias=>canon'], tail)
    if match:
        canon = os.path.splitext(match.group('canon'))[0] # strip extension
        canon_fragments.insert(1, canon)
        alias_fragments.insert(1, match.group('alias') or canon)

    # walk up through the parent directories, which have no extension to strip
    while tail:
        head, tail = os.path.split(head)
        match = re.match(RE['alias=>canon'], tail)
        if match:
            canon = match.group('canon')
            canon_fragments.insert(1, canon)
            alias_fragments.insert(1, match.group('alias') or canon)

    path_to_posts = absolute_path_to_posts()
    short_url_base = mango.settings.SHORT_URL_BASE
    # Decode only when the setting is a byte string: calling decode() on a
    # value that is already unicode fails for non-ASCII text.
    if isinstance(short_url_base, bytes):
        short_url_base = short_url_base.decode('utf-8')
    short_url_base = short_url_base.rstrip(u'/')
    short_url = u'/'.join(alias_fragments).replace(path_to_posts, short_url_base, 1)
    canon_url = u'/'.join(canon_fragments).replace(path_to_posts, u'', 1)

    return (short_url, canon_url)

def posts(path_to_posts=PATH_TO_POSTS):
    """
    Collects every post found in the given directory and in all of the
    directories below it. This is used for the index pages as well as the
    index pages of any of the subfolders

    Only posts whose metadata has a `datetime` are included. They are returned,
    most recent first, as a list of 3-tuples of the form (year, month, [posts])
    with one tuple per run of posts that share a year and month.

    >>> year, month, these_posts = posts('mango/examples/')[1]
    >>> year == 2010
    True
    >>> month == 4
    True
    >>> these_posts[0]['html']
    u"\\n<p>Welcome to Mango. ... <strong>Congratulations!</strong></p>"
    """
    found = []
    for dirpath, dirnames, filenames in os.walk(path_to_posts):
        for filename in filenames:
            if filename.startswith('.'):
                continue # hidden file
            joined_path = os.path.join(dirpath, filename)
            absolute_path = os.path.abspath(joined_path)
            # skip the file if resolving symlinks leads to a different path
            if absolute_path != os.path.realpath(joined_path):
                continue
            parsed = cache.get(absolute_path)
            if not parsed:
                parsed = parse_markdown(get_contents(absolute_path))
                parsed['meta']['url'] = post_urls(absolute_path)[1]
                cache.set(absolute_path, parsed, CACHE_SECONDS)
            found.append(parsed)

    # display a post only if it has both a date and a time; most recent first
    dated_posts = sorted(
        (post for post in found if post['meta'].get('datetime')),
        key=lambda post: post['meta']['datetime'],
        reverse=True)

    grouped = []
    for post in dated_posts:
        dt = post['meta']['datetime']
        if grouped and grouped[-1][:2] == (dt.year, dt.month):
            grouped[-1][2].append(post) # same month as the previous post
        else:
            grouped.append((dt.year, dt.month, [post]))

    return grouped

def primary_author_email():
    """
    Returns the primary author's email address, taken from the settings file

    When both the name and the address are set the result has the form
    "name <address>"; otherwise the address setting is returned as it is.

    >>> primary_author_email()
    '... <...@...>'
    """
    author, address = PRIMARY_AUTHOR_NAME, PRIMARY_AUTHOR_EMAIL
    if not (author and address):
        return address
    return '%s <%s>' % (author, address)

def convert_html_chars(string):
    """
    Returns `string` with &, < and > replaced by their HTML entities

    >>> convert_html_chars('<a & b>')
    '&lt;a &amp; b&gt;'
    """
    escaped = string
    # the ampersand goes first so that the entities added for < and > are
    # not themselves escaped
    for char, entity in (('&', '&amp;'), ('<', '&lt;'), ('>', '&gt;')):
        escaped = escaped.replace(char, entity)
    return escaped