mango-experimental / utils.py

#!/usr/bin/python
# -*- coding: utf-8 -*-

import datetime
import os
import re

import markdown
import pytz

from django.conf import settings
from django.core.cache import cache
from django.template import Context, loader
import mango.settings
from mango.settings import *

# Pattern templates: %s is replaced with a regex for a line prefix (e.g. r'\|').
# `block` matches a run of one or more consecutive lines, each starting with the
# prefix followed by a space or a newline.
block = r'(?m)^(%s(?=[ \n])[^\n]*(\n|$))+'
# `match` matches just the prefix at the start of a line (plus one optional space
# after it); it is used with re.sub to strip the prefix from a captured block.
match = r'(?m)^%s(?=[ \n]) ?'

# Compiled regular expressions used by the functions in this module, keyed by purpose.
RE = {
    # a carriage return optionally followed by a line feed (normalised to '\n' in parse_file)
    '\r\n?':         re.compile(r'\r\n?'),

    # (pattern, replacement) pairs applied to the generated HTML outside <code>/<pre>
    'replacements': (
        (re.compile(r'(?<!\\)\.\.\.(?!\.)'), u'\u2026'),     # ... -> ellipsis
        (re.compile(r' -- '), u'\u2009\u2014\u2009'),        # [space][hyphen][hyphen][space] -> [thin space][em dash][thin space]
        (re.compile(r'(?<!\\)&lt;&lt;(?!&lt;)'), u'\u00AB'), # << -> «
        (re.compile(r'(?<!\\)&gt;&gt;(?!&gt;)'), u'\u00BB'), # >> -> »
    ),
    # an <h1>..<h6> element (after optional whitespace); captures its text as `title`
    # and everything after it as `html`
    'heading':       re.compile(r'(?m)\s*<(h[1-6])[^>]*>(?P<title>.+?)</\1>$(?P<html>[\s\S]*)'),

    # excerpts
    # 'hand-crafted': a header of "key: value" lines, two or more newlines, then a block
    # of lines prefixed with a single pipe, captured as `excerpt`
    'hand-crafted':  re.compile(r' {,3}\S+:.*(\n[ \t]*\S+:.*)*\n{2,}(?P<excerpt>(\|(?=[ \n])[^\n]*\n)+)'),
    'excerpt':       re.compile(block % r'\|'),
    'excerpt_pipes': re.compile(match % r'\|'),

    # updates
    # same as excerpts, but the lines are prefixed with a double pipe
    'update':        re.compile(block % r'\|\|'),
    'update_pipes':  re.compile(match % r'\|\|'),

    # {{ filesize }} following internal links
    # 'ref-style' captures the reference id as `id`; 'inline' captures the link target
    # (without its leading slash) as `path`
    'ref-style':     re.compile(r'(\[(?P<id>[^\]]+)\]\s*){{\s*filesize\s*}}'),
    'inline':        re.compile(r'(\[[^\]]+\]\(/(?P<path>\S+?)\)\s*){{\s*filesize\s*}}'),

    # file or directory name with an optional "alias=>" part in front of the canonical
    # name; leading zeros are not included in the captured alias
    'alias=>canon':  re.compile(r'^(0*(?P<alias>.*?)=>)?(?P<canon>.+)$'),
}

def id_to_path(identifier, text):
    """
    Finds the path after the [X] identifier in the text (used for filesize in our Markdown files)

    `identifier` is the reference id without its brackets and is matched literally;
    `text` is searched for a definition of the form "[identifier]: /some/path".

    Returns the path resolved against PATH_TO_STATIC (the first segment of the
    URL path is dropped), or False if the text contains no such definition.

    >>> path = id_to_path('1', '[1]: /static/downloads/package.zip')
    >>> path.endswith('/downloads/package.zip')
    True
    >>> id_to_path('c++', 'no reference definitions here')
    False
    """
    # escape the id so that regex metacharacters in it (e.g. "c++" or "v1.0") are
    # treated as plain text instead of breaking or widening the pattern
    pattern = r'(?m)^ {,3}\[' + re.escape(identifier) + r'\]:\s+/(\S+)'
    m = re.search(pattern, text)
    if m is None:
        return False

    # drop the first URL segment; the remaining segments are joined onto PATH_TO_STATIC
    segments = m.group(1).split('/')[1:]
    return os.path.join(PATH_TO_STATIC, *segments)
    
def get_contents(filepath):
    """
    Returns the contents of the file at `filepath` as a unicode string (decoded from UTF-8)

    Raises IOError if the file cannot be opened and UnicodeDecodeError if its
    contents are not valid UTF-8.

    >>> get_contents('mango/examples/1=>my-first-post.text')
    u"date:\\t13 April ... **Congratulations!**\\n"
    """
    # binary mode: the bytes are decoded exactly as stored, without any
    # platform-specific newline translation getting in the way
    f = open(filepath, 'rb')
    try:
        return f.read().decode('utf-8')
    finally:
        f.close() # always release the handle, even if reading or decoding fails

def print_filesize(path_to_file, plaintext=False):
    """
    Returns the filesize of the file specified by `path_to_file`, formatted for display
    (wrapped in an HTML span unless `plaintext` is True)

    Sizes up to KILOBYTE_SIZE bytes are given exactly, in bytes. Larger sizes are
    rounded to one decimal place and expressed in the first of kB, MB, GB, TB that
    fits; sizes beyond that are expressed in TB.

    Returns an empty string if `path_to_file` is empty/False or its size cannot be read.

    >>> head, tail = os.path.split(__file__)
    >>> path_to_file = os.path.join(head, 'examples', '1=>my-first-post.text')
    >>> print_filesize(path_to_file)
    u'<span class="filesize">(258\u2009bytes)</span>'
    >>> print_filesize(path_to_file, True)
    u'(258\u2009bytes)'
    >>> print_filesize('DOES_NOT_EXIST')
    u''
    """
    if not path_to_file: # e.g. the False returned by id_to_path for an unknown reference
        return u''

    try:
        filesize = os.path.getsize(path_to_file)
    except (OSError, TypeError, ValueError):
        return u'' # fail silently: missing or unreadable file, or an unusable path value

    kb_size = KILOBYTE_SIZE
    units = (
        ('bytes', 1),
        ('kB', kb_size**1),
        ('MB', kb_size**2),
        ('GB', kb_size**3),
        ('TB', kb_size**4),
    )
    for unit, factor in units:
        if filesize <= factor * kb_size:
            break
    # if no unit was large enough, the loop ends on the largest one, which then
    # serves as the fallback (previously this case returned None)

    if unit == 'bytes':
        text = u'%s\u2009bytes' % filesize
    else:
        text = u'≈%s\u2009%s' % (round(float(filesize)/factor, 1), unit)

    before, after = '(', ')'
    if not plaintext:
        before = '<span class="filesize">' + before
        after += '</span>'

    return u''.join([before, text, after])

def parse_text(text):
    """
    Converts Markdown `text` (which may start with a metadata header) to HTML

    Update blocks (runs of lines prefixed with "||") are parsed recursively and
    rendered with the `update.dhtml` template before the Markdown conversion.

    Returns a dict with two keys:

    'meta'  the metadata found by the Markdown 'meta' extension. Single values are
            unwrapped (or split on ', ' for keys listed in META_LISTS). If both
            'date' and 'time' are present, 'datetime' holds the corresponding UTC
            datetime, or None if they could not be parsed. If no 'title' was
            given, the text of a leading HTML heading is used when there is one.
    'html'  the generated HTML (without the leading heading if it became the title)
    """
    template = loader.get_template('update.dhtml')
    for update in re.finditer(RE['update'], text):
        capture = update.group(0)
        context = Context(parse_text(re.sub(RE['update_pipes'], '', capture)))
        text = text.replace(capture, template.render(context))
    md = markdown.Markdown(extensions = ('meta',) + MARKDOWN_EXTENSIONS)
    html = md.convert(text)

    if REPLACEMENTS: # perform replacements on HTML so that code snippets are not affected
        processed = []
        for fragment in re.split(r'(?s)(<code>.*?</code>|<pre>.*?</pre>)', html):
            if not re.match(r'(?s)^<(code|pre)>.*?</\1>$', fragment):
                # apply the replacements in the order in which they are declared
                for pattern, replacement in RE['replacements']:
                    fragment = re.sub(pattern, replacement, fragment)
            processed.append(fragment)
        html = ''.join(processed)

    meta = {}
    if hasattr(md, 'Meta'): # this needs to be checked as this doesn't exist if the file was empty
        for key, value in md.Meta.items(): # note: every item in md.Meta.items() is a list
            meta[key] = value
            if len(value) == 1:
                if key in META_LISTS:
                    meta[key] = value[0].split(', ')
                else:
                    meta[key] = value[0]

    if 'date' in meta and 'time' in meta:
        tz = pytz.timezone(settings.TIME_ZONE)
        dt_format = ' '.join([MARKDOWN_DATE_FORMAT, MARKDOWN_TIME_FORMAT])
        try:
            naive = datetime.datetime.strptime(' '.join([meta['date'], meta['time']]), dt_format)
            meta['datetime'] = tz.localize(naive).astimezone(pytz.utc)
        except (ValueError, TypeError):
            # ValueError: date and/or time incorrectly formatted
            # TypeError: date and/or time given more than once (the value is still a list)
            meta['datetime'] = None

    # Changed to process the HTML as this will also detect if the Markdown has an HTML header tag (in legal Markdown format).
    # It also allows the setting of the title in meta in which case nothing is altered.
    if 'title' not in meta:
        m = re.match(RE['heading'], html)
        if m:
            meta['title'] = m.group('title')
            html = m.group('html')

    return {'meta': meta, 'html': html}


#TODO - look into whether updates can be placed inside excerpts correctly
def parse_file(filepath, plaintext=False):
    """
    Parses the post stored at `filepath`, reusing the cached result while the file is unmodified

    If `plaintext` is True the return value is the file's text with only the mango
    {{ filesize }} replacements applied. Otherwise it is the dict returned by
    `parse_text` for the file, extended with an 'excerpt' entry (the HTML of the
    excerpt blocks, or the complete HTML if the post defines no excerpt).

    >>> parsed = parse_file('mango/examples/1=>my-first-post.text')
    >>> parsed['html']
    u"\\n<p>Welcome to Mango. ... <strong>Congratulations!</strong></p>"
    >>> parsed['meta']['title']
    u'My First Post'
    >>> parsed['excerpt']
    u"\\n<p>Welcome to Mango. ... <strong>Congratulations!</strong></p>"
    >>> parse_file('mango/examples/1=>my-first-post.text', True)
    u"date:\\t13 April ... **Congratulations!**\\n\\n"
    """
    # NOTE(review): the 'plaintext:' prefix relies on ':' not occurring in file names
    prefix = 'plaintext:' if plaintext else ''
    cache_key = u'%s%s' % (prefix, filepath)

    # cache entries have the form {'data': data, 'mod_time': time}
    cached = cache.get(cache_key)
    if cached and cached['mod_time'] == os.path.getmtime(filepath):
        return cached['data']

    def remember(data):
        # store the result together with the file's current modification time
        entry = {'data': data, 'mod_time': os.path.getmtime(filepath)}
        cache.set(cache_key, entry, POST_CACHE_SECONDS)
        return data

    # normalise line endings to keep regular expressions as simple as possible
    text = re.sub(RE['\r\n?'], '\n', get_contents(filepath)) + '\n'

    def ref_style_size(m):
        # e.g. Download the [tiny calendar icon set][1] {{ filesize }}.
        return m.group(1) + print_filesize(id_to_path(m.group('id'), text), plaintext=plaintext)

    def inline_size(m):
        # e.g. Download the [tiny calendar icon set](/downloads/tiny-calendar-icon-set.zip) {{ filesize }}.
        return m.group(1) + print_filesize(m.group('path'), plaintext=plaintext)

    text = re.sub(RE['ref-style'], ref_style_size, text)
    text = re.sub(RE['inline'], inline_size, text)

    if plaintext:
        return remember(text)

    hand_crafted = re.match(RE['hand-crafted'], text)
    if hand_crafted:
        # an excerpt written directly below the metadata is taken out of the post body
        raw = hand_crafted.group('excerpt')
        excerpt = parse_text(re.sub(RE['excerpt_pipes'], '', raw))['html'] + '\n'
        text = text.replace(raw, '')
    else:
        # excerpt blocks inside the body stay in place, in their rendered form
        excerpt = ''
        for found in re.finditer(RE['excerpt'], text):
            raw = found.group(0)
            snippet = parse_text(re.sub(RE['excerpt_pipes'], '', raw))['html']
            excerpt += snippet + '\n'
            text = text.replace(raw, snippet)

    data = parse_text(text)
    data['excerpt'] = excerpt or data['html']
    return remember(data)

def absolute_path_to_posts():
    """
    Returns the Unix style absolute path to the posts directory

    A PATH_TO_POSTS setting that starts with '/' is returned unchanged. Any other
    value is taken to be relative to the directory that contains the directory
    of this file, and is returned as a '/'-separated path with a leading '/'.
    """
    path_to_posts = mango.settings.PATH_TO_POSTS
    if path_to_posts.startswith('/'):
        return path_to_posts

    # make __file__ absolute first: if it is relative, the directories above it
    # would otherwise be missing from the result
    path_to_this = os.path.dirname(os.path.abspath(__file__)) # strip /utils.py
    project_path = os.path.dirname(path_to_this) # strip /mango

    # normpath removes a trailing separator (which would make the first split below
    # yield an empty tail, and so an empty result) and collapses '.' and '..'
    native_path = os.path.normpath(os.path.join(project_path, *path_to_posts.split(u'/')))

    # collect the path components from the deepest one upwards, then reverse them;
    # the empty first fragment produces the leading '/'
    fragments = []
    head, tail = os.path.split(native_path)
    while tail:
        fragments.append(tail)
        head, tail = os.path.split(head)
    fragments.append(u'')
    fragments.reverse()
    return u'/'.join(fragments)

def post_urls(filepath):
    """
    Returns a post's short and canonical URLs

    The result is a tuple (short_urls, canon_urls). Both are dicts with the keys
    'rel' (the path) and 'abs' (the path prefixed with SHORT_URL_BASE or BASE_URL).
    The short path uses the alias of every "alias=>canon" name along the way,
    the canonical path uses the canonical names.
    
    >>> path_to_posts = mango.utils.absolute_path_to_posts()
    >>> setattr(mango.settings, 'BASE_URL', 'http://example.com/')
    >>> setattr(mango.settings, 'SHORT_URL_BASE', 'http://✪df.ws/')
    >>> post_urls(os.path.join(path_to_posts, '01=>my-first-post.text'))
    ({'abs': u'http://\u272adf.ws/1/', 'rel': u'/1/'}, {'abs': u'http://example.com/my-first-post/', 'rel': u'/my-first-post/'})
    >>> setattr(mango.settings, 'SHORT_URL_BASE', '')
    >>> post_urls(os.path.join(path_to_posts, '01=>my-first-post.text'))
    ({'abs': u'http://example.com/1/', 'rel': u'/1/'}, {'abs': u'http://example.com/my-first-post/', 'rel': u'/my-first-post/'})
    >>> post_urls(os.path.join(path_to_posts,
    ...         'js=>javascript', 'libs=>libraries', 'prototype.js', '$.text'))
    ({'abs': u'http://example.com/js/libs/prototype.js/$/', 'rel': u'/js/libs/prototype.js/$/'}, {'abs': u'http://example.com/javascript/libraries/prototype.js/$/', 'rel': u'/javascript/libraries/prototype.js/$/'})
    """
    pattern = RE['alias=>canon']

    # the parts are collected from the file name upwards and reversed afterwards;
    # the initial empty string becomes the trailing slash of the path
    canon_parts = [u'']
    alias_parts = [u'']

    remainder, name = os.path.split(os.path.abspath(filepath))
    is_file_name = True
    while True:
        found = pattern.match(name)
        if found:
            canon = found.group('canon')
            if is_file_name:
                canon = os.path.splitext(canon)[0] # strip extension
            canon_parts.append(canon)
            alias_parts.append(found.group('alias') or canon)
        if not name:
            break # nothing left to split off
        is_file_name = False
        remainder, name = os.path.split(remainder)

    for parts in (canon_parts, alias_parts):
        parts.append(u'') # becomes the leading slash
        parts.reverse()

    path_to_posts = absolute_path_to_posts()
    base_url = mango.settings.BASE_URL.decode('utf-8').rstrip(u'/')
    short_url_base = mango.settings.SHORT_URL_BASE.decode('utf-8').rstrip(u'/') or base_url

    # drop the part of each path that leads to the posts directory
    short_path = u'/'.join(alias_parts).replace(path_to_posts, u'', 1)
    canon_path = u'/'.join(canon_parts).replace(path_to_posts, u'', 1)

    return (
        {'rel': short_path, 'abs': short_url_base + short_path},
        {'rel': canon_path, 'abs': base_url + canon_path},
    )

def get_posts(path_to_posts, include_pages=False, reverse=True):
    """
    Returns all of the posts in the directory and all directories below it

    Every file whose name does not start with '.' is parsed and given
    'short_urls' and 'canon_urls' entries. Documents with a 'datetime' in their
    metadata are posts, sorted by that datetime (newest first unless `reverse`
    is False); all others are pages, sorted by title. Pages are only returned,
    ahead of the posts, if `include_pages` is True.
    
    >>> get_posts('mango/examples')[1]['html']
    u"\\n<p>Welcome to Mango. ... <strong>Congratulations!</strong></p>"
    """
    documents = []
    for dirpath, dirnames, filenames in os.walk(path_to_posts):
        for filename in filenames:
            if filename.startswith('.'):
                continue
            joined_path = os.path.join(dirpath, filename)
            absolute_path = os.path.abspath(joined_path)
            # ignore symlink if it points to a file (post) in same directory
            if absolute_path != os.path.realpath(joined_path):
                continue
            document = parse_file(absolute_path)
            document['short_urls'], document['canon_urls'] = post_urls(absolute_path)
            documents.append(document)

    dated = [doc for doc in documents if doc['meta'].get('datetime')]
    undated = [doc for doc in documents if not doc['meta'].get('datetime')]

    dated.sort(key=lambda post: post['meta']['datetime'], reverse=reverse)
    undated.sort(key=lambda page: page['meta'].get('title'))

    return undated + dated if include_pages else dated

def posts(path_to_posts=PATH_TO_POSTS, include_pages=False):
    """
    Simple wrapper for `get_posts` which returns cached posts if appropriate

    The result is cached for INDEX_CACHE_SECONDS under a key made up of
    `path_to_posts` and whether pages are included.
    
    >>> posts('mango/examples')[1]['html']
    u"\\n<p>Welcome to Mango. ... <strong>Congratulations!</strong></p>"
    """
    cache_key = u'posts%s:%s' % ('+pages' if include_pages else '', path_to_posts)
    cached = cache.get(cache_key)
    if cached is not None: # an empty list is a valid cached result, only None is a miss
        return cached

    fresh = get_posts(path_to_posts, include_pages=include_pages)
    cache.set(cache_key, fresh, INDEX_CACHE_SECONDS)
    return fresh

def archives(path_to_posts=PATH_TO_POSTS):
    """
    Returns all of the posts in the directory and all directories below it,
    in the form (year, month, [posts])

    The result is a list of such tuples, in the order in which `get_posts`
    returns the posts; consecutive posts from the same month share a tuple.
    It is cached for INDEX_CACHE_SECONDS without any check for modifications.
    
    >>> year, month, these_posts = archives('mango/examples')[1]
    >>> year == 2010
    True
    >>> month == 4
    True
    >>> these_posts[0]['html']
    u"\\n<p>Welcome to Mango. ... <strong>Congratulations!</strong></p>"
    """
    cache_key = u'archives:%s' % path_to_posts
    cached = cache.get(cache_key)
    if cached is not None: # an empty list is a valid cached result, only None is a miss
        return cached

    grouped = []
    for post in get_posts(path_to_posts):
        dt = post['meta']['datetime']
        if grouped and grouped[-1][:2] == (dt.year, dt.month):
            grouped[-1][2].append(post) # same month as the previous post
        else:
            grouped.append((dt.year, dt.month, [post]))

    cache.set(cache_key, grouped, INDEX_CACHE_SECONDS)
    return grouped

def primary_author_email():
    """
    Returns the email address of the primary author as set in the settings file,
    in the form "name <address>" if both a name and an address are set
    
    >>> primary_author_email()
    '... <...@...>'
    """
    if PRIMARY_AUTHOR_NAME and PRIMARY_AUTHOR_EMAIL:
        return '%s <%s>' % (PRIMARY_AUTHOR_NAME, PRIMARY_AUTHOR_EMAIL)
    return PRIMARY_AUTHOR_EMAIL

def convert_html_chars(string):
    """
    Returns `string` with &, < and > replaced by their HTML entities
    """
    escaped = string
    # the ampersand has to come first so that the entities added for < and > stay intact
    for char, entity in (('&', '&amp;'), ('<', '&lt;'), ('>', '&gt;')):
        escaped = escaped.replace(char, entity)
    return escaped
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.