Source code for biolite.report

# BioLite - Tools for processing gene sequence data and automating workflows
# Copyright (c) 2012-2013 Brown University. All rights reserved.
# 
# This file is part of BioLite.
# 
# BioLite is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# 
# BioLite is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public License
# along with BioLite.  If not, see <http://www.gnu.org/licenses/>.

"""
Provides a framework for generating HTML reports from BioLite diagnostics.
The typical usage is to extend the `BaseReport` class for each pipeline, and
override the `init` method to specify **lookups** and **generators**.

**Lookups** are called with `self.lookup` and specify entities or attributes
that should be loaded from the diagnostics into the `self.data` AttributeDict.
For example::

  self.lookup('args', diagnostics.INIT)

will load the initialization entity, which includes all of the command-line
arguments passed to the pipeline for a given run.

**Generators** are functions that return lists of HTML lines, which are
concatenated together to form the final HTML report, in the order that the
generators are attached. A generator function will typically start by checking
if a diagnostics value was successfully loaded into `self.data`, e.g.::

  def report_arguments(self):
  	if 'args' in self.data:
		html = [self.header('Arguments')]
		html += ['<p>%s</p>' % a for a in self.data.args]
		return html

The generator is attached to the report in the `init` method with the line::

  self.generator(self.report_arguments)

"""

import os
import re
import shlex
import shutil
import textwrap

from collections import namedtuple, defaultdict
from docutils.core import publish_parts

import matplotlib
matplotlib.use('agg')
import matplotlib.pyplot as pyplot
from matplotlib import cm
import numpy as np

import database
import diagnostics
import utils
from config import datadir

# Ten qualitative colors as hex RGB strings (from ColorBrewer2, Qualitative,
# Set3).
palette10 = (
    '#8DD3C7', '#FFED6F', '#BEBADA', '#FB8072', '#80B1D3',
    '#FDB462', '#B3DE69', '#FCCDE5', '#BC80BD', '#CCEBC5',
)

# Describes one column of a report table:
#   key    -- name used to look the value up in a diagnostics record
#   title  -- human-readable column heading
#   type   -- callable that converts the stored value before formatting
#   format -- `str.format` template applied to the converted value
Field = namedtuple('Field', ['key', 'title', 'type', 'format'])

# Columns of the resource-usage table. The first two are descriptive strings;
# the remaining ones are numeric measurements.
profile_schema = (
    Field(key='name', title='Command', type=str, format='{}'),
    Field(key='entity', title='Stage', type=str, format='{}'),
    Field(key='walltime', title='Wall Time (s)', type=float, format='{:.2f}'),
    Field(key='usertime', title='User Time (s)', type=float, format='{:.2f}'),
    Field(key='systime', title='System Time (s)', type=float, format='{:.2f}'),
    Field(key='mem', title='Memory (KB)', type=int, format='{:,d}'),
    Field(key='vmem', title='Virtual Memory (KB)', type=int, format='{:,d}'),
)

# How each numeric column is collapsed into a single summary value:
# times are added up, memory figures report their maximum.
profile_aggregators = dict(
    walltime=sum,
    usertime=sum,
    systime=sum,
    mem=max,
    vmem=max,
)
	
def profile_aggregate(profiles):
    """
    Collapse a sequence of profile records into one summary value per
    numeric field.

    Each record in `profiles` is indexed by field key (`record[key]`); keys
    missing from a record are skipped. Values are converted with the field's
    `type` from `profile_schema` and then reduced with the matching function
    in `profile_aggregators` (sum or max).

    Returns a dict mapping field key to the aggregated (unformatted) value.
    A field for which no record supplied a value is left out of the dict.
    """
    # Skip the two leading descriptive columns; only aggregated fields matter.
    numeric_fields = [
        field for field in profile_schema[2:]
        if field.key in profile_aggregators]

    # Gather the converted values of every record, grouped by field key.
    collected = defaultdict(list)
    for record in profiles:
        for field in numeric_fields:
            try:
                raw = record[field.key]
            except KeyError:
                continue
            collected[field.key].append(field.type(raw))

    # Reduce each non-empty group with its aggregator.
    summary = {}
    for field in numeric_fields:
        samples = collected.get(field.key)
        if samples:
            reducer = profile_aggregators[field.key]
            summary[field.key] = reducer(samples)
    return summary
def copy_css(outdir):
    """
    Copy the stylesheet and icon images used by the report templates from
    the BioLite data directory into `css/` and `img/` under `outdir`.
    """
    # safe_mkdir's result is used as the copy destination, so it presumably
    # returns the path of the directory it ensured.
    css_dir = utils.safe_mkdir(os.path.join(outdir, 'css'))
    shutil.copy(os.path.join(datadir, 'bootstrap.min.css'), css_dir)

    img_dir = utils.safe_mkdir(os.path.join(outdir, 'img'))
    for sprite in ('glyphicons-halflings.png',
                   'glyphicons-halflings-white.png'):
        shutil.copy(os.path.join(datadir, sprite), img_dir)
def copy_js(outdir):
    """
    Copy the Javascript files used by some report features from the BioLite
    data directory into `js/` under `outdir`.
    """
    js_dir = utils.safe_mkdir(os.path.join(outdir, 'js'))
    for script in ('d3.min.js', 'jsphylosvg-min.js', 'raphael-min.js'):
        shutil.copy(os.path.join(datadir, script), js_dir)
def _figure_props(axes, props):
    """
    Apply the optional entries of the dict `props` to `axes`.

    Recognized keys: title, xlabel, ylabel, xlim, ylim, xscale, yscale,
    xticks, yticks, xticklabels, yticklabels, yline (iterable of y values at
    which to draw horizontal lines) and box (frame on/off; None toggles the
    current state). Unknown keys are ignored here.
    """
    # (key, setter, extra keyword arguments), applied in this fixed order so
    # that e.g. ticks are placed before their labels.
    setters = (
        ('title', axes.set_title, {}),
        ('xlabel', axes.set_xlabel, {'fontsize': 12}),
        ('ylabel', axes.set_ylabel, {'fontsize': 12}),
        ('xlim', axes.set_xlim, {}),
        ('ylim', axes.set_ylim, {}),
        ('xscale', axes.set_xscale, {}),
        ('yscale', axes.set_yscale, {}),
        ('xticks', axes.set_xticks, {}),
        ('yticks', axes.set_yticks, {}),
        ('xticklabels', axes.set_xticklabels, {}),
        ('yticklabels', axes.set_yticklabels, {}),
    )
    for key, setter, extra in setters:
        if key in props:
            setter(props[key], **extra)

    # Operate on the axes we were given rather than on pyplot's notion of
    # the "current" axes, so the result does not depend on global state.
    if 'yline' in props:
        for y in props['yline']:
            axes.axhline(y=y)
    if 'box' in props:
        frame_on = props['box']
        if frame_on is None:
            frame_on = not axes.get_frame_on()
        axes.set_frame_on(frame_on)


def _figure_create(props=None):
    """
    Create a figure with a single subplot and apply `props` to it.

    `figsize` and `dpi` (default 72) are read from `props`; the remaining
    entries are handled by `_figure_props`. Returns `(figure, axes)`.
    """
    props = props or {}
    figure = pyplot.figure(
        figsize=props.get('figsize'),
        dpi=props.get('dpi', 72))
    # Integer position spec: current matplotlib rejects the string '111'.
    axes = figure.add_subplot(111)
    _figure_props(axes, props)
    return figure, axes


def _figure_save(figure, outdir, imgname):
    """
    Save `figure` into `outdir` under a sanitized version of `imgname` and
    return an HTML `<img>` tag that references the saved file name.

    The figure is closed afterwards so that pyplot does not keep every
    figure of a long report alive.
    """
    figure.tight_layout()
    imgname = utils.safe_str(imgname)
    try:
        figure.savefig(os.path.join(outdir, imgname))
    finally:
        pyplot.close(figure)
    return "<img src=\"{}\"/>".format(imgname)
class BaseReport:
    """
    A base class that provides basic infrastructure for reporting
    diagnostics via HTML for a given run.

    This is intended to be sub-classed within a BioLite pipeline script, to
    define how the diagnostics for that pipeline should be summarized and
    plotted.
    """

    def __init__(self, id, run_id, outdir=None, verbose=False, hlevel=1):
        """
        Override init() instead of this function.

        Stores the arguments, creates the empty `data` dict, `js` list and
        `generators` list, calls `init()`, and finally registers the
        resource-usage lookup (entities matching "<name>.*.profile") and the
        `profile_table` generator, so the profile table comes last.
        """
        self.id = id
        self.run_id = run_id
        self.outdir = outdir
        self.verbose = verbose
        self.hlevel = hlevel
        self.name = ''
        self.data = utils.AttributeDict()
        self.js = []
        self.generators = []
        self.init()
        self.lookup('profile', "%s.*.profile" % self.name, func=diagnostics.lookup_like)
        self.generator(self.profile_table)

    def __nonzero__(self):
        """
        A report is true when at least one lookup stored data in it.
        """
        return bool(self.data)

    # Python 3 spelling of the truth-value hook.
    __bool__ = __nonzero__

    def __repr__(self):
        """
        Short debugging representation. It must be a string: returning a
        bool here makes `repr(report)` raise TypeError.
        """
        return "<%s id=%r run_id=%r>" % (
            self.__class__.__name__, self.id, self.run_id)

    def __str__(self):
        """
        Render the report as HTML by concatenating the output of every
        generator, in the order they were attached. Returns an empty string
        when no data was loaded.

        When a generator produces output and has a docstring, the docstring
        is rendered from reStructuredText to HTML and appended in a
        <blockquote> after that generator's output.
        """
        if not self.data:
            return ''
        html = []
        for func in self.generators:
            func_html = func()
            if not func_html:
                continue
            html += func_html
            if func.__doc__:
                doc = publish_parts(
                    textwrap.dedent(func.__doc__),
                    writer_name='html')
                html.append('<blockquote>%s</blockquote>' % doc['body'])
        # Join lines, collapse tabs and ignore empty lines in output.
        return '\n'.join([line.replace('\t', '') for line in html if line])
[docs] def init(self): """ Override this function with a series of lookup() and generator() calls that specify the diagnostics lookups needed by your report, and the sub-class functions that generate the HTML output. """ pass
[docs] def lookup(self, name, entity, attribute=None, func=diagnostics.lookup): """ Lookup data from the diagnotics table for the given entity and store it in the self.data dictoinary. """ values = func(self.run_id, entity) if values: if attribute: try: self.data[name] = values[attribute] except KeyError: pass else: self.data[name] = values
[docs] def query(self, name, sql, args, database=database): values = database.execute(sql, args).fetchall() if values: self.data[name] = values
[docs] def extract_arg(self, entity, arg): """ Parse the 'command' attrbute of 'entity' to find the value for the argument 'arg'. """ command = diagnostics.lookup(self.run_id, entity).get('command') if command: args = shlex.split(str(command)) for i, a in enumerate(args): if a == arg and (i+1) < len(args): return args[i+1] utils.info("no argument '%s' for command in '%s' for run %d" % (arg, entity, int(self.run_id))) else: utils.info("could not find 'commmand' in entity '%s' for run %d" % (entity, int(self.run_id))) return None
[docs] def add_js(self, name): """ Copy a Javascript file from the BioLite share directory, and include a reference to it in the HTML output. Current options are: * d3.min.js * jsphylosvg-min.js * raphael-min.js """ if not name in ('d3.min.js', 'jsphylosvg-min.js', 'raphael-min.js'): utils.info("warning: javascript name '%s' unrecognized" % name) self.js.append(name) shutil.copy( os.path.join(datadir, name), utils.safe_mkdir(os.path.join(self.outdir, 'js')))
[docs] def get_js(self): return map('<script type="text/javascript" src="js/{}"></script>'.format, self.js)
[docs] def generator(self, func): """ Add functions in your sub-class to the 'generators' list, and their list-of-strings output will be appended in order to the output of the object's __repr__ function. """ self.generators.append(func)
[docs] def check(self, *args): """ Check if multiple keys are in the report's data dictionary. Return true if all exists, otherwise false. """ for name in args: if name not in self.data: return False return True
[docs] def zip(self, *args): """ Zip together multiple items from the report's data dictionary. """ return zip(*[self.data[arg] for arg in args])
[docs] def header(self, html, level=0): return "<h{0}>{1}</h{0}>".format(self.hlevel + level, html)
[docs] def percent(self, data_name, field_name, num, div): try: data = self.data[data_name] data[field_name] = 100.0 * float(data[num]) / float(data[div]) except KeyError: pass
[docs] def str2list(self, name): """ Converts a diagnostics string with key `name` in `self.data` into a list, by parsing it as a typical Python list representation :samp:`[item1, item2, ... ]`. """ if name in self.data: self.data[name] = diagnostics.str2list(self.data[name])
[docs] def summarize(self, schema, name, attr=None): """ Returns a 2-column summary table of a pipeline's key statistics. """ if name in self.data: html = ['<table class="table table-striped table-condensed table-mini">'] if attr is not None: data = self.data[name][attr] else: data = self.data[name] for field in schema: row = "<tr><td><b>%s</b></td><td class=\"right\">%s</td></tr>" value = data.get(field.key) if not value is None: html.append(row % ( field.title, field.format.format(field.type(data[field.key])))) else: html.append(row % (field.title, '-')) html.append("</table>") return html
[docs] def table(self, rows, headers=None, style=None): """ Returns an HTML table with a row for each tuple in `rows`, and an option header row for the tuple `headers`. The `style` string indicates justification for a column as either l (left) or r (right). For example, 'lr' prints a table with the first column left-justified and the second column right-justified. """ styles = {'c': " style=\"center\"", 'r': " style=\"right\""} html = ['<table class="table table-striped table-condensed table-mini">'] if style: style = [styles.get(s, '') for s in style] else: style = [''] * max(map(len, rows)) if headers: html.append("<tr>") html += map("<th>{}</th>".format, headers) html.append("</tr>") for row in rows: html.append("<tr>") html += map("<td{0[0]}>{0[1]}</td>".format, zip(style, row)) html.append("</tr>") html.append("</table>") return html
[docs] def histogram(self, imgname, data, bins=100, props={}): """ Plots a histogram in dict `data` with the given number of `bins` to the file `imgname`. The keys of the dict should correspond to bins with width 1, and the values to frequencies. """ props['figsize'] = props.get('figsize', (9,3)) xmax = max(data.iterkeys()) bins = min(xmax + 1, bins) binsize = ((xmax - 1) / bins) + 1 # Rebin. old_data = data data = np.zeros(bins) for i, count in old_data.iteritems(): data[i / binsize] += count # Create an x range with the bin size. halfbin = 0.45 * float(binsize) xlim = (-halfbin, binsize * data.size - halfbin) x = np.arange(xlim[0], xlim[1], binsize) # Create the figure. props['xlim'] = props.get('xlim', xlim) props['yticks'] = props.get('yticks', [0, max(data)]) figure, axes = _figure_create(props) axes.bar(x, data, width=props.get('width', 0.9 * binsize), color=props.get('color', 'r')) return _figure_save(figure, self.outdir, imgname)
[docs] def histogram_categorical(self, imgname, data, props={}): """ Plots a histogram in dict `data` to the file `imgname`, using the keys as categories and values as frequencies. """ props['figsize'] = props.get('figsize', (9,3)) categories = sorted(data) frequencies = map(data.get, categories) # Create the figure. props['xlim'] = props.get('xlim', [-1.0, len(data)]) props['xticks'] = props.get('xticks', np.arange(len(data))) props['xticklabels'] = props.get('xticklabels', map(str, categories)) props['yticks'] = props.get('yticks', [0, max(frequencies)]) figure, axes = _figure_create(props) axes.bar(np.arange(len(data)) - 0.45, frequencies, width=props.get('width', 0.9), color=props.get('color', 'r')) return _figure_save(figure, self.outdir, imgname)
[docs] def histogram_overlay(self, imgname, hists, labels=None, bins=100, props={}): """ Plots up to 3 histograms over each other. Histograms are plotted in the order of the hists list, so that the last histogram is the topmost. The histograms are plotted with alpha=0.5 and colors red, blue, green. """ colors = ('r', 'b', 'g') props['figsize'] = props.get('figsize', (9,3)) xmax = max([hist[:,0].max() for hist in hists]) # Rebin. binsize = ((xmax - 1) / bins) + 1 rebin = list() for hist in hists: rebin.append(np.zeros(bins)) for i in xrange(hist.shape[0]): rebin[-1][int(hist[i][0] / binsize)] += hist[i][1] ymax = max([hist.max() for hist in rebin]) padding = 0.04 * ymax # Add padding to non-zero bins. for hist in rebin: hist[hist.nonzero()] += padding # Create an x range with the bin size. x = np.arange(0, binsize * bins, binsize) figure, axes = _figure_create(props) if not labels: labels = [None] * len(hists) for y, label, c in zip(rebin, labels, colors): axes.bar(x, y, label=label, color=c, width=binsize, alpha=0.5, bottom=-padding) pyplot.axhline(y=0, color='k') axes.set_xlim([0, binsize * bins]) axes.set_xticks([i * binsize * bins / 10 for i in range(0,11)]) axes.set_xticks([i * binsize for i in range(0,bins+1)], minor=True) axes.set_ylim([-padding, ymax]) axes.set_yticks([0, ymax]) font = matplotlib.font_manager.FontProperties() font.set_size('small') axes.legend(loc=props.get('loc'), prop=font) return _figure_save(figure, self.outdir, imgname)
[docs] def barplot(self, imgname, data, props={}): """ Plots bars for a dict `data` to the file `imgname`, using the keys as categories and values as heights. """ props['figsize'] = props.get('figsize', (9,3)) categories = sorted(data) frequencies = map(data.get, categories) # Create the figure. props['xlim'] = props.get('xlim', [-1.0, len(data)]) props['xticks'] = props.get('xticks', np.arange(len(data))) props['xticklabels'] = props.get('xticklabels', map(str, categories)) figure, axes = _figure_create(props) axes.bar(np.arange(len(data)) - 0.45, frequencies, width=props.get('width', 0.9), color=props.get('color', 'r')) return _figure_save(figure, self.outdir, imgname)
[docs] def scatterplot(self, imgname, plot, props={}): """ Plots the (X,Y) points given in `plot` to the file `imgname`. `plot` should be a tuple of the form `(x, y, ...)` where x and y are list or nparray objects and any additional fields are parameters to the matplotlib `plot` function (such as color or label). """ # Create the figure. figure, axes = _figure_create(props) axes.plot(*plot) return _figure_save(figure, self.outdir, imgname)
[docs] def multiscatterplot(self, imgname, plots, props={}): """ Plots multiple sets of (X,Y) points given in `plots` to the file `imgname`. `plots` should be a list of tuples of the form `(x, y, color, label)` where x and y are list or nparray objects, color is a matplotlib color specification (for instance, 'r' for red) and label is a string. """ # Create the figure. figure, axes = _figure_create(props) for x, y, c, label in plots: axes.plot(x, y, c, label=label, mfc='none', mec=c[0]) font = matplotlib.font_manager.FontProperties() font.set_size('small') axes.legend(loc=props.get('loc'), prop=font) return _figure_save(figure, self.outdir, imgname)
[docs] def lineplot(self, imgname, data, props={}): """ Plots a single line for the values in `data` to the file `imgname`. """ # Create the figure. figure, axes = _figure_create(props) axes.plot(data, color=props.get('color', 'r')) return _figure_save(figure, self.outdir, imgname)
[docs] def multilineplot(self, imgname, plots, props={}): """ Plots multiple lines, one for each `(x, y, label)` tuple in the `plots` list, to the file `imgname`. """ # Create the figure. figure, axes = _figure_create(props) for x, y, label in plots: axes.plot(x, y, label=label, marker=props.get('marker'), mfc=props.get('mfc')) font = matplotlib.font_manager.FontProperties() font.set_size('small') axes.legend(loc=props.get('loc', 'lower right'), prop=font) return _figure_save(figure, self.outdir, imgname)
[docs] def imageplot(self, imgname, matrix, props={}, vmin=0.0, vmax=1.0): """ Plots a 2D `matrix` as an image to the filename `imgname`. """ # Create the figure. figure, axes = _figure_create(props) axes.imshow(matrix, cmap=cm.gray, interpolation='none', aspect='auto', vmin=vmin, vmax=vmax) return _figure_save(figure, self.outdir, imgname)
[docs] def profile_table(self): # CPU and memory usage for each command called by this pipeline. if 'profile' in self.data: html = [self.header("Resourse Usage")] # The aggregate row always gets written. agg = profile_aggregate(self.data.profile.values()) html += ["<table class=\"table\">", "<tr>"] for field in profile_schema: if field.key in profile_aggregators: html.append('<th>%s</th>' % field.title) html.append('</tr>') for field in profile_schema: if field.key in profile_aggregators: try: s = field.format.format(field.type(agg[field.key])) except: s = '-' op = profile_aggregators.get(field.key, sum).__name__ html.append('<td class="right">%s [%s]</td>' % (s, op)) html += [ '</tr>', '</table>' ] # The detailed rows are hidden by default. html += [ '<a class="btn btn-info" onclick="togglestats(this)"><i class="icon-list-alt icon-white"></i> Show/hide details</a>', '<table class="stats table table-striped table-condensed">'] headers = [field.title for field in profile_schema] # Write the column headers. html.append("<tr><th>") html.append("</th><th>".join(headers)) html.append("</th></tr>") # Write the values. for entity in self.data.profile: profile = self.data.profile[entity] tokens = entity.split('.') profile['entity'] = '.'.join(tokens[1:-2]) html.append("<tr>") # First two columns are strings and don't use "right" <td> for field in profile_schema[:2]: try: html += [ "<td>", field.format.format(field.type(profile[field.key])), "</td>"] except KeyError: html.append("<td class=\"center\">&ndash;</td>") for field in profile_schema[2:]: try: html += [ "<td class=\"right\">", field.format.format(field.type(profile[field.key])), "</td>"] except KeyError: html.append("<td class=\"center\">&ndash;</td>") html.append("</tr>") html += ["</tr>", "</table>"] return html