# BioLite - Tools for processing gene sequence data and automating workflows
# Copyright (c) 2012 Brown University. All rights reserved.
#
# This file is part of BioLite.
#
# BioLite is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# BioLite is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with BioLite. If not, see <http://www.gnu.org/licenses/>.
"""
Provides a framework for generating HTML reports from BioLite diagnostics.
The typical usage is to extend the `BaseReport` class for each pipeline, and
override the `init` method to specify **lookups** and **generators**.
**Lookups** are called with `self.lookup` and specify entities or attributes
that should be loaded from the diagnostics into the `self.data` AttributeDict.
For example::
self.lookup('args', diagnostics.INIT)
will load the initialization entity, which includes all of the command-line
arguments passed to the pipeline for a given run.
**Generators** are functions that return lists of HTML lines, which are
concatenated together to form the final HTML report, in the order that the
generators are attached. A generator function will typically start by checking
if a diagnostics value was successfully loaded into `self.data`, e.g.::
def report_arguments(self):
if 'args' in self.data:
html = [self.header('Arguments')]
html += ['<p>%s</p>' % a for a in self.data.args]
return html
The generator is attached to the report in the `init` method with the line::
self.generator(self.report_arguments)
"""
import os
import re
import shlex
from collections import namedtuple, defaultdict
from docutils.core import publish_parts
import matplotlib
matplotlib.use('Agg')
matplotlib.rcParams['axes.formatter.limits'] = '-10,10'
import matplotlib.pyplot as pyplot
import numpy as np
import database
import diagnostics
import utils
# CSS text for generated reports: collapsed, bordered tables with padded
# cells, plus the `right`, `center` and `minimal` classes referenced by the
# HTML produced elsewhere in this module.
stylesheet = """
body { font-family: sans-serif; }
table { border-collapse: collapse; }
table, th, td { border: 1px solid black; }
th, td { padding: 5px; }
td.right { text-align: right; font-family: monospace }
td.center { text-align: center; }
table.minimal { border: 0; border-bottom: 1px solid black; }
tr.minimal { border: 0; border-top: 1px solid black; }
td.minimal { border: 0; vertical-align: top; padding-right: 25px }
"""
# Script block defining togglestats(element), which flips the `display`
# style of the descendant matching "#stats" between "block" and "none".
# It is the onclick handler of the wrapper div emitted by
# BaseReport.profile_table.
javascript = """
<script type="text/javascript">
function togglestats(element){
// Toggles the visibility of a child div of element called
element.querySelector("#stats").style.display == "block" ? element.querySelector("#stats").style.display = "none" :
element.querySelector("#stats").style.display = "block";
}
</script>
"""
# One table column: `key` indexes a profile dict, `title` is the column
# heading, `type` coerces the raw value and `format` is the str.format
# template used to display it.
Field = namedtuple('Field', ['key', 'title', 'type', 'format'])

# Columns of the resource-usage table, in display order. The first two are
# descriptive strings; the rest are numeric measurements.
profile_schema = tuple(
    Field(*spec) for spec in (
        ('name', 'Command', str, '{}'),
        ('entity', 'Stage', str, '{}'),
        ('walltime', 'Wall Time (s)', float, '{:.2f}'),
        ('usertime', 'User Time (s)', float, '{:.2f}'),
        ('systime', 'System Time (s)', float, '{:.2f}'),
        ('mem', 'Memory (KB)', int, '{:,d}'),
        ('vmem', 'Virtual Memory (KB)', int, '{:,d}'),
    ))

# How each numeric column is collapsed into a single summary value:
# times are added up, memory figures take their maximum.
profile_aggregators = dict(
    walltime=sum,
    usertime=sum,
    systime=sum,
    mem=max,
    vmem=max)
def profile_aggregate(profiles):
    """
    Collapses a list of profile dicts into one dict of summary values.

    For every numeric field of `profile_schema` that has an entry in
    `profile_aggregators`, the values found across `profiles` are coerced
    with the field's type and reduced with that field's aggregator (sum or
    max). Profiles lacking a key are skipped for that field, and a field
    for which no values were found is omitted from the result.
    """
    # Skip the two descriptive columns; keep only fields that aggregate.
    numeric_fields = [
        field for field in profile_schema[2:]
        if field.key in profile_aggregators]

    # Gather the coerced samples for each field, e.g. all 'walltime' values.
    samples = defaultdict(list)
    for profile in profiles:
        for field in numeric_fields:
            try:
                samples[field.key].append(field.type(profile[field.key]))
            except KeyError:
                pass

    # Reduce each non-empty sample list with its aggregator.
    aggregates = dict()
    for field in numeric_fields:
        collected = samples.get(field.key)
        if collected:
            reducer = profile_aggregators[field.key]
            aggregates[field.key] = reducer(collected)
    return aggregates
def _figure_props(axes, props):
    """
    Applies the recognised entries of the dict `props` to `axes`.

    Recognised keys are title, xlabel, ylabel, xlim, ylim, xscale, yscale,
    xticks, yticks, xticklabels, yticklabels, yline (an iterable of y values
    at which horizontal lines are drawn) and box. Other keys are ignored.
    The xlim, ylim, yline and box entries go through `pyplot`, i.e. they
    act on pyplot's current axes rather than on `axes` directly.
    """
    def draw_hlines(values):
        for y in values:
            pyplot.axhline(y=y)

    # Applied strictly in this order, one entry per supported key.
    steps = (
        ('title', lambda v: axes.set_title(v)),
        ('xlabel', lambda v: axes.set_xlabel(v, fontsize=12)),
        ('ylabel', lambda v: axes.set_ylabel(v, fontsize=12)),
        ('xlim', lambda v: pyplot.xlim(v)),
        ('ylim', lambda v: pyplot.ylim(v)),
        ('xscale', lambda v: axes.set_xscale(v)),
        ('yscale', lambda v: axes.set_yscale(v)),
        ('xticks', lambda v: axes.set_xticks(v)),
        ('yticks', lambda v: axes.set_yticks(v)),
        ('xticklabels', lambda v: axes.set_xticklabels(v)),
        ('yticklabels', lambda v: axes.set_yticklabels(v)),
        ('yline', draw_hlines),
        ('box', lambda v: pyplot.box(v)),
    )
    for key, apply_prop in steps:
        if key in props:
            apply_prop(props[key])
def _figure_create(props=None):
    """
    Creates a new pyplot figure with a single subplot and applies `props`.

    `props` is an optional dict; 'figsize' and 'dpi' (default 72) are used
    for the figure itself and the whole dict is then handed to
    `_figure_props` to configure the axes. The dict is only read.

    Returns the tuple `(figure, axes)`.
    """
    if props is None:
        props = {}
    figure = pyplot.figure(
        figsize=props.get('figsize'),
        dpi=props.get('dpi', 72))
    # Use the integer subplot spec: the string form '111' is rejected by
    # newer matplotlib releases, while 111 works on old and new versions.
    axes = figure.add_subplot(111)
    _figure_props(axes, props)
    return figure, axes
def _figure_save(figure, outdir, imgname):
    """
    Writes `figure` to the directory `outdir` and returns an `<img>` tag.

    `imgname` is passed through `utils.safe_str` and the result is used both
    as the file name under `outdir` and as the `src` of the returned tag.
    The figure is closed afterwards, even if saving fails, so that pyplot
    does not keep every figure of a long report alive.
    """
    imgname = utils.safe_str(imgname)
    try:
        # Lay out the figure that is being saved, not whatever pyplot
        # currently considers the "current" figure.
        figure.tight_layout()
        figure.savefig(os.path.join(outdir, imgname))
    finally:
        pyplot.close(figure)
    return "<img src=\"{}\"/>".format(imgname)
[docs]class BaseReport:
"""
A base class that provides basic infrastructure for reporting
diagnostics via HTML for a given run.
This is intended to be sub-classed within a BioLite pipeline script,
to define how the diagnostics for that pipeline should be summarized
and plotted.
"""
def __init__(self, id, run_id, outdir=None, verbose=False, hlevel=1):
    """
    Stores the arguments, runs the sub-class `init()` hook, then attaches
    the profile lookup and the `profile_table` generator.

    Sub-classes should override `init()` rather than this constructor.
    """
    # Constructor arguments are kept verbatim on the instance.
    self.id, self.run_id = id, run_id
    self.outdir, self.verbose, self.hlevel = outdir, verbose, hlevel

    # State that the init() hook may modify through lookup()/generator().
    self.name = ''
    self.data = utils.AttributeDict()
    self.generators = []
    self.init()

    # Registered after init() so the pattern uses the name chosen there and
    # the profile table comes last among the generators.
    profile_pattern = "%s.*.profile" % self.name
    self.lookup('profile', profile_pattern, func=diagnostics.lookup_like)
    self.generator(self.profile_table)
def __repr__(self):
    """
    Returns a short, single-line description of the report object.

    The previous implementation returned a bool, which makes `repr(obj)`
    raise `TypeError` because `__repr__` must return a string. The
    description states whether any diagnostics were loaded into `self.data`.
    """
    # getattr() keeps repr() usable on partially initialised instances.
    state = "with data" if getattr(self, 'data', None) else "empty"
    return "<{0} id={1!r} run_id={2!r} ({3})>".format(
        self.__class__.__name__,
        getattr(self, 'id', None),
        getattr(self, 'run_id', None),
        state)
def __str__(self):
    """
    Renders the report as a single HTML string.

    Each attached generator is called in order and its list of lines is
    appended to the output, followed by the generator's docstring converted
    to HTML with docutils. Returns an empty string when `self.data` is
    empty.
    """
    if self.data:
        html = []
        for func in self.generators:
            func_html = func()
            if func_html:
                html += func_html
                # NOTE(review): nesting reconstructed from a listing without
                # indentation -- the docstring is assumed to be emitted only
                # for generators that produced output; confirm against the
                # original source.
                if func.__doc__:
                    doc = publish_parts(func.__doc__, writer_name='html')
                    html.append(doc['body'])
        # Join lines, strip tab characters and ignore empty lines in output.
        return '\n'.join([line.replace('\t','') for line in html if line])
    else:
        return ''
def init(self):
    """
    Hook for sub-classes; the base implementation does nothing.

    Override it with the lookup() calls that load the diagnostics your
    report needs and the generator() calls that attach the sub-class
    functions producing the HTML output.
    """
    return None
def lookup(self, name, entity, attribute=None, func=diagnostics.lookup):
    """
    Looks up `entity` for this run with `func(self.run_id, entity)` and
    stores the result in the `self.data` dictionary under `name`.

    If `attribute` is given, only that item of the result is stored, and
    nothing is stored when the result lacks it. Nothing is stored either
    when the lookup returns an empty value.
    """
    found = func(self.run_id, entity)
    if not found:
        return
    if not attribute:
        self.data[name] = found
        return
    try:
        self.data[name] = found[attribute]
    except KeyError:
        # The entity exists but has no such attribute: leave data untouched.
        pass
def query(self, name, sql, args, database=database):
    """
    Runs `database.execute(sql, args)`, fetches all rows and, if any were
    returned, stores them in `self.data` under `name`.
    """
    rows = database.execute(sql, args).fetchall()
    if not rows:
        return
    self.data[name] = rows
def generator(self, func):
    """
    Add functions in your sub-class to the 'generators' list, and their
    list-of-strings output will be appended in order to the output of the
    object's __str__ function.
    """
    self.generators.append(func)
def check(self, *args):
    """
    Tells whether every key named in `args` is present in the report's
    data dictionary. Returns True when all are (or when no keys are given),
    otherwise False.
    """
    return all(key in self.data for key in args)
def zip(self, *args):
    """
    Zip together multiple items from the report's data dictionary.

    Each name in `args` is looked up in `self.data` and the resulting
    sequences are combined element-wise with the built-in `zip`. A missing
    name raises `KeyError`.
    """
    return zip(*[self.data[arg] for arg in args])
def percent(self, data_name, field_name, num, div):
    """
    Stores a percentage in one of the report's data entries.

    Computes `100 * data[num] / data[div]` for `data = self.data[data_name]`
    and writes it to `data[field_name]`. This is best-effort: if the entry
    or either key is missing, or the denominator is zero, nothing is
    written, so a degenerate run does not abort report generation.
    """
    try:
        record = self.data[data_name]
        record[field_name] = 100.0 * float(record[num]) / float(record[div])
    except (KeyError, ZeroDivisionError):
        pass
def str2list(self, name):
    """
    Replaces the diagnostics string stored under `name` in `self.data` with
    the list produced by `diagnostics.str2list`, which parses a typical
    Python list representation :samp:`[item1, item2, ... ]`.
    Does nothing when `name` is not in `self.data`.
    """
    if name not in self.data:
        return
    raw = self.data[name]
    self.data[name] = diagnostics.str2list(raw)
def summarize(self, schema, name, attr=None):
    """
    Builds a 2-column HTML table (title, value) of key statistics.

    The values come from `self.data[name]`, or from `self.data[name][attr]`
    when `attr` is given. Each field of `schema` yields one row: the value
    is coerced with the field's type and rendered with its format string,
    or shown as '-' when it is missing or None.

    Returns the table as a list of HTML lines, or None when `name` is not
    in `self.data`.
    """
    if name not in self.data:
        return None
    source = self.data[name]
    if attr is not None:
        source = source[attr]
    row = "<tr><td><b>%s</b></td><td class=\"right\">%s</td></tr>"
    html = ["<table>"]
    for field in schema:
        raw = source.get(field.key)
        if raw is None:
            text = '-'
        else:
            text = field.format.format(field.type(raw))
        html.append(row % (field.title, text))
    html.append("</table>")
    return html
def table(self, rows, headers=None, style=None):
    """
    Returns an HTML table, as a list of lines, with a row for each tuple in
    `rows` and an optional header row for the tuple `headers`.

    The `style` string gives the justification of each column, one
    character per column: 'r' (right) and 'c' (center) select the
    `td.right` / `td.center` stylesheet classes, any other character
    (e.g. 'l') leaves the cell unstyled. For example, 'lr' prints a table
    with the first column left-justified and the second right-justified.
    Columns beyond the end of `style` are left unstyled, and an empty
    `rows` produces an empty table.
    """
    # The stylesheet defines classes, so the cells need a class attribute;
    # style="right" is not valid CSS and has no effect.
    css_classes = {'c': ' class="center"', 'r': ' class="right"'}
    rows = [tuple(row) for row in rows]
    width = max([len(row) for row in rows]) if rows else 0

    # One attribute string per column, padded so no cell is dropped.
    attrs = [css_classes.get(code, '') for code in (style or '')]
    attrs += [''] * (width - len(attrs))

    html = ["<table>"]
    if headers:
        html.append("<tr>")
        html.extend("<th>{0}</th>".format(title) for title in headers)
        html.append("</tr>")
    for row in rows:
        html.append("<tr>")
        html.extend(
            "<td{0}>{1}</td>".format(attr, cell)
            for attr, cell in zip(attrs, row))
        html.append("</tr>")
    html.append("</table>")
    return html
def histogram(self, imgname, data, bins=100, props=None):
    """
    Plots a histogram in dict `data` with at most `bins` bars to the file
    `imgname`. The keys of the dict should correspond to bins with width 1
    (non-negative integers), and the values to frequencies. Keys are
    regrouped into equal, integer-width bins when there are more distinct
    positions than `bins`.

    `props` is an optional dict of figure properties (see `_figure_props`),
    plus 'width' and 'color' for the bars. It is copied, so neither the
    caller's dict nor later calls are affected by the defaults filled in.

    Returns the `<img>` tag produced by `_figure_save`. Raises `ValueError`
    if `data` is empty.
    """
    props = dict(props) if props else {}
    props.setdefault('figsize', (9, 3))

    xmax = max(data)
    bins = int(min(xmax + 1, bins))
    # Smallest integer bin width such that key xmax still falls inside the
    # last bin; (xmax - 1) // bins + 1 overflows when xmax % bins == 0.
    binsize = int(xmax) // bins + 1

    # Rebin.
    counts = np.zeros(bins)
    for key, count in data.items():
        counts[int(key // binsize)] += count

    # Left edge of every bar, leaving 5% of a bin free on either side.
    halfbin = 0.45 * float(binsize)
    xlim = (-halfbin, binsize * counts.size - halfbin)
    x = np.arange(counts.size) * binsize - halfbin

    # Create the figure.
    props.setdefault('xlim', xlim)
    props.setdefault('yticks', [0, max(counts)])
    figure, axes = _figure_create(props)
    # x holds left edges; be explicit, since the default alignment of bar()
    # differs between matplotlib versions.
    axes.bar(x, counts,
        width=props.get('width', 0.9 * binsize),
        color=props.get('color', 'r'),
        align='edge')
    return _figure_save(figure, self.outdir, imgname)
def histogram_categorical(self, imgname, data, props=None):
    """
    Plots a histogram in dict `data` to the file `imgname`, using the
    sorted keys as categories and the values as frequencies.

    `props` is an optional dict of figure properties (see `_figure_props`),
    plus 'width' and 'color' for the bars. It is copied, so the tick
    positions and labels computed for one plot never leak into the caller's
    dict or into later calls.

    Returns the `<img>` tag produced by `_figure_save`.
    """
    props = dict(props) if props else {}
    props.setdefault('figsize', (9, 3))

    categories = sorted(data)
    # A real list: it is consumed twice (y-axis maximum and bar heights).
    frequencies = [data[category] for category in categories]
    positions = np.arange(len(categories))

    # Create the figure.
    props.setdefault('xlim', [-1.0, len(categories)])
    props.setdefault('xticks', positions)
    props.setdefault('xticklabels', [str(c) for c in categories])
    props.setdefault('yticks', [0, max(frequencies) if frequencies else 0])
    figure, axes = _figure_create(props)
    # Positions are shifted to left edges; be explicit, since the default
    # alignment of bar() differs between matplotlib versions.
    axes.bar(positions - 0.45, frequencies,
        width=props.get('width', 0.9),
        color=props.get('color', 'r'),
        align='edge')
    return _figure_save(figure, self.outdir, imgname)
def histogram_overlay(self, imgname, hists, labels=None, bins=100, props=None):
    """
    Plots up to 3 histograms over each other. Histograms are plotted in the
    order of the hists list, so that the last histogram is the topmost.
    The histograms are plotted with alpha=0.5 and colors red, blue, green.

    Each item of `hists` is indexed as a 2-D array whose first column holds
    the (non-negative) bin position and whose second column holds the
    frequency. Positions are regrouped into `bins` equal, integer-width
    bins. `labels` optionally gives one legend label per histogram.

    `props` is an optional dict of figure properties (see `_figure_props`),
    plus 'loc' for the legend position. It is copied rather than modified.

    Returns the `<img>` tag produced by `_figure_save`.
    """
    colors = ('r', 'b', 'g')
    props = dict(props) if props else {}
    props.setdefault('figsize', (9, 3))

    xmax = max([hist[:, 0].max() for hist in hists])
    # Smallest integer bin width such that position xmax still falls inside
    # the last bin; (xmax - 1) // bins + 1 overflows when xmax % bins == 0.
    binsize = int(xmax) // bins + 1

    # Rebin.
    rebin = []
    for hist in hists:
        counts = np.zeros(bins)
        for row in hist:
            counts[int(row[0] // binsize)] += row[1]
        rebin.append(counts)
    ymax = max([counts.max() for counts in rebin])
    padding = 0.04 * ymax

    # Add padding to non-zero bins. The bars start at -padding, so empty
    # bins stay hidden below the zero line while occupied ones reach their
    # true height above it.
    for counts in rebin:
        counts[counts.nonzero()] += padding

    # Create an x range with the bin size.
    x = np.arange(0, binsize * bins, binsize)
    figure, axes = _figure_create(props)
    if not labels:
        labels = [None] * len(hists)
    for y, label, c in zip(rebin, labels, colors):
        # x holds left edges; be explicit, since the default alignment of
        # bar() differs between matplotlib versions.
        axes.bar(x, y, label=label, color=c, width=binsize, alpha=0.5,
            bottom=-padding, align='edge')
    pyplot.axhline(y=0, color='k')
    axes.set_xlim([0, binsize * bins])
    axes.set_xticks([i * binsize * bins // 10 for i in range(0, 11)])
    axes.set_xticks([i * binsize for i in range(0, bins + 1)], minor=True)
    axes.set_ylim([-padding, ymax])
    axes.set_yticks([0, ymax])
    font = matplotlib.font_manager.FontProperties()
    font.set_size('small')
    axes.legend(loc=props.get('loc'), prop=font)
    return _figure_save(figure, self.outdir, imgname)
def barplot(self, imgname, data, props=None):
    """
    Plots bars for a dict `data` to the file `imgname`, using the sorted
    keys as categories and the values as heights.

    `props` is an optional dict of figure properties (see `_figure_props`),
    plus 'width' and 'color' for the bars. It is copied, so the tick
    positions and labels computed for one plot never leak into the caller's
    dict or into later calls.

    Returns the `<img>` tag produced by `_figure_save`.
    """
    props = dict(props) if props else {}
    props.setdefault('figsize', (9, 3))

    categories = sorted(data)
    heights = [data[category] for category in categories]
    positions = np.arange(len(categories))

    # Create the figure.
    props.setdefault('xlim', [-1.0, len(categories)])
    props.setdefault('xticks', positions)
    props.setdefault('xticklabels', [str(c) for c in categories])
    figure, axes = _figure_create(props)
    # Positions are shifted to left edges; be explicit, since the default
    # alignment of bar() differs between matplotlib versions.
    axes.bar(positions - 0.45, heights,
        width=props.get('width', 0.9),
        color=props.get('color', 'r'),
        align='edge')
    return _figure_save(figure, self.outdir, imgname)
def scatterplot(self, imgname, plot, props={}):
    """
    Draws the (X,Y) points given in `plot` and saves them to `imgname`.

    `plot` is a tuple of the form `(x, y, ...)`; it is unpacked as the
    positional arguments of the matplotlib `plot` function, so x and y are
    list or nparray objects and any further items are extra `plot`
    arguments (such as a format string). `props` configures the figure.

    Returns the `<img>` tag produced by `_figure_save`.
    """
    fig, ax = _figure_create(props)
    ax.plot(*plot)
    img_tag = _figure_save(fig, self.outdir, imgname)
    return img_tag
def multiscatterplot(self, imgname, plots, props={}):
    """
    Draws several sets of (X,Y) points on one figure, with a legend, and
    saves it to the file `imgname`.

    `plots` is a list of tuples of the form `(x, y, color, label)` where x
    and y are list or nparray objects, color is a matplotlib format string
    (for instance, 'r' for red) and label is a string. Markers are drawn
    unfilled, with an edge color taken from the first character of `color`.
    `props` configures the figure; its 'loc' entry positions the legend.

    Returns the `<img>` tag produced by `_figure_save`.
    """
    fig, ax = _figure_create(props)
    for xs, ys, fmt, label in plots:
        ax.plot(xs, ys, fmt, label=label, mfc='none', mec=fmt[0])
    legend_font = matplotlib.font_manager.FontProperties()
    legend_font.set_size('small')
    ax.legend(loc=props.get('loc'), prop=legend_font)
    return _figure_save(fig, self.outdir, imgname)
def lineplot(self, imgname, data, props={}):
    """
    Draws the values in `data` as a single line and saves the figure to the
    file `imgname`. The line color is `props['color']`, red by default.

    Returns the `<img>` tag produced by `_figure_save`.
    """
    fig, ax = _figure_create(props)
    line_color = props.get('color', 'r')
    ax.plot(data, color=line_color)
    return _figure_save(fig, self.outdir, imgname)
def multilineplot(self, imgname, plots, props={}):
    """
    Draws one line per `(x, y, label)` tuple of the `plots` list on a
    single figure, with a legend, and saves it to the file `imgname`.

    The 'marker' and 'mfc' entries of `props` are applied to every line and
    'loc' positions the legend (default 'lower right').

    Returns the `<img>` tag produced by `_figure_save`.
    """
    fig, ax = _figure_create(props)
    marker, face = props.get('marker'), props.get('mfc')
    for xs, ys, label in plots:
        ax.plot(xs, ys, label=label, marker=marker, mfc=face)
    legend_font = matplotlib.font_manager.FontProperties()
    legend_font.set_size('small')
    ax.legend(loc=props.get('loc', 'lower right'), prop=legend_font)
    return _figure_save(fig, self.outdir, imgname)
def profile_table(self):
    """
    CPU and memory usage for each command called by this pipeline.
    """
    # NOTE: the docstring above is rendered into the report by __str__, so
    # it is user-facing text; implementation notes belong in comments.
    #
    # Produces two tables: a one-row summary of the aggregated profile
    # values (always visible), and a per-command table wrapped in a div
    # whose visibility is toggled by the `togglestats` script.
    # Returns a list of HTML lines, or None when no profile was loaded.
    if 'profile' not in self.data:
        return None
    profiles = self.data.profile
    html = [self.header("Resource Usage")]

    # The aggregate row always gets written.
    agg_fields = [
        field for field in profile_schema
        if field.key in profile_aggregators]
    agg = profile_aggregate(profiles.values())
    html += ["<table class=\"profile\">", "<tr>"]
    html += ['<th>%s</th>' % field.title for field in agg_fields]
    # Close the heading row and open the value row, so the cells below are
    # inside a <tr>.
    html += ['</tr>', '<tr>']
    for field in agg_fields:
        try:
            s = field.format.format(field.type(agg[field.key]))
        except (KeyError, TypeError, ValueError):
            # No usable value was aggregated for this column.
            s = '-'
        op = profile_aggregators[field.key].__name__
        html.append('<td class="right">%s [%s]</td>' % (s, op))
    html += ['</tr>', '</table>']

    # The detailed rows are hidden by default.
    html += [
        '<div onclick="togglestats(this)">',
        '<div style="color:blue">',
        'Click here to show/hide full resource usage statistics',
        '</div>',
        '<div id="stats" style="display:none;" >']
    html.append("<table class=\"profile\">")
    # Write the column headers.
    html.append("<tr><th>")
    html.append("</th><th>".join([field.title for field in profile_schema]))
    html.append("</th></tr>")
    # Write the values.
    for entity in profiles:
        profile = profiles[entity]
        # The stage shown is the entity name without its first component
        # and without its last two components.
        profile['entity'] = '.'.join(entity.split('.')[1:-2])
        html.append("<tr>")
        for index, field in enumerate(profile_schema):
            # First two columns are strings and don't use "right" <td>
            opening = "<td>" if index < 2 else "<td class=\"right\">"
            try:
                html += [
                    opening,
                    field.format.format(field.type(profile[field.key])),
                    "</td>"]
            except KeyError:
                # ASCII entity instead of a raw dash character, so the
                # source needs no encoding declaration.
                html.append("<td class=\"center\">&ndash;</td>")
        html.append("</tr>")
    html.append("</table>")
    html += ["</div></div>"] # Close out the divs that allow display toggling
    return html