# BioLite - Tools for processing gene sequence data and automating workflows
# Copyright (c) 2012-2013 Brown University. All rights reserved.
#
# This file is part of BioLite.
#
# BioLite is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# BioLite is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with BioLite. If not, see <http://www.gnu.org/licenses/>.
"""
Diagnostics usually come in the form of plots or summary statistics.
They can serve many purposes, such as:
* diagnosing problems in sample preparation and optimizing future preparations;
* providing feedback on the sequencing itself, e.g. on read quality;
* implementing 'sanity checks' at intermediate steps of analysis;
* finding optimal parameters by comparing previous runs;
* recording computational and storage demands, and predicting future demands.
The *diagnostics* database table archives summary statistics that can be
accessed across multiple stages of a pipeline, from different pipelines, and in
HTML reports.
A diagnostics record looks like:
::
catalog_id | run_id | entity | attribute | value | timestamp
The `entity` field acts as a namespace to prevent attribute collisions, since
the same attribute name can arise multiple times within a pipeline run.
When running a BioLite pipeline, the default entity is the pipeline name plus
the stage name, so that values can be traced to the pipeline and stage during
which they were entered. Entries in the diagnostics table can include paths to
derivative files, which can be summaries of intermediate files that are used to
generate reports or intermediate data files that serve as input to other stages
and pipelines.
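For example, a hypothetical pipeline named 'assemble' would log values from
its 'filter_reads' stage under the entity::
    assemble.filter_reads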
Initializing
------------
Before logging to diagnostics, your script must initialize this module with a
BioLite catalog ID and a name for the run using the `init` function. This
returns a new run ID from the :ref:`runs-table`. Optionally, you can pass an
existing run ID to `init` to continue a previous run.
Diagnostics are automatically initialized by the Pipeline and IlluminaPipeline
classes in the :ref:`pipeline-module`.
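A minimal initialization sketch (the catalog ID and run name here are made
up)::
    from biolite import diagnostics
    # Start a new run for catalog entry 'SRX0000001'; a new run ID is
    # allocated in the runs table and returned as a string.
    run_id = diagnostics.init('SRX0000001', 'assemble')
    # Or continue a previous run by passing its run ID explicitly:
    # diagnostics.init('SRX0000001', 'assemble', run_id=run_id)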
Logging a record
----------------
Use the `log` function described below.
Detailed system utilization statistics, including memory high-water marks and
compute wall-time are recorded automatically (by the wrapper base class) for
any wrapper that your pipeline calls, and for the overall pipeline itself.
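A sketch of logging values by hand from within a pipeline stage (the
attribute names and values here are illustrative)::
    from biolite import diagnostics
    # With the entity prefix managed by the pipeline, this creates a record
    # such as '<pipeline>.<stage>' / 'n_reads' / '1000000'.
    diagnostics.log('n_reads', 1000000)
    # Conveniences built on log():
    diagnostics.log_dict({'mean_quality': 34.2, 'gc_content': 0.41})
    diagnostics.log_path('filtered.fastq')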
Provenance
----------
Because every wrapper call is automatically logged, the diagnostics table holds
a complete non-executable history of the analysis, which complements the
original scripts that were used to run the analysis. In combination, the
diagnostics table and original scripts provide provenance for all analyses.
"""
import ast
import atexit
import datetime
import getpass
import hashlib
import os
import re
import resource
import socket
import time
from collections import defaultdict, OrderedDict, namedtuple
import config
import database
import utils
# Internal data.
_id = None
_run_id = None
_diag_file = None
_prog_file = None
_diag_cache = defaultdict(dict)
_prog_cache = dict()
_profiled = False
# Used by parse_program_output: compile once when this module loads.
_parse_pattern = re.compile(r"^\[biolite\.?(\w*)\]\s+(\S+)=(\S+)")
OutputPattern = namedtuple('OutputPattern', "re entity attr")
# External data.
prefix = list()
INIT = '__init__'
EXIT = '__exit__'
run_fields = [column[0] for column in database.runs_schema]
diagnostic_fields = [column[0] for column in database.diagnostics_schema]
### Helper functions. ###
def _escape(s):
return s.replace('\t','\\t').replace('\n','\\n')
def _unescape(s):
return s.replace('\\t','\t').replace('\\n','\n')
def timestamp():
"""
Returns the current time in ISO 8601 format, e.g.
:samp:`YYYY-MM-DDTHH:MM:SS[.mmmmmm][+HH:MM]`.
"""
return datetime.datetime.now().isoformat()
def str2list(data):
	"""
	Converts a diagnostics value string `data` into a list, by parsing it as
	a typical Python list representation :samp:`[item1, item2, ... ]`.
	"""
return ast.literal_eval(data)
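# For example, a value logged as str([1, 2, 3]) can be recovered with
# str2list("[1, 2, 3]") == [1, 2, 3]; ast.literal_eval handles nested lists,
# numbers and strings, but not arbitrary expressions.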
def get_run_id():
"""Returns the `run_id` (as a string)"""
return _run_id
def get_entity():
"""
Returns the current `entity` as a dot-delimited string.
"""
global prefix
return '.'.join(prefix)
### Initialization. ###
def init(id, name, run_id=None, workdir=None):
"""
By default, appends to a file `diagnostics.txt` in the current working
directory, but you can override this with the `workdir` argument.
You must specify a catalog `id` and a `name` for the run. If no `run_id`
is specified, an auto-incremented run ID will be allocated by inserting
a new row into the :ref:`runs-table`.
Returns the `run_id` (as a string).
"""
	global _id, _diag_file, _prog_file, _run_id
	# Evaluate the default working directory at call time, not import time.
	if workdir is None:
		workdir = os.getcwd()
	_id = id
if run_id is None:
cursor = database.execute("""
INSERT INTO runs (id,name,hostname,username,timestamp)
VALUES(?,?,?,?,?);""",
(_id, name, socket.gethostname(), getpass.getuser(), timestamp()))
_run_id = str(cursor.lastrowid)
# For new runs, the scratch directory is set to:
# <workdir>/<name>-<run_id>
workdir = os.path.join(workdir, '%s-%s' % (name, _run_id))
utils.safe_mkdir(workdir)
os.chdir(workdir)
else:
_run_id = str(run_id)
_diag_file = open(os.path.join(workdir, 'diagnostics.txt'), 'a')
_prog_file = open(os.path.join(workdir, 'programs.txt'), 'a')
return _run_id
def check_init():
	"""
	Aborts if biolite.diagnostics.init() has not been called yet.
	"""
	if _diag_file is None:
		utils.die("diagnostics have not been initialized")
### Cache functions. ###
def merge():
"""
Merges the diagnostics and program caches into the SQLite database.
"""
database.execute("BEGIN")
for entity in _diag_cache:
for attribute, values in _diag_cache[entity].iteritems():
database.execute("""
REPLACE INTO diagnostics
(id,run_id,entity,attribute,value,timestamp)
VALUES(?,?,?,?,?,?);""",
(_id, _run_id, entity, attribute, values[0], values[1]))
for binary, values in _prog_cache.iteritems():
database.execute("""
REPLACE INTO programs (binary,name,version)
VALUES (?,?,?);""", (binary, values[0], values[1]))
database.execute("COMMIT")
def load_cache():
	"""
	Similar to a merge, but loads the local diagnostics and programs files
	(the ones opened by `init`) into in-memory caches instead of the SQLite
	database. Only diagnostics records matching the current run ID are
	cached.
	"""
global _run_id, _diag_file, _diag_cache, _prog_file, _prog_cache
# Close local diagnostics file so we can reopen in read mode.
check_init()
name = _diag_file.name
_diag_file.close()
with open(name, 'r') as f:
for line in f:
# fields
# id,run_id,entity,attribute,value,timestamp
row = map(_unescape, line.rstrip('\n').split('\t', 5))
if row[1] == _run_id:
_diag_cache[row[2]][row[3]] = (row[4], row[5])
# Reopen local diagnostics file in append mode.
_diag_file = open(name, 'a')
# Do the same for the programs file.
name = _prog_file.name
_prog_file.close()
with open(name, 'r') as f:
for line in f:
# fields
# binary_hash,name,version
row = map(_unescape, line.rstrip('\n').split('\t', 2))
_prog_cache[row[0]] = (row[1], row[2])
_prog_file = open(name, 'a')
### Logging functions. ###
def log(attribute, value):
"""
Log an `attribute`/`value` pair in the diagnostics using the currently set
`entity`. The pair is written to the local diagnostics text file and also
into the local in-memory cache.
"""
global _id, _run_id, _diag_file, _diag_cache
ts = timestamp()
check_init()
attribute = str(attribute)
value = str(value)
entity = get_entity()
# Escape tab and newline.
row = map(_escape, (_id, _run_id, entity, attribute, value, ts))
print >> _diag_file, '\t'.join(row)
_diag_file.flush()
# Create a cached copy locally, in case the pipeline wants to lookup a
# value before the merge to the global database happens.
_diag_cache[entity][attribute] = (value, ts)
def log_path(path, log_prefix=None):
"""
Logs a `path` by writing these attributes at the current `entity`, with
an optional prefix for this entry:
1) the full `path` string
2) the full `path` string, converted to an absolute path by os.path.abspath()
3) the `size` of the file/directory at the path (according to `os.stat`)
4) the `access time` of the file/directory at the path (according to `os.stat`)
5) the `modify time` of the file/directory at the path (according to `os.stat`)
6) the `permissions` of the file/directory at the path (according to `os.stat`)
"""
if log_prefix is not None:
prefix.append(log_prefix)
log('path', path)
log('abspath', os.path.abspath(path))
stat = os.stat(path)
log('size', stat.st_size)
log('atime', datetime.datetime.fromtimestamp(stat.st_atime).isoformat())
log('mtime', datetime.datetime.fromtimestamp(stat.st_mtime).isoformat())
log('mode', oct(stat.st_mode))
if log_prefix is not None:
prefix.pop()
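# Illustrative sketch (hypothetical file name): logging a stage's output file
# under an extra 'assembly' prefix records the path, abspath, size, atime,
# mtime and mode attributes at the entity '<pipeline>.<stage>.assembly':
#
#   log_path('assembly.fa', log_prefix='assembly')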
def log_dict(d, prefix=None):
"""
Log a dictionary `d` by calling `log` for each key/value pair.
"""
if prefix is None:
prefix = ''
else:
prefix += '.'
for key, val in d.iteritems():
log(prefix + key, val)
def log_program_version(name, version, path):
"""
Enter the version string and a hash of the binary file at `path` into the
programs table.
"""
chunks = hashlib.md5()
if os.path.exists(path):
# Build hash from 64KB chunks of the binary file.
with open(path, 'rb') as f:
for chunk in iter(lambda: f.read(65536), ''):
chunks.update(chunk)
binary_hash = chunks.hexdigest()
# Store new hashes in the programs cache/file.
if binary_hash not in _prog_cache:
# Escape tab and newline.
row = map(_escape, (binary_hash, name, version))
print >> _prog_file, '\t'.join(row)
_prog_file.flush()
_prog_cache[binary_hash] = (name, version)
return binary_hash
else:
return None
def log_program_output(filename, patterns=None):
"""
Read backwards through a program's output to find any [biolite] markers,
then log their key=value pairs in the diagnostics.
A marker can specify an entity suffix with the form [biolite.suffix].
[biolite.profile] markers are handled specially, since mem= and vmem=
entries need to be accumulated. These are inserted into a program's output
on Linux systems by the preloaded memusage.so library.
You can optionally include a list of additional patterns, specified as
OutputPattern tuples with:
(regular expression string, entity, attribute)
and the first line of program output matching the pattern will be logged
to that entity and attribute name. The value will be the subexpressions
matched by the regular expression, either a single value if there is one
subexpression, or a string of the tuple if there are more.
"""
profile = dict()
lines = list()
with open(filename, 'r') as f:
for line in utils.readlines_reverse(f):
match = _parse_pattern.match(line)
if match:
entity, attr, val = match.groups()
# Stop after encountering a previous timestamp.
if attr == 'timestamp':
break
# Accumulate memory profile values.
if entity == 'profile' and (attr == 'mem' or attr == 'vmem'):
profile[attr] = profile.get(attr, 0) + int(val)
else:
if entity:
prefix.append(entity)
log(attr, val)
if entity:
prefix.pop()
elif patterns:
lines.append(line)
prefix.append('profile')
log_dict(profile)
prefix.pop()
while lines and patterns:
line = lines.pop(0)
for i, pattern in enumerate(patterns):
match = re.match(pattern.re, line)
if match:
if pattern.entity:
prefix.append(pattern.entity)
m = match.groups()
if len(m) == 1:
log(pattern.attr, m[0])
else:
log(pattern.attr, m)
if pattern.entity:
prefix.pop()
patterns.pop(i)
break
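# Illustrative sketch (the log file name and output line are hypothetical):
# capture the read count from a program line such as "12345 reads aligned"
# and log it as the 'aligned' attribute under the 'bowtie' entity suffix:
#
#   pattern = OutputPattern(r"(\d+) reads aligned", "bowtie", "aligned")
#   log_program_output("bowtie.log", patterns=[pattern])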
### Lookup functions. ###
def lookup(run_id, entity):
"""
Returns a dictionary of `attribute`/`value` pairs for the given `run_id` and
`entity` in the SQLite database.
Returns an empty dictionary if no records are found.
"""
stmt = """
SELECT attribute, value
FROM diagnostics
WHERE run_id=? AND entity=?
ORDER BY timestamp;"""
return dict(database.execute(stmt, (run_id, entity)))
def local_lookup(entity):
"""
Similar to `lookup`, but queries the in-memory cache instead of the SQLite
database. This can provide lookups when the local diagnostics text file
has not yet been merged into the SQLite database (for instance, after
restarting a pipeline that never completed, and hence never reached a
diagnostics merge).
Returns an empty dictionary if no records are found.
"""
d = _diag_cache.get(entity, dict())
# Return a new dict with only the value field (not the timestamp).
return dict([(k, v[0]) for k, v in d.iteritems()])
def lookup_like(run_id, entity):
	"""
	Similar to `lookup`, but allows wildcards in the entity name (either the
	SQL '%' wildcard or the more familiar UNIX '*' wildcard).
	Returns a dictionary of dictionaries keyed on [`entity`][`attribute`].
	"""
stmt = """
SELECT entity, attribute, value
FROM diagnostics
WHERE run_id=? AND entity LIKE ?
ORDER BY timestamp;"""
values = OrderedDict()
for row in database.execute(stmt, (run_id, entity.replace('*', '%'))):
try:
values[row[0]][row[1]] = row[2]
except KeyError:
values[row[0]] = dict([(row[1], row[2])])
return values
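# Illustrative sketch (run ID, pipeline and attribute names are hypothetical):
# fetch every attribute logged by any stage of an 'assemble' pipeline in run
# 42, keyed as [entity][attribute]:
#
#   values = lookup_like(42, 'assemble.*')
#   n_reads = values.get('assemble.filter_reads', {}).get('n_reads')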
def lookup_by_id(id, entity):
	"""
	Similar to `lookup`, but returns `attribute`/`value` pairs for the given
	catalog `id` (defaulting to the current catalog ID) and `entity`,
	ordered by run ID so that values from the most recent run win.
	"""
	if id is None:
		id = _id
stmt = """
SELECT attribute, value
FROM diagnostics
WHERE id=? AND entity=?
ORDER BY run_id;"""
return dict(database.execute(stmt, (id, entity)))
def lookup_entities(run_id):
	"""
	Returns the frozenset of entities logged in the diagnostics for the
	given `run_id`.
	"""
	stmt = "SELECT entity FROM diagnostics WHERE run_id=?;"
	return frozenset([row[0] for row in database.execute(stmt, (run_id,))])
def lookup_pipelines(run_id):
	"""
	Returns the frozenset of pipeline names (the leading component of each
	entity) logged in the diagnostics for the given `run_id`.
	"""
	stmt = "SELECT entity FROM diagnostics WHERE run_id=?;"
	rows = database.execute(stmt, (run_id,))
	return frozenset([row[0].partition('.')[0] for row in rows])
def lookup_run(run_id):
	"""
	Returns the :ref:`runs-table` row for the given `run_id` as an
	AttributeDict.
	"""
	stmt = "SELECT * FROM runs WHERE run_id=?;"
	row = database.execute(stmt, (run_id,)).fetchone()
	return utils.AttributeDict(zip(run_fields, row))
def lookup_runs(id=None, name=None, order='ASC'):
	"""
	Yields :ref:`runs-table` rows as AttributeDicts, optionally filtered by
	catalog `id` and run `name`, ordered by run ID.
	"""
if id and name:
where = "WHERE id=? AND name=?"
args = (id, name)
elif id:
where = "WHERE id=?"
args = (id,)
else:
where = ""
args = ()
stmt = "SELECT * FROM runs %s ORDER BY run_id %s" % (where, order)
for row in database.execute(stmt, args):
yield utils.AttributeDict(zip(run_fields, row))
def lookup_prev_run(id, previous):
	"""
	Looks up the exit diagnostics of a previous run. If `previous` parses as
	an integer, it is treated as a specific run ID to look up. Otherwise
	(for example, the wildcard '*' accepted by a pipeline's --previous/-p
	'RUN_SPEC' argument), the most recent exit diagnostics found for the
	given catalog `id` are used.
	"""
try:
prev_id = int(previous)
values = lookup(prev_id, EXIT)
except ValueError:
# It is a string.
values = lookup_by_id(id, EXIT)
return values
def dump(run_id):
	"""
	Prints all diagnostics records for the given `run_id`, with fields
	separated by '|'.
	"""
stmt = """
SELECT entity, attribute, value, timestamp
FROM diagnostics
WHERE run_id=?
ORDER BY timestamp;"""
for row in database.execute(stmt, (run_id,)):
print '|'.join([str(x) for x in row])
def dump_by_id(id):
stmt = "SELECT * FROM diagnostics WHERE id=? ORDER BY timestamp;"
for row in database.execute(stmt, (id,)):
print '|'.join([str(x) for x in row[1:]])
def dump_all():
stmt = "SELECT * FROM diagnostics ORDER BY timestamp;"
for row in database.execute(stmt):
print '|'.join([str(x) for x in row])
def list_runs(name=None, id=None, hidden=False):
where = ["hidden=?"]
args = [int(hidden)]
if name:
where.append("name=?")
args.append(name)
if id:
where.append("id=?")
args.append(id)
stmt = "SELECT * FROM runs WHERE %s ORDER BY run_id;" % ' AND '.join(where)
for row in database.execute(stmt, tuple(args)):
print '|'.join([str(x) for x in row])
def hide_run(run_id):
stmt = "UPDATE runs SET hidden=? WHERE run_id=?;"
database.execute(stmt, (1, run_id))
def unhide_run(run_id):
stmt = "UPDATE runs SET hidden=? WHERE run_id=?;"
database.execute(stmt, (0, run_id))
def list_programs():
for row in database.execute("SELECT * FROM programs;"):
sep = '=' * 80
print "%s|%s\n%s\n%s\n" % (row[0], row[1], sep, row[2])
### Exit profiling, to capture resource usage for failed runs. ###
def exit_profiler(start):
	"""
	Captures the script's resource usage after a run ends, or from an exit
	handler if the script fails partway through.
	"""
global _profiled, prefix
if not _profiled:
walltime = time.time() - start
rchild = resource.getrusage(resource.RUSAGE_CHILDREN)
rself = resource.getrusage(resource.RUSAGE_SELF)
		# Check if there is an existing profile (e.g., if this is a restart).
profile = local_lookup('%s.profile' % EXIT)
walltime += float(profile.get('walltime', 0.0))
usertime = rchild.ru_utime + rself.ru_utime \
+ float(profile.get('usertime', 0))
systime = rchild.ru_stime + rself.ru_stime \
+ float(profile.get('systime', 0))
vmem = None
if config.kernel == 'Darwin':
mem = (rchild.ru_maxrss + rself.ru_maxrss) / 1024
mem = max(mem, int(profile.get('mem', 0)))
else:
mem, vmem = utils.memusage()
mem = max(mem, int(profile.get('mem', 0)))
vmem = max(vmem, int(profile.get('vmem', 0)))
# Log profile.
prefix = [EXIT, 'profile']
log('walltime', walltime)
log('usertime', usertime)
log('systime', systime)
log('mem', mem)
if vmem:
log('vmem', vmem)
_profiled = True
def register_exit_profiler(start):
	"""
	Registers `exit_profiler` with atexit, so that resource usage is logged
	even if the script exits prematurely.
	"""
	atexit.register(exit_profiler, start)
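# Illustrative sketch of how a script might use the exit profiler (the catalog
# ID and run name are made up; the pipeline classes normally do this
# automatically):
#
#   start = time.time()
#   diagnostics.init('SRX0000001', 'assemble')
#   diagnostics.register_exit_profiler(start)
#   ...run stages, logging values along the way...
#   diagnostics.exit_profiler(start)
#   diagnostics.merge()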