# BioLite - Tools for processing gene sequence data and automating workflows
# Copyright (c) 2012-2013 Brown University. All rights reserved.
#
# This file is part of BioLite.
#
# BioLite is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# BioLite is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with BioLite. If not, see <http://www.gnu.org/licenses/>.
"""
Utility functions used by other BioLite modules.
"""
import fcntl
import io
import inspect
import hashlib
import os
import re
import resource
import subprocess
import sys
import time
import zipfile
from itertools import groupby
def die(*messages):
    """
    Prints the calling module and function, followed by the error `messages`
    joined with single spaces, to stderr, then aborts with exit status 1.
    """
    # get_caller_info() has to be called directly from this function: its
    # default depth resolves to the frame that called die().
    prefix = "%s.%s: " % get_caller_info()
    # Emit the whole line in one write so that output from concurrent
    # processes sharing stderr is less likely to interleave mid-line.
    sys.stderr.write(prefix + ' '.join(map(str, messages)) + '\n')
    sys.exit(1)
def info(*messages):
    """
    Prints the calling module and function, followed by the `messages`
    joined with single spaces, to stderr.
    """
    # get_caller_info() has to be called directly from this function: its
    # default depth resolves to the frame that called info().
    prefix = "%s.%s: " % get_caller_info()
    # Emit the whole line in one write so that output from concurrent
    # processes sharing stderr is less likely to interleave mid-line.
    sys.stderr.write(prefix + ' '.join(map(str, messages)) + '\n')
def table(rows, convert=True):
    """
    Outputs the given `rows` as tabulated strings, similar to the output of the
    `column -t` UNIX command.

    The input `rows` variable is a list of lists, where the sublists all have
    the same length and contain the cells of the table. The output is a
    generator that yields one tabulated string for each sublist (row).

    If `convert` is true (the default), every cell is passed through `str`
    first; otherwise the cells must already be strings. The input `rows` is
    never modified. Nothing is yielded for an empty `rows`.
    """
    if not rows:
        return
    # Convert cells to strings in a fresh list, leaving the caller's data
    # untouched.
    if convert:
        cells = [tuple(str(cell) for cell in row) for row in rows]
    else:
        cells = rows
    # Find the max width of each column; the first row fixes the column count.
    widths = [0] * len(cells[0])
    for row in cells:
        for i, cell in enumerate(row):
            widths[i] = max(widths[i], len(cell))
    # Use the widths to create a left-aligned format string.
    fmt = ' '.join('{:<%d}' % width for width in widths)
    # Output the formatted rows.
    for row in cells:
        yield fmt.format(*row)
def safe_mkdir(path):
    """
    Creates the directory, including any missing parent directories, at the
    specified `path`.

    Aborts if the path points to an existing regular file, or if the
    directory cannot be created.

    Returns the absolute path of the directory.
    """
    if os.path.isfile(path):
        # The original message mixed a '{0}' placeholder with the '%'
        # operator, which raised a TypeError instead of reporting the error.
        die("'%s' is a regular file: can't overwrite" % path)
    elif os.path.isdir(path):
        info("directory '%s' already exists" % path)
    else:
        info("creating directory '%s'" % path)
        try:
            os.makedirs(path)
        except OSError as e:
            # Another process may have created the directory between the
            # isdir() check and makedirs(); only abort if it is still missing.
            if not os.path.isdir(path):
                die("failed to recursively create the directory\n"
                    "%s\n"
                    "%s\n"
                    "Do you have write permision to that path?\n"
                    "Or does part of that path already exist as a regular file?"
                    % (path, e))
    return os.path.abspath(path)
def safe_remove(path):
    """
    Removes the file at the given `path` only if it exists and is a regular
    file (or a link to one). Directories and missing paths are left alone.
    """
    if os.path.isfile(path):
        os.remove(path)
def truncate_file(path):
    """
    Truncates a file (i.e. overwrites with 0 bytes) at the given `path`.
    The file is created if it does not exist yet.
    """
    # Opening in 'w' mode truncates; the context manager guarantees the
    # handle is closed.
    with open(path, 'w'):
        pass
def rusage_diff(r1, r2):
    """
    Returns an rusage object where each field is the difference of the
    corresponding fields in `r1` and `r2` (i.e. `r1 - r2`).
    """
    # struct_rusage iterates over its fields in order, so a pairwise
    # subtraction yields a sequence it can be rebuilt from.
    deltas = [field1 - field2 for field1, field2 in zip(r1, r2)]
    return resource.struct_rusage(deltas)
def failed_executable(executable, e):
    """
    Diagnoses why a wrapped executable failed to execute, and prints an
    intelligible error message for the user before aborting.

    `executable` is the name of the command, and `e` is the exception (with
    an `errno` attribute) that was raised when trying to run it.
    """
    # Imported locally because this is the only place the module needs it.
    import errno
    if e.errno == errno.ENOENT:
        die("command not found: '%s'" % executable)
    elif e.errno == errno.ENOEXEC:
        die("execetuable format error '{0}'\n"
            "Does '{0}' have the correct architecture (eg. 32-bit vs 64-bit)\n"
            "for the machine you are trying to run it on?".format(executable))
    elif e.errno == errno.EACCES:
        die("permission denied for command '%s'\n"
            "Do you need to run 'chmod 755' on it?" % executable)
    else:
        die("unknown error when executing '%s':\n%s" % (executable, e))
def safe_call(*args, **kwargs):
    """
    Calls an executable as a subprocess and checks the return value.

    All `args` and `kwargs` are passed to a `subprocess.Popen` call, except for
    the special keyword `return_ok`, whose value is used to check the return
    value of the subprocess. By default, this is zero and any non-zero return
    is considered an error. To disable this check, set `return_ok` to
    `None`.

    Aborts if the executable cannot be started or if the return code does
    not match `return_ok`.

    Returns a 3-tuple with the return code, the elapsed walltime, and an
    rusage structure with the elapsed usertime and systime.
    """
    return_ok = kwargs.pop('return_ok', 0)
    # The command is either the first positional argument or the `args`
    # keyword of Popen; it may be a single string or a sequence.
    command = args[0] if args else kwargs.get('args', '')
    if isinstance(command, (str, bytes, type(u''))):
        command_str = command
        executable = command.split()[0] if command.split() else command
    else:
        command_str = ' '.join(map(str, command))
        executable = command[0] if command else ''
    rusage_start = resource.getrusage(resource.RUSAGE_CHILDREN)
    start = time.time()
    try:
        p = subprocess.Popen(*args, **kwargs)
        p.wait()
    except OSError as e:
        # failed_executable() aborts, so `p` is always bound below.
        failed_executable(executable, e)
    walltime = time.time() - start
    rusage_end = resource.getrusage(resource.RUSAGE_CHILDREN)
    retcode = p.returncode
    if (return_ok is not None) and (retcode != return_ok):
        # Give some context to the non-zero return, if stderr was redirected
        # to a named file. `stderr` may also be subprocess.PIPE, an integer
        # descriptor or None, none of which has a usable file name.
        stderr_name = getattr(kwargs.get('stderr'), 'name', None)
        if (isinstance(stderr_name, (str, type(u'')))
                and os.path.isfile(stderr_name)):
            subprocess.call(['tail', '-3', stderr_name])
        die("non-zero return (%d) from command:\n%s" % (retcode, command_str))
    return retcode, walltime, rusage_diff(rusage_end, rusage_start)
def safe_str(s):
    """
    Returns the string `s` with only alpha-numerical characters and the special
    characters :samp:`()[]{}|:.-_` preserved. All other characters are replaced
    by :samp:`_`.

    `s` is converted with `str` first, so non-string values are accepted.
    """
    valid_chars = frozenset(
        r"()[]{}|:.-_"
        "abcdefghijklmnopqrstuvwxyz"
        "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
        "0123456789")
    return ''.join(c if c in valid_chars else '_' for c in str(s))
def timestamp():
    """
    Returns the current local time in :samp:`YYYY-MM-DD HH:MM:SS` format.
    """
    return time.strftime("%Y-%m-%d %H:%M:%S")
def safe_print(f, line):
    """
    Places an exclusive lock around the file object `f` and writes `line` to
    the end of it as an atomic write operation.

    A line return is appended after `line`. The file must be open for
    writing.
    """
    fcntl.lockf(f, fcntl.LOCK_EX)
    try:
        # Another writer may have appended since our last write, so move to
        # the current end of file while holding the lock.
        f.seek(0, io.SEEK_END)
        f.write(line + '\n')
        # Push the buffered data to the file before giving up the lock;
        # otherwise it could be written after another process has locked it.
        f.flush()
    finally:
        # Always release the lock, even if the write failed.
        fcntl.lockf(f, fcntl.LOCK_UN)
def readlines_reverse(f):
    """
    Seeks to the end of the file object `f` and yields lines in reverse order.

    Lines are yielded without their trailing newline, as the same type that
    `f.read` returns (bytes for binary files, text otherwise). Empty lines
    are skipped, except that the first line of the file is always yielded
    last, even when it is empty (an empty file yields a single empty string).

    The file is read backwards in blocks using absolute offsets, so `f` must
    support `seek` and `tell`.
    NOTE(review): for text-mode files under Python 3 the offsets are treated
    as character counts, which presumably only holds for single-byte
    encodings; open the file in binary mode otherwise.
    """
    # Seek to end of file and remember where it is.
    f.seek(0, 2)
    pos = f.tell()
    blocksize = 32 * 1024
    # Leading fragment of the most recently processed block: it is the tail
    # of a line whose beginning lies in the block before it.
    carry = None
    while pos > 0:
        start = max(0, pos - blocksize)
        f.seek(start)
        block = f.read(pos - start)
        # Leave the file positioned at the start of the block just read.
        f.seek(start)
        pos = start
        newline = b'\n' if isinstance(block, bytes) else '\n'
        rows = block.split(newline)
        if carry is not None:
            # Complete the line that was cut by the block boundary.
            rows[-1] += carry
        while rows:
            carry = rows.pop()
            # The first row of a block may be incomplete, so it is carried
            # over instead of yielded; empty rows are skipped.
            if rows and carry:
                yield carry
    # Whatever is left is the first line of the file.
    yield '' if carry is None else carry
def cat_to_file(input_path, output_path, mode='a', start=0):
    """
    Uses the :command:`cat` or :command:`awk` command to copy the contents
    at `input_path` to `output_path`, appending to `output_path` by default.

    If `start` is greater than zero, the first `start` lines of `input_path`
    are skipped (using :command:`awk`); otherwise the whole file is copied
    (using :command:`cat`).

    Returns the return code of the command.
    """
    if start > 0:
        command = ['awk', 'NR>%d' % start, input_path]
    else:
        command = ['cat', input_path]
    with open(output_path, mode) as f:
        return subprocess.call(command, stdout=f)
def head(path, n=1):
    """
    Returns the first `n` lines of `path`, as captured from the output of the
    :command:`head` command.

    Raises `subprocess.CalledProcessError` if the command fails.
    """
    return subprocess.check_output(['head', '-n', str(n), path])
def head_to_file(input_path, output_path, n=1, mode='w'):
    """
    Uses the :command:`head` command to copy the first `n` lines of
    `input_path` to `output_path`, overwriting the contents of `output_path`
    by default.

    Returns the return code of the command.
    """
    with open(output_path, mode) as f:
        return subprocess.call(['head', '-n', str(n), input_path], stdout=f)
def tail(path, n=1):
    """
    Returns the last `n` lines of `path`, as captured from the output of the
    :command:`tail` command.

    Raises `subprocess.CalledProcessError` if the command fails.
    """
    return subprocess.check_output(['tail', '-n', str(n), path])
def tail_to_file(input_path, output_path, n=1, mode='w'):
    """
    Uses the :command:`tail` command to copy the last `n` lines of
    `input_path` to `output_path`, overwriting the contents of `output_path`
    by default.

    Returns the return code of the command.
    """
    with open(output_path, mode) as f:
        return subprocess.call(['tail', '-n', str(n), input_path], stdout=f)
def get_caller_info(depth=2):
    """
    Uses the inspect module to determine the name of the function at stack
    index `depth`, and the module it belongs to.

    Index 0 is this function itself and index 1 is its caller, so the default
    `depth` of 2 identifies the function that called this function's caller.

    Returns a 2-tuple with the module name and the function name. The module
    name is '<unknown>' if the module cannot be determined.
    """
    try:
        # A context of 0 lines skips reading source files, which is not
        # needed to obtain the function name.
        frame = inspect.stack(0)[depth]
    except IndexError:
        die("could not access the caller's frame at stack index %d" % depth)
    func = frame[3]
    module = inspect.getmodule(frame[0])
    module_name = module.__name__ if module else '<unknown>'
    return (module_name, func)
def get_caller_locals(depth=2):
    """
    Uses the inspect module to return a dictionary of the local variables in
    the frame at stack index `depth`.

    Index 0 is this function itself and index 1 is its caller, so the default
    `depth` of 2 corresponds to the frame that called this function's caller.
    """
    try:
        # A context of 0 lines skips reading source files, which is not
        # needed to access the frame object.
        frame = inspect.stack(0)[depth]
    except IndexError:
        die("could not access the caller's frame at stack index %d" % depth)
    return frame[0].f_locals
class AttributeDict(dict):
    """
    A mutable alternative to namedtuple that supports accessing values as
    attributes or with the dict [] operator.
    """

    def __init__(self, *args, **kwargs):
        super(AttributeDict, self).__init__(*args, **kwargs)
        # Stored as a real instance attribute (not a dict item), because
        # __setattr__ cannot find '_initialized' yet at this point. From now
        # on, attribute assignment stores items in the dict.
        self._initialized = True

    def __getattr__(self, name):
        # Only called when normal attribute lookup fails: fall back to the
        # dict items, and report missing keys as missing attributes.
        try:
            return self[name]
        except KeyError:
            raise AttributeError(name)

    def __setattr__(self, name, value):
        if hasattr(self, '_initialized'):
            # Initialization is complete: attributes are dict items.
            super(AttributeDict, self).__setitem__(name, value)
        else:
            # Still inside __init__: set a regular instance attribute.
            super(AttributeDict, self).__setattr__(name, value)
def sorted_alphanum(l):
    """
    Sorts a list of strings `l` and returns a new list with the elements in
    alpha-numerical order (i.e. runs of digits within the strings are ordered
    by numerical value, so that 'a2' comes before 'a10').
    """
    def alphanum_key(key):
        # Splitting on a capturing group alternates text and digit runs:
        # even positions hold text (possibly empty), odd positions hold
        # ASCII digits. Converting by position avoids calling int() on text
        # that merely looks numeric to str.isdigit().
        parts = re.split('([0-9]+)', key)
        return [int(part) if i % 2 else part for i, part in enumerate(parts)]

    return sorted(l, key=alphanum_key)
def memusage():
    """
    Reads the current memory usage for this process from /proc/self/status
    and returns two integer values `mem` and `vmem` which correspond to the
    VmHWM (max physical memory) and VmPeak (max virtual memory) fields.

    A value is 0 if its field is not present in the status file.

    *Note*: only works on Linux.
    """
    mem = 0
    vmem = 0
    with open('/proc/self/status', 'r') as f:
        for line in f:
            # Each line looks like "<field>: <value> <unit>"; the second
            # whitespace-separated token is the numeric value.
            if line.startswith('VmPeak'):
                vmem = int(line.split()[1])
            elif line.startswith('VmHWM'):
                mem = int(line.split()[1])
    return mem, vmem
def which(executable):
    """
    Returns the full path to `executable` by searching through all entries in the
    $PATH environment variable, and looking for an executable file with that
    name.

    If `executable` contains a directory component and points to an
    executable file, it is returned unchanged; otherwise its file name is
    searched for in $PATH.

    Returns `None` if the executable is not found.
    """
    def is_executable(candidate):
        # Directories and files without execute permission don't count.
        return os.path.isfile(candidate) and os.access(candidate, os.X_OK)

    fpath, fname = os.path.split(executable)
    if fpath and is_executable(executable):
        return executable
    # Fall back to the system default search path if $PATH is not set.
    for path in os.environ.get("PATH", os.defpath).split(os.pathsep):
        candidate = os.path.join(path, fname)
        if is_executable(candidate):
            return candidate
    return None
def basename(path):
    """
    Finds the base filename of the path, then the base of the filename
    (everything before the last .extension).
    """
    filename = os.path.basename(path)
    return os.path.splitext(filename)[0]
def zipdir(dirname):
    """
    Recursively zips all files in `dirname` into a zip archive with the name
    `dirname.zip`, i.e. located next to the directory itself (which is the
    current working directory when `dirname` is a plain directory name).

    Files are stored under the same relative path they were found at,
    starting with `dirname`.
    """
    # Strip trailing separators so that 'dir/' produces 'dir.zip' instead of
    # a '.zip' file inside the directory that is being archived.
    archive = (dirname.rstrip(os.sep) or dirname) + '.zip'
    with zipfile.ZipFile(archive, 'w') as zf:
        for root, _, files in os.walk(dirname):
            for name in files:
                zf.write(os.path.join(root, name))
def number_range(numbers):
    """
    Collapses a list of integers into a list of range strings: runs of
    consecutive numbers become 'first-last', and isolated numbers are kept as
    they are. For example, [1, 2, 3, 5] becomes ['1-3', '5'].

    The input does not need to be sorted, and duplicates are ignored.
    """
    ranges = []
    # Within a run of consecutive numbers, the difference between a value and
    # its position in the sorted sequence is constant, so grouping by that
    # difference isolates the runs. Duplicates would break this property and
    # are removed first.
    ordered = sorted(set(numbers))
    for _, run in groupby(enumerate(ordered), lambda pair: pair[1] - pair[0]):
        values = [value for _, value in run]
        if len(values) == 1:
            ranges.append(str(values[0]))
        else:
            ranges.append("%s-%s" % (values[0], values[-1]))
    return ranges
def bytes_to_gb(b):
    """
    Returns a string representing the given number of bytes as GB (2**30
    bytes).

    Values below 1 GB are formatted with six decimal places, and all other
    values with one decimal place.
    """
    gb = float(b) / (2 ** 30)
    # Small values need more digits to be distinguishable from zero.
    fmt = '%f' if gb < 1.0 else '%.1f'
    return fmt % gb
def mem_to_mb(mem):
    """
    Converts a memory string, like 2G or 100mb, to an integer number of
    megabytes.

    The recognized suffixes are G, GB, M and MB, in any combination of upper
    and lower case. Aborts if the suffix is not recognized or if the part
    before it is not an integer.
    """
    lowered = mem.lower()
    # Two-letter suffixes come first so that they take precedence over the
    # one-letter forms.
    for suffix, factor in (('gb', 1024), ('mb', 1), ('g', 1024), ('m', 1)):
        if lowered.endswith(suffix):
            number = mem[:-len(suffix)]
            break
    else:
        die("unrecognized memory value '%s'" % mem)
    try:
        return int(number) * factor
    except ValueError:
        die("can't convert memory value '%s' to an integer" % number)
def md5sum(path):
    """
    Uses hashlib.md5() to calculate the MD5 hash of a file at `path`.

    Returns the hash as a string of hexadecimal digits.
    """
    digest = hashlib.md5()
    # Build hash from 64KB chunks of the binary file. The sentinel has to be
    # a bytes object: a file opened in binary mode returns b'' at the end,
    # which never compares equal to '' under Python 3.
    with open(path, 'rb') as f:
        for chunk in iter(lambda: f.read(65536), b''):
            digest.update(chunk)
    return digest.hexdigest()
def human_readable_size(kb, prec):
    """
    Returns an integer number of kilobytes as a string with the closest
    matching size of KB, MB, GB, or TB with `prec` number of decimal digits.

    The unit is chosen by the number of characters in `str(kb)`: up to 3
    gives KB (printed unchanged), up to 6 gives MB, up to 9 gives GB, and
    anything longer gives TB.

    Returns '-' if `kb` cannot be converted to a number.
    """
    try:
        fkb = float(kb)
    except (TypeError, ValueError):
        # TypeError covers values such as None that float() rejects outright.
        return '-'
    digits = len(str(kb))
    if digits <= 3:
        return '%s KB' % str(kb)
    if digits <= 6:
        return '%.*f MB' % (prec, fkb / 1024.0)
    if digits <= 9:
        return '%.*f GB' % (prec, fkb / 1048576.0)
    return '%.*f TB' % (prec, fkb / 1073741824.0)