Commits

drewthepooh committed 07edd4b

Added script to calculate per-base cov across a region of a BAM

  • Participants
  • Parent commits a5e1896

Comments (0)

Files changed (4)

+import pysam
+import csv
+import concurrent.futures
+from multiprocessing import cpu_count
+import os
+
+'''
+Calculate coverage per-base across a region of a BAM file.
+Run this script inside a directory containing the BAM files you want coverage
+calculations for.
+Output files will be named the same as the BAM files, but with .csv at the end.
+(therefore you must name the input BAM files something unique and descriptive!).
+Input BAM files must end in .bam
+
+This script requires python3 with the pysam module installed
+'''
+
+
+def get_coverage(input_bam):
+
+    print('Calculating coverage for:', input_bam)
+
+    output_f = input_bam + '.csv'
+    with open(output_f, 'w', newline='') as fout:
+        writer = csv.writer(fout, delimiter='\t', lineterminator='\n')
+        writer.writerow(['chrom', 'position', 'coverage'])
+
+        sam = pysam.Samfile(input_bam, 'rb')
+
+        chrom = '3L'
+        start_pos = 7346677  # I am assuming Chuan's coordinates are 1-based
+        end_pos = 7357465
+
+        for pileupcolumn in sam.pileup(reference=chrom, start=start_pos, end=end_pos):
+            # Have to add this due to what seems to be a pysam bug
+            if start_pos <= pileupcolumn.pos <= end_pos:
+                writer.writerow(['2L', (pileupcolumn.pos + 1), pileupcolumn.n])
+
+        sam.close()
+
+if __name__ == '__main__':
+    bam_files = [f for f in os.listdir('.') if f.endswith('.bam')]
+    with concurrent.futures.ProcessPoolExecutor(max_workers=cpu_count()) as executor:
+        executor.map(get_coverage, bam_files)

flagstats2csv.py

-import csv
-import subprocess
-from os.path import basename
-from multiprocessing import Pool
-import argparse
-import sys
-
-
-def get_flagstats(file_path):
-
-    print('Calculating stats for', basename(file_path), file=sys.stderr)
-    flagstat_bytes = subprocess.check_output(['samtools', 'flagstat', file_path])
-    flagstat_output = flagstat_bytes.decode('UTF-8')
-    flagstats_raw = flagstat_output.split('\n')[:-1]
-
-    flagstats_nums = [stat[0] for stat in
-                      (flagstat_str.split(' ') for flagstat_str in flagstats_raw)]
-    flagstats_nums.insert(0, basename(file_path))
-    return flagstats_nums
-
-
-def flagstats_calculator(*file_paths):
-
-    writer = csv.writer(sys.stdout, delimiter='\t', lineterminator='\n')
-    stats = ['sample',
-             'total',
-             'duplicates',
-             'mapped',
-             'paired',
-             'read1',
-             'read2',
-             'properly paired',
-             'with itself and mate mapped',
-             'singletons',
-             'with mate mapped to a different chr',
-             'with mate mapped to a different chr (mapQ>=5)']
-    writer.writerow(stats)
-
-    if __name__ == '__main__':
-        pool = Pool()
-        rows = pool.map(func=get_flagstats, iterable=file_paths)
-        pool.close()
-        pool.join()
-        writer.writerows(rows)
-
-
-def main():
-    helpText = ('Takes a number of bam files (can be input with file name globbing), '
-                'calculates flagstats in parallel, and outputs to stdout.  Capture the '
-                'output to generate a csv file. Note that due to the nature of the '
-                'multiprocessing, you may have to hit ctrl-C twice if an error is raised.')
-    parser = argparse.ArgumentParser(description=helpText)
-    parser.add_argument('input_BAM',
-                        help='BAM file for analysis',
-                        nargs='+')
-    args = parser.parse_args()
-
-    flagstats_calculator(*args.input_BAM)
-
-if __name__ == '__main__':
-    main()

flagstats2csv.py3

+import csv
+import subprocess
+from os.path import basename
+from multiprocessing import Pool
+import argparse
+import sys
+
+
+def get_flagstats(file_path):
+
+    print('Calculating stats for', basename(file_path), file=sys.stderr)
+    flagstat_bytes = subprocess.check_output(['samtools', 'flagstat', file_path])
+    flagstat_output = flagstat_bytes.decode('UTF-8')
+    flagstats_raw = flagstat_output.split('\n')[:-1]
+
+    flagstats_nums = [stat[0] for stat in
+                      (flagstat_str.split(' ') for flagstat_str in flagstats_raw)]
+    flagstats_nums.insert(0, basename(file_path))
+    return flagstats_nums
+
+
+def flagstats_calculator(*file_paths):
+
+    writer = csv.writer(sys.stdout, delimiter='\t', lineterminator='\n')
+    stats = ['sample',
+             'total',
+             'duplicates',
+             'mapped',
+             'paired',
+             'read1',
+             'read2',
+             'properly paired',
+             'with itself and mate mapped',
+             'singletons',
+             'with mate mapped to a different chr',
+             'with mate mapped to a different chr (mapQ>=5)']
+    writer.writerow(stats)
+
+    if __name__ == '__main__':
+        pool = Pool()
+        rows = pool.map(func=get_flagstats, iterable=file_paths)
+        pool.close()
+        writer.writerows(rows)
+
+
+def main():
+    helpText = ('Takes a number of bam files (can be input with file name globbing), '
+                'calculates flagstats in parallel, and outputs to stdout.  Capture the '
+                'output to generate a csv file. Note that due to the nature of the '
+                'multiprocessing, you may have to hit ctrl-C twice if an error is raised.')
+    parser = argparse.ArgumentParser(description=helpText)
+    parser.add_argument('input_BAM',
+                        help='BAM file for analysis',
+                        nargs='+')
+    args = parser.parse_args()
+
+    flagstats_calculator(*args.input_BAM)
+
+if __name__ == '__main__':
+    main()

pipe_classes.py

-import logging
-import pprint
-
-logging.basicConfig(level=logging.DEBUG)
-
-
-class ListView(object):
-
-    def __init__(self, lst):
-        self._list = lst
-
-    def getView(self):
-        if len(self._list) <= 4:
-            return str(self._list)
-        else:
-            return '[{}, {} (...) {} {}]'.format(self._list[0], self._list[1], self._list[-2], self._list[-1])
-
-
-class Stage(object):
-
-    def __init__(self, func):
-        self._func = func
-
-    def run(self):
-        logging.info('Running ' + self._func.__name__)
-        self.func()
-        logging.info('Finished' + self._func.__name__)
-
-
-class ChildStage(object):
-
-    def __init__(self, parameters):
-        self.parameters = parameters
-        self._parameters_view = ListView(self.parameters).getView()
-
-    def run(self):
-        pretty_command = pprint.pformat(self.parameters, indent=4)
-        logging.debug('running command:\n{}'.format(pretty_command))
-        logging.info('Running subprocess with parameters: ' + self._parameters_view)
-
-a = ChildStage(['a', 'b', 'c', 'd', 'e', 'f'])
-a.run()
-b = ChildStage(['a', 'b'])
-b.run()