Source

utils / extract_sm.py

Full commit
#!/usr/bin/python

import sys
import os
import pysam
import getopt


bam_file = None
samples = []
extract_by_rg = False
extract_prefix = "extract_"
list_samples = False

(opts, args) = getopt.getopt(sys.argv[1:], 'f:rp:l')
for o, a in opts:
  if o == '-f':
    bam_file = a
  if o == '-r':
    extract_by_rg = True  
  if o == '-p':
    extract_prefix = a
  if o == '-l':
    list_samples = True  

if not bam_file or not os.path.exists(bam_file):
  sys.stderr.write("Please, provide a BAM file\n")
  sys.exit(1)

samples = args
tokeep = [] 
rg2sm = {}
sample2fo = {}

bamh = pysam.Samfile(bam_file, 'rb')

if list_samples:
  for rg in bamh.header['RG']:
    sys.stdout.write("Sample: %s\tRead Group: %s\n" % (rg['SM'], rg['ID']))
  sys.exit(0)
  
for rg in bamh.header['RG']:
  if extract_by_rg:
    if len(samples)==0 or rg['ID'] in samples:
      tokeep.append(rg['ID'])
      rg2sm[rg['ID']] = rg['ID']
  else:
    if len(samples)==0 or rg['SM'] in samples:
      tokeep.append(rg['SM'])
      rg2sm[rg['ID']] = rg['SM']

if not tokeep:
  sys.stderr.write("No samples matched, exiting\n")
  sys.exit(1)

for sm in set(tokeep):
  fname = extract_prefix + sm + '.bam'
  header = bamh.header
  if extract_by_rg:
    rh = [x for x in header['RG'] if x['RG'] == sm]
  else:
    rh = [x for x in header['RG'] if x['SM'] == sm]
  header['RG'] = rh
  if not os.path.exists(fname):
    sys.stderr.write("Opening %s\n" % fname)
    sample2fo[sm] = pysam.Samfile(fname, 'wb', header=header)
  else:
    sys.stderr.write("File %s already exists, couldn't proceed\n" % fname)
    sys.exit(1)  
  
  

for r in bamh.fetch():
  rg = [x[1] for x in r.tags if x[0] == 'RG'][0]
  try:
    sm = rg2sm[rg]
    sample2fo[sm].write(r)
  except KeyError:
    continue  
  
for sm in sample2fo.keys():
  sample2fo[sm].close()