Source

utils / extract_sm.py

Full commit
#!/usr/bin/python

import sys
import os
import pysam
import getopt


bam_file = None
samples = []
extract_by_rg = False
extract_prefix = "extract_"
list_samples = False

(opts, args) = getopt.getopt(sys.argv[1:], 'f:rp:l')
for o, a in opts:
  if o == '-f':
    bam_file = a
  if o == '-r':
    extract_by_rg = True  
  if o == '-p':
    extract_prefix = a
  if o == '-l':
    list_samples = True  

if not bam_file or not os.path.exists(bam_file):
  sys.stderr.write("Please, provide a BAM file\n")
  sys.exit(1)

samples = args
tokeep = {} #a dictionary files -> list of read groups 
rg2sm = {}
sample2fo = {}

bamh = pysam.Samfile(bam_file, 'rb')

if list_samples:
  for rg in bamh.header['RG']:
    sys.stdout.write("Sample: %s\tRead Group: %s\n" % (rg['SM'], rg['ID']))
  sys.exit(0)
  
for rg in bamh.header['RG']:
  if len(samples) == 0:
    # create one file for each
    if extract_by_rg:
      tokeep[rg['ID']] = [rg['ID']]
      rg2sm[rg['ID']] = rg['ID']
    else:
      try:
        tokeep[rg['SM']].append(rg['ID'])
        rg2sm[rg['ID']] = rg['SM']
      except KeyError:  
        tokeep[rg['SM']] = [rg['ID']]
        rg2sm[rg['ID']] = rg['SM']
  else:
    if extract_by_rg:
      if rg['ID'] in samples:
        tokeep[rg['ID']] = [rg['ID']]
        rg2sm[rg['ID']] = rg['ID']
    else:
      if rg['SM'] in samples:
        try:
          tokeep[rg['SM']].append(rg['ID'])
          rg2sm[rg['ID']] = rg['SM']
        except KeyError:
          tokeep[rg['SM']] = [rg['ID']]
          rg2sm[rg['ID']] = rg['SM']

if not tokeep:
  sys.stderr.write("No samples matched, exiting\n")
  sys.exit(1)

for sm in tokeep.keys():
  fname = extract_prefix + sm + '.bam'
  header = bamh.header
  if extract_by_rg:
    rh = [x for x in header['RG'] if x['RG'] == sm]
  else:
    rh = [x for x in header['RG'] if x['SM'] == sm]
  header['RG'] = rh
  sample2fo[sm] = pysam.Samfile(fname, 'wb', header=header)
  
  

for r in bamh.fetch():
  rg = [x[1] for x in r.tags if x[0] == 'RG'][0]
  try:
    sm = rg2sm[rg]
    sample2fo[sm].write(r)
  except KeyError:
    continue  
  
for sm in sample2fo.keys():
  sample2fo[sm].close()