Source

utils / extract_sm.py

Davide Cittaro 68aefec 



























Davide Cittaro f4cdaa5 

Davide Cittaro 68aefec 






Davide Cittaro f4cdaa5 
Davide Cittaro 68aefec 


Davide Cittaro f4cdaa5 
Davide Cittaro 68aefec 

Davide Cittaro f4cdaa5 
Davide Cittaro 68aefec 



Davide Cittaro f4cdaa5 
Davide Cittaro 68aefec 



Davide Cittaro f4cdaa5 
Davide Cittaro 68aefec 

Davide Cittaro f4cdaa5 
Davide Cittaro 68aefec 
Davide Cittaro f4cdaa5 


Davide Cittaro 68aefec 
Davide Cittaro f4cdaa5 



















#!/usr/bin/python

import sys
import os
import pysam
import getopt


bam_file = None
samples = []
extract_by_rg = False
extract_prefix = "extract_"

(opts, args) = getopt.getopt(sys.argv[1:], 'f:rp:')
for o, a in opts:
  if o == '-f':
    bam_file = a
  if o == '-r':
    extract_by_rg = True  
  if o == '-p':
    extract_prefix = a

if not bam_file or not os.path.exists(bam_file):
  sys.stderr.write("Please, provide a BAM file\n")
  sys.exit(1)

samples = args
tokeep = {} #a dictionary files -> list of read groups 
rg2sm = {}
sample2fo = {}

bamh = pysam.Samfile(bam_file, 'rb')
for rg in bamh.header['RG']:
  if len(samples) == 0:
    # create one file for each
    if extract_by_rg:
      tokeep[rg['ID']] = [rg['ID']]
      rg2sm[rg['ID']] = rg['ID']
    else:
      try:
        tokeep[rg['SM']].append(rg['ID'])
        rg2sm[rg['ID']] = rg['SM']
      except KeyError:  
        tokeep[rg['SM']] = [rg['ID']]
        rg2sm[rg['ID']] = rg['SM']
  else:
    if extract_by_rg:
      if rg['ID'] in samples:
        tokeep[rg['ID']] = [rg['ID']]
        rg2sm[rg['ID']] = rg['ID']
    else:
      if rg['SM'] in samples:
        try:
          tokeep[rg['SM']].append(rg['ID'])
          rg2sm[rg['ID']] = rg['SM']
        except KeyError:
          tokeep[rg['SM']] = [rg['ID']]
          rg2sm[rg['ID']] = rg['SM']

if not tokeep:
  sys.stderr.write("No samples matched, exiting\n")
  sys.exit(1)

for sm in tokeep.keys():
  fname = extract_prefix + sn + '.bam'
  header = bamh.header
  if extract_by_rg:
    rh = [x for x in header['RG'] if x['RG'] == sm]
  else:
    rh = [x for x in header['RG'] if x['SM'] == sm]
  header['RG'] = rh
  sample2fo[sm] = pysam.Samfile(fname, 'wb', header=header)
  
  

for r in bamh.fetch():
  rg = [x[1] for x in r.tags if x[0] == 'RG'][0]
  sm = rg2sm[rg]
  sample2fo[sm].write(r)
  
for sm in sample2fo.keys():
  sample2fo[sm].close()