Source

utils / extract_sm.py

Full commit
Davide Cittaro 68aefec 











Davide Cittaro 922beab 
Davide Cittaro 68aefec 
Davide Cittaro 922beab 
Davide Cittaro 68aefec 






Davide Cittaro 922beab 

Davide Cittaro 68aefec 





Davide Cittaro b441e82 
Davide Cittaro f4cdaa5 

Davide Cittaro 68aefec 

Davide Cittaro 922beab 





Davide Cittaro 68aefec 
Davide Cittaro b441e82 

Davide Cittaro 431a8df 
Davide Cittaro f4cdaa5 
Davide Cittaro 68aefec 
Davide Cittaro b441e82 
Davide Cittaro 431a8df 
Davide Cittaro b441e82 
Davide Cittaro 68aefec 
Davide Cittaro f4cdaa5 


Davide Cittaro 68aefec 
Davide Cittaro b441e82 
Davide Cittaro 922beab 
Davide Cittaro f4cdaa5 





Davide Cittaro b441e82 
Davide Cittaro f9979a9 
Davide Cittaro b441e82 

Davide Cittaro f9979a9 
Davide Cittaro b441e82 
Davide Cittaro f4cdaa5 




Davide Cittaro 922beab 




Davide Cittaro f4cdaa5 




Davide Cittaro 431a8df 
#!/usr/bin/python

import sys
import os
import pysam
import getopt


bam_file = None
samples = []
extract_by_rg = False
extract_prefix = "extract_"
list_samples = False

(opts, args) = getopt.getopt(sys.argv[1:], 'f:rp:l')
for o, a in opts:
  if o == '-f':
    bam_file = a
  if o == '-r':
    extract_by_rg = True  
  if o == '-p':
    extract_prefix = a
  if o == '-l':
    list_samples = True  

if not bam_file or not os.path.exists(bam_file):
  sys.stderr.write("Please, provide a BAM file\n")
  sys.exit(1)

samples = args
tokeep = [] 
rg2sm = {}
sample2fo = {}

bamh = pysam.Samfile(bam_file, 'rb')

if list_samples:
  for rg in bamh.header['RG']:
    sys.stdout.write("Sample: %s\tRead Group: %s\n" % (rg['SM'], rg['ID']))
  sys.exit(0)
  
for rg in bamh.header['RG']:
  if extract_by_rg:
    if len(samples)==0 or rg['ID'] in samples:
      tokeep.append(rg['ID'])
      rg2sm[rg['ID']] = rg['ID']
  else:
    if len(samples)==0 or rg['SM'] in samples:
      tokeep.append(rg['SM'])
      rg2sm[rg['ID']] = rg['SM']

if not tokeep:
  sys.stderr.write("No samples matched, exiting\n")
  sys.exit(1)

for sm in set(tokeep):
  fname = extract_prefix + sm + '.bam'
  header = bamh.header
  if extract_by_rg:
    rh = [x for x in header['RG'] if x['RG'] == sm]
  else:
    rh = [x for x in header['RG'] if x['SM'] == sm]
  header['RG'] = rh
  if not os.path.exists(fname):
    sys.stderr.write("Opening %s\n" % fname)
    sample2fo[sm] = pysam.Samfile(fname, 'wb', header=header)
  else:
    sys.stderr.write("File %s already exists, couldn't proceed\n" % fname)
    sys.exit(1)  
  
  

for r in bamh.fetch():
  rg = [x[1] for x in r.tags if x[0] == 'RG'][0]
  try:
    sm = rg2sm[rg]
    sample2fo[sm].write(r)
  except KeyError:
    continue  
  
for sm in sample2fo.keys():
  sample2fo[sm].close()