Commits

Jeffrey Hsu committed 9ccb236

Fixed header parsing. Made sure multiple read groups work. Now prints the
rsID as well.

  • Participants
  • Parent commits 82d1e49

Comments (0)

Files changed (1)

     """ Contains information about a givent position in a BAM file
     """
     #:TODO Need to get rid of low quality bases
-    def __init__(self, region, position, samples):
+    def __init__(self, region, position, samples, phredThreshold):
 	""" Location is one-based.  Internally this will get changed to zero
 	base by pysam.  
 	"""
 	self.allele_counts = {}
 	self.strand_bias = {}  
 	self.samples = samples
+	self.phredThreshold=phredThreshold
 	for i in self.samples:
 	    self.allele_counts[i] = {}
 
 
-    def __call__(self, pileups, position=None, samples=None):
+    def __call__(self, pileups, position=None, samples=None, phredThreshold=None):
 	if position == None: position = self.position
 	else:  pass
 	if samples == None: n = self.samples
 	else: pass
+	if phredThreshold == None: qualT = self.phredThreshold
 	# Hmm somewhere the possition got converted into zero base
 	if pileups.pos != position-1: pass
 	else:
 		# checks first to see if the read is a duplicate
 		if read.alignment.is_duplicate: pass
 		else:
-		    read_group = read.alignment.opt('RG')
 		    qpos = read.qpos
-		    base = read.alignment.seq[qpos]
-		    strand = read.alignment.is_reverse
-		    self.allele_counts[read_group][base]=\
-			    self.allele_counts[read_group].get(base,0)+1
-
-		    self.strand_bias[strand] = self.strand_bias.get(strand,0)+1
+		    base_quality = read.alignment.qual[qpos]
+		    if ord(base_quality)-33 > qualT:
+			read_group = read.alignment.opt('RG')
+			base = read.alignment.seq[qpos]
+			strand = read.alignment.is_reverse
+			self.allele_counts[read_group][base]=\
+				self.allele_counts[read_group].get(base,0)+1
+			self.strand_bias[strand]=\
+				self.strand_bias.get(strand,0)+1
+		    else: pass
 		    # print(qpos, read.alignment.qname, read.alignment.is_proper_pair)
 
 
     # :TODO These options do nothing right now?
     p.add_option("-v",  "--vcf_file", dest="inputisvcfile", help="the input \
 	    is a VCF file")
+    p.add_option("-q", "--quality_threshold",type="int", dest="qual", help="base quality \
+	    threshold to take allele counts from")
     options, args = p.parse_args()
-
+    if options.qual: pass
+    else: options.qual=20
     # Open the bedfile/vcf file
     # file_a = csv.reader(open(args[0], "rU"), delimiter="\t")
     # Right now defaulting to VCF file
 	samples = []
 	# Grab only the header information
 	header=bamFile.text
-	m = re.compile('\@RG.*$')
+	m = re.compile('@RG.*')
 	readGroups = m.findall(header)
 	for r in readGroups:
 	    r=r.split('\t')
 		else : pass
 	    readGroup_sample[ID] = SM
 	    samples.append(ID)
-	bam_ReadGroups[bamName] = samples
-
-	
+	bam_ReadGroups[bamName] = samples	
     # For testing purposes
     x=1
-    # :TODO print file header
-    header = ["chr", "pos"]
+    
+    # Print the Header
+    header = ["chr", "pos", "rsID", "ref"]
     for i in bam_Names:
 	ReadGroupsinBam = bam_ReadGroups[i]
 	for t in ReadGroupsinBam:
 	    header.append(readGroup_sample[t])
 	    header.append("Major")
 	    header.append("Minor")
+    print("\t".join(header))
 
-    print("\t".join(header))
+
     for line in file_a:
 	"""
 	if line[0][0] == "#":pass
 	for bamFile, bamNames in map(None, bam_Files, bam_Names):
 	    # :TODO in the VCF and bed files make sure to type the attributes
 	    variant = lociInformation(str(line.chr), int(line.pos),
-		    samples=bam_ReadGroups[bamNames])
+		    samples=bam_ReadGroups[bamNames],
+		    phredThreshold=options.qual)
 	    # Method Call
 	    bamFile.pileup(line.chr, int(line.pos), int(line.pos),callback=variant)
 	    # :TODO Make sure ordering is retained
 
 	    count_format.append(allele_string)
 	    count_format.extend(temp)
-	
-	print("%s\t%s\t" % (str(line.chr), str(line.pos))+"\t".join(count_format))
+	line_info = [str(line.chr), str(line.pos), str(line.id),
+		str(line.ref)]
+	line_info.extend(count_format)
+	print("\t".join(line_info))
 	# For testing purposes
+	"""
 	if x > 1: break
 	else: x += 1
+	"""
 	# :TODO k-means clustering of the data. 
 
 if __name__ == '__main__':