1. Jeffrey Hsu
  2. seqAnno

Commits

Jeffrey Hsu  committed 4d512d2 Merge

merge committ

  • Participants
  • Parent commits e9c671d, 1154eaa
  • Branches default

Comments (0)

Files changed (2)

File geneCounts.py

View file
-import optparse, pysam, sys, csv
+"""
+Grabs the counts from an annotated gene list and calculates some statistics using an annotation file.
+Currently only does a gene by bene approach.  Working on to improve both
+functionality and speed.  Right now this takes a very long time to finish.  
+
+Usage:
+    python geneCounts.py [options] GTFfile BAMFile
+
+
+Author: Jeffrey Hsu
+2010
+"""
+
+import optparse, pysam, sys, cProfile
+from gtfFile import gtfFile
+
+class candidateRegion(object):
+    def __init__(self):
+	pass
+
 
 class Counter(object):
+    """
+	Callback class
+    """
     mCounts = 0
     def __init__(self, gene):
 	self.gene = gene
+	# span_slice contains a list of the reads that span splice junctions
+	self.splicereads = []
+
 	#self.transcript = transcript
     def __call__(self, alignment):
-	self.mCounts += 1
+	if alignment.is_proper_pair and alignment.is_read1:
+	    cigarTypes = [ x[0] for x in alignment.cigar]
+	    if 3 in cigarTypes:
+		# 3 corresponds to skipping reference ie it spans a splice
+		# junction
+		if alignment.qname in self.splicereads:
+		    pass
+		# :TODO Those indexes are run dependent, but using for speed
+		# atm
+		else: 
+		    self.splicereads.append(alignment.qname)
+		    self.mCounts +=  1 
+	    else:
+		self.mCounts += 1
 
 def main():
     ###########################
     p.add_option("-D", "--debug", action="store_true", dest="D", help="debug")
 
     options, args = p.parse_args()
-    debug = 0
-    # :TODO use a wrapper class for the GTF file 
-    fh = csv.reader(open(args[0], "rU"), delimiter="\t")
+    debug = 0 
     bamfile = pysam.Samfile(args[1], "rb")
-    
+ 
     #firstline = fh.next()
     # 2 Z-score up, and increases count as an exon.  
     
     c = Counter("Gene")
+    
+    """
+    fh = gtfFile(args[0])
     for line in fh:
-	gene = line[8][9:line[8].find(";")-1]
-	transcript = line[8][line[8].find(";")+18:line[8].rfind(";")-1]
-	if c.gene==gene:  pass
+	
+	if c.gene==line.gene:  pass
 	else:
 	    print(str(c.gene)+"\t" + str(c.mCounts))
-	    c = Counter(gene=gene)
-	if line[0] in contigs and line[2]=="exon" and "_dup1" not in transcript:
-	    bamfile.fetch(line[0], int(line[3]), int(line[4]) , callback= c)
+	    # Creates a new counter object for the new gene
+	    c = Counter(gene=line.gene)
+	if line.region in contigs and line.type=="exon" and "_dup1" not in\
+		line.gene:
+	    bamfile.fetch(line.region, line.start, line.end, callback= c)
 	else:
 	    pass    
+    """
+    fh = open(args[0], "rU")
+    for line in fh:
+	line = line.strip('\n').split('\t')
+	gene = line[8][9:line[8].find(";")-1]
+	if c.gene == gene: pass
+	else: 
+	    print(str(c.gene)+"\t" + str(c.mCounts))
+	    c = Counter(gene=gene)
+	if line[0] in contigs and line[2] == "exon" and "_dubp" not in\
+		gene:
+	    bamfile.fetch(line[0], int(line[3]), int(line[4]), callback=c)
+	else:
+	    pass
+	
 	if options.D:
 	    debug += 1
 	    if debug > 400:
 		break
 	    else:pass
 	else: pass
-    
 
 
 if __name__=='__main__':

File gtfFile.py

View file
 
 class baseFile(object):
     def __init__(self, filename):
-    """ Removes the header
-    """
+	self.processed_Meta = False
+
+	
 	if isinstance(filename, str):
 	    try:
 		self.handle = open(filename, "rU")
 	else: 
 	    print("GTFfile needs a filename or a file")
 	    sys.exit()
-    
+	 
     def __iter__(self):
 	return self
     
     def __parse_line(self, line):
 	return line
-
-    def next(self):
+    
+    def iterationCheck(self):
 	if self.handle.closed:
 	    raise StopIteration
 	else: pass
 	if not line:
 	    self.handle.close()
 	    raise StopIteration
-	return __parse_line(self.line)
+	return line
+
+    def next(self):
+	line = iterationCheck()
+	return line
+
 
     def chunk(self, nodes):
 	"""
     """
 	A flexible container object for GTF/GFF files.  
     """
-    self.processed_Meta = False
 
 
     def process_Meta():
 	self.processed_Meta = True
 	
 
-    def __parse_line():
+    def __parse_line(self,line):
 	line = line.strip('\n').split('\t')
-	propString = 'region, source, type, start, end, unknown, direction, stuff,
-	gene, transcript'
+	propString = 'region, source, type, start, end, unknown, direction,\
+		stuff, gene, transcript'
 	conv_line = (str, str, str, int, int, str, str, str, str, str)
+	gene = line[8][9:line[8].find(";")-1]
+	transcript = line[8][line[8].find(";")+17:line[8].rfind(";")-1]
+	line[-1]=gene
+	line.append(transcript)
 	gtfLine = collections.namedtuple('gtfLine', propString)
 	try:
 	    modLine = gtfLine(*[x(y) for x,y in zip(conv_line, line)])
 	    sys.exit()
 	return modLine
 
+    def next(self):
+	line = self.iterationCheck()
+	return(self.__parse_line(line))