Commits

Ross Lazarus  committed b06574e

Working - now writes ranks correctly and sorted

  • Participants
  • Parent commits 8bd93d7

Comments (0)

Files changed (1)

File tools/rgenetics/rgGSEA.py

         os.symlink(opts.use_gmt,fakeGMT)             
         fakeRanks = os.path.join(opts.output_dir,"%s.rnk" % os.path.basename(opts.input_name))
         if opts.input_ranks:
-            r = open(opts.input_ranks,'r').readlines() # suck in and remove blank ids that cause gsea to barf rml april 10 2012
-            r = [x.split('\t') for x in r[1:]] # ignore head
+            infname = opts.input_ranks
+            res = open(opts.input_ranks,'r').readlines() # suck in and remove blank ids that cause gsea to barf rml april 10 2012
+            res = [x.split('\t') for x in res[1:]] # ignore head
         else: # read tabular
-            dat = open(opts.input_tab,'r').readlines()
-            dat = [x.strip().split('\t') for x in dat]
-            pvals = [float(x[self.adjpvalcol]) for x in dat[1:]]
-            signs = [float(x[self.signcol]) for x in dat[1:]]
-            ids = [x[self.idcol] for x in dat[1:]]
-            r = []
+            infname = opts.input_tab
+            indat = open(opts.input_tab,'r').readlines()
+            dat = [x.strip().split('\t') for x in indat[1:]]
+            pvals = [float(x[self.adjpvalcol]) for x in dat]
+            signs = [float(x[self.signcol]) for x in dat]
+            ids = [x[self.idcol] for x in dat]
+            res = []
             for i,id in enumerate(ids):
                 p = pvals[i]
                 if p == 0.0:
                      p = 1e-999
-                lp = math.log(p,10)
+                lp = -math.log10(p) # large positive if low p value
                 sign = signs[i]
-                if sign > 0:
-                     p = -p # if negative fold change, leave p as negative but if positive, swap so large positive number instead of large negative 
-                r.append((p,(id,'%f\n' % p))) # decorate
-            r.sort()
-            r = [x[1] for x in r] # undecorate so ranked in log pvalue order
-        len1 = len(r)
-        r = [x for x in r if len(x[0]) > 0 and len(x[1]) > 0 and x[0].upper() <> 'NA']
-        len2 = len(r)
+                if sign < 0:
+                     lp = -lp # if negative, swap p to negative
+                res.append((id,'%f\n' % lp))
+        len1 = len(res)
+        res = [x for x in res if len(x[0]) > 0 and len(x[1]) > 0 and x[0].upper() <> 'NA']
+        len2 = len(res)
         delta = len1 - len2
         if delta <> 0:
-            print sys.stdout,'WARN: %d of %d rank input file %s rows deleted - probably dup or NA IDs' % (delta,len1,opts.input_ranks)
-        rd = dict(zip([x[0] for x in r],r)) # this does something mysterious to dupes - probably keeps first one
-        ranks = ['\t'.join(x) for x in rd.values()]
+            print sys.stdout,'WARN: %d of %d rank input file %s rows deleted - probably dup or NA IDs' % (delta,len1,infname)
+        rd = dict(zip([x[0] for x in res],res)) # this does something mysterious to dupes - probably keeps last one
+        ranks = rd.values()
+        ranks = [(float(x[1]),x) for x in ranks] # decorate
+        ranks.sort()
+        ranks.reverse()
+        ranks = ['\t'.join(x[1]) for x in ranks]
         if len(ranks) < 2:
-             print sys.stderr,'Input %s has 1 or less rows with two tab delimited fields - please check the tool documentation' % opts.input_ranks
+             print sys.stderr,'Input %s has 1 or less rows with two tab delimited fields - please check the tool documentation' % infname
              sys.exit(1)
         rclean = open(fakeRanks,'w')
+        rclean.write('contig\tscore\n')
         rclean.write(''.join(ranks)) # line ends untouched
         rclean.close()
         cl = []