orange-bioinformatics / _bioinformatics /

#!/usr/bin/env python

from collections import defaultdict
from import run_simple_query
from import GeneSets, GeneSet
import time

def display_string(string):
    return string.capitalize().replace("_", " ")

regulation = "updown"
organism   = "Homo sapiens"
condition  = "organism_part"
org_code   = "9606"
max_pvalue = 1e-5

query = run_simple_query(regulation=regulation, organism=organism, condition=condition, start=0, rows=5) # Do a short pre-query to see how many genes there are
total_genes = query["totalResults"]

no_of_genes = 50 # 50 genes per query used, since the HTTP requests don't seem stable with more than 50 genes at a time.
no_of_pages = total_genes / no_of_genes  # Gene Expression Atlas API only permits access to [50,200] genes at a time.

print total_genes

sets = defaultdict(list)

for start in range(no_of_pages+1):
    start = time.time()
    query = run_simple_query(regulation=regulation, organism=organism, condition=condition, start=start*no_of_genes, rows=no_of_genes)
    for result in query["results"]:
        i += 1
        print result["gene"]["name"] + "\t" + str(i) # For printing out gene names and the current number of genes during debugging
        for exp in result["expressions"]: 
            diff_exp = [e for e in exp["experiments"] if e["pvalue"] <= max_pvalue] # Use only genes that are significantly diff. expressed
            if diff_exp:
                    sets[exp["ef"], exp["efv"]].append(result["gene"]["name"]) # The Gene Expression Atlas entries are not consistent. 
                    sets[exp["efoTerm"], exp["efoId"]].append(result["gene"]["name"])
    print time.time() - start
gene_sets = []
for (ef, efv), genes in sets.items():
    ef_display = display_string(ef)
    gs = GeneSet(genes, "Diff. expressed in %s=%s." % (ef_display, efv), id=ef + ":" + efv,
                 description="Diff. expressed in %s=%s." % (ef_display, efv),
                 link="{organism}&gprop_0=&gnot_0=&gval_0=&fact_1=&fexp_1=UPDOWN&fmex_1=&fval_1=%22{efv}%22+&view=hm".format( \
                         organism = "+".join(organism.lower().split()), efv = "+".join(efv.lower().split())), organism=org_code, hierarchy=("Gene Expression Atlas", display_string(condition)))

final_set = GeneSets(gene_sets)

print final_set

from import register
import Orange.utils.serverfiles as serverfiles
import sys

    sf_server = serverfiles.ServerFiles(sys.argv[1], sys.argv[2])
    print "argv[1] = username, argv[2] = password"

set_split = final_set.split_by_hierarchy()

for s in set_split:
    register(s, sf_server)

print "Gene sets successfully registered..."