Source

SNPpy / merge.py

Full commit
from utils import print_timing

def create_anno_table(alleletype, **schemas):
   from sqlalchemy.sql import text
   sm = '_'.join([schemas[k] for k in sorted(schemas.keys())])
   if alleletype == None:
      allelea = "allelea_id"
      alleleb = "alleleb_id"
   elif alleletype == "forward":
      allelea = "allelea_forward_id"
      alleleb = "alleleb_forward_id"
   elif alleletype == "top":
      allelea = "allelea_top_id"
      alleleb = "alleleb_top_id"
   else:
      print "write_ped_file: alleletype %s unknown, cannot proceed"%alleletype
      exit(1)
   ctes = ["dp_anno_%(s)s AS \
     ( SELECT id, %(allelea)s, %(alleleb)s, rsid, chromosome, location \
     FROM \
             (SELECT  anno.*, \
                      row_number() OVER(PARTITION BY anno.rsid ORDER BY anno.id) \
             FROM     %(s)s.anno \
                      INNER JOIN %(s)s.geno \
                      ON       anno.id = geno.anno_id \
             WHERE    idlink_id        = \
                      (SELECT MIN(idlink.id) \
                      FROM    %(s)s.idlink \
                      ) \
             ) AS q \
     WHERE   row_number = '1' \
     ) \
   """%{"s":schemas[k], "allelea":allelea, "alleleb":alleleb} for k in sorted(schemas.keys())]
   sel = " INTERSECT ".join(["SELECT * FROM dp_anno_%(s)s"%{'s':schemas[k]} for k in sorted(schemas.keys())])
   create_anno_table = text("CREATE TABLE %(sm)s.anno AS (WITH "%{'sm':sm} + ','.join(ctes) + sel + ")", autocommit=True)
   return create_anno_table
   conn.execute(q)

def create_tables(**schemas):
   from sqlalchemy.sql import text
   a1 = schemas['s1']
   sm = '_'.join([schemas[k] for k in sorted(schemas.keys())])
   create_tables = text("""
   DROP SCHEMA IF EXISTS %(sm)s CASCADE;
   CREATE SCHEMA %(sm)s;
   CREATE TABLE %(sm)s.allele AS SELECT * FROM %(s1)s.allele;
   CREATE TABLE %(sm)s.chromo AS SELECT * FROM %(s1)s.chromo;
   CREATE TABLE %(sm)s.sex AS SELECT * FROM %(s1)s.sex;
   CREATE TABLE %(sm)s.snpval AS SELECT * FROM %(s1)s.snpval;
   """%{'s1':schemas['s1'], 'sm':sm}, autocommit=True)
   return create_tables

@print_timing
def write_merged_geno_table(conn, **schemas):
   from sqlalchemy.sql import text
   import copy
   sm = '_'.join([schemas[k] for k in sorted(schemas.keys())])
   l = len(schemas)
   schemas_ext = copy.copy(schemas)
   schemas_ext.update({'sm':sm})
   schemalst = [" * FROM %s.geno "%schemas[k] for k in sorted(schemas.keys())]
   cmdstr = "CREATE TABLE %s.geno AS SELECT"%sm + " UNION ALL SELECT ".join(schemalst)
   create_geno_table = text(cmdstr, autocommit=True)
   conn.execute(create_geno_table)

def create_merged_tables(**schemas):
   from sqlalchemy.sql import text
   import copy
   sm = '_'.join([schemas[k] for k in sorted(schemas.keys())])
   l = len(schemas)
   schemas_ext = copy.copy(schemas)
   schemas_ext.update({'sm':sm})
   schemalst1 = [" * FROM %s.idlink "%schemas[k] for k in sorted(schemas.keys())]
   cmdstr1 = "CREATE TABLE %s.idlink AS SELECT"%sm + " UNION ALL SELECT ".join(schemalst1)
   schemalst2 = [" * FROM %s.pheno "%schemas[k] for k in sorted(schemas.keys())]
   cmdstr2 = "CREATE TABLE %s.pheno AS SELECT"%sm + " UNION ALL SELECT ".join(schemalst2)
   schemalst3 = [" * FROM %s.race "%schemas[k] for k in sorted(schemas.keys())]
   cmdstr3 = "CREATE TABLE %s.race AS SELECT"%sm + " UNION ALL SELECT ".join(schemalst3)
   cmdstr = ";".join([cmdstr1, cmdstr2, cmdstr3])+";"
   merge_tables = text(cmdstr, autocommit=True)
   return merge_tables

@print_timing
def write_merged_map_ped_files(dbstring, mapfilename, pedfilename, alleletype, **schemas):
   import os, sys
   from dbutils import get_db_type
   dbtype = get_db_type(dbstring)
   if dbtype == 'affy6' and alleletype != None:
      sys.exit("database is marked as 'affy6', so alleletype option not required - you have used %s as alleletype."%alleletype)
   if dbtype == 'illumina':
      if alleletype != 'forward' and alleletype != 'top':
         sys.exit("database is marked as 'illumina', so alleletype must be either 'forward' or 'top', not %s"%alleletype)
   from utils import file_exists
   if [x for x in [mapfilename, pedfilename] if file_exists(x)]:
      sys.exit("Exiting...")
   print "creating map file %s and ped file %s"%(mapfilename, pedfilename)
   from sqlalchemy import create_engine
   db = create_engine(dbstring)
   conn = db.connect()
   from sqlalchemy.sql import text
   sm = '_'.join([schemas[k] for k in sorted(schemas.keys())])
   conn.execute(create_tables(**schemas))
   conn.execute(create_anno_table(alleletype, **schemas))
   conn.execute(create_merged_tables(**schemas))
   write_merged_geno_table(conn, **schemas)
   from output import write_map_file, write_ped_file
   write_map_file(dbstring, sm, mapfilename)
   write_ped_file(dbstring, sm, pedfilename, alleletype)

@print_timing
def write_merged_tfam_tped_files(dbstring, tfamfilename, tpedfilename, alleletype, **schemas):
   import os, sys
   from dbutils import get_db_type
   dbtype = get_db_type(dbstring)
   if dbtype == 'affy6' and alleletype != None:
      sys.exit("database is marked as 'affy6', so alleletype option not required - you have used %s as alleletype."%alleletype)
   if dbtype == 'illumina':
      if alleletype != 'forward' and alleletype != 'top':
         sys.exit("database is marked as 'illumina', so alleletype must be either 'forward' or 'top', not %s"%alleletype)
   from utils import file_exists
   if [x for x in [tfamfilename, tpedfilename] if file_exists(x)]:
      sys.exit("Exiting...")
   print "creating tfam file %s and tped file %s"%(tfamfilename, tpedfilename)
   from sqlalchemy import create_engine
   db = create_engine(dbstring)
   conn = db.connect()
   from sqlalchemy.sql import text
   sm = '_'.join([schemas[k] for k in sorted(schemas.keys())])
   conn.execute(create_tables(**schemas))
   conn.execute(create_anno_table(alleletype, **schemas))
   conn.execute(create_merged_tables(**schemas))
   write_merged_geno_table(conn, **schemas)
   from output import write_tfam_file, write_tped_file
   write_tfam_file(dbstring, sm, tfamfilename)
   write_tped_file(dbstring, sm, tpedfilename, alleletype)