Created by Sasha Kaurov last modified 2015-09-03

2.2m

import numpy as np
import matplotlib.pyplot as plt

# read stuff
data = np.genfromtxt('/group/dolan-lab/ekhramtsova/IDELA_GEMMA/plink_files/YRI1and3_bgl_R2_0.8_101208.rm_me_maf_0.05_hwe_0.001_missing.phen.removed_SNPs.post.assoc.frq', dtype=None, names=True)
ref = np.genfromtxt('GWAS_2.2M_10.4_SNPs.txt', dtype=None)
ref_gen = np.genfromtxt('snpinfo_ref.xls', dtype=None, delimiter='\t', names=True)

# arrays with reference maf's
ref_maf = np.zeros(len(ref))

# filling-up mafs:
for i in range(len(ref_maf)):
    ref_maf[i] = data['MAF'][ref[i] == data['SNP']][0]

# define bins edges(!!!)
bins = np.arange(0, 0.50001, 0.05)

# That is how reference maf array looks like:
H,X,p = plt.hist(ref_maf, bins=bins) #H -- distribution
plt.show()

# Here is ho maf in 'data' is distributed:
plt.hist(data['MAF'], bins=bins)
plt.show()

# Now generating random samples
N = 1000 # numer of random rs's

mock = np.zeros(N*len(ref), dtype=('|S20, float'))
j = 0
for i in range(len(bins)-1):
    temp = data[(data['MAF']>bins[i]) & (data['MAF']<=bins[i+1])]
    temp = temp[np.random.permutation(len(temp))]
    temp = temp[:N*H[i]]
    mock['f0'][j:j+N*H[i]] = temp['SNP']
    mock['f1'][j:j+N*H[i]] = temp['MAF']
    j += N*H[i]

# check 
plt.hist(mock['f1'], bins=bins)
plt.show()

# save list of SNPs
np.savetxt('temp.txt', mock['f0'])



snpinfo = np.genfromtxt('snpinfo.xls', dtype=None, delimiter='\t', names=True)
snpinfo_maf = np.zeros(len(snpinfo))
j=0
for i in range(len(bins)-1):
	snpinfo_maf[j:j+H[i]*N] = bins[i]+0.001
	j+=H[i]*N

plt.hist(snpinfo_maf, bins=bins)
plt.show()

hist_temp = []
hist_temp_g = []
for ii in range(N):
	g = []
	rs_i = 0
	for i in range(len(bins)-1):
		temp = snpinfo[(snpinfo_maf>bins[i]) & (snpinfo_maf<=bins[i+1])]
		temp = temp[ii*H[i]:(1+ii)*H[i]]
		temp = temp['expression_gene_population_and_pvaluesup_span_titleThe_eQTL_pvalue_in_a_population_CEU_or_YRI_for_a_particular_SNP_together_with_the_target_gene_is_shownbbspansup']
		for k in range(len(temp)):
			b = False
			temp2 = temp[k].split(':')
			for j in range(len(temp2)):
				temp3 = temp2[j].split(' ')
				if len(temp3)>1:
					if temp3[1] == 'YRI':
						g.append(temp3[0])
						b=True
			rs_i += b
	print rs_i, len(set(g))
	hist_temp.append(rs_i)
	hist_temp_g.append(len(set(g)))

plt.hist(hist_temp)
plt.plot([58], [0], 'or', ms=20)
plt.show()

plt.plot(hist_temp, hist_temp_g, '.b')
plt.plot([58], [200], 'or', ms=20)
plt.show()

g = []
rs_i = 0
temp = ref_gen
temp = temp['expression_gene_population_and_pvaluesup_span_titleThe_eQTL_pvalue_in_a_population_CEU_or_YRI_for_a_particular_SNP_together_with_the_target_gene_is_shownbbspansup']
for k in range(len(temp)):
	b = False
	temp2 = temp[k].split(':')
	for j in range(len(temp2)):
		temp3 = temp2[j].split(' ')
		if temp3[1] == 'YRI':
			g.append(temp3[0])
			b=True
	rs_i += b

print rs_i, len(set(g))


##########

snp_list_8000 = np.genfromtxt('snp_list_8000.csv', dtype=None)
j=0
for i in range(len(ref)):
    np.sum(ref[i] == snp_list_8000)

Raw

import numpy as np
import matplotlib.pyplot as plt

# read stuff

data = np.genfromtxt('/group/dolan-lab/ekhramtsova/IDELA_GEMMA/YRI.chr1_22.TGP_and_imputed.20130416.exclude_BAD_SNPs.removed_missing_ind.frq', dtype=None, names=True)
ref = np.genfromtxt('GWAS_6M_10.4_SNPs.txt', dtype=None)
ref_gen = np.genfromtxt('snpinfo (2).xls', dtype=None, delimiter='\t', names=True)

# arrays with reference maf's
ref_maf = np.zeros(len(ref))

# filling-up mafs:
for i in range(len(ref_maf)):
    ref_maf[i] = data['MAF'][ref[i] == data['SNP']][0]

# define bins edges(!!!)
bins = np.arange(0, 0.50001, 0.05)

# That is how reference maf array looks like:
H,X,p = plt.hist(ref_maf, bins=bins) #H -- distribution
plt.show()

# Here is ho maf in 'data' is distributed:
plt.hist(data['MAF'], bins=bins)
plt.show()

# Now generating random samples
N = 100 # numer of random rs's

mock = np.zeros(N*len(ref), dtype=('|S20, float'))
j = 0
for i in range(len(bins)-1):
    temp = data[(data['MAF']>bins[i]) & (data['MAF']<=bins[i+1])]
    temp = temp[np.random.permutation(len(temp))]
    temp = temp[:N*H[i]]
    mock['f0'][j:j+N*H[i]] = temp['SNP']
    mock['f1'][j:j+N*H[i]] = temp['MAF']
    j += N*H[i]

# check 
plt.hist(mock['f1'], bins=bins)
plt.show()

# save list of SNPs
np.savetxt('temp_6m.txt', mock['f0'], fmt='%s')
np.savetxt('temp_6m_maf.txt', mock['f1'], fmt='%f')



snpinfo = np.genfromtxt('snpinfo.xls', dtype=None, delimiter='\t', names=True)
snpinfo_maf = np.zeros(len(snpinfo))
j=0
for i in range(len(bins)-1):
	snpinfo_maf[j:j+H[i]*N] = bins[i]+0.001
	j+=H[i]*N

plt.hist(snpinfo_maf, bins=bins)
plt.show()

hist_temp = []
hist_temp_g = []
for ii in range(N):
	g = []
	rs_i = 0
	for i in range(len(bins)-1):
		temp = snpinfo[(snpinfo_maf>bins[i]) & (snpinfo_maf<=bins[i+1])]
		temp = temp[ii*H[i]:(1+ii)*H[i]]
		temp = temp['expression_gene_population_and_pvaluesup_span_titleThe_eQTL_pvalue_in_a_population_CEU_or_YRI_for_a_particular_SNP_together_with_the_target_gene_is_shownbbspansup']
		for k in range(len(temp)):
			b = False
			temp2 = temp[k].split(':')
			for j in range(len(temp2)):
				temp3 = temp2[j].split(' ')
				if len(temp3)>1:
					if temp3[1] == 'YRI':
						g.append(temp3[0])
						b=True
			rs_i += b
	print rs_i, len(set(g))
	hist_temp.append(rs_i)
	hist_temp_g.append(len(set(g)))

plt.hist(hist_temp)
plt.plot([58], [0], 'or', ms=20)
plt.show()

plt.plot(hist_temp, hist_temp_g, '.b')
plt.plot([58], [200], 'or', ms=20)
plt.show()

g = []
rs_i = 0
temp = ref_gen
temp = temp['expression_gene_population_and_pvaluesup_span_titleThe_eQTL_pvalue_in_a_population_CEU_or_YRI_for_a_particular_SNP_together_with_the_target_gene_is_shownbbspansup']
for k in range(len(temp)):
	b = False
	temp2 = temp[k].split(':')
	for j in range(len(temp2)):
		temp3 = temp2[j].split(' ')
		if temp3[1] == 'YRI':
			g.append(temp3[0])
			b=True
	rs_i += b

print rs_i, len(set(g))


##########

snp_list_8000 = np.genfromtxt('snp_list_8000.csv', dtype=None)
j=0
for i in range(len(ref)):
    j+=np.sum(ref[i] == snp_list_8000)

print j

snippet.txt

Raw

import numpy as np
import matplotlib.pyplot as plt

# read stuff
data = np.genfromtxt('/group/dolan-lab/ekhramtsova/IDELA_GEMMA/plink_files/YRI1and3_bgl_R2_0.8_101208.rm_me_maf_0.05_hwe_0.001_missing.phen.removed_SNPs.post.assoc.frq', dtype=None, names=True)
ref = np.genfromtxt('GWAS_2.2M_10.4_SNPs.txt', dtype=None)
ref_gen = np.genfromtxt('snpinfo_ref.xls', dtype=None, delimiter='\t', names=True)

# arrays with reference maf's
ref_maf = np.zeros(len(ref))

# filling-up mafs:
for i in range(len(ref_maf)):
    ref_maf[i] = data['MAF'][ref[i] == data['SNP']][0]

# define bins edges(!!!)
bins = np.arange(0, 0.50001, 0.05)

# That is how reference maf array looks like:
H,X,p = plt.hist(ref_maf, bins=bins) #H -- distribution
plt.show()

# Here is ho maf in 'data' is distributed:
plt.hist(data['MAF'], bins=bins)
plt.show()

# Now generating random samples
N = 1000 # numer of random rs's


snpinfo = np.genfromtxt('low_thresh_112.xls', dtype=None, delimiter='\t', names=True)
#snpinfo_maf = np.loadtxt('temp_6m_maf.txt')[:len(snpinfo)]
snpinfo_maf = np.zeros(len(snpinfo))
j=0
for i in range(len(bins)-1):
	snpinfo_maf[j:j+H[i]*N] = bins[i]+0.001
	j+=H[i]*N

plt.hist(snpinfo_maf, bins=bins)
plt.show()


for tresh in [0.001,0.0001,0.0005,0.00001,0.00005]:
	g = []
	rs_i = 0
	rs_i2 = [0,0,0,0,0]
	temp = ref_gen
	temp = temp['expression_gene_population_and_pvaluesup_span_titleThe_eQTL_pvalue_in_a_population_CEU_or_YRI_for_a_particular_SNP_together_with_the_target_gene_is_shownbbspansup']
	for k in range(len(temp)):
		gx = []
		b = False
		rs_i2 = [0,0,0,0,0,0,0,0,0,0]
		temp2 = temp[k].split(':')
		for j in range(len(temp2)):
			temp3 = temp2[j].split(' ')
			if (temp3[1] == 'YRI') and (float(temp3[2])<tresh):
				g.append(temp3[0])
				gx.append(temp3[0])
				b=True
		rs_i += b
		print len(gx), rs_i2[9]
	print rs_i, len(set(g)), rs_i2
	rs_i_ref = 1.0*rs_i
	set_g_ref = len(set(g))
	hist_temp = []
	hist_temp_g = []
	hist_temp_g2 = []
	for ii in range(N-1):
		g = []
		rs_i = 0
		rs_i2 = [0,0,0,0,0,0,0,0,0,0]
		for i in range(len(bins)-1):
			temp = snpinfo[(snpinfo_maf>bins[i]) & (snpinfo_maf<=bins[i+1])]
			temp = temp[ii*H[i]:(1+ii)*H[i]]
			temp = temp['expression_gene_population_and_pvaluesup_span_titleThe_eQTL_pvalue_in_a_population_CEU_or_YRI_for_a_particular_SNP_together_with_the_target_gene_is_shownbbspansup']
			for k in range(len(temp)):
				gx = []
				b = False
				temp2 = temp[k].split(':')
				for j in range(len(temp2)):
					temp3 = temp2[j].split(' ')
					if len(temp3)>1:
						if (temp3[1] == 'YRI') and (float(temp3[2])<tresh):
							g.append(temp3[0])
							gx.append(temp3[0])
							b=True
				rs_i += b
		print rs_i, len(set(g)), rs_i2
		hist_temp.append(rs_i)
		hist_temp_g2.append(rs_i2)
		hist_temp_g.append(len(set(g)))
	plt.clf()
	plt.hist(hist_temp,30)
	plt.plot(rs_i_ref, [0], 'or', ms=20)
	plt.xlabel('eQTL count')
	plt.ylabel('frequency')
	plt.ylim([-6,160])
	plt.savefig('eQTL_count_2.2M_'+str(tresh)+'.png')

Comments (0)

HTTPS

SSH

You can clone a snippet to your computer for local editing. Learn more.

Snippets

Alexander Kaurov katya

Comments (0)