Snippets
Created by
Sasha Kaurov
last modified
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 | import numpy as np
import matplotlib.pyplot as plt
# read stuff
data = np.genfromtxt('/group/dolan-lab/ekhramtsova/IDELA_GEMMA/plink_files/YRI1and3_bgl_R2_0.8_101208.rm_me_maf_0.05_hwe_0.001_missing.phen.removed_SNPs.post.assoc.frq', dtype=None, names=True)
ref = np.genfromtxt('GWAS_2.2M_10.4_SNPs.txt', dtype=None)
ref_gen = np.genfromtxt('snpinfo_ref.xls', dtype=None, delimiter='\t', names=True)
# arrays with reference maf's
ref_maf = np.zeros(len(ref))
# filling-up mafs:
for i in range(len(ref_maf)):
ref_maf[i] = data['MAF'][ref[i] == data['SNP']][0]
# define bins edges(!!!)
bins = np.arange(0, 0.50001, 0.05)
# That is how reference maf array looks like:
H,X,p = plt.hist(ref_maf, bins=bins) #H -- distribution
plt.show()
# Here is ho maf in 'data' is distributed:
plt.hist(data['MAF'], bins=bins)
plt.show()
# Now generating random samples
N = 1000 # numer of random rs's
mock = np.zeros(N*len(ref), dtype=('|S20, float'))
j = 0
for i in range(len(bins)-1):
temp = data[(data['MAF']>bins[i]) & (data['MAF']<=bins[i+1])]
temp = temp[np.random.permutation(len(temp))]
temp = temp[:N*H[i]]
mock['f0'][j:j+N*H[i]] = temp['SNP']
mock['f1'][j:j+N*H[i]] = temp['MAF']
j += N*H[i]
# check
plt.hist(mock['f1'], bins=bins)
plt.show()
# save list of SNPs
np.savetxt('temp.txt', mock['f0'])
snpinfo = np.genfromtxt('snpinfo.xls', dtype=None, delimiter='\t', names=True)
snpinfo_maf = np.zeros(len(snpinfo))
j=0
for i in range(len(bins)-1):
snpinfo_maf[j:j+H[i]*N] = bins[i]+0.001
j+=H[i]*N
plt.hist(snpinfo_maf, bins=bins)
plt.show()
hist_temp = []
hist_temp_g = []
for ii in range(N):
g = []
rs_i = 0
for i in range(len(bins)-1):
temp = snpinfo[(snpinfo_maf>bins[i]) & (snpinfo_maf<=bins[i+1])]
temp = temp[ii*H[i]:(1+ii)*H[i]]
temp = temp['expression_gene_population_and_pvaluesup_span_titleThe_eQTL_pvalue_in_a_population_CEU_or_YRI_for_a_particular_SNP_together_with_the_target_gene_is_shownbbspansup']
for k in range(len(temp)):
b = False
temp2 = temp[k].split(':')
for j in range(len(temp2)):
temp3 = temp2[j].split(' ')
if len(temp3)>1:
if temp3[1] == 'YRI':
g.append(temp3[0])
b=True
rs_i += b
print rs_i, len(set(g))
hist_temp.append(rs_i)
hist_temp_g.append(len(set(g)))
plt.hist(hist_temp)
plt.plot([58], [0], 'or', ms=20)
plt.show()
plt.plot(hist_temp, hist_temp_g, '.b')
plt.plot([58], [200], 'or', ms=20)
plt.show()
g = []
rs_i = 0
temp = ref_gen
temp = temp['expression_gene_population_and_pvaluesup_span_titleThe_eQTL_pvalue_in_a_population_CEU_or_YRI_for_a_particular_SNP_together_with_the_target_gene_is_shownbbspansup']
for k in range(len(temp)):
b = False
temp2 = temp[k].split(':')
for j in range(len(temp2)):
temp3 = temp2[j].split(' ')
if temp3[1] == 'YRI':
g.append(temp3[0])
b=True
rs_i += b
print rs_i, len(set(g))
##########
snp_list_8000 = np.genfromtxt('snp_list_8000.csv', dtype=None)
j=0
for i in range(len(ref)):
np.sum(ref[i] == snp_list_8000)
|
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 | import numpy as np
import matplotlib.pyplot as plt
# read stuff
data = np.genfromtxt('/group/dolan-lab/ekhramtsova/IDELA_GEMMA/YRI.chr1_22.TGP_and_imputed.20130416.exclude_BAD_SNPs.removed_missing_ind.frq', dtype=None, names=True)
ref = np.genfromtxt('GWAS_6M_10.4_SNPs.txt', dtype=None)
ref_gen = np.genfromtxt('snpinfo (2).xls', dtype=None, delimiter='\t', names=True)
# arrays with reference maf's
ref_maf = np.zeros(len(ref))
# filling-up mafs:
for i in range(len(ref_maf)):
ref_maf[i] = data['MAF'][ref[i] == data['SNP']][0]
# define bins edges(!!!)
bins = np.arange(0, 0.50001, 0.05)
# That is how reference maf array looks like:
H,X,p = plt.hist(ref_maf, bins=bins) #H -- distribution
plt.show()
# Here is ho maf in 'data' is distributed:
plt.hist(data['MAF'], bins=bins)
plt.show()
# Now generating random samples
N = 100 # numer of random rs's
mock = np.zeros(N*len(ref), dtype=('|S20, float'))
j = 0
for i in range(len(bins)-1):
temp = data[(data['MAF']>bins[i]) & (data['MAF']<=bins[i+1])]
temp = temp[np.random.permutation(len(temp))]
temp = temp[:N*H[i]]
mock['f0'][j:j+N*H[i]] = temp['SNP']
mock['f1'][j:j+N*H[i]] = temp['MAF']
j += N*H[i]
# check
plt.hist(mock['f1'], bins=bins)
plt.show()
# save list of SNPs
np.savetxt('temp_6m.txt', mock['f0'], fmt='%s')
np.savetxt('temp_6m_maf.txt', mock['f1'], fmt='%f')
snpinfo = np.genfromtxt('snpinfo.xls', dtype=None, delimiter='\t', names=True)
snpinfo_maf = np.zeros(len(snpinfo))
j=0
for i in range(len(bins)-1):
snpinfo_maf[j:j+H[i]*N] = bins[i]+0.001
j+=H[i]*N
plt.hist(snpinfo_maf, bins=bins)
plt.show()
hist_temp = []
hist_temp_g = []
for ii in range(N):
g = []
rs_i = 0
for i in range(len(bins)-1):
temp = snpinfo[(snpinfo_maf>bins[i]) & (snpinfo_maf<=bins[i+1])]
temp = temp[ii*H[i]:(1+ii)*H[i]]
temp = temp['expression_gene_population_and_pvaluesup_span_titleThe_eQTL_pvalue_in_a_population_CEU_or_YRI_for_a_particular_SNP_together_with_the_target_gene_is_shownbbspansup']
for k in range(len(temp)):
b = False
temp2 = temp[k].split(':')
for j in range(len(temp2)):
temp3 = temp2[j].split(' ')
if len(temp3)>1:
if temp3[1] == 'YRI':
g.append(temp3[0])
b=True
rs_i += b
print rs_i, len(set(g))
hist_temp.append(rs_i)
hist_temp_g.append(len(set(g)))
plt.hist(hist_temp)
plt.plot([58], [0], 'or', ms=20)
plt.show()
plt.plot(hist_temp, hist_temp_g, '.b')
plt.plot([58], [200], 'or', ms=20)
plt.show()
g = []
rs_i = 0
temp = ref_gen
temp = temp['expression_gene_population_and_pvaluesup_span_titleThe_eQTL_pvalue_in_a_population_CEU_or_YRI_for_a_particular_SNP_together_with_the_target_gene_is_shownbbspansup']
for k in range(len(temp)):
b = False
temp2 = temp[k].split(':')
for j in range(len(temp2)):
temp3 = temp2[j].split(' ')
if temp3[1] == 'YRI':
g.append(temp3[0])
b=True
rs_i += b
print rs_i, len(set(g))
##########
snp_list_8000 = np.genfromtxt('snp_list_8000.csv', dtype=None)
j=0
for i in range(len(ref)):
j+=np.sum(ref[i] == snp_list_8000)
print j
|
Comments (0)
You can clone a snippet to your computer for local editing. Learn more.