Snippets
Created by
Sasha Kaurov
last modified
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 | import numpy as np import matplotlib.pyplot as plt # read stuff data = np.genfromtxt('/group/dolan-lab/ekhramtsova/IDELA_GEMMA/plink_files/YRI1and3_bgl_R2_0.8_101208.rm_me_maf_0.05_hwe_0.001_missing.phen.removed_SNPs.post.assoc.frq', dtype=None, names=True) ref = np.genfromtxt('GWAS_2.2M_10.4_SNPs.txt', dtype=None) ref_gen = np.genfromtxt('snpinfo_ref.xls', dtype=None, delimiter='\t', names=True) # arrays with reference maf's ref_maf = np.zeros(len(ref)) # filling-up mafs: for i in range(len(ref_maf)): ref_maf[i] = data['MAF'][ref[i] == data['SNP']][0] # define bins edges(!!!) bins = np.arange(0, 0.50001, 0.05) # That is how reference maf array looks like: H,X,p = plt.hist(ref_maf, bins=bins) #H -- distribution plt.show() # Here is ho maf in 'data' is distributed: plt.hist(data['MAF'], bins=bins) plt.show() # Now generating random samples N = 1000 # numer of random rs's mock = np.zeros(N*len(ref), dtype=('|S20, float')) j = 0 for i in range(len(bins)-1): temp = data[(data['MAF']>bins[i]) & (data['MAF']<=bins[i+1])] temp = temp[np.random.permutation(len(temp))] temp = temp[:N*H[i]] mock['f0'][j:j+N*H[i]] = temp['SNP'] mock['f1'][j:j+N*H[i]] = temp['MAF'] j += N*H[i] # check plt.hist(mock['f1'], bins=bins) plt.show() # save list of SNPs np.savetxt('temp.txt', mock['f0']) snpinfo = np.genfromtxt('snpinfo.xls', dtype=None, delimiter='\t', names=True) snpinfo_maf = np.zeros(len(snpinfo)) j=0 for i in range(len(bins)-1): snpinfo_maf[j:j+H[i]*N] = bins[i]+0.001 j+=H[i]*N plt.hist(snpinfo_maf, bins=bins) plt.show() hist_temp = [] hist_temp_g = [] for ii in range(N): g = [] rs_i = 0 for i in range(len(bins)-1): temp = snpinfo[(snpinfo_maf>bins[i]) & (snpinfo_maf<=bins[i+1])] temp = temp[ii*H[i]:(1+ii)*H[i]] temp = temp['expression_gene_population_and_pvaluesup_span_titleThe_eQTL_pvalue_in_a_population_CEU_or_YRI_for_a_particular_SNP_together_with_the_target_gene_is_shownbbspansup'] for k in range(len(temp)): b = False temp2 = temp[k].split(':') for j in range(len(temp2)): temp3 = temp2[j].split(' ') if len(temp3)>1: if temp3[1] == 'YRI': g.append(temp3[0]) b=True rs_i += b print rs_i, len(set(g)) hist_temp.append(rs_i) hist_temp_g.append(len(set(g))) plt.hist(hist_temp) plt.plot([58], [0], 'or', ms=20) plt.show() plt.plot(hist_temp, hist_temp_g, '.b') plt.plot([58], [200], 'or', ms=20) plt.show() g = [] rs_i = 0 temp = ref_gen temp = temp['expression_gene_population_and_pvaluesup_span_titleThe_eQTL_pvalue_in_a_population_CEU_or_YRI_for_a_particular_SNP_together_with_the_target_gene_is_shownbbspansup'] for k in range(len(temp)): b = False temp2 = temp[k].split(':') for j in range(len(temp2)): temp3 = temp2[j].split(' ') if temp3[1] == 'YRI': g.append(temp3[0]) b=True rs_i += b print rs_i, len(set(g)) ########## snp_list_8000 = np.genfromtxt('snp_list_8000.csv', dtype=None) j=0 for i in range(len(ref)): np.sum(ref[i] == snp_list_8000) |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 | import numpy as np import matplotlib.pyplot as plt # read stuff data = np.genfromtxt('/group/dolan-lab/ekhramtsova/IDELA_GEMMA/YRI.chr1_22.TGP_and_imputed.20130416.exclude_BAD_SNPs.removed_missing_ind.frq', dtype=None, names=True) ref = np.genfromtxt('GWAS_6M_10.4_SNPs.txt', dtype=None) ref_gen = np.genfromtxt('snpinfo (2).xls', dtype=None, delimiter='\t', names=True) # arrays with reference maf's ref_maf = np.zeros(len(ref)) # filling-up mafs: for i in range(len(ref_maf)): ref_maf[i] = data['MAF'][ref[i] == data['SNP']][0] # define bins edges(!!!) bins = np.arange(0, 0.50001, 0.05) # That is how reference maf array looks like: H,X,p = plt.hist(ref_maf, bins=bins) #H -- distribution plt.show() # Here is ho maf in 'data' is distributed: plt.hist(data['MAF'], bins=bins) plt.show() # Now generating random samples N = 100 # numer of random rs's mock = np.zeros(N*len(ref), dtype=('|S20, float')) j = 0 for i in range(len(bins)-1): temp = data[(data['MAF']>bins[i]) & (data['MAF']<=bins[i+1])] temp = temp[np.random.permutation(len(temp))] temp = temp[:N*H[i]] mock['f0'][j:j+N*H[i]] = temp['SNP'] mock['f1'][j:j+N*H[i]] = temp['MAF'] j += N*H[i] # check plt.hist(mock['f1'], bins=bins) plt.show() # save list of SNPs np.savetxt('temp_6m.txt', mock['f0'], fmt='%s') np.savetxt('temp_6m_maf.txt', mock['f1'], fmt='%f') snpinfo = np.genfromtxt('snpinfo.xls', dtype=None, delimiter='\t', names=True) snpinfo_maf = np.zeros(len(snpinfo)) j=0 for i in range(len(bins)-1): snpinfo_maf[j:j+H[i]*N] = bins[i]+0.001 j+=H[i]*N plt.hist(snpinfo_maf, bins=bins) plt.show() hist_temp = [] hist_temp_g = [] for ii in range(N): g = [] rs_i = 0 for i in range(len(bins)-1): temp = snpinfo[(snpinfo_maf>bins[i]) & (snpinfo_maf<=bins[i+1])] temp = temp[ii*H[i]:(1+ii)*H[i]] temp = temp['expression_gene_population_and_pvaluesup_span_titleThe_eQTL_pvalue_in_a_population_CEU_or_YRI_for_a_particular_SNP_together_with_the_target_gene_is_shownbbspansup'] for k in range(len(temp)): b = False temp2 = temp[k].split(':') for j in range(len(temp2)): temp3 = temp2[j].split(' ') if len(temp3)>1: if temp3[1] == 'YRI': g.append(temp3[0]) b=True rs_i += b print rs_i, len(set(g)) hist_temp.append(rs_i) hist_temp_g.append(len(set(g))) plt.hist(hist_temp) plt.plot([58], [0], 'or', ms=20) plt.show() plt.plot(hist_temp, hist_temp_g, '.b') plt.plot([58], [200], 'or', ms=20) plt.show() g = [] rs_i = 0 temp = ref_gen temp = temp['expression_gene_population_and_pvaluesup_span_titleThe_eQTL_pvalue_in_a_population_CEU_or_YRI_for_a_particular_SNP_together_with_the_target_gene_is_shownbbspansup'] for k in range(len(temp)): b = False temp2 = temp[k].split(':') for j in range(len(temp2)): temp3 = temp2[j].split(' ') if temp3[1] == 'YRI': g.append(temp3[0]) b=True rs_i += b print rs_i, len(set(g)) ########## snp_list_8000 = np.genfromtxt('snp_list_8000.csv', dtype=None) j=0 for i in range(len(ref)): j+=np.sum(ref[i] == snp_list_8000) print j |
Comments (0)
You can clone a snippet to your computer for local editing. Learn more.