Commits

Giovanni Marco Dall'Olio committed d7cb100

MULT: new fixes by Ali, on the Multiple Alleles patch

I made some changes to the code, so it works without any errors. However, since I do not know the code fully, I am uncertain about some changes.
1. I don't know how the kbits function is working. Since we are not going to use binaries, I am not sure if this will break the code or not.
2. I change line 397 in binary_to_network.py to snps_str = re.findall("^.*?\s+?([01ACGTactg ]*)$", current_line). Orignially it was snps_str = re.findall("^.*?\s+?([01 ]*)$", current_line).
3. Finally, I have changed the pair_distance in GenerateNetwork.py from
pair_distance = '{0:08b}'.format(genotype1['value']^genotype2['value']).count('1')
to
pair_distance = len([i for i in xrange(len(genotype1['genotype'])) if genotype1['genotype'][i] != genotype2['genotype'][i]])

Here I changed genotype1['value'] to genotype1['genotype'] because it would raise error KeyError: 'Attribute does not exist'.

Unfortunately, I do not know how to submit my changes to the new branch that you have created. So I am attaching the source files in this email.

Comments (0)

Files changed (3)

src/GenotypeNetwork.py

 
         genotype_counts = collections.Counter(genotypes)
         node_genotypes = []
-        node_values = []
         node_n_datapoints = []
         self.add_vertices(len(genotype_counts.keys()))
-        for (genotype, geno_count)  in genotype_counts.items():
-            node_genotypes.append(genotype)
-            try:
-                node_values.append(int(genotype, base=2))
-            except ValueError:
-                raise GenotypeNetworkInitError("something went wrong when converting genotypes to binary strings. Check if any SNP is triallelic")
-            node_n_datapoints.append(geno_count)
+#        for (genotype, geno_count)  in genotype_counts.items():
+#            node_genotypes.append(genotype)
+#            try:
+#                node_values.append(int(genotype, base=2))
+#            except ValueError:
+#                raise GenotypeNetworkInitError("something went wrong when converting genotypes to binary strings. Check if any SNP is triallelic")
+#            node_n_datapoints.append(geno_count)
         self.vs['genotype'] = node_genotypes
-        self.vs['value'] = node_values
         self.vs['n_datapoints'] = node_n_datapoints
         for genotype1 in self.vs:
 #            logging.debug(str((genotype1.index, genotype1['name'], genotype1['value'])))
             for genotype2 in self.vs:
                 if genotype1.index >= genotype2.index:
-                    pair_distance = '{0:08b}'.format(genotype1['value']^genotype2['value']).count('1')
+                    #pair_distance = '{0:08b}'.format(genotype1['value']^genotype2['value']).count('1')
+	            pair_distance = len([i for i in xrange(len(genotype1['genotype'])) if genotype1['genotype'][i] != genotype2['genotype'][i]])
                     if pair_distance > 0 and pair_distance <= distance:
 #                    print genotype1['name'], genotype2['name'],
 #                    print '{:08b}'.format(genotype1['value']^genotype2['value']).count('1')
         self['allcontinents'] = allcontinents
         self.vs['genotype'] = node_genotypes
         self.vs['n_datapoints'] = node_n_datapoints
-        try:
-            self.vs['value'] = [int(n, base=2) for n in node_genotypes]
-        except ValueError:
-            raise GenotypeNetworkInitError("something went wrong when converting genotypes to binary strings. Check if any SNP is triallelic")
+#        try:
+#            self.vs['value'] = [int(n, base=2) for n in node_genotypes]
+#        except ValueError:
+#            raise GenotypeNetworkInitError("something went wrong when converting genotypes to binary strings. Check if any SNP is triallelic")
         self.vs['node_individuals'] = [list(n) for n in node_individuals]
         self.vs['continent'] = [list(n) for n in node_continent]
         self.vs['cont_str'] = ['_'.join(conts) for conts in self.vs['continent']]
 #            logging.debug(str((genotype1.index, genotype1['name'], genotype1['value'])))
             for genotype2 in self.vs:
                 if genotype1.index >= genotype2.index:
-                    pair_distance = '{0:08b}'.format(genotype1['value']^genotype2['value']).count('1')
+                    #pair_distance = '{0:08b}'.format(genotype1['value']^genotype2['value']).count('1')
+		    pair_distance = len([i for i in xrange(len(genotype1['genotype'])) if genotype1['genotype'][i] != genotype2['genotype'][i]])
                     if pair_distance > 0 and pair_distance <= distance:
 #                    print genotype1['name'], genotype2['name'],
 #                    print '{:08b}'.format(genotype1['value']^genotype2['value']).count('1')
     True
 
     """
-    return '{0:08b}'.format(int(genotype1, base=2)^int(genotype2, base=2)).count('1') == 1
+    a = len([i for i in xrange(len(genotype1)) if genotype1[i]!= genotype2[i]])
+    if a == 1:
+	return True
+    else:
+	return False
+    #return '{0:08b}'.format(int(genotype1, base=2)^int(genotype2, base=2)).count('1') == 1
 
 def random_genotype_network(chromosome_len, sample_size, max_variation=None, seed=None):
     """

src/binary_to_network.py

 #
                     # TODO: this can be done better using numpy's tools
                     for pos in xrange(len(genotypes)):
-                        if genotypes[pos] not in ("01actgACTG"):
+                        if genotypes[pos] not in ("0", "1","A","C","G","T","a","c","g","t"):
                             filteredout_genotypes.update([pos])
 
 #                    all_genotypes.append(''.join(genotypes[upstream:downstream]))
                 if current_line.startswith('#Ref. allele:'):
                     ref_alleles = current_line.split()[2:]
             else:
-                snps_str = re.findall("^.*?\s+?([01 ]*)$", current_line)
+                snps_str = re.findall("^.*?\s+?([01ACGTactg ]*)$", current_line)
                 if snps_str != []:
                     n_snps = len(snps_str[0].split())
             if current_line == '': # if current_line is empty, it means that we reached the downstream of the file
     if skip_gene is True:
         return
     
-    # if step is not a multiple of n_snps, remove some snps from the beginning and the end
+    # if step is not a multiple of n_snps, remove some snps from the beginning and the ned
     if central_window_only is True:
         if options.windows_size == 0:
             windows_size = n_snps   

src/filteredvcf_to_binary.py

             snp_annotations["ref"].append(snp_fields[3])
 
             snp_ids.append(snp_id)
-            chromosome1          = [int(snp_fields[i][0]) for i in range(9, len(snp_fields))]
+            #chromosome1          = [int(snp_fields[i][0]) for i in range(9, len(snp_fields))]
+	    chromosome1 = []
+	    for i in range(9, len(snp_fields)):
+		 if snp_fields[i][0] == '0':
+		    chromosome1.append(ref)
+		 if snp_fields[i][0] == '1':
+		    chromosome1.append(alt)
+
+	    chromosome2 = []
+	    for i in range(9, len(snp_fields)):
+		 if snp_fields[i][2] == '0':
+		    chromosome2.append(ref)
+		 if snp_fields[i][2] == '1':
+		    chromosome2.append(alt)
 #            print len(chromosome1)
-            chromosome2          = [int(snp_fields[i][2]) for i in range(9, len(snp_fields))]
+            #chromosome2          = [int(snp_fields[i][2]) for i in range(9, len(snp_fields))]
             unphased_individuals = [snp_fields[i][1] for i in range(9, len(snp_fields))]
             if unphased_individuals.count('/') > 0:
                 raise GenotypeNetworkInitError("unphased genotypes in {0}".format(snp_id))