Commits

Anonymous committed 0268ada

Initial commit of my greedy algorithm to find the most selective bits

Comments (0)

Files changed (1)

bisect_filters.py

+# Identify the patterns which most evenly partition the reverse index.
+
+import time
+
+num_bits = 1024     # number of patterns (bits) to pick
+ignore_below = 10   # skip patterns/cells with fewer members than this
+
+# Save a bit of space by making sure that all ints are the same int.
+# (CPython only caches small ints, -5 through 256, and otherwise creates a
+# new object for each parsed integer. This reuses the first one seen.)
+class CidIntern(dict):
+    def __missing__(self, i):
+        self[i] = i
+        return i
+
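+# Hypothetical sanity check of the interning (not part of the original run):
+#   interner = CidIntern()
+#   assert interner[10**5] is interner[100 * 1000]  # one shared int object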
+
+def load_dataset(filename):
+    cid_intern = CidIntern()
+    # Read lines of the form:
+    #   3xCO:34,78,91,102
+    pattern_to_members = {}
+    with open(filename) as f:
+        for line in f:
+            pattern, members = line.split(":")
+            members = set(cid_intern[int(cid)] for cid in members.split(","))
+            if len(members) >= ignore_below:
+                pattern_to_members[pattern] = members
+
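+    # e.g. for the sample line above, with ignore_below lowered to 4 or less,
+    # this returns ({"3xCO": {34, 78, 91, 102}}, {34, 78, 91, 102}).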
+    return pattern_to_members, set(cid_intern)
+
+
+def best_splits(all_members, pattern_to_members):
+    # unchanged[i] stays True only if no pattern can split all_members[i].
+    unchanged = [True] * len(all_members)
+    def get_score(target_members):
+        # Simple score: sum of the square of the size difference between haves
+        # and have-nots. (I emphasize breaking larger groups in half.)
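+        # Worked example: a group of 10 left whole (10 vs 0) contributes
+        # 10**2 = 100; split 9/1 it contributes 8**2 = 64; split 5/5 it
+        # contributes 0. The min() below therefore favors balanced splits,
+        # and larger groups dominate the sum.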
+        score = 0
+        for i, members in enumerate(all_members):
+            have_size = len(members & target_members)
+            have_not_size = len(members) - have_size
+            delta = (have_size - have_not_size) ** 2
+            score += delta
+
+            if have_size and have_not_size:
+                # This pattern can split group i, so the group is still useful.
+                unchanged[i] = False
+        return score
+
+    # On score ties, prefer the pattern which matches the most compounds.
+    best_score, _, pattern, pattern_set = min((get_score(v), -len(v), k, v)
+                                               for (k, v) in pattern_to_members.items())
+
+    # If none of the bits can affect the group then it's useless; get rid of it!
+    # (Process in reverse order so I can delete in-place.)
+    # TODO: return this info to caller and let caller decide to delete
+    for i in range(len(all_members)-1, -1, -1):
+        if unchanged[i]:
+            del all_members[i]
+
+    return best_score, pattern, pattern_set
+
+
+def main():
+    start_time = time.time()
+    #pattern_to_members, all_compounds = load_dataset("pubchem.counts")
+    pattern_to_members, all_compounds = load_dataset("zinc_only_fragment.counts")
+    print("Loaded %d compounds and %d patterns" % (len(all_compounds), len(pattern_to_members)))
+    # A list of disjoint sets of compounds; initially just one set with everything.
+    current = [all_compounds]
+
+    for i in range(num_bits):
+        t1 = time.time()
+        # Find the pattern which "best" partitions the data set
+        best_score, best_pattern, best_set = best_splits(current, pattern_to_members)
+
+        # Not going to use the pattern again
+        del pattern_to_members[best_pattern]
+
+        # Partition all of the sets into components which have and
+        # don't have the best_pattern. Keep track of the children
+        # which are "large enough."
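+        # e.g. best_set = {1,2,3} splits a cell {1,2,3,4,5} into the children
+        # {1,2,3} and {4,5}; any child smaller than ignore_below is dropped.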
+
+        new = []
+        for compounds in current:
+            have = compounds & best_set
+            have_not = compounds - best_set
+            if len(have) >= ignore_below:
+                new.append(have)
+            if len(have_not) >= ignore_below:
+                new.append(have_not)
+        largest = max(len(x) for x in new)
+        avg = sum(len(x) for x in new) / len(new)
+        current = new
+
+        dt = "%.1fs" % (time.time() - t1)
+        print(i, best_pattern, "largest", largest, "dt", dt, "cells", len(current), "avg", avg)
+
+    print "Total time", time.time() - start_time
+
+if __name__ == "__main__":
+    main()