Commits

dalke  committed 480723c

File to make an inverted index from the sqlite files

  • Participants
  • Parent commits f77184b

Comments (0)

Files changed (1)

File make_inverted_index.py

+# Extract data from the (now old) sqlite-based format and generate an
+# inverted index of the form:
+
+#  pattern + ":" + ",".join(index)
+# where "pattern" is of the form str(count) + "x" + smarts
+# and index is a number in the range 0 <= i < num molecules
+#  (The inverted index does *not* keep the original identifier name.)
+
+import os
+import sqlite3
+from collections import defaultdict
+import sys
+
+dirname = "/Users/dalke/databases/pubchem"
+filenames = [name for name in os.listdir(dirname) if name.endswith(".sqlite")]
+filenames.sort()
+print filenames
+
+output_filename = "small_pubchem.counts"
+
+import generate_subgraphs
+
+if os.path.exists(output_filename):
+    os.unlink(output_filename)
+
+# smarts -> set of cids
+pattern_members = defaultdict(set)
+
+cid_offset = 0
+for fi, filename in enumerate(filenames):
+    #if os.path.exists("stop"):
+    #    break
+    if fi == 2:
+        break
+    old_conn = sqlite3.connect(os.path.join(dirname, filename))
+
+    print "Processing %s (have %d unique patterns)" % (filename, len(pattern_members))
+
+    old_c = old_conn.cursor()
+    old_c.execute("create index if not exists PatternCounts_cid on PatternCounts(cid)")
+    old_conn.commit()
+
+    # Load up all of the old patterns
+    old_pid_to_smarts = {}
+    old_c.execute("select pid, smarts from Patterns")
+    for (old_pid, smarts) in old_c.fetchall():
+        old_pid_to_smarts[old_pid] = smarts
+
+    old_c.execute("select cid, name, smiles from Compounds")
+
+    old_cid_to_new = {}
+    for (old_cid, name, smiles) in old_c.fetchall():
+        old_cid_to_new[old_cid] = cid_offset
+        cid_offset += 1
+
+    data_size = old_c.execute("select count(*) from PatternCounts").fetchone()[0]
+    old_c.execute("select pid, cid, count from PatternCounts")
+    for data_i, (pid, cid, count) in enumerate(old_c):
+        if data_i % 1000 == 0:
+            sys.stdout.write("%7d / %d    \r" % (data_i+1, data_size))
+            sys.stdout.flush()
+        if count > 10:
+            count = 10
+        for i in range(1, count):
+            smarts = "%dx%s" % (i, old_pid_to_smarts[pid])
+            pattern_members[smarts].add(old_cid_to_new[cid])
+
+with open(output_filename, "w") as f:
+    for pattern in sorted(pattern_members):
+        members = pattern_members[pattern]
+        f.write("%s:%s\n" % (pattern, ",".join(map(str, sorted(members)))))