Commits

Joe Topjian committed 99235fb

Initial Commit

  • Participants

Comments (0)

Files changed (3)

+data.json
+*.pyc

File discogs_parse.py

+import xml.parsers.expat
+from collections import defaultdict
+import json
+
+DISCOGS_DB_DUMP = '/Users/jtopjian/Downloads/discogs_20120301_masters.xml'
+
+data = defaultdict(dict)
+curr_id = 0
+curr_tag = None
+curr_value = ""
+useful_tags = ['artist', 'name', 'genre', 'year', 'style', 'title']
+
+def start_element(name, attrs):
+    global curr_id, curr_tag, data
+    if name == 'master':
+        curr_id = attrs['id']
+        data[curr_id] = defaultdict(list)
+    if name in useful_tags:
+        curr_tag = name
+    else:
+        curr_tag = None
+
+def char_data(value):
+    global curr_value
+    if curr_tag and curr_id:
+        curr_value = curr_value + value
+
+def end_element(name):
+    global curr_id, curr_tag, data, curr_value
+    if curr_tag and curr_id and curr_value:
+        data[curr_id][curr_tag].append(curr_value)
+    if name == 'master':
+        curr_id = 0
+    curr_tag = None
+    curr_value = ""
+
+p = xml.parsers.expat.ParserCreate()
+p.StartElementHandler = start_element
+p.EndElementHandler = end_element
+p.CharacterDataHandler = char_data
+
+f = open(DISCOGS_DB_DUMP)
+p.ParseFile(f)
+
+w = open('data.json', 'w')
+json.dump(data, w)
+import sys
+import json
+search = []
+
+if len(sys.argv) < 2:
+    print "Need an argument"
+    print "find.py classical idm"
+    print "  - will find all albums with styles of both classical and idm music"
+    sys.exit(1)
+else:
+    for x in sys.argv[1:]:
+        search.append(x.lower())
+
+f = open('data.json')
+data = json.load(f)
+
+for k in data:
+    to_print = 1
+    curr_data = data[k]
+    name = curr_data.get('name', ['none'])[0]
+    title = curr_data.get('title',['none'])[0]
+    genre = curr_data.get('genre',['none'])
+    style = curr_data.get('style',['none'])
+
+    if style[0] == 'none':
+        continue
+
+    output = "%s: %s:\n" % (name.encode('ascii','ignore'), title.encode('ascii','ignore'))
+    output += "    %s\n" % ', '.join(genre).encode('ascii','ignore')
+    for s in search:
+        if s not in [x.lower() for x in style]:
+            to_print = 0
+    output += "    %s\n" % ', '.join(style).encode('ascii','ignore')
+
+    if to_print:
+        print output