tpherndon / django-fedora
A Django web UI for Fedora 3.1 (http://www.fedora-commons.org)
Clone this repository (size: 156.8 KB): HTTPS / SSH
$ hg clone http://bitbucket.org/tpherndon/django-fedora/
| commit 51: | 94506f00efa3 |
| parent 50: | 592c7ccce8c4 |
| branch: | default |
Adding check for existence of MD5 hash
9 months ago
Changed (Δ389 bytes):
raw changeset »
djadora/management/commands/index_objects.py (62 lines added, 59 lines removed)
Up to file-list djadora/management/commands/index_objects.py:
| … | … | @@ -11,7 +11,7 @@ from django.contrib.contenttypes.models |
11 |
11 |
|
12 |
12 |
class Command(NoArgsCommand): |
13 |
13 |
help = 'Scans Fedora Repository and adds new objects as Collection and MSKImage ORM model instances' |
14 |
||
14 |
||
15 |
15 |
def handle_noargs(self, **options): |
16 |
16 |
# Goal: connect to a running Fedora repository through Django |
17 |
17 |
# collect a list of DigitalObjects |
| … | … | @@ -20,13 +20,13 @@ class Command(NoArgsCommand): |
20 |
20 |
# see if pid exists in Collections or MSKImages |
21 |
21 |
# if not, pull objectXML |
22 |
22 |
# parse for dc:type, dispatch as appropriate |
23 |
||
23 |
||
24 |
24 |
do_pids = DigitalObject.objects.filter(objectstate__exact='A').values_list('dopid') |
25 |
25 |
# print "do_pids:", do_pids |
26 |
26 |
eval_pids = [pid[0] for pid in do_pids if pid[0].split(':')[0] in settings.PID_NAMESPACES] |
27 |
27 |
eval_pids.sort() |
28 |
28 |
# print "eval_pids:", eval_pids |
29 |
||
29 |
||
30 |
30 |
collection_pids = Collection.objects.values_list('pid') |
31 |
31 |
# print "collection_pids:", collection_pids |
32 |
32 |
collection_pids = [pid[0] for pid in collection_pids if pid] |
| … | … | @@ -38,11 +38,11 @@ class Command(NoArgsCommand): |
38 |
38 |
extant_pids = collection_pids + image_pids # maybe needs OR, maybe doesn't work, won't know until tested |
39 |
39 |
extant_pids.sort() |
40 |
40 |
# print "extant_pids:", extant_pids |
41 |
||
41 |
||
42 |
42 |
new_pids = [pid for pid in eval_pids if pid not in extant_pids] |
43 |
43 |
# new_pids.sort() |
44 |
44 |
new_pids = sorted(new_pids, key=lambda x: int(x[x.find(':')+1:])) |
45 |
||
45 |
||
46 |
46 |
if new_pids: |
47 |
47 |
print "new_pids:", new_pids |
48 |
48 |
self.fed = client.FedoraClient(settings.FEDORA_HOST, settings.FEDORA_USER, settings.FEDORA_PASSWORD) |
| … | … | @@ -63,21 +63,21 @@ class Command(NoArgsCommand): |
63 |
63 |
elif dc_type == 'StillImage': |
64 |
64 |
image, tags = self.create_mskimage(pid, obj_xml, dc_type) |
65 |
65 |
print image, tags |
66 |
|
|
66 |
#from django.db import connection |
|
67 |
67 |
# print connection.queries |
68 |
if len(image.dc_date) != 10: |
|
69 |
print "Gonna fail!" |
|
70 |
image.dc_date = None |
|
68 |
if len(image.dc_date) != 10: |
|
69 |
print "Gonna fail!" |
|
70 |
image.dc_date = None |
|
71 |
71 |
try: |
72 |
72 |
image.save() |
73 |
|
|
73 |
if tags: |
|
74 |
74 |
for item in tags: |
75 |
75 |
Tag.objects.add_tag(image, item) |
76 |
76 |
except: |
77 |
77 |
# print connection.queries |
78 |
78 |
raise |
79 |
79 |
print "Created ", image |
80 |
||
80 |
||
81 |
81 |
deleted_pids = [pid for pid in extant_pids if pid not in eval_pids] |
82 |
82 |
if deleted_pids: |
83 |
83 |
# print "deleted_pids:", deleted_pids |
| … | … | @@ -88,7 +88,7 @@ class Command(NoArgsCommand): |
88 |
88 |
col.delete() |
89 |
89 |
except Collection.DoesNotExist: |
90 |
90 |
pass |
91 |
||
91 |
||
92 |
92 |
try: |
93 |
93 |
image = MSKImage.objects.get(pk=pid) |
94 |
94 |
tagged_items = TaggedItem.objects.filter(object_id__exact=image.pid) |
| … | … | @@ -98,8 +98,8 @@ class Command(NoArgsCommand): |
98 |
98 |
image.delete() |
99 |
99 |
except MSKImage.DoesNotExist: |
100 |
100 |
pass |
101 |
||
102 |
||
101 |
||
102 |
||
103 |
103 |
def create_collection(self, pid, obj_xml, dc_type): |
104 |
104 |
# print et.tostring(obj_xml) |
105 |
105 |
# print "pid in create_collection: ", pid |
| … | … | @@ -114,19 +114,19 @@ class Command(NoArgsCommand): |
114 |
114 |
parent_collection = parent_orig.split('/')[1] |
115 |
115 |
# print "Parent collection pid ", parent_collection |
116 |
116 |
# print et.tostring(obj_xml) |
117 |
||
117 |
||
118 |
118 |
try: |
119 |
119 |
parent_collection = Collection.objects.get(pk=parent_collection) |
120 |
120 |
except Collection.DoesNotExist: |
121 |
121 |
dc_ns = './/{http://purl.org/dc/elements/1.1/}' |
122 |
122 |
dc_type_find = dc_ns + 'type' |
123 |
||
123 |
||
124 |
124 |
obj_xml = self.fed.get_object_xml(pid) |
125 |
125 |
dc_type = obj_xml.findtext(dc_type_find) |
126 |
126 |
# if dc_type == 'Collection': |
127 |
127 |
parent_collection = self.create_collection(parent_collection, obj_xml, dc_type) |
128 |
128 |
parent_collection.save() |
129 |
||
129 |
||
130 |
130 |
# print "This should also be a Collection. parent_collection: ", parent_collection |
131 |
131 |
else: |
132 |
132 |
# print "parent is None" |
| … | … | @@ -134,7 +134,7 @@ class Command(NoArgsCommand): |
134 |
134 |
|
135 |
135 |
# object_xml |
136 |
136 |
# skip, passed in |
137 |
||
137 |
||
138 |
138 |
# get canonical DC, latest version |
139 |
139 |
dc_xml = self.fed.get_datastream(pid, 'DC') |
140 |
140 |
# datastreams = self.fed.get_datastreams(pid) |
| … | … | @@ -146,43 +146,43 @@ class Command(NoArgsCommand): |
146 |
146 |
dc_xml = et.fromstring(dc_xml) |
147 |
147 |
except TypeError: |
148 |
148 |
dc_xml = '' |
149 |
||
149 |
||
150 |
150 |
if dc_xml: |
151 |
151 |
# source |
152 |
152 |
dc_ns = './/{http://purl.org/dc/elements/1.1/}' |
153 |
153 |
dc_source_find = dc_ns + 'source' |
154 |
154 |
source = dc_xml.findtext(dc_source_find) |
155 |
||
155 |
||
156 |
156 |
# dc_creator |
157 |
157 |
dc_creator_find = dc_ns + 'creator' |
158 |
158 |
dc_creator = dc_xml.findtext(dc_creator_find) |
159 |
||
159 |
||
160 |
160 |
# dc_date |
161 |
161 |
dc_date_find = dc_ns + 'date' |
162 |
162 |
dc_date = dc_xml.findtext(dc_date_find) |
163 |
163 |
# look for the 'T' date/time separator and keep only the date part |
164 |
164 |
if dc_date and dc_date.find('T') and dc_date.find('T') != -1: |
165 |
165 |
dc_date = dc_date[:dc_date.find('T')] |
166 |
||
166 |
||
167 |
167 |
# dc_description |
168 |
168 |
dc_desc_find = dc_ns + 'description' |
169 |
169 |
dc_description = dc_xml.findtext(dc_desc_find) |
170 |
||
170 |
||
171 |
171 |
# dc_language |
172 |
172 |
dc_lang_find = dc_ns + 'language' |
173 |
173 |
dc_language = dc_xml.findtext(dc_lang_find) |
174 |
||
174 |
||
175 |
175 |
# dc_publisher |
176 |
176 |
dc_pub_find = dc_ns + 'publisher' |
177 |
177 |
dc_publisher = dc_xml.findtext(dc_pub_find) |
178 |
||
178 |
||
179 |
179 |
# dc_title |
180 |
180 |
dc_title_find = dc_ns + 'title' |
181 |
181 |
dc_title = dc_xml.findtext(dc_title_find) |
182 |
||
182 |
||
183 |
183 |
# dc_type |
184 |
184 |
# We know this is "Collection" already |
185 |
||
185 |
||
186 |
186 |
# label |
187 |
187 |
properties = obj_xml.findall('.//{info:fedora/fedora-system:def/foxml#}property') |
188 |
188 |
label = '' |
| … | … | @@ -194,8 +194,8 @@ class Command(NoArgsCommand): |
194 |
194 |
parent_collection=parent_collection, |
195 |
195 |
object_xml=et.tostring(obj_xml), |
196 |
196 |
source=source, |
197 |
label=label, |
|
198 |
dc_creator=dc_creator, |
|
197 |
label=label, |
|
198 |
dc_creator=dc_creator, |
|
199 |
199 |
dc_date=dc_date, |
200 |
200 |
dc_description=dc_description, |
201 |
201 |
dc_language=dc_language, |
| … | … | @@ -203,7 +203,7 @@ class Command(NoArgsCommand): |
203 |
203 |
dc_title=dc_title, |
204 |
204 |
dc_type=dc_type) |
205 |
205 |
return col |
206 |
||
206 |
||
207 |
207 |
else: |
208 |
208 |
# label |
209 |
209 |
properties = obj_xml.findall('.//{info:fedora/fedora-system:def/foxml#}property') |
| … | … | @@ -216,10 +216,10 @@ class Command(NoArgsCommand): |
216 |
216 |
col = Collection(pid=pid, |
217 |
217 |
parent_collection=parent_collection, |
218 |
218 |
object_xml=et.tostring(obj_xml), |
219 |
label=label, |
|
219 |
label=label, |
|
220 |
220 |
dc_type=dc_type) |
221 |
221 |
return col |
222 |
||
222 |
||
223 |
223 |
def create_mskimage(self, pid, obj_xml, dc_type): |
224 |
224 |
print "pid:", pid |
225 |
225 |
# parent_collection |
| … | … | @@ -238,7 +238,7 @@ class Command(NoArgsCommand): |
238 |
238 |
except Collection.DoesNotExist: |
239 |
239 |
dc_ns = './/{http://purl.org/dc/elements/1.1/}' |
240 |
240 |
dc_type_find = dc_ns + 'type' |
241 |
||
241 |
||
242 |
242 |
obj_xml = self.fed.get_object_xml(pid) |
243 |
243 |
dc_type = obj_xml.findtext(dc_type_find) |
244 |
244 |
# if dc_type == 'Collection': |
| … | … | @@ -247,11 +247,11 @@ class Command(NoArgsCommand): |
247 |
247 |
# print "This should be a Collection. parent_collection:", parent_collection |
248 |
248 |
# object_xml |
249 |
249 |
# skip, passed in |
250 |
||
250 |
||
251 |
251 |
# get canonical DC, latest version |
252 |
252 |
dc_xml = self.fed.get_datastream(pid, 'DC') |
253 |
253 |
dc_xml = et.fromstring(dc_xml) |
254 |
||
254 |
||
255 |
255 |
# source |
256 |
256 |
dc_ns = './/{http://purl.org/dc/elements/1.1/}' |
257 |
257 |
dc_source_find = dc_ns + 'source' |
| … | … | @@ -260,58 +260,58 @@ class Command(NoArgsCommand): |
260 |
260 |
# dc_creator |
261 |
261 |
dc_creator_find = dc_ns + 'creator' |
262 |
262 |
dc_creator = dc_xml.findtext(dc_creator_find) |
263 |
||
263 |
||
264 |
264 |
# dc_date |
265 |
265 |
dc_date_find = dc_ns + 'date' |
266 |
266 |
dc_date = dc_xml.findtext(dc_date_find) |
267 |
267 |
# look for the 'T' date/time separator and keep only the date part |
268 |
268 |
if dc_date and dc_date.find('T') and dc_date.find('T') != -1: |
269 |
269 |
dc_date = dc_date[:dc_date.find('T')] |
270 |
||
270 |
||
271 |
271 |
# dc_description |
272 |
272 |
dc_desc_find = dc_ns + 'description' |
273 |
273 |
dc_description = dc_xml.findtext(dc_desc_find) |
274 |
||
274 |
||
275 |
275 |
# dc_language |
276 |
276 |
dc_lang_find = dc_ns + 'language' |
277 |
277 |
dc_language = dc_xml.findtext(dc_lang_find) |
278 |
||
278 |
||
279 |
279 |
# dc_publisher |
280 |
280 |
dc_pub_find = dc_ns + 'publisher' |
281 |
281 |
dc_publisher = dc_xml.findtext(dc_pub_find) |
282 |
||
282 |
||
283 |
283 |
# dc_title |
284 |
284 |
dc_title_find = dc_ns + 'title' |
285 |
285 |
dc_title = dc_xml.findtext(dc_title_find) |
286 |
||
286 |
||
287 |
287 |
# dc_type |
288 |
288 |
# We know this is "StillImage" already |
289 |
||
289 |
||
290 |
290 |
# dc_subject |
291 |
291 |
dc_subject_find = dc_ns + 'subject' |
292 |
292 |
# Going to get back a list of subjects |
293 |
293 |
dc_subject = [el.text for el in dc_xml.findall(dc_subject_find)] |
294 |
294 |
dc_subject = '; '.join(dc_subject) |
295 |
||
295 |
||
296 |
296 |
# dc_format |
297 |
297 |
dc_format_find = dc_ns + 'format' |
298 |
298 |
dc_format = [el.text for el in dc_xml.findall(dc_format_find)] |
299 |
299 |
dc_format = ', '.join(dc_format) |
300 |
||
300 |
||
301 |
301 |
# label |
302 |
302 |
properties = obj_xml.findall('.//{info:fedora/fedora-system:def/foxml#}property') |
303 |
303 |
label = '' |
304 |
304 |
for prop in properties: |
305 |
305 |
if prop.get('NAME').endswith('label'): |
306 |
306 |
label = prop.get('VALUE') |
307 |
||
307 |
||
308 |
308 |
# content digest of ORIGINAL datastream |
309 |
309 |
datastreams = obj_xml.findall('.//{info:fedora/fedora-system:def/foxml#}datastream') |
310 |
310 |
orig_ds = None |
311 |
311 |
for ds in datastreams: |
312 |
312 |
if ds.attrib['ID'] == 'ORIGINAL': |
313 |
313 |
orig_ds = ds |
314 |
||
314 |
||
315 |
315 |
dsvs = ds.findall('.//{info:fedora/fedora-system:def/foxml#}datastreamVersion') |
316 |
316 |
# find most recent datastreamVersion |
317 |
317 |
dsv = None |
| … | … | @@ -323,17 +323,20 @@ class Command(NoArgsCommand): |
323 |
323 |
for item in dsvs: |
324 |
324 |
dsv_id = item.attrib['ID'] |
325 |
325 |
dsv_list.append((dsv_id, item)) |
326 |
||
326 |
||
327 |
327 |
dsv_list.sort() |
328 |
328 |
dsv = dsv_list[-1][1] |
329 |
||
329 |
||
330 |
330 |
cd_elem = dsv.find('.//{info:fedora/fedora-system:def/foxml#}contentDigest') |
331 |
original_md5 = cd_elem.attrib['DIGEST'] |
|
332 |
||
331 |
try: |
|
332 |
original_md5 = cd_elem.attrib['DIGEST'] |
|
333 |
except AttributeError: |
|
334 |
original_md5 = '' |
|
335 |
||
333 |
336 |
# get canonical MODS, latest version |
334 |
337 |
mods_xml = self.fed.get_datastream(pid, 'MODS') |
335 |
338 |
mods_xml = et.fromstring(mods_xml) |
336 |
||
339 |
||
337 |
340 |
# get photographer |
338 |
341 |
# find the <name..> elements |
339 |
342 |
# find the one with <role><roleTerm>pht</roleTerm></role> |
| … | … | @@ -343,7 +346,7 @@ class Command(NoArgsCommand): |
343 |
346 |
# order |
344 |
347 |
mods_ns = './/{http://www.loc.gov/mods/v3}' |
345 |
348 |
names = mods_xml.findall(mods_ns + 'name') |
346 |
||
349 |
||
347 |
350 |
role_terms = ('pht', 'Photographer') |
348 |
351 |
photographer_name = None |
349 |
352 |
for item in names: |
| … | … | @@ -351,8 +354,8 @@ class Command(NoArgsCommand): |
351 |
354 |
for role in roles: |
352 |
355 |
if role.text in role_terms: |
353 |
356 |
photographer_name = item |
354 |
||
355 |
fname = lname = None |
|
357 |
||
358 |
fname = lname = None |
|
356 |
359 |
if photographer_name: |
357 |
360 |
name_parts = photographer_name.findall(mods_ns + 'namePart') |
358 |
361 |
for part in name_parts: |
| … | … | @@ -360,7 +363,7 @@ class Command(NoArgsCommand): |
360 |
363 |
lname = part.text |
361 |
364 |
if part.attrib['type'] == 'given': |
362 |
365 |
fname = part.text |
363 |
||
366 |
||
364 |
367 |
photographer = '' |
365 |
368 |
if fname and lname: |
366 |
369 |
photographer = ' '.join((fname, lname)) |
| … | … | @@ -373,14 +376,14 @@ class Command(NoArgsCommand): |
373 |
376 |
tags = None |
374 |
377 |
# print tags |
375 |
378 |
|
376 |
||
379 |
||
377 |
380 |
# Now we create a new MSKImage instance |
378 |
381 |
img = MSKImage(parent_collection=parent_collection, |
379 |
382 |
pid=pid, |
380 |
383 |
object_xml=et.tostring(obj_xml), |
381 |
source=source, |
|
384 |
source=source, |
|
382 |
385 |
label=label, |
383 |
dc_creator=dc_creator, |
|
386 |
dc_creator=dc_creator, |
|
384 |
387 |
dc_date=dc_date, |
385 |
388 |
dc_description=dc_description, |
386 |
389 |
dc_language=dc_language, |
| … | … | @@ -393,5 +396,5 @@ class Command(NoArgsCommand): |
393 |
396 |
photographer=photographer, |
394 |
397 |
# tags=tags |
395 |
398 |
) |
396 |
||
399 |
||
397 |
400 |
return img, tags |
