tpherndon / django-fedora

A Django web UI for Fedora 3.1 (http://www.fedora-commons.org)

Clone this repository (size: 156.8 KB): HTTPS / SSH
$ hg clone http://bitbucket.org/tpherndon/django-fedora/
commit 51: 94506f00efa3
parent 50: 592c7ccce8c4
branch: default
Adding check for existence of MD5 hash
her...@localhost.localdomain
9 months ago

Changed (Δ389 bytes):

Up to file-list djadora/management/commands/index_objects.py:

@@ -11,7 +11,7 @@ from django.contrib.contenttypes.models
11
11
12
12
class Command(NoArgsCommand):
13
13
    help = 'Scans Fedora Repository and adds new objects as Collection and MSKImage ORM model instances'
14
        
14
15
15
    def handle_noargs(self, **options):
16
16
        # Goal:  connect to a running Fedora repository through Django
17
17
        # collect a list of DigitalObjects
@@ -20,13 +20,13 @@ class Command(NoArgsCommand):
20
20
            # see if pid exists in Collections or MSKImages
21
21
            # if not, pull objectXML
22
22
            # parse for dc:type, dispatch as appropriate
23
            
23
24
24
        do_pids = DigitalObject.objects.filter(objectstate__exact='A').values_list('dopid')
25
25
        # print "do_pids:", do_pids
26
26
        eval_pids = [pid[0] for pid in do_pids if pid[0].split(':')[0] in settings.PID_NAMESPACES]
27
27
        eval_pids.sort()
28
28
        # print "eval_pids:", eval_pids
29
        
29
30
30
        collection_pids = Collection.objects.values_list('pid')
31
31
        # print "collection_pids:", collection_pids
32
32
        collection_pids = [pid[0] for pid in collection_pids if pid]
@@ -38,11 +38,11 @@ class Command(NoArgsCommand):
38
38
        extant_pids = collection_pids + image_pids  # maybe needs OR, maybe doesn't work, won't know until tested
39
39
        extant_pids.sort()
40
40
        # print "extant_pids:", extant_pids
41
        
41
42
42
        new_pids = [pid for pid in eval_pids if pid not in extant_pids]
43
43
#         new_pids.sort()
44
44
        new_pids = sorted(new_pids, key=lambda x: int(x[x.find(':')+1:]))
45
        
45
46
46
        if new_pids:
47
47
            print "new_pids:", new_pids
48
48
            self.fed = client.FedoraClient(settings.FEDORA_HOST, settings.FEDORA_USER, settings.FEDORA_PASSWORD)
@@ -63,21 +63,21 @@ class Command(NoArgsCommand):
63
63
                elif dc_type == 'StillImage':
64
64
                    image, tags = self.create_mskimage(pid, obj_xml, dc_type)
65
65
                    print image, tags
66
                    from django.db import connection
66
                    #from django.db import connection
67
67
#                     print connection.queries
68
		    if len(image.dc_date) != 10:
69
			print "Gonna fail!"
70
			image.dc_date = None
68
                    if len(image.dc_date) != 10:
69
                        print "Gonna fail!"
70
                        image.dc_date = None
71
71
                    try:
72
72
                        image.save()
73
			if tags:
73
                        if tags:
74
74
                            for item in tags:
75
75
                                Tag.objects.add_tag(image, item)
76
76
                    except:
77
77
#                         print connection.queries
78
78
                        raise
79
79
                    print "Created ", image
80
                    
80
81
81
        deleted_pids = [pid for pid in extant_pids if pid not in eval_pids]
82
82
        if deleted_pids:
83
83
            # print "deleted_pids:", deleted_pids
@@ -88,7 +88,7 @@ class Command(NoArgsCommand):
88
88
                    col.delete()
89
89
                except Collection.DoesNotExist:
90
90
                    pass
91
                
91
92
92
                try:
93
93
                    image = MSKImage.objects.get(pk=pid)
94
94
                    tagged_items = TaggedItem.objects.filter(object_id__exact=image.pid)
@@ -98,8 +98,8 @@ class Command(NoArgsCommand):
98
98
                    image.delete()
99
99
                except MSKImage.DoesNotExist:
100
100
                    pass
101
                
102
                    
101
102
103
103
    def create_collection(self, pid, obj_xml, dc_type):
104
104
#         print et.tostring(obj_xml)
105
105
#         print "pid in create_collection: ", pid
@@ -114,19 +114,19 @@ class Command(NoArgsCommand):
114
114
            parent_collection = parent_orig.split('/')[1]
115
115
#             print "Parent collection pid ", parent_collection
116
116
#             print et.tostring(obj_xml)
117
            
117
118
118
            try:
119
119
                parent_collection = Collection.objects.get(pk=parent_collection)
120
120
            except Collection.DoesNotExist:
121
121
                dc_ns = './/{http://purl.org/dc/elements/1.1/}'
122
122
                dc_type_find = dc_ns + 'type'
123
            
123
124
124
                obj_xml = self.fed.get_object_xml(pid)
125
125
                dc_type = obj_xml.findtext(dc_type_find)
126
126
    #             if dc_type == 'Collection':
127
127
                parent_collection = self.create_collection(parent_collection, obj_xml, dc_type)
128
128
                parent_collection.save()
129
            
129
130
130
            # print "This should also be a Collection. parent_collection: ", parent_collection
131
131
        else:
132
132
#             print "parent is None"
@@ -134,7 +134,7 @@ class Command(NoArgsCommand):
134
134
135
135
        # object_xml
136
136
        # skip, passed in
137
        
137
138
138
        # get canonical DC, latest version
139
139
        dc_xml = self.fed.get_datastream(pid, 'DC')
140
140
#         datastreams = self.fed.get_datastreams(pid)
@@ -146,43 +146,43 @@ class Command(NoArgsCommand):
146
146
            dc_xml = et.fromstring(dc_xml)
147
147
        except TypeError:
148
148
            dc_xml = ''
149
        
149
150
150
        if dc_xml:
151
151
            # source
152
152
            dc_ns = './/{http://purl.org/dc/elements/1.1/}'
153
153
            dc_source_find = dc_ns + 'source'
154
154
            source = dc_xml.findtext(dc_source_find)
155
    
155
156
156
            # dc_creator
157
157
            dc_creator_find = dc_ns + 'creator'
158
158
            dc_creator = dc_xml.findtext(dc_creator_find)
159
            
159
160
160
            # dc_date
161
161
            dc_date_find = dc_ns + 'date'
162
162
            dc_date = dc_xml.findtext(dc_date_find)
163
163
            # look for a 'T' date/time separator; if found, keep only the date part before it
164
164
            if dc_date and dc_date.find('T') and dc_date.find('T') != -1:
165
165
                dc_date = dc_date[:dc_date.find('T')]
166
            
166
167
167
            # dc_description
168
168
            dc_desc_find = dc_ns + 'description'
169
169
            dc_description = dc_xml.findtext(dc_desc_find)
170
            
170
171
171
            # dc_language
172
172
            dc_lang_find = dc_ns + 'language'
173
173
            dc_language = dc_xml.findtext(dc_lang_find)
174
            
174
175
175
            # dc_publisher
176
176
            dc_pub_find = dc_ns + 'publisher'
177
177
            dc_publisher = dc_xml.findtext(dc_pub_find)
178
            
178
179
179
            # dc_title
180
180
            dc_title_find = dc_ns + 'title'
181
181
            dc_title = dc_xml.findtext(dc_title_find)
182
            
182
183
183
            # dc_type
184
184
            # We know this is "Collection" already
185
            
185
186
186
            # label
187
187
            properties = obj_xml.findall('.//{info:fedora/fedora-system:def/foxml#}property')
188
188
            label = ''
@@ -194,8 +194,8 @@ class Command(NoArgsCommand):
194
194
                             parent_collection=parent_collection,
195
195
                             object_xml=et.tostring(obj_xml),
196
196
                             source=source,
197
                             label=label, 
198
                             dc_creator=dc_creator, 
197
                             label=label,
198
                             dc_creator=dc_creator,
199
199
                             dc_date=dc_date,
200
200
                             dc_description=dc_description,
201
201
                             dc_language=dc_language,
@@ -203,7 +203,7 @@ class Command(NoArgsCommand):
203
203
                             dc_title=dc_title,
204
204
                             dc_type=dc_type)
205
205
            return col
206
                
206
207
207
        else:
208
208
            # label
209
209
            properties = obj_xml.findall('.//{info:fedora/fedora-system:def/foxml#}property')
@@ -216,10 +216,10 @@ class Command(NoArgsCommand):
216
216
            col = Collection(pid=pid,
217
217
                             parent_collection=parent_collection,
218
218
                             object_xml=et.tostring(obj_xml),
219
                             label=label, 
219
                             label=label,
220
220
                             dc_type=dc_type)
221
221
            return col
222
        
222
223
223
    def create_mskimage(self, pid, obj_xml, dc_type):
224
224
        print "pid:", pid
225
225
        # parent_collection
@@ -238,7 +238,7 @@ class Command(NoArgsCommand):
238
238
        except Collection.DoesNotExist:
239
239
            dc_ns = './/{http://purl.org/dc/elements/1.1/}'
240
240
            dc_type_find = dc_ns + 'type'
241
        
241
242
242
            obj_xml = self.fed.get_object_xml(pid)
243
243
            dc_type = obj_xml.findtext(dc_type_find)
244
244
#             if dc_type == 'Collection':
@@ -247,11 +247,11 @@ class Command(NoArgsCommand):
247
247
        # print "This should be a Collection.  parent_collection:", parent_collection
248
248
        # object_xml
249
249
        # skip, passed in
250
        
250
251
251
        # get canonical DC, latest version
252
252
        dc_xml = self.fed.get_datastream(pid, 'DC')
253
253
        dc_xml = et.fromstring(dc_xml)
254
        
254
255
255
        # source
256
256
        dc_ns = './/{http://purl.org/dc/elements/1.1/}'
257
257
        dc_source_find = dc_ns + 'source'
@@ -260,58 +260,58 @@ class Command(NoArgsCommand):
260
260
        # dc_creator
261
261
        dc_creator_find = dc_ns + 'creator'
262
262
        dc_creator = dc_xml.findtext(dc_creator_find)
263
        
263
264
264
        # dc_date
265
265
        dc_date_find = dc_ns + 'date'
266
266
        dc_date = dc_xml.findtext(dc_date_find)
267
267
        # look for a 'T' date/time separator; if found, keep only the date part before it
268
268
        if dc_date and dc_date.find('T') and dc_date.find('T') != -1:
269
269
            dc_date = dc_date[:dc_date.find('T')]
270
        
270
271
271
        # dc_description
272
272
        dc_desc_find = dc_ns + 'description'
273
273
        dc_description = dc_xml.findtext(dc_desc_find)
274
        
274
275
275
        # dc_language
276
276
        dc_lang_find = dc_ns + 'language'
277
277
        dc_language = dc_xml.findtext(dc_lang_find)
278
        
278
279
279
        # dc_publisher
280
280
        dc_pub_find = dc_ns + 'publisher'
281
281
        dc_publisher = dc_xml.findtext(dc_pub_find)
282
        
282
283
283
        # dc_title
284
284
        dc_title_find = dc_ns + 'title'
285
285
        dc_title = dc_xml.findtext(dc_title_find)
286
        
286
287
287
        # dc_type
288
288
        # We know this is "StillImage" already
289
        
289
290
290
        # dc_subject
291
291
        dc_subject_find = dc_ns + 'subject'
292
292
        # Going to get back a list of subjects
293
293
        dc_subject = [el.text for el in dc_xml.findall(dc_subject_find)]
294
294
        dc_subject = '; '.join(dc_subject)
295
        
295
296
296
        # dc_format
297
297
        dc_format_find = dc_ns + 'format'
298
298
        dc_format = [el.text for el in dc_xml.findall(dc_format_find)]
299
299
        dc_format = ', '.join(dc_format)
300
                
300
301
301
        # label
302
302
        properties = obj_xml.findall('.//{info:fedora/fedora-system:def/foxml#}property')
303
303
        label = ''
304
304
        for prop in properties:
305
305
            if prop.get('NAME').endswith('label'):
306
306
                label = prop.get('VALUE')
307
                
307
308
308
        # content digest of ORIGINAL datastream
309
309
        datastreams = obj_xml.findall('.//{info:fedora/fedora-system:def/foxml#}datastream')
310
310
        orig_ds = None
311
311
        for ds in datastreams:
312
312
            if ds.attrib['ID'] == 'ORIGINAL':
313
313
                orig_ds = ds
314
                
314
315
315
        dsvs = ds.findall('.//{info:fedora/fedora-system:def/foxml#}datastreamVersion')
316
316
        # find most recent datastreamVersion
317
317
        dsv = None
@@ -323,17 +323,20 @@ class Command(NoArgsCommand):
323
323
            for item in dsvs:
324
324
                dsv_id = item.attrib['ID']
325
325
                dsv_list.append((dsv_id, item))
326
                
326
327
327
            dsv_list.sort()
328
328
            dsv = dsv_list[-1][1]
329
            
329
330
330
        cd_elem = dsv.find('.//{info:fedora/fedora-system:def/foxml#}contentDigest')
331
        original_md5 = cd_elem.attrib['DIGEST']
332
        
331
        try:
332
            original_md5 = cd_elem.attrib['DIGEST']
333
        except AttributeError:
334
            original_md5 = ''
335
333
336
        # get canonical MODS, latest version
334
337
        mods_xml = self.fed.get_datastream(pid, 'MODS')
335
338
        mods_xml = et.fromstring(mods_xml)
336
        
339
337
340
        # get photographer
338
341
        # find the <name..> elements
339
342
        # find the one with <role><roleTerm>pht</roleTerm></role>
@@ -343,7 +346,7 @@ class Command(NoArgsCommand):
343
346
        # order
344
347
        mods_ns = './/{http://www.loc.gov/mods/v3}'
345
348
        names = mods_xml.findall(mods_ns + 'name')
346
        
349
347
350
        role_terms = ('pht', 'Photographer')
348
351
        photographer_name = None
349
352
        for item in names:
@@ -351,8 +354,8 @@ class Command(NoArgsCommand):
351
354
            for role in roles:
352
355
                if role.text in role_terms:
353
356
                    photographer_name = item
354
                    
355
        fname = lname = None        
357
358
        fname = lname = None
356
359
        if photographer_name:
357
360
            name_parts = photographer_name.findall(mods_ns + 'namePart')
358
361
            for part in name_parts:
@@ -360,7 +363,7 @@ class Command(NoArgsCommand):
360
363
                    lname = part.text
361
364
                if part.attrib['type'] == 'given':
362
365
                    fname = part.text
363
                    
366
364
367
        photographer = ''
365
368
        if fname and lname:
366
369
            photographer = ' '.join((fname, lname))
@@ -373,14 +376,14 @@ class Command(NoArgsCommand):
373
376
            tags = None
374
377
#         print tags
375
378
376
                
379
377
380
        # Now we create a new MSKImage instance
378
381
        img = MSKImage(parent_collection=parent_collection,
379
382
                        pid=pid,
380
383
                        object_xml=et.tostring(obj_xml),
381
                        source=source, 
384
                        source=source,
382
385
                        label=label,
383
                        dc_creator=dc_creator, 
386
                        dc_creator=dc_creator,
384
387
                        dc_date=dc_date,
385
388
                        dc_description=dc_description,
386
389
                        dc_language=dc_language,
@@ -393,5 +396,5 @@ class Command(NoArgsCommand):
393
396
                        photographer=photographer,
394
397
#                         tags=tags
395
398
                        )
396
        
399
397
400
        return img, tags