Commits

Anonymous committed 5f863fa

got database updater working again

Comments (0)

Files changed (23)

 import datetime
+import json
 
 import sqlalchemy as sql
 
 
     def __init__(self, dburi=None):
+        """Connect to the database and bind the tables used by this class.
+
+        `dburi` is stored and handed to `_connect`; when it is None,
+        `_connect` substitutes its own default (not shown in this hunk).
+        The write cache starts out empty.
+        """
         self._uri = dburi
-        self.db = self._connect()
-        self.records = self.db.tables['records']
-        self.metadata = self.db.tables['metadata']
-        self._record_id = 0
-        self._record_cache = []
-        self._metadata_cache = []
-        self._set_ids = set(self._get_set_ids())
-    
+        self._db = self._connect()
+        # table handles looked up once, by name, from the bound metadata
+        self._records = self._db.tables['records']
+        self._sets = self._db.tables['sets']
+        self._setrefs = self._db.tables['setrefs']
+        self._reset_cache()
+        
     def _connect(self):
         dburi = self._uri
         if dburi is None:
             
         engine = sql.create_engine(dburi)
         db = sql.MetaData(engine)
+        
+        sql.Table('records', db,
+                  sql.Column('record_id', sql.Unicode, primary_key=True),
+                  sql.Column('modified', sql.DateTime, index=True),
+                  sql.Column('deleted', sql.Boolean),
+                  sql.Column('data', sql.String))
+        
+        sql.Table('sets', db,
+                  sql.Column('set_id', sql.Unicode, primary_key=True),
+                  sql.Column('hidden', sql.Boolean),
+                  sql.Column('name', sql.Unicode),
+                  sql.Column('description', sql.Unicode))
 
-        sql.Table('records', db,
-                  sql.Column('record_id', sql.Integer, primary_key=True),
-                  sql.Column('name', sql.Unicode, unique=True, index=True),
-                  sql.Column('when_modified', sql.DateTime, index=True),
-                  sql.Column('deleted', sql.Boolean),
-                  sql.Column('content_type', sql.Unicode),
-                  sql.Column('is_set', sql.Boolean),
-                  sql.Column('is_asset', sql.Boolean),
-                  sql.Column('sets', sql.Unicode)
-                  )
+        sql.Table('setrefs', db,
+                  sql.Column('record_id', sql.Integer, 
+                             sql.ForeignKey('records.record_id'),
+                             index=True, primary_key=True),
+                  sql.Column('set_id', sql.Integer,
+                             sql.ForeignKey('sets.set_id'),
+                             index=True, primary_key=True))
         
-        sql.Table('metadata', db,
-                  sql.Column('metadata_id', sql.Integer, primary_key=True),
-                  sql.Column('record_id', sql.Integer,
-                             sql.ForeignKey('records.record_id'), index=True),
-                  sql.Column('field', sql.String),
-                  sql.Column('value', sql.Unicode),
-                  sql.Column('reference', sql.Integer)
-                  )
-
         db.create_all()
         return db
 
-    def _get_set_ids(self):
-        for record in self.records.select(
-            self.records.c.is_set == True).execute():
-            yield record.name
+    def flush(self):
+        """Write all cached records, sets and set references to the database.
+
+        Cached records and sets whose id is already stored are deleted
+        first and then re-inserted, so the cached item replaces the stored
+        one. The set references of every cached record are always
+        replaced. The cache is emptied afterwards.
+        """
+        # Record ids and set ids live in different tables, so they are
+        # tracked separately: a set must not count as "existing" merely
+        # because a record shares its id, and vice versa.
+        existing_record_ids = set(
+            row[0] for row in
+            sql.select([self._records.c.record_id]).execute())
+        existing_set_ids = set(
+            row[0] for row in
+            sql.select([self._sets.c.set_id]).execute())
+
+        deleted_records = []
+        deleted_sets = []
+        deleted_setrefs = []
+
+        inserted_records = []
+        inserted_sets = []
+        inserted_setrefs = []
+
         
-    def _get_record_id(self, id):
-        result = None
-        for record in self.records.select(
-            self.records.c.name == id).execute():
-            result = record['record_id']
-        return result
-    
-    def get_record(self, id):
-        result = None
-        for record in self.records.select(
-            self.records.c.name == id).execute():
+        for oai_id, item in self._cache['records'].items():
+            if oai_id in existing_record_ids:
+                # record already exists, replace it
+                deleted_records.append(oai_id)
+            item['record_id'] = oai_id
+            inserted_records.append(item)
+                
+        for oai_id, item in self._cache['sets'].items():
+            if oai_id in existing_set_ids:
+                # set already exists, replace it
+                deleted_sets.append(oai_id)
+            item['set_id'] = oai_id
+            inserted_sets.append(item)
 
-            result = {'id': record['name'],
-                      'deleted': record['deleted'],
-                      'is_set': record['is_set'],
-                      'content_type': record['content_type'],
-                      'when_modified': record['when_modified'],
-                      }
-            break
+        for record_id, set_ids in self._cache['setrefs'].items():
+            deleted_setrefs.append(record_id)
+            for set_id in set_ids:
+                inserted_setrefs.append(
+                    {'record_id':record_id, 'set_id': set_id})
+
+        # Delete all processed rows before inserting. The WHERE clause has
+        # to be explicit: a bare delete() has no bind parameters, so the
+        # parameter dicts would select nothing and the whole table would
+        # be emptied. Set references go first because they point at the
+        # record and set rows.
+        if deleted_setrefs:
+            self._setrefs.delete(
+                self._setrefs.c.record_id == sql.bindparam('rid')
+                ).execute([{'rid': rid} for rid in deleted_setrefs])
+        if deleted_records:
+            self._records.delete(
+                self._records.c.record_id == sql.bindparam('rid')
+                ).execute([{'rid': rid} for rid in deleted_records])
+        if deleted_sets:
+            self._sets.delete(
+                self._sets.c.set_id == sql.bindparam('sid')
+                ).execute([{'sid': sid} for sid in deleted_sets])
+
+        # batch inserts
+        if inserted_records:
+            self._records.insert().execute(inserted_records)
+        if inserted_sets:
+            self._sets.insert().execute(inserted_sets)
+        if inserted_setrefs:
+            self._setrefs.insert().execute(inserted_setrefs)
+
+        self._reset_cache()
+
+    def _reset_cache(self):
+        """Replace the write cache with empty per-table dicts."""
+        fresh = {}
+        for table_name in ('records', 'sets', 'setrefs'):
+            fresh[table_name] = {}
+        self._cache = fresh
         
-        return result
-                
-    def get_metadata(self, id):
-        result = {}
-        for record in self.metadata.select(
-            sql.and_(self.records.c.name == id,
-                     self.metadata.c.record_id == self.records.c.record_id)).execute():
+            
+    def update_record(self, oai_id, modified, deleted, sets, data):
+        """Queue a record, its sets and its set references for storage.
+
+        Nothing is written to the database until `flush` is called.
+
+        `sets` maps each set id to a dict with a required 'name' entry and
+        optional 'description' and 'hidden' entries. `data` is a dict that
+        must be JSON-serializable; it is stored as a JSON string together
+        with a 'sets' entry listing the set ids.
+        """
+        # Work on a copy so the caller's dict does not gain a 'sets' key.
+        data = dict(data)
+        data['sets'] = list(sets)
+        self._cache['records'][oai_id] = dict(modified=modified,
+                                              deleted=deleted,
+                                              data=json.dumps(data))
+        record_set_ids = self._cache['setrefs'][oai_id] = []
+        for set_id, set_info in sets.items():
+            self._cache['sets'][set_id] = dict(
+                name=set_info['name'],
+                description=set_info.get('description'),
+                hidden=set_info.get('hidden', False))
+            record_set_ids.append(set_id)
+            
+    def get_record(self, oai_id):
+        """Return the stored record row for `oai_id` as a dict.
+
+        Returns an empty dict when no record has that id.
+        """
+        # The table handle is `_records` (set in __init__) and the result
+        # method is `fetchone`; `self.records` / `fetch_one` do not exist.
+        row = self._records.select(
+            self._records.c.record_id == oai_id).execute().fetchone()
+        if row is None:
+            return {}
+        return dict(row)
 
-            result.setdefault(record['field'], []).append(record['value'])
+    def get_set(self, oai_id):
+        """Return the stored set row for `oai_id` as a dict.
+
+        Returns an empty dict when no set has that id.
+        """
+        # Select from the sets table itself; the filter column belongs to
+        # it, not to the records table.
+        row = self._sets.select(
+            self._sets.c.set_id == oai_id).execute().fetchone()
+        if row is None:
+            return {}
+        return dict(row)
 
-        return result or None
+    def remove_record(self, oai_id):
+        """Delete the record `oai_id` and all of its set references."""
+        # A DELETE returns no rows, so the result is not iterated.
+        # Set references are removed first because they point at the
+        # record row through a foreign key.
+        self._setrefs.delete(
+            self._setrefs.c.record_id == oai_id).execute()
+        self._records.delete(
+            self._records.c.record_id == oai_id).execute()
 
-    def get_sets(self, id):
-        result = []
-
-        for record in self.records.select(
-            self.records.c.name == id).execute():
-            result = record['sets'].strip().split(' ')
-        
-        return result
-
-    def get_assets(self, id):
-        assets = self.get_metadata(id)
-        if assets is None:
-            return []
-        assets = assets.get('asset', [])
-        result = []
-        for asset_id in assets:
-            md = self.get_metadata(asset_id)
-            data = {}
-            data['mimetype'] = md.pop('mimetype')[0]
-            data['url'] = md.pop('url')[0]
-            data['absolute_uri'] = md.pop('absolute_uri')[0]
-            data['filename'] = md.pop('filename')[0]
-            data['md5'] = md.pop('md5')[0]
-            data['metadata'] = md
-            result.append(data)
-        
-        return result
-    
-    def get_set(self, id):
-        md = self.get_metadata(id)
-        if not md:
-            return {}
-        result = {'name': md['name'][0],
-                  'description': md['description'][0],
-                  'id': id}
-        return result
-
-    def remove_content(self, id):
-        rid = self._get_record_id(id)
-        for result in self.records.delete(self.records.c.record_id == rid).execute():
+    def remove_set(self, oai_id):
+        for result in self._sets.delete(
+            self._sets.c.set_id == oai_id).execute():
             pass
-        self._remove_metadata(rid)
-        return True
-
-    def flush_update(self):
-        self.records.insert().execute(self._record_cache)
-        self._record_cache = []
-        self.metadata.insert().execute(self._metadata_cache)
-        self._metadata_cache = []
-        
-    def add_content(self, id, sets, record_data, meta_data, assets_data):
-        record_id = self._add_record(record_data, sets)
-        self._add_metadata(record_id, meta_data)
-
-        for num, asset_data in enumerate(assets_data):
-            asset_name = u'%s:asset:%s' % (id, num)
-            self._add_asset(record_id, asset_name, asset_data)
-        
-        return record_id
-
-    def _add_record(self, record_data, sets):
-        self._record_id += 1
-        record_id = self._record_id
-        rowdata = {'record_id': record_id,
-                   'name': record_data['id'],
-                   'deleted': record_data['deleted'],
-                   'is_set': record_data['is_set'],
-                   'is_asset': record_data.get('is_asset', False),
-                   'sets': u' %s ' % ' '.join(sets),
-                   'content_type': record_data['content_type'],
-                   'when_modified': record_data['when_modified']}
-        
-        self._record_cache.append(rowdata)
-
-        for set in sets:
-            # add dynamic sets
-            if not set in self._set_ids:
-                self.add_set(set, set)
-        
-        return record_id
-
-    def _add_metadata(self, record_id, meta_data):
-        for key, vals in meta_data.items():
-            for val in vals:
-                self._metadata_cache.append({'field': key,
-                                             'value': val,
-                                             'record_id': record_id})
-
-    def _remove_metadata(self, record_id):
-        asset_ids = []
-        self.metadata.delete(self.metadata.c.record_id == record_id).execute()
-
-    def _add_asset(self, record_id, asset_name, asset_data):
-
-        # an asset is just a record with is_asset == True
-        record_data = {'id': asset_name,
-                       'deleted': False,
-                       'is_set': False,
-                       'is_asset': True,
-                       'content_type': u'',
-                       'when_modified': datetime.datetime.now()
-                       }
-
-        asset_id = self._add_record(record_data, [])
-
-        # assets have required metadata        
-        meta_data = {'filename': [asset_data['filename']],
-                    'url': [asset_data['url']],
-                    'absolute_uri': [asset_data['absolute_uri']],
-                    'md5': [asset_data['md5']],
-                    'mimetype': [asset_data['mimetype']],
-                   }
-        
-        # additional metada can be provided
-        meta_data.update(asset_data['metadata'])
-        
-        self._add_metadata(asset_id, meta_data)
-        # relate the asset record to the publication record
-        self._add_metadata(record_id, {u'asset': [asset_name]})
-
-    def add_set(self, set_id, name, description=None):
-        
-        if description is None:
-            description = [u'']
-        elif not isinstance(description, list):
-            description = [description]
-
-        record_data = {'id': set_id,
-                       'content_type': u'set',
-                       'deleted': False,
-                       'sets': u'',
-                       'is_set': True,
-                       'is_asset': False,
-                       'when_modified': datetime.datetime.now()}
-        
-        meta_data  =  {'id':[set_id],
-                       'name': [name],
-                       'description': description}
-
-
-        if not set_id in self._set_ids:
-            # add a new set
-            record_id = self.add_content(set_id, [], record_data, meta_data, {})
-            self._set_ids.add(set_id)
-        else:
-            # set is allready there, update the metadata
-            record_id = self._get_record_id(set_id)
-            self._remove_metadata(record_id)
-            self._add_metadata(record_id, meta_data)
-
-        return record_id
-                         
-    def remove_set(self, id):
-        self.remove_content(id)
+        for result in self._setrefs.delete(
+            self._setrefs.c.set_id == oai_id).execute():
+            pass
 
     def oai_sets(self, offset=0, batch_size=20):
+        """Yield one dict per non-hidden set, paged by offset/batch_size.
+
+        Each dict has the keys 'id', 'name' and 'description', taken from
+        the set_id, name and description columns of the sets table.
+        """
-        for row in self.records.select(self.records.c.is_set==True
+        for row in self._sets.select(
+              self._sets.c.hidden == False
             ).offset(offset).limit(batch_size).execute():
-            result = {}
-            for data in self.metadata.select(
-                self.metadata.c.record_id==row['record_id']).execute():
-                result[data.field] = data.value
-            yield result
-
+            yield {'id': row.set_id,
+                   'name': row.name,
+                   'description': row.description}
+            
     def oai_query(self,
                   offset=0,
                   batch_size=20,
             until_date = datetime.datetime.now()
 
 
-        query = self.records.select(
-            sql.and_(self.records.c.is_set == False,
-                     self.records.c.is_asset == False),
-            order_by = [sql.desc(self.records.c.when_modified)])
+        query = self._records.select(
+            order_by=[sql.desc(self._records.c.modified)])
 
         # filter dates
-        query.append_whereclause(self.records.c.when_modified <= until_date)
+        query.append_whereclause(self._records.c.modified <= until_date)
 
         if not identifier is None:
-            query.append_whereclause(self.records.c.name == identifier)
+            query.append_whereclause(self._records.c.record_id == identifier)
 
         if not from_date is None:
-            query.append_whereclause(self.records.c.when_modified >= from_date)
+            query.append_whereclause(self._records.c.modified >= from_date)
 
         # filter sets
 
         setclauses = []
         for set_id in sets:
             setclauses.append(
-                self.records.c.sets.like(u'%% %s %%' % set_id))
+                sql.and_(
+                self._setrefs.c.set_id == set_id,
+                self._setrefs.c.record_id == self._records.c.record_id))
             
         if setclauses:
             query.append_whereclause(sql.or_(*setclauses))
         filter_setclauses = []
         for set_id in filter_sets:
             filter_setclauses.append(
-                self.records.c.sets.like(u'%% %s %%' % set_id))
+                sql.and_(
+                self._setrefs.c.set_id == set_id,
+                self._setrefs.c.record_id == self._records.c.record_id))
             
         if filter_setclauses:
             query.append_whereclause(sql.or_(*filter_setclauses))
         not_setclauses = []
         for set_id in not_sets:
             not_setclauses.append(
-                self.records.c.sets.like(u'%% %s %%' % set_id))
-
+                sql.and_(
+                self._setrefs.c.set_id == set_id,
+                self._setrefs.c.record_id == self._records.c.record_id))
             
         if not_setclauses:
-            query.append_whereclause(sql.not_(
-                sql.or_(*not_setclauses)))
+            query.append_whereclause(sql.not_(sql.or_(*not_setclauses)))
 
         for row in query.distinct().offset(offset).limit(batch_size).execute():
-            record = dict(row)
-            record['id'] = record['name']
-            del record['name']
-            record['sets'] = record['sets'].strip().split(' ')
-            if record['sets'] == [u'']:
-                record['sets'] = []
+            record = {'id': row.record_id,
+                      'deleted': row.deleted,
+                      'modified': row.modified,
+                      'data': json.loads(row.data)}
             yield {'record': record,
-                   'sets': record['sets'],
-                   'metadata': self.get_metadata(row['name']) or {},
+                   'sets': record['data']['sets'],
+                   'metadata': record['data'],
                    'assets':{}}
        
     def empty_database(self):
+        """Delete every row from the records, sets and setrefs tables."""
+        # NOTE(review): the pending write cache is left untouched here, so
+        # anything queued before this call is still written by the next
+        # flush -- confirm that this is intended.
-        self.records.delete().execute()
-        self.metadata.delete().execute()
+        self._records.delete().execute()
+        self._sets.delete().execute()
+        self._setrefs.delete().execute()
 
-        self._record_id = 0
-        self._record_cache = []
-        self._metadata_cache = []
-        #self._set_ids = set(self._get_set_ids())
-

moai/example-1234.xml

+<publication xmlns="http://example.org/data">
+  <id>1234</id>
+  <title>An Example Test Publication</title>
+  <abstract>
+    This is a piece of example data that is used to 
+    demonstrate how arbitrary data can be loaded into
+    the MOAI OAIPMH server.
+    Data is not limited to XML files, it could also come
+    from an SQL database, in which case a different data
+    provider should be used.
+  </abstract>
+  <subject>example</subject>
+  <subject>test</subject>
+  <author>
+    <givenName>Jane</givenName>
+    <familyName>Doe</familyName>
+    <author-id>41431324</author-id>
+  </author>
+  <author>
+    <givenName>John</givenName>
+    <familyName>Doe</familyName>
+  </author>
+  <access>public</access>
+  <issued>2010-10-10T15:53:00Z</issued>
+  <modified>2010-10-12T15:56:00Z</modified>
+  <asset>
+    <type>application/pdf</type>
+    <name>example.pdf</name>
+    <description>example pdf</description>
+    <access>public</access>
+  </asset>
+</publication>
 
     def update(self, path, provider):
         self.provider = provider
-        self.nsmap = {'ex':'http://example.org'}
+
+        self.nsmap = {'ex':'http://example.org/data'}
         doc = etree.parse(path)
         self.root = doc.getroot()
 
         self.id = self.xpath('ex:id/text()', 'id', unicode, required=True)
-        self.content_type = unicode(self.root.xpath('local-name()'))
-        if self.content_type == 'publication':
-            self.label = self.xpath('ex:title/text()', 'label', unicode, required=True)
-        else:
-            self.label = self.xpath('ex:name/text()', 'label', unicode, required=True)
-            
-        self.when_modified = datetime(*time.gmtime(os.path.getmtime(path))[:6])
+        self.modified = datetime(*time.gmtime(os.path.getmtime(path))[:6])
         self.deleted = False
-        self.sets = self.xpath('ex:set/@ref', 'set', unicode, multi=True)
-        self.sets.append(self.content_type)
-        self.sets.extend(self.xpath('ex:scope/text()', 'scope', unicode, multi=True))
-        self.is_set = self.content_type == u'set'
+        self.sets = {}
+        self.sets[u'example'] = {'name':u'example',
+                                 'description':u'An Example Set'}
+        
+        self.data = {}
+        for el in self.root:
+            tagname = el.tag.split('}', 1)[-1]
+            if tagname in ['author', 'asset']:
+                value = {}
+                for s_el in el:
+                    text = s_el.text.strip().decode('utf8')
+                    value[s_el.tag.split('}', 1)[-1]] = text
+            else:
+                value = el.text.strip().decode('utf8')
+            self.data.setdefault(tagname,[]).append(value)
 
-        self._assets = []
-        if self.content_type == u'person':
-            self._fields = self.set_person_fields()
-        elif self.content_type == u'set':
-            self._fields = self.set_set_fields()
-        else:
-            self._fields = self.set_publication_fields()
 
-        # Instead of letting the updater fail the record on not-valid XML,
-        # remove the conflicting characters 
-        #self._sanitize()
-
-    def set_set_fields(self):
-        return {u'name': [self.label],
-                u'description': self.xpath(
-            'ex:description/text()',
-            'description',
-            unicode,
-            multi=True)}
-        
-    def set_publication_fields(self):
-        fields = {
-            u'description': [
-            self.xpath('ex:abstract/text()', 'abstract', unicode)],
-            u'title': [self.label],
-            u'date': self.xpath('ex:issued/text()',
-                                'subject', datetime, multi=True),
-            u'subject': self.xpath('ex:keyword/text()',
-                                   'subject', unicode, multi=True),
-            u'identifier': ['http://purl.example.org/%s' % self.id],
-            u'language': self.xpath('ex:abstract/@xml:lang',
-                                    'author', unicode, multi=True),
-            u'type': [self.content_type]
-        }
-
-        if fields['date']:
-            # fields should always be unicode
-            fields['date'] = [unicode(fields['date'][0].isoformat())]
-        
-        if 'public' in self.sets:
-           fields[u'rights'] = [u'public domain, no restrictions']
-
-        authors = []
-        author_rel = []
-        ids = self.xpath('ex:author/@ref', 'author', unicode, multi=True)
-        for id in ids:
-            author_rel.append(id)
-            person = ExampleContentObject()
-            person.update(
-                self.provider.get_content_by_id(id.replace(':','_')+'.xml'),
-                self.provider)
-            authors.append(person.label)
-        fields[u'author'] = authors
-        fields[u'author_rel'] = author_rel
-        fields[u'contributor'] = authors
-        fields[u'url'] = [u'http://hdl.handle.net/????/%s' % self.id]
-        fields[u'dare_id'] = [u'urn:NBN:nl:ui:??-%s' %self.id]
-
-        for el in self.root.xpath('ex:asset', namespaces=self.nsmap):
-            asset = {}
-            for child in el.xpath('*[text()]'):
-                asset[child.tag.split('}')[-1]] = unicode(child.text)
-            assert u'filename' in asset, 'found asset without filename'
-            assert u'mimetype' in asset, 'found asset without mimetype'
-            asset[u'url'] = u'asset/%s/%s' % (
-                self.id,
-                asset['filename'])
-            
-            path = os.path.join(os.path.dirname(__file__),
-                                'example_data',
-                                'assets', self.id,
-                                asset['filename'])
-            assert os.path.isfile(path), "Can not find asset: %s" % path
-
-            asset[u'absolute_uri'] = u'file://%s' % path
-            asset[u'md5'] = u''
-            asset[u'metadata'] = {}
-            if asset[u'access']:
-                asset[u'metadata'][u'access'] = [asset[u'access']]
-                del asset[u'access']
-            if asset[u'modified']:
-                asset[u'metadata'][u'modified'] = [asset[u'modified']]
-                del asset[u'modified']
-            self._assets.append(asset)
-                
-        #fields[u'asset'] = assets
-        
-        return fields
-
-    def get_assets(self):
-        return self._assets
-
-    def set_person_fields(self):
-        fields = {
-            u'name' : [self.label],
-            u'surname': self.xpath('ex:surname/text()',
-                                   'surname', unicode, multi=True),
-            u'firstname': self.xpath('ex:firstname/text()',
-                                     'firstname', unicode, multi=True),
-            u'initials': self.xpath('ex:initials/text()',
-                                    'initials', unicode, multi=True),
-            u'dai': self.xpath('ex:dai/text()',
-                               'initials', unicode, multi=True),
-            }
-        return fields
+        if 'public' in self.data['access']:
+            self.sets[u'public'] = {'name':u'public',
+                                    'description':u'Public access'}
+        elif 'private' in self.data['access']:
+            self.sets[u'private'] = {'name':u'private',
+                                     'description':u'Private access'}

moai/examples/__init__.py

-#

moai/examples/example_configuration.py

-import os
-import shutil
-
-from moai import ConfigurationProfile, name
-from moai.update import DatabaseUpdater
-from moai.provider.file import FileBasedContentProvider
-from moai.server import Server, FeedConfig
-from moai.http.cherry import start_server
-from moai.database.sqlite import SQLiteDatabase
-from moai.examples.example_content import ExampleContentObject
-            
-class ExampleConfiguration(ConfigurationProfile):
-    name('example_configuration')
-    
-    def get_content_provider(self):
-        provider = FileBasedContentProvider(self.config['path'], '*.xml')
-        provider.set_logger(self.log)
-        return provider
-
-
-    def get_database_updater(self):
-
-        dbpath = '/tmp/moai.new.db'
-        if os.path.isfile(dbpath):
-            self.log.warning('removing old moai.new.db')
-            os.remove(dbpath)
-        
-        return DatabaseUpdater(self.get_content_provider(),
-                               ExampleContentObject,
-                               SQLiteDatabase(dbpath, 'w'),
-                               self.log)
-
-    def get_database(self):
-        if os.path.isfile('/tmp/moai.new.db'):
-            shutil.move('/tmp/moai.new.db',
-                        '/tmp/moai.db')
-            
-        return SQLiteDatabase('/tmp/moai.db', 'r')
-    
-    def get_server(self):
-        server_url = 'http://%s:%s/repo' % (self.config['host'],
-                                            self.config['port'])
-        asset_path = os.path.join(os.path.dirname(__file__),
-                                  'example_data',
-                                  'assets')
-                                  
-        server = Server(server_url,
-                        self.get_database())
-        server.add_config(
-            FeedConfig('example',
-                       'An example OAI Server',
-                       '%s/example' % server_url,
-                       self.log,
-                       base_asset_path=asset_path,
-                       sets_allowed=['public'],
-                       metadata_prefixes=['oai_dc', 'mods',
-                                          'didl', 'nl_didl']))
-        return server
-                   
-    def start_development_server(self):
-        start_server('127.0.0.1', self.config['port'], 10, 'repo', self.get_server())
-
-        

moai/examples/example_content.py

-import os
-import time
-from datetime import datetime
-
-from lxml import etree
-
-from moai.content import XMLContentObject
-
-class ExampleContentObject(XMLContentObject):
-
-    def update(self, path, provider):
-        self.provider = provider
-        self.nsmap = {'ex':'http://example.org'}
-        doc = etree.parse(path)
-        self.root = doc.getroot()
-
-        self.id = self.xpath('ex:id/text()', 'id', unicode, required=True)
-        self.content_type = unicode(self.root.xpath('local-name()'))
-        if self.content_type == 'publication':
-            self.label = self.xpath('ex:title/text()', 'label', unicode, required=True)
-        else:
-            self.label = self.xpath('ex:name/text()', 'label', unicode, required=True)
-            
-        self.when_modified = datetime(*time.gmtime(os.path.getmtime(path))[:6])
-        self.deleted = False
-        self.sets = self.xpath('ex:set/@ref', 'set', unicode, multi=True)
-        self.sets.append(self.content_type)
-        self.sets.extend(self.xpath('ex:scope/text()', 'scope', unicode, multi=True))
-        self.is_set = self.content_type == u'set'
-
-        self._assets = []
-        if self.content_type == u'person':
-            self._fields = self.set_person_fields()
-        elif self.content_type == u'set':
-            self._fields = self.set_set_fields()
-        else:
-            self._fields = self.set_publication_fields()
-
-        # Instead of letting the updater fail the record on not-valid XML,
-        # remove the conflicting characters 
-        #self._sanitize()
-
-    def set_set_fields(self):
-        return {u'name': [self.label],
-                u'description': self.xpath(
-            'ex:description/text()',
-            'description',
-            unicode,
-            multi=True)}
-        
-    def set_publication_fields(self):
-        fields = {
-            u'description': [
-            self.xpath('ex:abstract/text()', 'abstract', unicode)],
-            u'title': [self.label],
-            u'date': self.xpath('ex:issued/text()',
-                                'subject', datetime, multi=True),
-            u'subject': self.xpath('ex:keyword/text()',
-                                   'subject', unicode, multi=True),
-            u'identifier': ['http://purl.example.org/%s' % self.id],
-            u'language': self.xpath('ex:abstract/@xml:lang',
-                                    'author', unicode, multi=True),
-            u'type': [self.content_type]
-        }
-
-        if fields['date']:
-            # fields should always be unicode
-            fields['date'] = [unicode(fields['date'][0].isoformat())]
-        
-        if 'public' in self.sets:
-           fields[u'rights'] = [u'public domain, no restrictions']
-
-        authors = []
-        author_rel = []
-        ids = self.xpath('ex:author/@ref', 'author', unicode, multi=True)
-        for id in ids:
-            author_rel.append(id)
-            person = ExampleContentObject()
-            person.update(
-                self.provider.get_content_by_id(id.replace(':','_')+'.xml'),
-                self.provider)
-            authors.append(person.label)
-        fields[u'author'] = authors
-        fields[u'author_rel'] = author_rel
-        fields[u'contributor'] = authors
-        fields[u'url'] = [u'http://hdl.handle.net/????/%s' % self.id]
-        fields[u'dare_id'] = [u'urn:NBN:nl:ui:??-%s' %self.id]
-
-        for el in self.root.xpath('ex:asset', namespaces=self.nsmap):
-            asset = {}
-            for child in el.xpath('*[text()]'):
-                asset[child.tag.split('}')[-1]] = unicode(child.text)
-            assert u'filename' in asset, 'found asset without filename'
-            assert u'mimetype' in asset, 'found asset without mimetype'
-            asset[u'url'] = u'asset/%s/%s' % (
-                self.id,
-                asset['filename'])
-            
-            path = os.path.join(os.path.dirname(__file__),
-                                'example_data',
-                                'assets', self.id,
-                                asset['filename'])
-            assert os.path.isfile(path), "Can not find asset: %s" % path
-
-            asset[u'absolute_uri'] = u'file://%s' % path
-            asset[u'md5'] = u''
-            asset[u'metadata'] = {}
-            if asset[u'access']:
-                asset[u'metadata'][u'access'] = [asset[u'access']]
-                del asset[u'access']
-            if asset[u'modified']:
-                asset[u'metadata'][u'modified'] = [asset[u'modified']]
-                del asset[u'modified']
-            self._assets.append(asset)
-                
-        #fields[u'asset'] = assets
-        
-        return fields
-
-    def get_assets(self):
-        return self._assets
-
-    def set_person_fields(self):
-        fields = {
-            u'name' : [self.label],
-            u'surname': self.xpath('ex:surname/text()',
-                                   'surname', unicode, multi=True),
-            u'firstname': self.xpath('ex:firstname/text()',
-                                     'firstname', unicode, multi=True),
-            u'initials': self.xpath('ex:initials/text()',
-                                    'initials', unicode, multi=True),
-            u'dai': self.xpath('ex:dai/text()',
-                               'initials', unicode, multi=True),
-            }
-        return fields

moai/examples/example_data/assets/publication:1/test.txt

-This is an asset

moai/examples/example_data/persons/person_1.xml

-<person xmlns="http://example.org">
-    <id>person:1</id>
-    <name>Doe, J.</name>
-    <surname>Doe</surname>
-    <firstname>Joe</firstname>
-    <initials>J.</initials>
-    <dai>12345</dai>
-</person>

moai/examples/example_data/publications/2008/publication_1.xml

-<publication xmlns="http://example.org">
-    <id>publication:1</id>
-    <title>An example publication</title>
-    <set ref="set:1"/>
-    <scope>public</scope>
-    <author ref="person:1"/>
-    <abstract xml:lang="en">
-        This is an example publication that comes with the OAIMetaServer
-    </abstract>
-    <keyword>test</keyword>
-    <keyword>example</keyword>
-    <issued>2008-10-31T12:36:00</issued>
-    <asset>
-      <filename>test.txt</filename>
-      <mimetype>text/plain</mimetype>
-      <access>open</access>
-      <modified>2010-04-25T12:36:00</modified>
-    </asset>
-</publication>

moai/examples/example_data/sets/set_1.xml

-<set xmlns="http://example.org">
-    <id>set:1</id>
-    <name>Example Publications</name>
-    <description>Just an example set for testing</description>
-</set>

moai/examples/example_plugin.py

-
-from moai import Plugin, name
-
-class ExamplePlugin(Plugin):
-    name('example_plugin')
-
-    def __init__(self, database, log, config):
-        self.db = database
-        self.log = log
-        self.config = config
-        
-    def run(self, updated_ids):
-        self.log.info('Hello %s from ExamplePlugin' % self.config['hello'])
-        
-        print 'Hello %s from example plugin -> Updating %s records' % (
-                self.config['hello'], len(updated_ids))

moai/examples/simple/__init__.py

Empty file removed.

moai/examples/simple/config.py

-import os
-import shutil
-
-from moai import ConfigurationProfile, name
-from moai.update import DatabaseUpdater
-from moai.provider.file import FileBasedContentProvider
-from moai.server import Server, FeedConfig
-from moai.http.cherry import start_server
-from moai.database.sqlite import SQLiteDatabase
-from moai.examples.simple.content import SimpleDCContentObject
-            
-class SimpleDCConfiguration(ConfigurationProfile):
-    name('simple_dc_configuration')
-    
-    def get_content_provider(self):
-        provider = FileBasedContentProvider(self.config['path'], '*.xml')
-        provider.set_logger(self.log)
-        return provider
-
-
-    def get_database_updater(self):
-
-        dbpath = '/tmp/moai.new.db'
-        if os.path.isfile(dbpath):
-            self.log.warning('removing old moai.new.db')
-            os.remove(dbpath)
-        
-        return DatabaseUpdater(self.get_content_provider(),
-                               SimpleDCContentObject,
-                               SQLiteDatabase(dbpath, 'w'),
-                               self.log)
-
-    def get_database(self):
-        if os.path.isfile('/tmp/moai.new.db'):
-            shutil.move('/tmp/moai.new.db',
-                        '/tmp/moai.db')
-            
-        return SQLiteDatabase('/tmp/moai.db', 'r')
-    
-    def get_server(self):
-        server_url = 'http://%s:%s/repo' % (self.config['host'],
-                                            self.config['port'])
-        asset_path = os.path.join(os.path.dirname(__file__), 'data')
-                                  
-        server = Server(server_url,
-                        self.get_database())
-        server.add_config(
-            FeedConfig('example',
-                       'An example OAI Server',
-                       '%s/example' % server_url,
-                       self.log,
-                       base_asset_path=asset_path,
-                       metadata_prefixes=['oai_dc', 'mods',
-                                          'didl', 'nl_didl']))
-        return server
-                   
-    def start_development_server(self):
-        start_server('127.0.0.1', self.config['port'], 10, 'repo', self.get_server())
-
-        

moai/examples/simple/content.py

-import os
-import time
-from datetime import datetime
-
-from lxml import etree
-
-from moai.content import XMLContentObject
-
-class SimpleDCContentObject(XMLContentObject):
-
-    def update(self, path, provider):
-        self.provider = provider
-        self.nsmap = {'dc':'http://purl.org/dc/elements/1.1/'}
-        doc = etree.parse(path)
-        self.root = doc.getroot()
-
-        self.id = self.xpath('dc:identifier[not(@scheme)]/text()',
-                             'identifier', unicode, required=True)
-        self.content_type = u'publication'
-        self.label = self.xpath('dc:title/text()',
-                                'label', unicode, required=True)
-        self.is_set = False
-        
-        self.when_modified = datetime(*time.gmtime(os.path.getmtime(path))[:6])
-        self.deleted = False
-        self.sets = []
-        self.sets.extend(self.xpath('dc:subject/text()',
-                                    'subject', unicode, multi=True))
-
-        self._assets = [{u'filename': u'%s.pdf' % self.id,
-                         u'url': u'asset/%s/%s.pdf' % (self.id, self.id),
-                         u'absolute_uri': u'',
-                         u'mimetype': u'application/pdf',
-                         u'md5': u'',
-                         u'metadata': {}}]
-
-
-        
-        self._fields = self.set_publication_fields()
-        
-    def set_publication_fields(self):
-        fields = {
-            u'description': self.xpath('dc:description/text()',
-                                       'description', unicode, multi=True),
-            u'title': [self.label],
-            u'date': self.xpath('dc:date/text()',
-                                'date', datetime, multi=True),
-            u'subject': self.xpath('dc:subject/text()',
-                                   'subject', unicode, multi=True),
-            u'identifier': [self.id],
-            u'language': self.xpath('dc:lanauge/text()',
-                                    'language', unicode, multi=True),
-            u'type': self.xpath('dc:type/text()',
-                                'type', unicode, multi=True),
-            u'url': self.xpath('dc:identifier[@scheme="dcterms:URI"]/text()',
-                               'identifier', unicode, multi=True),
-            u'author': self.xpath('dc:creator/text()',
-                                  'creator', unicode, multi=True),
-        }
-        
-        if fields['date']:
-            # fields should always be unicode
-            fields['date'] = [unicode(fields['date'][0].isoformat())]
-
-        return fields
-    
-    def get_assets(self):
-        return self._assets

moai/examples/simple/data/Py3kEuro08/Py3kEuro08.pdf

Binary file removed.

moai/examples/simple/data/Py3kEuro08/Py3kEuro08.xml

-<oai_dc:dc 
- xmlns:oai_dc="http://www.openarchives.org/OAI/2.0/oai_dc/"
- xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" 
- xmlns:dc="http://purl.org/dc/elements/1.1/"
- xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/oai_dc/ http://www.openarchives.org/OAI/2.0/oai_dc.xsd">
-  <dc:title>Python 3000 and You</dc:title>
-  <dc:subject>python</dc:subject>
-  <dc:subject>py3k</dc:subject>
-  <dc:description>
-    The keynote held by Guido van Rossum at EuroPython 2008
-  </dc:description>
-  <dc:creator>Guido van Rossum</dc:creator>
-  <dc:type>Keynote</dc:type>
-  <dc:identifier>Py3kEuro08</dc:identifier>
-  <dc:identifier scheme="dcterms:URI">
-    http://www.europython2008.eu/Keynotes
-  </dc:identifier>
-  <dc:language>en</dc:language>
-  <dc:date>2008-07-10T12:36:00</dc:date>
-</oai_dc:dc>
-
     
     def listSets(self, cursor=0, batch_size=20):
         for set in self.db.oai_sets(cursor, batch_size):
-            oai_id = self.config.get_setspec_id(set['id'])
-            yield [oai_id, set['name'], set['description']]
+            yield [set['id'], set['name'], set['description']]
 
     def listRecords(self, metadataPrefix, set=None, from_=None, until=None,
                     cursor=0, batch_size=10):
 
     def _createHeader(self, record):
         oai_id = self.config.get_oai_id(record['record']['id'])
-        datestamp = record['record']['when_modified']
-        sets = [self.config.get_setspec_id(s) for s in record['sets']]
+        datestamp = record['record']['modified']
+        sets = record['sets']
         deleted = record['record']['deleted']
         for deleted_set in self.config.sets_deleted:
             if deleted_set in record['sets']:

moai/provider/file.py

     """
     implements(IContentProvider)
 
-    def __init__(self, path, content_filter="*"):
-        self._path = path
+    def __init__(self, uri, content_filter="*"):
+        assert uri.startswith('file://'), 'unknown uri format'
+        path = uri[7:]
+        basedir, filename = os.path.split(path)
+        if '*' in filename or '?' in filename:
+            content_filter = filename
+            self._path = basedir
+        else:
+            self._path = path
         self._filter = content_filter
         self._content = {}
 
 
     implements(IServer)
 
-    def __init__(self, base_url, db, config, content):
+    def __init__(self, base_url, db, config):
         self.base_url = base_url
         self._db = db
         self._config = config
-        self._content = content
 
     def download_asset(self, req, url, config):
         """Download an asset
 import logging.handlers
 import datetime
 import pkg_resources
+from pkg_resources import iter_entry_points
 import ConfigParser
 
 from optparse import OptionParser
 
 from moai.utils import (parse_config_file,
                         get_duration,
+                        get_moai_log,
                         ProgressBar)
+from moai.update import DatabaseUpdater
+from moai.database import Database
 
 VERSION = pkg_resources.working_set.by_key['moai'].version
                  
         sys.exit(1)
     configfile = ConfigParser.ConfigParser()
     configfile.read(config_path)
+    profiles = []
+    config = {}
     for section in configfile.sections():
-        if section == 'app:%s' % profile_name:
-            break
-    else:
+        if not configfile.has_option(section, 'use'):
+            continue
+        if configfile.get(section, 'use') == 'egg:moai':
+            profiles.append(section.split(':', 1)[1])
+        if profile_name == section.split(':', 1)[1]:
+            for option in configfile.options(section):
+                config[option] = configfile.get(section, option)
+            
+    if not profile_name in profiles:
         sys.stderr.write('unknown profile: %s\n' % profile_name)
+        sys.stderr.write('known profiles are: %s\n' % ', '.join(profiles))
         sys.exit(1)
-        raise ValueError('No such profile found: %s' % profile_name)
 
-    config = {}
-    for option in configfile.options(section):
-        config[option] = configfile.get(section, option)
+    database = Database(config['database'])
+    for content_point in iter_entry_points(group='moai.content',
+                                           name=config['content']):
+        content_class = content_point.load()
 
-        
-    import ipdb; ipdb.set_trace()
+    provider_name = config['provider'].split(':', 1)[0]
+    for provider_point in iter_entry_points(group='moai.provider',
+                                           name=provider_name):
+        provider = provider_point.load()(config['provider'])
+
+    log = get_moai_log()
+    updater = DatabaseUpdater(provider,
+                              content_class,
+                              database,
+                              log,
+                              flush_threshold=-1)
     
-    updater = profile.get_database_updater()
     progress = ProgressBar()
     error_count = 0
     starttime = time.time()
         msg_count = ('%%0.%sd/%%s' % len(str(total))) % (count, total)
         if not error is None:
             error_count += 1
-            profile.log.error('%s %s' % (msg_count, error.logmessage()))
+            log.error('%s %s' % (msg_count, error.logmessage()))
             if options.debug:
                 print >> sys.stderr, '\n'
                 import traceback
         elif options.quiet:
             pass
         elif options.verbose:
-            profile.log.info('%s Added %s'  % (msg_count, id))
+            log.info('%s Added %s'  % (msg_count, id))
         else:
             progress.tick(count, total)
         updated.append(id)
 
     duration = get_duration(starttime)
     msg = 'Updating database with %s objects took %s' % (total, duration)
-    profile.log.info(msg)
+    log.info(msg)
     if not options.verbose and not options.quiet:
         print >> sys.stderr, msg
 
         if error_count > 1:
             multi = 's'
         msg = '%s error%s occurred during updating' % (error_count, multi)
-        profile.log.warning(msg)
+        log.warning(msg)
         if not options.verbose and not options.quiet:
             print >> sys.stderr, msg
 
-    plugin_names = moai.get_plugin_names()
-    configured_plugins = profile.config.get('plugins', [])
-    plugin_names = [n for n in plugin_names if n in configured_plugins]
-   
-    if len(plugin_names) == 0:
-        sys.exit(0)
-    
-    for num, name in enumerate(plugin_names):
-        num += 1
-        msg = 'Running plugin %s/%s: %s' % (num,
-                                            len(plugin_names),
-                                            name)
-        if not options.verbose and not options.quiet:
-            print >> sys.stderr, msg
-        profile.log.info(msg)
-        config = parse_config_file(configfile, name)
-
-        plugin = moai.get_plugin(name)(updater.db,
-                                       profile.log,
-                                       config)
-        try:
-            plugin.run(updated)
-        except Exception, err:
-            errname = type(err).__name__
-            if not options.quiet:
-                print >> sys.stderr, '-> %s: %s' % (errname, err)
-            profile.log.error('Error while running plugin %s:\n%s' % (name, err))
-            if options.debug:
-                raise
-from lxml.builder import E
-
-from zope.interface import implements
-
-from moai.interfaces import IDatabaseUpdater
 from moai.error import ContentError, DatabaseError
 
 class DatabaseUpdater(object):
     (implementations of :ref:`IContentProvider` and :ref:`IContentObject`)
     """
 
-    implements(IDatabaseUpdater)
-
     def __init__(self, content, content_class, database, log, flush_threshold=-1):
         self.set_database(database)
         self.set_content_provider(content)
                 content_data = self._provider.get_content_by_id(content_id)
                 content = self._content_object_class()
                 stop = content.update(content_data, self._provider)
+                
                 if stop is False:
                     self._log.info('Ignoring %s' % content_id)
                     continue
                        ContentError(self._content_object_class, content_id))
                 continue
 
-            # Test the content for xml compatibility
+            # Not a set, compose the record
             try:
-                self._xml_comp_error(content)
-            except ValueError, err:
-                if not supress_errors:
-                    raise ValueError(err)
-                yield (count, total, content_id, 
-                       ContentError(self._content_object_class, content_id))
-                continue
-
-            # If it is a set, dump the db-cache
-            if content.is_set:
-                try:
-                    self.db.add_set(content.id, content.label, 
-                                            content.get_values('description'))
-                except Exception:
-                    if not supress_errors:
-                        raise
-                    yield (count, total, content.id, DatabaseError(content.id, 
-                           'set'))
-                    continue
-                yield count, total, content.id, None
-                continue
-
-            # Not a set, compose the record
-            record_data = {'id': content.id,
-                           'content_type': content.content_type,
-                           'is_set': content.is_set,
-                           'when_modified': content.when_modified,
-                           'deleted': content.deleted}
-            id = content.id
-            sets = content.sets
-            assets = content.get_assets()
-
-            got_error = False
-            metadata = {}
-            for name in content.field_names():
-                try:
-                    metadata[name] = content.get_values(name)
-                except Exception:
-                    if not supress_errors:
-                        raise
-                    yield count, total, id, DatabaseError(id, 'set')
-                    got_error = True
-                    break
-            if got_error:
-                continue
-
-            try:
-                self.db.add_content(id, sets, record_data, metadata, assets)
+                self.db.update_record(content.id,
+                                      content.modified,
+                                      content.deleted,
+                                      content.sets,
+                                      content.data)
             except Exception:
                 if not supress_errors:
                     raise
-                yield count, total, id, DatabaseError(id, 'set')
+                yield count, total, content.id, DatabaseError(id, 'set')
                 continue
            
-            yield count, total, id, None
+            yield count, total, content.id, None
 
         # Always flush db-cache
         try:
-            self.db.flush_update()
+            self.db.flush()
         except Exception, err:
             if not supress_errors:
                 raise
-
-    def _xml_comp_error(self, content):
-        # Check content for XML comp., discard record on fail
-        # Illegal content might be replaced in the IContentObject 
-        # implementation, so the record gets included
-
-        for foo in ['id', 'label', 'content_type']:
-            try:
-                bar = eval('content.' + foo)
-                E("foo", bar)
-            except ValueError, err:
-                raise ValueError("\n\n%s = %s\n" %(foo, repr(bar)))
-        
-        for foo in content.sets:
-            try:
-                E("foo", foo)
-            except ValueError, err:
-                raise ValueError("%s 'sets' = %s\n" %(content.id, repr(foo)))
-
-        for name in content.field_names():
-            try:
-                for value in content.get_values(name):
-                    E("foo", value)
-            except ValueError, err:
-                raise ValueError("%s : metadata[%s] = %s" %(
-                                                content.id, name, repr(value)))
-
 import os
-from pkg_resources import iter_entry_points
 
 from webob import Request, Response
 
     formats = formats.split()
     admin_email = admin_email.split()
 
-    for content_point in iter_entry_points(group='moai.content', name=content):
-        content_class = content_point.load()
-        break
-    else:
-        raise ValueError('No such content profile: %s' % content)
-
     database = Database(database)
     feedconfig = FeedConfig(name,
                             url,
                             admin_emails=admin_email,
                             metadata_prefixes=formats)
-    content = content_class()
-    server = Server(url, database, feedconfig, content_class)
+    server = Server(url, database, feedconfig)
     return MOAIWSGIApp(server)
 
 class FileIterable(object):
 formats = oai_dc mods
 disallow_sets = private
 database = sqlite:///moai-example.db
-provider = file://moai/examples/*.xml
+provider = file://moai/example-*.xml
 content = moai_example
 
 [server:main]