Commits

Martin Czygan committed d4ff3d3

reworked components and some simplifications

- inspired by
http://dirtsimple.org/2007/02/wsgi-middleware-considered-harmful.html
started to wrap our data dict into a metadata bag (util.mdbag)
this dict subclass should provide library functions to the most common
manipulations
- top level attributes of our import configuration can now be accessed
like attributes, e.g. import_conf.location or import_conf.source_id
- Processor started using the mdbag and now has slightly saner names
for its instance attributes
- We now have three commands (renamed, since pylint doesn't like
lowercase class names): Copyall, Copy, GetFincID
- RedisStore uses pickle now (instead of json), but this yields some
errors on compound types like MARC records; TODO: fix this
- reworked encode_fincid and decode_fincid to accept optional source ids
and to handle numbers with a trailing X (PPN?) appropriately

  • Participants
  • Parent commits 6ce38e6

Comments (0)

Files changed (7)

config/v2/simple-fi.mc.xml

         needs to talk to our metadata store to find out, whether
         it has been imported from somewhere already.
         -->
-        <get_or_set_finc_id dst="001" />
+        <get_finc_id dst="001" />
     </process>
 
     <export kind="intermediate">

mdhub/commands/__init__.py

 # -*- coding: utf-8 -*-
 
+"""
+Commands/Transformations implementation.
+"""
+
 import logging
 import copy
 import pymarc
+from mdhub import util
 
 FORMAT = '%(asctime)s %(name)-12s %(levelname)-8s %(funcName)-10s %(message)s'
 log = logging.getLogger('mdhub.commands')
     def __repr__(self):
         return u'<{0} {1}>'.format(self.name, self.argmap.items())
 
-    def execute(self, data):
-        log.debug("Executing {0} on data: {1}".format(self.name, data))
-        return data
+    def execute(self, bag):
+        """
+        Stub implementation of execute()
+        """
+        log.debug("Executing {0} on bag: {1}".format(self.name, bag))
+        return bag
 
-class command_noop(Command):
+class Noop(Command):
     """
     An example noop command. Just some kind of identity function.
     """
-    def execute(self, data):
-        super(command_noop, self).execute(data)
-        return data
+    pass
 
-class command_copyall(Command):
+class Copyall(Command):
     """
     Copy all tags/field into our own MARC record.
     """
-    def execute(self, data):
-        if data['type'] == 'marc':
-            data['marc'] = copy.deepcopy(data['original'])
+    def execute(self, bag):
+        if bag['type'] == 'marc':
+            bag['marc'] = copy.deepcopy(bag['original'])
             log.debug("Created deep copy of original MARC record.")
         else:
             log.error("Don't know how to copyall on non-MARC input.")
-            raise NotImplementedError()
-        return data
+            raise NotImplementedError
+        return bag
 
-class command_copy(Command):
+class Copy(Command):
     """
     Copy a ``src`` field value into a ``dst`` field value.
     """
-    def execute(self, data):
-        if data['type'] == 'marc':
-            src = self.argmap['src']
-            dst = self.argmap['dst']
-            field = data['original'].get_fields(src)
-            try:
-                data['marc'].remove_field(field)
-            except pymarc.exceptions.FieldNotFound, fnf:
-                log.warning("Tried to delete a non existing field.")
-            for fld in field:
-                fld.tag = dst
-            data['marc'].add_field(field)
+    def execute(self, bag):
+        if bag['type'] == 'marc':
+            src, dst = self.argmap['src'], self.argmap['dst']
+            fields = copy.deepcopy(bag['original'].get_fields(src))
+            for field in fields:
+                field.tag = dst
+            bag['marc'].add_field(fields)
         else:
             log.error(
                 "Don't know how to map non-MARC field to MARC fields yet.")
-            raise NotImplementedError()
-        return data
+            raise NotImplementedError
+        return bag
 
-class command_get_or_set_finc_id(Command):
+class GetFincID(Command):
     """
-    Get a finc id. Either a finc id identified by source and record id is
+    Get a finc id. Either a FINC ID identified by source and record id is
     already in our DB or it isn't.
     """
-    pass
+    def execute(self, bag):
+        store = bag['store']
+        if bag['type'] == 'marc':
+            finc_id = util.encode_fincid(bag.get_original_marc_value('001'),
+                source_id=bag['source_id'])
+            record = store.get_record(finc_id)
+            if record == None:
+                store.set_record(finc_id, 'dummy')
+                log.debug("Stored raw FINC record: {0}".format(finc_id))
+            else:
+                log.debug("We already got FINC id: {0}".format(finc_id))
+            bag['finc_id'] = finc_id
 
 COMMAND_MAP = {
-    'noop' : command_noop,
-    'copyall' : command_copyall,
-    'copy' : command_copy,
-    'get_or_set_finc_id' : command_get_or_set_finc_id,
+    'noop' : Noop,
+    'copyall' : Copyall,
+    'copy' : Copy,
+    'get_finc_id' : GetFincID,
 }
 
 if __name__ == '__main__':
     pass
+

mdhub/importconfig.py

 from lxml import objectify
 from mdhub.commands import command_factory
 
-class FincImportConfiguration(object):
+class ImportConfiguration(object):
     """
     Configuration wrapper around our import XML files.
     """
         self._datasource_attributes = None
         self._commands = []
 
+    def __getattr__(self, attr):
+        if attr in self.datasource_attributes:
+            return self.datasource_attributes[attr]
+        raise AttributeError
+
     @property
     def root_node(self):
         """
 
     @property
     def datasource_attributes(self):
-        """ Datasource attribute. These attributes are global.
+        """ Datasource attributes. These attributes are 'global'.
         """
         if self._datasource_attributes == None:
             self._datasource_attributes = dict(self.root_node[0].items())
         print "Commands: {0}".format(self.commands)
 
 if __name__ == '__main__':
-    fi = FincImportConfiguration('/Users/ronit/bitbucket/miku/mdhub/config/v2/simple-fi.mc.xml')
-    print fi.datasource_attributes['type']
-    print fi.datasource_attributes
-    print fi.commands
+    import_conf = ImportConfiguration('/Users/ronit/bitbucket/miku/mdhub/config/v2/simple-fi.mc.xml')
+    print import_conf.datasource_attributes['type']
+    print import_conf.datasource_attributes
+    print import_conf.commands
 

mdhub/location.py

 import logging
 
 log = logging.getLogger('mdhub.location')
-logging.basicConfig(level=logging.DEBUG,
+logging.basicConfig(level=logging.INFO,
     format='%(asctime)s %(name)-12s %(levelname)-8s %(message)s',
     datefmt='%m-%d %H:%M')
 
                 filename, value_error))
             # force return an empty iterator
             return EmptyInterator()
-    return pymarc.MARCReader(file(filename))
+    return pymarc.MARCReader(file(filename), to_unicode=True, force_utf8=True)
 
 def zip_iterator(filename):
     """
             for entry in zip_iterator(fname):
                 yield entry
         elif fname.endswith('.csv'):
-            raise NotImplementedError()
+            raise NotImplementedError
         elif fname.endswith('.xml'):
-            raise NotImplementedError()
+            raise NotImplementedError
 
 if __name__ == '__main__':
     # In my home data dir, I have tar.gz's and plain marc files mixed

mdhub/processor.py

 """
 
 import ConfigParser
-import pymarc
 import time
 import sys
 import os
+import copy
 from mdhub import store
 from mdhub import importconfig
 from mdhub import location
+from mdhub import exc
+from mdhub import util
 
+def get_store_type(store_conf_path):
+    """
+    For a path to store configuration file, get the
+    kind of store we are dealing with.
+    """
+    parser = ConfigParser.SafeConfigParser()
+    parser.read(store_conf_path)
+    return parser.sections()[0]
 
 class Processor(object):
     """
     Hi, I'm god. Maybe you should refactor me.
     """
-    def __init__(self, import_configuration, store_configuration):
+    def __init__(self, import_conf_path, store_conf_path):
         """
         Initialize the god processor. ``import_configuration`` and
         ``store_configuration`` are pathes to the configuration files,
         not objects.
         """
-        self.import_configuration = os.path.expanduser(import_configuration)
-        self.store_configuration = os.path.expanduser(store_configuration)
+        self.import_conf_path = os.path.expanduser(import_conf_path)
         # get a handle on a import configuration
-        self.fi = importconfig.FincImportConfiguration(
-            self.import_configuration)
+        self.import_conf = importconfig.ImportConfiguration(
+            self.import_conf_path)
 
         # Just get the store type out of the config
         # and set ``self.store`` accordingly (without import magic)
-        config_parser = ConfigParser.SafeConfigParser()
-        config_parser.read(self.store_configuration)
-        self.store_type = config_parser.sections()[0]
+        self.store_conf_path = os.path.expanduser(store_conf_path)
+        self.store_type = get_store_type(self.store_conf_path)
         if self.store_type == 'redis':
-            self.store = store.RedisStore(self.store_configuration)
+            self.store = store.RedisStore(self.store_conf_path)
+        elif self.store_type == 'rdbms':
+            raise NotImplementedError
+        else:
+            raise exc.MetaDataHubException(
+                "No suitable store defined in: {0}".format(
+                    self.store_conf_path))
         # TODO: Add more store types here ...
 
     def process(self):
         """
-        Given a finc import configuration (``self.import_configuration``) and
-        a store (DB) configuration (``self.store_configuration``) just make it
-        happen.
+        Given a FINC import configuration (``self.import_configuration``) and
+        a store (key-value store, RDBMS) configuration
+        (``self.store_configuration``) process all available records and
+        feed them to our store appropriately.
         """
         start = time.time()
         records, commands = 0, 0
-        for record_iterator in location.record_iterator(
-            self.fi.datasource_attributes['location']):
+        iterator = location.record_iterator(self.import_conf.location)
+
+        for record_iterator in iterator:
             for record in record_iterator:
                 # Bootstrap internal representation of the metadata:
                 # This dict will be passed along all commands; each command
                 # is free to add and modify keys and values.
-                data = {
-                    # import configuration
-                    'fi' : self.fi,
-                    # store configuration: this is needed to ask for
-                    # finc ids and such
-                    'store' : self.store,
-                    # the original item (marc, csv, xml, ...)
-                    # TODO: this should be made immutable
-                    'original' : record,
-                    # a shortcut to the type of the original item
-                    'type' : self.fi.datasource_attributes['type'],
-                    # our export marc
-                    'marc' : pymarc.Record(),
-                }
-                for command in self.fi.commands:
-                    data = command.execute(data)
+                bag = util.mdbag()
+                bag['import_conf'] = self.import_conf
+                bag['store'] = self.store
+                bag['original'] = copy.deepcopy(record) # we store a python object at the moment
+
+                for command in self.import_conf.commands:
+                    bag = command.execute(bag)
                     commands += 1
+
                 # ``data`` is processed by now - should
                 # be handed over to the export side of things
                 records += 1
+
         stop = time.time()
         print >> sys.stderr, \
             "Processed {0} records and {1} commands in {2:.4f} seconds".format(
 
 import redis
 import ConfigParser
-import json
+import pickle
 
-class Store(object):
-    """
-    Not sure if we'll actually need this.
-    """
-    def __init__(self):
-        pass
-
-class RedisStore(Store):
+class RedisStore(object):
     """
     Redis store for debugging.
     """
         self.config.read(configuration)
         self.database = redis.Redis(
             host=self.config.get('redis', 'host'),
-            port=self.config.get('redis', 'port'),
-            db=self.config.get('redis', 'database'),
+            port=int(self.config.get('redis', 'port')),
+            db=int(self.config.get('redis', 'database')),
         )
 
     def get_record(self, finc_id):
         """
         Just return a plain record or None if ``finc_id`` isn't taken yet.
         """
-        return self.database.get(finc_id) # returns None on miss
+        result = self.database.get(finc_id) # returns None on miss
+        if not result == None:
+            result = pickle.loads(result)
+        return result
 
     def set_record(self, finc_id, payload):
         """
         Set record with finc_id as key. Payload will be *jsonified*
         by default.
         """
-        try:
-            _ = json.loads(payload)
-            jsonified = payload
-        except ValueError:
-            jsonified = json.dumps(payload)
-        self.database.set(finc_id, jsonified)
+        self.database.set(finc_id, pickle.dumps(payload))
 
 """
 
 import string
+import logging
+import re
+import collections
+from mdhub import exc
+
+FORMAT = '%(asctime)s %(name)-12s %(levelname)-8s %(funcName)-10s %(message)s'
+log = logging.getLogger('mdhub.commands')
+logging.basicConfig(level=logging.INFO,
+    format=FORMAT, datefmt='%m-%d %H:%M')
+
+CODES = string.digits + string.letters
+
+def _encode_fincid(identifier, length=5):
+    """
+    Internal method which doesn't do sanity check. Should
+    not be used directly.
+    """
+    result = ''
+    while identifier > 0:
+        result += CODES[identifier % 62]
+        identifier -= identifier % 62
+        identifier /= 62
+    result = result[::-1]
+    result = result.rjust(length, '0')
+    return result
+
+def encode_fincid(identifier, source_id=None):
+    """ Encodes a record identifier to a FINC-ID. Optionally a ``source_id``
+    can be blended in. The ``length`` parameter specifies the padding.
 
-def encode_fincid(number, length=5):
-    """ ``length`` is for padding.
     >>> encode_fincid(0)
     '00000'
-    >>> encode_fincid(1000)
-    '000g8'
+    >>> encode_fincid(1)
+    '00001'
     >>> encode_fincid(916132831)
     'ZZZZZ'
+
+    >>> encode_fincid('0')
+    '00000'
+    >>> encode_fincid('1')
+    '00001'
+    >>> encode_fincid('916132831')
+    'ZZZZZ'
+
+    >>> encode_fincid(0, source_id=1)
+    '00100000'
+    >>> encode_fincid(1, source_id=1)
+    '00100001'
+    >>> encode_fincid(916132831, source_id=1)
+    '001ZZZZZ'
+
+    >>> encode_fincid(0, source_id=0)
+    '00000000'
+    >>> encode_fincid(1, source_id=1)
+    '00100001'
+    >>> encode_fincid(916132831, source_id=238327)
+    'ZZZZZZZZ'
+
+    >>> encode_fincid('0X')
+    '00000X'
+    >>> encode_fincid('1X')
+    '00001X'
+    >>> encode_fincid('916132831X')
+    'ZZZZZX'
+
+    >>> encode_fincid('0X', source_id=0)
+    '00000000X'
+    >>> encode_fincid('1X', source_id=1)
+    '00100001X'
+    >>> encode_fincid('916132831X', source_id=238327)
+    'ZZZZZZZZX'
     """
-    codes = string.digits + string.letters
+    char = None
+    if not isinstance(identifier, int):
+        # Default numeric identifier
+        if re.match('^\d+$', identifier):
+            identifier = int(identifier)
+
+        # PPN-style identifier, numbers followed by a letter (X typically)
+        elif re.match('(\d+)([^\d])', identifier):
+            identifier, char = re.match('(\d+)([^\d])', identifier).groups()
+            identifier = int(identifier)
+        else:
+            log.error(
+                "Given identifier is neither numeric nor PPN-like: {0}".format(
+                    identifier))
+            raise exc.MetaDataHubException(
+                "Given identifier is neither numeric nor PPN-like: {0}".format(
+                    identifier))
+
+    if not (isinstance(source_id, int) or source_id == None):
+        # source_id should be int
+        if re.match('^\d+$', source_id):
+            source_id = int(source_id)
+        else:
+            log.error("Given source_id is not numeric: {0}".format(source_id))
+            raise exc.MetaDataHubException(
+                "Given source_id is not numeric: {0}".format(source_id))
+
     result = ''
-    while number > 0:
-        result += codes[number % 62]
-        number -= number % 62
-        number /= 62
-    result = result[::-1]
-    result = result.rjust(length, '0')
+    if not source_id == None:
+        result += _encode_fincid(source_id, length=3)
+    result += _encode_fincid(identifier, length=5)
+    if not char == None:
+        result += char
     return result
 
-def decode_fincid(number):
+def _decode_fincid(finc_id):
     """
+    Internal method which doesn't do sanity check. Should
+    not be used directly.
+
     >>> decode_fincid('00000')
-    0
+    '0'
     >>> decode_fincid('000g8')
-    1000
+    '1000'
     >>> decode_fincid('ZZZZZ')
-    916132831
+    '916132831'
+
+    >>> decode_fincid('00000X')
+    '0X'
+    >>> decode_fincid('000g8X')
+    '1000X'
+    >>> decode_fincid('ZZZZZX')
+    '916132831X'
+
+    >>> decode_fincid('00100000X', with_source=True)
+    ('1', '0X')
+    >>> decode_fincid('001000g8X', with_source=True)
+    ('1', '1000X')
+    >>> decode_fincid('001ZZZZZX', with_source=True)
+    ('1', '916132831X')
     """
-    codes = string.digits + string.letters
     result, position = 0, 1
-    number = number[::-1]
-    for c in number[:]:
-        result += position * codes.index(c)
+    finc_id = finc_id[::-1]
+    for c in finc_id[:]:
+        result += position * CODES.index(c)
         position *= 62
-    return result
+    return str(result)
+
+def decode_fincid(finc_id, with_source=False):
+    """
+    Decode FINC ID with sanity basic checks.
+    """
+    if not with_source:
+        if len(finc_id) == 5:
+            return _decode_fincid(finc_id)
+        elif len(finc_id) == 6:
+            return _decode_fincid(finc_id[:5]) + finc_id[5]
+        else:
+            log.error("Suspect FINC ID: {0}".format(finc_id))
+            raise exc.MetaDataHubException(
+                "Suspect FINC ID: {0}".format(finc_id))
+    else:
+        if len(finc_id) == 8:
+            source_encoded = finc_id[:3]
+            record_encoded = finc_id[3:8]
+            return (_decode_fincid(source_encoded),
+                    _decode_fincid(record_encoded))
+        elif len(finc_id) == 9:
+            source_encoded = finc_id[:3]
+            record_encoded = finc_id[3:8]
+            char = finc_id[8]
+            return (_decode_fincid(source_encoded),
+                    _decode_fincid(record_encoded) + char)
+        else:
+            log.error("Suspect FINC ID: {0}".format(finc_id))
+            raise exc.MetaDataHubException(
+                "Suspect FINC ID: {0}".format(finc_id))
+
+class mdbag(collections.MutableMapping):
+    """
+    The rationale for this custom dict subclass is that out internal
+    data representation (metadata bag), which gets passed around from
+    transformation to transformation, should not rely on raw keys, but
+    should offer some kind of API to work with.
+    """
+    def __init__(self):
+        self.bag = {}
+
+    def __getitem__(self, key):
+        """ dict protocol.
+        """
+        if key == 'type':
+            return self.bag['import_conf'].type
+        elif key == 'source_id':
+            return self.bag['import_conf'].source_id
+        return self.bag[key]
+
+    def __setitem__(self, key, value):
+        """ dict protocol.
+        """
+        self.bag[key] = value
+
+    def __delitem__(self, key):
+        """ dict protocol.
+        """
+        del self.bag[key]
+
+    def __iter__(self):
+        """ dict protocol.
+        """
+        return self
+
+    def __len__(self):
+        """ dict protocol.
+        """
+        return len(self.bag)
+
+    def get_original_marc_value(self, tag):
+        """
+        Helper method to get the value of a MARC tag directly.
+        """
+        if self['type'] == 'marc':
+            return self['original'][tag].data
+        else:
+            raise exc.MetaDataHubException(
+                "Can't get MARC value for non-MARC datasource item.")
 
 if __name__ == '__main__':
     import doctest