Source

mdhub / mdhub / util.py

# -*- coding: utf-8 -*-

"""
FINC ID layout: AAABBBBB[C]

AAA (3 characters) => source_id (BSZ, PaperC, ...) - via encode_fincid
BBBBB[C] (5-6 characters) => finc-encoded record ID (within the source)

Exception
    For BSZ, the 8-digit RID is finc-encoded (BBBBB) and the 9th
    character of the PPN (C) is appended unchanged.

Finc ID space (5 characters, base 62): 0-916,132,831
"""

import string
import logging
import re
import collections
from mdhub import exc

logger = logging.getLogger(__name__)

# Base-62 alphabet: digits, then lowercase, then uppercase ASCII letters.
# ``string.ascii_letters`` is used instead of ``string.letters`` because the
# latter is locale-dependent on Python 2 and does not exist on Python 3; the
# encoding needs exactly these 62 symbols in a fixed order.
CODES = string.digits + string.ascii_letters

def _encode_fincid(identifier, length=5):
    """
    Internal method which doesn't do sanity check. Should
    not be used directly.

    Converts the integer ``identifier`` to base 62 using the symbols in
    ``CODES`` and left-pads the result with '0' up to ``length`` characters.
    Values that need more than ``length`` symbols are returned unpadded and
    unshortened; values <= 0 yield only the padding.
    """
    symbols = []
    while identifier > 0:
        # divmod() uses floor division, so ``identifier`` stays an integer on
        # both Python 2 and Python 3; true division (``/``) would turn it
        # into a float on Python 3 and break the ``CODES`` lookup.
        identifier, remainder = divmod(identifier, 62)
        symbols.append(CODES[remainder])
    # Symbols were collected least-significant first.
    return ''.join(reversed(symbols)).rjust(length, '0')

def encode_fincid(identifier, source_id=None):
    """ Encodes a record identifier to a FINC-ID. Optionally a ``source_id``
    can be blended in.

    ``identifier`` may be an int, a string of digits, or a PPN-style string
    (digits followed by exactly one non-digit character, typically ``X``);
    that trailing character is appended to the result unchanged.
    ``source_id``, if given, may be an int or a string of digits. It is
    encoded and padded to three characters and placed in front of the record
    part, which is encoded and padded to five characters.

    Raises ``exc.MetaDataHubException`` if a string ``identifier`` is
    neither numeric nor PPN-like, or if a string ``source_id`` is not
    numeric.

    >>> encode_fincid(0)
    '00000'
    >>> encode_fincid(1)
    '00001'
    >>> encode_fincid(916132831)
    'ZZZZZ'

    >>> encode_fincid('0')
    '00000'
    >>> encode_fincid('1')
    '00001'
    >>> encode_fincid('916132831')
    'ZZZZZ'

    >>> encode_fincid(0, source_id=1)
    '00100000'
    >>> encode_fincid(1, source_id=1)
    '00100001'
    >>> encode_fincid(916132831, source_id=1)
    '001ZZZZZ'

    >>> encode_fincid(0, source_id=0)
    '00000000'
    >>> encode_fincid(1, source_id=1)
    '00100001'
    >>> encode_fincid(916132831, source_id=238327)
    'ZZZZZZZZ'

    >>> encode_fincid('0X')
    '00000X'
    >>> encode_fincid('1X')
    '00001X'
    >>> encode_fincid('916132831X')
    'ZZZZZX'

    >>> encode_fincid('0X', source_id=0)
    '00000000X'
    >>> encode_fincid('1X', source_id=1)
    '00100001X'
    >>> encode_fincid('916132831X', source_id=238327)
    'ZZZZZZZZX'
    """
    char = None
    if not isinstance(identifier, int):
        if re.match(r'^\d+$', identifier):
            # Default numeric identifier
            identifier = int(identifier)
        else:
            # PPN-style identifier, numbers followed by a letter (X
            # typically). The pattern is anchored at the end so that any
            # text after that single character is rejected instead of being
            # silently dropped from the encoded ID.
            ppn = re.match(r'^(\d+)([^\d])$', identifier)
            if ppn is None:
                message = (
                    "Given identifier is neither numeric nor PPN-like: {0}"
                    .format(identifier))
                logger.error(message)
                raise exc.MetaDataHubException(message)
            digits, char = ppn.groups()
            identifier = int(digits)

    if source_id is not None and not isinstance(source_id, int):
        # source_id should be int
        if not re.match(r'^\d+$', source_id):
            message = "Given source_id is not numeric: {0}".format(source_id)
            logger.error(message)
            raise exc.MetaDataHubException(message)
        source_id = int(source_id)

    parts = []
    if source_id is not None:
        parts.append(_encode_fincid(source_id, length=3))
    parts.append(_encode_fincid(identifier, length=5))
    if char is not None:
        parts.append(char)
    return ''.join(parts)

def _decode_fincid(finc_id):
    """
    Internal helper that performs no sanity checks; use
    ``decode_fincid`` instead of calling this directly.

    Reads ``finc_id`` as a base-62 number written with the symbols in
    ``CODES`` (most significant symbol first) and returns its value as
    a decimal string.

    >>> decode_fincid('00000')
    '0'
    >>> decode_fincid('000g8')
    '1000'
    >>> decode_fincid('ZZZZZ')
    '916132831'

    >>> decode_fincid('00000X')
    '0X'
    >>> decode_fincid('000g8X')
    '1000X'
    >>> decode_fincid('ZZZZZX')
    '916132831X'

    >>> decode_fincid('00100000X', with_source=True)
    ('1', '0X')
    >>> decode_fincid('001000g8X', with_source=True)
    ('1', '1000X')
    >>> decode_fincid('001ZZZZZX', with_source=True)
    ('1', '916132831X')
    """
    total = 0
    # Walk from the least significant symbol so the exponent is the index.
    for exponent, symbol in enumerate(finc_id[::-1]):
        total += CODES.index(symbol) * 62 ** exponent
    return str(total)

def decode_fincid(finc_id, with_source=False):
    """
    Decode a FINC ID after a basic length check.

    Without ``with_source`` the ID must be 5 or 6 characters long and the
    decoded record ID is returned as a string. With ``with_source`` the ID
    must be 8 or 9 characters long; the first three characters are decoded
    as the source and a ``(source, record)`` tuple of strings is returned.
    In both cases an extra trailing character is appended to the decoded
    record ID unchanged.

    Raises ``exc.MetaDataHubException`` for any other length.
    """
    prefix_len = 3 if with_source else 0
    # Number of characters beyond the optional source prefix and the
    # five-character record part; only zero or one is acceptable.
    surplus = len(finc_id) - prefix_len - 5
    if surplus not in (0, 1):
        message = "Suspect FINC ID: {0}".format(finc_id)
        logger.error(message)
        raise exc.MetaDataHubException(message)

    source = _decode_fincid(finc_id[:prefix_len]) if with_source else None
    record = _decode_fincid(finc_id[prefix_len:prefix_len + 5])
    if surplus:
        record += finc_id[prefix_len + 5]
    if with_source:
        return (source, record)
    return record

try:
    from collections.abc import MutableMapping as _MutableMapping
except ImportError:  # Python 2: the ABCs live directly in ``collections``
    from collections import MutableMapping as _MutableMapping


class mdbag(_MutableMapping):
    """
    The rationale for this custom dict subclass is that our internal
    data representation (metadata bag), which gets passed around from
    transformation to transformation, should not rely on raw keys, but
    should offer some kind of API to work with.

    Items are kept in the plain dict ``self.bag``. The keys ``'type'`` and
    ``'source_id'`` are special on lookup: they are read from the object
    stored under ``'import_conf'``. Iteration and ``len()`` only cover the
    keys actually stored in ``self.bag``.
    """
    def __init__(self):
        self.bag = {}

    def __getitem__(self, key):
        """ dict protocol.

        ``'type'`` and ``'source_id'`` are answered from the attributes of
        the ``'import_conf'`` item; every other key is a plain lookup and
        raises ``KeyError`` when missing.
        """
        if key == 'type':
            return self.bag['import_conf'].type
        elif key == 'source_id':
            return self.bag['import_conf'].source_id
        return self.bag[key]

    def __setitem__(self, key, value):
        """ dict protocol.
        """
        self.bag[key] = value

    def __delitem__(self, key):
        """ dict protocol.
        """
        del self.bag[key]

    def __iter__(self):
        """ dict protocol.

        Returns an iterator over the stored keys. Returning ``self`` here
        would not work: the bag has no ``next``/``__next__`` method, so
        ``iter()`` and the ``keys()``/``items()``/``values()`` mixins would
        fail with a ``TypeError``.
        """
        return iter(self.bag)

    def __len__(self):
        """ dict protocol.
        """
        return len(self.bag)

    def get_original_marc_value(self, tag):
        """
        Helper method to get the value of a MARC tag directly.

        Returns ``self['original'][tag].data`` when ``self['type']`` is
        ``'marc'``; raises ``exc.MetaDataHubException`` otherwise.
        """
        if self['type'] == 'marc':
            return self['original'][tag].data
        else:
            raise exc.MetaDataHubException(
                "Can't get MARC value for non-MARC datasource item.")

if __name__ == '__main__':
    # Run the doctests embedded in this module's docstrings when the file
    # is executed as a script.
    from doctest import testmod
    testmod()
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.