Commits

Thomas Waldmann committed ebd15cd

add new storage-ng storage/indexing/... subsystem

  • Parent commits b2542a3
  • Branches storage-ng

Files changed (35)

File MoinMoin/storage/__init__.py

+# Copyright: 2011 MoinMoin:RonnyPfannschmidt
+# Copyright: 2011 MoinMoin:ThomasWaldmann
+# License: GNU GPL v2 (or any later version), see LICENSE.txt for details.
+
+"""
+MoinMoin - storage subsystem
+============================
+
+We use a layered approach like this::
+
+ Indexing Middleware               does complex stuff like indexing, searching,
+ |                                 listing, lookup by name, ACL checks, ...
+ v
+ Routing  Middleware               dispatches to multiple backends based on the
+ |                 |               name, cares about absolute and relative names
+ v                 v
+ "stores" Backend  Other Backend   simple stuff: store, get, destroy revisions
+ |           |
+ v           v
+ meta store  data store            simplest stuff: store, get, destroy and iterate
+                                   over key/value pairs
+"""
+
+
+CONTENT, USERPROFILES = 'content', 'userprofiles'
+
+BACKENDS_PACKAGE = 'storage.backends'
+
+
+def backend_from_uri(uri):
+    """
+    create a backend instance for uri
+    """
+    backend_name_uri = uri.split(':', 1)
+    if len(backend_name_uri) != 2:
+        raise ValueError("malformed backend uri: %s" % backend_uri)
+    backend_name, backend_uri = backend_name_uri
+    module = __import__(BACKENDS_PACKAGE + '.' + backend_name, globals(), locals(), ['Backend', ])
+    return module.Backend.from_uri(backend_uri)
+
+
+def create_mapping(uri, mounts_acls):
+    namespace_mapping = [(mounts_acls[nsname][0],
+                          backend_from_uri(uri % dict(nsname=nsname)),
+                          mounts_acls[nsname][1])
+                         for nsname in mounts_acls]
+    # we need the longest mountpoints first, shortest last (-> '' is very last)
+    return sorted(namespace_mapping, key=lambda x: len(x[0]), reverse=True)
+
+
+def create_simple_mapping(uri='stores:fs:instance',
+                          content_acl=None, user_profile_acl=None):
+    """
+    When configuring storage, the admin needs to provide a namespace_mapping.
+    To ease creation of such a mapping, this function provides sane defaults
+    for different types of stores.
+    The admin can just call this function, pass a hint about which type of
+    stores to use, and a proper mapping is returned.
+
+    :param uri: '<backend_name>:<backend_uri>' (general form)
+                 backend_name must be a backend module name (e.g. stores)
+                 the backend_uri must have a %(nsname)s placeholder; it gets replaced
+                 by the CONTENT, USERPROFILES strings and the result is given
+                 to that backend's constructor
+
+                 for the 'stores' backend, backend_uri looks like '<store_name>:<store_uri>'
+                 store_name must be a store module name (e.g. fs)
+                 the store_uri must have a %(kind)s placeholder; it gets replaced
+                 by 'meta' or 'data' and the result is given to that store's constructor
+
+                 e.g.:
+                 'stores:fs:/path/to/store/%(nsname)s/%(kind)s' will create a mapping
+                 using the 'stores' backend with 'fs' stores; everything will be
+                 stored below /path/to/store/.
+    """
+    # if no acls are given, use something mostly harmless:
+    if not content_acl:
+        content_acl = dict(before=u'', default=u'All:read,write,create', after=u'', hierarchic=False)
+    if not user_profile_acl:
+        user_profile_acl = dict(before=u'All:', default=u'', after=u'', hierarchic=False)
+    mounts_acls = {
+        CONTENT: ('', content_acl),
+        USERPROFILES: ('UserProfile', user_profile_acl),
+    }
+    return create_mapping(uri, mounts_acls)
+
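A minimal configuration sketch using these helpers (the instance path is
illustrative; a wiki config would consume the returned namespace_mapping):

    from storage import create_simple_mapping

    # %(nsname)s and %(kind)s get filled in as described in the docstring
    namespace_mapping = create_simple_mapping(
        uri='stores:fs:/path/to/instance/%(nsname)s/%(kind)s')
    for mountpoint, backend, acl in namespace_mapping:
        print mountpoint, backend, acl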

File MoinMoin/storage/backends/__init__.py

+# Copyright: 2011 MoinMoin:RonnyPfannschmidt
+# Copyright: 2011 MoinMoin:ThomasWaldmann
+# License: GNU GPL v2 (or any later version), see LICENSE.txt for details.
+
+"""
+MoinMoin - backend base classes
+"""
+
+
+from __future__ import absolute_import, division
+
+from abc import abstractmethod, ABCMeta
+
+
+class BackendBase(object):
+    """
+    abstract base class for backends (read-only)
+    """
+    __metaclass__ = ABCMeta
+
+    @classmethod
+    @abstractmethod
+    def from_uri(cls, uri):
+        """
+        create an instance using the data given in uri
+        """
+
+    @abstractmethod
+    def open(self):
+        """
+        open the backend, allocate resources
+        """
+
+    @abstractmethod
+    def close(self):
+        """
+        close the backend, free resources (except the stored meta/data!)
+        """
+
+    @abstractmethod
+    def __iter__(self):
+        """
+        iterate over metaids
+        """
+
+    @abstractmethod
+    def retrieve(self, metaid):
+        """
+        return meta, data related to metaid
+        """
+
+
+class MutableBackendBase(BackendBase):
+    """
+    same as BackendBase, but read/write
+    """
+    @abstractmethod
+    def create(self):
+        """
+        create the backend
+        """
+
+    @abstractmethod
+    def destroy(self):
+        """
+        destroy the backend, erase all meta/data it contains
+        """
+
+    @abstractmethod
+    def store(self, meta, data):
+        """
+        store meta, data into the backend, return the metaid
+        """
+
+    @abstractmethod
+    def remove(self, metaid):
+        """
+        delete meta, data related to metaid from the backend
+        """

File MoinMoin/storage/backends/_tests/__init__.py

+# Copyright: 2011 MoinMoin:ThomasWaldmann
+# License: GNU GPL v2 (or any later version), see LICENSE.txt for details.
+
+"""
+MoinMoin - backend tests
+"""
+
+
+from __future__ import absolute_import, division
+
+from StringIO import StringIO
+
+import pytest
+
+from config import SIZE, HASH_ALGORITHM
+
+class BackendTestBase(object):
+    def setup_method(self, method):
+        """
+        self.be needs to be an opened backend
+        """
+        raise NotImplementedError
+
+    def teardown_method(self, method):
+        """
+        close self.be
+        """
+        self.be.close()
+
+    def test_getrevision_raises(self):
+        with pytest.raises(KeyError):
+            self.be.retrieve('doesnotexist')
+
+    def test_iter(self):
+        assert list(self.be) == []
+
+
+class MutableBackendTestBase(BackendTestBase):
+    def setup_method(self, method):
+        """
+        self.be needs to be a created/opened backend
+        """
+        raise NotImplementedError
+
+    def teardown_method(self, method):
+        """
+        close and destroy self.be
+        """
+        self.be.close()
+        self.be.destroy()
+
+    def test_getrevision_raises(self):
+        with pytest.raises(KeyError):
+            self.be.retrieve('doesnotexist')
+
+    def test_store_get_del(self):
+        meta = dict(foo='bar')
+        data = 'baz'
+        metaid = self.be.store(meta, StringIO(data))
+        m, d = self.be.retrieve(metaid)
+        assert m == meta
+        assert d.read() == data
+        self.be.remove(metaid)
+        with pytest.raises(KeyError):
+            self.be.retrieve(metaid)
+
+    def test_store_check_size(self):
+        # no size
+        meta = dict(name='foo')
+        data = 'barbaz'
+        metaid = self.be.store(meta, StringIO(data))
+        m, d = self.be.retrieve(metaid)
+        assert meta[SIZE] == 6
+        # correct size
+        meta = dict(name='foo', size=6)
+        data = 'barbaz'
+        metaid = self.be.store(meta, StringIO(data))
+        m, d = self.be.retrieve(metaid)
+        assert meta[SIZE] == 6
+        # wrong size (less data than size declared in meta)
+        meta = dict(name='foo', size=42)
+        data = 'barbaz'
+        with pytest.raises(ValueError):
+            metaid = self.be.store(meta, StringIO(data))
+        # wrong size (more data than size declared in meta)
+        meta = dict(name='foo', size=3)
+        data = 'barbaz'
+        with pytest.raises(ValueError):
+            metaid = self.be.store(meta, StringIO(data))
+
+    def test_store_check_hash(self):
+        # no hash
+        meta = dict(name='foo')
+        data = 'barbaz'
+        metaid = self.be.store(meta, StringIO(data))
+        m, d = self.be.retrieve(metaid)
+        hashcode = meta[HASH_ALGORITHM]
+        # correct hash
+        meta = dict(name='foo')
+        meta[HASH_ALGORITHM] = hashcode
+        data = 'barbaz'
+        metaid = self.be.store(meta, StringIO(data))
+        m, d = self.be.retrieve(metaid)
+        assert meta[HASH_ALGORITHM] == hashcode
+        # wrong data -> hash mismatch
+        meta = dict(name='foo')
+        meta[HASH_ALGORITHM] = hashcode
+        data = 'brrbrr'
+        with pytest.raises(ValueError):
+            metaid = self.be.store(meta, StringIO(data))
+
+    def test_iter(self):
+        mds = [#(metadata items, data str)
+                (dict(name='one'), 'ONE'),
+                (dict(name='two'), 'TWO'),
+                (dict(name='three'), 'THREE'),
+              ]
+        expected_result = set()
+        for m, d in mds:
+            k = self.be.store(m, StringIO(d))
+            # note: store_revision injects some new keys (like dataid, metaid, size, hash key) into m
+            m = tuple(sorted(m.items()))
+            expected_result.add((k, m, d))
+        result = set()
+        for k in self.be:
+            m, d = self.be.retrieve(k)
+            m = tuple(sorted(m.items()))
+            result.add((k, m, d.read()))
+        assert result == expected_result
+

File MoinMoin/storage/backends/_tests/test_fileserver.py

+# Copyright: 2011 MoinMoin:ThomasWaldmann
+# License: GNU GPL v2 (or any later version), see LICENSE.txt for details.
+
+"""
+MoinMoin - fileserver backend tests
+"""
+
+
+from __future__ import absolute_import, division
+
+import os
+import shutil
+import tempfile
+
+import pytest
+
+from config import MTIME
+from ..fileserver import Backend
+from . import BackendTestBase
+
+
+class TestFileServerBackend(BackendTestBase):
+    def setup_method(self, method):
+        self.path = path = tempfile.mkdtemp()
+        self.be = Backend(path)
+        self.be.open()
+
+    def teardown_method(self, method):
+        self.be.close()
+        shutil.rmtree(self.path)
+
+    def _prepare(self, items):
+        expected_result = set()
+        for name, meta, data in items:
+            fn = os.path.join(self.path, name)
+            dn = os.path.dirname(fn)
+            try:
+                os.makedirs(dn)
+            except OSError:
+                pass  # the directory may already exist
+            with open(fn, 'wb') as f:
+                f.write(data)
+            meta = tuple(sorted(meta.items()))
+            expected_result.add((meta, data))
+        return expected_result
+
+    def test_files(self):
+        # note: as we can only store the data into the file system, meta can
+        # only have items that are generated by the fileserver backend:
+        items = [#name,  meta,   data
+                 ('foo.png', dict(size=11, contenttype='image/png'), 'png content'),
+                 ('bar.txt', dict(size=12, contenttype='text/plain'), 'text content'),
+                ]
+        expected_result = self._prepare(items)
+        result = set()
+        for i in self.be:
+            meta, data = self.be.retrieve(i)
+            # we don't want to check mtime
+            del meta[MTIME]
+            meta = tuple(sorted(meta.items()))
+            data = data.read()
+            result.add((meta, data))
+        assert result == expected_result
+
+    def test_dir(self):
+        # note: as we can only store the data into the file system, meta can
+        # only have items that are generated by the fileserver backend:
+        items = [#name,  meta,   data
+                 ('dir/foo.png', dict(size=11, contenttype='image/png'), 'png content'),
+                 ('dir/bar.txt', dict(size=12, contenttype='text/plain'), 'text content'),
+                ]
+        expected_result = self._prepare(items)
+        dir_meta = tuple(sorted(dict(size=0, contenttype='text/x.moin.wiki;charset=utf-8').items()))
+        dir_data = """\
+= Directory contents =
+ * [[../]]
+ * [[/bar.txt|bar.txt]]
+ * [[/foo.png|foo.png]]
+""".replace('\n', '\r\n')
+        expected_result.add((dir_meta, dir_data))
+        result = set()
+        for i in self.be:
+            meta, data = self.be.retrieve(i)
+            # we don't want to check mtime
+            del meta[MTIME]
+            meta = tuple(sorted(meta.items()))
+            data = data.read()
+            result.add((meta, data))
+        assert result == expected_result
+
+

File MoinMoin/storage/backends/_tests/test_stores.py

+# Copyright: 2011 MoinMoin:ThomasWaldmann
+# License: GNU GPL v2 (or any later version), see LICENSE.txt for details.
+
+"""
+MoinMoin - stores backend tests
+
+Note: theoretically, it should be enough to test with one kind of store,
+      but we'd better test with a fs AND a memory store.
+"""
+
+
+from __future__ import absolute_import, division
+
+import pytest
+
+from ..stores import MutableBackend
+from . import MutableBackendTestBase
+
+from storage.stores.memory import BytesStore as MemoryBytesStore
+from storage.stores.memory import FileStore as MemoryFileStore
+
+class TestMemoryBackend(MutableBackendTestBase):
+    def setup_method(self, method):
+        meta_store = MemoryBytesStore()
+        data_store = MemoryFileStore()
+        self.be = MutableBackend(meta_store, data_store)
+        self.be.create()
+        self.be.open()
+
+import os
+import tempfile
+
+from storage.stores.fs import BytesStore as FSBytesStore
+from storage.stores.fs import FileStore as FSFileStore
+
+class TestFSBackend(MutableBackendTestBase):
+    def setup_method(self, method):
+        meta_path = tempfile.mkdtemp()
+        os.rmdir(meta_path)
+        meta_store = FSBytesStore(meta_path)
+        data_path = tempfile.mkdtemp()
+        os.rmdir(data_path)
+        data_store = FSFileStore(data_path)
+        self.be = MutableBackend(meta_store, data_store)
+        self.be.create()
+        self.be.open()
+
+

File MoinMoin/storage/backends/_util.py

+# Copyright: 2011 MoinMoin:RonnyPfannschmidt
+# Copyright: 2011 MoinMoin:ThomasWaldmann
+# License: GNU GPL v2 (or any later version), see LICENSE.txt for details.
+
+"""
+MoinMoin - backend utilities
+"""
+
+
+from __future__ import absolute_import, division
+
+import hashlib
+
+
+class TrackingFileWrapper(object):
+    """
+    Wraps a file and computes hashcode and file size while it is read.
+    Requires that initially the realfile is open and at pos 0.
+    Users need to call .read(blocksize) until it does not return any more data.
+    After this self.hash and self.size will have the wanted values.
+    self.hash is the hash instance, you may want to call self.hash.hexdigest().
+    """
+    def __init__(self, realfile, hash_method='sha1'):
+        self._realfile = realfile
+        self._read = realfile.read
+        self._hash = hashlib.new(hash_method)
+        self._size = 0
+        self._finished = False
+        fpos = realfile.tell()
+        if fpos:
+            raise ValueError("file needs to be at pos 0")
+
+    def read(self, size=None):
+        # XXX: workaround for werkzeug.wsgi.LimitedStream
+        #      which expects None instead of -1 for "read everything"
+        if size is None:
+            data = self._read()
+            self._finished = True
+        else:
+            data = self._read(size)
+            if not data:
+                self._finished = True
+        self._hash.update(data)
+        self._size += len(data)
+        return data
+
+    @property
+    def size(self):
+        if not self._finished:
+            raise AttributeError("do not access size attribute before having read all data")
+        return self._size
+
+    @property
+    def hash(self):
+        if not self._finished:
+            raise AttributeError("do not access hash attribute before having read all data")
+        return self._hash
+
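Typical use, per the docstring above (the data value is illustrative):

    from StringIO import StringIO
    from storage.backends._util import TrackingFileWrapper

    tfw = TrackingFileWrapper(StringIO('some bytes'), hash_method='sha1')
    while tfw.read(4096):
        pass                        # consume all data
    print tfw.size                  # 10
    print tfw.hash.hexdigest()      # sha1 hexdigest of 'some bytes'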

File MoinMoin/storage/backends/fileserver.py

+# Copyright: 2011 MoinMoin:ThomasWaldmann
+# License: GNU GPL v2 (or any later version), see LICENSE.txt for details.
+
+"""
+MoinMoin - fileserver backend, exposing part of the filesystem (read-only)
+
+Files are exposed as single-revision items:
+
+  - metadata is made up from fs metadata + mimetype guessing
+  - data is read from the file
+
+Directories create a virtual directory item, listing the files in that
+directory.
+"""
+
+
+from __future__ import absolute_import, division
+
+import os
+import errno
+import stat
+from StringIO import StringIO
+
+from config import MTIME, SIZE, CONTENTTYPE
+from . import BackendBase
+
+
+class Backend(BackendBase):
+    """
+    exposes part of the filesystem (read-only)
+    """
+    @classmethod
+    def from_uri(cls, uri):
+        return cls(uri)
+
+    def __init__(self, path):
+        """
+        :param path: base directory (all files/dirs below will be exposed)
+        """
+        self.path = unicode(path)
+
+    def open(self):
+        pass
+
+    def close(self):
+        pass
+
+    def _mkpath(self, key):
+        # XXX unsafe keys?
+        return os.path.join(self.path, key)
+
+    def _mkkey(self, path):
+        root = self.path
+        assert path.startswith(root)
+        key = path[len(root)+1:]
+        return key
+
+    def __iter__(self):
+        # TODO: currently we just yield the relative <path>. yielding
+        # <path>/<mtime> instead would make the revid change whenever a file
+        # is updated, so the indexer's update() method could efficiently
+        # update the index.
+        for dirpath, dirnames, filenames in os.walk(self.path):
+            key = self._mkkey(dirpath)
+            if key:
+                yield key
+            for filename in filenames:
+                yield self._mkkey(os.path.join(dirpath, filename))
+
+    def _get_meta(self, fn):
+        path = self._mkpath(fn)
+        try:
+            st = os.stat(path)
+        except OSError as e:
+            if e.errno == errno.ENOENT:
+                raise KeyError(fn)
+            raise
+        meta = {}
+        meta[MTIME] = int(st.st_mtime) # use int, not float
+        if stat.S_ISDIR(st.st_mode):
+            # directory
+            # we create a virtual wiki page listing links to subitems:
+            ct = 'text/x.moin.wiki;charset=utf-8'
+            size = 0
+        elif stat.S_ISREG(st.st_mode):
+            # normal file
+            # TODO: real mimetype guessing
+            if fn.endswith('.png'):
+                ct = 'image/png'
+            elif fn.endswith('.txt'):
+                ct = 'text/plain'
+            else:
+                ct = 'application/octet-stream'
+            size = int(st.st_size) # use int instead of long
+        else:
+            # symlink, device file, etc.
+            ct = 'application/octet-stream'
+            size = 0
+        meta[CONTENTTYPE] = ct
+        meta[SIZE] = size
+        return meta
+
+    def _make_directory_page(self, path):
+        try:
+            dirs = []
+            files = []
+            names = os.listdir(path)
+            for name in names:
+                filepath = os.path.join(path, name)
+                if os.path.isdir(filepath):
+                    dirs.append(name)
+                else:
+                    files.append(name)
+            content = [
+                u"= Directory contents =",
+                u" * [[../]]",
+            ]
+            content.extend(u" * [[/%s|%s/]]" % (name, name) for name in sorted(dirs))
+            content.extend(u" * [[/%s|%s]]" % (name, name) for name in sorted(files))
+            content.append(u"")
+            content = u'\r\n'.join(content)
+        except OSError as err:
+            content = unicode(err)
+        return content
+
+    def _get_data(self, fn):
+        path = self._mkpath(fn)
+        try:
+            st = os.stat(path)
+            if stat.S_ISDIR(st.st_mode):
+                data = self._make_directory_page(path)
+                return StringIO(data.encode('utf-8'))
+            elif stat.S_ISREG(st.st_mode):
+                return open(path, 'rb')
+            else:
+                return StringIO('')
+        except (OSError, IOError) as e:
+            if e.errno == errno.ENOENT:
+                raise KeyError(fn)
+            raise
+
+    def retrieve(self, fn):
+        meta = self._get_meta(fn)
+        data = self._get_data(fn)
+        return meta, data
+
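A small usage sketch for this backend (the served directory is illustrative):

    from config import CONTENTTYPE, SIZE
    from storage.backends.fileserver import Backend

    be = Backend.from_uri('/srv/files')    # expose /srv/files read-only
    be.open()
    for key in be:
        meta, data = be.retrieve(key)
        print key, meta[CONTENTTYPE], meta[SIZE]
        data.close()
    be.close()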

File MoinMoin/storage/backends/stores.py

+# Copyright: 2011 MoinMoin:RonnyPfannschmidt
+# Copyright: 2011 MoinMoin:ThomasWaldmann
+# License: GNU GPL v2 (or any later version), see LICENSE.txt for details.
+
+"""
+MoinMoin - backend that ties together 2 key/value stores
+
+A meta store (a BytesStore):
+
+- key = revid UUID (bytes, ascii)
+- value = serialized revision metadata (bytes, utf-8 encoded JSON)
+
+A data store (a FileStore):
+
+- key = dataid UUID (bytes, ascii)
+- value = file (gets/returns open file instances, to read/write binary data)
+
+See the stores package for already implemented key/value stores.
+"""
+
+
+from __future__ import absolute_import, division
+
+from uuid import uuid4
+make_uuid = lambda: unicode(uuid4().hex)
+
+from config import REVID, DATAID, SIZE, HASH_ALGORITHM
+
+from . import BackendBase, MutableBackendBase
+from ._util import TrackingFileWrapper
+
+try:
+    import json
+except ImportError:
+    import simplejson as json
+
+STORES_PACKAGE = 'storage.stores'
+
+
+class Backend(BackendBase):
+    """
+    ties together a store for metadata and a store for data, readonly
+    """
+    @classmethod
+    def from_uri(cls, uri):
+        store_name_uri = uri.split(':', 1)
+        if len(store_name_uri) != 2:
+            raise ValueError("malformed store uri: %s" % uri)
+        store_name, store_uri = store_name_uri
+        module = __import__(STORES_PACKAGE + '.' + store_name, globals(), locals(), ['BytesStore', 'FileStore', ])
+        meta_store_uri = store_uri % dict(kind='meta')
+        data_store_uri = store_uri % dict(kind='data')
+        return cls(module.BytesStore(meta_store_uri), module.FileStore(data_store_uri))
+
+    def __init__(self, meta_store, data_store):
+        """
+        :param meta_store: a BytesStore for metadata
+        :param data_store: a FileStore for data
+        """
+        self.meta_store = meta_store
+        self.data_store = data_store
+
+    def open(self):
+        self.meta_store.open()
+        self.data_store.open()
+
+    def close(self):
+        self.meta_store.close()
+        self.data_store.close()
+
+    def __iter__(self):
+        for metaid in self.meta_store:
+            yield metaid
+
+    def _deserialize(self, meta_str):
+        text = meta_str.decode('utf-8')
+        meta = json.loads(text)
+        return meta
+
+    def _get_meta(self, metaid):
+        meta = self.meta_store[metaid]
+        # XXX Idea: we could check the type we get from the store:
+        # if it is a str/bytes, just use it "as is",
+        # if it is a file, read and close it (so we have a str/bytes).
+        return self._deserialize(meta)
+
+    def _get_data(self, dataid):
+        data = self.data_store[dataid]
+        # XXX Idea: we could check the type we get from the store:
+        # if it is a file, just return it "as is",
+        # if it is a str/bytes, wrap it into StringIO (so we always return
+        # a file-like object).
+        return data
+
+    def retrieve(self, metaid):
+        meta = self._get_meta(metaid)
+        dataid = meta[DATAID]
+        data = self._get_data(dataid)
+        return meta, data
+
+
+class MutableBackend(Backend, MutableBackendBase):
+    """
+    same as Backend, but read/write
+    """
+    def create(self):
+        self.meta_store.create()
+        self.data_store.create()
+
+    def destroy(self):
+        self.meta_store.destroy()
+        self.data_store.destroy()
+
+    def _serialize(self, meta):
+        text = json.dumps(meta, ensure_ascii=False)
+        meta_str = text.encode('utf-8')
+        return meta_str
+
+    def _store_meta(self, meta):
+        if REVID not in meta:
+            # Item.clear_revision calls us with REVID already present
+            meta[REVID] = make_uuid()
+        metaid = meta[REVID]
+        meta = self._serialize(meta)
+        # XXX Idea: we could check the type the store wants from us:
+        # if it is a str/bytes (BytesStore), just use meta "as is",
+        # if it is a file (FileStore), wrap it into StringIO and give that to the store.
+        self.meta_store[metaid] = meta
+        return metaid
+
+    def store(self, meta, data):
+        # XXX Idea: we could check the type the store wants from us:
+        # if it is a str/bytes (BytesStore), just use meta "as is",
+        # if it is a file (FileStore), wrap it into StringIO and give that to the store.
+        if DATAID not in meta:
+            tfw = TrackingFileWrapper(data, hash_method=HASH_ALGORITHM)
+            dataid = make_uuid()
+            self.data_store[dataid] = tfw
+            meta[DATAID] = dataid
+            # check whether size and hash are consistent:
+            size_expected = meta.get(SIZE)
+            size_real = tfw.size
+            if size_expected is not None and size_expected != size_real:
+                raise ValueError("computed data size (%d) does not match data size declared in metadata (%d)" % (
+                                 size_real, size_expected))
+            meta[SIZE] = size_real
+            hash_expected = meta.get(HASH_ALGORITHM)
+            hash_real = tfw.hash.hexdigest()
+            if hash_expected is not None and hash_expected != hash_real:
+                raise ValueError("computed data hash (%s) does not match data hash declared in metadata (%s)" % (
+                                 hash_real, hash_expected))
+            meta[HASH_ALGORITHM] = hash_real
+        else:
+            dataid = meta[DATAID]
+            # we just assume stuff is correct if you pass it with a dataid
+            if dataid not in self.data_store:
+                self.data_store[dataid] = data
+        # if something goes wrong below, the data shall be purged by a garbage collection
+        metaid = self._store_meta(meta)
+        return metaid
+
+    def _del_meta(self, metaid):
+        del self.meta_store[metaid]
+
+    def _del_data(self, dataid):
+        del self.data_store[dataid]
+
+    def remove(self, metaid):
+        meta = self._get_meta(metaid)
+        dataid = meta[DATAID]
+        self._del_meta(metaid)
+        self._del_data(dataid)
+
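A usage sketch, combining this backend with the memory stores used by the
tests in this commit:

    from StringIO import StringIO
    from config import SIZE
    from storage.backends.stores import MutableBackend
    from storage.stores.memory import BytesStore, FileStore

    be = MutableBackend(BytesStore(), FileStore())
    be.create()
    be.open()
    metaid = be.store(dict(name=u'example'), StringIO('hello'))
    meta, data = be.retrieve(metaid)
    print meta[SIZE], data.read()          # 5 hello
    be.close()
    be.destroy()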

File MoinMoin/storage/middleware/__init__.py

+# Copyright: 2011 MoinMoin:ThomasWaldmann
+# License: GNU GPL v2 (or any later version), see LICENSE.txt for details.
+
+"""
+MoinMoin - misc. middleware
+
+Middleware sits either on a backend or on another middleware.
+"""
+

File MoinMoin/storage/middleware/_tests/__init__.py

+# Copyright: 2011 MoinMoin:ThomasWaldmann
+# License: GNU GPL v2 (or any later version), see LICENSE.txt for details.
+
+"""
+MoinMoin - middleware tests
+"""

File MoinMoin/storage/middleware/_tests/test_indexing.py

+# Copyright: 2011 MoinMoin:ThomasWaldmann
+# License: GNU GPL v2 (or any later version), see LICENSE.txt for details.
+
+"""
+MoinMoin - indexing middleware tests
+"""
+
+
+from __future__ import absolute_import, division
+
+from StringIO import StringIO
+import hashlib
+
+import pytest
+
+from config import NAME, SIZE, ITEMID, REVID, DATAID, HASH_ALGORITHM, CONTENT, COMMENT
+
+from ..indexing import IndexingMiddleware
+
+from storage.backends.stores import MutableBackend
+from storage.stores.memory import BytesStore as MemoryBytesStore
+from storage.stores.memory import FileStore as MemoryFileStore
+
+
+class TestIndexingMiddleware(object):
+    def setup_method(self, method):
+        meta_store = MemoryBytesStore()
+        data_store = MemoryFileStore()
+        self.be = MutableBackend(meta_store, data_store)
+        self.be.create()
+        self.be.open()
+        index_dir = 'ix'
+        self.imw = IndexingMiddleware(index_dir=index_dir, backend=self.be)
+        self.imw.create()
+        self.imw.open()
+
+    def teardown_method(self, method):
+        self.imw.close()
+        self.imw.destroy()
+        self.be.close()
+        self.be.destroy()
+
+    def test_nonexisting_item(self):
+        item = self.imw[u'foo']
+        assert not item # does not exist
+
+    def test_store_revision(self):
+        item_name = u'foo'
+        data = 'bar'
+        item = self.imw[item_name]
+        rev = item.store_revision(dict(name=item_name), StringIO(data))
+        revid = rev.revid
+        # check if we have the revision now:
+        item = self.imw[item_name]
+        assert item # does exist
+        rev = item.get_revision(revid)
+        assert rev.meta[NAME] == item_name
+        assert rev.data.read() == data
+        revids = [rev.revid for rev in item.iter_revs()]
+        assert revids == [revid]
+
+    def test_overwrite_revision(self):
+        item_name = u'foo'
+        data = 'bar'
+        newdata = 'baz'
+        item = self.imw[item_name]
+        rev = item.store_revision(dict(name=item_name, comment=u'spam'), StringIO(data))
+        revid = rev.revid
+        # clear revision:
+        item.store_revision(dict(name=item_name, revid=revid, comment=u'no spam'), StringIO(newdata), overwrite=True)
+        # check if the revision was overwritten:
+        item = self.imw[item_name]
+        rev = item.get_revision(revid)
+        assert rev.meta[NAME] == item_name
+        assert rev.meta[COMMENT] == u'no spam'
+        assert rev.data.read() == newdata
+        revids = [rev.revid for rev in item.iter_revs()]
+        assert len(revids) == 1 # we still have the revision, cleared
+        assert revid in revids # it is still same revid
+
+    def test_destroy_revision(self):
+        item_name = u'foo'
+        item = self.imw[item_name]
+        rev = item.store_revision(dict(name=item_name, mtime=1), StringIO('bar'))
+        revid0 = rev.revid
+        rev = item.store_revision(dict(name=item_name, mtime=2), StringIO('baz'))
+        revid1 = rev.revid
+        rev = item.store_revision(dict(name=item_name, mtime=3), StringIO('...'))
+        revid2 = rev.revid
+        print "revids:", revid0, revid1, revid2
+        # destroy a non-current revision:
+        item.destroy_revision(revid0)
+        # check if the revision was destroyed:
+        item = self.imw[item_name]
+        with pytest.raises(KeyError):
+            item.get_revision(revid0)
+        revids = [rev.revid for rev in item.iter_revs()]
+        print "after destroy revid0", revids
+        assert sorted(revids) == sorted([revid1, revid2])
+        # destroy a current revision:
+        item.destroy_revision(revid2)
+        # check if the revision was destroyed:
+        item = self.imw[item_name]
+        with pytest.raises(KeyError):
+            item.get_revision(revid2)
+        revids = [rev.revid for rev in item.iter_revs()]
+        print "after destroy revid2", revids
+        assert sorted(revids) == sorted([revid1])
+        # destroy the last revision left:
+        item.destroy_revision(revid1)
+        # check if the revision was destroyed:
+        item = self.imw[item_name]
+        with pytest.raises(KeyError):
+            item.get_revision(revid1)
+        revids = [rev.revid for rev in item.iter_revs()]
+        print "after destroy revid1", revids
+        assert sorted(revids) == sorted([])
+
+    def test_destroy_item(self):
+        revids = []
+        item_name = u'foo'
+        item = self.imw[item_name]
+        rev = item.store_revision(dict(name=item_name, mtime=1), StringIO('bar'))
+        revids.append(rev.revid)
+        rev = item.store_revision(dict(name=item_name, mtime=2), StringIO('baz'))
+        revids.append(rev.revid)
+        # destroy item:
+        item.destroy_all_revisions()
+        # check if the item was destroyed:
+        item = self.imw[item_name]
+        assert not item # does not exist
+
+    def test_all_revisions(self):
+        item_name = u'foo'
+        item = self.imw[item_name]
+        item.store_revision(dict(name=item_name), StringIO('does not count, different name'))
+        item_name = u'bar'
+        item = self.imw[item_name]
+        item.store_revision(dict(name=item_name), StringIO('1st'))
+        item.store_revision(dict(name=item_name), StringIO('2nd'))
+        item = self.imw[item_name]
+        revs = [rev.data.read() for rev in item.iter_revs()]
+        assert len(revs) == 2
+        assert set(revs) == set(['1st', '2nd'])
+
+    def test_latest_revision(self):
+        item_name = u'foo'
+        item = self.imw[item_name]
+        item.store_revision(dict(name=item_name), StringIO('does not count, different name'))
+        item_name = u'bar'
+        item = self.imw[item_name]
+        item.store_revision(dict(name=item_name), StringIO('1st'))
+        expected_rev = item.store_revision(dict(name=item_name), StringIO('2nd'))
+        revs = list(self.imw.documents(all_revs=False, name=item_name))
+        assert len(revs) == 1  # there is only 1 latest revision
+        assert expected_rev.revid == revs[0].revid  # it is really the latest one
+
+    def test_auto_meta(self):
+        item_name = u'foo'
+        data = 'bar'
+        item = self.imw[item_name]
+        rev = item.store_revision(dict(name=item_name), StringIO(data))
+        print repr(rev.meta)
+        assert rev.meta[NAME] == item_name
+        assert rev.meta[SIZE] == len(data)
+        assert rev.meta[HASH_ALGORITHM] == hashlib.new(HASH_ALGORITHM, data).hexdigest()
+        assert ITEMID in rev.meta
+        assert REVID in rev.meta
+        assert DATAID in rev.meta
+
+    def test_documents(self):
+        item_name = u'foo'
+        item = self.imw[item_name]
+        rev1 = item.store_revision(dict(name=item_name), StringIO('x'))
+        rev2 = item.store_revision(dict(name=item_name), StringIO('xx'))
+        rev3 = item.store_revision(dict(name=item_name), StringIO('xxx'))
+        rev = self.imw.document(all_revs=True, size=2)
+        assert rev
+        assert rev.revid == rev2.revid
+        revs = list(self.imw.documents(all_revs=True, size=2))
+        assert len(revs) == 1
+        assert revs[0].revid == rev2.revid
+
+    def test_index_rebuild(self):
+        # first we index some stuff the slow "on-the-fly" way:
+        expected_latest_revids = []
+        item_name = u'foo'
+        item = self.imw[item_name]
+        r = item.store_revision(dict(name=item_name, mtime=1), StringIO('does not count, different name'))
+        expected_latest_revids.append(r.revid)
+        item_name = u'bar'
+        item = self.imw[item_name]
+        item.store_revision(dict(name=item_name, mtime=1), StringIO('1st'))
+        r = item.store_revision(dict(name=item_name, mtime=2), StringIO('2nd'))
+        expected_latest_revids.append(r.revid)
+
+        # now we remember the index contents built that way:
+        expected_latest_revs = list(self.imw.documents(all_revs=False))
+        expected_all_revs = list(self.imw.documents(all_revs=True))
+
+        print "*** all on-the-fly:"
+        self.imw.dump(all_revs=True)
+        print "*** latest on-the-fly:"
+        self.imw.dump(all_revs=False)
+
+        # now kill the index and do a full rebuild
+        self.imw.close()
+        self.imw.destroy()
+        self.imw.create()
+        self.imw.rebuild()
+        self.imw.open()
+
+        # read the index contents built that way:
+        all_revs = list(self.imw.documents(all_revs=True))
+        latest_revs = list(self.imw.documents(all_revs=False))
+        latest_revids = [rev.revid for rev in latest_revs]
+
+        print "*** all rebuilt:"
+        self.imw.dump(all_revs=True)
+        print "*** latest rebuilt:"
+        self.imw.dump(all_revs=False)
+
+        # should be all the same, order does not matter:
+        assert sorted(expected_all_revs) == sorted(all_revs)
+        assert sorted(expected_latest_revs) == sorted(latest_revs)
+        assert sorted(latest_revids) == sorted(expected_latest_revids)
+
+    def test_index_update(self):
+        # first we index some stuff the slow "on-the-fly" way:
+        expected_all_revids = []
+        expected_latest_revids = []
+        missing_revids = []
+        item_name = u'updated'
+        item = self.imw[item_name]
+        r = item.store_revision(dict(name=item_name, mtime=1), StringIO('updated 1st'))
+        expected_all_revids.append(r.revid)
+        # we update this item below, so we don't add it to expected_latest_revids
+        item_name = u'destroyed'
+        item = self.imw[item_name]
+        r = item.store_revision(dict(name=item_name, mtime=1), StringIO('destroyed 1st'))
+        destroy_revid = r.revid
+        # we destroy this item below, so we don't add it to expected_all_revids
+        # we destroy this item below, so we don't add it to expected_latest_revids
+        item_name = u'stayssame'
+        item = self.imw[item_name]
+        r = item.store_revision(dict(name=item_name, mtime=1), StringIO('stayssame 1st'))
+        expected_all_revids.append(r.revid)
+        # we update this item below, so we don't add it to expected_latest_revids
+        r = item.store_revision(dict(name=item_name, mtime=2), StringIO('stayssame 2nd'))
+        expected_all_revids.append(r.revid)
+        expected_latest_revids.append(r.revid)
+
+        # now build a fresh index at tmp location:
+        self.imw.create(tmp=True)
+        self.imw.rebuild(tmp=True)
+
+        # while the fresh index still sits at the tmp location, we update and add some items.
+        # this will not change the fresh index, but the old index we are still using.
+        item_name = u'updated'
+        item = self.imw[item_name]
+        r = item.store_revision(dict(name=item_name, mtime=2), StringIO('updated 2nd'))
+        expected_all_revids.append(r.revid)
+        expected_latest_revids.append(r.revid)
+        missing_revids.append(r.revid)
+        item_name = u'added'
+        item = self.imw[item_name]
+        r = item.store_revision(dict(name=item_name, mtime=1), StringIO('added 1st'))
+        expected_all_revids.append(r.revid)
+        expected_latest_revids.append(r.revid)
+        missing_revids.append(r.revid)
+        item_name = u'destroyed'
+        item = self.imw[item_name]
+        item.destroy_revision(destroy_revid)
+
+        # now switch to the not-quite-fresh-any-more index we have built:
+        self.imw.close()
+        self.imw.move_index()
+        self.imw.open()
+
+        # read the index contents we have now:
+        all_revids = [doc[REVID] for doc in self.imw._documents(all_revs=True)]
+        latest_revids = [doc[REVID] for doc in self.imw._documents(all_revs=False)]
+
+        # this index is outdated:
+        for missing_revid in missing_revids:
+            assert missing_revid not in all_revids
+            assert missing_revid not in latest_revids
+
+        # update the index:
+        self.imw.close()
+        self.imw.update()
+        self.imw.open()
+
+        # read the index contents we have now:
+        all_revids = [rev.revid for rev in self.imw.documents(all_revs=True)]
+        latest_revids = [rev.revid for rev in self.imw.documents(all_revs=False)]
+
+        # now it should have the previously missing rev and all should be as expected:
+        for missing_revid in missing_revids:
+            assert missing_revid in all_revids
+            assert missing_revid in latest_revids
+        assert sorted(all_revids) == sorted(expected_all_revids)
+        assert sorted(latest_revids) == sorted(expected_latest_revids)
+
+    def test_revision_contextmanager(self):
+        # check if rev.data is closed after leaving the with-block
+        item_name = u'foo'
+        meta = dict(name=item_name)
+        data = 'some test content'
+        item = self.imw[item_name]
+        data_file = StringIO(data)
+        with item.store_revision(meta, data_file) as rev:
+            assert rev.data.read() == data
+            revid = rev.revid
+        with pytest.raises(ValueError):
+            rev.data.read()
+        with item.get_revision(revid) as rev:
+            assert rev.data.read() == data
+        with pytest.raises(ValueError):
+            rev.data.read()
+
+
+    def test_indexed_content(self):
+        # TODO: this is a very simple check that assumes that data is put 1:1
+        # into the index's CONTENT field.
+        item_name = u'foo'
+        meta = dict(name=item_name)
+        data = 'some test content'
+        item = self.imw[item_name]
+        data_file = StringIO(data)
+        with item.store_revision(meta, data_file) as rev:
+            expected_revid = rev.revid
+        doc = self.imw._document(content=u'test')
+        assert expected_revid == doc[REVID]
+        assert unicode(data) == doc[CONTENT]
+
+class TestProtectedIndexingMiddleware(object):
+    def setup_method(self, method):
+        meta_store = MemoryBytesStore()
+        data_store = MemoryFileStore()
+        self.be = MutableBackend(meta_store, data_store)
+        self.be.create()
+        self.be.open()
+        index_dir = 'ix'
+        self.imw = IndexingMiddleware(index_dir=index_dir, backend=self.be, user_name=u'joe', acl_support=True)
+        self.imw.create()
+        self.imw.open()
+
+    def teardown_method(self, method):
+        self.imw.close()
+        self.imw.destroy()
+        self.be.close()
+        self.be.destroy()
+
+    def test_documents(self):
+        item_name = u'public'
+        item = self.imw[item_name]
+        r = item.store_revision(dict(name=item_name, acl=u'joe:read'), StringIO('public content'))
+        revid_public = r.revid
+        revids = [rev.revid for rev in self.imw.documents(all_revs=False)]
+        assert revids == [revid_public]
+
+    def test_getitem(self):
+        item_name = u'public'
+        item = self.imw[item_name]
+        r = item.store_revision(dict(name=item_name, acl=u'joe:read'), StringIO('public content'))
+        revid_public = r.revid
+        # now testing:
+        item_name = u'public'
+        item = self.imw[item_name]
+        r = item[revid_public]
+        assert r.data.read() == 'public content'
+
+    def test_perf_create_only(self):
+        pytest.skip("usually we do no performance tests")
+        # determine create revisions performance
+        # for the memory backend we use, this is likely mostly building the indexes
+        item_name = u'foo'
+        item = self.imw[item_name]
+        for i in xrange(100):
+            item.store_revision(dict(name=item_name, acl=u'joe:create joe:read'), StringIO('some content'))
+
+    def test_perf_create_read(self):
+        pytest.skip("usually we do no performance tests")
+        # determine create + read revisions performance
+        # for the memory backend we use, this is likely mostly building the indexes and
+        # doing index lookups name -> itemid, itemid -> revids list
+        item_name = u'foo'
+        item = self.imw[item_name]
+        for i in xrange(100):
+            item.store_revision(dict(name=item_name, acl=u'joe:create joe:read'), StringIO('rev number %d' % i))
+        for r in item.iter_revs():
+            #print r.meta
+            #print r.data.read()
+            pass
+

File MoinMoin/storage/middleware/_tests/test_protecting.py

+# Copyright: 2011 MoinMoin:ThomasWaldmann
+# License: GNU GPL v2 (or any later version), see LICENSE.txt for details.
+
+"""
+MoinMoin - protecting middleware tests
+"""
+
+
+from __future__ import absolute_import, division
+
+from StringIO import StringIO
+
+import pytest
+
+from config import ACL
+
+from ..protecting import ProtectingMiddleware, AccessDenied
+
+from .test_indexing import TestIndexingMiddleware
+
+UNPROTECTED = u'unprotected'
+PROTECTED = u'protected'
+
+UNPROTECTED_CONTENT = 'unprotected content'
+PROTECTED_CONTENT = 'protected content'
+
+class TestProtectingMiddleware(TestIndexingMiddleware):
+    def setup_method(self, method):
+        super(TestProtectingMiddleware, self).setup_method(method)
+        self.imw = ProtectingMiddleware(self.imw, user_name=u'joe')
+
+    def teardown_method(self, method):
+        self.imw = self.imw.indexer
+        super(TestProtectingMiddleware, self).teardown_method(method)
+
+    def _dummy(self):
+        # replacement for tests that use unsupported methods / attributes
+        pass
+
+    test_index_rebuild = _dummy
+    test_index_update = _dummy
+    test_indexed_content = _dummy
+
+    def make_items(self, unprotected_acl, protected_acl):
+        items = [(UNPROTECTED, unprotected_acl, UNPROTECTED_CONTENT),
+                 (PROTECTED, protected_acl, PROTECTED_CONTENT),
+                ]
+        revids = []
+        for item_name, acl, content in items:
+            item = self.imw[item_name]
+            r = item.store_revision(dict(name=item_name, acl=acl), StringIO(content))
+            revids.append(r.revid)
+        return revids
+
+    def test_documents(self):
+        revid_unprotected, revid_protected = self.make_items(u'joe:read', u'boss:read')
+        revids = [rev.revid for rev in self.imw.documents(all_revs=False)]
+        assert revids == [revid_unprotected]  # without revid_protected!
+
+    def test_getitem(self):
+        revid_unprotected, revid_protected = self.make_items(u'joe:read', u'boss:read')
+        # now testing:
+        item = self.imw[UNPROTECTED]
+        r = item[revid_unprotected]
+        assert r.data.read() == UNPROTECTED_CONTENT
+        item = self.imw[PROTECTED]
+        with pytest.raises(AccessDenied):
+            r = item[revid_protected]
+
+    def test_write(self):
+        revid_unprotected, revid_protected = self.make_items(u'joe:write', u'boss:write')
+        # now testing:
+        item = self.imw[UNPROTECTED]
+        item.store_revision(dict(name=UNPROTECTED, acl=u'joe:write'), StringIO(UNPROTECTED_CONTENT))
+        item = self.imw[PROTECTED]
+        with pytest.raises(AccessDenied):
+            item.store_revision(dict(name=PROTECTED, acl=u'boss:write'), StringIO(UNPROTECTED_CONTENT))
+
+    def test_write_create(self):
+        # now testing:
+        item_name = u'newitem'
+        item = self.imw[item_name]
+        item.store_revision(dict(name=item_name), StringIO('new content'))
+
+    def test_overwrite(self):
+        revid_unprotected, revid_protected = self.make_items(u'joe:write joe:overwrite', u'boss:write boss:overwrite')
+        # now testing:
+        item = self.imw[UNPROTECTED]
+        item.store_revision(dict(name=UNPROTECTED, acl=u'joe:write joe:overwrite', revid=revid_unprotected),
+                            StringIO(UNPROTECTED_CONTENT), overwrite=True)
+        item = self.imw[PROTECTED]
+        with pytest.raises(AccessDenied):
+            item.store_revision(dict(name=PROTECTED, acl=u'boss:write boss:overwrite', revid=revid_protected),
+                                StringIO(UNPROTECTED_CONTENT), overwrite=True)
+
+    def test_destroy(self):
+        revid_unprotected, revid_protected = self.make_items(u'joe:destroy', u'boss:destroy')
+        # now testing:
+        item = self.imw[UNPROTECTED]
+        item.destroy_all_revisions()
+        item = self.imw[PROTECTED]
+        with pytest.raises(AccessDenied):
+            item.destroy_all_revisions()
+
+
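Condensed, the wrapping pattern exercised above (user name, ACL string and
index_dir are illustrative):

    from StringIO import StringIO
    from storage.middleware.indexing import IndexingMiddleware
    from storage.middleware.protecting import ProtectingMiddleware, AccessDenied
    from storage.backends.stores import MutableBackend
    from storage.stores.memory import BytesStore, FileStore

    backend = MutableBackend(BytesStore(), FileStore())
    backend.create()
    backend.open()
    imw = IndexingMiddleware(index_dir='ix', backend=backend)
    imw.create()
    imw.open()
    imw = ProtectingMiddleware(imw, user_name=u'joe')
    item = imw[u'boss-only']
    rev = item.store_revision(dict(name=u'boss-only', acl=u'boss:read'),
                              StringIO('secret'))
    item = imw[u'boss-only']
    try:
        item[rev.revid]                    # reads are ACL-checked
    except AccessDenied:
        print 'joe may not read this item'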

File MoinMoin/storage/middleware/_tests/test_routing.py

+# Copyright: 2011 MoinMoin:ThomasWaldmann
+# License: GNU GPL v2 (or any later version), see LICENSE.txt for details.
+
+"""
+MoinMoin - router middleware tests
+"""
+
+
+from __future__ import absolute_import, division
+
+from StringIO import StringIO
+
+import pytest
+
+from config import NAME, REVID
+
+from ..routing import Backend as RouterBackend
+
+from storage.backends.stores import MutableBackend as StoreBackend, Backend as ROBackend
+from storage.stores.memory import BytesStore as MemoryBytesStore
+from storage.stores.memory import FileStore as MemoryFileStore
+
+
+def make_ro_backend():
+    store = StoreBackend(MemoryBytesStore(), MemoryFileStore())
+    store.create()
+    store.store({NAME: 'test'}, StringIO(''))
+    store.store({NAME: 'test2'}, StringIO(''))
+    return ROBackend(store.meta_store, store.data_store)
+
+
+
+def pytest_funcarg__router(request):
+    root_be = StoreBackend(MemoryBytesStore(), MemoryFileStore())
+    sub_be = StoreBackend(MemoryBytesStore(), MemoryFileStore())
+    ro_be = make_ro_backend()
+    router = RouterBackend([('sub', sub_be), ('ro', ro_be), ('', root_be)])
+    router.open()
+    router.create()
+
+    @request.addfinalizer
+    def finalize():
+        router.close()
+        router.destroy()
+
+    return router
+
+def revid_split(revid):
+    # router revids are <backend_mountpoint>/<backend_revid>, split that:
+    return revid.rsplit(u'/', 1)
+
+def test_store_get_del(router):
+    root_name = u'foo'
+    root_revid = router.store(dict(name=root_name), StringIO(''))
+    sub_name = u'sub/bar'
+    sub_revid = router.store(dict(name=sub_name), StringIO(''))
+
+    assert revid_split(root_revid)[0] == ''
+    assert revid_split(sub_revid)[0] == 'sub'
+
+    # when going via the router backend, we get back fully qualified names:
+    root_meta, _ = router.retrieve(root_revid)
+    sub_meta, _ = router.retrieve(sub_revid)
+    assert root_name == root_meta[NAME]
+    assert sub_name == sub_meta[NAME]
+
+    # when looking into the storage backend, we see relative names (without mountpoint):
+    root_meta, _ = router.mapping[-1][1].retrieve(revid_split(root_revid)[1])
+    sub_meta, _ = router.mapping[0][1].retrieve(revid_split(sub_revid)[1])
+    assert root_name == root_meta[NAME]
+    assert sub_name == 'sub' + '/' + sub_meta[NAME]
+    # delete revs:
+    router.remove(root_revid)
+    router.remove(sub_revid)
+
+
+def test_store_readonly_fails(router):
+    with pytest.raises(TypeError):
+        router.store(dict(name=u'ro/testing'), StringIO(''))
+
+def test_del_readonly_fails(router):
+    ro_id = next(iter(router)) # we have only readonly items
+    print ro_id
+    with pytest.raises(TypeError):
+        router.remove(ro_id)
+
+
+def test_destroy_create_dont_touch_ro(router):
+    existing = set(router)
+    root_revid = router.store(dict(name=u'foo'), StringIO(''))
+    sub_revid = router.store(dict(name=u'sub/bar'), StringIO(''))
+
+    router.destroy()
+    router.create()
+
+    assert set(router) == existing
+
+
+def test_iter(router):
+    existing = set(router)
+    root_revid = router.store(dict(name=u'foo'), StringIO(''))
+    sub_revid = router.store(dict(name=u'sub/bar'), StringIO(''))
+    assert set(router) == (set([root_revid, sub_revid])|existing)
+
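The mapping format the router expects, condensed from the fixture above
(the backend choices are illustrative):

    from StringIO import StringIO
    from storage.middleware.routing import Backend as RouterBackend
    from storage.backends.stores import MutableBackend
    from storage.stores.memory import BytesStore, FileStore

    root_be = MutableBackend(BytesStore(), FileStore())
    sub_be = MutableBackend(BytesStore(), FileStore())
    # longer mountpoints first, the root mountpoint '' last:
    router = RouterBackend([('sub', sub_be), ('', root_be)])
    router.open()
    router.create()
    revid = router.store(dict(name=u'sub/bar'), StringIO(''))
    print revid                            # '<mountpoint>/<backend_revid>'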

File MoinMoin/storage/middleware/_tests/test_serialization.py

+# Copyright: 2011 MoinMoin:RonnyPfannschmidt
+# License: GNU GPL v2 (or any later version), see LICENSE.txt for details.
+
+"""
+MoinMoin - serializer / deserializer tests
+"""
+
+
+from __future__ import absolute_import, division
+
+from StringIO import StringIO
+
+from ..indexing import IndexingMiddleware
+from ..serialization import serialize, deserialize
+
+from storage.backends.stores import MutableBackend
+from storage.stores.memory import BytesStore, FileStore
+
+
+contents = [
+    (u'Foo', {'name': u'Foo'}, ''),
+    (u'Foo', {'name': u'Foo'}, '2nd'),
+    (u'Subdir', {'name': u'Subdir'}, ''),
+    (u'Subdir/Foo', {'name': u'Subdir/Foo'}, ''),
+    (u'Subdir/Bar', {'name': u'Subdir/Bar'}, ''),
+]
+
+
+scenarios = [
+    ('Simple', ['']),
+    ('Nested', ['', 'Subdir']),
+]
+
+
+def pytest_generate_tests(metafunc):
+    metafunc.addcall(id='Simple->Simple', param=('Simple', 'Simple'))
+
+def pytest_funcarg__source(request):
+    # scenario
+    return make_middleware(request)
+
+def pytest_funcarg__target(request):
+    # scenario
+    return make_middleware(request)
+
+def make_middleware(request):
+    tmpdir = request.getfuncargvalue('tmpdir')
+    # scenario
+
+    meta_store = BytesStore()
+    data_store = FileStore()
+    backend = MutableBackend(meta_store, data_store)
+    backend.create()
+    backend.open()
+    request.addfinalizer(backend.destroy)
+    request.addfinalizer(backend.close)
+
+    mw = IndexingMiddleware(index_dir=str(tmpdir/'foo'),
+                            backend=backend)
+    mw.create()
+    mw.open()
+    request.addfinalizer(mw.destroy)
+    request.addfinalizer(mw.close)
+    return mw
+
+
+def test_serialize_deserialize(source, target):
+    for i, (name, meta, data) in enumerate(contents):
+        item = source[name]
+        item.store_revision(dict(meta, mtime=i), StringIO(data))
+
+    io = StringIO()
+    serialize(source.backend, io)
+    io.seek(0)
+    deserialize(io, target.backend)
+    target.rebuild()
+
+    print sorted(source.backend)
+    print sorted(target.backend)
+    assert sorted(source.backend) == sorted(target.backend)
+

File MoinMoin/storage/middleware/indexing.py

+# Copyright: 2011 MoinMoin:RonnyPfannschmidt
+# Copyright: 2011 MoinMoin:ThomasWaldmann
+# Copyright: 2011 MoinMoin:MichaelMayorov
+# License: GNU GPL v2 (or any later version), see LICENSE.txt for details.
+
+"""
+MoinMoin - indexing middleware
+
+The backends and stores moin uses are rather simple: mostly just an
+unsorted / unordered bunch of revisions (meta and data) with iteration.
+
+The indexer middleware adds the needed power: once all metadata and data
+are indexed, we can do all sorts of operations at the indexer level:
+* searching
+* lookup by name, uuid, ...
+* selecting
+* listing
+
+Using Whoosh (a fast pure-Python indexing and search library), we build,
+maintain and use 2 indexes:
+
+* "all revisions" index (big, needed for history search)
+* "latest revisions" index (smaller, just the current revisions)
+
+When creating or destroying revisions, indexes are automatically updated.
+
+There is also code to do a full index rebuild in case the index gets damaged,
+lost or needs rebuilding for other reasons, as well as index update code that
+does a quick "intelligent" update of a "mostly ok" index, adding, updating and
+deleting whatever differs between the backend and the current index.
+
+Indexing is the only layer that can easily deal with **names** (it can
+easily translate names to UUIDs and vice versa) and with **items** (it
+knows the current revision and can easily list and order historical revisions),
+using the index.
+
+The layers below are using UUIDs to identify revisions meta and data:
+
+* revid (metaid) - a UUID identifying a specific revision (revision metadata)
+* dataid - a UUID identifying some specific revision data (optional), it is
+  just stored into revision metadata.
+* itemid - a UUID identifying an item (== a set of revisions), it is just
+  stored into revision metadata. itemid is only easily usable on indexing
+  level.
+
+Many methods provided by the indexing middleware will be fast, because they
+do not access the layers below (like the backend), but just the index files;
+usually that is even just the small and thus quick latest-revs index.
+"""
+
+
+from __future__ import absolute_import, division
+
+import os
+import shutil
+import itertools
+import time
+import datetime
+from StringIO import StringIO
+
+from uuid import uuid4
+make_uuid = lambda: unicode(uuid4().hex)
+
+import logging
+
+from whoosh.fields import Schema, TEXT, ID, IDLIST, NUMERIC, DATETIME, KEYWORD, BOOLEAN
+from whoosh.index import open_dir, create_in, EmptyIndexError
+from whoosh.writing import AsyncWriter
+from whoosh.filedb.multiproc import MultiSegmentWriter
+from whoosh.qparser import QueryParser, MultifieldParser
+from whoosh.query import Every, Term
+from whoosh.sorting import FieldFacet
+
+from config import WIKINAME, NAME, NAME_EXACT, MTIME, CONTENTTYPE, TAGS, \
+                   LANGUAGE, USERID, ADDRESS, HOSTNAME, SIZE, ACTION, COMMENT, \
+                   CONTENT, ITEMLINKS, ITEMTRANSCLUSIONS, ACL, EMAIL, OPENID, \
+                   ITEMID, REVID
+
+LATEST_REVS = 'latest_revs'
+ALL_REVS = 'all_revs'
+INDEXES = [LATEST_REVS, ALL_REVS, ]
+
+
+def backend_to_index(meta, content, schema, wikiname):
+    """
+    Convert backend metadata/data to a whoosh document.
+
+    :param meta: revision meta from moin backend
+    :param content: revision data converted to indexable content
+    :param schema: whoosh schema
+    :param wikiname: interwikiname of this wiki
+    :returns: document to put into whoosh index
+    """
+
+    doc = dict([(str(key), value)
+                for key, value in meta.items()
+                if key in schema])
+    if MTIME in doc:
+        # we have UNIX UTC timestamp (int), whoosh wants datetime
+        doc[MTIME] = datetime.datetime.utcfromtimestamp(doc[MTIME])
+    doc[NAME_EXACT] = doc[NAME]
+    doc[WIKINAME] = wikiname
+    doc[CONTENT] = content
+    return doc
+
+
+def convert_to_indexable(meta, data):
+    """
+    Convert revision data to indexable content.
+
+    :param meta: revision metadata (gets updated as a side effect)
+    :param data: revision data (file-like). Please make sure the data file is
+                 positioned to read all indexable content: if you have just
+                 written that content or already read from it, call
+                 data.seek(0) before calling convert_to_indexable(meta, data).
+    :returns: indexable content, text/plain, unicode object
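+
+    For now, this just reads and decodes the data, e.g. (illustrative)::
+
+        convert_to_indexable({}, StringIO('hello'))  # -> u'hello'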
+    """
+    return unicode(data.read()) # TODO integrate real thing after merge into moin2 code base.
+
+
+class IndexingMiddleware(object):
+    def __init__(self, index_dir, backend, user_name=None, acl_support=False, **kw):
+        """
+        Store params, create schemas.
+        """
+        self.index_dir = index_dir
+        self.index_dir_tmp = index_dir + '.temp'
+        self.backend = backend
+        self.user_name = user_name # TODO use currently logged-in username
+        self.acl_support = acl_support
+        self.wikiname = u'' # TODO take from app.cfg.interwikiname
+        self.ix = {}  # open indexes
+        self.schemas = {}  # existing schemas
+
+        common_fields = {
+            # wikiname so we can have a shared index in a wiki farm, always check this!
+            WIKINAME: ID(stored=True),
+            # tokenized NAME from metadata - use this for manual searching from UI
+            # TODO was: NAME: TEXT(stored=True, multitoken_query="and", analyzer=item_name_analyzer(), field_boost=2.0),
+            NAME: ID(stored=True, field_boost=2.0),
+            # unmodified NAME from metadata - use this for precise lookup by the code.
+            # also needed for wildcard search, so the original string as well as the query
+            # (with the wildcard) is not cut into pieces.
+            NAME_EXACT: ID(field_boost=3.0),
+            # revision id (aka meta id)
+            REVID: ID(unique=True, stored=True),
+            # MTIME from revision metadata (converted to UTC datetime)
+            MTIME: DATETIME(stored=True),
+            # tokenized CONTENTTYPE from metadata
+            # TODO was: CONTENTTYPE: TEXT(stored=True, multitoken_query="and", analyzer=MimeTokenizer()),
+            CONTENTTYPE: ID(stored=True),
+            # unmodified list of TAGS from metadata
+            TAGS: ID(stored=True),
+            LANGUAGE: ID(stored=True),
+            # USERID from metadata TODO: -> user ITEMID
+            USERID: ID(stored=True),
+            # ADDRESS from metadata
+            ADDRESS: ID(stored=True),
+            # HOSTNAME from metadata
+            HOSTNAME: ID(stored=True),
+            # SIZE from metadata
+            SIZE: NUMERIC(stored=True),
+            # ACTION from metadata
+            ACTION: ID(stored=True),
+            # tokenized COMMENT from metadata
+            COMMENT: TEXT(stored=True),
+            # data (content), converted to text/plain and tokenized
+            CONTENT: TEXT(stored=True),
+        }
+
+        latest_revs_fields = {
+            # ITEMID from metadata - as only the latest rev of an item is in this index, it is unique
+            ITEMID: ID(unique=True, stored=True),
+            # unmodified list of ITEMLINKS from metadata
+            ITEMLINKS: ID(stored=True),
+            # unmodified list of ITEMTRANSCLUSIONS from metadata
+            ITEMTRANSCLUSIONS: ID(stored=True),
+            # tokenized ACL from metadata
+            # TODO was: ACL: TEXT(analyzer=AclTokenizer(self._cfg), multitoken_query="and", stored=True),
+            ACL: ID(stored=True),
+        }
+        latest_revs_fields.update(common_fields)
+
+        userprofile_fields = {
+            EMAIL: ID(unique=True, stored=True),
+            OPENID: ID(unique=True, stored=True),
+        }
+        latest_revs_fields.update(userprofile_fields)
+
+        all_revs_fields = {
+            ITEMID: ID(stored=True),
+        }
+        all_revs_fields.update(common_fields)
+
+        latest_revisions_schema = Schema(**latest_revs_fields)
+        all_revisions_schema = Schema(**all_revs_fields)
+
+        # Define dynamic fields
+        dynamic_fields = [("*_id", ID(stored=True)),
+                          ("*_text", TEXT(stored=True)),
+                          ("*_keyword", KEYWORD(stored=True)),
+                          ("*_numeric", NUMERIC(stored=True)),
+                          ("*_datetime", DATETIME(stored=True)),
+                          ("*_boolean", BOOLEAN(stored=True)),
+                         ]
+
+        # Adding dynamic fields to schemas
+        for glob, field_type in dynamic_fields:
+            latest_revisions_schema.add(glob, field_type, glob=True)
+            all_revisions_schema.add(glob, field_type, glob=True)
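+        # e.g. a metadata key named 'severity_numeric' (illustrative) matches
+        # the '*_numeric' glob above and gets indexed as a stored NUMERIC
+        # field without being declared in the schema explicitly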
+
+        # schemas are needed by query parser and for index creation
+        self.schemas[ALL_REVS] = all_revisions_schema
+        self.schemas[LATEST_REVS] = latest_revisions_schema
+
+    def open(self):
+        """
+        Open all indexes.
+        """
+        index_dir = self.index_dir
+        try:
+            for name in INDEXES:
+                self.ix[name] = open_dir(index_dir, indexname=name)
+        except (IOError, OSError, EmptyIndexError) as err:
+            logging.error(u"%s [while trying to open index '%s' in '%s']" % (str(err), name, index_dir))
+            raise
+
+    def close(self):
+        """
+        Close all indexes.
+        """
+        for name in self.ix:
+            self.ix[name].close()
+        self.ix = {}
+
+    def create(self, tmp=False):
+        """
+        Create all indexes (empty).
+        """
+        index_dir = self.index_dir_tmp if tmp else self.index_dir
+        try:
+            os.mkdir(index_dir)
+        except OSError:
+            # ignore the error (e.g. the directory exists already); if there
+            # is a real problem with index_dir, we'll get another exception below
+            pass
+        try:
+            for name in INDEXES:
+                create_in(index_dir, self.schemas[name], indexname=name)
+        except (IOError, OSError) as err:
+            logging.error(u"%s [while trying to create index '%s' in '%s']" % (str(err), name, index_dir))
+            raise
+
+    def destroy(self, tmp=False):
+        """
+        Destroy all indexes.
+        """
+        index_dir = self.index_dir_tmp if tmp else self.index_dir
+        if os.path.exists(index_dir):
+            shutil.rmtree(index_dir)
+
+    def move_index(self):
+        """
+        Move freshly built indexes from index_dir_tmp to index_dir.
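+
+        Typical full-rebuild flow (sketch): create(tmp=True), index all
+        revisions into the temporary indexes, then move_index() to replace
+        the live indexes with the freshly built ones.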
+        """
+        self.destroy()
+        os.rename(self.index_dir_tmp, self.index_dir)
+
+    def index_revision(self, revid, meta, data, async=True):
+        """
+        Index a single revision, add it to all-revs and latest-revs index.
+        """
+        meta[REVID] = revid
+        content = convert_to_indexable(meta, data)
+        doc = backend_to_index(meta, content, self.schemas[ALL_REVS], self.wikiname)
+        if async:
+            writer = AsyncWriter(self.ix[ALL_REVS])
+        else:
+            writer = self.ix[ALL_REVS].writer()
+        with writer as writer:
+            writer.update_document(**doc) # update, because store_revision() may give us an existing revid
+        doc = backend_to_index(meta, content, self.schemas[LATEST_REVS], self.wikiname)
+        if async:
+            writer = AsyncWriter(self.ix[LATEST_REVS])
+        else:
+            writer = self.ix[LATEST_REVS].writer()
+        with writer as writer:
+            writer.update_document(**doc)
+
+    def remove_revision(self, revid, async=True):
+        """
+        Remove a single revision from indexes.
+        """
+        if async:
+            writer = AsyncWriter(self.ix[ALL_REVS])
+        else:
+            writer = self.ix[ALL_REVS].writer()
+        with writer as writer:
+            writer.delete_by_term(REVID, revid)
+        if async:
+            writer = AsyncWriter(self.ix[LATEST_REVS])
+        else:
+            writer = self.ix[LATEST_REVS].writer()
+        with writer as writer:
+            # find out itemid related to the revid we want to remove:
+            with self.ix[LATEST_REVS].searcher() as searcher:
+                docnum_remove = searcher.document_number(revid=revid)
+                if docnum_remove is not None:
+                    itemid = searcher.stored_fields(docnum_remove)[ITEMID]
+            if docnum_remove is not None:
+                # we are removing a revid that is in latest revs index
+                try:
+                    latest_revids = self._find_latest_revids(self.ix[ALL_REVS], Term(ITEMID, itemid))
+                except AttributeError:
+                    # workaround for bug #200 AttributeError: 'FieldCache' object has no attribute 'code'
+                    latest_revids = []
+                if latest_revids:
+                    # we have a latest revision, just update the document in the index:
+                    assert len(latest_revids) == 1 # this item must have only one latest revision
+                    latest_revid = latest_revids[0]
+                    # we must fetch from backend because schema for LATEST_REVS is different than for ALL_REVS
+                    # (and we can't be sure we have all fields stored, too)
+                    meta, _ = self.backend.retrieve(latest_revid)
+                    # we only use meta (not data), because we do not want to transform data->content again (this
+                    # is potentially expensive) as we already have the transformed content stored in ALL_REVS index:
+                    with self.ix[ALL_REVS].searcher() as searcher:
+                        doc = searcher.document(revid=latest_revid)
+                        content = doc[CONTENT]
+                    doc = backend_to_index(meta, content, self.schemas[LATEST_REVS], self.wikiname)
+                    writer.update_document(**doc)
+                else:
+                    # there is no revision left in this item that could be the new "latest rev", just kill the rev
+                    writer.delete_document(docnum_remove)
+
+    def _modify_index(self, index, schema, wikiname, revids, mode='add', procs=1, limitmb=256):
+        """
+        Modify index contents - add, update or delete the indexed documents
+        for all given revids.
+
+        Note: mode == 'add' is faster, but you need to make sure not to
+              create duplicate documents in the index.
+        """
+        if procs == 1:
+            # MultiSegmentWriter sometimes has issues and is pointless for procs == 1,
+            # so use the simple writer when --procs 1 is given:
+            writer = index.writer()
+        else:
+            writer = MultiSegmentWriter(index, procs, limitmb)
+        with writer as writer:
+            for revid in revids:
+                if mode in ['add', 'update', ]:
+                    meta, data = self.backend.retrieve(revid)
+                    content = convert_to_indexable(meta, data)
+                    doc = backend_to_index(meta, content, schema, wikiname)