Commits

Thomas Waldmann committed 3c3c0f1

some indexing middleware methods now yield Revisions (not revids - use rev.revid if you need that)

I tried to optimize Revision and Meta class by reusing information we already have:
e.g. if we already did a index lookup before creating a related revision, we can
give the whoosh doc to Revision.__init__ so it does not access the index again.

It is also given to Meta.__init__ there, so it does not need to load metadata
from the store if you are just requesting some meta key/value that is also in
the whoosh document.

If some non-stored meta key or data is accessed, it gets loaded from store.

Added some __cmp__ methods to make sorting and comparing of Revisions and Meta
possible.

Comments (0)

Files changed (3)

storage/middleware/_tests/test_indexing.py

         rev = item.get_revision(revid)
         assert rev.meta[NAME] == item_name
         assert rev.data.read() == data
-        revids = list(item.iter_revs())
+        revids = [rev.revid for rev in item.iter_revs()]
         assert revids == [revid]
 
     def test_overwrite_revision(self):
         assert rev.meta[NAME] == item_name
         assert rev.meta[COMMENT] == u'no spam'
         assert rev.data.read() == newdata
-        revids = list(item.iter_revs())
+        revids = [rev.revid for rev in item.iter_revs()]
         assert len(revids) == 1 # we still have the revision, cleared
         assert revid in revids # it is still same revid
 
         item = self.imw[item_name]
         with pytest.raises(KeyError):
             item.get_revision(revid0)
-        revs = list(item.iter_revs())
-        print "after destroy revid0", revs
-        assert sorted(revs) == sorted([revid1, revid2])
+        revids = [rev.revid for rev in item.iter_revs()]
+        print "after destroy revid0", revids
+        assert sorted(revids) == sorted([revid1, revid2])
         # destroy a current revision:
         item.destroy_revision(revid2)
         # check if the revision was destroyed:
         item = self.imw[item_name]
         with pytest.raises(KeyError):
             item.get_revision(revid2)
-        revs = list(item.iter_revs())
-        print "after destroy revid2", revs
-        assert sorted(revs) == sorted([revid1])
+        revids = [rev.revid for rev in item.iter_revs()]
+        print "after destroy revid2", revids
+        assert sorted(revids) == sorted([revid1])
         # destroy the last revision left:
         item.destroy_revision(revid1)
         # check if the revision was destroyed:
         item = self.imw[item_name]
         with pytest.raises(KeyError):
             item.get_revision(revid1)
-        revs = list(item.iter_revs())
-        print "after destroy revid1", revs
-        assert sorted(revs) == sorted([])
+        revids = [rev.revid for rev in item.iter_revs()]
+        print "after destroy revid1", revids
+        assert sorted(revids) == sorted([])
 
     def test_destroy_item(self):
         revids = []
         item.store_revision(dict(name=item_name), StringIO('1st'))
         item.store_revision(dict(name=item_name), StringIO('2nd'))
         item = self.imw[item_name]
-        revs = [item[revid].data.read() for revid in item.iter_revs()]
+        revs = [rev.data.read() for rev in item.iter_revs()]
         assert len(revs) == 2
         assert set(revs) == set(['1st', '2nd'])
 
         item = self.imw[item_name]
         item.store_revision(dict(name=item_name), StringIO('1st'))
         expected_rev = item.store_revision(dict(name=item_name), StringIO('2nd'))
-        docs = list(self.imw.documents(all_revs=False, name=item_name))
-        assert len(docs) == 1  # there is only 1 latest revision
-        assert expected_rev.revid == docs[0][REVID]  # it is really the latest one
+        revs = list(self.imw.documents(all_revs=False, name=item_name))
+        assert len(revs) == 1  # there is only 1 latest revision
+        assert expected_rev.revid == revs[0].revid  # it is really the latest one
 
     def test_auto_meta(self):
         item_name = u'foo'
         data = 'bar'
         item = self.imw[item_name]
         rev = item.store_revision(dict(name=item_name), StringIO(data))
-        rev = item[rev.revid]
         print repr(rev.meta)
         assert rev.meta[NAME] == item_name
         assert rev.meta[SIZE] == len(data)
         rev1 = item.store_revision(dict(name=item_name), StringIO('x'))
         rev2 = item.store_revision(dict(name=item_name), StringIO('xx'))
         rev3 = item.store_revision(dict(name=item_name), StringIO('xxx'))
-        doc = self.imw.document(all_revs=True, size=2)
-        assert doc
-        assert doc[REVID] == rev2.revid
-        docs = list(self.imw.documents(all_revs=True, size=2))
-        assert len(docs) == 1
-        assert docs[0][REVID] == rev2.revid
+        rev = self.imw.document(all_revs=True, size=2)
+        assert rev
+        assert rev.revid == rev2.revid
+        revs = list(self.imw.documents(all_revs=True, size=2))
+        assert len(revs) == 1
+        assert revs[0].revid == rev2.revid
 
     def test_index_rebuild(self):
         # first we index some stuff the slow "on-the-fly" way:
         expected_latest_revids.append(r.revid)
 
         # now we remember the index contents built that way:
-        expected_latest_docs = list(self.imw.documents(all_revs=False))
-        expected_all_docs = list(self.imw.documents(all_revs=True))
+        expected_latest_revs = list(self.imw.documents(all_revs=False))
+        expected_all_revs = list(self.imw.documents(all_revs=True))
 
         print "*** all on-the-fly:"
         self.imw.dump(all_revs=True)
         self.imw.open()
 
         # read the index contents built that way:
-        all_docs = list(self.imw.documents(all_revs=True))
-        latest_docs = list(self.imw.documents(all_revs=False))
-        latest_revids = [doc[REVID] for doc in latest_docs]
+        all_revs = list(self.imw.documents(all_revs=True))
+        latest_revs = list(self.imw.documents(all_revs=False))
+        latest_revids = [rev.revid for rev in latest_revs]
 
         print "*** all rebuilt:"
         self.imw.dump(all_revs=True)
         self.imw.dump(all_revs=False)
 
         # should be all the same, order does not matter:
-        assert sorted(expected_all_docs) == sorted(all_docs)
-        assert sorted(expected_latest_docs) == sorted(latest_docs)
+        assert sorted(expected_all_revs) == sorted(all_revs)
+        assert sorted(expected_latest_revs) == sorted(latest_revs)
         assert sorted(latest_revids) == sorted(expected_latest_revids)
 
     def test_index_update(self):
         self.imw.open()
 
         # read the index contents we have now:
-        all_revids = [doc[REVID] for doc in self.imw.documents(all_revs=True)]
-        latest_revids = [doc[REVID] for doc in self.imw.documents(all_revs=False)]
+        all_revids = [doc[REVID] for doc in self.imw._documents(all_revs=True)]
+        latest_revids = [doc[REVID] for doc in self.imw._documents(all_revs=False)]
 
         # this index is outdated:
         for missing_revid in missing_revids:
         self.imw.open()
 
         # read the index contents we have now:
-        all_revids = [doc[REVID] for doc in self.imw.documents(all_revs=True)]
-        latest_revids = [doc[REVID] for doc in self.imw.documents(all_revs=False)]
+        all_revids = [rev.revid for rev in self.imw.documents(all_revs=True)]
+        latest_revids = [rev.revid for rev in self.imw.documents(all_revs=False)]
 
         # now it should have the previously missing rev and all should be as expected:
         for missing_revid in missing_revids:
         with pytest.raises(ValueError):
             rev.data.read()
 
+
     def test_indexed_content(self):
         # TODO: this is a very simple check that assumes that data is put 1:1
         # into index' CONTENT field.
         data_file = StringIO(data)
         with item.store_revision(meta, data_file) as rev:
             expected_revid = rev.revid
-        doc = self.imw.document(content=u'test')
+        doc = self.imw._document(content=u'test')
         assert expected_revid == doc[REVID]
         assert unicode(data) == doc[CONTENT]
 
-
 class TestProtectedIndexingMiddleware(object):
     def setup_method(self, method):
         meta_store = MemoryBytesStore()
         item = self.imw[item_name]
         r = item.store_revision(dict(name=item_name, acl=u''), StringIO('secret content'))
         revid_secret = r.revid
-        revids = [doc[REVID] for doc in self.imw.documents(all_revs=False)]
+        revids = [rev.revid for rev in self.imw.documents(all_revs=False)]
         assert revids == [revid_public]
 
     def test_getitem(self):
         item = self.imw[item_name]
         for i in xrange(100):
             item.store_revision(dict(name=item_name, acl=u'joe:create joe:read'), StringIO('rev number %d' % i))
-        for revid in item.iter_revs():
-            r = item[revid]
+        for r in item.iter_revs():
             #print r.meta
             #print r.data.read()
+            pass
 

storage/middleware/_tests/test_serialization.py

     backend.open()
     request.addfinalizer(backend.destroy)
     request.addfinalizer(backend.close)
-    
+
     mw = IndexingMiddleware(index_dir=str(tmpdir/'foo'),
                             backend=backend)
     mw.create()

storage/middleware/indexing.py

 
     def search(self, q, all_revs=False, **kw):
         """
-        Search with query q, yield stored fields.
+        Search with query q, yield Revisions.
         """
         with self.get_index(all_revs).searcher() as searcher:
             # Note: callers must consume everything we yield, so the for loop
                 latest_doc = not all_revs and doc or None
                 item = Item(self, user_name=self.user_name, latest_doc=latest_doc, itemid=doc[ITEMID])
                 if item.allows('read'):
-                    yield doc
+                    yield item[doc[REVID]]
 
     def search_page(self, q, all_revs=False, pagenum=1, pagelen=10, **kw):
         """
                 latest_doc = not all_revs and doc or None
                 item = Item(self, user_name=self.user_name, latest_doc=latest_doc, itemid=doc[ITEMID])
                 if item.allows('read'):
-                    yield doc
+                    yield item[doc[REVID]]
 
     def documents(self, all_revs=False, **kw):
         """
-        Yield documents matching the kw args.
+        Yield Revisions matching the kw args.
+        """
+        for doc in self._documents(all_revs, **kw):
+            latest_doc = not all_revs and doc or None
+            item = Item(self, user_name=self.user_name, latest_doc=latest_doc, itemid=doc[ITEMID])
+            if item.allows('read'):
+                yield item[doc[REVID]]
+
+    def _documents(self, all_revs=False, **kw):
+        """
+        Yield documents matching the kw args (internal use only, no ACL checks).
         """
         with self.get_index(all_revs).searcher() as searcher:
             # Note: callers must consume everything we yield, so the for loop
             # ends and the "with" is left to close the index files.
             if kw:
                 for doc in searcher.documents(**kw):
-                    latest_doc = not all_revs and doc or None
-                    item = Item(self, user_name=self.user_name, latest_doc=latest_doc, itemid=doc[ITEMID])
-                    if item.allows('read'):
-                        yield doc
+                    yield doc
             else: # XXX maybe this would make sense to be whoosh default behaviour for documents()?
                   #     should be implemented for whoosh >= 2.2.3
                 for doc in searcher.all_stored_fields():
-                    latest_doc = not all_revs and doc or None
-                    item = Item(self, user_name=self.user_name, latest_doc=latest_doc, itemid=doc[ITEMID])
-                    if item.allows('read'):
-                        yield doc
+                    yield doc
 
-    def document(self, all_revs=False, acl_check=True, **kw):
+    def document(self, all_revs=False, **kw):
         """
-        Return a document matching the kw args.
+        Return a Revision matching the kw args.
+        """
+        doc = self._document(all_revs, **kw)
+        if doc:
+            latest_doc = not all_revs and doc or None
+            item = Item(self, user_name=self.user_name, latest_doc=latest_doc, itemid=doc[ITEMID])
+            if item.allows('read'):
+                return item[doc[REVID]]
 
-        :param acl_check: check 'read' ACL if True
+    def _document(self, all_revs=False, **kw):
+        """
+        Return a document matching the kw args (internal use only, no ACL checks).
         """
         with self.get_index(all_revs).searcher() as searcher:
-            doc = searcher.document(**kw)
-            if doc and acl_check:
-                latest_doc = not all_revs and doc or None
-                item = Item(self, user_name=self.user_name, latest_doc=latest_doc, itemid=doc[ITEMID])
-                if item.allows('read'):
-                    return doc
-            else:
-                return doc
+            return searcher.document(**kw)
 
     def __getitem__(self, name):
         """
         self.user_name = user_name
         self.backend = self.indexer.backend
         if latest_doc is None:
-            # we need to switch off the acl check there to avoid endless recursion:
-            latest_doc = self.indexer.document(all_revs=False, acl_check=False, **query) or {}
+            # we need to call the method without acl check to avoid endless recursion:
+            latest_doc = self.indexer._document(all_revs=False, **query) or {}
         self._current = latest_doc
 
     def _get_itemid(self):
 
     def iter_revs(self):
         """
-        Iterate over revids belonging to this item (use index).
+        Iterate over Revisions belonging to this item.
         """
         if self:
-            for doc in self.indexer.documents(all_revs=True, itemid=self.itemid):
-                yield doc[REVID]
+            for rev in self.indexer.documents(all_revs=True, itemid=self.itemid):
+                yield rev
 
     def __getitem__(self, revid):
         """
         Get Revision with revision id <revid>.
         """
         self.require('read')
-        return Revision(self, revid)
+        rev = Revision(self, revid)
+        rev.data # XXX trigger KeyError if rev does not exist
+        return rev
 
     def get_revision(self, revid):
         """
         :type meta: dict
         :type data: open file (file must be closed by caller)
         :param overwrite: if True, allow overwriting of existing revs.
+        :returns: a Revision instance of the just created revision
         """
         self.require('write')
         if self.itemid is None:
         data.seek(0)  # rewind file
         self.indexer.index_revision(revid, meta, data)
         if not overwrite:
-            self._current = self.indexer.document(all_revs=False, acl_check=False, revid=revid)
+            self._current = self.indexer._document(all_revs=False, revid=revid)
         return Revision(self, revid)
 
     def store_all_revisions(self, meta, data):
         """
         Store over all revisions of this item.
         """
-        for revid in self.iter_revs():
-            meta[REVID] = revid
+        for rev in self.iter_revs():
+            meta[REVID] = rev.revid
             self.store_revision(meta, data, overwrite=True)
 
     def destroy_revision(self, revid):
         """
         Destroy all revisions of this item.
         """
-        for revid in self.iter_revs():
-            self.destroy_revision(revid)
+        for rev in self.iter_revs():
+            self.destroy_revision(rev.revid)
 
 
 class Revision(object):
     """
     An existing revision (exists in the backend).
     """
-    def __init__(self, item, revid):
+    def __init__(self, item, revid, doc=None):
         self.item = item
         self.revid = revid
         self.backend = item.backend
-        self.meta, self.data = self.backend.retrieve(self.revid) # raises KeyError if rev does not exist
+        self._doc = doc
+        self.meta = Meta(self, self._doc)
+        self._data = None
+        # Note: this does not immediately raise a KeyError for non-existing revs any more
+        # If you access data or meta, it will, though.
+
+    @property
+    def data(self):
+        if self._data is None:
+            meta, data = self.backend.retrieve(self.revid) # raises KeyError if rev does not exist
+            self.meta = Meta(self, self._doc, meta)
+            self._data = data
+        return self._data
 
     def close(self):
-        self.data.close()
+        if self._data is not None:
+            self._data.close()
 
     def __enter__(self):
         return self
     def __exit__(self, exc_type, exc_value, exc_tb):
         self.close()
 
+    def __cmp__(self, other):
+        return cmp(self.meta, other.meta)
+
+
+class Meta(object):
+    def __init__(self, revision, doc, meta=None):
+        self.revision = revision
+        self._doc = doc or {}
+        self._meta = meta or {}
+
+    def __contains__(self, key):
+        try:
+            self[key]
+        except KeyError:
+            return False
+        else:
+            return True
+
+    def __getitem__(self, key):
+        try:
+            return self._meta[key]
+        except KeyError:
+            pass
+        try:
+            return self._doc[key]
+        except KeyError:
+            pass
+        self._meta, self.revision._data = self.revision.backend.retrieve(self.revision.revid) # raises KeyError if rev does not exist
+        return self._meta[key]
+
+    def __cmp__(self, other):
+        if self[REVID] == other[REVID]:
+            return 0
+        return cmp(self[MTIME], other[MTIME])
+