1. marchael
  2. moin-2.0

Commits

Thomas Waldmann  committed 2dfaef3

move indexing related code to MoinMoin.storage.backends.indexing

remove Item.before_revision_commit hook - wrong place, such stuff must be
done in indexing middleware or it does not happen when non-UI commits happen,
e.g. when items are unserialized from xml.
I noticed this because ITEMLINKS and ITEMTRANSCLUSIONS were missing for data
coming from xml.

note: this is likely still not the final place. stuff like current user / ip
are not relevant / not available when unserializing xml.

  • Participants
  • Parent commits 328d83e
  • Branches pytest2

Comments (0)

Files changed (5)

File MoinMoin/converter/__init__.py

View file
  • Ignore whitespace
         return self._register(self.Entry(factory, type_input, type_output, priority))
 
 
-from ..util.mime import Type, type_moin_document
-
-from MoinMoin.config import NAME, CONTENTTYPE
-
-from MoinMoin import log
-logging = log.getLogger(__name__)
-
-
-def convert_to_indexable(rev):
-    """
-    convert a revision to an indexable document
-
-    :param rev: item revision - please make sure that the content file is
-                ready to read all indexable content from it. if you have just
-                written that content or already read from it, you need to call
-                rev.seek(0) before calling convert_to_indexable(rev).
-    """
-    try:
-        # TODO use different converter mode?
-        # Maybe we want some special mode for the input converters so they emit
-        # different output than for normal rendering), esp. for the non-markup
-        # content types (images, etc.).
-        input_contenttype = rev[CONTENTTYPE]
-        output_contenttype = 'text/plain'
-        type_input_contenttype = Type(input_contenttype)
-        type_output_contenttype = Type(output_contenttype)
-        reg = default_registry
-        # first try a direct conversion (this could be useful for extraction
-        # of (meta)data from binary types, like from images or audio):
-        conv = reg.get(type_input_contenttype, type_output_contenttype)
-        if conv:
-            doc = conv(rev, input_contenttype)
-            return doc
-        # otherwise try via DOM as intermediate format (this is useful if
-        # input type is markup, to get rid of the markup):
-        input_conv = reg.get(type_input_contenttype, type_moin_document)
-        output_conv = reg.get(type_moin_document, type_output_contenttype)
-        if input_conv and output_conv:
-            doc = input_conv(rev, input_contenttype)
-            # We do not convert smileys, includes, macros, links, because
-            # it does not improve search results or even makes results worse.
-            doc = output_conv(doc)
-            return doc
-        # no way
-        raise TypeError("No converter for %s --> %s" % (input_contenttype, output_contenttype))
-    except Exception as e: # catch all exceptions, we don't want to break an indexing run
-        logging.exception("Exception happened in conversion of item %r rev %d contenttype %s:" % (rev[NAME], rev.revno, rev[CONTENTTYPE]))
-        doc = u'ERROR [%s]' % str(e)
-        return doc
-
-
 default_registry = RegistryConverter()
 load_package_modules(__name__, __path__)
 

File MoinMoin/items/__init__.py

View file
  • Ignore whitespace
             newrev[CONTENTTYPE] = unicode(contenttype_current or contenttype_guessed or 'application/octet-stream')
 
         newrev[ACTION] = unicode(action)
-        self.before_revision_commit(newrev, data)
         storage_item.commit()
         item_modified.send(app._get_current_object(), item_name=name)
         return new_rev_no, size
 
-    def before_revision_commit(self, newrev, data):
-        """
-        hook that can be used to add more meta data to a revision before
-        it is committed.
-
-        :param newrev: new (still uncommitted) revision - modify as wanted
-        :param data: either str or open file (we can avoid having to read/seek
-                     rev's data with this)
-        """
-        remote_addr = request.remote_addr
-        if remote_addr:
-            if app.cfg.log_remote_addr:
-                newrev[ADDRESS] = unicode(remote_addr)
-                hostname = wikiutil.get_hostname(remote_addr)
-                if hostname:
-                    newrev[HOSTNAME] = hostname
-        if flaskg.user.valid:
-            newrev[USERID] = unicode(flaskg.user.id)
-
     def get_index(self):
         """ create an index of sub items of this item """
         if self.name:
     some kind of item with markup
     (internal links and transcluded items)
     """
-    def before_revision_commit(self, newrev, data):
-        """
-        add ITEMLINKS and ITEMTRANSCLUSIONS metadata
-        """
-        super(MarkupItem, self).before_revision_commit(newrev, data)
-
-        if hasattr(data, "read"):
-            data.seek(0)
-            data = data.read()
-        elif isinstance(data, str):
-            pass
-        else:
-            raise StorageError("unsupported content object: %r" % data)
-
-        from MoinMoin.converter import default_registry as reg
-
-        input_conv = reg.get(Type(self.contenttype), type_moin_document)
-        item_conv = reg.get(type_moin_document, type_moin_document, items='refs')
-
-        i = Iri(scheme='wiki', authority='', path='/' + self.name)
-
-        doc = input_conv(self.rev, self.contenttype)
-        doc.set(moin_page.page_href, unicode(i))
-        doc = item_conv(doc)
-
-        newrev[ITEMLINKS] = item_conv.get_links()
-        newrev[ITEMTRANSCLUSIONS] = item_conv.get_transclusions()
 
 
 class MoinWiki(MarkupItem):

File MoinMoin/items/_tests/test_Item.py

View file
  • Ignore whitespace
         assert u'<pre class="highlight">test_data\n' in result
         assert item2.data == ''
 
-class TestMarkupItem(object):
-    """ Test for the items with markup """
-
-    def test_before_revision_commit(self):
-        item_name = u'Markup_Item'
-        item = MarkupItem.create(item_name)
-        contenttype = u'text/x.moin.wiki;charset=utf-8'
-        meta = {CONTENTTYPE: contenttype}
-        item._save(meta)
-        item1 = MarkupItem.create(item_name)
-        MarkupItem.before_revision_commit(item1, item.rev, 'test_data')
-        assert item.rev['itemlinks'] == []
-        assert item.rev['itemtransclusions'] == []
-
 coverage_modules = ['MoinMoin.items']
 

File MoinMoin/script/maint/index.py

View file
  • Ignore whitespace
 from MoinMoin.storage.error import NoSuchItemError, NoSuchRevisionError
 from MoinMoin.util.mime import Type
 from MoinMoin.search.indexing import backend_to_index
-from MoinMoin.converter import convert_to_indexable
+from MoinMoin.storage.backends.indexing import convert_to_indexable
 
 from MoinMoin import log
 logging = log.getLogger(__name__)

File MoinMoin/storage/backends/indexing.py

View file
  • Ignore whitespace
 from uuid import uuid4
 make_uuid = lambda: unicode(uuid4().hex)
 
+from flask import current_app as app
+from flask import g as flaskg
+from flask import request
+
 from MoinMoin.storage.error import NoSuchItemError, NoSuchRevisionError, \
                                    AccessDeniedError
-from MoinMoin.config import ACL, CONTENTTYPE, UUID, NAME, NAME_OLD, MTIME, TAGS
+from MoinMoin.config import ACL, CONTENTTYPE, UUID, NAME, NAME_OLD, MTIME, TAGS, \
+                            ADDRESS, HOSTNAME, USERID, ITEMLINKS, ITEMTRANSCLUSIONS
 from MoinMoin.search.indexing import backend_to_index
-from MoinMoin.converter import convert_to_indexable
+from MoinMoin.converter import default_registry
+from MoinMoin.util.iri import Iri
+from MoinMoin.util.mime import Type, type_moin_document
+from MoinMoin.util.tree import moin_page
+from MoinMoin import wikiutil
 
 from MoinMoin import log
 logging = log.getLogger(__name__)
 
+
+def convert_to_indexable(rev, new_rev=False):
+    """
+    convert a revision to an indexable document
+
+    :param rev: item revision - please make sure that the content file is
+                ready to read all indexable content from it. if you have just
+                written that content or already read from it, you need to call
+                rev.seek(0) before calling convert_to_indexable(rev).
+    """
+    try:
+        # TODO use different converter mode?
+        # Maybe we want some special mode for the input converters so they emit
+        # different output than for normal rendering), esp. for the non-markup
+        # content types (images, etc.).
+        input_contenttype = rev[CONTENTTYPE]
+        output_contenttype = 'text/plain'
+        type_input_contenttype = Type(input_contenttype)
+        type_output_contenttype = Type(output_contenttype)
+        reg = default_registry
+        # first try a direct conversion (this could be useful for extraction
+        # of (meta)data from binary types, like from images or audio):
+        conv = reg.get(type_input_contenttype, type_output_contenttype)
+        if conv:
+            doc = conv(rev, input_contenttype)
+            return doc
+        # otherwise try via DOM as intermediate format (this is useful if
+        # input type is markup, to get rid of the markup):
+        input_conv = reg.get(type_input_contenttype, type_moin_document)
+        refs_conv = reg.get(type_moin_document, type_moin_document, items='refs')
+        output_conv = reg.get(type_moin_document, type_output_contenttype)
+        if input_conv and output_conv:
+            doc = input_conv(rev, input_contenttype)
+            # We do not convert smileys, includes, macros, links, because
+            # it does not improve search results or even makes results worse.
+            # We do run the referenced converter, though, to extract links and
+            # transclusions.
+            if new_rev:
+                # we only can modify new, uncommitted revisions, not stored revs
+                i = Iri(scheme='wiki', authority='', path='/' + rev[NAME])
+                doc.set(moin_page.page_href, unicode(i))
+                refs_conv(doc)
+                # side effect: we update some metadata:
+                rev[ITEMLINKS] = refs_conv.get_links()
+                rev[ITEMTRANSCLUSIONS] = refs_conv.get_transclusions()
+            doc = output_conv(doc)
+            return doc
+        # no way
+        raise TypeError("No converter for %s --> %s" % (input_contenttype, output_contenttype))
+    except Exception as e: # catch all exceptions, we don't want to break an indexing run
+        logging.exception("Exception happened in conversion of item %r rev %d contenttype %s:" % (rev[NAME], rev.revno, rev[CONTENTTYPE]))
+        doc = u'ERROR [%s]' % str(e)
+        return doc
+
+
 class IndexingBackendMixin(object):
     """
     Backend indexing support / functionality using the index.
         name = self.item.name
         uuid = self.item[UUID]
         revno = self.revno
+        logging.debug("Processing: name %s revno %s" % (name, revno))
         if MTIME not in self:
             self[MTIME] = int(time.time())
         if NAME not in self:
             self[UUID] = uuid # do we want the item's uuid in the rev's metadata?
         if CONTENTTYPE not in self:
             self[CONTENTTYPE] = u'application/octet-stream'
-        metas = self
+
+        if app.cfg.log_remote_addr:
+            remote_addr = request.remote_addr
+            if remote_addr:
+                self[ADDRESS] = unicode(remote_addr)
+                hostname = wikiutil.get_hostname(remote_addr)
+                if hostname:
+                    self[HOSTNAME] = hostname
+        try:
+            if flaskg.user.valid:
+                self[USERID] = unicode(flaskg.user.id)
+        except:
+            # when loading xml via script, we have no flaskg.user
+            pass
+
+        self.seek(0) # for a new revision, file pointer points to EOF, rewind first
+        rev_content = convert_to_indexable(self, new_rev=True)
+
         logging.debug("item %r revno %d update index:" % (name, revno))
-        for k, v in metas.items():
+        for k, v in self.items():
             logging.debug(" * rev meta %r: %r" % (k, v))
-        self._index.add_rev(uuid, revno, metas)
+        logging.debug("Indexable content: %r" % (rev_content[:250], ))
+        self._index.add_rev(uuid, revno, self, rev_content)
 
     def remove_index(self):
         """
                 for doc_number in doc_numbers:
                     async_writer.delete_document(doc_number)
 
-    def add_rev(self, uuid, revno, rev):
+    def add_rev(self, uuid, revno, rev, rev_content):
         """
         add a new revision <revno> for item <uuid> with metadata <metas>
         """
             latest_found_document = latest_revs_searcher.document(uuid=rev[UUID],
                                                                   wikiname=self.wikiname
                                                                  )
-        logging.debug("Processing: name %s revno %s" % (rev[NAME], revno))
-        rev.seek(0) # for a new revision, file pointer points to EOF, rewind first
-        rev_content = convert_to_indexable(rev)
-        logging.debug("Indexable content: %r" % (rev_content[:250], ))
         if not all_found_document:
             schema = self.index_object.all_revisions_index.schema
             with AsyncWriter(self.index_object.all_revisions_index) as async_writer: