changeset 893:3cbdb8f14e98 storage-ng

content indexing: do a real convert_to_indexable implementation; fix the content indexing test. Note: the PseudoRev / PseudoItem classes work around the converters expecting a revision-like object and accessing attributes on it, including the outdated .revno!
author Thomas Waldmann <tw AT waldmann-edv DOT de>
date Mon, 26 Sep 2011 20:03:57 +0200
parents 79f80ce3cf78
children 93ae039cb363
files MoinMoin/storage/middleware/_tests/test_indexing.py MoinMoin/storage/middleware/indexing.py
diffstat 2 files changed, 65 insertions(+), 4 deletions(-) [+]
line wrap: on
line diff
--- a/MoinMoin/storage/middleware/_tests/test_indexing.py	Mon Sep 26 18:49:49 2011 +0200
+++ b/MoinMoin/storage/middleware/_tests/test_indexing.py	Mon Sep 26 20:03:57 2011 +0200
@@ -324,13 +324,14 @@
         # TODO: this is a very simple check that assumes that data is put 1:1
         # into index' CONTENT field.
         item_name = u'foo'
-        meta = dict(name=item_name)
-        data = 'some test content'
+        meta = dict(name=item_name, contenttype=u'text/plain')
+        data = 'some test content\n'
         item = self.imw[item_name]
         data_file = StringIO(data)
         with item.store_revision(meta, data_file) as rev:
             expected_revid = rev.revid
         doc = self.imw._document(content=u'test')
+        assert doc is not None
         assert expected_revid == doc[REVID]
         assert unicode(data) == doc[CONTENT]
 
--- a/MoinMoin/storage/middleware/indexing.py	Mon Sep 26 18:49:49 2011 +0200
+++ b/MoinMoin/storage/middleware/indexing.py	Mon Sep 26 20:03:57 2011 +0200
@@ -103,7 +103,12 @@
     return doc
 
 
-def convert_to_indexable(meta, data):
+from MoinMoin.util.mime import Type, type_moin_document
+from MoinMoin.util.tree import moin_page
+from MoinMoin.converter import default_registry
+from MoinMoin.util.iri import Iri
+
+def convert_to_indexable(meta, data, is_new=False):
     """
     Convert revision data to a indexable content.
 
@@ -113,9 +118,64 @@
                  ready to read all indexable content from it. if you have just
                  written that content or already read from it, you need to call
                  rev.seek(0) before calling convert_to_indexable(rev).
+    :param is_new: if this is for a new revision and we shall modify
+                   metadata as a side effect
     :returns: indexable content, text/plain, unicode object
     """
-    return unicode(data.read()) # TODO integrate real thing after merge into moin2 code base.
+    class PseudoRev(object):
+        def __init__(self, meta, data):
+            self.meta = meta
+            self.data = data
+            self.revno = -1 # TODO: remove access to this in converters
+            class PseudoItem(object):
+                def __init__(self, name):
+                    self.name = name
+            self.item = PseudoItem(meta.get(NAME))
+
+    rev = PseudoRev(meta, data)
+    try:
+        # TODO use different converter mode?
+        # Maybe we want some special mode for the input converters (so they emit
+        # different output than for normal rendering), esp. for the non-markup
+        # content types (images, etc.).
+        input_contenttype = meta[CONTENTTYPE]
+        output_contenttype = 'text/plain'
+        type_input_contenttype = Type(input_contenttype)
+        type_output_contenttype = Type(output_contenttype)
+        reg = default_registry
+        # first try a direct conversion (this could be useful for extraction
+        # of (meta)data from binary types, like from images or audio):
+        conv = reg.get(type_input_contenttype, type_output_contenttype)
+        if conv:
+            doc = conv(rev, input_contenttype)
+            return doc
+        # otherwise try via DOM as intermediate format (this is useful if
+        # input type is markup, to get rid of the markup):
+        input_conv = reg.get(type_input_contenttype, type_moin_document)
+        refs_conv = reg.get(type_moin_document, type_moin_document, items='refs')
+        output_conv = reg.get(type_moin_document, type_output_contenttype)
+        if input_conv and output_conv:
+            doc = input_conv(rev, input_contenttype)
+            # We do not convert smileys, includes, macros, links, because
+            # it does not improve search results or even makes results worse.
+            # We do run the referenced converter, though, to extract links and
+            # transclusions.
+            if is_new:
+                # we only can modify new, uncommitted revisions, not stored revs
+                i = Iri(scheme='wiki', authority='', path='/' + meta[NAME])
+                doc.set(moin_page.page_href, unicode(i))
+                refs_conv(doc)
+                # side effect: we update some metadata:
+                meta[ITEMLINKS] = refs_conv.get_links()
+                meta[ITEMTRANSCLUSIONS] = refs_conv.get_transclusions()
+            doc = output_conv(doc)
+            return doc
+        # no way
+        raise TypeError("No converter for %s --> %s" % (input_contenttype, output_contenttype))
+    except Exception as e: # catch all exceptions, we don't want to break an indexing run
+        logging.exception("Exception happened in conversion of item %r rev %s contenttype %s:" % (meta[NAME], meta[REVID], meta.get(CONTENTTYPE, '')))
+        doc = u'ERROR [%s]' % str(e)
+        return doc
 
 
 class IndexingMiddleware(object):