changeset 367:eb34de0b4489

Added patch for whoosh 1.8.4; removed content indexing because the Item class is not available when flaskg is not initialized
author Michael Mayorov <marchael@kb.csu.ru>
date Tue, 12 Jul 2011 02:30:14 +0000
parents 79a8c0c5a820
children 430223bb9701
files MoinMoin/items/__init__.py MoinMoin/script/maint/index.py whoosh184-fix.patch
diffstat 3 files changed, 63 insertions(+), 22 deletions(-) [+]
line wrap: on
line diff
--- a/MoinMoin/items/__init__.py	Mon Jul 11 22:07:46 2011 +0000
+++ b/MoinMoin/items/__init__.py	Tue Jul 12 02:30:14 2011 +0000
@@ -225,10 +225,6 @@
         """
         Return the internal representation of a document using a DOM Tree
         """
-        from MoinMoin.util.clock import Clock
-        with app.test_request_context():
-            flaskg.clock = Clock()
-            flaskg.clock.start('conv_in_dom')
         flaskg.clock.start('conv_in_dom')
         hash_name = HASH_ALGORITHM
         hash_hexdigest = self.rev.get(hash_name)
@@ -265,9 +261,6 @@
                     doc = smiley_conv(doc)
             if cid:
                 app.cache.set(cid, doc)
-        with app.test_request_context():
-            flaskg.clock = Clock()
-            flaskg.clock.stop('conv_in_dom')
         flaskg.clock.stop('conv_in_dom')
         return doc
 
--- a/MoinMoin/script/maint/index.py	Mon Jul 11 22:07:46 2011 +0000
+++ b/MoinMoin/script/maint/index.py	Tue Jul 12 02:30:14 2011 +0000
@@ -14,11 +14,11 @@
 from whoosh.index import open_dir, create_in, exists_in
 
 from MoinMoin.search.indexing import WhooshIndex
-from MoinMoin.config import MTIME, NAME
-from MoinMoin.items import Item
-from MoinMoin.converter.moinwiki_out import Converter
+from MoinMoin.config import MTIME, NAME, CONTENTTYPE
 from MoinMoin.error import FatalError
 from MoinMoin.storage.error import NoSuchItemError
+from MoinMoin.util.mime import Type
+
 from MoinMoin import log
 logging = log.getLogger(__name__)
 
@@ -63,12 +63,12 @@
                         for rev_no in item.list_revisions():
                             if "all_revisions_index" in indexnames:
                                 revision = item.get_revision(rev_no)
-                                metadata = backend_to_index(item, rev_no, all_rev_field_names)
+                                metadata = backend_to_index(revision, rev_no, all_rev_field_names)
                                 all_rev_writer.add_document(**metadata)
                         # revision is now the latest revision of this item
                         if "latest_revisions_index" in indexnames:
                             revision = item.get_revision(rev_no)
-                            metadata = backend_to_index(item, rev_no, latest_rev_field_names)
+                            metadata = backend_to_index(revision, rev_no, latest_rev_field_names)
                             latest_rev_writer.add_document(**metadata)
 
         def update_index(indexnames_schemas):
@@ -97,7 +97,8 @@
             if "latest_revisions_index" in indexnames and latest_documents:
                 with latest_rev_index.writer() as latest_rev_writer:
                     for item, rev_no in latest_documents:
-                        converted_rev = backend_to_index(item, rev_no, latest_rev_field_names)
+                        revision = item.get_revision(rev_no)
+                        converted_rev = backend_to_index(revision, rev_no, latest_rev_field_names)
                         found = latest_rev_searcher.document(name_exact=item.name)
                         if not found:
                             latest_rev_writer.add_document(**converted_rev)
@@ -121,7 +122,8 @@
                 with all_rev_index.writer() as all_rev_writer:
                     for item, rev_nos in create_documents:
                         for rev_no in rev_nos:
-                            converted_rev = backend_to_index(item, rev_no, all_rev_field_names)
+                            revision = item.get_revision(rev_no)
+                            converted_rev = backend_to_index(revision, rev_no, all_rev_field_names)
                             all_rev_writer.add_document(**converted_rev)
 
         def clean_index(indexnames_schemas):
@@ -178,25 +180,43 @@
             revs_found = searcher.documents(name_exact=name)
             return [rev["rev_no"] for rev in revs_found]
 
-        def backend_to_index(item, rev_no, schema_fields):
+        def backend_to_index(backend_rev, rev_no, schema_fields):
             """
             Convert fields from backend format to whoosh schema
             """
-            backend_rev = item.get_revision(rev_no)
             metadata = dict([(str(key), value)
                               for key, value in backend_rev.items()
                               if key in schema_fields])
             metadata[MTIME] = datetime.datetime.fromtimestamp(metadata[MTIME])
             metadata["name_exact"] = backend_rev[NAME]
             metadata["rev_no"] = rev_no
-            metadata["content"] = convert_data(item, rev_no)
+            metadata["content"] = convert_data(backend_rev, rev_no)
             return metadata
 
-        def convert_data(item, rev_no):
-            converter = Converter()
-            item = Item.create(item=item, rev_no=rev_no)
-            dom = item.internal_representation()
-            return converter(dom)
+        def convert_data(rev, rev_no):
+            # this is a q&d hack because MoinMoin.items.Item is not made to be
+            # called outside of a request (e.g. due to usage of flaskg.*). Later,
+            # we need a more general way to transform revision content to
+            # indexable content (e.g. using a contenttype-dependent Converter
+            # that removes markup for markup data or that extracts text from
+            # "binary" items).
+            ct = Type(rev[CONTENTTYPE])
+            if ct.type == 'text':
+                coding = ct.parameters.get('charset', 'ascii')
+                # XXX does not work yet as NewRevision does not have .seek and .read
+                #rev.seek(0)
+                #data = rev.read()
+                data = 'TODO'
+                try:
+                    data = data.decode(coding)
+                except UnicodeDecodeError as err:
+                    logging.warning("Item %r revision %d failed to decode (%s)" % (
+                                    rev[NAME], rev_no, str(err)))
+                    data = unicode(str(err))
+            else:
+                # TODO: support non-text items
+                data = u''
+            return data
 
         def do_action(action, indexnames_schemas):
             if action == "build":
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/whoosh184-fix.patch	Tue Jul 12 02:30:14 2011 +0000
@@ -0,0 +1,28 @@
+diff -r fb3ac9f4b717 env/lib/python2.6/site-packages/whoosh/filedb/multiproc.py
+--- a/whoosh/filedb/multiproc.py Thu Jun 16 16:17:59 2011 -0400
++++ b/whoosh/filedb/multiproc.py Mon Jun 20 18:06:37 2011 +0200
+@@ -157,6 +157,9 @@
+     
+     def commit(self, **kwargs):
+         try:
++            # index the remaining stuff in self.docbuffer
++            self._enqueue()
++
+             for task in self.tasks:
+                 self.jobqueue.put(None)
+             
+diff -r 273279b7f463 env/lib/python2.6/site-packages/whoosh/fields.py
+--- a/whoosh/fields.py Mon Jun 06 18:06:59 2011 -0400
++++ b/whoosh/fields.py Tue Jun 07 12:10:51 2011 +0200
+@@ -152,8 +152,8 @@
+
+         if not self.format:
+             raise Exception("%s field cannot index without a format" % self.__class__)
+-        if not isinstance(value, unicode):
+-            raise ValueError("%r is not unicode" % value)
++        if not isinstance(value, (text_type, list, tuple)):
++            raise ValueError("%r is not unicode or sequence" % value)
+         return self.format.word_values(value, mode="index", **kwargs)
+
+     def process_text(self, qstring, mode='', **kwargs):
+