changeset 5314:e005834bbf85

less disruptive xapian index rebuild (details below) Improved --mode=rebuild: Previously it first killed the old index files, then started to build a new index (takes quite long for big wikis) - until the new index was finished, search was either falling back to slow search or was having even bigger trouble. Now it builds the new index to a separate directory, then kills the old index, then moves the new index to the old place. This can still lead to trouble, but for a much shorter time. If one wants to avoid that, there is a new 2-stage rebuild now: --mode=buildnewindex - just build the new index (in a separate directory) This does not interfere with wiki search at all, thus it is no problem if it takes long. --mode=usenewindex - kill old index, move new index into its place This can be done with wiki shut down, so one does not have problems with open files or running searches. As this is a quick operation, downtime is short. Improved / adjusted docs accordingly.
author Thomas Waldmann <tw AT waldmann-edv DOT de>
date Sat, 28 Nov 2009 17:03:28 +0100
parents 37b22f678801
children 329241cd48b6 c3316c25365c
files MoinMoin/script/index/build.py MoinMoin/search/Xapian/indexing.py MoinMoin/search/builtin.py
diffstat 3 files changed, 72 insertions(+), 25 deletions(-) [+]
line wrap: on
line diff
--- a/MoinMoin/script/index/build.py	Sat Nov 28 13:51:13 2009 +0100
+++ b/MoinMoin/script/index/build.py	Sat Nov 28 17:03:28 2009 +0100
@@ -2,10 +2,14 @@
 """
 MoinMoin - build xapian search engine's index
 
-@copyright: 2006 MoinMoin:ThomasWaldmann
+@copyright: 2006-2009 MoinMoin:ThomasWaldmann
 @license: GNU GPL, see COPYING for details.
 """
 
+import os
+import errno
+import shutil
+
 from MoinMoin.script import MoinScript
 
 class IndexScript(MoinScript):
@@ -22,17 +26,38 @@
     --config-dir=/path/to/my/cfg/ --wiki-url=wiki.example.org/
 
 [build-options] see below:
-    0. You must run this script as owner of the wiki files, usually this is the
-       web server user.
+    Please note:
+    * You must run this script as the owner of the wiki files,
+      usually this is the web server user.
+    * You may add the build-option --files=files.lst to let the indexer
+      also consider the filesystem filenames contained in that file (one
+      filename per line). Search results from these files will be "found"
+      under a special pseudo page called FS (like File System).
+      Without this option, the indexer will just consider pages and attachments.
 
-    1. To add the files from '/files.lst' to the index
-       moin ... index build --files /files.lst --mode add
+    1. Conditionally (considering modification time) update the index:
+       moin ... index build --mode=update
 
-    2. To update the index with the files from '/files.lst'
-       moin ... index build --files /files.lst --mode update
+    2. Unconditionally add to the index:
+       moin ... index build --mode=add
 
-    3. To rebuild the index with the files from '/files.lst'
-       moin ... index build --files /files.lst --mode rebuild
+    3. Completely rebuild the index (1-stage):
+       moin ... index build --mode=rebuild
+
+       Note: until it has completely built the new index, the wiki will still
+       use the old index. After rebuild has completed, it kills the old index
+       and moves the new index into its place.
+       If the wiki uses the index at that moment, that might have unwanted side
+       effects. If you want to avoid that and you can accept a short downtime,
+       consider using this safer method:
+
+       Completely rebuild the index (2-stage):
+       # takes long, does not interfere with wiki searches:
+       moin ... index build --mode=buildnewindex
+       stop this moin wiki process(es)
+       # quick, replaces the old index with the new one:
+       moin ... index build --mode=usenewindex
+       start this moin wiki process(es)
 """
 
     def __init__(self, argv, def_values):
@@ -43,7 +68,8 @@
         )
         self.parser.add_option(
             "--mode", metavar="MODE", dest="mode",
-            help="either add (unconditionally add to index), update (update an existing index) or rebuild (remove and add)"
+            help="either add (unconditionally add), update (conditional update), rebuild (complete 1-stage index rebuild)"
+                 " or buildnewindex and usenewindex (complete 2-stage index rebuild)"
         )
 
     def mainloop(self):
@@ -60,5 +86,36 @@
 
     def command(self):
         from MoinMoin.search.Xapian import XapianIndex
-        XapianIndex(self.request).indexPages(self.files, self.options.mode)
+        mode = self.options.mode
+        if mode in ('rebuild', 'buildnewindex'):
+            # rebuilding the DB into a new index directory, so the rebuild
+            # process does not interfere with the currently in-use DB
+            idx_mode, idx_name = 'add', 'index.new'
+        elif mode in ('add', 'update'):
+            # update/add in-place
+            idx_mode, idx_name = mode, 'index'
+        elif mode == 'usenewindex':
+            pass # nothing to do
+        else:
+            pass # XXX give error msg about invalid mode
 
+        if mode != 'usenewindex':
+            idx = XapianIndex(self.request, name=idx_name)
+            idx.indexPages(self.files, idx_mode)
+
+        if mode in ('rebuild', 'usenewindex'):
+            # 'rebuild' is still a bit dirty, because just killing the old index
+            # will make currently running searches fail. Thus, do this at a time
+            # with little wiki activity or, better, use 'buildnewindex' and
+            # 'usenewindex' (see above).
+            # XXX code here assumes that idx.db is a directory
+            # TODO improve this with xapian stub DBs
+            idx_old = XapianIndex(self.request, name='index').db
+            idx_new = XapianIndex(self.request, name='index.new').db
+            try:
+                shutil.rmtree(idx_old)
+            except OSError, err:
+                if err.errno != errno.ENOENT: # ignore it if we have no current index
+                    raise
+            os.rename(idx_new, idx_old)
+
--- a/MoinMoin/search/Xapian/indexing.py	Sat Nov 28 13:51:13 2009 +0100
+++ b/MoinMoin/search/Xapian/indexing.py	Sat Nov 28 17:03:28 2009 +0100
@@ -119,9 +119,9 @@
 
 class XapianIndex(BaseIndex):
 
-    def __init__(self, request):
+    def __init__(self, request, name='index'):
         super(XapianIndex, self).__init__(request)
-        self.db = os.path.join(self.main_dir, 'index')
+        self.db = os.path.join(self.main_dir, name)
 
     def _main_dir(self):
         """ Get the directory of the xapian index """
@@ -544,15 +544,6 @@
             # Index all pages
             pages = request.rootpage.getPageList(user='', exists=1)
 
-        # rebuilding the DB: delete it and add everything
-        # XXX assumes that self.db is a xapian index directory
-        # XXX killing the index this way leads to searches failing, if the
-        # XXX wiki tries to use the index while index is rebuilt.
-        if mode == 'rebuild':
-            for fname in os.listdir(self.db):
-                os.unlink(os.path.join(self.db, fname))
-            mode = 'add'
-
         try:
             connection = self.get_indexer_connection()
             self.touch()
--- a/MoinMoin/search/builtin.py	Sat Nov 28 13:51:13 2009 +0100
+++ b/MoinMoin/search/builtin.py	Sat Nov 28 17:03:28 2009 +0100
@@ -150,7 +150,7 @@
         """ Index pages (and files, if given)
 
         @param files: iterator or list of files to index additionally
-        @param mode: set the mode of indexing the pages, either 'update', 'add' or 'rebuild'
+        @param mode: set the mode of indexing the pages, either 'update' or 'add'
         @param pages: list of pages to index, if not given, all pages are indexed
         """
         start = time.time()
@@ -166,8 +166,7 @@
 
         @param request: current request
         @param files: iterator or list of files to index additionally
-        @param mode: set the mode of indexing the pages, either 'update',
-        'add' or 'rebuild'
+        @param mode: set the mode of indexing the pages, either 'update' or 'add'
         @param pages: list of pages to index, if not given, all pages are indexed
 
         """