changeset 860:bf18e19e618d

merged xapian branch
author Thomas Waldmann <tw AT waldmann-edv DOT de>
date Mon, 19 Jun 2006 17:51:32 +0200
parents a71bcc0f27c3 (current diff) c76dd5d97e0e (diff)
children 590a27a9c0e4
files
diffstat 4 files changed, 92 insertions(+), 38 deletions(-) [+]
line wrap: on
line diff
--- a/MoinMoin/Xapian.py	Mon Jun 19 17:40:57 2006 +0200
+++ b/MoinMoin/Xapian.py	Mon Jun 19 17:51:32 2006 +0200
@@ -12,6 +12,7 @@
 from pprint import pprint
 
 import xapian
+from xapian import Query
 from MoinMoin.support.xapwrap import document as xapdoc
 from MoinMoin.support.xapwrap import index as xapidx
 from MoinMoin.parser.text_moin_wiki import Parser as WikiParser
@@ -280,30 +281,35 @@
                        #Y   year (four digits)
     }
 
-
-
     class LockedException(Exception):
         pass
     
     def __init__(self, request):
         self.request = request
         cache_dir = request.cfg.cache_dir
-        self.main_dir = os.path.join(cache_dir, 'xapian')
-        self.dir = os.path.join(self.main_dir, 'index')
+        main_dir = self._main_dir()
+        self.dir = os.path.join(main_dir, 'index')
         filesys.makeDirs(self.dir)
-        self.sig_file = os.path.join(self.main_dir, 'complete')
-        lock_dir = os.path.join(self.main_dir, 'index-lock')
+        self.sig_file = os.path.join(main_dir, 'complete')
+        lock_dir = os.path.join(main_dir, 'index-lock')
         self.lock = lock.WriteLock(lock_dir,
                                    timeout=3600.0, readlocktimeout=60.0)
         self.read_lock = lock.ReadLock(lock_dir, timeout=3600.0)
-        self.queue = UpdateQueue(os.path.join(self.main_dir, "update-queue"),
-                                 os.path.join(self.main_dir, 'update-queue-lock'))
-        
+        self.queue = UpdateQueue(os.path.join(main_dir, 'update-queue'),
+                                 os.path.join(main_dir, 'update-queue-lock'))
+
         # Disabled until we have a sane way to build the index with a
         # queue in small steps.
         ## if not self.exists():
         ##    self.indexPagesInNewThread(request)
 
+    def _main_dir(self):  # root dir holding this wiki's index, locks and update queue
+        if self.request.cfg.xapian_index_dir:  # shared dir configured: one subdir per siteid
+            return os.path.join(self.request.cfg.xapian_index_dir,
+                    self.request.cfg.siteid)
+        else:  # no shared dir configured: keep the index under this wiki's cache dir
+            return os.path.join(self.request.cfg.cache_dir, 'xapian')
+
     def exists(self):
         """ Check if index exists """        
         return os.path.exists(self.sig_file)
--- a/MoinMoin/multiconfig.py	Mon Jun 19 17:40:57 2006 +0200
+++ b/MoinMoin/multiconfig.py	Mon Jun 19 17:51:32 2006 +0200
@@ -276,6 +276,7 @@
                                     # instead of just IPs
 
     xapian_search = False # disabled until xapian is finished
+    xapian_index_dir = None
 
     mail_login = None # or "user pwd" if you need to use SMTP AUTH
     mail_sendmail = None # "/usr/sbin/sendmail -t -i" to not use SMTP, but sendmail
@@ -481,7 +482,7 @@
             name = dirname + '_dir'
             if not getattr(self, name, None):
                 setattr(self, name, os.path.join(data_dir, dirname))
-            
+
         # Try to decode certain names which allow unicode
         self._decode()
 
--- a/MoinMoin/search.py	Mon Jun 19 17:40:57 2006 +0200
+++ b/MoinMoin/search.py	Mon Jun 19 17:51:32 2006 +0200
@@ -15,9 +15,12 @@
 from MoinMoin import wikiutil, config
 from MoinMoin.Page import Page
 
-import Xapian
-from xapian import Query
-from Xapian import UnicodeQuery
+try:
+    import Xapian
+    from Xapian import Query, UnicodeQuery
+    use_stemming = Xapian.use_stemming
+except ImportError:
+    use_stemming = False
 
 #############################################################################
 ### query objects
@@ -90,15 +93,7 @@
                 self.pattern = pattern
         else:
             pattern = re.escape(pattern)
-            if stemmed:
-                # XXX: works, but pretty CPU-intensive (obviously...)
-                self.search_re = re.compile(r'(?=^|[\s]+|[^%s]+)%s[%s]*' %
-                        (config.chars_lower, case and pattern or
-                            ''.join(['[%s%s]' % (ch.upper(), ch.lower())
-                                for ch in pattern]),
-                         config.chars_lower), re.U)
-            else:
-                self.search_re = re.compile(pattern, flags)
+            self.search_re = re.compile(pattern, flags)
             self.pattern = pattern
 
 
@@ -280,7 +275,23 @@
         # Search in page body
         body = page.get_raw_body()
         for match in self.search_re.finditer(body):
-            matches.append(TextMatch(re_match=match))
+            if use_stemming:
+                # somewhere in regular word
+                if body[match.start()] not in config.chars_upper and \
+                        body[match.start()-1] in config.chars_lower:
+                    continue
+
+                post = 0
+                for c in body[match.end():]:
+                    if c in config.chars_lower:
+                        post += 1
+                    else:
+                        break
+
+                matches.append(TextMatch(start=match.start(),
+                        end=match.end()+post))
+            else:
+                matches.append(TextMatch(re_match=match))
 
         # Decide what to do with the results.
         if ((self.negated and matches) or
@@ -306,7 +317,7 @@
             queries = []
             stemmed = []
             for t in terms:
-                if Xapian.use_stemming:
+                if use_stemming:
                     # stemmed OR not stemmed
                     tmp = []
                     for i in analyzer.tokenize(t, flat_stemming=False):
@@ -368,7 +379,23 @@
         # Get matches in page name
         matches = []
         for match in self.search_re.finditer(page.page_name):
-            matches.append(TitleMatch(re_match=match))
+            if use_stemming:
+                # somewhere in regular word
+                if page.page_name[match.start()] not in config.chars_upper and \
+                        page.page_name[match.start()-1] in config.chars_lower:
+                    continue
+
+                post = 0
+                for c in page.page_name[match.end():]:
+                    if c in config.chars_lower:
+                        post += 1
+                    else:
+                        break
+
+                matches.append(TitleMatch(start=match.start(),
+                        end=match.end()+post))
+            else:
+                matches.append(TitleMatch(re_match=match))
         
         if ((self.negated and matches) or
             (not self.negated and not matches)):
@@ -394,7 +421,7 @@
             queries = []
             stemmed = []
             for t in terms:
-                if Xapian.use_stemming:
+                if use_stemming:
                     # stemmed OR not stemmed
                     tmp = []
                     for i in analyzer.tokenize(t, flat_stemming=False):
@@ -1341,8 +1368,11 @@
         return moin search in those pages.
         """
         pages = None
-        index = Xapian.Index(self.request)
-        if index.exists() and self.query.xapian_wanted():
+        try:
+            index = Xapian.Index(self.request)
+        except NameError:
+            index = None
+        if index and index.exists() and self.query.xapian_wanted():
             self.request.clock.start('_xapianSearch')
             try:
                 from MoinMoin.support import xapwrap
--- a/docs/CHANGES.fpletz	Mon Jun 19 17:40:57 2006 +0200
+++ b/docs/CHANGES.fpletz	Mon Jun 19 17:51:32 2006 +0200
@@ -2,12 +2,6 @@
 =============================
 
   Known main issues:
-    * _moinSearch matches all characters in words when stemming,
-      workaround uses too much CPU
-    * Matching of stemmed terms is generally unreliable because the
-      matches (and consequently the count) are not obtained by Xapian
-      as _moinSearch is called with the Xapian results. Use the Xapian
-      matches somehow?
     * Regex searching with Xapian?
 
   ToDo:
@@ -15,14 +9,16 @@
       metadata)
     * Mockup the new search UI
     * Write/update documentation for all the new search stuff
-    * Wikifarms support (multiple indexes)
-    * Indexing and searching of Categories (new term prefix)
-    * Finish the stemming/matching stuff
+    * Indexing and searching of categories (new term prefix)
+    * MoinMoin.Xapian.use_stemming -> request.cfg.xapian_use_stemming
 
   New Features:
     * Faster search thanks to Xapian
     * Searching for languages with new prefix lang/language, i.e. lang:de
       Note: Only available when Xapian is activated
+    * New config options:
+        xapian_search        False  enables xapian-powered search
+        xapian_index_dir     None   directory for xapian indices
   
   Bugfixes (only stuff that is buggy in moin/1.6 main branch):
     * ...
@@ -61,5 +57,26 @@
       advice on how to detect and match them reliably using the current
       framework
 
-2006-06-18
+2006-06-19
+    * Introducing xapian_index_dir as a global directory for multiple
+      xapian indices i.e. for wikifarms.
 
+      Layout:
+            xapian_index_dir/
+                siteid1/
+                    complete
+                    index/
+                    index-lock/
+                    update-queue-lock/
+                siteid2/
+                    complete
+                    index/
+                    index-lock/
+                    update-queue-lock/
+                ...
+
+        Possible extension: Xapian can handle multiple databases, maybe
+        allow searching across defined wikis on a wikifarm
+    * All stemming/matching issues resolved (hopefully)
+    * Works now without xapian installed (enhance error reporting)
+