changeset 947:41f6f7708466

merged xapian branch
author Thomas Waldmann <tw AT waldmann-edv DOT de>
date Mon, 17 Jul 2006 03:08:12 +0200
parents 7c8e8d370740 (current diff) 72aeb2ba133d (diff)
children 28ea5b3802b1
files MoinMoin/script/index/build.py
diffstat 6 files changed, 77 insertions(+), 28 deletions(-) [+]
line wrap: on
line diff
--- a/MoinMoin/script/index/build.py	Mon Jul 17 01:50:55 2006 +0200
+++ b/MoinMoin/script/index/build.py	Mon Jul 17 03:08:12 2006 +0200
@@ -22,7 +22,7 @@
         )
         self.parser.add_option(
             "--mode", metavar="MODE", dest="mode",
-            help="either add (unconditionally add to index) or update (update an existing index)"
+            help="either add (unconditionally add to index), update (update an existing index) or rebuild (remove and add)"
         )
 
     def mainloop(self):
@@ -40,5 +40,4 @@
     def command(self):
         from MoinMoin.search.Xapian import Index
         Index(self.request).indexPages(self.files, self.options.mode)
-        #Index(self.request).test(self.request)
 
--- a/MoinMoin/search/Xapian.py	Mon Jul 17 01:50:55 2006 +0200
+++ b/MoinMoin/search/Xapian.py	Mon Jul 17 03:08:12 2006 +0200
@@ -8,7 +8,7 @@
 """
 debug = True
 
-import sys, os, re, codecs, time
+import sys, os, re, codecs, time
 from pprint import pprint
 
 import xapian
@@ -237,6 +237,13 @@
             Assumes that the write lock is acquired
         """
         fs_rootpage = 'FS' # XXX FS hardcoded
+
+        # rebuilding the DB: delete it and add everything
+        if mode == 'rebuild':
+            for f in os.listdir(self.dir):
+                os.unlink(os.path.join(self.dir, f))
+            mode = 'add'
+
         try:
             wikiname = request.cfg.interwikiname or 'Self'
             itemid = "%s:%s" % (wikiname, os.path.join(fs_rootpage, filename))
--- a/MoinMoin/search/builtin.py	Mon Jul 17 01:50:55 2006 +0200
+++ b/MoinMoin/search/builtin.py	Mon Jul 17 03:08:12 2006 +0200
@@ -149,7 +149,7 @@
         lock_dir = os.path.join(main_dir, 'index-lock')
         self.lock = lock.WriteLock(lock_dir,
                                    timeout=3600.0, readlocktimeout=60.0)
-        self.read_lock = lock.ReadLock(lock_dir, timeout=3600.0)
+        #self.read_lock = lock.ReadLock(lock_dir, timeout=3600.0)
         self.queue = UpdateQueue(os.path.join(main_dir, 'update-queue'),
                                  os.path.join(main_dir, 'update-queue-lock'))
 
@@ -172,12 +172,12 @@
         raise NotImplemented
 
     def search(self, query):
-        if not self.read_lock.acquire(1.0):
-            raise self.LockedException
-        try:
-            hits = self._search(query)
-        finally:
-            self.read_lock.release()
+        #if not self.read_lock.acquire(1.0):
+        #    raise self.LockedException
+        #try:
+        hits = self._search(query)
+        #finally:
+        #    self.read_lock.release()
         return hits
 
     def update_page(self, page):
@@ -415,17 +415,25 @@
         else:
             return self._moinSearch(pages)
 
+    def _xapianMatchDecider(self, term, pos):
+        if term[0] == 'S':      # TitleMatch
+            return TitleMatch(start=pos, end=pos+len(term)-1)
+        else:                   # TextMatch (incl. headers)
+            return TextMatch(start=pos, end=pos+len(term))
+        
     def _xapianMatch(self, page, uid):
-        matches = []
+        """ Get all relevant Xapian matches per document id """
+        positions = {}
         term = self._xapianEnquire.get_matching_terms_begin(uid)
-        #print hit['uid']
         while term != self._xapianEnquire.get_matching_terms_end(uid):
-            print term.get_term(), ':', list(self._xapianIndex.termpositions(uid, term.get_term()))
-            for pos in self._xapianIndex.termpositions(uid, term.get_term()):
-                matches.append(TextMatch(start=pos,
-                    end=pos+len(term.get_term())))
+            term_name = term.get_term()
+            for pos in self._xapianIndex.termpositions(uid,term.get_term()):
+                if pos not in positions or \
+                        len(positions[pos]) < len(term_name):
+                    positions[pos] = term_name
             term.next()
-        return matches
+        return [self._xapianMatchDecider(term, pos) for pos, term
+            in positions.iteritems()]
 
     def _moinSearch(self, pages=None):
         """ Search pages using moin's built-in full text search 
@@ -444,9 +452,11 @@
         return hits
     
     def _moinMatch(self, page, uid):
+        """ Just kick off regular moinSearch """
         return self.query.search(page)
 
     def _getHits(self, pages, matchSearchFunction):
+        """ Get the hit tuples in pages through matchSearchFunction """
         hits = []
         fs_rootpage = self.fs_rootpage
         for hit in pages:
@@ -455,6 +465,7 @@
                 uid = hit['uid']
             else:
                 valuedict = hit
+                uid = None
 
             wikiname = valuedict['wikiname']
             pagename = valuedict['pagename']
@@ -468,9 +479,9 @@
                     else:
                         hits.append((wikiname, page, attachment, None))
                 else:
-                    match = matchSearchFunction(page, uid)
-                    if match:
-                        hits.append((wikiname, page, attachment, match))
+                    matches = matchSearchFunction(page, uid)
+                    if matches:
+                        hits.append((wikiname, page, attachment, matches))
             else: # other wiki
                 hits.append((wikiname, pagename, attachment, None))
         return hits
--- a/MoinMoin/search/results.py	Mon Jul 17 01:50:55 2006 +0200
+++ b/MoinMoin/search/results.py	Mon Jul 17 03:08:12 2006 +0200
@@ -494,7 +494,8 @@
         start = len(header)
         # Find first match after start
         for i in xrange(len(matches)):
-            if matches[i].start >= start:
+            if matches[i].start >= start and \
+                    isinstance(matches[i], TextMatch):
                 return i, start
         return 0, 0
 
--- a/MoinMoin/support/xapwrap/document.py	Mon Jul 17 01:50:55 2006 +0200
+++ b/MoinMoin/support/xapwrap/document.py	Mon Jul 17 03:08:12 2006 +0200
@@ -145,6 +145,9 @@
 
         # add text fields
         for field in self.textFields:
+            # XXX: terms textFields won't get numbered
+            # after each other, needed for titles
+            position = 0
             for token in analyzer.tokenize(field.text):
                 if isinstance(token, tuple):
                     token, position = token
@@ -163,19 +166,20 @@
                 # the process, the string length could expand, so we
                 # need to check here as well.
                 d.add_posting(checkKeyLen(token), position)
-            position += INTER_FIELD_POSITION_GAP
+            #position += INTER_FIELD_POSITION_GAP
 
             if field.prefix:
                 prefix = field.name
                 for token in analyzer.tokenize(field.text):
                     if isinstance(token, tuple):
-                        token = token[0]
+                        token, position = token
+                    else:
+                        position += 1
                     # token is unicode, but gets converted to UTF-8
                     # by makePairForWrite:
                     term = makePairForWrite(prefix, token, prefixMap)
                     d.add_posting(term, position)
-                    position += 1
-                position += INTER_FIELD_POSITION_GAP
+                #position += INTER_FIELD_POSITION_GAP
 
         # add keyword fields
         for field in self.keywords:
--- a/docs/CHANGES.fpletz	Mon Jul 17 01:50:55 2006 +0200
+++ b/docs/CHANGES.fpletz	Mon Jul 17 03:08:12 2006 +0200
@@ -5,14 +5,18 @@
     * Only term-based regex searching possible, modifier or heuristic to
       enable usage of _moinSearch for full compatibility?
     * HACK: MoinMoin.Xapian.Index._get_languages (wait for proper metadata)
+    * Positions saved in Xapian aren't always correct, check. Code
+      generally needs some more love.
 
   ToDo:
     * Implement the new search UI
     * Write/update documentation for all the new search stuff
     * Indexing and searching of categories (new term prefix)
-    * Drop _moinSearch when using Xapian and use term positions provided
-      by Xapian itself, needs some reworking of WikiAnalyzer/xapwrap to
-      get the position of stemmed words right
+    * Reevaluate Xapwrap, possibly drop it and rip out usable stuff
+      (i.e. ExceptionTranslator)
+    * Add stemming support for highlighting stuff:
+        1. regexp for whole word (all lowercase), or
+        2. just the root of the word
 
   New Features:
     * Faster search thanks to Xapian
@@ -122,3 +126,26 @@
     * Basic (quick and dirty, limitations and bugs included, but
       commit-ready) implementation of getting matches out of the Xapian DB
 
+2006-07-08
+    * No work: daytrip to Munich
+
+2006-07-09
+    * Bugfix for _moinSearch (not using Xapian)
+
+2006-07-11
+    * Make matches which we get from Xapian more reliable
+    * Add TitleMatch support
+    * Xapwrap needed some tuning (aka hacking), think about dropping
+      and/or rewriting much of its code as it doesn't always fit (and
+      probably won't in the future)
+
+2006-07-12
+2006-07-13
+    * No work
+
+2006-07-14
+    * Minor bugfix for TitleMatch, now works correctly
+    * First interesting match must be a TextMatch
+    * Comment read_lock code from BaseIndex (should not be needed)
+    * Support complete rebuild of the database (delete and add)
+