changeset 945:248789a3f155

improving positions fetched from xapian, TitleMatch support, bugfixes for the current code
author Franz Pletz <fpletz AT franz-pletz DOT org>
date Fri, 14 Jul 2006 13:17:15 +0200
parents 28ae528ca238
children 72aeb2ba133d
files MoinMoin/search/builtin.py MoinMoin/search/results.py MoinMoin/support/xapwrap/document.py docs/CHANGES.fpletz
diffstat 4 files changed, 56 insertions(+), 18 deletions(-) [+]
line wrap: on
line diff
--- a/MoinMoin/search/builtin.py	Fri Jul 07 12:30:34 2006 +0200
+++ b/MoinMoin/search/builtin.py	Fri Jul 14 13:17:15 2006 +0200
@@ -415,17 +415,25 @@
         else:
             return self._moinSearch(pages)
 
+    def _xapianMatchDecider(self, term, pos):
+        if term[0] == 'S':      # TitleMatch
+            return TitleMatch(start=pos, end=pos+len(term)-1)
+        else:                   # TextMatch (incl. headers)
+            return TextMatch(start=pos, end=pos+len(term))
+        
     def _xapianMatch(self, page, uid):
-        matches = []
+        """ Get all relevant Xapian matches per document id """
+        positions = {}
         term = self._xapianEnquire.get_matching_terms_begin(uid)
-        #print hit['uid']
         while term != self._xapianEnquire.get_matching_terms_end(uid):
-            print term.get_term(), ':', list(self._xapianIndex.termpositions(uid, term.get_term()))
-            for pos in self._xapianIndex.termpositions(uid, term.get_term()):
-                matches.append(TextMatch(start=pos,
-                    end=pos+len(term.get_term())))
+            term_name = term.get_term()
+            for pos in self._xapianIndex.termpositions(uid,term.get_term()):
+                if pos not in positions or \
+                        len(positions[pos]) < len(term_name):
+                    positions[pos] = term_name
             term.next()
-        return matches
+        return [self._xapianMatchDecider(term, pos) for pos, term
+            in positions.iteritems()]
 
     def _moinSearch(self, pages=None):
         """ Search pages using moin's built-in full text search 
@@ -444,9 +452,11 @@
         return hits
     
     def _moinMatch(self, page, uid):
+        """ Just kick off regular moinSearch """
         return self.query.search(page)
 
     def _getHits(self, pages, matchSearchFunction):
+        """ Get the hit tuples in pages through matchSearchFunction """
         hits = []
         fs_rootpage = self.fs_rootpage
         for hit in pages:
@@ -455,6 +465,7 @@
                 uid = hit['uid']
             else:
                 valuedict = hit
+                uid = None
 
             wikiname = valuedict['wikiname']
             pagename = valuedict['pagename']
@@ -468,9 +479,9 @@
                     else:
                         hits.append((wikiname, page, attachment, None))
                 else:
-                    match = matchSearchFunction(page, uid)
-                    if match:
-                        hits.append((wikiname, page, attachment, match))
+                    matches = matchSearchFunction(page, uid)
+                    if matches:
+                        hits.append((wikiname, page, attachment, matches))
             else: # other wiki
                 hits.append((wikiname, pagename, attachment, None))
         return hits
--- a/MoinMoin/search/results.py	Fri Jul 07 12:30:34 2006 +0200
+++ b/MoinMoin/search/results.py	Fri Jul 14 13:17:15 2006 +0200
@@ -494,7 +494,8 @@
         start = len(header)
         # Find first match after start
         for i in xrange(len(matches)):
-            if matches[i].start >= start:
+            if matches[i].start >= start and \
+                    isinstance(matches[i], TextMatch):
                 return i, start
         return 0, 0
 
--- a/MoinMoin/support/xapwrap/document.py	Fri Jul 07 12:30:34 2006 +0200
+++ b/MoinMoin/support/xapwrap/document.py	Fri Jul 14 13:17:15 2006 +0200
@@ -145,6 +145,9 @@
 
         # add text fields
         for field in self.textFields:
+            # XXX: positions restart at 0 for every text field, so terms of
+            # different textFields aren't numbered consecutively (needed for titles)
+            position = 0
             for token in analyzer.tokenize(field.text):
                 if isinstance(token, tuple):
                     token, position = token
@@ -163,19 +166,20 @@
                 # the process, the string length could expand, so we
                 # need to check here as well.
                 d.add_posting(checkKeyLen(token), position)
-            position += INTER_FIELD_POSITION_GAP
+            #position += INTER_FIELD_POSITION_GAP
 
             if field.prefix:
                 prefix = field.name
                 for token in analyzer.tokenize(field.text):
                     if isinstance(token, tuple):
-                        token = token[0]
+                        token, position = token
+                    else:
+                        position += 1
                     # token is unicode, but gets converted to UTF-8
                     # by makePairForWrite:
                     term = makePairForWrite(prefix, token, prefixMap)
                     d.add_posting(term, position)
-                    position += 1
-                position += INTER_FIELD_POSITION_GAP
+                #position += INTER_FIELD_POSITION_GAP
 
         # add keyword fields
         for field in self.keywords:
--- a/docs/CHANGES.fpletz	Fri Jul 07 12:30:34 2006 +0200
+++ b/docs/CHANGES.fpletz	Fri Jul 14 13:17:15 2006 +0200
@@ -5,14 +5,15 @@
     * Only term-based regex searching possible, modifier or heuristic to
       enable usage of _moinSearch for full compatibility?
     * HACK: MoinMoin.Xapian.Index._get_languages (wait for proper metadata)
+    * Positions saved in Xapian aren't always correct and need to be
+      checked. The code generally needs some more love.
 
   ToDo:
     * Implement the new search UI
     * Write/update documentation for all the new search stuff
     * Indexing and searching of categories (new term prefix)
-    * Drop _moinSearch when using Xapian and use term positions provided
-      by Xapian itself, needs some reworking of WikiAnalyzer/xapwrap to
-      get the position of stemmed words right
+    * Reevaluate Xapwrap, possibly drop it and rip out usable stuff
+      (i.e. ExceptionTranslator)
 
   New Features:
     * Faster search thanks to Xapian
@@ -122,3 +123,24 @@
     * Basic (quick and dirty, limitations and bugs included, but
       commit-ready) implementation of getting matches out of the Xapian DB
 
+2006-07-08
+    * No work: day trip to Munich
+
+2006-07-09
+    * Bugfix for _moinSearch (not using Xapian)
+
+2006-07-11
+    * Make matches which we get from Xapian more reliable
+    * Add TitleMatch support
+    * Xapwrap needed some tuning (aka hacking), think about dropping
+      and/or rewriting much of its code as it doesn't always fit (and
+      probably won't in the future)
+
+2006-07-12
+2006-07-13
+    * No work
+
+2006-07-14
+    * Minor bugfix for TitleMatch, now works correctly
+    * First interesting match must be a TextMatch
+