changeset 926:134b5ee99046

basic fetching of matches for terms with xapian
author Franz Pletz <fpletz AT franz-pletz DOT org>
date Fri, 07 Jul 2006 12:28:54 +0200
parents 4508fc92fcb1
children 28ae528ca238
files MoinMoin/search/Xapian.py MoinMoin/search/builtin.py MoinMoin/search/queryparser.py MoinMoin/support/xapwrap/index.py docs/CHANGES.fpletz
diffstat 5 files changed, 64 insertions(+), 12 deletions(-)
--- a/MoinMoin/search/Xapian.py	Wed Jul 05 12:19:22 2006 +0200
+++ b/MoinMoin/search/Xapian.py	Fri Jul 07 12:28:54 2006 +0200
@@ -87,7 +87,7 @@
             
         if isinstance(value, list): # used for page links
             for v in value:
-                yield enc(v)
+                yield (enc(v), 0)
         else:
             tokenstream = re.finditer(self.token_re, value)
             for m in tokenstream:
@@ -132,7 +132,7 @@
                 if self.stemmer:
                     yield (self.stemmer.stemWord(word), pos)
             else:
-                yield (i, self.stemmer.stemWord(i), pos)
+                yield (word, self.stemmer.stemWord(word), pos)
 
 
 #############################################################################
@@ -225,6 +225,13 @@
             yield i.get_term()
             i.next()
 
+    def termpositions(self, uid, term):
+        db = xapidx.ExceptionTranslater.openIndex(True, self.dir)
+        pos = db.positionlist_begin(uid, term)
+        while pos != db.positionlist_end(uid, term):
+            yield pos.get_termpos()
+            pos.next()
+
     def _index_file(self, request, writer, filename, mode='update'):
         """ index a file as it were a page named pagename
             Assumes that the write lock is acquired
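
A usage sketch only, not part of the changeset: the tokenizer now yields
(token, position) pairs, and the index exposes the stored positions through
the new termpositions() generator. In the snippet below, request and uid are
placeholders (a MoinMoin request object and a Xapian document id taken from a
search hit), and the import assumes Index is exported from
MoinMoin.search.Xapian as it is used by builtin.py.

    # illustrative sketch -- request and uid are assumed, not taken from
    # this changeset
    from MoinMoin.search.Xapian import Index

    index = Index(request)
    # every position at which the term was recorded for that document,
    # in the order Xapian stored them
    for pos in index.termpositions(uid, 'example'):
        print pos
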
--- a/MoinMoin/search/builtin.py	Wed Jul 05 12:19:22 2006 +0200
+++ b/MoinMoin/search/builtin.py	Fri Jul 07 12:28:54 2006 +0200
@@ -15,6 +15,7 @@
 from MoinMoin.Page import Page
 from MoinMoin.util import filesys, lock
 from MoinMoin.search.results import getSearchResults
+from MoinMoin.search.queryparser import TextMatch, TitleMatch
 
 ##############################################################################
 # Search Engine Abstraction
@@ -384,30 +385,47 @@
             index = Index(self.request)
         except ImportError:
             index = None
+        
         if index and index.exists(): #and self.query.xapian_wanted():
             self.request.clock.start('_xapianSearch')
             try:
                 from MoinMoin.support import xapwrap
-                query = self.query.xapian_term(self.request,
-                        index.allterms)
+                query = self.query.xapian_term(self.request, index.allterms)
                 self.request.log("xapianSearch: query = %r" %
                         query.get_description())
                 query = xapwrap.index.QObjQuery(query)
-                hits = index.search(query)
+                enq, hits = index.search(query)
                 self.request.log("xapianSearch: finds: %r" % hits)
                 def dict_decode(d):
                     """ decode dict values to unicode """
                     for k, v in d.items():
                         d[k] = d[k].decode(config.charset)
                     return d
-                pages = [dict_decode(hit['values']) for hit in hits]
+                pages = [{'uid': hit['uid'], 'values': dict_decode(hit['values'])}
+                        for hit in hits]
                 self.request.log("xapianSearch: finds pages: %r" % pages)
+                self._xapianEnquire = enq
+                self._xapianIndex = index
             except BaseIndex.LockedException:
                 pass
             #except AttributeError:
             #    pages = []
             self.request.clock.stop('_xapianSearch')
-        return self._moinSearch(pages)
+            return self._getHits(hits, self._xapianMatch)
+        else:
+            return self._moinSearch(pages)
+
+    def _xapianMatch(self, page, uid):
+        matches = []
+        term = self._xapianEnquire.get_matching_terms_begin(uid)
+        #print hit['uid']
+        while term != self._xapianEnquire.get_matching_terms_end(uid):
+            print term.get_term(), ':', list(self._xapianIndex.termpositions(uid, term.get_term()))
+            for pos in self._xapianIndex.termpositions(uid, term.get_term()):
+                matches.append(TextMatch(start=pos,
+                    end=pos+len(term.get_term())))
+            term.next()
+        return matches
 
     def _moinSearch(self, pages=None):
         """ Search pages using moin's built-in full text search 
@@ -421,9 +439,23 @@
             # if we are not called from _xapianSearch, we make a full pagelist,
             # but don't search attachments (thus attachment name = '')
             pages = [{'pagename': p, 'attachment': '', 'wikiname': 'Self', } for p in self._getPageList()]
+        hits = self._getHits(pages, self._moinMatch)
+        self.request.clock.stop('_moinSearch')
+        return hits
+    
+    def _moinMatch(self, page, uid):
+        return self.query.search(page)
+
+    def _getHits(self, pages, matchSearchFunction):
         hits = []
         fs_rootpage = self.fs_rootpage
-        for valuedict in pages:
+        for hit in pages:
+            if 'values' in hit:
+                valuedict = hit['values']
+                uid = hit['uid']
+            else:
+                valuedict = hit
+                uid = None # no Xapian document id for built-in search hits
+
             wikiname = valuedict['wikiname']
             pagename = valuedict['pagename']
             attachment = valuedict['attachment']
@@ -436,12 +468,11 @@
                     else:
                         hits.append((wikiname, page, attachment, None))
                 else:
-                    match = self.query.search(page)
+                    match = matchSearchFunction(page, uid)
                     if match:
                         hits.append((wikiname, page, attachment, match))
             else: # other wiki
                 hits.append((wikiname, pagename, attachment, None))
-        self.request.clock.stop('_moinSearch')
         return hits
 
     def _getPageList(self):
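
For clarity, the two hit formats that _getHits() now accepts and the callback
each one is paired with; the concrete values below are made-up examples, not
data from this changeset:

    # Xapian hits carry the document uid plus the stored value dict ...
    xapian_hits = [{'uid': 42,
                    'values': {'wikiname': u'Self',
                               'pagename': u'SomePage',
                               'attachment': u''}}]
    # ... while the built-in search path passes plain value dicts
    moin_hits = [{'pagename': 'SomePage', 'attachment': '', 'wikiname': 'Self'}]

    # _getHits(xapian_hits, self._xapianMatch) resolves matches through the
    # Enquire object and Index.termpositions(), while
    # _getHits(moin_hits, self._moinMatch) just calls self.query.search(page)
    # and ignores the document uid.
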
--- a/MoinMoin/search/queryparser.py	Wed Jul 05 12:19:22 2006 +0200
+++ b/MoinMoin/search/queryparser.py	Fri Jul 07 12:28:54 2006 +0200
@@ -341,7 +341,6 @@
                 (self.titlesearch.xapian_term(request, allterms),
                     Query(Query.OP_AND, queries)))
 
-
 class TitleSearch(BaseExpression):
     """ Term searches in pattern in page title only """
 
--- a/MoinMoin/support/xapwrap/index.py	Wed Jul 05 12:19:22 2006 +0200
+++ b/MoinMoin/support/xapwrap/index.py	Fri Jul 07 12:28:54 2006 +0200
@@ -635,7 +635,7 @@
                         valRes[valName] = xapDoc.get_value(valueIndex)
                     thisResult['values'] = valRes
                 results.append(thisResult)
-            return results
+            return enq, results
         except:
             del enq, mset
             raise
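
Because search() now returns the Enquire object alongside the result list,
callers that previously wrote hits = index.search(query) have to unpack a
pair. A minimal sketch, where idx and query stand in for an existing xapwrap
index and query object:

    # idx and query are placeholders; hit['uid'] is the Xapian document id
    # that xapwrap already stores in each result dict
    enq, hits = idx.search(query)
    for hit in hits:
        term = enq.get_matching_terms_begin(hit['uid'])
        while term != enq.get_matching_terms_end(hit['uid']):
            print hit['uid'], term.get_term()
            term.next()
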
--- a/docs/CHANGES.fpletz	Wed Jul 05 12:19:22 2006 +0200
+++ b/docs/CHANGES.fpletz	Fri Jul 07 12:28:54 2006 +0200
@@ -107,3 +107,18 @@
 2006-07-01
     * Fully implemented term-based regexp searching
 
+2006-07-04
+    * Evaluating the current framework for the new UI (no new sane code to
+      commit)
+
+2006-07-05
+    * Indexing correct positions in xapwrap
+
+2006-07-06
+    * Played with Xapian to get correct term positions and to find where
+      to integrate them in MoinMoin
+
+2006-07-07
+    * Basic (quick and dirty, limitations and bugs included, but
+      commit-ready) implementation of getting matches out of the Xapian DB
+