changeset 1237:0a947454dec7

use xapian for sorting search results
author Franz Pletz <fpletz AT franz-pletz DOT org>
date Thu, 10 Aug 2006 01:47:41 +0200
parents d2d160c344b7
children 820518f0118e
files MoinMoin/action/fullsearch.py MoinMoin/search/Xapian.py MoinMoin/search/__init__.py MoinMoin/search/builtin.py MoinMoin/search/queryparser.py MoinMoin/search/results.py docs/CHANGES.fpletz
diffstat 7 files changed, 61 insertions(+), 32 deletions(-) [+]
line wrap: on
line diff
--- a/MoinMoin/action/fullsearch.py	Tue Aug 08 22:39:15 2006 +0200
+++ b/MoinMoin/action/fullsearch.py	Thu Aug 10 01:47:41 2006 +0200
@@ -55,11 +55,19 @@
         Page(request, pagename).send_page(request, msg=err)
         return
 
+    # Setup for type of search
+    if titlesearch:
+        title = _('Title Search: "%s"')
+        sort = 'page_name'
+    else:
+        title = _('Full Text Search: "%s"')
+        sort = 'weight'
+
     # search the pages
     from MoinMoin.search import searchPages, QueryParser
     query = QueryParser(case=case, regex=regex,
             titlesearch=titlesearch).parse_query(needle)
-    results = searchPages(request, query)
+    results = searchPages(request, query, sort)
 
     # directly show a single hit
     # XXX won't work with attachment search
@@ -79,14 +87,6 @@
     # This action generate data using the user language
     request.setContentLanguage(request.lang)
 
-    # Setup for type of search
-    if titlesearch:
-        title = _('Title Search: "%s"')
-        results.sortByPagename()
-    else:
-        title = _('Full Text Search: "%s"')
-        results.sortByWeight()
-
     request.theme.send_title(title % needle, form=request.form, pagename=pagename)
 
     # Start content (important for RTL support)
--- a/MoinMoin/search/Xapian.py	Tue Aug 08 22:39:15 2006 +0200
+++ b/MoinMoin/search/Xapian.py	Thu Aug 10 01:47:41 2006 +0200
@@ -195,7 +195,7 @@
         """ Check if the Xapian index exists """
         return BaseIndex.exists(self) and os.listdir(self.dir)
 
-    def _search(self, query):
+    def _search(self, query, sort=None):
         """ read lock must be acquired """
         while True:
             try:
@@ -210,7 +210,16 @@
                 timestamp = self.mtime()
                 break
         
-        hits = searcher.search(query, valuesWanted=['pagename', 'attachment', 'mtime', 'wikiname'])
+        kw = {}
+        if sort == 'weight':
+            # XXX: we need real weight here, like _moinSearch
+            # (TradWeight in xapian)
+            kw['sortByRelevence'] = True
+        if sort == 'page_name':
+            kw['sortKey'] = 'pagename'
+
+        hits = searcher.search(query, valuesWanted=['pagename',
+            'attachment', 'mtime', 'wikiname'], **kw)
         self.request.cfg.xapian_searchers.append((searcher, timestamp))
         return hits
     
--- a/MoinMoin/search/__init__.py	Tue Aug 08 22:39:15 2006 +0200
+++ b/MoinMoin/search/__init__.py	Thu Aug 10 01:47:41 2006 +0200
@@ -13,7 +13,7 @@
 from MoinMoin.search.queryparser import QueryParser
 from MoinMoin.search.builtin import Search
 
-def searchPages(request, query, **kw):
+def searchPages(request, query, sort='weight', **kw):
     """ Search the text of all pages for query.
     
     @param request: current request
@@ -23,5 +23,5 @@
     """
     if isinstance(query, str) or isinstance(query, unicode):
         query = QueryParser(**kw).parse_query(query)
-    return Search(request, query).run()
+    return Search(request, query, sort).run()
 
--- a/MoinMoin/search/builtin.py	Tue Aug 08 22:39:15 2006 +0200
+++ b/MoinMoin/search/builtin.py	Thu Aug 10 01:47:41 2006 +0200
@@ -174,11 +174,11 @@
     def _search(self, query):
         raise NotImplemented('...')
 
-    def search(self, query):
+    def search(self, query, *args, **kw):
         #if not self.read_lock.acquire(1.0):
         #    raise self.LockedException
         #try:
-        hits = self._search(query)
+        hits = self._search(query, *args, **kw)
         #finally:
         #    self.read_lock.release()
         return hits
@@ -352,9 +352,10 @@
 class Search:
     """ A search run """
     
-    def __init__(self, request, query):
+    def __init__(self, request, query, sort='weight'):
         self.request = request
         self.query = query
+        self.sort = sort
         self.filtered = False
         self.fs_rootpage = "FS" # XXX FS hardcoded
 
@@ -370,7 +371,12 @@
         if not self.filtered:
             hits = self._filter(hits)
 
-        return getSearchResults(self.request, self.query, hits, start)
+        # when xapian was used, the hits are already sorted; skip the manual sort
+        if self.request.cfg.xapian_search:
+            self.sort = None
+
+        return getSearchResults(self.request, self.query, hits, start,
+                self.sort)
         
 
     # ----------------------------------------------------------------
@@ -406,9 +412,9 @@
                 self.request.log("xapianSearch: query = %r" %
                         query.get_description())
                 query = xapwrap.index.QObjQuery(query)
-                enq, hits = index.search(query)
+                enq, hits = index.search(query, sort=self.sort)
                 clock.stop('_xapianQuery')
-                self.request.log("xapianSearch: finds: %r" % hits)
+                #self.request.log("xapianSearch: finds: %r" % hits)
                 def dict_decode(d):
                     """ decode dict values to unicode """
                     for k, v in d.items():
@@ -434,6 +440,9 @@
                         clock.stop('_xapianProcess')
             finally:
                 clock.stop('_xapianSearch')
+        else:
+            # we didn't use xapian in this request
+            self.request.cfg.xapian_search = 0
         
         return self._moinSearch(pages)
 
--- a/MoinMoin/search/queryparser.py	Tue Aug 08 22:39:15 2006 +0200
+++ b/MoinMoin/search/queryparser.py	Thu Aug 10 01:47:41 2006 +0200
@@ -436,7 +436,7 @@
                 if term[:4] == 'XFT:':
                     found = True
                     if self.search_re.findall(term[4:]):
-                        terms.append(term)
+                        terms.append(Query(term, 100))
                 elif found:
                     break
             if not terms:
@@ -456,15 +456,19 @@
                     # stemmed OR not stemmed
                     tmp = []
                     for w, s, pos in analyzer.tokenize(t, flat_stemming=False):
-                        tmp.append(UnicodeQuery(Query.OP_OR,
-                            ['%s%s' % (Xapian.Index.prefixMap['title'], j)
+                        tmp.append(Query(Query.OP_OR,
+                            [UnicodeQuery('%s%s' %
+                                    (Xapian.Index.prefixMap['title'], j),
+                                    100)
                                 for j in (w, s)]))
                         stemmed.append(s)
                     t = tmp
                 else:
                     # just not stemmed
-                    t = [UnicodeQuery('%s%s' % (Xapian.Index.prefixMap['title'], w))
-                        for w, pos in analyzer.tokenize(t)]
+                    t = [UnicodeQuery(
+                                '%s%s' % (Xapian.Index.prefixMap['title'], w),
+                                100)
+                            for w, pos in analyzer.tokenize(t)]
 
                 queries.append(Query(Query.OP_AND, t))
 
--- a/MoinMoin/search/results.py	Tue Aug 08 22:39:15 2006 +0200
+++ b/MoinMoin/search/results.py	Thu Aug 10 01:47:41 2006 +0200
@@ -244,27 +244,30 @@
     """
     # Public functions --------------------------------------------------
     
-    def __init__(self, query, hits, pages, elapsed):
+    def __init__(self, query, hits, pages, elapsed, sort=None):
         self.query = query # the query
         self.hits = hits # hits list
-        self.sort = None # hits are unsorted initially
         self.pages = pages # number of pages in the wiki
         self.elapsed = elapsed # search time
 
-    def sortByWeight(self):
+        if sort == 'weight':
+            self._sortByWeight()
+        elif sort == 'page_name':
+            self._sortByPagename()
+        self.sort = sort # requested order; any other value leaves hits as given
+
+    def _sortByWeight(self):
         """ Sorts found pages by the weight of the matches """
         tmp = [(hit.weight(), hit.page_name, hit) for hit in self.hits]
         tmp.sort()
         tmp.reverse()
         self.hits = [item[2] for item in tmp]
-        self.sort = 'weight'
         
-    def sortByPagename(self):
+    def _sortByPagename(self):
         """ Sorts a list of found pages alphabetical by page name """
         tmp = [(hit.page_name, hit) for hit in self.hits]
         tmp.sort()
         self.hits = [item[1] for item in tmp]
-        self.sort = 'page_name'
         
     def stats(self, request, formatter, hitsFrom):
         """ Return search statistics, formatted with formatter
@@ -802,7 +805,7 @@
         self.matchLabel = (_('match'), _('matches'))
 
 
-def getSearchResults(request, query, hits, start):
+def getSearchResults(request, query, hits, start, sort=None):
     result_hits = []
     for wikiname, page, attachment, match in hits:
         if wikiname in (request.cfg.interwikiname, 'Self'): # a local match
@@ -816,5 +819,5 @@
                 attachment, match, page))
     elapsed = time.time() - start
     count = request.rootpage.getPageCount()
-    return SearchResults(query, result_hits, count, elapsed)
+    return SearchResults(query, result_hits, count, elapsed, sort)
 
--- a/docs/CHANGES.fpletz	Tue Aug 08 22:39:15 2006 +0200
+++ b/docs/CHANGES.fpletz	Thu Aug 10 01:47:41 2006 +0200
@@ -223,3 +223,7 @@
     * improved highlighting code to work better with stemming and
       special searches, extended SystemInfo macro
 
+2006-08-09
+    * use xapian for sorting search results, a first step towards not
+      fetching all results -> still TODO: needs a real weight
+