changeset 4970:43e7b40912ac

Xapian2009: The MoinMoin.search.builtin.Search class was split into BaseSearch, MoinSearch and XapianSearch. Searching with the moin backend should work; Xapian search is still broken!
author Dmitrijs Milajevs <dimazest@gmail.com>
date Fri, 31 Jul 2009 16:09:20 +0200
parents b0afbf750a24
children 21bc8092a009
files MoinMoin/search/__init__.py MoinMoin/search/builtin.py
diffstat 2 files changed, 180 insertions(+), 218 deletions(-)
--- a/MoinMoin/search/__init__.py	Mon Jul 27 19:20:08 2009 +0200
+++ b/MoinMoin/search/__init__.py	Fri Jul 31 16:09:20 2009 +0200
@@ -14,7 +14,7 @@
 logging = log.getLogger(__name__)
 
 from MoinMoin.search.queryparser import QueryParser, QueryError
-from MoinMoin.search.builtin import Search
+from MoinMoin.search.builtin import MoinSearch, XapianSearch
 
 def searchPages(request, query, sort='weight', mtime=None, historysearch=None, **kw):
     """ Search the text of all pages for query.
@@ -32,6 +32,11 @@
     """
     if isinstance(query, str) or isinstance(query, unicode):
         query = QueryParser(**kw).parse_query(query)
-    return Search(request, query, sort, mtime=mtime,
-            historysearch=historysearch).run()
 
+    if request.cfg.xapian_search:
+        searcher = XapianSearch
+    else:
+        searcher = MoinSearch
+
+    return searcher(request, query, sort, mtime=mtime, historysearch=historysearch).run()
+
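
The new searchPages() dispatch keeps the caller-facing API unchanged; only the
backend class is selected from the config. A minimal usage sketch (assuming a
normal MoinMoin request object; the query string is just an example):

    from MoinMoin.search import searchPages
    # XapianSearch is used when request.cfg.xapian_search is set,
    # MoinSearch otherwise
    results = searchPages(request, u"CategoryHomepage", sort='weight')
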
--- a/MoinMoin/search/builtin.py	Mon Jul 27 19:20:08 2009 +0200
+++ b/MoinMoin/search/builtin.py	Fri Jul 31 16:09:20 2009 +0200
@@ -433,11 +433,10 @@
 ### Searching
 ##############################################################################
 
-class Search:
+class BaseSearch(object):
     """ A search run """
 
-    def __init__(self, request, query, sort='weight', mtime=None,
-            historysearch=0):
+    def __init__(self, request, query, sort='weight', mtime=None, historysearch=0):
         """
         @param request: current request
         @param query: search query objects tree
@@ -455,37 +454,154 @@
 
     def run(self):
         """ Perform search and return results object """
+
         start = time.time()
-        if self.request.cfg.xapian_search:
-            hits = self._xapianSearch()
-            logging.debug("_xapianSearch found %d hits" % len(hits))
-        else:
-            hits = self._moinSearch()
-            logging.debug("_moinSearch found %d hits" % len(hits))
+        hits, estimated_hits = self._search()
 
         # important - filter deleted pages or pages the user may not read!
         if not self.filtered:
             hits = self._filter(hits)
             logging.debug("after filtering: %d hits" % len(hits))
 
-        # when xapian was used, we can estimate the numer of matches
-        # Note: hits can't be estimated by xapian with historysearch enabled
-        if not self.request.cfg.xapian_index_history and hasattr(self, '_xapianMset'):
-            _ = self.request.getText
-            mset = self._xapianMset
-            m_lower = mset.get_matches_lower_bound()
-            m_estimated = mset.get_matches_estimated()
-            m_upper = mset.get_matches_upper_bound()
-            estimated_hits = (m_estimated == m_upper and m_estimated == m_lower
-                              and '' or _('about'), m_estimated)
+        return self._get_search_results(hits, start, estimated_hits)
+
+    def _search(self):
+        """
+        Search pages.
+
+        Return a list of tuples (wikiname, page object, attachment,
+        matches, revision) and the estimated number of search results
+        (None if there is no estimate).
+
+        The list may contain deleted pages or pages the user may not read.
+        """
+        raise NotImplementedError()
+
+    def _filter(self, hits):
+        """
+        Filter out deleted or acl protected pages
+
+        @param hits: list of hits
+        """
+        userMayRead = self.request.user.may.read
+        fs_rootpage = self.fs_rootpage + "/"
+        thiswiki = (self.request.cfg.interwikiname, 'Self')
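+        # keep hits from other wikis as they are; for this wiki, require the
+        # page to exist and be readable (or to live under fs_rootpage), and
+        # apply the mtime cutoff (mtime_usecs() is in microseconds)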
+        filtered = [(wikiname, page, attachment, match, rev)
+                for wikiname, page, attachment, match, rev in hits
+                    if (wikiname not in thiswiki or
+                       page.exists() and userMayRead(page.page_name) or
+                       page.page_name.startswith(fs_rootpage)) and
+                       (not self.mtime or self.mtime <= page.mtime_usecs()/1000000)]
+        return filtered
+
+    def _get_search_results(self, hits, start, estimated_hits):
+        return getSearchResults(self.request, self.query, hits, start, self.sort, estimated_hits)
+
+    def _get_match(self, page=None, uid=None):
+        """
+        Get all matches
+
+        XXX xappy highlight functionality should be used for Xapian search!
+
+        @param page: the current page instance
+        @param uid: the id of the document in the xapian index (unused here)
+        """
+        if page:
+            return self.query.search(page)
+
+    def _getHits(self, pages):
+        """ Get the hit tuples in pages through _get_match """
+        logging.debug("_getHits searching in %d pages ..." % len(pages))
+        hits = []
+        revisionCache = {}
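+        # revisionCache maps pagename -> (revision, hit tuple) so that only
+        # the newest revision of a page is kept when history search is off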
+        fs_rootpage = self.fs_rootpage
+        for hit in pages:
+
+            uid = hit.get('uid')
+            wikiname = hit['wikiname']
+            pagename = hit['pagename']
+            attachment = hit['attachment']
+            revision = int(hit.get('revision', 0))
+
+            logging.debug("_getHits processing %r %r %d %r" % (wikiname, pagename, revision, attachment))
+
+            if wikiname in (self.request.cfg.interwikiname, 'Self'): # THIS wiki
+                page = Page(self.request, pagename, rev=revision)
+                if not self.historysearch and revision:
+                    revlist = page.getRevList()
+                    # revlist can be empty if page was nuked/renamed since it was included in xapian index
+                    if not revlist or revlist[0] != revision:
+                        # nothing there at all or not the current revision
+                        logging.debug("no history search, skipping non-current revision...")
+                        continue
+                if attachment:
+                    # revision is currently always 0 here
+                    if pagename == fs_rootpage: # not really an attachment
+                        page = Page(self.request, "%s/%s" % (fs_rootpage, attachment))
+                        hits.append((wikiname, page, None, None, revision))
+                    else:
+                        matches = self._get_match(page=None, uid=uid)
+                        hits.append((wikiname, page, attachment, matches, revision))
+                else:
+                    matches = self._get_match(page=page, uid=uid)
+                    logging.debug("self._get_match %r" % matches)
+                    if matches:
+                        if not self.historysearch and \
+                                pagename in revisionCache and \
+                                revisionCache[pagename][0] < revision:
+                            hits.remove(revisionCache[pagename][1])
+                            del revisionCache[pagename]
+                        hits.append((wikiname, page, attachment, matches, revision))
+                        revisionCache[pagename] = (revision, hits[-1])
+            else: # other wiki
+                hits.append((wikiname, pagename, attachment, None, revision))
+        logging.debug("_getHits returning %r." % hits)
+        return hits
+
+class MoinSearch(BaseSearch):
+
+    def __init__(self, request, query, sort='weight', mtime=None, historysearch=0, pages=None):
+        super(MoinSearch, self).__init__(request, query, sort, mtime, historysearch)
+
+        self.pages = pages
+
+    def _search(self):
+        """
+        Search pages using moin's built-in full text search.
+
+        The list may contain deleted pages or pages the user may not
+        read.
+
+        If self.pages is not None, the search is restricted to those pages.
+        """
+        self.request.clock.start('_moinSearch')
+
+        # if self.pages is None, we make a full page list, but don't
+        # search attachments (thus attachment name = '')
+        pages = self.pages or [{'pagename': p, 'attachment': '', 'wikiname': 'Self', } for p in self._getPageList()]
+
+        hits = self._getHits(pages)
+        self.request.clock.stop('_moinSearch')
+
+        return hits, None
+
+    def _getPageList(self):
+        """ Get list of pages to search in
+
+        If the query has a page filter, use it to filter pages before
+        searching. If not, get an unfiltered page list. The filtering
+        will happen later on the hits, which is faster with current
+        slow storage.
+        """
+        filter_ = self.query.pageFilter()
+        if filter_:
+            # There is no need to filter the results again.
+            self.filtered = True
+            return self.request.rootpage.getPageList(filter=filter_)
         else:
-            estimated_hits = None
+            return self.request.rootpage.getPageList(user='', exists=0)
 
-        return getSearchResults(self.request, self.query, hits, start,
-                self.sort, estimated_hits)
 
-    # ----------------------------------------------------------------
-    # Private!
+class XapianSearch(BaseSearch):
 
     def _xapianIndex(request):
         """ Get the xapian index if possible
@@ -503,7 +619,7 @@
 
     _xapianIndex = staticmethod(_xapianIndex)
 
-    def _xapianSearch(self):
+    def _search(self):
         """ Search using Xapian
 
         Get a list of pages using fast xapian search and
@@ -513,200 +629,41 @@
         pages = None
         index = self._xapianIndex(self.request)
 
-        if index and self.query.xapian_wanted():
-            clock.start('_xapianSearch')
-            try:
-                from MoinMoin.support import xapwrap
-
-                clock.start('_xapianQuery')
-                query = self.query.xapian_term(self.request, index.allterms)
-                description = str(query)
-                logging.debug("_xapianSearch: query = %r" % description)
-                query = xapwrap.index.QObjQuery(query)
-                enq, mset, hits = index.search(query, sort=self.sort,
-                        historysearch=self.historysearch)
-                clock.stop('_xapianQuery')
-
-                logging.debug("_xapianSearch: finds: %r" % hits)
-                def dict_decode(d):
-                    """ decode dict values to unicode """
-                    for key in d:
-                        d[key] = d[key].decode(config.charset)
-                    return d
-                pages = [dict_decode(hit['values']) for hit in hits]
-                logging.debug("_xapianSearch: finds pages: %r" % pages)
+        assert index, 'XXX Assumes the index exists; we should really raise an exception here so that MoinSearch could be used instead'
 
-                self._xapianEnquire = enq
-                self._xapianMset = mset
-                self._xapianIndex = index
-            except BaseIndex.LockedException:
-                pass
-            #except AttributeError:
-            #    pages = []
+        clock.start('_xapianSearch')
+        try:
+            clock.start('_xapianQuery')
+            query = self.query.xapian_term(self.request, index.allterms)
+            search_results = index.search(query, sort=self.sort, historysearch=self.historysearch)
+            clock.stop('_xapianQuery')
+            logging.debug("_xapianSearch: finds: %r" % search_results)
+            self._xapianIndex = index
+        except BaseIndex.LockedException:
+            # the index is locked; fall back to a plain moin search,
+            # otherwise search_results below would be undefined
+            clock.stop('_xapianQuery')
+            clock.stop('_xapianSearch')
+            return MoinSearch(self.request, self.query, self.sort, self.mtime, self.historysearch)._search()
 
-            try:
+        # XXX must search_results be decoded?
+
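+        # map each search result onto the plain hit dicts _getHits() expects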
+        pages = [{'uid': r.id,
+                  'wikiname': r.data['wikiname'][0],
+                  'pagename': r.data['pagename'][0],
+                  'attachment': r.data['attachment'][0],
+                  'revision': r.data['revision'][0]}
+                 for r in search_results]
+
+        try:
+            if not self.query.xapian_need_postproc():
                 # xapian handled the full query
-                if not self.query.xapian_need_postproc():
-                    clock.start('_xapianProcess')
-                    try:
-                        return self._getHits(hits, self._xapianMatch)
-                    finally:
-                        clock.stop('_xapianProcess')
-            finally:
-                clock.stop('_xapianSearch')
-        elif not index:
-            # we didn't use xapian in this request because we have no index,
-            # so we can just disable it until admin builds an index and
-            # restarts moin processes
-            self.request.cfg.xapian_search = 0
+                clock.start('_xapianProcess')
+                try:
+                    _ = self.request.getText
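+                    # prefix the match count with 'about' when it is only an estimate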
+                    if search_results.estimate_is_exact:
+                        estimate_prefix = ''
+                    else:
+                        estimate_prefix = _('about')
+                    return self._getHits(pages), (estimate_prefix, search_results.matches_estimated)
+                finally:
+                    clock.stop('_xapianProcess')
+        finally:
+            clock.stop('_xapianSearch')
 
         # some postprocessing by _moinSearch is required
-        return self._moinSearch(pages)
-
-    def _xapianMatchDecider(self, term, pos):
-        """ Returns correct Match object for a Xapian match
-
-        @param term: the term as string
-        @param pos: starting position of the match
-        """
-        if term[0] == 'S': # TitleMatch
-            return TitleMatch(start=pos, end=pos+len(term)-1)
-        else: # TextMatch (incl. headers)
-            return TextMatch(start=pos, end=pos+len(term))
-
-    def _xapianMatch(self, uid, page=None):
-        """ Get all relevant Xapian matches per document id
-
-        @param uid: the id of the document in the xapian index
-        """
-        positions = {}
-        term = self._xapianEnquire.get_matching_terms_begin(uid)
-        while term != self._xapianEnquire.get_matching_terms_end(uid):
-            term_name = term.get_term()
-            for pos in self._xapianIndex.termpositions(uid, term.get_term()):
-                if pos not in positions or \
-                        len(positions[pos]) < len(term_name):
-                    positions[pos] = term_name
-            term.next()
-        matches = [self._xapianMatchDecider(term, pos) for pos, term
-            in positions.iteritems()]
-
-        if not matches:
-            return [Match()] # dummy for metadata, we got a match!
-
-        return matches
-
-    def _moinSearch(self, pages=None):
-        """ Search pages using moin's built-in full text search
-
-        Return list of tuples (page, match). The list may contain
-        deleted pages or pages the user may not read.
-
-        @keyword pages: optional list of pages to search in
-        """
-        self.request.clock.start('_moinSearch')
-        if pages is None:
-            # if we are not called from _xapianSearch, we make a full pagelist,
-            # but don't search attachments (thus attachment name = '')
-            pages = [{'pagename': p, 'attachment': '', 'wikiname': 'Self', } for p in self._getPageList()]
-        hits = self._getHits(pages, self._moinMatch)
-        self.request.clock.stop('_moinSearch')
-        return hits
-
-    def _moinMatch(self, page, uid=None):
-        """ Get all matches from regular moinSearch
-
-        @param page: the current page instance
-        """
-        if page:
-            return self.query.search(page)
-
-    def _getHits(self, pages, matchSearchFunction):
-        """ Get the hit tuples in pages through matchSearchFunction """
-        logging.debug("_getHits searching in %d pages ..." % len(pages))
-        hits = []
-        revisionCache = {}
-        fs_rootpage = self.fs_rootpage
-        for hit in pages:
-            if 'values' in hit:
-                valuedict = hit['values']
-                uid = hit['uid']
-            else:
-                valuedict = hit
-                uid = None
+        return MoinSearch(self.request, self.query, self.sort, self.mtime, self.historysearch, pages=pages)._search()
 
-            wikiname = valuedict['wikiname']
-            pagename = valuedict['pagename']
-            attachment = valuedict['attachment']
-
-            if 'revision' in valuedict and valuedict['revision']:
-                revision = int(valuedict['revision'])
-            else:
-                revision = 0
-            logging.debug("_getHits processing %r %r %d %r" % (wikiname, pagename, revision, attachment))
 
-            if wikiname in (self.request.cfg.interwikiname, 'Self'): # THIS wiki
-                page = Page(self.request, pagename, rev=revision)
-                if not self.historysearch and revision:
-                    revlist = page.getRevList()
-                    # revlist can be empty if page was nuked/renamed since it was included in xapian index
-                    if not revlist or revlist[0] != revision:
-                        # nothing there at all or not the current revision
-                        logging.debug("no history search, skipping non-current revision...")
-                        continue
-                if attachment:
-                    # revision currently is 0 ever
-                    if pagename == fs_rootpage: # not really an attachment
-                        page = Page(self.request, "%s/%s" % (fs_rootpage, attachment))
-                        hits.append((wikiname, page, None, None, revision))
-                    else:
-                        matches = matchSearchFunction(page=None, uid=uid)
-                        hits.append((wikiname, page, attachment, matches, revision))
-                else:
-                    matches = matchSearchFunction(page=page, uid=uid)
-                    logging.debug("matchSearchFunction %r returned %r" % (matchSearchFunction, matches))
-                    if matches:
-                        if not self.historysearch and \
-                                pagename in revisionCache and \
-                                revisionCache[pagename][0] < revision:
-                            hits.remove(revisionCache[pagename][1])
-                            del revisionCache[pagename]
-                        hits.append((wikiname, page, attachment, matches, revision))
-                        revisionCache[pagename] = (revision, hits[-1])
-            else: # other wiki
-                hits.append((wikiname, pagename, attachment, None, revision))
-        logging.debug("_getHits returning %r." % hits)
-        return hits
-
-    def _getPageList(self):
-        """ Get list of pages to search in
-
-        If the query has a page filter, use it to filter pages before
-        searching. If not, get a unfiltered page list. The filtering
-        will happen later on the hits, which is faster with current
-        slow storage.
-        """
-        filter_ = self.query.pageFilter()
-        if filter_:
-            # There is no need to filter the results again.
-            self.filtered = True
-            return self.request.rootpage.getPageList(filter=filter_)
-        else:
-            return self.request.rootpage.getPageList(user='', exists=0)
-
-    def _filter(self, hits):
-        """ Filter out deleted or acl protected pages
-
-        @param hits: list of hits
-        """
-        userMayRead = self.request.user.may.read
-        fs_rootpage = self.fs_rootpage + "/"
-        thiswiki = (self.request.cfg.interwikiname, 'Self')
-        filtered = [(wikiname, page, attachment, match, rev)
-                for wikiname, page, attachment, match, rev in hits
-                    if (not wikiname in thiswiki or
-                       page.exists() and userMayRead(page.page_name) or
-                       page.page_name.startswith(fs_rootpage)) and
-                       (not self.mtime or self.mtime <= page.mtime_usecs()/1000000)]
-        return filtered
-
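
After this split, BaseSearch.run() acts as a template method: a concrete
backend only implements _search() and returns (hits, estimated_hits), where
hits are the tuples _getHits() produces and estimated_hits is None when no
estimate is available. A minimal sketch of a further backend under that
contract (the class and its fixed page list are hypothetical):

    class DummySearch(BaseSearch):
        """ Backend that feeds a fixed page list through _getHits(). """

        def _search(self):
            # hit dicts use the same keys MoinSearch builds for _getHits()
            pages = [{'pagename': u'FrontPage', 'attachment': '', 'wikiname': 'Self'}]
            # this backend cannot estimate the total number of matches
            return self._getHits(pages), None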