changeset 4514:af09c1b3a153

fixed search (see details below):

* fixed bad exception handling (see the sketch below)
* fixed historysearch (wrong result tuple length, hidden by the bad exception handling)
* index page attachments only once (they were indexed repeatedly, once for every page revision!)
* refactored indexing code to reduce code duplication
* redirect to a single search result only for titlesearch, not for fulltext search: the redirect did not work correctly for historysearch, and it was sometimes confusing for people to land directly on the (single-result) wiki page instead of getting a search result list. On the other hand, many people use titlesearch as a "quick fuzzy jump" to a page, so that redirect stays.
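
A minimal, self-contained sketch of the exception-handling pattern introduced here (parse_query and run_search are illustrative stand-ins, not MoinMoin's actual APIs; only QueryError mirrors the real class):

class QueryError(ValueError):
    """Raised only for syntax problems in the search query."""

def parse_query(needle):
    # toy parser: reject empty queries, similar in spirit to the real
    # parser's "Term too short" check
    if not needle.strip():
        raise QueryError("Term too short")
    return needle.split()

def run_search(needle):
    try:
        query = parse_query(needle)
    except QueryError, err:
        # only query-syntax problems land here now; errors raised while
        # executing the search are no longer misreported as bad queries
        return 'invalid query: %s' % err
    return [term for term in query if term.startswith('Help')]

print run_search('')                 # invalid query: Term too short
print run_search('HelpOnSearching')  # ['HelpOnSearching']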
author Thomas Waldmann <tw AT waldmann-edv DOT de>
date Wed, 28 Jan 2009 16:42:21 +0100
parents 4832d31195bb
children 04f3cfbf446a
files MoinMoin/action/fullsearch.py MoinMoin/search/Xapian.py MoinMoin/search/__init__.py MoinMoin/search/_tests/test_search.py MoinMoin/search/builtin.py MoinMoin/search/queryparser.py MoinMoin/search/results.py
diffstat 7 files changed, 137 insertions(+), 118 deletions(-)
--- a/MoinMoin/action/fullsearch.py	Tue Jan 27 21:18:50 2009 +0100
+++ b/MoinMoin/action/fullsearch.py	Wed Jan 28 16:42:21 2009 +0100
@@ -179,21 +179,22 @@
         sort = 'weight'
 
     # search the pages
-    from MoinMoin.search import searchPages, QueryParser
+    from MoinMoin.search import searchPages, QueryParser, QueryError
     try:
         query = QueryParser(case=case, regex=regex,
                 titlesearch=titlesearch).parse_query(needle)
-        results = searchPages(request, query, sort, mtime, historysearch)
-    except ValueError: # catch errors in the search query
+    except QueryError: # catch errors in the search query
         request.theme.add_msg(_('Your search query {{{"%s"}}} is invalid. Please refer to '
                 'HelpOnSearching for more information.', wiki=True, percent=True) % wikiutil.escape(needle), "error")
         Page(request, pagename).send_page()
         return
 
-    # directly show a single hit
-    # Note: can't work with attachment search
-    # improve if we have one...
-    if len(results.hits) == 1:
+    results = searchPages(request, query, sort, mtime, historysearch)
+
+    # directly show a single hit for title searches
+    # this is the "quick jump" functionality if you don't remember
+    # the pagename exactly, but just some parts of it
+    if titlesearch and len(results.hits) == 1:
         page = results.hits[0]
         if not page.attachment: # we did not find an attachment
             page = Page(request, page.page_name)
@@ -205,7 +206,7 @@
             url = page.url(request, querystr=querydict)
             request.http_redirect(url)
             return
-    elif not results.hits: # no hits?
+    if not results.hits: # no hits?
         f = request.formatter
         querydict = wikiutil.parseQueryString(request.query_string)
         querydict.update({'titlesearch': 0})
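
Schematically, the hunk above makes the single-hit redirect conditional on titlesearch; fulltext searches always get a result list. A hedged, runnable sketch with illustrative names (Hit and present are not the real fullsearch action code):

class Hit:
    def __init__(self, page_name, attachment=''):
        self.page_name = page_name
        self.attachment = attachment  # empty string means a page hit

def present(hits, titlesearch):
    # a single title-search page hit redirects straight to the page
    # (the "quick fuzzy jump"); everything else renders a result list
    if titlesearch and len(hits) == 1 and not hits[0].attachment:
        return 'redirect -> %s' % hits[0].page_name
    return 'result list (%d hits)' % len(hits)

print present([Hit('HelpOnSearching')], titlesearch=True)   # redirect -> HelpOnSearching
print present([Hit('HelpOnSearching')], titlesearch=False)  # result list (1 hits)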
--- a/MoinMoin/search/Xapian.py	Tue Jan 27 21:18:50 2009 +0100
+++ b/MoinMoin/search/Xapian.py	Wed Jan 28 16:42:21 2009 +0100
@@ -282,12 +282,7 @@
         # do all page updates
         pages = self.update_queue.pages()[:amount]
         for name in pages:
-            p = Page(request, name)
-            if request.cfg.xapian_index_history:
-                for rev in p.getRevList():
-                    self._index_page(writer, Page(request, name, rev=rev), mode='update')
-            else:
-                self._index_page(writer, p, mode='update')
+            self._index_page(request, writer, name, mode='update')
             self.update_queue.remove([name])
 
         # do page/attachment removals
@@ -295,7 +290,7 @@
         for item in items:
             _item = item.split('//')
             p = Page(request, _item[0])
-            self._remove_item(writer, p, _item[1])
+            self._remove_item(request, writer, p, _item[1])
             self.remove_queue.remove([item])
 
         writer.close()
@@ -352,7 +347,7 @@
                 xwname = xapdoc.SortKey('wikiname', request.cfg.interwikiname or u"Self")
                 xpname = xapdoc.SortKey('pagename', fs_rootpage)
                 xattachment = xapdoc.SortKey('attachment', filename) # XXX we should treat files like real pages, not attachments
-                xmtime = xapdoc.SortKey('mtime', mtime)
+                xmtime = xapdoc.SortKey('mtime', str(mtime))
                 xrev = xapdoc.SortKey('revision', '0')
                 title = " ".join(os.path.join(fs_rootpage, filename).split("/"))
                 xtitle = xapdoc.Keyword('title', title)
@@ -432,15 +427,91 @@
         if wikiutil.isSystemPage(self.request, page.page_name):
             yield 'system'
 
-    def _index_page(self, writer, page, mode='update'):
+    def _index_page(self, request, writer, pagename, mode='update'):
         """ Index a page - assumes that the write lock is acquired
 
         @arg writer: the index writer object
+        @arg pagename: a page name
+        @arg mode: 'add' = just add, no checks
+                   'update' = check if already in index and update if needed (mtime)
+        """
+        p = Page(request, pagename)
+        if request.cfg.xapian_index_history:
+            for rev in p.getRevList():
+                self._index_page_rev(request, writer, Page(request, pagename, rev=rev), mode=mode)
+        else:
+            self._index_page_rev(request, writer, p, mode=mode)
+
+        from MoinMoin.action import AttachFile
+        wikiname = request.cfg.interwikiname or u"Self"
+        # XXX: Hack until we get proper metadata
+        language, stem_language = self._get_languages(p)
+        domains = tuple(self._get_domains(p))
+        updated = False
+
+        attachments = AttachFile._get_files(request, pagename)
+        for att in attachments:
+            filename = AttachFile.getFilename(request, pagename, att)
+            att_itemid = "%s:%s//%s" % (wikiname, pagename, att)
+            mtime = wikiutil.timestamp2version(os.path.getmtime(filename))
+            if mode == 'update':
+                query = xapidx.RawQuery(xapdoc.makePairForWrite('itemid', att_itemid))
+                enq, mset, docs = writer.search(query, valuesWanted=['pagename', 'attachment', 'mtime', ])
+                logging.debug("##%r %r" % (filename, docs))
+                if docs:
+                    doc = docs[0] # there should be only one
+                    uid = doc['uid']
+                    docmtime = long(doc['values']['mtime'])
+                    updated = mtime > docmtime
+                    logging.debug("uid %r: mtime %r > docmtime %r == updated %r" % (uid, mtime, docmtime, updated))
+                else:
+                    uid = None
+                    updated = True
+            elif mode == 'add':
+                updated = True
+            logging.debug("%s %s %r" % (pagename, att, updated))
+            if updated:
+                xatt_itemid = xapdoc.Keyword('itemid', att_itemid)
+                xpname = xapdoc.SortKey('pagename', pagename)
+                xwname = xapdoc.SortKey('wikiname', request.cfg.interwikiname or u"Self")
+                xattachment = xapdoc.SortKey('attachment', att) # this is an attachment, store its filename
+                xmtime = xapdoc.SortKey('mtime', str(mtime))
+                xrev = xapdoc.SortKey('revision', '0')
+                xtitle = xapdoc.Keyword('title', '%s/%s' % (pagename, att))
+                xlanguage = xapdoc.Keyword('lang', language)
+                xstem_language = xapdoc.Keyword('stem_lang', stem_language)
+                mimetype, att_content = self.contentfilter(filename)
+                xmimetypes = [xapdoc.Keyword('mimetype', mt) for mt in [mimetype, ] + mimetype.split('/')]
+                xcontent = xapdoc.TextField('content', att_content)
+                xtitle_txt = xapdoc.TextField('title', '%s/%s' % (pagename, att), True)
+                xfulltitle = xapdoc.Keyword('fulltitle', pagename)
+                xdomains = [xapdoc.Keyword('domain', domain) for domain in domains]
+                doc = xapdoc.Document(textFields=(xcontent, xtitle_txt),
+                                      keywords=xdomains + xmimetypes + [xatt_itemid,
+                                          xtitle, xlanguage, xstem_language,
+                                          xfulltitle, ],
+                                      sortFields=(xpname, xattachment, xmtime,
+                                          xwname, xrev, ),
+                                     )
+                doc.analyzerFactory = getWikiAnalyzerFactory(request, stem_language)
+                if mode == 'update':
+                    logging.debug("%s (replace %r)" % (pagename, uid))
+                    doc.uid = uid
+                    id = writer.index(doc)
+                elif mode == 'add':
+                    logging.debug("%s (add)" % (pagename, ))
+                    id = writer.index(doc)
+        #writer.flush()
+
+    def _index_page_rev(self, request, writer, page, mode='update'):
+        """ Index a page revision - assumes that the write lock is acquired
+
+        @arg writer: the index writer object
         @arg page: a page object
         @arg mode: 'add' = just add, no checks
                    'update' = check if already in index and update if needed (mtime)
         """
-        request = page.request
+        request.page = page
         wikiname = request.cfg.interwikiname or u"Self"
         pagename = page.page_name
         mtime = page.mtime_usecs()
@@ -511,67 +582,8 @@
                 logging.debug("%s (add)" % (pagename, ))
                 id = writer.index(doc)
 
-        from MoinMoin.action import AttachFile
 
-        attachments = AttachFile._get_files(request, pagename)
-        for att in attachments:
-            filename = AttachFile.getFilename(request, pagename, att)
-            att_itemid = "%s:%s//%s" % (wikiname, pagename, att)
-            mtime = wikiutil.timestamp2version(os.path.getmtime(filename))
-            if mode == 'update':
-                query = xapidx.RawQuery(xapdoc.makePairForWrite('itemid', att_itemid))
-                enq, mset, docs = writer.search(query, valuesWanted=['pagename', 'attachment', 'mtime', ])
-                logging.debug("##%r %r" % (filename, docs))
-                if docs:
-                    doc = docs[0] # there should be only one
-                    uid = doc['uid']
-                    docmtime = long(doc['values']['mtime'])
-                    updated = mtime > docmtime
-                    logging.debug("uid %r: mtime %r > docmtime %r == updated %r" % (uid, mtime, docmtime, updated))
-                else:
-                    uid = None
-                    updated = True
-            elif mode == 'add':
-                updated = True
-            logging.debug("%s %s %r" % (pagename, att, updated))
-            if updated:
-                xatt_itemid = xapdoc.Keyword('itemid', att_itemid)
-                xpname = xapdoc.SortKey('pagename', pagename)
-                xwname = xapdoc.SortKey('wikiname', request.cfg.interwikiname or u"Self")
-                xattachment = xapdoc.SortKey('attachment', att) # this is an attachment, store its filename
-                xmtime = xapdoc.SortKey('mtime', mtime)
-                xrev = xapdoc.SortKey('revision', '0')
-                xtitle = xapdoc.Keyword('title', '%s/%s' % (pagename, att))
-                xlanguage = xapdoc.Keyword('lang', language)
-                xstem_language = xapdoc.Keyword('stem_lang', stem_language)
-                mimetype, att_content = self.contentfilter(filename)
-                xmimetypes = [xapdoc.Keyword('mimetype', mt) for mt in [mimetype, ] + mimetype.split('/')]
-                xcontent = xapdoc.TextField('content', att_content)
-                xtitle_txt = xapdoc.TextField('title',
-                        '%s/%s' % (pagename, att), True)
-                xfulltitle = xapdoc.Keyword('fulltitle', pagename)
-                xdomains = [xapdoc.Keyword('domain', domain)
-                        for domain in domains]
-                doc = xapdoc.Document(textFields=(xcontent, xtitle_txt),
-                                      keywords=xdomains + xmimetypes + [xatt_itemid,
-                                          xtitle, xlanguage, xstem_language,
-                                          xfulltitle, ],
-                                      sortFields=(xpname, xattachment, xmtime,
-                                          xwname, xrev, ),
-                                     )
-                doc.analyzerFactory = getWikiAnalyzerFactory(request,
-                        stem_language)
-                if mode == 'update':
-                    logging.debug("%s (replace %r)" % (pagename, uid))
-                    doc.uid = uid
-                    id = writer.index(doc)
-                elif mode == 'add':
-                    logging.debug("%s (add)" % (pagename, ))
-                    id = writer.index(doc)
-        #writer.flush()
-
-    def _remove_item(self, writer, page, attachment=None):
-        request = page.request
+    def _remove_item(self, request, writer, page, attachment=None):
         wikiname = request.cfg.interwikiname or u'Self'
         pagename = page.page_name
 
@@ -627,15 +639,7 @@
             pages = request.rootpage.getPageList(user='', exists=1)
             logging.debug("indexing all (%d) pages..." % len(pages))
             for pagename in pages:
-                p = Page(request, pagename)
-                request.page = p
-                if request.cfg.xapian_index_history:
-                    for rev in p.getRevList():
-                        self._index_page(writer,
-                                Page(request, pagename, rev=rev),
-                                mode)
-                else:
-                    self._index_page(writer, p, mode)
+                self._index_page(request, writer, pagename, mode=mode)
             if files:
                 logging.debug("indexing all files...")
                 for fname in files:
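
The refactoring above is what stops attachments from being re-indexed for every page revision: _index_page now loops over revisions via _index_page_rev and then handles attachments exactly once, at page level. A toy model of that control flow (index_page and its arguments here are made up for illustration):

def index_page(pagename, revisions, attachments, index_history=False):
    """Index one page: every revision (optionally), attachments once."""
    docs = []
    if index_history:
        revs = revisions        # one document per stored revision
    else:
        revs = revisions[-1:]   # current revision only
    for rev in revs:
        docs.append(('page', pagename, rev))
    # attachments are unversioned, so index them once per page -- the
    # old code effectively re-indexed them for every revision
    for att in attachments:
        docs.append(('attachment', pagename, att))
    return docs

print index_page('FrontPage', [1, 2, 3], ['logo.png'], index_history=True)
# [('page', 'FrontPage', 1), ('page', 'FrontPage', 2),
#  ('page', 'FrontPage', 3), ('attachment', 'FrontPage', 'logo.png')]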
--- a/MoinMoin/search/__init__.py	Tue Jan 27 21:18:50 2009 +0100
+++ b/MoinMoin/search/__init__.py	Wed Jan 28 16:42:21 2009 +0100
@@ -13,7 +13,7 @@
 from MoinMoin import log
 logging = log.getLogger(__name__)
 
-from MoinMoin.search.queryparser import QueryParser
+from MoinMoin.search.queryparser import QueryParser, QueryError
 from MoinMoin.search.builtin import Search
 
 def searchPages(request, query, sort='weight', mtime=None, historysearch=None, **kw):
--- a/MoinMoin/search/_tests/test_search.py	Tue Jan 27 21:18:50 2009 +0100
+++ b/MoinMoin/search/_tests/test_search.py	Wed Jan 28 16:42:21 2009 +0100
@@ -9,6 +9,7 @@
 
 import py
 
+from MoinMoin.search import QueryError
 from MoinMoin.search.queryparser import QueryParser
 from MoinMoin import search
 
@@ -61,7 +62,7 @@
         """ search: test the query parser """
         parser = QueryParser()
         def _test(q):
-            py.test.raises(ValueError, parser.parse_query, q)
+            py.test.raises(QueryError, parser.parse_query, q)
         for query in ['""', '(', ')', '(a or b']:
             yield _test, query
 
--- a/MoinMoin/search/builtin.py	Tue Jan 27 21:18:50 2009 +0100
+++ b/MoinMoin/search/builtin.py	Wed Jan 28 16:42:21 2009 +0100
@@ -279,7 +279,7 @@
             return
 
         from threading import Thread
-        indexThread = Thread(target=self._index_pages, args=(files, mode))
+        indexThread = Thread(target=self._index_pages, args=(self.request, files, mode))
         indexThread.setDaemon(True)
 
         # Join the index thread after current request finish, prevent
@@ -403,12 +403,14 @@
         """
         from MoinMoin.request.request_cli import Request
         from MoinMoin.security import Permissions
-        request = Request(request.url)
+        from MoinMoin.logfile import editlog
+        r = Request(request.url)
         class SecurityPolicy(Permissions):
             def read(self, *args, **kw):
                 return True
-        request.user.may = SecurityPolicy(request.user)
-        return request
+        r.user.may = SecurityPolicy(r.user)
+        r.editlog = editlog.EditLog(r)
+        return r
 
     def _unsign(self):
         """ Remove sig file - assume write lock acquired """
@@ -636,12 +638,12 @@
             wikiname = valuedict['wikiname']
             pagename = valuedict['pagename']
             attachment = valuedict['attachment']
-            logging.debug("_getHits processing %r %r %r" % (wikiname, pagename, attachment))
 
             if 'revision' in valuedict and valuedict['revision']:
                 revision = int(valuedict['revision'])
             else:
                 revision = 0
+            logging.debug("_getHits processing %r %r %d %r" % (wikiname, pagename, revision, attachment))
 
             if wikiname in (self.request.cfg.interwikiname, 'Self'): # THIS wiki
                 page = Page(self.request, pagename, rev=revision)
@@ -650,14 +652,16 @@
                     # revlist can be empty if page was nuked/renamed since it was included in xapian index
                     if not revlist or revlist[0] != revision:
                         # nothing there at all or not the current revision
+                        logging.debug("no history search, skipping non-current revision...")
                         continue
                 if attachment:
+                    # revision is currently always 0 here
                     if pagename == fs_rootpage: # not really an attachment
                         page = Page(self.request, "%s/%s" % (fs_rootpage, attachment))
-                        hits.append((wikiname, page, None, None))
+                        hits.append((wikiname, page, None, None, revision))
                     else:
                         matches = matchSearchFunction(page=None, uid=uid)
-                        hits.append((wikiname, page, attachment, matches))
+                        hits.append((wikiname, page, attachment, matches, revision))
                 else:
                     matches = matchSearchFunction(page=page, uid=uid)
                     logging.debug("matchSearchFunction %r returned %r" % (matchSearchFunction, matches))
@@ -667,10 +671,11 @@
                                 revisionCache[pagename][0] < revision:
                             hits.remove(revisionCache[pagename][1])
                             del revisionCache[pagename]
-                        hits.append((wikiname, page, attachment, matches))
+                        hits.append((wikiname, page, attachment, matches, revision))
                         revisionCache[pagename] = (revision, hits[-1])
             else: # other wiki
                 hits.append((wikiname, pagename, attachment, None, revision))
+        logging.debug("_getHits returning %r." % hits)
         return hits
 
     def _getPageList(self):
@@ -697,8 +702,8 @@
         userMayRead = self.request.user.may.read
         fs_rootpage = self.fs_rootpage + "/"
         thiswiki = (self.request.cfg.interwikiname, 'Self')
-        filtered = [(wikiname, page, attachment, match)
-                for wikiname, page, attachment, match in hits
+        filtered = [(wikiname, page, attachment, match, rev)
+                for wikiname, page, attachment, match, rev in hits
                     if (not wikiname in thiswiki or
                        page.exists() and userMayRead(page.page_name) or
                        page.page_name.startswith(fs_rootpage)) and
--- a/MoinMoin/search/queryparser.py	Tue Jan 27 21:18:50 2009 +0100
+++ b/MoinMoin/search/queryparser.py	Wed Jan 28 16:42:21 2009 +0100
@@ -24,6 +24,10 @@
 except ImportError:
     pass
 
+class QueryError(ValueError):
+    """ error raised for problems when parsing the query """
+
+
 #############################################################################
 ### query objects
 #############################################################################
@@ -961,7 +965,7 @@
                             orexpr = OrExpression(terms)
                         terms = AndExpression(orexpr)
                     else:
-                        raise ValueError('Nothing to OR')
+                        raise QueryError('Nothing to OR')
                     remaining = self._analyse_items(items)
                     if remaining.__class__ == OrExpression:
                         for sub in remaining.subterms():
@@ -977,7 +981,7 @@
                     # being parsed rather than rejecting an empty string
                     # before parsing...
                     if not item:
-                        raise ValueError("Term too short")
+                        raise QueryError("Term too short")
                     regex = self.regex
                     case = self.case
                     if self.titlesearch:
@@ -997,7 +1001,7 @@
                 while len(item) > 1:
                     m = item[0]
                     if m is None:
-                        raise ValueError("Invalid search prefix")
+                        raise QueryError("Invalid search prefix")
                     elif m == M:
                         negate = True
                     elif "title".startswith(m):
@@ -1017,7 +1021,7 @@
                     elif "domain".startswith(m):
                         domain = True
                     else:
-                        raise ValueError("Invalid search prefix")
+                        raise QueryError("Invalid search prefix")
                     item = item[1:]
 
                 text = item[0]
@@ -1057,9 +1061,9 @@
                                                         multikey=True,
                                                         brackets=('()', ),
                                                         quotes='\'"')
-            logging.debug("parse_quoted_separated items: %r" % items)
         except wikiutil.BracketError, err:
-            raise ValueError(str(err))
+            raise QueryError(str(err))
+        logging.debug("parse_quoted_separated items: %r" % items)
         query = self._analyse_items(items)
         logging.debug("analyse_items query: %r" % query)
         return query
--- a/MoinMoin/search/results.py	Tue Jan 27 21:18:50 2009 +0100
+++ b/MoinMoin/search/results.py	Wed Jan 28 16:42:21 2009 +0100
@@ -95,10 +95,11 @@
 class FoundPage:
     """ Represents a page in a search result """
 
-    def __init__(self, page_name, matches=None, page=None):
+    def __init__(self, page_name, matches=None, page=None, rev=0):
         self.page_name = page_name
         self.attachment = '' # this is not an attachment
         self.page = page
+        self.rev = rev
         if matches is None:
             matches = []
         self._matches = matches
@@ -184,9 +185,10 @@
 class FoundAttachment(FoundPage):
     """ Represents an attachment in search results """
 
-    def __init__(self, page_name, attachment, matches=None, page=None):
+    def __init__(self, page_name, attachment, matches=None, page=None, rev=0):
         self.page_name = page_name
         self.attachment = attachment
+        self.rev = rev
         self.page = page
         if matches is None:
             matches = []
@@ -199,9 +201,10 @@
 class FoundRemote(FoundPage):
     """ Represents an attachment in search results """
 
-    def __init__(self, wikiname, page_name, attachment, matches=None, page=None):
+    def __init__(self, wikiname, page_name, attachment, matches=None, page=None, rev=0):
         self.wikiname = wikiname
         self.page_name = page_name
+        self.rev = rev
         self.attachment = attachment
         self.page = page
         if matches is None:
@@ -330,15 +333,16 @@
                 displayHits = self.hits
 
             for page in displayHits:
+                # TODO handle interwiki search hits
                 if page.attachment:
                     querydict = {
                         'action': 'AttachFile',
                         'do': 'view',
                         'target': page.attachment,
                     }
-                elif page.page.rev and page.page.rev != page.page.getRevList()[0]:
+                elif page.rev and page.rev != page.page.getRevList()[0]:
                     querydict = {
-                        'rev': page.page.rev,
+                        'rev': page.rev,
                     }
                 else:
                     querydict = None
@@ -409,6 +413,7 @@
                 displayHits = self.hits
 
             for page in displayHits:
+                # TODO handle interwiki search hits
                 matchInfo = ''
                 if info:
                     matchInfo = self.formatInfo(f, page)
@@ -424,9 +429,9 @@
                     querydict = None
                 else:
                     fmt_context = self.formatContext(page, context, maxlines)
-                    if page.page.rev and page.page.rev != page.page.getRevList()[0]:
+                    if page.rev and page.rev != page.page.getRevList()[0]:
                         querydict = {
-                            'rev': page.page.rev,
+                            'rev': page.rev,
                         }
                     else:
                         querydict = None
@@ -819,16 +824,15 @@
     @param estimated_hits: if true, use this estimated hit count
     """
     result_hits = []
-    for wikiname, page, attachment, match in hits:
+    for wikiname, page, attachment, match, rev in hits:
         if wikiname in (request.cfg.interwikiname, 'Self'): # a local match
             if attachment:
-                result_hits.append(FoundAttachment(page.page_name,
-                    attachment, page=page, matches=match))
+                result_hits.append(FoundAttachment(page.page_name, attachment, matches=match, page=page, rev=rev))
             else:
-                result_hits.append(FoundPage(page.page_name, match, page))
+                result_hits.append(FoundPage(page.page_name, matches=match, page=page, rev=rev))
         else:
-            result_hits.append(FoundRemote(wikiname, page.page_name,
-                attachment, match, page))
+            page_name = page # for remote wikis, we have the page_name, not the page obj
+            result_hits.append(FoundRemote(wikiname, page_name, attachment, matches=match, rev=rev))
     elapsed = time.time() - start
     count = request.rootpage.getPageCount()
     return SearchResults(query, result_hits, count, elapsed, sort,
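
The results.py changes close out the "wrong result tuple length" fix: local hits used to be 4-tuples while remote hits were already 5-tuples, so uniform unpacking failed, and the old broad except clause hid the failure. Every producer now emits the same (wikiname, page, attachment, matches, rev) shape; a toy illustration with made-up data:

hits = [
    ('Self', 'FrontPage', '', [], 0),          # local page hit
    ('Self', 'FrontPage', 'logo.png', [], 0),  # local attachment hit
    ('OtherWiki', 'SomePage', '', None, 0),    # hit on a remote wiki
]
# one unpacking loop now works for every kind of hit
for wikiname, page, attachment, matches, rev in hits:
    print wikiname, page, attachment, rev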