changeset 919:5469c8b911a4

Splitting out MoinMoin/search.py to MoinMoin/search/*.py
author Franz Pletz <fpletz AT franz-pletz DOT org>
date Tue, 27 Jun 2006 15:09:46 +0200
parents d9bd5d6ae30d
children a2498260eca5
files MoinMoin/Xapian.py MoinMoin/action/fckdialog.py MoinMoin/action/fullsearch.py MoinMoin/macro/FullSearch.py MoinMoin/macro/__init__.py MoinMoin/search.py MoinMoin/search/Xapian.py MoinMoin/search/__init__.py MoinMoin/search/builtin.py MoinMoin/search/queryparser.py MoinMoin/search/results.py MoinMoin/xmlrpc/__init__.py docs/CHANGES.fpletz
diffstat 13 files changed, 2309 insertions(+), 2264 deletions(-)
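
The split turns the single MoinMoin/search.py module into a package. The
module roles below are inferred from the file names in this changeset, and
the searchPages() keyword arguments are exactly those passed by the updated
call sites further down (request and needle are assumed to be in scope):

    # MoinMoin/search/__init__.py    - searchPages() entry point
    # MoinMoin/search/queryparser.py - QueryParser and the query term classes
    # MoinMoin/search/builtin.py     - the builtin (non-xapian) search engine
    # MoinMoin/search/Xapian.py      - xapian indexing (was MoinMoin/Xapian.py)
    # MoinMoin/search/results.py     - result and match classes
    from MoinMoin import search
    results = search.searchPages(request, needle,
                                 titlesearch=1, case=0, regex=0)
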
--- a/MoinMoin/Xapian.py	Tue Jun 27 13:58:39 2006 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,771 +0,0 @@
-# -*- coding: iso-8859-1 -*-
-"""
-    MoinMoin - xapian indexing search engine
-
-    @copyright: 2006 MoinMoin:ThomasWaldmann,
-                2006 MoinMoin:FranzPletz
-    @license: GNU GPL, see COPYING for details.
-"""
-debug = True
-
-import sys, os, re, codecs, errno, time
-from pprint import pprint
-
-import xapian
-from xapian import Query
-from MoinMoin.support.xapwrap import document as xapdoc
-from MoinMoin.support.xapwrap import index as xapidx
-from MoinMoin.parser.text_moin_wiki import Parser as WikiParser
-
-from MoinMoin.Page import Page
-from MoinMoin import config, wikiutil
-from MoinMoin.util import filesys, lock
-
-try:
-    # PyStemmer, snowball python bindings from http://snowball.tartarus.org/
-    from Stemmer import Stemmer
-except ImportError:
-    Stemmer = None
-
-class UnicodeQuery(xapian.Query):
-    def __init__(self, *args, **kwargs):
-        self.encoding = kwargs.get('encoding', config.charset)
-
-        nargs = []
-        for term in args:
-            if isinstance(term, unicode):
-                term = term.encode(self.encoding)
-            elif isinstance(term, list) or isinstance(term, tuple):
-                term = [t.encode(self.encoding) for t in term]
-            nargs.append(term)
-
-        xapian.Query.__init__(self, *nargs, **kwargs)
-
-
-##############################################################################
-### Tokenizer
-##############################################################################
-
-def getWikiAnalyzerFactory(request=None, language='en'):
-    return (lambda: WikiAnalyzer(request, language))
-
-class WikiAnalyzer:
-    singleword = r"[%(u)s][%(l)s]+" % {
-                     'u': config.chars_upper,
-                     'l': config.chars_lower,
-                 }
-
-    singleword_re = re.compile(singleword, re.U)
-    wikiword_re = re.compile(WikiParser.word_rule, re.U)
-
-    token_re = re.compile(
-        r"(?P<company>\w+[&@]\w+)|" + # company names like AT&T and Excite@Home.
-        r"(?P<email>\w+([.-]\w+)*@\w+([.-]\w+)*)|" +    # email addresses
-        r"(?P<hostname>\w+(\.\w+)+)|" +                 # hostnames
-        r"(?P<num>(\w+[-/.,])*\w*\d\w*([-/.,]\w+)*)|" + # version numbers
-        r"(?P<acronym>(\w\.)+)|" +          # acronyms: U.S.A., I.B.M., etc.
-        r"(?P<word>\w+)",                   # words (including WikiWords)
-        re.U)
-
-    dot_re = re.compile(r"[-_/,.]")
-    mail_re = re.compile(r"[-_/,.]|(@)")
-    
-    # XXX limit stuff above to xapdoc.MAX_KEY_LEN
-    # WORD_RE = re.compile('\\w{1,%i}' % MAX_KEY_LEN, re.U)
-
-    def __init__(self, request=None, language=None):
-        if request and request.cfg.xapian_stemming and language and Stemmer:
-            self.stemmer = Stemmer(language)
-        else:
-            self.stemmer = None
-
-    def raw_tokenize(self, value):
-        def enc(uc):
-            """ 'encode' unicode results into whatever xapian / xapwrap wants """
-            lower = uc.lower()
-            return lower
-            
-        if isinstance(value, list): # used for page links
-            for v in value:
-                yield enc(v)
-        else:
-            tokenstream = re.finditer(self.token_re, value)
-            for m in tokenstream:
-                if m.group("acronym"):
-                    yield enc(m.group("acronym").replace('.', ''))
-                elif m.group("company"):
-                    yield enc(m.group("company"))
-                elif m.group("email"):
-                    for word in self.mail_re.split(m.group("email")):
-                        if word:
-                            yield enc(word)
-                elif m.group("hostname"):
-                    for word in self.dot_re.split(m.group("hostname")):
-                        yield enc(word)
-                elif m.group("num"):
-                    for word in self.dot_re.split(m.group("num")):
-                        yield enc(word)
-                elif m.group("word"):
-                    word = m.group("word")
-                    yield enc(word)
-                    # if it is a CamelCaseWord, we additionally yield Camel, Case and Word
-                    if self.wikiword_re.match(word):
-                        for sm in re.finditer(self.singleword_re, word):
-                            yield enc(sm.group())
-
-    def tokenize(self, value, flat_stemming=True):
-        """Yield a stream of lower cased raw and stemmed (optional) words from a string.
-           value must be an UNICODE object or a list of unicode objects
-        """
-        for i in self.raw_tokenize(value):
-            if flat_stemming:
-                yield i # XXX: should we really use a prefix for that? Index.prefixMap['raw'] + i
-                if self.stemmer:
-                    yield self.stemmer.stemWord(i)
-            else:
-                # guard against a missing stemmer (PyStemmer not installed)
-                if self.stemmer:
-                    yield (i, self.stemmer.stemWord(i))
-                else:
-                    yield (i, i)
-
-
-#############################################################################
-### Indexing
-#############################################################################
-
-class UpdateQueue:
-    def __init__(self, file, lock_dir):
-        self.file = file
-        self.writeLock = lock.WriteLock(lock_dir, timeout=10.0)
-        self.readLock = lock.ReadLock(lock_dir, timeout=10.0)
-
-    def exists(self):
-        return os.path.exists(self.file)
-
-    def append(self, pagename):
-        """ Append a page to queue """
-        if not self.writeLock.acquire(60.0):
-            # no request object is available here, so write to stderr
-            sys.stderr.write("can't add %r to xapian update queue: can't lock queue\n"
-                             % pagename)
-            return
-        try:
-            f = codecs.open(self.file, 'a', config.charset)
-            try:
-                f.write(pagename + "\n")
-            finally:
-                f.close()
-        finally:
-            self.writeLock.release()
-
-    def pages(self):
-        """ Return list of pages in the queue """
-        if self.readLock.acquire(1.0):
-            try:
-                return self._decode(self._read())
-            finally:
-                self.readLock.release()
-        return []
-
-    def remove(self, pages):
-        """ Remove pages from the queue
-        
-        When the queue is empty, the queue file is removed, so exists()
-        can tell if there is something waiting in the queue.
-        """
-        if self.writeLock.acquire(30.0):
-            try:
-                queue = self._decode(self._read())
-                for page in pages:
-                    try:
-                        queue.remove(page)
-                    except ValueError:
-                        pass
-                if queue:
-                    self._write(queue)
-                else:
-                    self._removeFile()
-                return True
-            finally:
-                self.writeLock.release()
-        return False
-
-    # Private -------------------------------------------------------
-
-    def _decode(self, data):
-        """ Decode queue data """
-        pages = data.splitlines()
-        return self._filterDuplicates(pages)
-
-    def _filterDuplicates(self, pages):
-        """ Filter duplicates in page list, keeping the order """
-        unique = []
-        seen = {}
-        for name in pages:
-            if not name in seen:
-                unique.append(name)
-                seen[name] = 1
-        return unique
-
-    def _read(self):
-        """ Read and return queue data
-        
-        This does not do anything with the data so we can release the
-        lock as soon as possible, enabling others to update the queue.
-        """
-        try:
-            f = codecs.open(self.file, 'r', config.charset)
-            try:
-                return f.read()
-            finally:
-                f.close()
-        except (OSError, IOError), err:
-            if err.errno != errno.ENOENT:
-                raise
-            return ''
-
-    def _write(self, pages):
-        """ Write pages to queue file
-        
-        Requires queue write locking.
-        """
-        # XXX use tmpfile/move for atomic replace on real operating systems
-        data = '\n'.join(pages) + '\n'
-        f = codecs.open(self.file, 'w', config.charset)
-        try:
-            f.write(data)
-        finally:
-            f.close()
-
-    def _removeFile(self):
-        """ Remove queue file 
-        
-        Requires queue write locking.
-        """
-        try:
-            os.remove(self.file)
-        except OSError, err:
-            if err.errno != errno.ENOENT:
-                raise
-
-
-class Index:
-    indexValueMap = {
-        # mapping the value names we can easily fetch from the index to
-        # integers required by xapian. 0 and 1 are reserved by xapwrap!
-        'pagename': 2,
-        'attachment': 3,
-        'mtime': 4,
-        'wikiname': 5,
-    }
-    prefixMap = {
-        # http://svn.xapian.org/*checkout*/trunk/xapian-applications/omega/docs/termprefixes.txt
-        'author': 'A',
-        'date':   'D', # numeric format: YYYYMMDD or "latest" - e.g. D20050224 or Dlatest
-                       #G   newsGroup (or similar entity - e.g. a web forum name)
-        'hostname': 'H',
-        'keyword': 'K',
-        'lang': 'L',   # ISO Language code
-                       #M   Month (numeric format: YYYYMM)
-                       #N   ISO couNtry code (or domaiN name)
-                       #P   Pathname
-                       #Q   uniQue id
-        'raw':  'R',   # Raw (i.e. unstemmed) term
-        'title': 'S',  # Subject (or title)
-        'mimetype': 'T',
-        'url': 'U',    # full URL of indexed document - if the resulting term would be > 240
-                       # characters, a hashing scheme is used to prevent overflowing
-                       # the Xapian term length limit (see omindex for how to do this).
-                       #W   "weak" (approximately 10 day intervals, taken as YYYYMMD from
-                       #  the D term, and changing the last digit to a '2' if it's a '3')
-                       #X   longer prefix for user-defined use
-        'linkto': 'XLINKTO', # this document links to that document
-        'stem_lang': 'XSTEMLANG', # ISO Language code this document was stemmed in 
-                       #Y   year (four digits)
-    }
-
-    class LockedException(Exception):
-        pass
-    
-    def __init__(self, request):
-        self.request = request
-        main_dir = self._main_dir()
-        self.dir = os.path.join(main_dir, 'index')
-        filesys.makeDirs(self.dir)
-        self.sig_file = os.path.join(main_dir, 'complete')
-        lock_dir = os.path.join(main_dir, 'index-lock')
-        self.lock = lock.WriteLock(lock_dir,
-                                   timeout=3600.0, readlocktimeout=60.0)
-        self.read_lock = lock.ReadLock(lock_dir, timeout=3600.0)
-        self.queue = UpdateQueue(os.path.join(main_dir, 'update-queue'),
-                                 os.path.join(main_dir, 'update-queue-lock'))
-
-        # Disabled until we have a sane way to build the index with a
-        # queue in small steps.
-        ## if not self.exists():
-        ##    self.indexPagesInNewThread(request)
-
-        # Check if we should and can stem words
-        if request.cfg.xapian_stemming and not Stemmer:
-            request.cfg.xapian_stemming = False
-
-    def _main_dir(self):
-        if self.request.cfg.xapian_index_dir:
-            return os.path.join(self.request.cfg.xapian_index_dir,
-                    self.request.cfg.siteid)
-        else:
-            return os.path.join(self.request.cfg.cache_dir, 'xapian')
-
-    def exists(self):
-        """ Check if index exists """        
-        return os.path.exists(self.sig_file)
-                
-    def mtime(self):
-        return os.path.getmtime(self.dir)
-
-    def _search(self, query):
-        """ read lock must be acquired """
-        while True:
-            try:
-                searcher, timestamp = self.request.cfg.xapian_searchers.pop()
-                if timestamp != self.mtime():
-                    searcher.close()
-                else:
-                    break
-            except IndexError:
-                searcher = xapidx.ReadOnlyIndex(self.dir)
-                searcher.configure(self.prefixMap, self.indexValueMap)
-                timestamp = self.mtime()
-                break
-        
-        hits = searcher.search(query, valuesWanted=['pagename', 'attachment', 'mtime', 'wikiname'])
-        self.request.cfg.xapian_searchers.append((searcher, timestamp))
-        return hits
-    
-    def search(self, query):
-        if not self.read_lock.acquire(1.0):
-            raise self.LockedException
-        try:
-            hits = self._search(query)
-        finally:
-            self.read_lock.release()
-        return hits
-
-    def update_page(self, page):
-        self.queue.append(page.page_name)
-        self._do_queued_updates_InNewThread()
-
-    def indexPages(self, files=None, mode='update'):
-        """ Index all pages (and files, if given)
-        
-        Can be called only from a script. To index pages during a user
-        request, use indexPagesInNewThread.
-        @arg files: iterator or list of files to index additionally
-        """
-        if not self.lock.acquire(1.0):
-            self.request.log("can't index: can't acquire lock")
-            return
-        try:
-            request = self._indexingRequest(self.request)
-            self._index_pages(request, None, files, mode)
-        finally:
-            self.lock.release()
-    
-    def indexPagesInNewThread(self, files=None, mode='update'):
-        """ Index all pages in a new thread
-        
-        Should be called from a user request. From a script, use indexPages.
-        """
-        if not self.lock.acquire(1.0):
-            self.request.log("can't index: can't acquire lock")
-            return
-        try:
-            # Prevent rebuilding the index just after it was finished
-            if self.exists():
-                self.lock.release()
-                return
-            from threading import Thread
-            indexThread = Thread(target=self._index_pages,
-                args=(self._indexingRequest(self.request), self.lock, files, mode))
-            indexThread.setDaemon(True)
-            
-            # Join the index thread after the current request finishes, to
-            # prevent Apache CGI from killing the process.
-            def joinDecorator(finish):
-                def func():
-                    finish()
-                    indexThread.join()
-                return func
-
-            self.request.finish = joinDecorator(self.request.finish)
-            indexThread.start()
-        except:
-            self.lock.release()
-            raise
-
-    def optimize(self):
-        pass
-
-    # Private ----------------------------------------------------------------
-
-    def _do_queued_updates_InNewThread(self):
-        """ do queued index updates in a new thread
-        
-        Should be called from a user request. From a script, use indexPages.
-        """
-        if not self.lock.acquire(1.0):
-            self.request.log("can't index: can't acquire lock")
-            return
-        try:
-            from threading import Thread
-            indexThread = Thread(target=self._do_queued_updates,
-                args=(self._indexingRequest(self.request), self.lock))
-            indexThread.setDaemon(True)
-            
-            # Join the index thread after the current request finishes, to
-            # prevent Apache CGI from killing the process.
-            def joinDecorator(finish):
-                def func():
-                    finish()
-                    indexThread.join()
-                return func
-                
-            self.request.finish = joinDecorator(self.request.finish)
-            indexThread.start()
-        except:
-            self.lock.release()
-            raise
-
-    def _do_queued_updates(self, request, lock=None, amount=5):
-        """ Assumes that the write lock is acquired """
-        writer = xapidx.Index(self.dir, True)
-        try:
-            writer.configure(self.prefixMap, self.indexValueMap)
-            pages = self.queue.pages()[:amount]
-            for name in pages:
-                p = Page(request, name)
-                self._index_page(writer, p, mode='update')
-                self.queue.remove([name])
-        finally:
-            writer.close()
-            if lock:
-                lock.release()
-
-    def contentfilter(self, filename):
-        """ Get a filter for content of filename and return unicode content. """
-        request = self.request
-        mt = wikiutil.MimeType(filename=filename)
-        for modulename in mt.module_name():
-            try:
-                execute = wikiutil.importPlugin(request.cfg, 'filter', modulename)
-                break
-            except wikiutil.PluginMissingError:
-                pass
-        else:
-            # no filter plugin could be loaded for any candidate module name
-            request.log("Cannot load filter for mimetype %s" % modulename)
-        try:
-            data = execute(self, filename)
-            if debug:
-                request.log("Filter %s returned %d characters for file %s" % (modulename, len(data), filename))
-        except (OSError, IOError), err:
-            data = ''
-            request.log("Filter %s threw error '%s' for file %s" % (modulename, str(err), filename))
-        return mt.mime_type(), data
-   
-    def test(self, request):
-        idx = xapidx.ReadOnlyIndex(self.dir)
-        idx.configure(self.prefixMap, self.indexValueMap)
-        print idx.search("is")
-        #for d in docs:
-        #    request.log("%r %r %r" % (d, d.get('attachment'), d.get('pagename')))
-
-    def _index_file(self, request, writer, filename, mode='update'):
-        """ index a file as it were a page named pagename
-            Assumes that the write lock is acquired
-        """
-        fs_rootpage = 'FS' # XXX FS hardcoded
-        try:
-            wikiname = request.cfg.interwikiname or 'Self'
-            itemid = "%s:%s" % (wikiname, os.path.join(fs_rootpage, filename))
-            mtime = os.path.getmtime(filename)
-            mtime = wikiutil.timestamp2version(mtime)
-            if mode == 'update':
-                query = xapidx.RawQuery(xapdoc.makePairForWrite('itemid', itemid))
-                docs = writer.search(query, valuesWanted=['pagename', 'attachment', 'mtime', 'wikiname', ])
-                if docs:
-                    doc = docs[0] # there should be only one
-                    uid = doc['uid']
-                    docmtime = long(doc['values']['mtime'])
-                    updated = mtime > docmtime
-                    if debug: request.log("uid %r: mtime %r > docmtime %r == updated %r" % (uid, mtime, docmtime, updated))
-                else:
-                    uid = None
-                    updated = True
-            elif mode == 'add':
-                updated = True
-            if debug: request.log("%s %r" % (filename, updated))
-            if updated:
-                xitemid = xapdoc.Keyword('itemid', itemid)
-                mimetype, file_content = self.contentfilter(filename)
-                xwname = xapdoc.SortKey('wikiname', request.cfg.interwikiname or "Self")
-                xpname = xapdoc.SortKey('pagename', fs_rootpage)
-                xattachment = xapdoc.SortKey('attachment', filename) # XXX we should treat files like real pages, not attachments
-                xmtime = xapdoc.SortKey('mtime', mtime)
-                title = " ".join(os.path.join(fs_rootpage, filename).split("/"))
-                xtitle = xapdoc.Keyword('title', title)
-                xmimetype = xapdoc.TextField('mimetype', mimetype, True)
-                xcontent = xapdoc.TextField('content', file_content)
-                doc = xapdoc.Document(textFields=(xcontent, xmimetype, ),
-                                      keywords=(xtitle, xitemid, ),
-                                      sortFields=(xpname, xattachment, xmtime, xwname, ),
-                                     )
-                doc.analyzerFactory = getWikiAnalyzerFactory()
-                if mode == 'update':
-                    if debug: request.log("%s (replace %r)" % (filename, uid))
-                    doc.uid = uid
-                    id = writer.index(doc)
-                elif mode == 'add':
-                    if debug: request.log("%s (add)" % (filename,))
-                    id = writer.index(doc)
-        except (OSError, IOError), err:
-            pass
-
-    def _get_languages(self, page):
-        body = page.get_raw_body()
-        default_lang = page.request.cfg.language_default
-
-        lang = ''
-
-        if page.request.cfg.xapian_stemming:
-            for line in body.split('\n'):
-                if line.startswith('#language'):
-                    lang = line.split(' ')[1]
-                    try:
-                        Stemmer(lang)
-                    except KeyError:
-                        # lang is not stemmable
-                        break
-                    else:
-                        # lang is stemmable
-                        return (lang, lang)
-                elif not line.startswith('#'):
-                    break
-        
-        if not lang:
-            # no lang found at all; fall back to the default language
-            lang = default_lang
-
-        # return actual lang and lang to stem in
-        return (lang, default_lang)
-
-    def _index_page(self, writer, page, mode='update'):
-        """ Index a page - assumes that the write lock is acquired
-            @arg writer: the index writer object
-            @arg page: a page object
-            @arg mode: 'add' = just add, no checks
-                       'update' = check if already in index and update if needed (mtime)
-            
-        """
-        request = page.request
-        wikiname = request.cfg.interwikiname or "Self"
-        pagename = page.page_name
-        mtime = page.mtime_usecs()
-        itemid = "%s:%s" % (wikiname, pagename)
-        # XXX: Hack until we get proper metadata
-        language, stem_language = self._get_languages(page)
-        updated = False
-
-        if mode == 'update':
-            # from #xapian: if you generate a special "unique id" term,
-            # you can just call database.replace_document(uid_term, doc)
-            # -> done in xapwrap.index.Index.index()
-            query = xapidx.RawQuery(xapdoc.makePairForWrite('itemid', itemid))
-            docs = writer.search(query, valuesWanted=['pagename', 'attachment', 'mtime', 'wikiname', ])
-            if docs:
-                doc = docs[0] # there should be only one
-                uid = doc['uid']
-                docmtime = long(doc['values']['mtime'])
-                updated = mtime > docmtime
-                if debug: request.log("uid %r: mtime %r > docmtime %r == updated %r" % (uid, mtime, docmtime, updated))
-            else:
-                uid = None
-                updated = True
-        elif mode == 'add':
-            updated = True
-        if debug: request.log("%s %r" % (pagename, updated))
-        if updated:
-            xwname = xapdoc.SortKey('wikiname', request.cfg.interwikiname or "Self")
-            xpname = xapdoc.SortKey('pagename', pagename)
-            xattachment = xapdoc.SortKey('attachment', '') # this is a real page, not an attachment
-            xmtime = xapdoc.SortKey('mtime', mtime)
-            xtitle = xapdoc.TextField('title', pagename, True) # prefixed
-            xkeywords = [xapdoc.Keyword('itemid', itemid),
-                    xapdoc.Keyword('lang', language),
-                    xapdoc.Keyword('stem_lang', stem_language)]
-            for pagelink in page.getPageLinks(request):
-                xkeywords.append(xapdoc.Keyword('linkto', pagelink))
-            xcontent = xapdoc.TextField('content', page.get_raw_body())
-            doc = xapdoc.Document(textFields=(xcontent, xtitle),
-                                  keywords=xkeywords,
-                                  sortFields=(xpname, xattachment, xmtime, xwname, ),
-                                 )
-            doc.analyzerFactory = getWikiAnalyzerFactory(request,
-                    stem_language)
-
-            if mode == 'update':
-                if debug: request.log("%s (replace %r)" % (pagename, uid))
-                doc.uid = uid
-                id = writer.index(doc)
-            elif mode == 'add':
-                if debug: request.log("%s (add)" % (pagename,))
-                id = writer.index(doc)
-
-        from MoinMoin.action import AttachFile
-
-        attachments = AttachFile._get_files(request, pagename)
-        for att in attachments:
-            filename = AttachFile.getFilename(request, pagename, att)
-            att_itemid = "%s//%s" % (itemid, att)
-            mtime = wikiutil.timestamp2version(os.path.getmtime(filename))
-            if mode == 'update':
-                query = xapidx.RawQuery(xapdoc.makePairForWrite('itemid', att_itemid))
-                docs = writer.search(query, valuesWanted=['pagename', 'attachment', 'mtime', ])
-                if debug: request.log("##%r %r" % (filename, docs))
-                if docs:
-                    doc = docs[0] # there should be only one
-                    uid = doc['uid']
-                    docmtime = long(doc['values']['mtime'])
-                    updated = mtime > docmtime
-                    if debug: request.log("uid %r: mtime %r > docmtime %r == updated %r" % (uid, mtime, docmtime, updated))
-                else:
-                    uid = None
-                    updated = True
-            elif mode == 'add':
-                updated = True
-            if debug: request.log("%s %s %r" % (pagename, att, updated))
-            if updated:
-                xatt_itemid = xapdoc.Keyword('itemid', att_itemid)
-                # xwname was only set in the page branch above; set it here
-                # too, since it is used in sortFields below
-                xwname = xapdoc.SortKey('wikiname', wikiname)
-                xpname = xapdoc.SortKey('pagename', pagename)
-                xattachment = xapdoc.SortKey('attachment', att) # this is an attachment, store its filename
-                xmtime = xapdoc.SortKey('mtime', mtime)
-                xtitle = xapdoc.Keyword('title', '%s/%s' % (pagename, att))
-                xlanguage = xapdoc.Keyword('lang', language)
-                xstem_language = xapdoc.Keyword('stem_lang', stem_language)
-                mimetype, att_content = self.contentfilter(filename)
-                xmimetype = xapdoc.TextField('mimetype', mimetype, True)
-                xcontent = xapdoc.TextField('content', att_content)
-                doc = xapdoc.Document(textFields=(xcontent, xmimetype, ),
-                                      keywords=(xatt_itemid, xtitle, xlanguage, xstem_language, ),
-                                      sortFields=(xpname, xattachment, xmtime, xwname, ),
-                                     )
-                doc.analyzerFactory = getWikiAnalyzerFactory(request,
-                        stem_language)
-                if mode == 'update':
-                    if debug: request.log("%s (replace %r)" % (pagename, uid))
-                    doc.uid = uid
-                    id = writer.index(doc)
-                elif mode == 'add':
-                    if debug: request.log("%s (add)" % (pagename,))
-                    id = writer.index(doc)
-        #writer.flush()
-        
-
-    def _index_pages(self, request, lock=None, files=None, mode='update'):
-        """ Index all pages (and all given files)
-        
-        This should be called from indexPages or indexPagesInNewThread only!
-        
-        This may take some time, depending on the size of the wiki and speed
-        of the machine.
-
-        When called in a new thread, lock is acquired before the call,
-        and this method must release it when it finishes or fails.
-        """
-        start = time.time()
-        writer = xapidx.Index(self.dir, True)
-        writer.configure(self.prefixMap, self.indexValueMap)
-        try:
-            self._unsign()
-            pages = request.rootpage.getPageList(user='', exists=1)
-            request.log("indexing all (%d) pages..." % len(pages))
-            for pagename in pages:
-                p = Page(request, pagename)
-                self._index_page(writer, p, mode)
-            if files:
-                request.log("indexing all files...")
-                for fname in files:
-                    fname = fname.strip()
-                    self._index_file(request, writer, fname, mode)
-            writer.close()
-            request.log("indexing completed successfully in %0.2f seconds." %
-                        (time.time() - start))
-            self._sign()
-        finally:
-            del writer # drop the reference; do not call __del__ directly
-            if lock:
-                lock.release()
-
-    def _optimize(self, request):
-        """ Optimize the index """
-        pass
-
-    def _indexingRequest(self, request):
-        """ Return a new request that can be used for index building.
-        
-        This request uses a security policy that lets the current user
-        read any page. Without this policy some pages would not render,
-        which would result in a broken pagelinks index.
-        """
-        from MoinMoin.request.CLI import Request
-        from MoinMoin.security import Permissions
-        request = Request(request.url)
-        class SecurityPolicy(Permissions):
-            def read(*args, **kw):
-                return True        
-        request.user.may = SecurityPolicy(request.user)
-        return request
-
-    def _unsign(self):
-        """ Remove sig file - assume write lock acquired """
-        try:
-            os.remove(self.sig_file)
-        except OSError, err:
-            if err.errno != errno.ENOENT:
-                raise
-
-    def _sign(self):
-        """ Add sig file - assume write lock acquired """
-        f = file(self.sig_file, 'w')
-        try:
-            f.write('')
-        finally:
-            f.close()
-
-
-def run_query(query, db):
-    enquire = xapian.Enquire(db)
-    parser = xapian.QueryParser()
-    query = parser.parse_query(query, xapian.QueryParser.FLAG_WILDCARD)
-    print query.get_description()
-    enquire.set_query(query)
-    return enquire.get_mset(0, 10)
-
-def run(request):
-    pass
-    #print "Begin"
-    #db = xapian.WritableDatabase(xapian.open('test.db',
-    #                                         xapian.DB_CREATE_OR_OPEN))
-    #
-    # index_data(db) ???
-    #del db
-    #mset = run_query(sys.argv[1], db)
-    #print mset.get_matches_estimated()
-    #iterator = mset.begin()
-    #while iterator != mset.end():
-    #    print iterator.get_document().get_data()
-    #    iterator.next()
-    #for i in xrange(1,170):
-    #    doc = db.get_document(i)
-    #    print doc.get_data()
-
-if __name__ == '__main__':
-    run(None) # run() currently ignores its request argument
-
-
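
For orientation, a minimal sketch of how the Index class removed above is
driven. The method names are as defined above; the import path is the
post-split location, and the request, page and query objects are assumed to
exist:

    # hypothetical driver code, not part of this changeset
    from MoinMoin.search.Xapian import Index

    index = Index(request)      # opens the index under cfg.cache_dir/xapian
    if not index.exists():      # sig file 'complete' not written yet
        index.indexPages()      # full build; only from a script context
    index.update_page(page)     # queue one page, indexed in a background thread
    hits = index.search(query)  # query: a xapian/xapwrap query object
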
--- a/MoinMoin/action/fckdialog.py	Tue Jun 27 13:58:39 2006 +0200
+++ b/MoinMoin/action/fckdialog.py	Tue Jun 27 15:09:46 2006 +0200
@@ -165,10 +165,7 @@
     from MoinMoin import search
     name = request.form.get("pagename",[""])[0]
     if name:
-        searchresult = search.searchPages(
-            request,
-            search.QueryParser().parse_query('t:"%s"' % name))
-        
+        searchresult = search.searchPages(request, 't:"%s"' % name)
         pages = [p.page_name for p in searchresult.hits]
     else:
         pages = [name]
@@ -209,9 +206,7 @@
     if name:
         from MoinMoin import search
         # XXX error handling!
-        searchresult = search.searchPages(
-            request,
-            search.QueryParser().parse_query('t:"%s"' % name))
+        searchresult = search.searchPages(request, 't:"%s"' % name)
         
         pages = [p.page_name for p in searchresult.hits]
         pages.sort()
@@ -378,9 +373,7 @@
     if name:
         from MoinMoin import search
         # XXX error handling!
-        searchresult = search.searchPages(
-            request,
-            search.QueryParser().parse_query('t:"%s"' % name))
+        searchresult = search.searchPages(request, 't:"%s"' % name)
         
         pages = [p.page_name for p in searchresult.hits]
         pages.sort()
--- a/MoinMoin/action/fullsearch.py	Tue Jun 27 13:58:39 2006 +0200
+++ b/MoinMoin/action/fullsearch.py	Tue Jun 27 15:09:46 2006 +0200
@@ -52,14 +52,13 @@
                 'of {{{"%s"}}}') % needle
         # send http headers
         request.http_headers()
-        Page(request, pagename).send_page(request, msg=err) 
+        Page(request, pagename).send_page(request, msg=err)
         return
 
     # search the pages
     from MoinMoin import search
-    query = search.QueryParser(case=case, regex=regex,
-                               titlesearch=titlesearch).parse_query(needle)
-    results = search.searchPages(request, query)
+    results = search.searchPages(request, needle, case=case,
+            regex=regex, titlesearch=titlesearch)
 
     # directly show a single hit
     # XXX won't work with attachment search
--- a/MoinMoin/macro/FullSearch.py	Tue Jun 27 13:58:39 2006 +0200
+++ b/MoinMoin/macro/FullSearch.py	Tue Jun 27 15:09:46 2006 +0200
@@ -54,8 +54,7 @@
     needle = needle.strip()
 
     # Search the pages and return the results
-    query = search.QueryParser().parse_query(needle)
-    results = search.searchPages(request, query)
+    results = search.searchPages(request, needle)
     results.sortByPagename()
 
     return results.pageList(request, macro.formatter)
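
All call sites in this changeset now hand the raw needle plus the former
QueryParser keywords straight to search.searchPages(). The new entry point in
MoinMoin/search/__init__.py is not shown in this excerpt, but judging from
these hunks it reduces to a thin wrapper along these lines (a sketch, not the
verbatim new code; Search stands for the engine class presumably living in
MoinMoin/search/builtin.py):

    def searchPages(request, query, **kw):
        """ Search the wiki; query may be a unicode needle or a parsed query. """
        if isinstance(query, str) or isinstance(query, unicode):
            query = QueryParser(**kw).parse_query(query)
        return Search(request, query).run()
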
--- a/MoinMoin/macro/__init__.py	Tue Jun 27 13:58:39 2006 +0200
+++ b/MoinMoin/macro/__init__.py	Tue Jun 27 15:09:46 2006 +0200
@@ -328,8 +328,9 @@
             return '<span class="error">%s</span>' % err
             
         # Return a title search for needle, sorted by name.
-        query = search.QueryParser(literal=literal, titlesearch=1, case=case).parse_query(needle)
-        results = search.searchPages(self.request, query)
+        # XXX: what's with literal?
+        results = search.searchPages(self.request, needle,
+                titlesearch=1, case=case)
         results.sortByPagename()
         return results.pageList(self.request, self.formatter)
         
--- a/MoinMoin/search.py	Tue Jun 27 13:58:39 2006 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,1473 +0,0 @@
-# -*- coding: iso-8859-1 -*-
-"""
-    MoinMoin - search engine
-    
-    @copyright: 2005 MoinMoin:FlorianFesti,
-                2005 MoinMoin:NirSoffer,
-                2005 MoinMoin:AlexanderSchremmer,
-                2006 MoinMoin:ThomasWaldmann,
-                2006 MoinMoin:FranzPletz
-    @license: GNU GPL, see COPYING for details
-"""
-
-import re, time, sys, StringIO, string, operator
-from sets import Set
-from MoinMoin import wikiutil, config
-from MoinMoin.Page import Page
-
-try:
-    import Xapian
-    from Xapian import Query, UnicodeQuery
-except ImportError:
-    pass
-
-#############################################################################
-### query objects
-#############################################################################
-
-class BaseExpression:
-    """ Base class for all search terms """
-    
-    def __init__(self):
-        self.negated = 0
-
-    def __str__(self):
-        return unicode(self).encode(config.charset, 'replace')
-
-    def negate(self):
-        """ Negate the result of this term """
-        self.negated = 1 
-
-    def pageFilter(self):
-        """ Return a page filtering function
-
-        This function is used to filter the page list before we search
-        it. Return a function that gets a page name and returns a bool.
-
-        The default expression does not have any filter function and
-        returns None. Subclasses may define custom filter functions.
-        """
-        return None
-
-    def search(self, page):
-        """ Search a page
-
-        Returns a list of Match objects, or None if the term did not
-        match anything (the other way round if negate() was called).
-        Terms containing other terms must call this method to aggregate
-        the results. This base class returns [Match()] (a true value)
-        if negated, else None.
-        """
-        if self.negated:
-            # XXX why?
-            return [Match()]
-        else:
-            return None
-    
-    def costs(self):
-        """ Return estimated time to calculate this term
-        
-        Number is relative to other terms and has no real unit.
-        It allows to do the fast searches first.
-        """ 
-        return 0
-
-    def highlight_re(self):
-        """ Return a regular expression of what the term searches for
-
-        Used to display the needle in the page.
-        """
-        return ''
-
-    def _build_re(self, pattern, use_re=False, case=False, stemmed=False):
-        """ Make a regular expression out of a text pattern """
-        flags = case and re.U or (re.I | re.U)
-        if use_re:
-            try:
-                self.search_re = re.compile(pattern, flags)
-            except re.error:
-                pattern = re.escape(pattern)
-                self.pattern = pattern
-                self.search_re = re.compile(pattern, flags)
-            else:
-                self.pattern = pattern
-        else:
-            pattern = re.escape(pattern)
-            self.search_re = re.compile(pattern, flags)
-            self.pattern = pattern
-
-
-class AndExpression(BaseExpression):
-    """ A term connecting several sub terms with a logical AND """
-
-    operator = ' '
-
-    def __init__(self, *terms):
-        self._subterms = list(terms)
-        self._costs = 0
-        for t in self._subterms:
-            self._costs += t.costs()
-        self.negated = 0
-
-    def append(self, expression):
-        """ Append another term """
-        self._subterms.append(expression)
-        self._costs += expression.costs()
-
-    def subterms(self):
-        return self._subterms
-    
-    def costs(self):
-        return self._costs
-
-    def __unicode__(self):
-        result = ''
-        for t in self._subterms:
-            result += self.operator + unicode(t)
-        return u'[' + result[len(self.operator):] + u']'
-
-    def pageFilter(self):
-        """ Return a page filtering function
-
-        This function is used to filter the page list before we search it.
-
-        Return a function that gets a page name and returns a bool, or None.
-        """
-        # Sort terms by cost, then get all title searches
-        self.sortByCost()
-        terms = [term for term in self._subterms if isinstance(term, TitleSearch)]
-        if terms:
-            # Create and return a filter function
-            def filter(name):
-                """ A function that return True if all terms filter name """
-                for term in terms:
-                    filter = term.pageFilter()
-                    if not filter(name):
-                        return False
-                return True
-            return filter
-        
-        return None
-
-    def sortByCost(self):
-        tmp = [(term.costs(), term) for term in self._subterms]
-        tmp.sort()
-        self._subterms = [item[1] for item in tmp]
-
-    def search(self, page):
-        """ Search for each term, cheap searches first """
-        self.sortByCost()
-        matches = []
-        for term in self._subterms:
-            result = term.search(page)
-            if not result:
-                return None
-            matches.extend(result)
-        return matches
-
-    def highlight_re(self):
-        result = []
-        for s in self._subterms:
-            highlight_re = s.highlight_re()
-            if highlight_re: result.append(highlight_re)
-            
-        return '|'.join(result)
-
-    def xapian_wanted(self):
-        wanted = True
-        for term in self._subterms:
-            wanted = wanted and term.xapian_wanted()
-        return wanted
-
-    def xapian_term(self, request):
-        # sort negated terms
-        terms = []
-        not_terms = []
-        for term in self._subterms:
-            if not term.negated:
-                terms.append(term.xapian_term(request))
-            else:
-                not_terms.append(term.xapian_term(request))
-
-        # prepare query for not negated terms
-        if len(terms) == 1:
-            t1 = Query(terms[0])
-        else:
-            t1 = Query(Query.OP_AND, terms)
-
-        # negated terms?
-        if not not_terms:
-            # no, just return query for not negated terms
-            return t1
-        
-        # yes, link the negated and non-negated terms with an AND_NOT query
-        if len(not_terms) == 1:
-            t2 = Query(not_terms[0])
-        else:
-            t2 = Query(Query.OP_OR, not_terms)
-
-        return Query(Query.OP_AND_NOT, t1, t2)
-
-
-class OrExpression(AndExpression):
-    """ A term connecting several sub terms with a logical OR """
-    
-    operator = ' or '
-
-    def search(self, page):
-        """ Search page with terms, cheap terms first
-
-        XXX Do we have any reason to sort here? We are not breaking out
-        of the search in any case.
-        """
-        self.sortByCost()
-        matches = []
-        for term in self._subterms:
-            result = term.search(page)
-            if result:
-                matches.extend(result)
-        return matches
-
-    def xapian_term(self, request):
-        # XXX: negated terms managed by _moinSearch?
-        return Query(Query.OP_OR, [term.xapian_term(request) for term in self._subterms])
-
-
-class TextSearch(BaseExpression):
-    """ A term that does a normal text search
-
-    Both page content and the page title are searched, using an
-    additional TitleSearch term.
-    """
-    
-    def __init__(self, pattern, use_re=False, case=False):
-        """ Init a text search
-
-        @param pattern: pattern to search for, ascii string or unicode
-        @param use_re: treat pattern as a regex instead of plain text, bool
-        @param case: do case-sensitive search, bool
-        """
-        self._pattern = unicode(pattern)
-        self.negated = 0
-        self.use_re = use_re
-        self.case = case
-        self._build_re(self._pattern, use_re=use_re, case=case)
-        self.titlesearch = TitleSearch(self._pattern, use_re=use_re, case=case)
-        
-    def costs(self):
-        return 10000
-    
-    def __unicode__(self):
-        neg = self.negated and '-' or ''
-        return u'%s"%s"' % (neg, unicode(self._pattern))
-
-    def highlight_re(self):
-        return u"(%s)" % self._pattern
-
-    def search(self, page):
-        matches = []
-
-        # Search in page name
-        results = self.titlesearch.search(page)
-        if results:
-            matches.extend(results)
-
-        # Search in page body
-        body = page.get_raw_body()
-        for match in self.search_re.finditer(body):
-            if page.request.cfg.xapian_stemming:
-                # skip matches that start in the middle of a word
-                if body[match.start()] not in config.chars_upper and \
-                        body[match.start()-1] in config.chars_lower:
-                    continue
-
-                post = 0
-                for c in body[match.end():]:
-                    if c in config.chars_lower:
-                        post += 1
-                    else:
-                        break
-
-                matches.append(TextMatch(start=match.start(),
-                        end=match.end()+post))
-            else:
-                matches.append(TextMatch(re_match=match))
-
-        # Decide what to do with the results.
-        if ((self.negated and matches) or
-            (not self.negated and not matches)):
-            return None
-        elif matches:
-            return matches
-        else:
-            # XXX why not return None or empty list?
-            return [Match()]
-
-    def xapian_wanted(self):
-        return not self.use_re
-
-    def xapian_term(self, request):
-        if self.use_re:
-            return None # xapian can't do regex search
-        else:
-            analyzer = Xapian.WikiAnalyzer(request=request,
-                    language=request.cfg.language_default)
-            terms = self._pattern.split()
-
-            # all parsed wikiwords, AND'ed
-            queries = []
-            stemmed = []
-            for t in terms:
-                if request.cfg.xapian_stemming:
-                    # stemmed OR not stemmed
-                    tmp = []
-                    for i in analyzer.tokenize(t, flat_stemming=False):
-                        tmp.append(UnicodeQuery(Query.OP_OR, i))
-                        stemmed.append(i[1])
-                    t = tmp
-                else:
-                    # just not stemmed
-                    t = [UnicodeQuery(i) for i in analyzer.tokenize(t)]
-                queries.append(Query(Query.OP_AND, t))
-
-            if stemmed:
-                self._build_re(' '.join(stemmed), use_re=False,
-                        case=self.case, stemmed=True)
-
-            # titlesearch OR parsed wikiwords
-            return Query(Query.OP_OR,
-                    (self.titlesearch.xapian_term(request),
-                        Query(Query.OP_AND, queries)))
-
-
-class TitleSearch(BaseExpression):
-    """ Term searches in pattern in page title only """
-
-    def __init__(self, pattern, use_re=False, case=False):
-        """ Init a title search
-
-        @param pattern: pattern to search for, ascii string or unicode
-        @param use_re: treat pattern as a regex instead of plain text, bool
-        @param case: do case-sensitive search, bool
-        """
-        self._pattern = unicode(pattern)
-        self.negated = 0
-        self.use_re = use_re
-        self.case = case
-        self._build_re(self._pattern, use_re=use_re, case=case)
-        
-    def costs(self):
-        return 100
-
-    def __unicode__(self):
-        neg = self.negated and '-' or ''
-        return u'%s!"%s"' % (neg, unicode(self._pattern))
-
-    def highlight_re(self):
-        return u"(%s)" % self._pattern
-
-    def pageFilter(self):
-        """ Page filter function for single title search """
-        def filter(name):
-            match = self.search_re.search(name)
-            if ((self.negated and match) or
-                (not self.negated and not match)):
-                return False
-            return True
-        return filter
-            
-    def search(self, page):
-        # Get matches in page name
-        matches = []
-        for match in self.search_re.finditer(page.page_name):
-            if page.request.cfg.xapian_stemming:
-                # skip matches that start in the middle of a word
-                if page.page_name[match.start()] not in config.chars_upper and \
-                        page.page_name[match.start()-1] in config.chars_lower:
-                    continue
-
-                post = 0
-                for c in page.page_name[match.end():]:
-                    if c in config.chars_lower:
-                        post += 1
-                    else:
-                        break
-
-                matches.append(TitleMatch(start=match.start(),
-                        end=match.end()+post))
-            else:
-                matches.append(TitleMatch(re_match=match))
-        
-        if ((self.negated and matches) or
-            (not self.negated and not matches)):
-            return None
-        elif matches:
-            return matches
-        else:
-            # XXX why not return None or empty list?
-            return [Match()]
-
-    def xapian_wanted(self):
-        return not self.use_re
-
-    def xapian_term(self, request):
-        if self.use_re:
-            return None # xapian doesn't support regex search
-        else:
-            analyzer = Xapian.WikiAnalyzer(request=request,
-                    language=request.cfg.language_default)
-            terms = self._pattern.split()
-            terms = [list(analyzer.raw_tokenize(t)) for t in terms]
-
-            # all parsed wikiwords, AND'ed
-            queries = []
-            stemmed = []
-            for t in terms:
-                if request.cfg.xapian_stemming:
-                    # stemmed OR not stemmed
-                    tmp = []
-                    for i in analyzer.tokenize(t, flat_stemming=False):
-                        tmp.append(UnicodeQuery(Query.OP_OR, ['%s%s' %
-                            (Xapian.Index.prefixMap['title'], j) for j in i]))
-                        stemmed.append(i[1])
-                    t = tmp
-                else:
-                    # just not stemmed
-                    t = [UnicodeQuery('%s%s' % (Xapian.Index.prefixMap['title'], i))
-                        for i in analyzer.tokenize(t)]
-
-                queries.append(Query(Query.OP_AND, t))
-
-            if stemmed:
-                self._build_re(' '.join(stemmed), use_re=False,
-                        case=self.case, stemmed=True)
-
-            return Query(Query.OP_AND, queries)
-
-
-class LinkSearch(BaseExpression):
-    """ Search the term in the pagelinks """
-
-    def __init__(self, pattern, use_re=False, case=True):
-        """ Init a link search
-
-        @param pattern: pattern to search for, ascii string or unicode
-        @param use_re: treat pattern as a regex instead of plain text, bool
-        @param case: do case-sensitive search, bool
-        """
-        # used for search in links
-        self._pattern = pattern
-        # used for search in text
-        self._textpattern = '(' + self._pattern.replace('/', '|') + ')'
-        self.negated = 0
-        self.use_re = use_re
-        self.case = case
-        self.textsearch = TextSearch(self._textpattern, use_re=1, case=case)
-        self._build_re(unicode(pattern), use_re=use_re, case=case)
-
-    def _build_re(self, pattern, use_re=False, case=False):
-        """ Make a regular expression out of a text pattern """
-        flags = case and re.U or (re.I | re.U)
-        try:
-            if not use_re:
-                # plain-text patterns are compared statically, not compiled
-                raise re.error
-            self.search_re = re.compile(pattern, flags)
-            self.static = False
-        except re.error:
-            self.pattern = pattern
-            self.static = True
-        
-    def costs(self):
-        return 5000 # cheaper than a TextSearch
-
-    def __unicode__(self):
-        neg = self.negated and '-' or ''
-        return u'%s!"%s"' % (neg, unicode(self._pattern))
-
-    def highlight_re(self):
-        return u"(%s)" % self._textpattern
-
-    def search(self, page):
-        # Search in the page links
-        matches = []
-
-        found = True
-
-        for link in page.getPageLinks(page.request):
-            if ((self.static and self.pattern == link) or
-                (not self.static and self.search_re.match(link))):
-                break
-        else:
-            found = False
-
-        if found:
-            # Search in page text
-            results = self.textsearch.search(page)
-            if results:
-                matches.extend(results)
-            else: #This happens e.g. for pages that use navigation macros
-                matches.append(TextMatch(0, 0))
-
-        # Decide what to do with the results.
-        if ((self.negated and matches) or
-            (not self.negated and not matches)):
-            return None
-        elif matches:
-            return matches
-        else:
-            # XXX why not return None or empty list?
-            return [Match()]
-
-    def xapian_wanted(self):
-        return not self.use_re
-
-    def xapian_term(self, request):
-        if self.use_re:
-            return None # xapian doesn't support regex search
-        else:
-            # self.pattern is only set for static (non-regex) patterns
-            return UnicodeQuery('%s:%s' %
-                    (Xapian.Index.prefixMap['linkto'], self.pattern))
-
-
-class LanguageSearch(BaseExpression):
-    """ Search the pages written in a language """
-
-    def __init__(self, pattern, use_re=False, case=True):
-        """ Init a language search
-
-        @param pattern: pattern to search for, ascii string or unicode
-        @param use_re: treat pattern as a regex instead of plain text, bool
-        @param case: do case-sensitive search, bool
-        """
-        # iso language code, always lowercase
-        self._pattern = pattern.lower()
-        self.negated = 0
-        self.use_re = use_re
-        self.case = case
-        self.xapian_called = False
-        self._build_re(self._pattern, use_re=use_re, case=case)
-
-    def costs(self):
-        return 5000 # cheaper than a TextSearch
-
-    def __unicode__(self):
-        neg = self.negated and '-' or ''
-        return u'%s!"%s"' % (neg, unicode(self._pattern))
-
-    def highlight_re(self):
-        return ""
-
-    def search(self, page):
-        # We just use (and trust ;)) xapian for this; deactivated for _moinSearch.
-        if not self.xapian_called:
-            return None
-        else:
-            # XXX why not return None or empty list?
-            return [Match()]
-
-    def xapian_wanted(self):
-        return not self.use_re
-
-    def xapian_term(self, request):
-        pattern = self.pattern
-        if self.use_re:
-            return None # xapian doesn't support regex search
-        else:
-            self.xapian_called = True
-            return UnicodeQuery('%s%s' %
-                    (Xapian.Index.prefixMap['lang'], pattern))
-
-
-############################################################################
-### Results
-############################################################################
-
-class Match(object):
-    """ Base class for all Matches (found pieces of pages).
-    
-    This class represents an empty true value as returned from negated searches.
-    """
-    # Default match weight
-    _weight = 1.0
-    
-    def __init__(self, start=0, end=0, re_match=None):
-        self.re_match = re_match
-        if not re_match:
-            self._start = start
-            self._end = end
-        else:
-            self._start = self._end = 0
-
-    def __len__(self):
-        return self.end - self.start
-
-    def __eq__(self, other):
-        equal = (self.__class__ == other.__class__ and
-                 self.start == other.start and
-                 self.end == other.end)
-        return equal
-        
-    def __ne__(self, other):
-        return not self.__eq__(other)
-
-    def view(self):
-        return ''
-
-    def weight(self):
-        return self._weight
-
-    def _get_start(self):
-        if self.re_match:
-            return self.re_match.start()
-        return self._start
-
-    def _get_end(self):
-        if self.re_match:
-            return self.re_match.end()
-        return self._end
-
-    # object properties
-    start = property(_get_start)
-    end   = property(_get_end)
-
-
-class TextMatch(Match):
-    """ Represents a match in the page content """
-    pass
-
-
-class TitleMatch(Match):
-    """ Represents a match in the page title
-    
-    Has more weight than a match in the page content.
-    """
-    # Matches in titles are much more important in wikis. This setting
-    # seems to make all pages that have title matches appear before
-    # pages whose titles do not match.
-    _weight = 100.0
-
-
-class AttachmentMatch(Match):
-    """ Represents a match in a attachment content
-
-    Not used yet.
-    """
-    pass
-
-
-class FoundPage:
-    """ Represents a page in a search result """
-
-    def __init__(self, page_name, matches=None, page=None):
-        self.page_name = page_name
-        self.attachment = '' # this is not an attachment
-        self.page = page
-        if matches is None:
-            matches = []
-        self._matches = matches
-
-    def weight(self, unique=1):
-        """ returns how important this page is for the terms searched for
-
-        Summarize the weight of all page matches
-
-        @param unique: ignore identical matches
-        @rtype: int
-        @return: page weight
-        """
-        weight = 0
-        for match in self.get_matches(unique=unique):
-            weight += match.weight()
-            # More sophisticated things to be added, like increase
-            # weight of near matches.
-        return weight
-
-    def add_matches(self, matches):
-        """ Add found matches """
-        self._matches.extend(matches)
-
-    def get_matches(self, unique=1, sort='start', type=Match):
-        """ Return all matches of type sorted by sort
-
-        @param unique: return only unique matches (bool)
-        @param sort: match attribute to sort by (string)
-        @param type: type of match to return (Match or sub class) 
-        @rtype: list
-        @return: list of matches
-        """
-        if unique:
-            matches = self._unique_matches(type=type)
-            if sort == 'start':
-                # matches already sorted by match.start, finished.
-                return matches
-        else:
-            matches = self._matches
-
-        # Filter by type and sort by sort using fast schwartzian
-        # transform.
-        if sort == 'start':
-            tmp = [(match.start, match) for match in matches
-                   if isinstance(match, type)]
-        else:
-            tmp = [(match.weight(), match) for match in matches
-                   if isinstance(match, type)]
-        tmp.sort()
-        if sort == 'weight':
-            tmp.reverse()
-        matches = [item[1] for item in tmp]
-        
-        return matches
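The decorate-sort-undecorate idiom used above, shown standalone on dummy
data (illustrative only):

    items = [(u'PageB', 2.0), (u'PageA', 1.0), (u'PageC', 3.0)]
    tmp = [(weight, name) for name, weight in items]
    tmp.sort()
    tmp.reverse()                     # sorting by weight is descending
    names = [item[1] for item in tmp]
    assert names == [u'PageC', u'PageB', u'PageA']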
-
-    def _unique_matches(self, type=Match):
-        """ Get a list of unique matches of type
-
-        The result is sorted by match.start, because it's easy to remove
-        duplicates like this.
-
-        @param type: type of match to return
-        @rtype: list
-        @return: list of matches of type, sorted by match.start
-        """
-        # Filter by type and sort by match.start using fast schwartzian
-        # transform.
-        tmp = [(match.start, match) for match in self._matches
-               if isinstance(match, type)]
-        tmp.sort()
-
-        if not len(tmp):
-            return []
-
-        # Get first match into matches list
-        matches = [tmp[0][1]]
-
-        # Add the remaining ones of matches ignoring identical matches
-        for item in tmp[1:]:
-            if item[1] == matches[-1]:
-                continue
-            matches.append(item[1])
-
-        return matches
-    
-
-class FoundAttachment(FoundPage):
-    """ Represent an attachment in search results """
-    
-    def __init__(self, page_name, attachment, matches=None, page=None):
-        self.page_name = page_name
-        self.attachment = attachment
-        self.page = page
-        if matches is None:
-            matches = []
-        self._matches = matches
-
-    def weight(self, unique=1):
-        return 1
-
-    def get_matches(self, unique=1, sort='start', type=Match):
-        return []
-
-    def _unique_matches(self, type=Match):
-        return []
-
-
-class FoundRemote(FoundPage):
-    """ Represent an attachment in search results """
-    
-    def __init__(self, wikiname, page_name, attachment, matches=None, page=None):
-        self.wikiname = wikiname
-        self.page_name = page_name
-        self.attachment = attachment
-        self.page = page
-        if matches is None:
-            matches = []
-        self._matches = matches
-
-    def weight(self, unique=1):
-        return 1
-
-    def get_matches(self, unique=1, sort='start', type=Match):
-        return []
-
-    def _unique_matches(self, type=Match):
-        return []
-
-
-##############################################################################
-### Parse Query
-##############################################################################
-
-
-class QueryParser:
-    """
-    Converts a String into a tree of Query objects
-    using recursive top/down parsing
-    """
-
-    def __init__(self, **kw):
-        """
-        @keyword titlesearch: treat all terms as title searches
-        @keyword case: do case sensitive search
-        @keyword regex: treat all terms as regular expressions
-        """
-        self.titlesearch = kw.get('titlesearch', 0)
-        self.case = kw.get('case', 0)
-        self.regex = kw.get('regex', 0)
-
-    def parse_query(self, query):
-        """ transform an string into a tree of Query objects """
-        if isinstance(query, str):
-            query = query.decode(config.charset)
-        self._query = query
-        result = self._or_expression()
-        if result is None:
-            result = BaseExpression()
-        return result
-
-    def _or_expression(self):
-        result = self._and_expression()
-        if self._query:
-            result = OrExpression(result)
-        while self._query:
-            q = self._and_expression()
-            if q:
-                result.append(q)
-        return result
-            
-    def _and_expression(self):
-        result = None
-        while not result and self._query:
-            result = self._single_term()
-        term = self._single_term()
-        if term:
-            result = AndExpression(result, term)
-        else:
-            return result
-        term = self._single_term()
-        while term:
-            result.append(term)
-            term = self._single_term()
-        return result
-                                
-    def _single_term(self):
-        regex = (r'(?P<NEG>-?)\s*(' +              # leading '-'
-                 r'(?P<OPS>\(|\)|(or\b(?!$)))|' +  # or, (, )
-                 r'(?P<MOD>(\w+:)*)' +
-                 r'(?P<TERM>("[^"]+")|' +
-                 r"('[^']+')|(\S+)))")             # search word itself
-        self._query = self._query.strip()
-        match = re.match(regex, self._query, re.U)
-        if not match:
-            return None
-        self._query = self._query[match.end():]
-        ops = match.group("OPS")
-        if ops == '(':
-            result = self._or_expression()
-            if match.group("NEG"): result.negate()
-            return result
-        elif ops == ')':
-            return None
-        elif ops == 'or':
-            return None
-        modifiers = match.group('MOD').split(":")[:-1]
-        text = match.group('TERM')
-        if self.isQuoted(text):
-            text = text[1:-1]
-
-        title_search = self.titlesearch
-        regex = self.regex
-        case = self.case
-        linkto = False
-        lang = False
-
-        for m in modifiers:
-            if "title".startswith(m):
-                title_search = True
-            elif "regex".startswith(m):
-                regex = True
-            elif "case".startswith(m):
-                case = True
-            elif "linkto".startswith(m):
-                linkto = True
-            elif "language".startswith(m):
-                lang = True
-
-        if lang:
-            obj = LanguageSearch(text, use_re=regex, case=False)
-        elif linkto:
-            obj = LinkSearch(text, use_re=regex, case=case)
-        elif title_search:
-            obj = TitleSearch(text, use_re=regex, case=case)
-        else:
-            obj = TextSearch(text, use_re=regex, case=case)
-
-        if match.group("NEG"):
-            obj.negate()
-        return obj
-
-    def isQuoted(self, text):
-        # Empty string '' is not considered quoted
-        if len(text) < 3:
-            return False
-        return (text.startswith('"') and text.endswith('"') or
-                text.startswith("'") and text.endswith("'"))
-
-
-############################################################################
-### Search results formatting
-############################################################################
-
-class SearchResults:
-    """ Manage search results, supply different views
-
-    Search results can hold valid search results and format them for
-    many requests, until the wiki content changes.
-
-    For example, one might ask for full page list sorted from A to Z,
-    and then ask for the same list sorted from Z to A. Or sort results
-    by name and then by rank.
-    """
-    # Public functions --------------------------------------------------
-    
-    def __init__(self, query, hits, pages, elapsed):
-        self.query = query # the query
-        self.hits = hits # hits list
-        self.sort = None # hits are unsorted initially
-        self.pages = pages # number of pages in the wiki
-        self.elapsed = elapsed # search time
-
-    def sortByWeight(self):
-        """ Sorts found pages by the weight of the matches """
-        tmp = [(hit.weight(), hit.page_name, hit) for hit in self.hits]
-        tmp.sort()
-        tmp.reverse()
-        self.hits = [item[2] for item in tmp]
-        self.sort = 'weight'
-        
-    def sortByPagename(self):
-        """ Sorts a list of found pages alphabetical by page name """
-        tmp = [(hit.page_name, hit) for hit in self.hits]
-        tmp.sort()
-        self.hits = [item[1] for item in tmp]
-        self.sort = 'page_name'
-        
-    def stats(self, request, formatter):
-        """ Return search statistics, formatted with formatter
-
-        @param request: current request
-        @param formatter: formatter to use
-        @rtype: unicode
-        @return: formatted statistics
-        """
-        _ = request.getText
-        output = [
-            formatter.paragraph(1),
-            formatter.text(_("%(hits)d results out of about %(pages)d pages.") %
-                   {'hits': len(self.hits), 'pages': self.pages}),
-            u' (%s)' % formatter.text(_("%.2f seconds") % self.elapsed),
-            formatter.paragraph(0),
-            ]
-        return ''.join(output)
-
-    def pageList(self, request, formatter, info=0, numbered=1):
-        """ Format a list of found pages
-
-        @param request: current request
-        @param formatter: formatter to use
-        @param info: show match info in title
-        @param numbered: use numbered list for display
-        @rtype: unicode
-        @return: formatted page list
-        """
-        self._reset(request, formatter)
-        f = formatter
-        write = self.buffer.write
-        if numbered:
-            list = f.number_list
-        else:
-            list = f.bullet_list
-
-        # Add pages formatted as list
-        if self.hits:
-            write(list(1))
-
-            for page in self.hits:
-                if page.attachment:
-                    querydict = {
-                        'action': 'AttachFile',
-                        'do': 'get',
-                        'target': page.attachment,
-                    }
-                else:
-                    querydict = None
-                querystr = self.querystring(querydict)
-            
-                matchInfo = ''
-                if info:
-                    matchInfo = self.formatInfo(f, page)
-                item = [
-                    f.listitem(1),
-                    f.pagelink(1, page.page_name, querystr=querystr),
-                    self.formatTitle(page),
-                    f.pagelink(0, page.page_name),
-                    matchInfo,
-                    f.listitem(0),
-                    ]
-                write(''.join(item))
-            write(list(0))
-
-        return self.getvalue()
-
-    def pageListWithContext(self, request, formatter, info=1, context=180,
-                            maxlines=1):
-        """ Format a list of found pages with context
-
-        The default parameter values will create Google-like search
-        results, as this is the best-known search interface. A good
-        interface is a familiar interface, so unless we have a much
-        better solution (we don't), being like Google is the way to go.
-
-        @param request: current request
-        @param formatter: formatter to use
-        @param info: show match info near the page link
-        @param context: how many characters to show around each match. 
-        @param maxlines: how many contexts lines to show. 
-        @rtype: unicode
-        @return: formatted page list with context
-        """
-        self._reset(request, formatter)
-        f = formatter
-        write = self.buffer.write
-        
-        # Add pages formatted as definition list
-        if self.hits:
-            write(f.definition_list(1))       
-
-            for page in self.hits:
-                matchInfo = ''
-                if info:
-                    matchInfo = self.formatInfo(f, page)
-                if page.attachment:
-                    fmt_context = ""
-                    querydict = {
-                        'action': 'AttachFile',
-                        'do': 'get',
-                        'target': page.attachment,
-                    }
-                elif page.page_name.startswith('FS/'): # XXX FS hardcoded
-                    fmt_context = ""
-                    querydict = None
-                else:
-                    fmt_context = self.formatContext(page, context, maxlines)
-                    querydict = None
-                querystr = self.querystring(querydict)
-                item = [
-                    f.definition_term(1),
-                    f.pagelink(1, page.page_name, querystr=querystr),
-                    self.formatTitle(page),
-                    f.pagelink(0, page.page_name),
-                    matchInfo,
-                    f.definition_term(0),
-                    f.definition_desc(1),
-                    fmt_context,
-                    f.definition_desc(0),
-                    ]
-                write(''.join(item))
-            write(f.definition_list(0))
-        
-        return self.getvalue()
-
-    # Private -----------------------------------------------------------
-
-    # These methods are not meant to be used by clients and may change
-    # without notice.
-    
-    def formatContext(self, page, context, maxlines):
-        """ Format search context for each matched page
-
-        Try to show first maxlines interesting matches context.
-        """
-        f = self.formatter
-        if not page.page:
-            page.page = Page(self.request, page.page_name)
-        body = page.page.get_raw_body()
-        last = len(body) - 1
-        lineCount = 0
-        output = []
-        
-        # Get unique text matches sorted by match.start, try to ignore
-        # matches in page header, and show the first maxlines matches.
-        # TODO: when we implement weight algorithm for text matches, we
-        # should get the list of text matches sorted by weight and show
-        # the first maxlines matches.
-        matches = page.get_matches(unique=1, sort='start', type=TextMatch)
-        i, start = self.firstInterestingMatch(page, matches)            
-
-        # Format context
-        while i < len(matches) and lineCount < maxlines:
-            match = matches[i]
-            
-            # Get context range for this match
-            start, end = self.contextRange(context, match, start, last)
-
-            # Format context lines for matches. Each complete match in
-            # the context will be highlighted, and if the full match is
-            # in the context, we increase the index, and will not show
-            # same match again on a separate line.
-
-            output.append(f.text(u'...'))
-            
-            # Get the index of the first match completely within the
-            # context.
-            for j in xrange(0, len(matches)):
-                if matches[j].start >= start:
-                    break
-
-            # Add all matches in context and the text between them 
-            while True:
-                match = matches[j]
-                # Ignore matches behind the current position
-                if start < match.end:
-                    # Append the text before match
-                    if start < match.start:
-                        output.append(f.text(body[start:match.start]))
-                    # And the match
-                    output.append(self.formatMatch(body, match, start))
-                    start = match.end
-                # Get the next match, but only if it's completely within the context
-                if j < len(matches) - 1 and matches[j + 1].end <= end:
-                    j += 1
-                else:
-                    break
-
-            # Add text after last match and finish the line
-            if match.end < end:
-                output.append(f.text(body[match.end:end]))
-            output.append(f.text(u'...'))
-            output.append(f.linebreak(preformatted=0))
-
-            # Increase line and point to the next match
-            lineCount += 1
-            i = j + 1
-
-        output = ''.join(output)
-
-        if not output:
-            # Return the first context characters from the page text
-            output = f.text(page.page.getPageText(length=context))
-            output = output.strip()
-            if not output:
-                # This is a page with no text, only header, for example,
-                # a redirect page.
-                output = f.text(page.page.getPageHeader(length=context))
-        
-        return output
-        
-    def firstInterestingMatch(self, page, matches):
-        """ Return the first interesting match
-
-        This function is needed only because we don't have yet a weight
-        algorithm for page text matches.
-        
-        Try to find the first match in the page text. If we can't find
-        one, we return the first match and start=0.
-
-        @rtype: tuple
-        @return: index of first match, start of text
-        """
-        header = page.page.getPageHeader()
-        start = len(header)
-        # Find first match after start
-        for i in xrange(len(matches)):
-            if matches[i].start >= start:
-                return i, start
-        return 0, 0
-
-    def contextRange(self, context, match, start, last):
-        """ Compute context range
-
-        Add context around each match. If there is no room for context
-        before or after the match, show more context on the other side.
-
-        @param context: context length
-        @param match: current match
-        @param start: context should not start before that index, unless
-                      end is past the last character.
-        @param last: last character index
-        @rtype: tuple
-        @return: start, end of context
-        """
-        # Start by giving equal context on both sides of match
-        contextlen = max(context - len(match), 0)
-        cstart = match.start - contextlen / 2
-        cend = match.end + contextlen / 2
-
-        # If the context starts before start, give more context at the end
-        if cstart < start:
-            cend += start - cstart
-            cstart = start
-            
-        # But if the end is after last, give context back to the start
-        if cend > last:
-            cstart -= cend - last
-            cend = last
-
-        # Keep context start positive for very short texts
-        cstart = max(cstart, 0)
-
-        return cstart, cend
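A worked example of the arithmetic (illustrative): with context=30 and a
match covering [50, 60), ten characters of context appear on each side:

    contextlen = max(30 - 10, 0)      # 20
    cstart = 50 - contextlen / 2      # 40
    cend = 60 + contextlen / 2        # 70
    # if the page header ends at index 45, the window shifts right to
    # [45, 75) -- same total width, but no header text is shown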
-
-    def formatTitle(self, page):
-        """ Format page title
-
-        Invoke format match on all unique matches in page title.
-
-        @param page: found page
-        @rtype: unicode
-        @return: formatted title
-        """
-        # Get unique title matches sorted by match.start
-        matches = page.get_matches(unique=1, sort='start', type=TitleMatch)
-        
-        # Format
-        pagename = page.page_name
-        f = self.formatter
-        output = []
-        start = 0
-        for match in matches:
-            # Ignore matches behind the current position
-            if start < match.end:
-                # Append the text before the match
-                if start < match.start:
-                    output.append(f.text(pagename[start:match.start]))
-                # And the match
-                output.append(self.formatMatch(pagename, match, start))
-                start = match.end
-        # Add text after match
-        if start < len(pagename):
-            output.append(f.text(pagename[start:]))
-        
-        if page.attachment: # show the attachment that matched
-            output.extend([
-                    " ",
-                    f.strong(1),
-                    f.text("(%s)" % page.attachment),
-                    f.strong(0)])
-
-        return ''.join(output)
-
-    def formatMatch(self, body, match, location):
-        """ Format single match in text
-
-        Format the part of the match after the current location in the
-        text. Matches behind location are ignored and an empty string is
-        returned.
-
-        @param body: text containing match
-        @param match: search match in text
-        @param location: current location in text
-        @rtype: unicode
-        @return: formatted match or empty string
-        """        
-        start = max(location, match.start)
-        if start < match.end:
-            f = self.formatter
-            output = [
-                f.strong(1),
-                f.text(body[start:match.end]),
-                f.strong(0),
-                ]
-            return ''.join(output)
-        return ''
-
-    def querystring(self, querydict=None):
-        """ Return query string, used in the page link """
-        if querydict is None:
-            querydict = {'highlight': self.query.highlight_re()}
-        querystr = wikiutil.makeQueryString(querydict)
-        #querystr = wikiutil.escape(querystr)
-        return querystr
-
-    def formatInfo(self, formatter, page):
-        """ Return formatted match info """
-        template = u' . . . %s %s'
-        template = u"%s%s%s" % (formatter.span(1, css_class="info"),
-                                template,
-                                formatter.span(0))
-        # Count number of unique matches in text of all types
-        count = len(page.get_matches(unique=1))
-        info = template % (count, self.matchLabel[count != 1])
-        return info
-
-    def getvalue(self):
-        """ Return output in div with CSS class """
-        write = self.request.write
-        value = [
-            self.formatter.div(1, css_class='searchresults'),
-            self.buffer.getvalue(),
-            self.formatter.div(0),
-            ]
-        return '\n'.join(value)
-
-    def _reset(self, request, formatter):
-        """ Update internal state before new output
-
-        Do not call this, it should be called only by the instance code.
-
-        Each request might need different translations or other user
-        preferences.
-        """
-        self.buffer = StringIO.StringIO()
-        self.formatter = formatter
-        self.request = request
-        # Use 1 match, 2 matches...
-        _ = request.getText    
-        self.matchLabel = (_('match'), _('matches'))
-
-
-##############################################################################
-### Searching
-##############################################################################
-
-class Search:
-    """ A search run """
-    
-    def __init__(self, request, query):
-        self.request = request
-        self.query = query
-        self.filtered = False
-        self.fs_rootpage = "FS" # XXX FS hardcoded
-
-    def run(self):
-        """ Perform search and return results object """
-        start = time.time()
-        if self.request.cfg.xapian_search:
-            hits = self._xapianSearch()
-        else:
-            hits = self._moinSearch()
-            
-        # important - filter deleted pages or pages the user may not read!
-        if not self.filtered:
-            hits = self._filter(hits)
-        
-        result_hits = []
-        for wikiname, page, attachment, match in hits:
-            if wikiname in (self.request.cfg.interwikiname, 'Self'): # a local match
-                if attachment:
-                    result_hits.append(FoundAttachment(page.page_name, attachment))
-                else:
-                    result_hits.append(FoundPage(page.page_name, match))
-            else:
-                result_hits.append(FoundRemote(wikiname, page, attachment, match))
-        elapsed = time.time() - start
-        count = self.request.rootpage.getPageCount()
-        return SearchResults(self.query, result_hits, count, elapsed)
-
-    # ----------------------------------------------------------------
-    # Private!
-
-    def _xapianSearch(self):
-        """ Search using Xapian
-        
-        Get a list of pages using fast xapian search and
-        return moin search in those pages.
-        """
-        pages = None
-        try:
-            index = Xapian.Index(self.request)
-        except NameError:
-            index = None
-        if index and index.exists() and self.query.xapian_wanted():
-            self.request.clock.start('_xapianSearch')
-            try:
-                from MoinMoin.support import xapwrap
-                query = self.query.xapian_term(self.request)
-                self.request.log("xapianSearch: query = %r" %
-                        query.get_description())
-                query = xapwrap.index.QObjQuery(query)
-                hits = index.search(query)
-                self.request.log("xapianSearch: finds: %r" % hits)
-                def dict_decode(d):
-                    """ decode dict values to unicode """
-                    for k, v in d.items():
-                        d[k] = d[k].decode(config.charset)
-                    return d
-                pages = [dict_decode(hit['values']) for hit in hits]
-                self.request.log("xapianSearch: finds pages: %r" % pages)
-            except index.LockedException:
-                pass
-            self.request.clock.stop('_xapianSearch')
-        return self._moinSearch(pages)
-
-    def _moinSearch(self, pages=None):
-        """ Search pages using moin's built-in full text search 
-        
-        Return list of tuples (page, match). The list may contain
-        deleted pages or pages the user may not read.
-        """
-        self.request.clock.start('_moinSearch')
-        from MoinMoin.Page import Page
-        if pages is None:
-            # if we are not called from _xapianSearch, we make a full pagelist,
-            # but don't search attachments (thus attachment name = '')
-            pages = [{'pagename': p, 'attachment': '', 'wikiname': 'Self', } for p in self._getPageList()]
-        hits = []
-        fs_rootpage = self.fs_rootpage
-        for valuedict in pages:
-            wikiname = valuedict['wikiname']
-            pagename = valuedict['pagename']
-            attachment = valuedict['attachment']
-            if wikiname in (self.request.cfg.interwikiname, 'Self'): # THIS wiki
-                page = Page(self.request, pagename)
-                if attachment:
-                    if pagename == fs_rootpage: # not really an attachment
-                        page = Page(self.request, "%s/%s" % (fs_rootpage, attachment))
-                        hits.append((wikiname, page, None, None))
-                    else:
-                        hits.append((wikiname, page, attachment, None))
-                else:
-                    match = self.query.search(page)
-                    if match:
-                        hits.append((wikiname, page, attachment, match))
-            else: # other wiki
-                hits.append((wikiname, pagename, attachment, None))
-        self.request.clock.stop('_moinSearch')
-        return hits
-
-    def _getPageList(self):
-        """ Get list of pages to search in 
-        
-        If the query has a page filter, use it to filter pages before
-        searching. If not, get a unfiltered page list. The filtering
-        will happen later on the hits, which is faster with current
-        slow storage.
-        """
-        filter = self.query.pageFilter()
-        if filter:
-            # There is no need to filter the results again.
-            self.filtered = True
-            return self.request.rootpage.getPageList(filter=filter)
-        else:
-            return self.request.rootpage.getPageList(user='', exists=0)
-        
-    def _filter(self, hits):
-        """ Filter out deleted or acl protected pages """
-        userMayRead = self.request.user.may.read
-        fs_rootpage = self.fs_rootpage + "/"
-        thiswiki = (self.request.cfg.interwikiname, 'Self')
-        filtered = [(wikiname, page, attachment, match) for wikiname, page, attachment, match in hits
-                    if not wikiname in thiswiki or
-                       page.exists() and userMayRead(page.page_name) or
-                       page.page_name.startswith(fs_rootpage)]    
-        return filtered
-        
-        
-def searchPages(request, query, **kw):
-    """ Search the text of all pages for query.
-    
-    @param request: current request
-    @param query: the expression we want to search for
-    @rtype: SearchResults instance
-    @return: search results
-    """
-    return Search(request, query).run()
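Tying the pieces together, a hedged sketch of a full search run (request
is assumed to be a live MoinMoin request object):

    query = QueryParser(titlesearch=0, regex=0).parse_query(u'FrontPage')
    results = searchPages(request, query)
    results.sortByWeight()
    request.write(results.stats(request, request.formatter))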
-
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/MoinMoin/search/Xapian.py	Tue Jun 27 15:09:46 2006 +0200
@@ -0,0 +1,771 @@
+# -*- coding: iso-8859-1 -*-
+"""
+    MoinMoin - xapian indexing search engine
+
+    @copyright: 2006 MoinMoin:ThomasWaldmann,
+                2006 MoinMoin:FranzPletz
+    @license: GNU GPL, see COPYING for details.
+"""
+debug = True
+
+import sys, os, re, codecs, errno, time
+from pprint import pprint
+
+import xapian
+from xapian import Query
+from MoinMoin.support.xapwrap import document as xapdoc
+from MoinMoin.support.xapwrap import index as xapidx
+from MoinMoin.parser.text_moin_wiki import Parser as WikiParser
+
+from MoinMoin.Page import Page
+from MoinMoin import config, wikiutil
+from MoinMoin.util import filesys, lock
+
+try:
+    # PyStemmer, snowball python bindings from http://snowball.tartarus.org/
+    from Stemmer import Stemmer
+except ImportError:
+    Stemmer = None
+
+class UnicodeQuery(xapian.Query):
+    def __init__(self, *args, **kwargs):
+        self.encoding = kwargs.get('encoding', config.charset)
+
+        nargs = []
+        for term in args:
+            if isinstance(term, unicode):
+                term = term.encode(self.encoding)
+            elif isinstance(term, list) or isinstance(term, tuple):
+                term = [t.encode(self.encoding) for t in term]
+            nargs.append(term)
+
+        xapian.Query.__init__(self, *nargs, **kwargs)
+
+
+##############################################################################
+### Tokenizer
+##############################################################################
+
+def getWikiAnalyzerFactory(request=None, language='en'):
+    return (lambda: WikiAnalyzer(request, language))
+
+class WikiAnalyzer:
+    singleword = r"[%(u)s][%(l)s]+" % {
+                     'u': config.chars_upper,
+                     'l': config.chars_lower,
+                 }
+
+    singleword_re = re.compile(singleword, re.U)
+    wikiword_re = re.compile(WikiParser.word_rule, re.U)
+
+    token_re = re.compile(
+        r"(?P<company>\w+[&@]\w+)|" + # company names like AT&T and Excite@Home.
+        r"(?P<email>\w+([.-]\w+)*@\w+([.-]\w+)*)|" +    # email addresses
+        r"(?P<hostname>\w+(\.\w+)+)|" +                 # hostnames
+        r"(?P<num>(\w+[-/.,])*\w*\d\w*([-/.,]\w+)*)|" + # version numbers
+        r"(?P<acronym>(\w\.)+)|" +          # acronyms: U.S.A., I.B.M., etc.
+        r"(?P<word>\w+)",                   # words (including WikiWords)
+        re.U)
+
+    dot_re = re.compile(r"[-_/,.]")
+    mail_re = re.compile(r"[-_/,.]|(@)")
+    
+    # XXX limit stuff above to xapdoc.MAX_KEY_LEN
+    # WORD_RE = re.compile('\\w{1,%i}' % MAX_KEY_LEN, re.U)
+
+    def __init__(self, request=None, language=None):
+        if request and request.cfg.xapian_stemming and language:
+            self.stemmer = Stemmer(language)
+        else:
+            self.stemmer = None
+
+    def raw_tokenize(self, value):
+        def enc(uc):
+            """ 'encode' unicode results into whatever xapian / xapwrap wants """
+            lower = uc.lower()
+            return lower
+            
+        if isinstance(value, list): # used for page links
+            for v in value:
+                yield enc(v)
+        else:
+            tokenstream = re.finditer(self.token_re, value)
+            for m in tokenstream:
+                if m.group("acronym"):
+                    yield enc(m.group("acronym").replace('.', ''))
+                elif m.group("company"):
+                    yield enc(m.group("company"))
+                elif m.group("email"):
+                    for word in self.mail_re.split(m.group("email")):
+                        if word:
+                            yield enc(word)
+                elif m.group("hostname"):
+                    for word in self.dot_re.split(m.group("hostname")):
+                        yield enc(word)
+                elif m.group("num"):
+                    for word in self.dot_re.split(m.group("num")):
+                        yield enc(word)
+                elif m.group("word"):
+                    word = m.group("word")
+                    yield enc(word)
+                    # if it is a CamelCaseWord, we additionally yield Camel, Case and Word
+                    if self.wikiword_re.match(word):
+                        for sm in re.finditer(self.singleword_re, word):
+                            yield enc(sm.group())
+
+    def tokenize(self, value, flat_stemming=True):
+        """Yield a stream of lower cased raw and stemmed (optional) words from a string.
+           value must be a unicode object or a list of unicode objects
+        """
+        for i in self.raw_tokenize(value):
+            if flat_stemming:
+                yield i # XXX: should we really use a prefix for that? Index.prefixMap['raw'] + i
+                if self.stemmer:
+                    yield self.stemmer.stemWord(i)
+            else:
+                if self.stemmer:
+                    yield (i, self.stemmer.stemWord(i))
+                else: # no stemmer available, fall back to the raw word
+                    yield (i, i)
+
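Roughly what the analyzer yields for a sample string (illustrative;
stemming disabled here, so tokenize() degenerates to raw_tokenize()):

    analyzer = WikiAnalyzer()         # no request/language -> no stemmer
    tokens = list(analyzer.tokenize(u'MoinMoin runs v1.5.3 at example.org'))
    # roughly: [u'moinmoin', u'moin', u'moin', u'runs', u'v1', u'5', u'3',
    #           u'at', u'example', u'org']
    # (CamelCase words additionally yield their single-word parts)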
+
+#############################################################################
+### Indexing
+#############################################################################
+
+class UpdateQueue:
+    def __init__(self, file, lock_dir):
+        self.file = file
+        self.writeLock = lock.WriteLock(lock_dir, timeout=10.0)
+        self.readLock = lock.ReadLock(lock_dir, timeout=10.0)
+
+    def exists(self):
+        return os.path.exists(self.file)
+
+    def append(self, pagename):
+        """ Append a page to queue """
+        if not self.writeLock.acquire(60.0):
+            # no request object is available here, so log to stderr
+            sys.stderr.write("can't add %r to xapian update queue: "
+                             "can't lock queue\n" % pagename)
+            return
+        try:
+            f = codecs.open(self.file, 'a', config.charset)
+            try:
+                f.write(pagename + "\n")
+            finally:
+                f.close()
+        finally:
+            self.writeLock.release()
+
+    def pages(self):
+        """ Return list of pages in the queue """
+        if self.readLock.acquire(1.0):
+            try:
+                return self._decode(self._read())
+            finally:
+                self.readLock.release()
+        return []
+
+    def remove(self, pages):
+        """ Remove pages from the queue
+        
+        When the queue is empty, the queue file is removed, so exists()
+        can tell if there is something waiting in the queue.
+        """
+        if self.writeLock.acquire(30.0):
+            try:
+                queue = self._decode(self._read())
+                for page in pages:
+                    try:
+                        queue.remove(page)
+                    except ValueError:
+                        pass
+                if queue:
+                    self._write(queue)
+                else:
+                    self._removeFile()
+                return True
+            finally:
+                self.writeLock.release()
+        return False
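A usage sketch (paths hypothetical):

    queue = UpdateQueue('/tmp/wiki/update-queue', '/tmp/wiki/update-queue-lock')
    queue.append(u'FrontPage')
    queue.append(u'FrontPage')        # duplicate, filtered out on read
    assert queue.pages() == [u'FrontPage']
    queue.remove([u'FrontPage'])      # queue now empty, file removed
    assert not queue.exists()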
+
+    # Private -------------------------------------------------------
+
+    def _decode(self, data):
+        """ Decode queue data """
+        pages = data.splitlines()
+        return self._filterDuplicates(pages)
+
+    def _filterDuplicates(self, pages):
+        """ Filter duplicates in page list, keeping the order """
+        unique = []
+        seen = {}
+        for name in pages:
+            if not name in seen:
+                unique.append(name)
+                seen[name] = 1
+        return unique
+
+    def _read(self):
+        """ Read and return queue data
+        
+        This does not do anything with the data so we can release the
+        lock as soon as possible, enabling others to update the queue.
+        """
+        try:
+            f = codecs.open(self.file, 'r', config.charset)
+            try:
+                return f.read()
+            finally:
+                f.close()
+        except (OSError, IOError), err:
+            if err.errno != errno.ENOENT:
+                raise
+            return ''
+
+    def _write(self, pages):
+        """ Write pages to queue file
+        
+        Requires queue write locking.
+        """
+        # XXX use tmpfile/move for atomic replace on real operating systems
+        data = '\n'.join(pages) + '\n'
+        f = codecs.open(self.file, 'w', config.charset)
+        try:
+            f.write(data)
+        finally:
+            f.close()
+
+    def _removeFile(self):
+        """ Remove queue file 
+        
+        Requires queue write locking.
+        """
+        try:
+            os.remove(self.file)
+        except OSError, err:
+            if err.errno != errno.ENOENT:
+                raise
+
+
+class Index:
+    indexValueMap = {
+        # mapping the value names we can easily fetch from the index to
+        # integers required by xapian. 0 and 1 are reserved by xapwrap!
+        'pagename': 2,
+        'attachment': 3,
+        'mtime': 4,
+        'wikiname': 5,
+    }
+    prefixMap = {
+        # http://svn.xapian.org/*checkout*/trunk/xapian-applications/omega/docs/termprefixes.txt
+        'author': 'A',
+        'date':   'D', # numeric format: YYYYMMDD or "latest" - e.g. D20050224 or Dlatest
+                       #G   newsGroup (or similar entity - e.g. a web forum name)
+        'hostname': 'H',
+        'keyword': 'K',
+        'lang': 'L',   # ISO Language code
+                       #M   Month (numeric format: YYYYMM)
+                       #N   ISO couNtry code (or domaiN name)
+                       #P   Pathname
+                       #Q   uniQue id
+        'raw':  'R',   # Raw (i.e. unstemmed) term
+        'title': 'S',  # Subject (or title)
+        'mimetype': 'T',
+        'url': 'U',    # full URL of indexed document - if the resulting term would be > 240
+                       # characters, a hashing scheme is used to prevent overflowing
+                       # the Xapian term length limit (see omindex for how to do this).
+                       #W   "weak" (approximately 10 day intervals, taken as YYYYMMD from
+                       #  the D term, and changing the last digit to a '2' if it's a '3')
+                       #X   longer prefix for user-defined use
+        'linkto': 'XLINKTO', # this document links to that document
+        'stem_lang': 'XSTEMLANG', # ISO Language code this document was stemmed in 
+                       #Y   year (four digits)
+    }
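Roughly how these prefixes surface in the index: a prefixed field value
is stored as prefix + token, so the query classes above can filter on it:

    # e.g. for an English page with title word 'foo' linking to OtherPage:
    #   'Len'      <- prefixMap['lang'] + language code
    #   'Sfoo'     <- prefixMap['title'] + (lower-cased) title token
    #   'XLINKTO' + link target, one term per page it links to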
+
+    class LockedException(Exception):
+        pass
+    
+    def __init__(self, request):
+        self.request = request
+        main_dir = self._main_dir()
+        self.dir = os.path.join(main_dir, 'index')
+        filesys.makeDirs(self.dir)
+        self.sig_file = os.path.join(main_dir, 'complete')
+        lock_dir = os.path.join(main_dir, 'index-lock')
+        self.lock = lock.WriteLock(lock_dir,
+                                   timeout=3600.0, readlocktimeout=60.0)
+        self.read_lock = lock.ReadLock(lock_dir, timeout=3600.0)
+        self.queue = UpdateQueue(os.path.join(main_dir, 'update-queue'),
+                                 os.path.join(main_dir, 'update-queue-lock'))
+
+        # Disabled until we have a sane way to build the index with a
+        # queue in small steps.
+        ## if not self.exists():
+        ##    self.indexPagesInNewThread(request)
+
+        # Check if we should and can stem words
+        if request.cfg.xapian_stemming and not Stemmer:
+            request.cfg.xapian_stemming = False
+
+    def _main_dir(self):
+        if self.request.cfg.xapian_index_dir:
+            return os.path.join(self.request.cfg.xapian_index_dir,
+                    self.request.cfg.siteid)
+        else:
+            return os.path.join(self.request.cfg.cache_dir, 'xapian')
+
+    def exists(self):
+        """ Check if index exists """        
+        return os.path.exists(self.sig_file)
+                
+    def mtime(self):
+        return os.path.getmtime(self.dir)
+
+    def _search(self, query):
+        """ read lock must be acquired """
+        while True:
+            try:
+                searcher, timestamp = self.request.cfg.xapian_searchers.pop()
+                if timestamp != self.mtime():
+                    searcher.close()
+                else:
+                    break
+            except IndexError:
+                searcher = xapidx.ReadOnlyIndex(self.dir)
+                searcher.configure(self.prefixMap, self.indexValueMap)
+                timestamp = self.mtime()
+                break
+        
+        hits = searcher.search(query, valuesWanted=['pagename', 'attachment', 'mtime', 'wikiname'])
+        self.request.cfg.xapian_searchers.append((searcher, timestamp))
+        return hits
+    
+    def search(self, query):
+        if not self.read_lock.acquire(1.0):
+            raise self.LockedException
+        try:
+            hits = self._search(query)
+        finally:
+            self.read_lock.release()
+        return hits
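A sketch of querying the index directly (request assumed to be a live
request object; the term prefix follows prefixMap above, and QObjQuery
wraps the query object as in the callers of this class):

    index = Index(request)
    if index.exists():
        query = UnicodeQuery(u'Len')  # all documents whose language is 'en'
        try:
            hits = index.search(xapidx.QObjQuery(query))
        except Index.LockedException:
            hits = []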
+
+    def update_page(self, page):
+        self.queue.append(page.page_name)
+        self._do_queued_updates_InNewThread()
+
+    def indexPages(self, files=None, mode='update'):
+        """ Index all pages (and files, if given)
+        
+        Can be called only from a script. To index pages during a user
+        request, use indexPagesInNewThread.
+        @arg files: iterator or list of files to index additionally
+        """
+        if not self.lock.acquire(1.0):
+            self.request.log("can't index: can't acquire lock")
+            return
+        try:
+            request = self._indexingRequest(self.request)
+            self._index_pages(request, None, files, mode)
+        finally:
+            self.lock.release()
+    
+    def indexPagesInNewThread(self, files=None, mode='update'):
+        """ Index all pages in a new thread
+        
+        Should be called from a user request. From a script, use indexPages.
+        """
+        if not self.lock.acquire(1.0):
+            self.request.log("can't index: can't acquire lock")
+            return
+        try:
+            # Prevent rebuilding the index just after it was finished
+            if self.exists():
+                self.lock.release()
+                return
+            from threading import Thread
+            indexThread = Thread(target=self._index_pages,
+                args=(self._indexingRequest(self.request), self.lock, files, mode))
+            indexThread.setDaemon(True)
+            
+            # Join the index thread after current request finish, prevent
+            # Apache CGI from killing the process.
+            def joinDecorator(finish):
+                def func():
+                    finish()
+                    indexThread.join()
+                return func
+
+            self.request.finish = joinDecorator(self.request.finish)
+            indexThread.start()
+        except:
+            self.lock.release()
+            raise
+
+    def optimize(self):
+        pass
+
+    # Private ----------------------------------------------------------------
+
+    def _do_queued_updates_InNewThread(self):
+        """ do queued index updates in a new thread
+        
+        Should be called from a user request. From a script, use indexPages.
+        """
+        if not self.lock.acquire(1.0):
+            self.request.log("can't index: can't acquire lock")
+            return
+        try:
+            from threading import Thread
+            indexThread = Thread(target=self._do_queued_updates,
+                args=(self._indexingRequest(self.request), self.lock))
+            indexThread.setDaemon(True)
+            
+            # Join the index thread after current request finish, prevent
+            # Apache CGI from killing the process.
+            def joinDecorator(finish):
+                def func():
+                    finish()
+                    indexThread.join()
+                return func
+                
+            self.request.finish = joinDecorator(self.request.finish)
+            indexThread.start()
+        except:
+            self.lock.release()
+            raise
+
+    def _do_queued_updates(self, request, lock=None, amount=5):
+        """ Assumes that the write lock is acquired """
+        try:
+            writer = xapidx.Index(self.dir, True)
+            writer.configure(self.prefixMap, self.indexValueMap)
+            pages = self.queue.pages()[:amount]
+            for name in pages:
+                p = Page(request, name)
+                self._index_page(writer, p, mode='update')
+                self.queue.remove([name])
+        finally:
+            writer.close()
+            if lock:
+                lock.release()
+
+    def contentfilter(self, filename):
+        """ Get a filter for content of filename and return unicode content. """
+        request = self.request
+        mt = wikiutil.MimeType(filename=filename)
+        for modulename in mt.module_name():
+            try:
+                execute = wikiutil.importPlugin(request.cfg, 'filter', modulename)
+                break
+            except wikiutil.PluginMissingError:
+                pass
+        else: # for..else: no filter plugin found for any module name
+            request.log("Cannot load filter for mimetype " + modulename)
+            return mt.mime_type(), '' # nothing available to filter the content
+        try:
+            data = execute(self, filename)
+            if debug:
+                request.log("Filter %s returned %d characters for file %s" % (modulename, len(data), filename))
+        except (OSError, IOError), err:
+            data = ''
+            request.log("Filter %s threw error '%s' for file %s" % (modulename, str(err), filename))
+        return mt.mime_type(), data
+   
+    def test(self, request):
+        idx = xapidx.ReadOnlyIndex(self.dir)
+        idx.configure(self.prefixMap, self.indexValueMap)
+        print idx.search("is")
+        #for d in docs:
+        #    request.log("%r %r %r" % (d, d.get('attachment'), d.get('pagename')))
+
+    def _index_file(self, request, writer, filename, mode='update'):
+        """ index a file as it were a page named pagename
+            Assumes that the write lock is acquired
+        """
+        fs_rootpage = 'FS' # XXX FS hardcoded
+        try:
+            wikiname = request.cfg.interwikiname or 'Self'
+            itemid = "%s:%s" % (wikiname, os.path.join(fs_rootpage, filename))
+            mtime = os.path.getmtime(filename)
+            mtime = wikiutil.timestamp2version(mtime)
+            if mode == 'update':
+                query = xapidx.RawQuery(xapdoc.makePairForWrite('itemid', itemid))
+                docs = writer.search(query, valuesWanted=['pagename', 'attachment', 'mtime', 'wikiname', ])
+                if docs:
+                    doc = docs[0] # there should be only one
+                    uid = doc['uid']
+                    docmtime = long(doc['values']['mtime'])
+                    updated = mtime > docmtime
+                    if debug: request.log("uid %r: mtime %r > docmtime %r == updated %r" % (uid, mtime, docmtime, updated))
+                else:
+                    uid = None
+                    updated = True
+            elif mode == 'add':
+                updated = True
+            if debug: request.log("%s %r" % (filename, updated))
+            if updated:
+                xitemid = xapdoc.Keyword('itemid', itemid)
+                mimetype, file_content = self.contentfilter(filename)
+                xwname = xapdoc.SortKey('wikiname', request.cfg.interwikiname or "Self")
+                xpname = xapdoc.SortKey('pagename', fs_rootpage)
+                xattachment = xapdoc.SortKey('attachment', filename) # XXX we should treat files like real pages, not attachments
+                xmtime = xapdoc.SortKey('mtime', mtime)
+                title = " ".join(os.path.join(fs_rootpage, filename).split("/"))
+                xtitle = xapdoc.Keyword('title', title)
+                xmimetype = xapdoc.TextField('mimetype', mimetype, True)
+                xcontent = xapdoc.TextField('content', file_content)
+                doc = xapdoc.Document(textFields=(xcontent, xmimetype, ),
+                                      keywords=(xtitle, xitemid, ),
+                                      sortFields=(xpname, xattachment, xmtime, xwname, ),
+                                     )
+                doc.analyzerFactory = getWikiAnalyzerFactory()
+                if mode == 'update':
+                    if debug: request.log("%s (replace %r)" % (filename, uid))
+                    doc.uid = uid
+                    id = writer.index(doc)
+                elif mode == 'add':
+                    if debug: request.log("%s (add)" % (filename,))
+                    id = writer.index(doc)
+        except (OSError, IOError), err:
+            pass
+
+    def _get_languages(self, page):
+        body = page.get_raw_body()
+        default_lang = page.request.cfg.language_default
+
+        lang = ''
+
+        if page.request.cfg.xapian_stemming:
+            for line in body.split('\n'):
+                if line.startswith('#language'):
+                    lang = line.split(' ')[1]
+                    try:
+                        Stemmer(lang)
+                    except KeyError:
+                        # lang is not stemmable
+                        break
+                    else:
+                        # lang is stemmable
+                        return (lang, lang)
+                elif not line.startswith('#'):
+                    break
+        
+        if not lang:
+            # no lang found at all.. fallback to default language
+            lang = default_lang
+
+        # return actual lang and lang to stem in
+        return (lang, default_lang)
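Behavior sketch, assuming cfg.language_default == 'en' and xapian_stemming
enabled ('tlh' stands in for any language PyStemmer cannot stem):

    # body u'#language fr\n...'  -> ('fr', 'fr')    stemmable language
    # body u'#language tlh\n...' -> ('tlh', 'en')   unstemmable, stem in default
    # body without #language     -> ('en', 'en')    full fallback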
+
+    def _index_page(self, writer, page, mode='update'):
+        """ Index a page - assumes that the write lock is acquired
+            @arg writer: the index writer object
+            @arg page: a page object
+            @arg mode: 'add' = just add, no checks
+                       'update' = check if already in index and update if needed (mtime)
+            
+        """
+        request = page.request
+        wikiname = request.cfg.interwikiname or "Self"
+        pagename = page.page_name
+        mtime = page.mtime_usecs()
+        itemid = "%s:%s" % (wikiname, pagename)
+        # XXX: Hack until we get proper metadata
+        language, stem_language = self._get_languages(page)
+        updated = False
+
+        if mode == 'update':
+            # from #xapian: if you generate a special "unique id" term,
+            # you can just call database.replace_document(uid_term, doc)
+            # -> done in xapwrap.index.Index.index()
+            query = xapidx.RawQuery(xapdoc.makePairForWrite('itemid', itemid))
+            docs = writer.search(query, valuesWanted=['pagename', 'attachment', 'mtime', 'wikiname', ])
+            if docs:
+                doc = docs[0] # there should be only one
+                uid = doc['uid']
+                docmtime = long(doc['values']['mtime'])
+                updated = mtime > docmtime
+                if debug: request.log("uid %r: mtime %r > docmtime %r == updated %r" % (uid, mtime, docmtime, updated))
+            else:
+                uid = None
+                updated = True
+        elif mode == 'add':
+            updated = True
+        if debug: request.log("%s %r" % (pagename, updated))
+        if updated:
+            xwname = xapdoc.SortKey('wikiname', request.cfg.interwikiname or "Self")
+            xpname = xapdoc.SortKey('pagename', pagename)
+            xattachment = xapdoc.SortKey('attachment', '') # this is a real page, not an attachment
+            xmtime = xapdoc.SortKey('mtime', mtime)
+            xtitle = xapdoc.TextField('title', pagename, True) # prefixed
+            xkeywords = [xapdoc.Keyword('itemid', itemid),
+                    xapdoc.Keyword('lang', language),
+                    xapdoc.Keyword('stem_lang', stem_language)]
+            for pagelink in page.getPageLinks(request):
+                xkeywords.append(xapdoc.Keyword('linkto', pagelink))
+            xcontent = xapdoc.TextField('content', page.get_raw_body())
+            doc = xapdoc.Document(textFields=(xcontent, xtitle),
+                                  keywords=xkeywords,
+                                  sortFields=(xpname, xattachment, xmtime, xwname, ),
+                                 )
+            doc.analyzerFactory = getWikiAnalyzerFactory(request,
+                    stem_language)
+
+            if mode == 'update':
+                if debug: request.log("%s (replace %r)" % (pagename, uid))
+                doc.uid = uid
+                id = writer.index(doc)
+            elif mode == 'add':
+                if debug: request.log("%s (add)" % (pagename,))
+                id = writer.index(doc)
+
+        from MoinMoin.action import AttachFile
+
+        attachments = AttachFile._get_files(request, pagename)
+        for att in attachments:
+            filename = AttachFile.getFilename(request, pagename, att)
+            att_itemid = "%s//%s" % (itemid, att)
+            mtime = wikiutil.timestamp2version(os.path.getmtime(filename))
+            if mode == 'update':
+                query = xapidx.RawQuery(xapdoc.makePairForWrite('itemid', att_itemid))
+                docs = writer.search(query, valuesWanted=['pagename', 'attachment', 'mtime', ])
+                if debug: request.log("##%r %r" % (filename, docs))
+                if docs:
+                    doc = docs[0] # there should be only one
+                    uid = doc['uid']
+                    docmtime = long(doc['values']['mtime'])
+                    updated = mtime > docmtime
+                    if debug: request.log("uid %r: mtime %r > docmtime %r == updated %r" % (uid, mtime, docmtime, updated))
+                else:
+                    uid = None
+                    updated = True
+            elif mode == 'add':
+                updated = True
+            if debug: request.log("%s %s %r" % (pagename, att, updated))
+            if updated:
+                xatt_itemid = xapdoc.Keyword('itemid', att_itemid)
+                xwname = xapdoc.SortKey('wikiname', wikiname) # define here too: the page branch above may have been skipped
+                xpname = xapdoc.SortKey('pagename', pagename)
+                xattachment = xapdoc.SortKey('attachment', att) # this is an attachment, store its filename
+                xmtime = xapdoc.SortKey('mtime', mtime)
+                xtitle = xapdoc.Keyword('title', '%s/%s' % (pagename, att))
+                xlanguage = xapdoc.Keyword('lang', language)
+                xstem_language = xapdoc.Keyword('stem_lang', stem_language)
+                mimetype, att_content = self.contentfilter(filename)
+                xmimetype = xapdoc.TextField('mimetype', mimetype, True)
+                xcontent = xapdoc.TextField('content', att_content)
+                doc = xapdoc.Document(textFields=(xcontent, xmimetype, ),
+                                      keywords=(xatt_itemid, xtitle, xlanguage, xstem_language, ),
+                                      sortFields=(xpname, xattachment, xmtime, xwname, ),
+                                     )
+                doc.analyzerFactory = getWikiAnalyzerFactory(request,
+                        stem_language)
+                if mode == 'update':
+                    if debug: request.log("%s (replace %r)" % (pagename, uid))
+                    doc.uid = uid
+                    id = writer.index(doc)
+                elif mode == 'add':
+                    if debug: request.log("%s (add)" % (pagename,))
+                    id = writer.index(doc)
+        #writer.flush()
+        
+
+    def _index_pages(self, request, lock=None, files=None, mode='update'):
+        """ Index all pages (and all given files)
+        
+        This should be called from indexPages or indexPagesInNewThread only!
+        
+        This may take some time, depending on the size of the wiki and speed
+        of the machine.
+
+        When called in a new thread, lock is acquired before the call,
+        and this method must release it when it finishes or fails.
+        """
+        writer = None
+        try:
+            self._unsign()
+            start = time.time()
+            writer = xapidx.Index(self.dir, True)
+            writer.configure(self.prefixMap, self.indexValueMap)
+            pages = request.rootpage.getPageList(user='', exists=1)
+            request.log("indexing all (%d) pages..." % len(pages))
+            for pagename in pages:
+                p = Page(request, pagename)
+                self._index_page(writer, p, mode)
+            if files:
+                request.log("indexing all files...")
+                for fname in files:
+                    fname = fname.strip()
+                    self._index_file(request, writer, fname, mode)
+            writer.close()
+            request.log("indexing completed successfully in %0.2f seconds." %
+                        (time.time() - start))
+            self._sign()
+        finally:
+            if writer is not None:
+                writer.__del__()
+            if lock:
+                lock.release()
+
+    def _optimize(self, request):
+        """ Optimize the index """
+        pass
+
+    def _indexingRequest(self, request):
+        """ Return a new request that can be used for index building.
+        
+        This request uses a security policy that lets the current user
+        read any page. Without this policy, some pages would not render,
+        which would create a broken pagelinks index.
+        """
+        from MoinMoin.request.CLI import Request
+        from MoinMoin.security import Permissions
+        request = Request(request.url)
+        class SecurityPolicy(Permissions):
+            def read(*args, **kw):
+                return True        
+        request.user.may = SecurityPolicy(request.user)
+        return request
+
+    def _unsign(self):
+        """ Remove sig file - assume write lock acquired """
+        try:
+            os.remove(self.sig_file)
+        except OSError, err:
+            if err.errno != errno.ENOENT:
+                raise
+
+    def _sign(self):
+        """ Add sig file - assume write lock acquired """
+        f = file(self.sig_file, 'w')
+        try:
+            f.write('')
+        finally:
+            f.close()
+
+
+def run_query(query, db):
+    enquire = xapian.Enquire(db)
+    parser = xapian.QueryParser()
+    query = parser.parse_query(query, xapian.QueryParser.FLAG_WILDCARD)
+    print query.get_description()
+    enquire.set_query(query)
+    return enquire.get_mset(0, 10)
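+# Usage sketch for run_query (illustrative only, mirroring the commented-out
+# code in run() below; 'test.db' is a placeholder path, not something this
+# module creates):
+#
+#     db = xapian.WritableDatabase(xapian.open('test.db',
+#                                              xapian.DB_CREATE_OR_OPEN))
+#     mset = run_query('hello', db)
+#     print mset.get_matches_estimated()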
+
+def run(request):
+    pass
+    #print "Begin"
+    #db = xapian.WritableDatabase(xapian.open('test.db',
+    #                                         xapian.DB_CREATE_OR_OPEN))
+    #
+    # index_data(db) ???
+    #del db
+    #mset = run_query(sys.argv[1], db)
+    #print mset.get_matches_estimated()
+    #iterator = mset.begin()
+    #while iterator != mset.end():
+    #    print iterator.get_document().get_data()
+    #    iterator.next()
+    #for i in xrange(1,170):
+    #    doc = db.get_document(i)
+    #    print doc.get_data()
+
+if __name__ == '__main__':
+    run()
+
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/MoinMoin/search/__init__.py	Tue Jun 27 15:09:46 2006 +0200
@@ -0,0 +1,27 @@
+# -*- coding: iso-8859-1 -*-
+"""
+    MoinMoin - search engine
+    
+    @copyright: 2005 MoinMoin:FlorianFesti,
+                2005 MoinMoin:NirSoffer,
+                2005 MoinMoin:AlexanderSchremmer,
+                2006 MoinMoin:ThomasWaldmann,
+                2006 MoinMoin:FranzPletz
+    @license: GNU GPL, see COPYING for details
+"""
+
+from MoinMoin.search.queryparser import QueryParser
+from MoinMoin.search.builtin import Search
+
+def searchPages(request, query, **kw):
+    """ Search the text of all pages for query.
+    
+    @param request: current request
+    @param query: the expression (string or query objects) we want to search for
+    @rtype: SearchResults instance
+    @return: search results
+    """
+    if isinstance(query, str) or isinstance(query, unicode):
+        query = QueryParser(**kw).parse_query(query)
+    return Search(request, query).run()
+
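+# Usage sketch (illustrative, assuming a live request object): query may now
+# be a plain string, which searchPages parses internally, so there is no need
+# to invoke QueryParser manually:
+#
+#     from MoinMoin import search
+#     results = search.searchPages(request, u'title:Foo -deprecated')
+#     results.sortByWeight()
+#     html = results.pageList(request, request.html_formatter, numbered=1)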
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/MoinMoin/search/builtin.py	Tue Jun 27 15:09:46 2006 +0200
@@ -0,0 +1,159 @@
+# -*- coding: iso-8859-1 -*-
+"""
+    MoinMoin - search engine
+    
+    @copyright: 2005 MoinMoin:FlorianFesti,
+                2005 MoinMoin:NirSoffer,
+                2005 MoinMoin:AlexanderSchremmer,
+                2006 MoinMoin:ThomasWaldmann,
+                2006 MoinMoin:FranzPletz
+    @license: GNU GPL, see COPYING for details
+"""
+
+import time, sys
+from MoinMoin import wikiutil, config
+from MoinMoin.Page import Page
+from MoinMoin.search.results import FoundRemote, FoundPage, FoundAttachment, SearchResults
+
+try:
+    from MoinMoin.search import Xapian
+except ImportError:
+    pass
+
+
+##############################################################################
+### Searching
+##############################################################################
+
+class Search:
+    """ A search run """
+    
+    def __init__(self, request, query):
+        self.request = request
+        self.query = query
+        self.filtered = False
+        self.fs_rootpage = "FS" # XXX FS hardcoded
+
+    def run(self):
+        """ Perform search and return results object """
+        start = time.time()
+        if self.request.cfg.xapian_search:
+            hits = self._xapianSearch()
+        else:
+            hits = self._moinSearch()
+            
+        # important - filter deleted pages or pages the user may not read!
+        if not self.filtered:
+            hits = self._filter(hits)
+        
+        result_hits = []
+        for wikiname, page, attachment, match in hits:
+            if wikiname in (self.request.cfg.interwikiname, 'Self'): # a local match
+                if attachment:
+                    result_hits.append(FoundAttachment(page.page_name, attachment))
+                else:
+                    result_hits.append(FoundPage(page.page_name, match))
+            else:
+                result_hits.append(FoundRemote(wikiname, page, attachment, match))
+        elapsed = time.time() - start
+        count = self.request.rootpage.getPageCount()
+        return SearchResults(self.query, result_hits, count, elapsed)
+
+    # ----------------------------------------------------------------
+    # Private!
+
+    def _xapianSearch(self):
+        """ Search using Xapian
+        
+        Get a list of candidate pages using the fast xapian search and
+        then run the moin search on those pages only.
+        """
+        pages = None
+        try:
+            index = Xapian.Index(self.request)
+        except NameError:
+            index = None
+        if index and index.exists() and self.query.xapian_wanted():
+            self.request.clock.start('_xapianSearch')
+            try:
+                from MoinMoin.support import xapwrap
+                query = self.query.xapian_term(self.request)
+                self.request.log("xapianSearch: query = %r" %
+                        query.get_description())
+                query = xapwrap.index.QObjQuery(query)
+                hits = index.search(query)
+                self.request.log("xapianSearch: finds: %r" % hits)
+                def dict_decode(d):
+                    """ decode dict values to unicode """
+                    for k, v in d.items():
+                        d[k] = d[k].decode(config.charset)
+                    return d
+                pages = [dict_decode(hit['values']) for hit in hits]
+                self.request.log("xapianSearch: finds pages: %r" % pages)
+            except index.LockedException:
+                pass
+            self.request.clock.stop('_xapianSearch')
+        return self._moinSearch(pages)
+
+    def _moinSearch(self, pages=None):
+        """ Search pages using moin's built-in full text search 
+        
+        Return a list of tuples (wikiname, page, attachment, match).
+        The list may contain deleted pages or pages the user may not read.
+        """
+        self.request.clock.start('_moinSearch')
+        from MoinMoin.Page import Page
+        if pages is None:
+            # if we are not called from _xapianSearch, we make a full pagelist,
+            # but don't search attachments (thus attachment name = '')
+            pages = [{'pagename': p, 'attachment': '', 'wikiname': 'Self', } for p in self._getPageList()]
+        hits = []
+        fs_rootpage = self.fs_rootpage
+        for valuedict in pages:
+            wikiname = valuedict['wikiname']
+            pagename = valuedict['pagename']
+            attachment = valuedict['attachment']
+            if wikiname in (self.request.cfg.interwikiname, 'Self'): # THIS wiki
+                page = Page(self.request, pagename)
+                if attachment:
+                    if pagename == fs_rootpage: # not really an attachment
+                        page = Page(self.request, "%s/%s" % (fs_rootpage, attachment))
+                        hits.append((wikiname, page, None, None))
+                    else:
+                        hits.append((wikiname, page, attachment, None))
+                else:
+                    match = self.query.search(page)
+                    if match:
+                        hits.append((wikiname, page, attachment, match))
+            else: # other wiki
+                hits.append((wikiname, pagename, attachment, None))
+        self.request.clock.stop('_moinSearch')
+        return hits
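+    # Shape of the returned hits (illustrative):
+    #   ('Self', <Page SomePage>, '', [TextMatch, ...])   # match in page body
+    #   ('Self', <Page SomePage>, u'logo.png', None)      # attachment hit
+    #   (u'OtherWiki', u'SomePage', '', None)             # remote wiki hit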
+
+    def _getPageList(self):
+        """ Get list of pages to search in 
+        
+        If the query has a page filter, use it to filter pages before
+        searching. If not, get an unfiltered page list. The filtering
+        will then happen later on the hits, which is faster with the
+        current slow storage.
+        """
+        filter = self.query.pageFilter()
+        if filter:
+            # There is no need to filter the results again.
+            self.filtered = True
+            return self.request.rootpage.getPageList(filter=filter)
+        else:
+            return self.request.rootpage.getPageList(user='', exists=0)
+        
+    def _filter(self, hits):
+        """ Filter out deleted or acl protected pages """
+        userMayRead = self.request.user.may.read
+        fs_rootpage = self.fs_rootpage + "/"
+        thiswiki = (self.request.cfg.interwikiname, 'Self')
+        filtered = [(wikiname, page, attachment, match) for wikiname, page, attachment, match in hits
+                    if not wikiname in thiswiki or
+                       page.exists() and userMayRead(page.page_name) or
+                       page.page_name.startswith(fs_rootpage)]
+        return filtered
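+    # Note the operator precedence above (illustrative reading): a local hit
+    # survives only if (page.exists() and userMayRead(...)) holds or the page
+    # lives under the "FS/" pseudo root; hits from other wikis always pass.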
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/MoinMoin/search/queryparser.py	Tue Jun 27 15:09:46 2006 +0200
@@ -0,0 +1,695 @@
+# -*- coding: iso-8859-1 -*-
+"""
+    MoinMoin - search engine query parser
+    
+    @copyright: 2005 MoinMoin:FlorianFesti,
+                2005 MoinMoin:NirSoffer,
+                2005 MoinMoin:AlexanderSchremmer,
+                2006 MoinMoin:ThomasWaldmann,
+                2006 MoinMoin:FranzPletz
+    @license: GNU GPL, see COPYING for details
+"""
+
+import re, string
+from MoinMoin import config
+from MoinMoin.search.results import Match, TitleMatch, TextMatch
+
+try:
+    from MoinMoin.search import Xapian
+    from MoinMoin.search.Xapian import Query, UnicodeQuery
+except ImportError:
+    pass
+
+#############################################################################
+### query objects
+#############################################################################
+
+class BaseExpression:
+    """ Base class for all search terms """
+    
+    def __init__(self):
+        self.negated = 0
+
+    def __str__(self):
+        return unicode(self).encode(config.charset, 'replace')
+
+    def negate(self):
+        """ Negate the result of this term """
+        self.negated = 1 
+
+    def pageFilter(self):
+        """ Return a page filtering function
+
+        This function is used to filter the page list before we search
+        it. Return a function that gets a page name and returns a bool.
+
+        The default expression does not have any filter function and
+        returns None. Subclasses may define custom filter functions.
+        """
+        return None
+
+    def search(self, page):
+        """ Search a page
+
+        Returns a list of Match objects, or None if the term didn't find
+        anything (vice versa if negate() was called). Terms containing
+        other terms must call this method to aggregate the results.
+        This base class matches everything when negated ([Match()]) and
+        nothing (None) otherwise.
+        """
+        if self.negated:
+            # XXX why?
+            return [Match()]
+        else:
+            return None
+    
+    def costs(self):
+        """ Return estimated time to calculate this term
+        
+        The number is relative to other terms and has no real unit.
+        It allows doing the fast searches first.
+        """ 
+        return 0
+
+    def highlight_re(self):
+        """ Return a regular expression of what the term searches for
+
+        Used to display the needle in the page.
+        """
+        return ''
+
+    def _build_re(self, pattern, use_re=False, case=False, stemmed=False):
+        """ Make a regular expression out of a text pattern """
+        flags = case and re.U or (re.I | re.U)
+        if use_re:
+            try:
+                self.search_re = re.compile(pattern, flags)
+            except re.error:
+                pattern = re.escape(pattern)
+                self.pattern = pattern
+                self.search_re = re.compile(pattern, flags)
+            else:
+                self.pattern = pattern
+        else:
+            pattern = re.escape(pattern)
+            self.search_re = re.compile(pattern, flags)
+            self.pattern = pattern
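+    # Behaviour sketch (illustrative): without use_re, pattern u'foo.bar' is
+    # escaped, so self.pattern == u'foo\\.bar' and only the literal text
+    # matches; with use_re, an invalid regex like u'foo(' silently falls
+    # back to its escaped literal form.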
+
+
+class AndExpression(BaseExpression):
+    """ A term connecting several sub terms with a logical AND """
+
+    operator = ' '
+
+    def __init__(self, *terms):
+        self._subterms = list(terms)
+        self._costs = 0
+        for t in self._subterms:
+            self._costs += t.costs()
+        self.negated = 0
+
+    def append(self, expression):
+        """ Append another term """
+        self._subterms.append(expression)
+        self._costs += expression.costs()
+
+    def subterms(self):
+        return self._subterms
+    
+    def costs(self):
+        return self._costs
+
+    def __unicode__(self):
+        result = ''
+        for t in self._subterms:
+            result += self.operator + unicode(t)
+        return u'[' + result[len(self.operator):] + u']'
+
+    def pageFilter(self):
+        """ Return a page filtering function
+
+        This function is used to filter page list before we search it.
+
+        Return a function that gets a page name and returns a bool, or None.
+        """
+        # Sort terms by cost, then get all title searches
+        self.sortByCost()
+        terms = [term for term in self._subterms if isinstance(term, TitleSearch)]
+        if terms:
+            # Create and return a filter function
+            def filter(name):
+                """ A function that return True if all terms filter name """
+                for term in terms:
+                    filter = term.pageFilter()
+                    if not filter(name):
+                        return False
+                return True
+            return filter
+        
+        return None
+
+    def sortByCost(self):
+        tmp = [(term.costs(), term) for term in self._subterms]
+        tmp.sort()
+        self._subterms = [item[1] for item in tmp]
+
+    def search(self, page):
+        """ Search for each term, cheap searches first """
+        self.sortByCost()
+        matches = []
+        for term in self._subterms:
+            result = term.search(page)
+            if not result:
+                return None
+            matches.extend(result)
+        return matches
+
+    def highlight_re(self):
+        result = []
+        for s in self._subterms:
+            highlight_re = s.highlight_re()
+            if highlight_re: result.append(highlight_re)
+            
+        return '|'.join(result)
+
+    def xapian_wanted(self):
+        wanted = True
+        for term in self._subterms:
+            wanted = wanted and term.xapian_wanted()
+        return wanted
+
+    def xapian_term(self, request):
+        # sort negated terms
+        terms = []
+        not_terms = []
+        for term in self._subterms:
+            if not term.negated:
+                terms.append(term.xapian_term(request))
+            else:
+                not_terms.append(term.xapian_term(request))
+
+        # prepare query for the non-negated terms
+        if len(terms) == 1:
+            t1 = Query(terms[0])
+        else:
+            t1 = Query(Query.OP_AND, terms)
+
+        # negated terms?
+        if not not_terms:
+            # no, just return the query for the non-negated terms
+            return t1
+
+        # yes, combine the non-negated and negated term queries with an AND_NOT query
+        if len(not_terms) == 1:
+            t2 = Query(not_terms[0])
+        else:
+            t2 = Query(Query.OP_OR, not_terms)
+
+        return Query(Query.OP_AND_NOT, t1, t2)
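+    # Composition sketch (illustrative): for a parsed query like
+    # [u'linux' u'kernel' -u'vendor'] this returns roughly
+    #     Query(OP_AND_NOT, Query(OP_AND, [linux, kernel]), vendor)
+    # i.e. negated terms are subtracted from the conjunction in one query.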
+
+
+class OrExpression(AndExpression):
+    """ A term connecting several sub terms with a logical OR """
+    
+    operator = ' or '
+
+    def search(self, page):
+        """ Search page with terms, cheap terms first
+
+        XXX Do we have any reason to sort here? We are not breaking out
+        of the search in any case.
+        """
+        self.sortByCost()
+        matches = []
+        for term in self._subterms:
+            result = term.search(page)
+            if result:
+                matches.extend(result)
+        return matches
+
+    def xapian_term(self, request):
+        # XXX: negated terms managed by _moinSearch?
+        return Query(Query.OP_OR, [term.xapian_term(request) for term in self._subterms])
+
+
+class TextSearch(BaseExpression):
+    """ A term that does a normal text search
+
+    Both page content and the page title are searched, using an
+    additional TitleSearch term.
+    """
+    
+    def __init__(self, pattern, use_re=False, case=False):
+        """ Init a text search
+
+        @param pattern: pattern to search for, ascii string or unicode
+        @param use_re: treat pattern as a regular expression instead of plain text, bool
+        @param case: do case sensitive search, bool 
+        """
+        self._pattern = unicode(pattern)
+        self.negated = 0
+        self.use_re = use_re
+        self.case = case
+        self._build_re(self._pattern, use_re=use_re, case=case)
+        self.titlesearch = TitleSearch(self._pattern, use_re=use_re, case=case)
+        
+    def costs(self):
+        return 10000
+    
+    def __unicode__(self):
+        neg = self.negated and '-' or ''
+        return u'%s"%s"' % (neg, unicode(self._pattern))
+
+    def highlight_re(self):
+        return u"(%s)" % self._pattern
+
+    def search(self, page):
+        matches = []
+
+        # Search in page name
+        results = self.titlesearch.search(page)
+        if results:
+            matches.extend(results)
+
+        # Search in page body
+        body = page.get_raw_body()
+        for match in self.search_re.finditer(body):
+            if page.request.cfg.xapian_stemming:
+                # somewhere in regular word
+                if body[match.start()] not in config.chars_upper and \
+                        body[match.start()-1] in config.chars_lower:
+                    continue
+
+                post = 0
+                for c in body[match.end():]:
+                    if c in config.chars_lower:
+                        post += 1
+                    else:
+                        break
+
+                matches.append(TextMatch(start=match.start(),
+                        end=match.end()+post))
+            else:
+                matches.append(TextMatch(re_match=match))
+
+        # Decide what to do with the results.
+        if ((self.negated and matches) or
+            (not self.negated and not matches)):
+            return None
+        elif matches:
+            return matches
+        else:
+            return []
+
+    def xapian_wanted(self):
+        return not self.use_re
+
+    def xapian_term(self, request):
+        if self.use_re:
+            return None # xapian can't do regex search
+        else:
+            analyzer = Xapian.WikiAnalyzer(request=request,
+                    language=request.cfg.language_default)
+            terms = self._pattern.split()
+
+            # all parsed wikiwords, AND'ed
+            queries = []
+            stemmed = []
+            for t in terms:
+                if request.cfg.xapian_stemming:
+                    # stemmed OR not stemmed
+                    tmp = []
+                    for i in analyzer.tokenize(t, flat_stemming=False):
+                        tmp.append(UnicodeQuery(Query.OP_OR, i))
+                        stemmed.append(i[1])
+                    t = tmp
+                else:
+                    # just not stemmed
+                    t = [UnicodeQuery(i) for i in analyzer.tokenize(t)]
+                queries.append(Query(Query.OP_AND, t))
+
+            if stemmed:
+                self._build_re(' '.join(stemmed), use_re=False,
+                        case=self.case, stemmed=True)
+
+            # titlesearch OR parsed wikiwords
+            return Query(Query.OP_OR,
+                    (self.titlesearch.xapian_term(request),
+                        Query(Query.OP_AND, queries)))
+
+
+class TitleSearch(BaseExpression):
+    """ Term searches in pattern in page title only """
+
+    def __init__(self, pattern, use_re=False, case=False):
+        """ Init a title search
+
+        @param pattern: pattern to search for, ascii string or unicode
+        @param use_re: treat pattern as a regular expression instead of plain text, bool
+        @param case: do case sensitive search, bool 
+        """
+        self._pattern = unicode(pattern)
+        self.negated = 0
+        self.use_re = use_re
+        self.case = case
+        self._build_re(self._pattern, use_re=use_re, case=case)
+        
+    def costs(self):
+        return 100
+
+    def __unicode__(self):
+        neg = self.negated and '-' or ''
+        return u'%s!"%s"' % (neg, unicode(self._pattern))
+
+    def highlight_re(self):
+        return u"(%s)" % self._pattern
+
+    def pageFilter(self):
+        """ Page filter function for single title search """
+        def filter(name):
+            match = self.search_re.search(name)
+            if ((self.negated and match) or
+                (not self.negated and not match)):
+                return False
+            return True
+        return filter
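+    # Illustrative: TitleSearch(u'Help').pageFilter() yields a callable f
+    # where f(u'HelpContents') is True and f(u'FrontPage') is False;
+    # negating the term inverts both answers.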
+            
+    def search(self, page):
+        # Get matches in page name
+        matches = []
+        for match in self.search_re.finditer(page.page_name):
+            if page.request.cfg.xapian_stemming:
+                # somewhere in regular word
+                if page.page_name[match.start()] not in config.chars_upper and \
+                        page.page_name[match.start()-1] in config.chars_lower:
+                    continue
+
+                post = 0
+                for c in page.page_name[match.end():]:
+                    if c in config.chars_lower:
+                        post += 1
+                    else:
+                        break
+
+                matches.append(TitleMatch(start=match.start(),
+                        end=match.end()+post))
+            else:
+                matches.append(TitleMatch(re_match=match))
+        
+        if ((self.negated and matches) or
+            (not self.negated and not matches)):
+            return None
+        elif matches:
+            return matches
+        else:
+            return []
+
+    def xapian_wanted(self):
+        return not self.use_re
+
+    def xapian_term(self, request):
+        if self.use_re:
+            return None # xapian doesn't support regex search
+        else:
+            analyzer = Xapian.WikiAnalyzer(request=request,
+                    language=request.cfg.language_default)
+            terms = self._pattern.split()
+            terms = [list(analyzer.raw_tokenize(t)) for t in terms]
+
+            # all parsed wikiwords, AND'ed
+            queries = []
+            stemmed = []
+            for t in terms:
+                if request.cfg.xapian_stemming:
+                    # stemmed OR not stemmed
+                    tmp = []
+                    for i in analyzer.tokenize(t, flat_stemming=False):
+                        tmp.append(UnicodeQuery(Query.OP_OR, ['%s%s' %
+                            (Xapian.Index.prefixMap['title'], j) for j in i]))
+                        stemmed.append(i[1])
+                    t = tmp
+                else:
+                    # just not stemmed
+                    t = [UnicodeQuery('%s%s' % (Xapian.Index.prefixMap['title'], i))
+                        for i in analyzer.tokenize(t)]
+
+                queries.append(Query(Query.OP_AND, t))
+
+            if stemmed:
+                self._build_re(' '.join(stemmed), use_re=False,
+                        case=self.case, stemmed=True)
+
+            return Query(Query.OP_AND, queries)
+
+
+class LinkSearch(BaseExpression):
+    """ Search the term in the pagelinks """
+
+    def __init__(self, pattern, use_re=False, case=True):
+        """ Init a link search
+
+        @param pattern: pattern to search for, ascii string or unicode
+        @param use_re: treat pattern as a regular expression instead of plain text, bool
+        @param case: do case sensitive search, bool 
+        """
+        # used for search in links
+        self._pattern = pattern
+        # used for search in text
+        self._textpattern = '(' + self._pattern.replace('/', '|') + ')'
+        self.negated = 0
+        self.use_re = use_re
+        self.case = case
+        self.textsearch = TextSearch(self._textpattern, use_re=1, case=case)
+        self._build_re(unicode(pattern), use_re=use_re, case=case)
+
+    def _build_re(self, pattern, use_re=False, case=False):
+        """ Make a regular expression out of a text pattern """
+        flags = case and re.U or (re.I | re.U)
+        try:
+            if not use_re:
+                raise re.error
+            self.search_re = re.compile(pattern, flags)
+            self.static = False
+        except re.error:
+            self.pattern = pattern
+            self.static = True
+        
+    def costs(self):
+        return 5000 # cheaper than a TextSearch
+
+    def __unicode__(self):
+        neg = self.negated and '-' or ''
+        return u'%s!"%s"' % (neg, unicode(self._pattern))
+
+    def highlight_re(self):
+        return u"(%s)" % self._textpattern
+
+    def search(self, page):
+        # Get matches in page name
+        matches = []
+
+        found = True
+
+        for link in page.getPageLinks(page.request):
+            if ((self.static and self.pattern == link) or
+                (not self.static and self.search_re.match(link))):
+                break
+        else:
+            found = False
+
+        if found:
+            # Search in page text
+            results = self.textsearch.search(page)
+            if results:
+                matches.extend(results)
+            else: #This happens e.g. for pages that use navigation macros
+                matches.append(TextMatch(0, 0))
+
+        # Decide what to do with the results.
+        if ((self.negated and matches) or
+            (not self.negated and not matches)):
+            return None
+        elif matches:
+            return matches
+        else:
+            return []
+
+    def xapian_wanted(self):
+        return not self.use_re
+
+    def xapian_term(self, request):
+        if self.use_re:
+            return None # xapian doesn't support regex search
+        else:
+            # self.pattern is only set for static (non-regex) patterns
+            return UnicodeQuery('%s:%s' %
+                    (Xapian.Index.prefixMap['linkto'], self.pattern))
+
+
+class LanguageSearch(BaseExpression):
+    """ Search the pages written in a language """
+
+    def __init__(self, pattern, use_re=False, case=True):
+        """ Init a language search
+
+        @param pattern: pattern to search for, ascii string or unicode
+        @param use_re: treat pattern as a regular expression instead of plain text, bool
+        @param case: do case sensitive search, bool 
+        """
+        # iso language code, always lowercase
+        self._pattern = pattern.lower()
+        self.negated = 0
+        self.use_re = use_re
+        self.case = case
+        self.xapian_called = False
+        self._build_re(self._pattern, use_re=use_re, case=case)
+
+    def costs(self):
+        return 5000 # cheaper than a TextSearch
+
+    def __unicode__(self):
+        neg = self.negated and '-' or ''
+        return u'%s!"%s"' % (neg, unicode(self._pattern))
+
+    def highlight_re(self):
+        return ""
+
+    def search(self, page):
+        # We just use (and trust ;)) xapian for this; deactivated for _moinSearch.
+        if not self.xapian_called:
+            return []
+        else:
+            return [Match()]
+
+    def xapian_wanted(self):
+        return not self.use_re
+
+    def xapian_term(self, request):
+        pattern = self.pattern
+        if self.use_re:
+            return None # xapian doesn't support regex search
+        else:
+            self.xapian_called = True
+            return UnicodeQuery('%s%s' %
+                    (Xapian.Index.prefixMap['lang'], pattern))
+
+
+##############################################################################
+### Parse Query
+##############################################################################
+
+class QueryParser:
+    """
+    Converts a string into a tree of query objects
+    using recursive top-down parsing
+    """
+
+    def __init__(self, **kw):
+        """
+        @keyword titlesearch: treat all terms as title searches
+        @keyword case: do case sensitive search
+        @keyword regex: treat all terms as regular expressions
+        """
+        self.titlesearch = kw.get('titlesearch', 0)
+        self.case = kw.get('case', 0)
+        self.regex = kw.get('regex', 0)
+
+    def parse_query(self, query):
+        """ transform an string into a tree of Query objects """
+        if isinstance(query, str):
+            query = query.decode(config.charset)
+        self._query = query
+        result = self._or_expression()
+        if result is None:
+            result = BaseExpression()
+        return result
+
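+    # Parsing sketch (illustrative):
+    #     unicode(QueryParser().parse_query(u'windows or linux'))
+    # gives u'["windows" or "linux"]', an OrExpression of two TextSearch
+    # terms; quoting and 'title:'/'regex:' style modifiers are handled in
+    # _single_term below.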
+    def _or_expression(self):
+        result = self._and_expression()
+        if self._query:
+            result = OrExpression(result)
+        while self._query:
+            q = self._and_expression()
+            if q:
+                result.append(q)
+        return result
+            
+    def _and_expression(self):
+        result = None
+        while not result and self._query:
+            result = self._single_term()
+        term = self._single_term()
+        if term:
+            result = AndExpression(result, term)
+        else:
+            return result
+        term = self._single_term()
+        while term:
+            result.append(term)
+            term = self._single_term()
+        return result
+                                
+    def _single_term(self):
+        regex = (r'(?P<NEG>-?)\s*(' +              # leading '-'
+                 r'(?P<OPS>\(|\)|(or\b(?!$)))|' +  # or, (, )
+                 r'(?P<MOD>(\w+:)*)' +
+                 r'(?P<TERM>("[^"]+")|' +
+                 r"('[^']+')|(\S+)))")             # search word itself
+        self._query = self._query.strip()
+        match = re.match(regex, self._query, re.U)
+        if not match:
+            return None
+        self._query = self._query[match.end():]
+        ops = match.group("OPS")
+        if ops == '(':
+            result = self._or_expression()
+            if match.group("NEG"): result.negate()
+            return result
+        elif ops == ')':
+            return None
+        elif ops == 'or':
+            return None
+        modifiers = match.group('MOD').split(":")[:-1]
+        text = match.group('TERM')
+        if self.isQuoted(text):
+            text = text[1:-1]
+
+        title_search = self.titlesearch
+        regex = self.regex
+        case = self.case
+        linkto = False
+        lang = False
+
+        for m in modifiers:
+            if "title".startswith(m):
+                title_search = True
+            elif "regex".startswith(m):
+                regex = True
+            elif "case".startswith(m):
+                case = True
+            elif "linkto".startswith(m):
+                linkto = True
+            elif "language".startswith(m):
+                lang = True
+
+        if lang:
+            obj = LanguageSearch(text, use_re=regex, case=False)
+        elif linkto:
+            obj = LinkSearch(text, use_re=regex, case=case)
+        elif title_search:
+            obj = TitleSearch(text, use_re=regex, case=case)
+        else:
+            obj = TextSearch(text, use_re=regex, case=case)
+
+        if match.group("NEG"):
+            obj.negate()
+        return obj
+
+    def isQuoted(self, text):
+        # Empty string '' is not considered quoted
+        if len(text) < 3:
+            return False
+        return (text.startswith('"') and text.endswith('"') or
+                text.startswith("'") and text.endswith("'"))
+
+
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/MoinMoin/search/results.py	Tue Jun 27 15:09:46 2006 +0200
@@ -0,0 +1,642 @@
+# -*- coding: iso-8859-1 -*-
+"""
+    MoinMoin - search engine
+    
+    @copyright: 2005 MoinMoin:FlorianFesti,
+                2005 MoinMoin:NirSoffer,
+                2005 MoinMoin:AlexanderSchremmer,
+                2006 MoinMoin:ThomasWaldmann,
+                2006 MoinMoin:FranzPletz
+    @license: GNU GPL, see COPYING for details
+"""
+
+import StringIO
+from MoinMoin import config, wikiutil
+from MoinMoin.Page import Page
+
+############################################################################
+### Results
+############################################################################
+
+class Match(object):
+    """ Base class for all Matches (found pieces of pages).
+    
+    This class represents a empty True value as returned from negated searches.
+    """
+    # Default match weight
+    _weight = 1.0
+    
+    def __init__(self, start=0, end=0, re_match=None):
+        self.re_match = re_match
+        if not re_match:
+            self._start = start
+            self._end = end
+        else:
+            self._start = self._end = 0
+
+    def __len__(self):
+        return self.end - self.start
+
+    def __eq__(self, other):
+        equal = (self.__class__ == other.__class__ and
+                 self.start == other.start and
+                 self.end == other.end)
+        return equal
+        
+    def __ne__(self, other):
+        return not self.__eq__(other)
+
+    def view(self):
+        return ''
+
+    def weight(self):
+        return self._weight
+
+    def _get_start(self):
+        if self.re_match:
+            return self.re_match.start()
+        return self._start
+
+    def _get_end(self):
+        if self.re_match:
+            return self.re_match.end()
+        return self._end
+
+    # object properties
+    start = property(_get_start)
+    end   = property(_get_end)
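+    # Illustrative: a Match built from a regex match delegates its bounds,
+    # e.g. Match(re_match=re.search(u'br', u'abracadabra')) has start == 1,
+    # end == 3 and len() == 2.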
+
+
+class TextMatch(Match):
+    """ Represents a match in the page content """
+    pass
+
+
+class TitleMatch(Match):
+    """ Represents a match in the page title
+    
+    Has more weight than a match in the page content.
+    """
+    # Matches in titles are much more important in wikis. This setting
+    # seems to make all pages that have matches in the title appear
+    # before pages whose title does not match.
+    _weight = 100.0
+
+
+class AttachmentMatch(Match):
+    """ Represents a match in a attachment content
+
+    Not used yet.
+    """
+    pass
+
+
+class FoundPage:
+    """ Represents a page in a search result """
+
+    def __init__(self, page_name, matches=None, page=None):
+        self.page_name = page_name
+        self.attachment = '' # this is not an attachment
+        self.page = page
+        if matches is None:
+            matches = []
+        self._matches = matches
+
+    def weight(self, unique=1):
+        """ returns how important this page is for the terms searched for
+
+        Summarize the weight of all page matches
+
+        @param unique: ignore identical matches
+        @rtype: int
+        @return: page weight
+        """
+        weight = 0
+        for match in self.get_matches(unique=unique):
+            weight += match.weight()
+            # More sophisticated things to be added, like increase
+            # weight of near matches.
+        return weight
+
+    def add_matches(self, matches):
+        """ Add found matches """
+        self._matches.extend(matches)
+
+    def get_matches(self, unique=1, sort='start', type=Match):
+        """ Return all matches of type sorted by sort
+
+        @param unique: return only unique matches (bool)
+        @param sort: match attribute to sort by (string)
+        @param type: type of match to return (Match or sub class) 
+        @rtype: list
+        @return: list of matches
+        """
+        if unique:
+            matches = self._unique_matches(type=type)
+            if sort == 'start':
+                # matches already sorted by match.start, finished.
+                return matches
+        else:
+            matches = self._matches
+
+        # Filter by type and sort by sort using fast schwartzian
+        # transform.
+        if sort == 'start':
+            tmp = [(match.start, match) for match in matches
+                   if isinstance(match, type)]
+        else:
+            tmp = [(match.weight(), match) for match in matches
+                   if isinstance(match, type)]
+        tmp.sort()
+        if sort == 'weight':
+            tmp.reverse()
+        matches = [item[1] for item in tmp]
+        
+        return matches
+
+    def _unique_matches(self, type=Match):
+        """ Get a list of unique matches of type
+
+        The result is sorted by match.start, because it's easy to remove
+        duplicates like this.
+
+        @param type: type of match to return
+        @rtype: list
+        @return: list of matches of type, sorted by match.start
+        """
+        # Filter by type and sort by match.start using fast schwartzian
+        # transform.
+        tmp = [(match.start, match) for match in self._matches
+               if isinstance(match, type)]
+        tmp.sort()
+
+        if not len(tmp):
+            return []
+
+        # Get first match into matches list
+        matches = [tmp[0][1]]
+
+        # Add the remaining ones of matches ignoring identical matches
+        for item in tmp[1:]:
+            if item[1] == matches[-1]:
+                continue
+            matches.append(item[1])
+
+        return matches
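+    # Illustrative: for matches spanning (0, 4), (0, 4) and (10, 12), the
+    # result keeps a single (0, 4) plus (10, 12), already sorted by start.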
+    
+
+class FoundAttachment(FoundPage):
+    """ Represent an attachment in search results """
+    
+    def __init__(self, page_name, attachment, matches=None, page=None):
+        self.page_name = page_name
+        self.attachment = attachment
+        self.page = page
+        if matches is None:
+            matches = []
+        self._matches = matches
+
+    def weight(self, unique=1):
+        return 1
+
+    def get_matches(self, unique=1, sort='start', type=Match):
+        return []
+
+    def _unique_matches(self, type=Match):
+        return []
+
+
+class FoundRemote(FoundPage):
+    """ Represent an attachment in search results """
+    
+    def __init__(self, wikiname, page_name, attachment, matches=None, page=None):
+        self.wikiname = wikiname
+        self.page_name = page_name
+        self.attachment = attachment
+        self.page = page
+        if matches is None:
+            matches = []
+        self._matches = matches
+
+    def weight(self, unique=1):
+        return 1
+
+    def get_matches(self, unique=1, sort='start', type=Match):
+        return []
+
+    def _unique_matches(self, type=Match):
+        return []
+
+
+############################################################################
+### Search results formatting
+############################################################################
+
+class SearchResults:
+    """ Manage search results, supply different views
+
+    Search results can hold valid search results and format them for
+    many requests, until the wiki content changes.
+
+    For example, one might ask for full page list sorted from A to Z,
+    and then ask for the same list sorted from Z to A. Or sort results
+    by name and then by rank.
+    """
+    # Public functions --------------------------------------------------
+    
+    def __init__(self, query, hits, pages, elapsed):
+        self.query = query # the query
+        self.hits = hits # hits list
+        self.sort = None # hits are unsorted initially
+        self.pages = pages # number of pages in the wiki
+        self.elapsed = elapsed # search time
+
+    def sortByWeight(self):
+        """ Sorts found pages by the weight of the matches """
+        tmp = [(hit.weight(), hit.page_name, hit) for hit in self.hits]
+        tmp.sort()
+        tmp.reverse()
+        self.hits = [item[2] for item in tmp]
+        self.sort = 'weight'
+        
+    def sortByPagename(self):
+        """ Sorts a list of found pages alphabetical by page name """
+        tmp = [(hit.page_name, hit) for hit in self.hits]
+        tmp.sort()
+        self.hits = [item[1] for item in tmp]
+        self.sort = 'page_name'
+        
+    def stats(self, request, formatter):
+        """ Return search statistics, formatted with formatter
+
+        @param request: current request
+        @param formatter: formatter to use
+        @rtype: unicode
+        @return formatted statistics
+        """
+        _ = request.getText
+        output = [
+            formatter.paragraph(1),
+            formatter.text(_("%(hits)d results out of about %(pages)d pages.") %
+                   {'hits': len(self.hits), 'pages': self.pages}),
+            u' (%s)' % formatter.text(_("%.2f seconds") % self.elapsed),
+            formatter.paragraph(0),
+            ]
+        return ''.join(output)
+
+    def pageList(self, request, formatter, info=0, numbered=1):
+        """ Format a list of found pages
+
+        @param request: current request
+        @param formatter: formatter to use
+        @param info: show match info in title
+        @param numbered: use numbered list for display
+        @rtype: unicode
+        @return formatted page list
+        """
+        self._reset(request, formatter)
+        f = formatter
+        write = self.buffer.write
+        if numbered:
+            list = f.number_list
+        else:
+            list = f.bullet_list
+
+        # Add pages formatted as list
+        if self.hits:
+            write(list(1))
+
+            for page in self.hits:
+                if page.attachment:
+                    querydict = {
+                        'action': 'AttachFile',
+                        'do': 'get',
+                        'target': page.attachment,
+                    }
+                else:
+                    querydict = None
+                querystr = self.querystring(querydict)
+            
+                matchInfo = ''
+                if info:
+                    matchInfo = self.formatInfo(f, page)
+                item = [
+                    f.listitem(1),
+                    f.pagelink(1, page.page_name, querystr=querystr),
+                    self.formatTitle(page),
+                    f.pagelink(0, page.page_name),
+                    matchInfo,
+                    f.listitem(0),
+                    ]
+                write(''.join(item))
+            write(list(0))
+
+        return self.getvalue()
+
+    def pageListWithContext(self, request, formatter, info=1, context=180,
+                            maxlines=1):
+        """ Format a list of found pages with context
+
+        The default parameter values will create Google-like search
+        results, as this is the best-known search interface. A good
+        interface is a familiar interface, so unless we have a much better
+        solution (we don't), being like Google is the way.
+
+        @param request: current request
+        @param formatter: formatter to use
+        @param info: show match info near the page link
+        @param context: how many characters to show around each match. 
+        @param maxlines: how many contexts lines to show. 
+        @rtype: unicode
+        @return formatted page list with context
+        """
+        self._reset(request, formatter)
+        f = formatter
+        write = self.buffer.write
+        
+        # Add pages formatted as definition list
+        if self.hits:
+            write(f.definition_list(1))
+
+            for page in self.hits:
+                matchInfo = ''
+                if info:
+                    matchInfo = self.formatInfo(f, page)
+                if page.attachment:
+                    fmt_context = ""
+                    querydict = {
+                        'action': 'AttachFile',
+                        'do': 'get',
+                        'target': page.attachment,
+                    }
+                elif page.page_name.startswith('FS/'): # XXX FS hardcoded
+                    fmt_context = ""
+                    querydict = None
+                else:
+                    fmt_context = self.formatContext(page, context, maxlines)
+                    querydict = None
+                querystr = self.querystring(querydict)
+                item = [
+                    f.definition_term(1),
+                    f.pagelink(1, page.page_name, querystr=querystr),
+                    self.formatTitle(page),
+                    f.pagelink(0, page.page_name),
+                    matchInfo,
+                    f.definition_term(0),
+                    f.definition_desc(1),
+                    fmt_context,
+                    f.definition_desc(0),
+                    ]
+                write(''.join(item))
+            write(f.definition_list(0))
+        
+        return self.getvalue()
+
+    # Private -----------------------------------------------------------
+
+    # These methods are not meant to be used by clients and may change
+    # without notice.
+    
+    def formatContext(self, page, context, maxlines):
+        """ Format search context for each matched page
+
+        Try to show first maxlines interesting matches context.
+        """
+        f = self.formatter
+        if not page.page:
+            page.page = Page(self.request, page.page_name)
+        body = page.page.get_raw_body()
+        last = len(body) - 1
+        lineCount = 0
+        output = []
+        
+        # Get unique text matches sorted by match.start, try to ignore
+        # matches in page header, and show the first maxlines matches.
+        # TODO: when we implement weight algorithm for text matches, we
+        # should get the list of text matches sorted by weight and show
+        # the first maxlines matches.
+        matches = page.get_matches(unique=1, sort='start', type=TextMatch)
+        i, start = self.firstInterestingMatch(page, matches)
+
+        # Format context
+        while i < len(matches) and lineCount < maxlines:
+            match = matches[i]
+            
+            # Get context range for this match
+            start, end = self.contextRange(context, match, start, last)
+
+            # Format context lines for matches. Each complete match in
+            # the context will be highlighted, and if the full match is
+            # in the context, we increase the index, and will not show
+            # same match again on a separate line.
+
+            output.append(f.text(u'...'))
+            
+            # Get the index of the first match completely within the
+            # context.
+            for j in xrange(0, len(matches)):
+                if matches[j].start >= start:
+                    break
+
+            # Add all matches in context and the text between them 
+            while True:
+                match = matches[j]
+                # Ignore matches behind the current position
+                if start < match.end:
+                    # Append the text before match
+                    if start < match.start:
+                        output.append(f.text(body[start:match.start]))
+                    # And the match
+                    output.append(self.formatMatch(body, match, start))
+                    start = match.end
+                # Get next match, but only if it's completely within the context
+                if j < len(matches) - 1 and matches[j + 1].end <= end:
+                    j += 1
+                else:
+                    break
+
+            # Add text after last match and finish the line
+            if match.end < end:
+                output.append(f.text(body[match.end:end]))
+            output.append(f.text(u'...'))
+            output.append(f.linebreak(preformatted=0))
+
+            # Increase line and point to the next match
+            lineCount += 1
+            i = j + 1
+
+        output = ''.join(output)
+
+        if not output:
+            # Return the first context characters from the page text
+            output = f.text(page.page.getPageText(length=context))
+            output = output.strip()
+            if not output:
+                # This is a page with no text, only header, for example,
+                # a redirect page.
+                output = f.text(page.page.getPageHeader(length=context))
+        
+        return output
+        
+    def firstInterestingMatch(self, page, matches):
+        """ Return the first interesting match
+
+        This function is needed only because we don't have yet a weight
+        algorithm for page text matches.
+        
+        Try to find the first match in the page text. If we can't find
+        one, we return the first match and start=0.
+
+        @rtype: tuple
+        @return: index of first match, start of text
+        """
+        header = page.page.getPageHeader()
+        start = len(header)
+        # Find first match after start
+        for i in xrange(len(matches)):
+            if matches[i].start >= start:
+                return i, start
+        return 0, 0
+
+    def contextRange(self, context, match, start, last):
+        """ Compute context range
+
+        Add context around each match. If there is no room for context
+        before or after the match, show more context on the other side.
+
+        @param context: context length
+        @param match: current match
+        @param start: context should not start before that index, unless
+                      end is past the last character.
+        @param last: last character index
+        @rtype: tuple
+        @return: start, end of context
+        """
+        # Start by giving equal context on both sides of match
+        contextlen = max(context - len(match), 0)
+        cstart = match.start - contextlen / 2
+        cend = match.end + contextlen / 2
+
+        # If the context starts before start, give more context at the end
+        if cstart < start:
+            cend += start - cstart
+            cstart = start
+            
+        # But if the end is past last, give the surplus context back to the start
+        if cend > last:
+            cstart -= cend - last
+            cend = last
+
+        # Keep context start positive for very short texts
+        cstart = max(cstart, 0)
+
+        return cstart, cend
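+    # Worked example for contextRange (illustrative): context=30 and a match
+    # spanning 50..60 give contextlen = max(30 - 10, 0) = 20, i.e. 40..70;
+    # if start were 45, the range would shift to 45..75 instead.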
+
+    def formatTitle(self, page):
+        """ Format page title
+
+        Invoke format match on all unique matches in page title.
+
+        @param page: found page
+        @rtype: unicode
+        @return: formatted title
+        """
+        # Get unique title matches sorted by match.start
+        matches = page.get_matches(unique=1, sort='start', type=TitleMatch)
+        
+        # Format
+        pagename = page.page_name
+        f = self.formatter
+        output = []
+        start = 0
+        for match in matches:
+            # Ignore matches behind the current position
+            if start < match.end:
+                # Append the text before the match
+                if start < match.start:
+                    output.append(f.text(pagename[start:match.start]))
+                # And the match
+                output.append(self.formatMatch(pagename, match, start))
+                start = match.end
+        # Add text after match
+        if start < len(pagename):
+            output.append(f.text(pagename[start:]))
+        
+        if page.attachment: # show the attachment that matched
+            output.extend([
+                    " ",
+                    f.strong(1),
+                    f.text("(%s)" % page.attachment),
+                    f.strong(0)])
+
+        return ''.join(output)
+
+    def formatMatch(self, body, match, location):
+        """ Format single match in text
+
+        Format the part of the match after the current location in the
+        text. Matches behind location are ignored and an empty string is
+        returned.
+
+        @param body: text containing match
+        @param match: search match in text
+        @param location: current location in text
+        @rtype: unicode
+        @return: formatted match or empty string
+        """        
+        start = max(location, match.start)
+        if start < match.end:
+            f = self.formatter
+            output = [
+                f.strong(1),
+                f.text(body[start:match.end]),
+                f.strong(0),
+                ]
+            return ''.join(output)
+        return ''
+
+    def querystring(self, querydict=None):
+        """ Return query string, used in the page link """
+        if querydict is None:
+            querydict = {'highlight': self.query.highlight_re()}
+        querystr = wikiutil.makeQueryString(querydict)
+        #querystr = wikiutil.escape(querystr)
+        return querystr
+
+    def formatInfo(self, formatter, page):
+        """ Return formatted match info """
+        template = u' . . . %s %s'
+        template = u"%s%s%s" % (formatter.span(1, css_class="info"),
+                                template,
+                                formatter.span(0))
+        # Count number of unique matches in text of all types
+        count = len(page.get_matches(unique=1))
+        info = template % (count, self.matchLabel[count != 1])
+        return info
+
+    def getvalue(self):
+        """ Return output in div with CSS class """
+        write = self.request.write
+        value = [
+            self.formatter.div(1, css_class='searchresults'),
+            self.buffer.getvalue(),
+            self.formatter.div(0),
+            ]
+        return '\n'.join(value)
+
+    def _reset(self, request, formatter):
+        """ Update internal state before new output
+
+        Do not call this, it should be called only by the instance code.
+
+        Each request might need different translations or other user
+        preferences.
+        """
+        self.buffer = StringIO.StringIO()
+        self.formatter = formatter
+        self.request = request
+        # Use 1 match, 2 matches...
+        _ = request.getText
+        self.matchLabel = (_('match'), _('matches'))
+
+
--- a/MoinMoin/xmlrpc/__init__.py	Tue Jun 27 13:58:39 2006 +0200
+++ b/MoinMoin/xmlrpc/__init__.py	Tue Jun 27 15:09:46 2006 +0200
@@ -484,8 +484,7 @@
 
     def xmlrpc_searchPages(self, query_string):
         from MoinMoin import search
-        query = search.QueryParser().parse_query(query_string)
-        results = search.searchPages(self.request, query)
+        results = search.searchPages(self.request, query_string)
         results.formatter = self.request.html_formatter
         results.request = self.request
         return [(self._outstr(hit.page_name),
--- a/docs/CHANGES.fpletz	Tue Jun 27 13:58:39 2006 +0200
+++ b/docs/CHANGES.fpletz	Tue Jun 27 15:09:46 2006 +0200
@@ -89,3 +89,7 @@
     * Xapian.use_stemming -> request.cfg.xapian_stemming
     * Fixed bug in the selection of the stemming language
 
+2006-06-27
+    * Splitting out MoinMoin/search.py to MoinMoin/search/*.py, no more
+      need to invoke QueryParser manually when using searchPages
+