Mercurial > moin > 1.9
changeset 947:41f6f7708466
merged xapian branch
author | Thomas Waldmann <tw AT waldmann-edv DOT de> |
---|---|
date | Mon, 17 Jul 2006 03:08:12 +0200 |
parents | 7c8e8d370740 (current diff) 72aeb2ba133d (diff) |
children | 28ea5b3802b1 |
files | MoinMoin/script/index/build.py |
diffstat | 6 files changed, 77 insertions(+), 28 deletions(-) [+] |
line wrap: on
line diff
--- a/MoinMoin/script/index/build.py Mon Jul 17 01:50:55 2006 +0200 +++ b/MoinMoin/script/index/build.py Mon Jul 17 03:08:12 2006 +0200 @@ -22,7 +22,7 @@ ) self.parser.add_option( "--mode", metavar="MODE", dest="mode", - help="either add (unconditionally add to index) or update (update an existing index)" + help="either add (unconditionally add to index), update (update an existing index) or rebuild (remove and add)" ) def mainloop(self): @@ -40,5 +40,4 @@ def command(self): from MoinMoin.search.Xapian import Index Index(self.request).indexPages(self.files, self.options.mode) - #Index(self.request).test(self.request)
--- a/MoinMoin/search/Xapian.py Mon Jul 17 01:50:55 2006 +0200 +++ b/MoinMoin/search/Xapian.py Mon Jul 17 03:08:12 2006 +0200 @@ -8,7 +8,7 @@ """ debug = True -import sys, os, re, codecs, time +import sys, os, re, codecs, time, os from pprint import pprint import xapian @@ -237,6 +237,13 @@ Assumes that the write lock is acquired """ fs_rootpage = 'FS' # XXX FS hardcoded + + # rebuilding the DB: delete it and add everything + if mode == 'rebuild': + for f in os.listdir(self.dir): + os.unlink(f) + mode = 'add' + try: wikiname = request.cfg.interwikiname or 'Self' itemid = "%s:%s" % (wikiname, os.path.join(fs_rootpage, filename))
--- a/MoinMoin/search/builtin.py Mon Jul 17 01:50:55 2006 +0200 +++ b/MoinMoin/search/builtin.py Mon Jul 17 03:08:12 2006 +0200 @@ -149,7 +149,7 @@ lock_dir = os.path.join(main_dir, 'index-lock') self.lock = lock.WriteLock(lock_dir, timeout=3600.0, readlocktimeout=60.0) - self.read_lock = lock.ReadLock(lock_dir, timeout=3600.0) + #self.read_lock = lock.ReadLock(lock_dir, timeout=3600.0) self.queue = UpdateQueue(os.path.join(main_dir, 'update-queue'), os.path.join(main_dir, 'update-queue-lock')) @@ -172,12 +172,12 @@ raise NotImplemented def search(self, query): - if not self.read_lock.acquire(1.0): - raise self.LockedException - try: - hits = self._search(query) - finally: - self.read_lock.release() + #if not self.read_lock.acquire(1.0): + # raise self.LockedException + #try: + hits = self._search(query) + #finally: + # self.read_lock.release() return hits def update_page(self, page): @@ -415,17 +415,25 @@ else: return self._moinSearch(pages) + def _xapianMatchDecider(self, term, pos): + if term[0] == 'S': # TitleMatch + return TitleMatch(start=pos, end=pos+len(term)-1) + else: # TextMatch (incl. headers) + return TextMatch(start=pos, end=pos+len(term)) + def _xapianMatch(self, page, uid): - matches = [] + """ Get all relevant Xapian matches per document id """ + positions = {} term = self._xapianEnquire.get_matching_terms_begin(uid) - #print hit['uid'] while term != self._xapianEnquire.get_matching_terms_end(uid): - print term.get_term(), ':', list(self._xapianIndex.termpositions(uid, term.get_term())) - for pos in self._xapianIndex.termpositions(uid, term.get_term()): - matches.append(TextMatch(start=pos, - end=pos+len(term.get_term()))) + term_name = term.get_term() + for pos in self._xapianIndex.termpositions(uid,term.get_term()): + if pos not in positions or \ + len(positions[pos]) < len(term_name): + positions[pos] = term_name term.next() - return matches + return [self._xapianMatchDecider(term, pos) for pos, term + in positions.iteritems()] def _moinSearch(self, pages=None): """ Search pages using moin's built-in full text search @@ -444,9 +452,11 @@ return hits def _moinMatch(self, page, uid): + """ Just kick off regular moinSearch """ return self.query.search(page) def _getHits(self, pages, matchSearchFunction): + """ Get the hit tuples in pages through matchSearchFunction """ hits = [] fs_rootpage = self.fs_rootpage for hit in pages: @@ -455,6 +465,7 @@ uid = hit['uid'] else: valuedict = hit + uid = None wikiname = valuedict['wikiname'] pagename = valuedict['pagename'] @@ -468,9 +479,9 @@ else: hits.append((wikiname, page, attachment, None)) else: - match = matchSearchFunction(page, uid) - if match: - hits.append((wikiname, page, attachment, match)) + matches = matchSearchFunction(page, uid) + if matches: + hits.append((wikiname, page, attachment, matches)) else: # other wiki hits.append((wikiname, pagename, attachment, None)) return hits
--- a/MoinMoin/search/results.py Mon Jul 17 01:50:55 2006 +0200 +++ b/MoinMoin/search/results.py Mon Jul 17 03:08:12 2006 +0200 @@ -494,7 +494,8 @@ start = len(header) # Find first match after start for i in xrange(len(matches)): - if matches[i].start >= start: + if matches[i].start >= start and \ + isinstance(matches[i], TextMatch): return i, start return 0, 0
--- a/MoinMoin/support/xapwrap/document.py Mon Jul 17 01:50:55 2006 +0200 +++ b/MoinMoin/support/xapwrap/document.py Mon Jul 17 03:08:12 2006 +0200 @@ -145,6 +145,9 @@ # add text fields for field in self.textFields: + # XXX: terms textFields won't get numbered + # after each other, needed for titles + position = 0 for token in analyzer.tokenize(field.text): if isinstance(token, tuple): token, position = token @@ -163,19 +166,20 @@ # the process, the string length could expand, so we # need to check here as well. d.add_posting(checkKeyLen(token), position) - position += INTER_FIELD_POSITION_GAP + #position += INTER_FIELD_POSITION_GAP if field.prefix: prefix = field.name for token in analyzer.tokenize(field.text): if isinstance(token, tuple): - token = token[0] + token, position = token + else: + position += 1 # token is unicode, but gets converted to UTF-8 # by makePairForWrite: term = makePairForWrite(prefix, token, prefixMap) d.add_posting(term, position) - position += 1 - position += INTER_FIELD_POSITION_GAP + #position += INTER_FIELD_POSITION_GAP # add keyword fields for field in self.keywords:
--- a/docs/CHANGES.fpletz Mon Jul 17 01:50:55 2006 +0200 +++ b/docs/CHANGES.fpletz Mon Jul 17 03:08:12 2006 +0200 @@ -5,14 +5,18 @@ * Only term-based regex searching possible, modifier or heuristic to enable usage of _moinSearch for full compatibility? * HACK: MoinMoin.Xapian.Index._get_languages (wait for proper metadata) + * Positions saved in Xapian aren't always correct, check. Code + generally needs some more love. ToDo: * Implement the new search UI * Write/update documentation for all the new search stuff * Indexing and searching of categories (new term prefix) - * Drop _moinSearch when using Xapian and use term positions provided - by Xapian itself, needs some reworking of WikiAnalyzer/xapwrap to - get the position of stemmed words right + * Reevaluate Xapwrap, possibly drop it and rip out usable stuff + (i.e. ExceptionTranslator) + * Add stemming support for highlighting stuff: + 1. regexp for whole word (all lowercase), or + 2. just the root of the word New Features: * Faster search thanks to Xapian @@ -122,3 +126,26 @@ * Basic (quick and dirty, limitations and bugs included, but commit-ready) implementation of getting matches out of the Xapian DB +2006-07-08 + * No work: daytrip to Munich + +2006-07-09 + * Bugfix for _moinSearch (not using Xapian) + +2006-07-11 + * Make matches which we get from Xapian more reliable + * Add TitleMatch support + * Xapwrap needed some tuning (aka hacking), think about dropping + and/or rewriting much of its code as it doesn't always fit (and + probably won't in the future) + +2006-07-12 +2006-07-13 + * No work + +2006-07-14 + * Minor bugfix for TitleMatch, now works correctly + * First interesting match must be a TextMatch + * Comment read_lock code from BaseIndex (should not be needed) + * Support complete rebuild of the database (delete and add) +