Mercurial > moin > 1.9
changeset 945:248789a3f155
Improve positions fetched from Xapian, add TitleMatch support, and fix bugs in the current code
author | Franz Pletz <fpletz AT franz-pletz DOT org> |
---|---|
date | Fri, 14 Jul 2006 13:17:15 +0200 |
parents | 28ae528ca238 |
children | 72aeb2ba133d |
files | MoinMoin/search/builtin.py MoinMoin/search/results.py MoinMoin/support/xapwrap/document.py docs/CHANGES.fpletz |
diffstat | 4 files changed, 56 insertions(+), 18 deletions(-) [+] |
line wrap: on
line diff
--- a/MoinMoin/search/builtin.py Fri Jul 07 12:30:34 2006 +0200 +++ b/MoinMoin/search/builtin.py Fri Jul 14 13:17:15 2006 +0200 @@ -415,17 +415,25 @@ else: return self._moinSearch(pages) + def _xapianMatchDecider(self, term, pos): + if term[0] == 'S': # TitleMatch + return TitleMatch(start=pos, end=pos+len(term)-1) + else: # TextMatch (incl. headers) + return TextMatch(start=pos, end=pos+len(term)) + def _xapianMatch(self, page, uid): - matches = [] + """ Get all relevant Xapian matches per document id """ + positions = {} term = self._xapianEnquire.get_matching_terms_begin(uid) - #print hit['uid'] while term != self._xapianEnquire.get_matching_terms_end(uid): - print term.get_term(), ':', list(self._xapianIndex.termpositions(uid, term.get_term())) - for pos in self._xapianIndex.termpositions(uid, term.get_term()): - matches.append(TextMatch(start=pos, - end=pos+len(term.get_term()))) + term_name = term.get_term() + for pos in self._xapianIndex.termpositions(uid,term.get_term()): + if pos not in positions or \ + len(positions[pos]) < len(term_name): + positions[pos] = term_name term.next() - return matches + return [self._xapianMatchDecider(term, pos) for pos, term + in positions.iteritems()] def _moinSearch(self, pages=None): """ Search pages using moin's built-in full text search @@ -444,9 +452,11 @@ return hits def _moinMatch(self, page, uid): + """ Just kick off regular moinSearch """ return self.query.search(page) def _getHits(self, pages, matchSearchFunction): + """ Get the hit tuples in pages through matchSearchFunction """ hits = [] fs_rootpage = self.fs_rootpage for hit in pages: @@ -455,6 +465,7 @@ uid = hit['uid'] else: valuedict = hit + uid = None wikiname = valuedict['wikiname'] pagename = valuedict['pagename'] @@ -468,9 +479,9 @@ else: hits.append((wikiname, page, attachment, None)) else: - match = matchSearchFunction(page, uid) - if match: - hits.append((wikiname, page, attachment, match)) + matches = matchSearchFunction(page, uid) + if matches: + 
hits.append((wikiname, page, attachment, matches)) else: # other wiki hits.append((wikiname, pagename, attachment, None)) return hits
--- a/MoinMoin/search/results.py Fri Jul 07 12:30:34 2006 +0200 +++ b/MoinMoin/search/results.py Fri Jul 14 13:17:15 2006 +0200 @@ -494,7 +494,8 @@ start = len(header) # Find first match after start for i in xrange(len(matches)): - if matches[i].start >= start: + if matches[i].start >= start and \ + isinstance(matches[i], TextMatch): return i, start return 0, 0
--- a/MoinMoin/support/xapwrap/document.py Fri Jul 07 12:30:34 2006 +0200 +++ b/MoinMoin/support/xapwrap/document.py Fri Jul 14 13:17:15 2006 +0200 @@ -145,6 +145,9 @@ # add text fields for field in self.textFields: + # XXX: terms textFields won't get numbered + # after each other, needed for titles + position = 0 for token in analyzer.tokenize(field.text): if isinstance(token, tuple): token, position = token @@ -163,19 +166,20 @@ # the process, the string length could expand, so we # need to check here as well. d.add_posting(checkKeyLen(token), position) - position += INTER_FIELD_POSITION_GAP + #position += INTER_FIELD_POSITION_GAP if field.prefix: prefix = field.name for token in analyzer.tokenize(field.text): if isinstance(token, tuple): - token = token[0] + token, position = token + else: + position += 1 # token is unicode, but gets converted to UTF-8 # by makePairForWrite: term = makePairForWrite(prefix, token, prefixMap) d.add_posting(term, position) - position += 1 - position += INTER_FIELD_POSITION_GAP + #position += INTER_FIELD_POSITION_GAP # add keyword fields for field in self.keywords:
--- a/docs/CHANGES.fpletz Fri Jul 07 12:30:34 2006 +0200 +++ b/docs/CHANGES.fpletz Fri Jul 14 13:17:15 2006 +0200 @@ -5,14 +5,15 @@ * Only term-based regex searching possible, modifier or heuristic to enable usage of _moinSearch for full compatibility? * HACK: MoinMoin.Xapian.Index._get_languages (wait for proper metadata) + * Positions saved in Xapian aren't always correct, check. Code + generally needs some more love. ToDo: * Implement the new search UI * Write/update documentation for all the new search stuff * Indexing and searching of categories (new term prefix) - * Drop _moinSearch when using Xapian and use term positions provided - by Xapian itself, needs some reworking of WikiAnalyzer/xapwrap to - get the position of stemmed words right + * Reevaluate Xapwrap, possibly drop it and rip out usable stuff + (i.e. ExceptionTranslator) New Features: * Faster search thanks to Xapian @@ -122,3 +123,24 @@ * Basic (quick and dirty, limitations and bugs included, but commit-ready) implementation of getting matches out of the Xapian DB +2006-07-08 + * No work: daytrip to Munich + +2006-07-09 + * Bugfix for _moinSearch (not using Xapian) + +2006-07-11 + * Make matches which we get from Xapian more reliable + * Add TitleMatch support + * Xapwrap needed some tuning (aka hacking), think about dropping + and/or rewriting much of its code as it doesn't always fit (and + probably won't in the future) + +2006-07-12 +2006-07-13 + * No work + +2006-07-14 + * Minor bugfix for TitleMatch, now works correctly + * First interesting match must be a TextMatch +