Mercurial > moin > 1.9
changeset 5027:f531ccc68313
Xapian2009: xapian_term() was refactored for the TextSearch and TitleSearch. Title and content fields are tokenized (and stemmed) in the search index.
author:   Dmitrijs Milajevs <dimazest@gmail.com>
date:     Thu, 20 Aug 2009 19:28:34 +0200
parents:  deb2e2d5326e
children: aafcd2b5597a
files:    MoinMoin/search/Xapian/indexing.py MoinMoin/search/_tests/test_search.py MoinMoin/search/queryparser/expressions.py
diffstat: 3 files changed, 82 insertions(+), 102 deletions(-)
line wrap: on
line diff
--- a/MoinMoin/search/Xapian/indexing.py Thu Aug 20 16:54:49 2009 +0200 +++ b/MoinMoin/search/Xapian/indexing.py Thu Aug 20 19:28:34 2009 +0200 @@ -17,6 +17,7 @@ from MoinMoin.support import xappy from MoinMoin.parser.text_moin_wiki import Parser as WikiParser from MoinMoin.search.builtin import BaseIndex +from MoinMoin.search.Xapian.tokenizer import WikiAnalyzer from MoinMoin.Page import Page from MoinMoin import config, wikiutil @@ -91,7 +92,8 @@ self.add_field_action('revision', INDEX_EXACT) self.add_field_action('mimetype', INDEX_EXACT) self.add_field_action('mimetype', STORE_CONTENT) - self.add_field_action('title', INDEX_FREETEXT, weight=5) + self.add_field_action('title', INDEX_FREETEXT, weight=100) + self.add_field_action('title', STORE_CONTENT) self.add_field_action('content', INDEX_FREETEXT, spell=True) self.add_field_action('fulltitle', INDEX_EXACT) self.add_field_action('fulltitle', STORE_CONTENT) @@ -106,6 +108,15 @@ self.add_field_action('category', INDEX_EXACT) self.add_field_action('category', STORE_CONTENT) +class StemmedField(xappy.Field): + + def __init__(self, name, value, request): + + analyzer = WikiAnalyzer(request=request, language=request.cfg.language_default) + tokens = analyzer.tokenize(value) + value = ''.join(('%s %s' % (word, stemmed) for word, stemmed in analyzer.tokenize(value))) + super(StemmedField, self).__init__(name, value) + class Index(BaseIndex): @@ -241,14 +252,11 @@ doc.fields.append(xappy.Field('mtime', str(mtime))) doc.fields.append(xappy.Field('revision', '0')) title = " ".join(os.path.join(fs_rootpage, filename).split("/")) - doc.fields.append(xappy.Field('title', title)) + doc.fields.append(StemmedField('title', title, request)) mimetype, file_content = self.contentfilter(filename) doc.fields.extend([xappy.Field('mimetype', mt) for mt in [mimetype, ] + mimetype.split('/')]) - doc.fields.append(xappy.Field('content', file_content)) - - # Stemming - # doc.analyzerFactory = getWikiAnalyzerFactory() + 
doc.fields.append(StemmedField('content', file_content, request)) connection.replace(doc) @@ -372,7 +380,7 @@ doc.fields.append(xappy.Field('mtime', str(mtime))) doc.fields.append(xappy.Field('revision', '0')) - doc.fields.append(xappy.Field('title', '%s/%s' % (pagename, att))) + doc.fields.append(StemmedField('title', '%s/%s' % (pagename, att), request)) doc.fields.append(xappy.Field('lang', language)) doc.fields.append(xappy.Field('stem_lang', stem_language)) @@ -380,11 +388,9 @@ mimetype, att_content = self.contentfilter(filename) doc.fields.extend([xappy.Field('mimetype', mt) for mt in [mimetype, ] + mimetype.split('/')]) - doc.fields.append(xappy.Field('content', att_content)) + doc.fields.append(StemmedField('content', att_content, request)) doc.fields.extend([xappy.Field('domain', domain) for domain in domains]) - # XXX Stemming - # doc.analyzerFactory = getWikiAnalyzerFactory(request, stem_language) connection.replace(doc) def _index_page_rev(self, request, connection, page, mode='update'): @@ -433,7 +439,7 @@ doc.fields.append(xappy.Field('attachment', '')) # this is a real page, not an attachment doc.fields.append(xappy.Field('mtime', str(mtime))) doc.fields.append(xappy.Field('revision', revision)) - doc.fields.append(xappy.Field('title', pagename)) + doc.fields.append(StemmedField('title', pagename, request)) doc.fields.append(xappy.Field('lang', language)) doc.fields.append(xappy.Field('stem_lang', stem_language)) @@ -446,11 +452,7 @@ doc.fields.extend([xappy.Field('linkto', pagelink) for pagelink in page.getPageLinks(request)]) doc.fields.extend([xappy.Field('category', category) for category in categories]) doc.fields.extend([xappy.Field('domain', domain) for domain in domains]) - - doc.fields.append(xappy.Field('content', page.get_raw_body())) - - # XXX Stemming - # doc.analyzerFactory = getWikiAnalyzerFactory(request, stem_language) + doc.fields.append(StemmedField('content', page.get_raw_body(), request)) logging.debug("%s (replace %r)" % 
(pagename, itemid)) connection.replace(doc)
--- a/MoinMoin/search/_tests/test_search.py Thu Aug 20 16:54:49 2009 +0200 +++ b/MoinMoin/search/_tests/test_search.py Thu Aug 20 19:28:34 2009 +0200 @@ -89,6 +89,7 @@ u'FrontPage': None, u'RecentChanges': None, u'HelpOnCreoleSyntax': None, + u'HelpOnEditing': None, u'HelpIndex': None} def setup_class(self): @@ -116,7 +117,8 @@ searches = {u'title:SearchTestPage': 1, u'title:LanguageSetup': 1, u'title:HelpIndex': 1, - u'title:Help': 2, + u'title:Help': 3, + u'title:HelpOn': 2, u'title:SearchTestNotExisting': 0} def test(query, res_count): @@ -205,18 +207,18 @@ def test_mimetype_search_simple(self): result = self.search(u'mimetype:text/wiki') - assert len(result.hits) == 10 + assert len(result.hits) == 11 def test_mimetype_search_re(self): result = self.search(ur'mimetype:re:\btext/wiki\b') - assert len(result.hits) == 10 + assert len(result.hits) == 11 result = self.search(ur'category:re:\bCategoryHomepa\b') assert not result.hits def test_language_search_simple(self): result = self.search(u'language:en') - assert len(result.hits) == 10 + assert len(result.hits) == 11 def test_domain_search_simple(self): result = self.search(u'domain:system') @@ -248,6 +250,9 @@ result = self.search(u"-title:FrontPage") assert len(result.hits) == len(self.pages) - 1 + result = self.search(u"-title:HelpOn") + assert len(result.hits) == len(self.pages) - 2 + def testFullSearchNegatedFindAll(self): """ search: negated full search for some string that does not exist results in all pages """ result = self.search(u"-%s" % self.doesnotexist)
--- a/MoinMoin/search/queryparser/expressions.py Thu Aug 20 16:54:49 2009 +0200 +++ b/MoinMoin/search/queryparser/expressions.py Thu Aug 20 19:28:34 2009 +0200 @@ -22,6 +22,11 @@ try: from MoinMoin.search import Xapian from MoinMoin.search.Xapian import Query, UnicodeQuery + + OP_AND = Query.OP_AND + OP_OR = Query.OP_OR + OP_AND_NOT = Query.OP_AND_NOT + except ImportError: pass @@ -143,7 +148,7 @@ if self.search_re.match(term): queries.append(connection.query_field(field_to_check, term)) - return Query(Query.OP_OR, queries) + return Query(OP_OR, queries) def xapian_need_postproc(self): return self.case @@ -248,17 +253,17 @@ # prepare query for not negated terms if terms: - query = Query(Query.OP_AND, terms) + query = Query(OP_AND, terms) else: query = Query('') # MatchAll # prepare query for negated terms if not_terms: - query_negated = Query(Query.OP_OR, not_terms) + query_negated = Query(OP_OR, not_terms) else: query_negated = Query() - return Query(Query.OP_AND_NOT, query, query_negated) + return Query(OP_AND_NOT, query, query_negated) class OrExpression(AndExpression): @@ -298,10 +303,48 @@ def xapian_term(self, request, connection): # XXX: negated terms managed by _moinSearch? 
- return Query(Query.OP_OR, [term.xapian_term(request, connection) for term in self._subterms]) + return Query(OP_OR, [term.xapian_term(request, connection) for term in self._subterms]) + +class BaseTextFieldSearch(BaseExpression): + + _field_to_search = None + + def xapian_term(self, request, connection): + if self.use_re: + queries = [self._get_query_for_search_re(connection, self._field_to_search)] + else: + queries = [] + stemmed = [] + analyzer = Xapian.WikiAnalyzer(request=request, language=request.cfg.language_default) + + for term in self._pattern.split(): + query_term = connection.query_field(self._field_to_search, term) + + tokens = analyzer.tokenize(term) + + if request.cfg.xapian_stemming: + query_tokens = Query(OP_AND, + [Query(OP_OR, + [connection.query_field(self._field_to_search, token), + connection.query_field(self._field_to_search, stemmed)]) for token, stemmed in tokens if token != term]) + + stemmed.extend(stemmed for term, stemmed in analyzer.tokenize(term)) + + else: + tokens = analyzer.tokenize(term) + query_tokens = Query(OP_AND, [connection.query_field(self._field_to_search, token) for token, stemmed in tokens if token != term]) + + queries.append(Query(OP_OR, [query_term, query_tokens])) + + if not self.case and stemmed: + new_pat = ' '.join(stemmed) + self._pattern = new_pat + self.pattern, self.search_re = self._build_re(new_pat, use_re=False, case=self.case, stemmed=True) + + return Query(OP_AND, queries) -class TextSearch(BaseExpression): +class TextSearch(BaseTextFieldSearch): """ A term that does a normal text search Both page content and the page title are searched, using an @@ -309,6 +352,7 @@ """ costs = 10000 + _field_to_search = 'content' def highlight_re(self): return u"(%s)" % self.pattern @@ -329,49 +373,19 @@ return matches def xapian_term(self, request, connection): - # XXX next version of xappy (>0.5) will provide Query class - # it should be used. 
- if self.use_re: - queries = [self._get_query_for_search_re(connection)] - else: - analyzer = Xapian.WikiAnalyzer(request=request, language=request.cfg.language_default) - terms = self._pattern.split() - - # all parsed wikiwords, AND'ed - queries = [] - stemmed = [] - for term in terms: - if request.cfg.xapian_stemming: - # stemmed OR not stemmed - t = [] - for w, s, pos in analyzer.tokenize(term, flat_stemming=False): - query_word = connection.query_field('content', w) - query_stemmed = connection.query_field('content', s) - # XXX UnicodeQuery was used here! - t.append(Query(Query.OP_OR, [query_word, query_stemmed])) - stemmed.append(s) - else: - # just not stemmed - t = [connection.query_field('content', w) for w, pos in analyzer.tokenize(term)] + content_query = super(TextSearch, self).xapian_term(request, connection) + title_query = TitleSearch(self._pattern, use_re=self.use_re, case=self.case).xapian_term(request, connection) - queries.append(Query(connection.OP_AND, t)) - - # XXX Is it required to change pattern and search_re here? - if not self.case and stemmed: - new_pat = ' '.join(stemmed) - self._pattern = new_pat - self.pattern, self.search_re = self._build_re(new_pat, use_re=False, case=self.case, stemmed=True) - - title_query = TitleSearch(self._pattern, use_re=self.use_re, case=self.case).xapian_term(request, connection) - return Query(Query.OP_OR, [title_query, Query(Query.OP_AND, queries)]) + return Query(OP_OR, [title_query, content_query]) -class TitleSearch(BaseExpression): +class TitleSearch(BaseTextFieldSearch): """ Term searches in pattern in page title only """ _tag = 'title:' costs = 100 + _field_to_search = 'title' def pageFilter(self): """ Page filter function for single title search """ @@ -391,47 +405,6 @@ return matches - def xapian_term(self, request, connection): - if self.use_re: - # XXX weight for a query! 
- queries = [self._get_query_for_search_re(connection, 'fulltitle')] - else: - analyzer = Xapian.WikiAnalyzer(request=request, - language=request.cfg.language_default) - terms = self._pattern.split() - terms = [[w for w, pos in analyzer.raw_tokenize(t)] for t in terms] - - # all parsed wikiwords, ANDed - queries = [] - stemmed = [] - for term in terms: - if request.cfg.xapian_stemming: - # stemmed OR not stemmed - t = [] - for w, s, pos in analyzer.tokenize(term, flat_stemming=False): - # XXX weight for a query 100! - query_word = connection.query_field('title', w) - query_stemmed = connection.query_field('title', s) - - # XXX UnicodeQuery was used here! - t.append(Query(Query.OP_OR, [query_word, query_stemmed])) - stemmed.append(s) - else: - # just not stemmed - # XXX weight for a query 100! - # XXX UnicodeQuery was used here! - t = [connection.query_field('title', w) for w, pos in analyzer.tokenize(term)] - - # XXX what should be there OR or AND?! - queries.append(Query(Query.OP_OR, t)) - - if not self.case and stemmed: - new_pat = ' '.join(stemmed) - self._pattern = new_pat - self._build_re(new_pat, use_re=False, case=self.case, stemmed=True) - - return Query(Query.OP_AND, queries) - class BaseFieldSearch(BaseExpression):