changeset 5027:f531ccc68313

Xapian2009: xapian_term() was refactored for TextSearch and TitleSearch. Title and content fields are now tokenized (and stemmed) in the search index.
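
The effect is easiest to see in miniature. The sketch below is illustrative
only (the helper names stemmed_value and term_query are made up for this
description); it assumes a WikiAnalyzer whose tokenize() yields (word, stem)
pairs and an English stemmer that maps "testing" to "test":

    from MoinMoin.search.Xapian import Query  # as imported in expressions.py

    # Indexing side: a StemmedField stores each word followed by its
    # stem, so a query can hit either form.
    def stemmed_value(analyzer, value):
        return ' '.join('%s %s' % (word, stem)
                        for word, stem in analyzer.tokenize(value))

    # stemmed_value(analyzer, u'testing pages') -> u'testing test pages page'

    # Query side: a search term matches the raw term, OR-ed with
    # (token OR stem) for every token the analyzer derives from it.
    def term_query(connection, field, term, analyzer):
        tokens = list(analyzer.tokenize(term))
        query_term = connection.query_field(field, term)
        query_tokens = Query(Query.OP_AND,
                             [Query(Query.OP_OR,
                                    [connection.query_field(field, token),
                                     connection.query_field(field, stem)])
                              for token, stem in tokens if token != term])
        return Query(Query.OP_OR, [query_term, query_tokens])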
author Dmitrijs Milajevs <dimazest@gmail.com>
date Thu, 20 Aug 2009 19:28:34 +0200
parents deb2e2d5326e
children aafcd2b5597a
files MoinMoin/search/Xapian/indexing.py MoinMoin/search/_tests/test_search.py MoinMoin/search/queryparser/expressions.py
diffstat 3 files changed, 82 insertions(+), 102 deletions(-)
--- a/MoinMoin/search/Xapian/indexing.py	Thu Aug 20 16:54:49 2009 +0200
+++ b/MoinMoin/search/Xapian/indexing.py	Thu Aug 20 19:28:34 2009 +0200
@@ -17,6 +17,7 @@
 from MoinMoin.support import xappy
 from MoinMoin.parser.text_moin_wiki import Parser as WikiParser
 from MoinMoin.search.builtin import BaseIndex
+from MoinMoin.search.Xapian.tokenizer import WikiAnalyzer
 
 from MoinMoin.Page import Page
 from MoinMoin import config, wikiutil
@@ -91,7 +92,8 @@
         self.add_field_action('revision', INDEX_EXACT)
         self.add_field_action('mimetype', INDEX_EXACT)
         self.add_field_action('mimetype', STORE_CONTENT)
-        self.add_field_action('title', INDEX_FREETEXT, weight=5)
+        self.add_field_action('title', INDEX_FREETEXT, weight=100)
+        self.add_field_action('title', STORE_CONTENT)
         self.add_field_action('content', INDEX_FREETEXT, spell=True)
         self.add_field_action('fulltitle', INDEX_EXACT)
         self.add_field_action('fulltitle', STORE_CONTENT)
@@ -106,6 +108,15 @@
         self.add_field_action('category', INDEX_EXACT)
         self.add_field_action('category', STORE_CONTENT)
 
+class StemmedField(xappy.Field):
+
+    def __init__(self, name, value, request):
+        # store every word together with its stem, e.g. "testing test"
+        analyzer = WikiAnalyzer(request=request, language=request.cfg.language_default)
+        tokens = analyzer.tokenize(value)
+        value = ' '.join('%s %s' % (word, stemmed) for word, stemmed in tokens)
+        super(StemmedField, self).__init__(name, value)
+
 
 class Index(BaseIndex):
 
@@ -241,14 +252,11 @@
                 doc.fields.append(xappy.Field('mtime', str(mtime)))
                 doc.fields.append(xappy.Field('revision', '0'))
                 title = " ".join(os.path.join(fs_rootpage, filename).split("/"))
-                doc.fields.append(xappy.Field('title', title))
+                doc.fields.append(StemmedField('title', title, request))
 
                 mimetype, file_content = self.contentfilter(filename)
                 doc.fields.extend([xappy.Field('mimetype', mt) for mt in [mimetype, ] + mimetype.split('/')])
-                doc.fields.append(xappy.Field('content', file_content))
-
-                # Stemming
-                # doc.analyzerFactory = getWikiAnalyzerFactory()
+                doc.fields.append(StemmedField('content', file_content, request))
 
                 connection.replace(doc)
 
@@ -372,7 +380,7 @@
 
                 doc.fields.append(xappy.Field('mtime', str(mtime)))
                 doc.fields.append(xappy.Field('revision', '0'))
-                doc.fields.append(xappy.Field('title', '%s/%s' % (pagename, att)))
+                doc.fields.append(StemmedField('title', '%s/%s' % (pagename, att), request))
 
                 doc.fields.append(xappy.Field('lang', language))
                 doc.fields.append(xappy.Field('stem_lang', stem_language))
@@ -380,11 +388,9 @@
 
                 mimetype, att_content = self.contentfilter(filename)
                 doc.fields.extend([xappy.Field('mimetype', mt) for mt in [mimetype, ] + mimetype.split('/')])
-                doc.fields.append(xappy.Field('content', att_content))
+                doc.fields.append(StemmedField('content', att_content, request))
                 doc.fields.extend([xappy.Field('domain', domain) for domain in domains])
 
-                # XXX Stemming
-                # doc.analyzerFactory = getWikiAnalyzerFactory(request, stem_language)
                 connection.replace(doc)
 
     def _index_page_rev(self, request, connection, page, mode='update'):
@@ -433,7 +439,7 @@
             doc.fields.append(xappy.Field('attachment', '')) # this is a real page, not an attachment
             doc.fields.append(xappy.Field('mtime', str(mtime)))
             doc.fields.append(xappy.Field('revision', revision))
-            doc.fields.append(xappy.Field('title', pagename))
+            doc.fields.append(StemmedField('title', pagename, request))
 
             doc.fields.append(xappy.Field('lang', language))
             doc.fields.append(xappy.Field('stem_lang', stem_language))
@@ -446,11 +452,7 @@
             doc.fields.extend([xappy.Field('linkto', pagelink) for pagelink in page.getPageLinks(request)])
             doc.fields.extend([xappy.Field('category', category) for category in categories])
             doc.fields.extend([xappy.Field('domain', domain) for domain in domains])
-
-            doc.fields.append(xappy.Field('content', page.get_raw_body()))
-
-            # XXX Stemming
-            # doc.analyzerFactory = getWikiAnalyzerFactory(request, stem_language)
+            doc.fields.append(StemmedField('content', page.get_raw_body(), request))
 
             logging.debug("%s (replace %r)" % (pagename, itemid))
             connection.replace(doc)
--- a/MoinMoin/search/_tests/test_search.py	Thu Aug 20 16:54:49 2009 +0200
+++ b/MoinMoin/search/_tests/test_search.py	Thu Aug 20 19:28:34 2009 +0200
@@ -89,6 +89,7 @@
              u'FrontPage': None,
              u'RecentChanges': None,
              u'HelpOnCreoleSyntax': None,
+             u'HelpOnEditing': None,
              u'HelpIndex': None}
 
     def setup_class(self):
@@ -116,7 +117,8 @@
         searches = {u'title:SearchTestPage': 1,
                     u'title:LanguageSetup': 1,
                     u'title:HelpIndex': 1,
-                    u'title:Help': 2,
+                    u'title:Help': 3,
+                    u'title:HelpOn': 2,
                     u'title:SearchTestNotExisting': 0}
 
         def test(query, res_count):
@@ -205,18 +207,18 @@
 
     def test_mimetype_search_simple(self):
         result = self.search(u'mimetype:text/wiki')
-        assert len(result.hits) == 10
+        assert len(result.hits) == 11
 
     def test_mimetype_search_re(self):
         result = self.search(ur'mimetype:re:\btext/wiki\b')
-        assert len(result.hits) == 10
+        assert len(result.hits) == 11
 
         result = self.search(ur'category:re:\bCategoryHomepa\b')
         assert not result.hits
 
     def test_language_search_simple(self):
         result = self.search(u'language:en')
-        assert len(result.hits) == 10
+        assert len(result.hits) == 11
 
     def test_domain_search_simple(self):
         result = self.search(u'domain:system')
@@ -248,6 +250,9 @@
         result = self.search(u"-title:FrontPage")
         assert len(result.hits) == len(self.pages) - 1
 
+        result = self.search(u"-title:HelpOn")
+        assert len(result.hits) == len(self.pages) - 2
+
     def testFullSearchNegatedFindAll(self):
         """ search: negated full search for some string that does not exist results in all pages """
         result = self.search(u"-%s" % self.doesnotexist)
--- a/MoinMoin/search/queryparser/expressions.py	Thu Aug 20 16:54:49 2009 +0200
+++ b/MoinMoin/search/queryparser/expressions.py	Thu Aug 20 19:28:34 2009 +0200
@@ -22,6 +22,11 @@
 try:
     from MoinMoin.search import Xapian
     from MoinMoin.search.Xapian import Query, UnicodeQuery
+
+    OP_AND = Query.OP_AND
+    OP_OR = Query.OP_OR
+    OP_AND_NOT = Query.OP_AND_NOT
+
 except ImportError:
     pass
 
@@ -143,7 +148,7 @@
                         if self.search_re.match(term):
                             queries.append(connection.query_field(field_to_check, term))
 
-        return Query(Query.OP_OR, queries)
+        return Query(OP_OR, queries)
 
     def xapian_need_postproc(self):
         return self.case
@@ -248,17 +253,17 @@
 
         # prepare query for not negated terms
         if terms:
-            query = Query(Query.OP_AND, terms)
+            query = Query(OP_AND, terms)
         else:
             query = Query('') # MatchAll
 
         # prepare query for negated terms
         if not_terms:
-            query_negated = Query(Query.OP_OR, not_terms)
+            query_negated = Query(OP_OR, not_terms)
         else:
             query_negated = Query()
 
-        return Query(Query.OP_AND_NOT, query, query_negated)
+        return Query(OP_AND_NOT, query, query_negated)
 
 
 class OrExpression(AndExpression):
@@ -298,10 +303,48 @@
 
     def xapian_term(self, request, connection):
         # XXX: negated terms managed by _moinSearch?
-        return Query(Query.OP_OR, [term.xapian_term(request, connection) for term in self._subterms])
+        return Query(OP_OR, [term.xapian_term(request, connection) for term in self._subterms])
+
+class BaseTextFieldSearch(BaseExpression):
+
+    _field_to_search = None
+
+    def xapian_term(self, request, connection):
+        if self.use_re:
+            queries = [self._get_query_for_search_re(connection, self._field_to_search)]
+        else:
+            queries = []
+            stemmed = []
+            analyzer = Xapian.WikiAnalyzer(request=request, language=request.cfg.language_default)
+
+            for term in self._pattern.split():
+                query_term = connection.query_field(self._field_to_search, term)
+
+                # tokenize() yields (word, stem) pairs; listify them so the
+                # pairs can be iterated more than once
+                tokens = list(analyzer.tokenize(term))
+
+                if request.cfg.xapian_stemming:
+                    # every token must match either as-is or stemmed
+                    query_tokens = Query(OP_AND,
+                                         [Query(OP_OR,
+                                                [connection.query_field(self._field_to_search, token),
+                                                 connection.query_field(self._field_to_search, stem)]) for token, stem in tokens if token != term])
+                    stemmed.extend(stem for token, stem in tokens)
+                else:
+                    query_tokens = Query(OP_AND, [connection.query_field(self._field_to_search, token) for token, stem in tokens if token != term])
+
+                queries.append(Query(OP_OR, [query_term, query_tokens]))
+
+            if not self.case and stemmed:
+                new_pat = ' '.join(stemmed)
+                self._pattern = new_pat
+                self.pattern, self.search_re = self._build_re(new_pat, use_re=False, case=self.case, stemmed=True)
+
+        return Query(OP_AND, queries)
 
 
-class TextSearch(BaseExpression):
+class TextSearch(BaseTextFieldSearch):
     """ A term that does a normal text search
 
     Both page content and the page title are searched, using an
@@ -309,6 +352,7 @@
     """
 
     costs = 10000
+    _field_to_search = 'content'
 
     def highlight_re(self):
         return u"(%s)" % self.pattern
@@ -329,49 +373,19 @@
         return matches
 
     def xapian_term(self, request, connection):
-        # XXX next version of xappy (>0.5) will provide Query class
-        # it should be used.
-        if self.use_re:
-            queries = [self._get_query_for_search_re(connection)]
-        else:
-            analyzer = Xapian.WikiAnalyzer(request=request, language=request.cfg.language_default)
-            terms = self._pattern.split()
-
-            # all parsed wikiwords, AND'ed
-            queries = []
-            stemmed = []
 
-            for term in terms:
-                if request.cfg.xapian_stemming:
-                    # stemmed OR not stemmed
-                    t = []
-                    for w, s, pos in analyzer.tokenize(term, flat_stemming=False):
-                        query_word = connection.query_field('content', w)
-                        query_stemmed = connection.query_field('content', s)
-                        # XXX UnicodeQuery was used here!
-                        t.append(Query(Query.OP_OR, [query_word, query_stemmed]))
-                        stemmed.append(s)
-                else:
-                    # just not stemmed
-                    t = [connection.query_field('content', w) for w, pos in analyzer.tokenize(term)]
+        content_query = super(TextSearch, self).xapian_term(request, connection)
+        title_query = TitleSearch(self._pattern, use_re=self.use_re, case=self.case).xapian_term(request, connection)
 
-                queries.append(Query(connection.OP_AND, t))
-
-            # XXX Is it required to change pattern and search_re here?
-            if not self.case and stemmed:
-                new_pat = ' '.join(stemmed)
-                self._pattern = new_pat
-                self.pattern, self.search_re = self._build_re(new_pat, use_re=False, case=self.case, stemmed=True)
-
-        title_query = TitleSearch(self._pattern, use_re=self.use_re, case=self.case).xapian_term(request, connection)
-        return Query(Query.OP_OR, [title_query, Query(Query.OP_AND, queries)])
+        return Query(OP_OR, [title_query, content_query])
 
 
-class TitleSearch(BaseExpression):
+class TitleSearch(BaseTextFieldSearch):
     """ Term searches in pattern in page title only """
 
     _tag = 'title:'
     costs = 100
+    _field_to_search = 'title'
 
     def pageFilter(self):
         """ Page filter function for single title search """
@@ -391,47 +405,6 @@
 
         return matches
 
-    def xapian_term(self, request, connection):
-        if self.use_re:
-            # XXX weight for a query!
-            queries = [self._get_query_for_search_re(connection, 'fulltitle')]
-        else:
-            analyzer = Xapian.WikiAnalyzer(request=request,
-                    language=request.cfg.language_default)
-            terms = self._pattern.split()
-            terms = [[w for w, pos in analyzer.raw_tokenize(t)] for t in terms]
-
-            # all parsed wikiwords, ANDed
-            queries = []
-            stemmed = []
-            for term in terms:
-                if request.cfg.xapian_stemming:
-                    # stemmed OR not stemmed
-                    t = []
-                    for w, s, pos in analyzer.tokenize(term, flat_stemming=False):
-                        # XXX weight for a query 100!
-                        query_word = connection.query_field('title', w)
-                        query_stemmed = connection.query_field('title', s)
-
-                        # XXX UnicodeQuery was used here!
-                        t.append(Query(Query.OP_OR, [query_word, query_stemmed]))
-                        stemmed.append(s)
-                else:
-                    # just not stemmed
-                    # XXX weight for a query 100!
-                    # XXX UnicodeQuery was used here!
-                    t = [connection.query_field('title', w) for w, pos in analyzer.tokenize(term)]
-
-                # XXX what should be there OR or AND?!
-                queries.append(Query(Query.OP_OR, t))
-
-            if not self.case and stemmed:
-                new_pat = ' '.join(stemmed)
-                self._pattern = new_pat
-                self._build_re(new_pat, use_re=False, case=self.case, stemmed=True)
-
-        return Query(Query.OP_AND, queries)
-
 
 class BaseFieldSearch(BaseExpression):