changeset 5035:93becb451375

Xapian2009: BaseTextFieldSearch.xapian_term() refactoring. Tests for a search with stemming.
author Dmitrijs Milajevs <dimazest@gmail.com>
date Sat, 22 Aug 2009 20:54:24 +0200
parents 384ad8cec085
children 4b2ef153ad4f
files MoinMoin/search/Xapian/_tests/test_wiki_analyzer.py MoinMoin/search/Xapian/indexing.py MoinMoin/search/_tests/test_search.py MoinMoin/search/builtin.py MoinMoin/search/queryparser/expressions.py
diffstat 5 files changed, 83 insertions(+), 23 deletions(-) [+]
line wrap: on
line diff
--- a/MoinMoin/search/Xapian/_tests/test_wiki_analyzer.py	Sat Aug 22 20:54:21 2009 +0200
+++ b/MoinMoin/search/Xapian/_tests/test_wiki_analyzer.py	Sat Aug 22 20:54:24 2009 +0200
@@ -6,6 +6,7 @@
     @license: GNU GPL, see COPYING for details.
 """
 
+
 from MoinMoin._tests import wikiconfig
 
 
@@ -81,3 +82,26 @@
 
         xapian_stemming = True
 
+class TestWikiAnalyzerStemmedHelpOnEditing(TestWikiAnalyzer):
+
+    word = u'HelpOnEditing'
+    words = {u'helponediting': u'helponedit',
+             u'help': u'',
+             u'on': u'',
+             u'editing': u'edit'}
+
+    class Config(wikiconfig.Config):
+
+        xapian_stemming = True
+
+
+class TestWikiAnalyzerStemmedCategoryHomepage(TestWikiAnalyzer):
+
+    word = u'CategoryHomepage'
+    words = {u'categoryhomepage': u'categoryhomepag',
+             u'category': u'categori',
+             u'homepage': u'homepag'}
+
+    class Config(wikiconfig.Config):
+
+        xapian_stemming = True
--- a/MoinMoin/search/Xapian/indexing.py	Sat Aug 22 20:54:21 2009 +0200
+++ b/MoinMoin/search/Xapian/indexing.py	Sat Aug 22 20:54:24 2009 +0200
@@ -113,8 +113,10 @@
     def __init__(self, name, value, request):
 
         analyzer = WikiAnalyzer(request=request, language=request.cfg.language_default)
+
         tokens = analyzer.tokenize(value)
-        value = ''.join(('%s %s' % (word, stemmed) for word, stemmed in analyzer.tokenize(value)))
+        value = ' '.join(unicode('%s %s' % (word, stemmed)).strip() for word, stemmed in analyzer.tokenize(value))
+
         super(StemmedField, self).__init__(name, value)
 
 
--- a/MoinMoin/search/_tests/test_search.py	Sat Aug 22 20:54:21 2009 +0200
+++ b/MoinMoin/search/_tests/test_search.py	Sat Aug 22 20:54:24 2009 +0200
@@ -85,6 +85,7 @@
              u'SearchTestLinks': u'SearchTestPage',
              u'SearchTestLinksLowerCase': u'searchtestpage',
              u'SearchTestOtherLinks': u'SearchTestLinks',
+             u'TestEdit': u'TestEdit',
              u'LanguageSetup': None,
              u'CategoryHomepage': None,
              u'HomePageWiki': None,
@@ -121,7 +122,9 @@
                     u'title:HelpIndex': 1,
                     u'title:Help': 3,
                     u'title:HelpOn': 2,
-                    u'title:SearchTestNotExisting': 0}
+                    u'title:SearchTestNotExisting': 0,
+                    u'title:FrontPage': 1,
+                    u'title:HelpOnEditing': 1}
 
         def test(query, res_count):
             result = self.search(query)
@@ -209,18 +212,18 @@
 
     def test_mimetype_search_simple(self):
         result = self.search(u'mimetype:text/wiki')
-        assert len(result.hits) == 11
+        assert len(result.hits) == 12
 
     def test_mimetype_search_re(self):
         result = self.search(ur'mimetype:re:\btext/wiki\b')
-        assert len(result.hits) == 11
+        assert len(result.hits) == 12
 
         result = self.search(ur'category:re:\bCategoryHomepa\b')
         assert not result.hits
 
     def test_language_search_simple(self):
         result = self.search(u'language:en')
-        assert len(result.hits) == 11
+        assert len(result.hits) == 12
 
     def test_domain_search_simple(self):
         result = self.search(u'domain:system')
@@ -279,13 +282,19 @@
         del self.pages['TestCreatePage']
         assert len(result.hits) == 1
 
-
 class TestMoinSearch(BaseSearchTest):
 
     def get_searcher(self, query):
         pages = [{'pagename': page, 'attachment': '', 'wikiname': 'Self', } for page in self.pages]
         return MoinSearch(self.request, query, pages=pages)
 
+    def test_stemming(self):
+        result = self.search(u"title:edit")
+        assert len(result.hits) == 2
+
+        result = self.search(u"title:editing")
+        assert len(result.hits) == 1
+
 
 class TestXapianSearch(BaseSearchTest):
     """ search: test Xapian indexing """
@@ -336,7 +345,9 @@
                     u'domain:': ([u''], u'system')}
 
         def test_query(query):
-            assert not parser.parse_query(query).xapian_term(self.request, connection).empty()
+            query_ = parser.parse_query(query).xapian_term(self.request, connection)
+            print str(query_)
+            assert not query_.empty()
 
         for prefix, data in prefixes.iteritems():
             modifiers, term = data
@@ -344,6 +355,27 @@
                 query = ''.join([prefix, modifier, term])
                 yield query, test_query, query
 
+    def test_stemming(self):
+        result = self.search(u"title:edit")
+        assert len(result.hits) == 1
+
+        result = self.search(u"title:editing")
+        assert len(result.hits) == 1
+
+
+class TestXapianSearchStemmed(TestXapianSearch):
+
+    class Config(wikiconfig.Config):
+
+        xapian_search = True
+        xapian_stemming = True
+
+    def test_stemming(self):
+        result = self.search(u"title:edit")
+        assert len(result.hits) == 2
+
+        result = self.search(u"title:editing")
+        assert len(result.hits) == 2
 
 class TestXapianIndexingInNewThread(object):
     """ search: test Xapian indexing """
--- a/MoinMoin/search/builtin.py	Sat Aug 22 20:54:21 2009 +0200
+++ b/MoinMoin/search/builtin.py	Sat Aug 22 20:54:24 2009 +0200
@@ -509,8 +509,6 @@
         """
         Get all matches
 
-        XXX xappy highlight functionality should be used for Xapian search!
-
         @param page: the current page instance
         """
         if page:
@@ -534,6 +532,7 @@
 
             if wikiname in (self.request.cfg.interwikiname, 'Self'): # THIS wiki
                 page = Page(self.request, pagename, rev=revision)
+
                 if not self.historysearch and revision:
                     revlist = page.getRevList()
                     # revlist can be empty if page was nuked/renamed since it was included in xapian index
@@ -541,6 +540,7 @@
                         # nothing there at all or not the current revision
                         logging.debug("no history search, skipping non-current revision...")
                         continue
+
                 if attachment:
                     # revision currently is 0 ever
                     if pagename == fs_rootpage: # not really an attachment
@@ -553,13 +553,12 @@
                     matches = self._get_match(page=page, uid=uid)
                     logging.debug("self._get_match %r" % matches)
                     if matches:
-                        if not self.historysearch and \
-                                pagename in revisionCache and \
-                                revisionCache[pagename][0] < revision:
+                        if not self.historysearch and  pagename in revisionCache and revisionCache[pagename][0] < revision:
                             hits.remove(revisionCache[pagename][1])
                             del revisionCache[pagename]
                         hits.append((wikiname, page, attachment, matches, revision))
                         revisionCache[pagename] = (revision, hits[-1])
+
             else: # other wiki
                 hits.append((wikiname, pagename, attachment, None, revision))
         logging.debug("_getHits returning %r." % hits)
@@ -658,7 +657,6 @@
                   'attachment': r.data['attachment'][0],
                   'revision': r.data.get('revision', [0])[0]}
                  for r in search_results]
-
         try:
             if not self.query.xapian_need_postproc():
                 # xapian handled the full query
--- a/MoinMoin/search/queryparser/expressions.py	Sat Aug 22 20:54:21 2009 +0200
+++ b/MoinMoin/search/queryparser/expressions.py	Sat Aug 22 20:54:24 2009 +0200
@@ -319,23 +319,27 @@
 
             for term in self._pattern.split():
                 query_term = connection.query_field(self._field_to_search, term)
-
                 tokens = analyzer.tokenize(term)
 
                 if request.cfg.xapian_stemming:
-                    query_tokens = Query(OP_AND,
-                                         [Query(OP_OR,
-                                                [connection.query_field(self._field_to_search, token),
-                                                 connection.query_field(self._field_to_search, stemmed)]) for token, stemmed in tokens if token != term])
-
-                    stemmed.extend(stemmed for term, stemmed in analyzer.tokenize(term))
-
+                    query_token = []
+                    for token, stemmed_ in tokens:
+                        if token != term.lower():
+                            if stemmed_:
+                                query_token.append(Query(OP_OR,
+                                                         [connection.query_field(self._field_to_search, token),
+                                                          connection.query_field(self._field_to_search, stemmed_)]))
+#                                 stemmed.append('(%s|%s)' % (token, stemmed_))
+                            else:
+                                query_token.append(connection.query_field(self._field_to_search, token))
+#                                 stemmed.append(token)
+                    query_tokens = Query(OP_AND, query_token)
                 else:
-                    tokens = analyzer.tokenize(term)
-                    query_tokens = Query(OP_AND, [connection.query_field(self._field_to_search, token) for token, stemmed in tokens if token != term])
+                    query_tokens = Query(OP_AND, [connection.query_field(self._field_to_search, token) for token, stemmed_ in tokens if token != term.lower()])
 
                 queries.append(Query(OP_OR, [query_term, query_tokens]))
 
+            # XXX broken: wrong regexp is built!
             if not self.case and stemmed:
                 new_pat = ' '.join(stemmed)
                 self._pattern = new_pat