changeset 5026:deb2e2d5326e

Xapian2009: WikiAnalyzer.tokenize() was refactored, flat_stemming parameter was removed. Tests for the WikiAnalyzer.
author Dmitrijs Milajevs <dimazest@gmail.com>
date Thu, 20 Aug 2009 16:54:49 +0200
parents aca1825d1890
children f531ccc68313
files MoinMoin/search/Xapian/_tests/__init__.py MoinMoin/search/Xapian/_tests/test_wiki_analyzer.py MoinMoin/search/Xapian/tokenizer.py
diffstat 2 files changed, 105 insertions(+), 11 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/MoinMoin/search/Xapian/_tests/test_wiki_analyzer.py	Thu Aug 20 16:54:49 2009 +0200
@@ -0,0 +1,93 @@
+# -*- coding: iso-8859-1 -*-
+"""
+    MoinMoin - MoinMoin.search.Xapian.tokenizer Tests

+    @copyright: 2009 MoinMoin:DmitrijsMilajevs
+    @license: GNU GPL, see COPYING for details.
+"""
+
+import py
+
+from MoinMoin._tests import wikiconfig
+
+
+class TestWikiAnalyzer(object):
+
+    word = u'HelpOnMoinTesting'
+    words = {word: u'',
+             u'Help': u'',
+             u'On': u'',
+             u'Moin': u'',
+             u'Testing': u''}
+
+    def setup_class(self):
+        try:
+            from MoinMoin.search import Xapian
+            self.analyzer = Xapian.WikiAnalyzer(request=self.request, language=self.request.cfg.language_default)
+        except ImportError:
+            py.test.skip('xapian is not installed')
+
+    def test_tokenize(self):
+        words = self.words
+        tokens = list(self.analyzer.tokenize(self.word))
+
+        assert len(tokens) == len(words)
+
+        for token, stemmed in tokens:
+            assert token in words
+            assert words[token] == stemmed
+
+    def test_first_token(self):
+        tokens = list(self.analyzer.tokenize(self.word))
+        assert tokens[0][0] == self.word, 'The first token must be the word itself'
+
+
+class TestWikiAnalyzerStemmed(TestWikiAnalyzer):
+
+    word = u'HelpOnMoinTesting'
+    words = {word: u'HelpOnMoinTest',
+             u'Help': u'',
+             u'On': u'',
+             u'Moin': u'',
+             u'Testing': u'Test'}
+
+    class Config(wikiconfig.Config):
+
+        xapian_stemming = True
+
+
+class TestWikiAnalyzerSeveralWords(TestWikiAnalyzer):
+
+    word = u'HelpOnMoinTesting OtherWikiWord'
+    words = {u'HelpOnMoinTesting': u'',
+             u'Help': u'',
+             u'On': u'',
+             u'Moin': u'',
+             u'Testing': u'',
+             u'OtherWikiWord': u'',
+             u'Other': u'',
+             u'Wiki': u'',
+             u'Word': u''}
+
+    def test_first_token(self):
+        pass
+
+class TestWikiAnalyzerStemmedSeveralWords(TestWikiAnalyzer):
+
+    word = u'HelpOnMoinTesting OtherWikiWord'
+    words = {u'HelpOnMoinTesting': u'HelpOnMoinTest',
+             u'Help': u'',
+             u'On': u'',
+             u'Moin': u'',
+             u'Testing': u'Test',
+             u'OtherWikiWord': u'',
+             u'Other': u'',
+             u'Wiki': u'',
+             u'Word': u''}
+
+    class Config(wikiconfig.Config):
+
+        xapian_stemming = True
+
+    def test_first_token(self):
+        pass
--- a/MoinMoin/search/Xapian/tokenizer.py	Thu Aug 20 16:54:49 2009 +0200
+++ b/MoinMoin/search/Xapian/tokenizer.py	Thu Aug 20 16:54:49 2009 +0200
@@ -98,20 +98,23 @@
                     for word, pos in self.raw_tokenize_word(m.group("word"), m.start()):
                         yield word, pos
 
-    def tokenize(self, value, flat_stemming=True):
-        """ Yield a stream of lower cased raw and stemmed words from a string.
+    def tokenize(self, value):
+        """
+        Yield a stream of raw and stemmed words from a string.
 
         @param value: string to split, must be an unicode object or a list of
                       unicode objects
-        @keyword flat_stemming: whether to yield stemmed terms automatically
-                                with the natural forms (True) or
-                                yield both at once as a tuple (False)
         """
+        if self.stemmer:
+            def stemmer(value):
+                stemmed = self.stemmer(value)
+                if stemmed != value:
+                    return stemmed
+                else:
+                    return ''
+        else:
+            stemmer = lambda v: ''
+
         for word, pos in self.raw_tokenize(value):
-            if flat_stemming:
-                yield (word, pos)
-                if self.stemmer:
-                    yield (self.stemmer(word), pos)
-            else:
-                yield (word, self.stemmer(word), pos)
+            yield word, stemmer(word)