changeset 5032:7e9b7149c95e

Xapian2009: WikiAnalyzer.tokenize() returns lowercased words.
author Dmitrijs Milajevs <dimazest@gmail.com>
date Sat, 22 Aug 2009 12:42:33 +0200
parents d0af0cf05fc2
children 2649f3f0fabd
files MoinMoin/search/Xapian/_tests/test_wiki_analyzer.py MoinMoin/search/Xapian/tokenizer.py
diffstat 2 files changed, 32 insertions(+), 38 deletions(-) [+]
line wrap: on
line diff
--- a/MoinMoin/search/Xapian/_tests/test_wiki_analyzer.py	Sat Aug 22 12:22:20 2009 +0200
+++ b/MoinMoin/search/Xapian/_tests/test_wiki_analyzer.py	Sat Aug 22 12:42:33 2009 +0200
@@ -12,11 +12,11 @@
 class TestWikiAnalyzer(object):
 
     word = u'HelpOnMoinTesting'
-    words = {word: u'',
-             u'Help': u'',
-             u'On': u'',
-             u'Moin': u'',
-             u'Testing': u''}
+    words = {word.lower(): u'',
+             u'help': u'',
+             u'on': u'',
+             u'moin': u'',
+             u'testing': u''}
 
     def setup_class(self):
         try:
@@ -35,19 +35,15 @@
             assert token in words
             assert words[token] == stemmed
 
-    def test_first_token(self):
-        tokens = list(self.analyzer.tokenize(self.word))
-        assert tokens[0][0] == self.word, 'The first token must be the word itself'
-
 
 class TestWikiAnalyzerStemmed(TestWikiAnalyzer):
 
     word = u'HelpOnMoinTesting'
-    words = {word: u'HelpOnMoinTest',
-             u'Help': u'',
-             u'On': u'',
-             u'Moin': u'',
-             u'Testing': u'Test'}
+    words = {word.lower(): u'helponmointest',
+             u'help': u'',
+             u'on': u'',
+             u'moin': u'',
+             u'testing': u'test'}
 
     class Config(wikiconfig.Config):
 
@@ -57,35 +53,31 @@
 class TestWikiAnalyzerSeveralWords(TestWikiAnalyzer):
 
     word = u'HelpOnMoinTesting OtherWikiWord'
-    words = {u'HelpOnMoinTesting': u'',
-             u'Help': u'',
-             u'On': u'',
-             u'Moin': u'',
-             u'Testing': u'',
-             u'OtherWikiWord': u'',
-             u'Other': u'',
-             u'Wiki': u'',
-             u'Word': u''}
+    words = {u'helponmointesting': u'',
+             u'help': u'',
+             u'on': u'',
+             u'moin': u'',
+             u'testing': u'',
+             u'otherwikiword': u'',
+             u'other': u'',
+             u'wiki': u'',
+             u'word': u''}
 
-    def test_first_token(self):
-        pass
 
 class TestWikiAnalyzerStemmedSeveralWords(TestWikiAnalyzer):
 
     word = u'HelpOnMoinTesting OtherWikiWord'
-    words = {u'HelpOnMoinTesting': u'HelpOnMoinTest',
-             u'Help': u'',
-             u'On': u'',
-             u'Moin': u'',
-             u'Testing': u'Test',
-             u'OtherWikiWord': u'',
-             u'Other': u'',
-             u'Wiki': u'',
-             u'Word': u''}
+    words = {u'helponmointesting': u'helponmointest',
+             u'help': u'',
+             u'on': u'',
+             u'moin': u'',
+             u'testing': u'test',
+             u'otherwikiword': u'',
+             u'other': u'',
+             u'wiki': u'',
+             u'word': u''}
 
     class Config(wikiconfig.Config):
 
         xapian_stemming = True
 
-    def test_first_token(self):
-        pass
--- a/MoinMoin/search/Xapian/tokenizer.py	Sat Aug 22 12:22:20 2009 +0200
+++ b/MoinMoin/search/Xapian/tokenizer.py	Sat Aug 22 12:42:33 2009 +0200
@@ -100,7 +100,7 @@
 
     def tokenize(self, value):
         """
-        Yield a stream of raw and stemmed words from a string.
+        Yield a stream of lowercased raw words and their stemmed forms from a string.
 
         @param value: string to split, must be an unicode object or a list of
                       unicode objects
@@ -116,5 +116,7 @@
             stemmer = lambda v: ''
 
         for word, pos in self.raw_tokenize(value):
-                yield word, stemmer(word)
+            # Xapian stemmer expects lowercase input
+            word = word.lower()
+            yield word, stemmer(word)