changeset 5026:deb2e2d5326e
Xapian2009: WikiAnalyzer.tokenize() was refactored and the flat_stemming parameter was removed. Tests for the WikiAnalyzer were added.
| author | Dmitrijs Milajevs <dimazest@gmail.com> |
|---|---|
| date | Thu, 20 Aug 2009 16:54:49 +0200 |
| parents | aca1825d1890 |
| children | f531ccc68313 |
| files | MoinMoin/search/Xapian/_tests/__init__.py, MoinMoin/search/Xapian/_tests/test_wiki_analyzer.py, MoinMoin/search/Xapian/tokenizer.py |
| diffstat | 2 files changed, 106 insertions(+), 11 deletions(-) |
```diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/MoinMoin/search/Xapian/_tests/test_wiki_analyzer.py	Thu Aug 20 16:54:49 2009 +0200
@@ -0,0 +1,92 @@
+# -*- coding: iso-8859-1 -*-
+"""
+    MoinMoin - MoinMoin.search.Xapian.tokenizer Tests
+
+    @copyright: 2009 MoinMoin:DmitrijsMilajevs
+    @license: GNU GPL, see COPYING for details.
+"""
+
+import py
+from MoinMoin._tests import wikiconfig
+
+
+class TestWikiAnalyzer(object):
+
+    word = u'HelpOnMoinTesting'
+    words = {word: u'',
+             u'Help': u'',
+             u'On': u'',
+             u'Moin': u'',
+             u'Testing': u''}
+
+    def setup_class(self):
+        try:
+            from MoinMoin.search import Xapian
+            self.analyzer = Xapian.WikiAnalyzer(request=self.request, language=self.request.cfg.language_default)
+        except ImportError:
+            py.test.skip('xapian is not installed')
+
+    def test_tokenize(self):
+        words = self.words
+        tokens = list(self.analyzer.tokenize(self.word))
+
+        assert len(tokens) == len(words)
+
+        for token, stemmed in tokens:
+            assert token in words
+            assert words[token] == stemmed
+
+    def test_first_token(self):
+        tokens = list(self.analyzer.tokenize(self.word))
+        assert tokens[0][0] == self.word, 'The first token must be the word itself'
+
+
+class TestWikiAnalyzerStemmed(TestWikiAnalyzer):
+
+    word = u'HelpOnMoinTesting'
+    words = {word: u'HelpOnMoinTest',
+             u'Help': u'',
+             u'On': u'',
+             u'Moin': u'',
+             u'Testing': u'Test'}
+
+    class Config(wikiconfig.Config):
+
+        xapian_stemming = True
+
+
+class TestWikiAnalyzerSeveralWords(TestWikiAnalyzer):
+
+    word = u'HelpOnMoinTesting OtherWikiWord'
+    words = {u'HelpOnMoinTesting': u'',
+             u'Help': u'',
+             u'On': u'',
+             u'Moin': u'',
+             u'Testing': u'',
+             u'OtherWikiWord': u'',
+             u'Other': u'',
+             u'Wiki': u'',
+             u'Word': u''}
+
+    def test_first_token(self):
+        pass
+
+class TestWikiAnalyzerStemmedSeveralWords(TestWikiAnalyzer):
+
+    word = u'HelpOnMoinTesting OtherWikiWord'
+    words = {u'HelpOnMoinTesting': u'HelpOnMoinTest',
+             u'Help': u'',
+             u'On': u'',
+             u'Moin': u'',
+             u'Testing': u'Test',
+             u'OtherWikiWord': u'',
+             u'Other': u'',
+             u'Wiki': u'',
+             u'Word': u''}
+
+    class Config(wikiconfig.Config):
+
+        xapian_stemming = True
+
+    def test_first_token(self):
+        pass
```
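The tests are driven entirely by the class-level `word` and `words` attributes: `words` maps every token the analyzer should emit to its expected stem, with `u''` meaning no distinct stem is expected. Subclasses only override these attributes (plus a `Config` enabling `xapian_stemming`) to cover the stemmed and multi-word cases. Assuming a xapian install and a configured `request` (the tests get theirs from MoinMoin's test framework; none is constructed here), the contract they pin down looks roughly like this:

```python
# Hypothetical interactive check; `request` is assumed to come from the
# surrounding wiki/test setup and is not defined in this snippet.
from MoinMoin.search import Xapian

analyzer = Xapian.WikiAnalyzer(request=request,
                               language=request.cfg.language_default)

# With xapian_stemming disabled every stem slot is empty; the first token
# is always the whole word, the order of the rest is not asserted.
print(list(analyzer.tokenize(u'HelpOnMoinTesting')))
# e.g. [(u'HelpOnMoinTesting', u''), (u'Help', u''), (u'On', u''),
#       (u'Moin', u''), (u'Testing', u'')]
```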
```diff
--- a/MoinMoin/search/Xapian/tokenizer.py	Thu Aug 20 16:54:49 2009 +0200
+++ b/MoinMoin/search/Xapian/tokenizer.py	Thu Aug 20 16:54:49 2009 +0200
@@ -98,20 +98,23 @@
             for word, pos in self.raw_tokenize_word(m.group("word"), m.start()):
                 yield word, pos
 
-    def tokenize(self, value, flat_stemming=True):
-        """ Yield a stream of lower cased raw and stemmed words from a string.
+    def tokenize(self, value):
+        """
+        Yield a stream of raw and stemmed words from a string.
 
         @param value: string to split, must be an unicode object or a
                       list of unicode objects
-        @keyword flat_stemming: whether to yield stemmed terms automatically
-                                with the natural forms (True) or
-                                yield both at once as a tuple (False)
         """
+        if self.stemmer:
+            def stemmer(value):
+                stemmed = self.stemmer(value)
+                if stemmed != value:
+                    return stemmed
+                else:
+                    return ''
+        else:
+            stemmer = lambda v: ''
+
         for word, pos in self.raw_tokenize(value):
-            if flat_stemming:
-                yield (word, pos)
-                if self.stemmer:
-                    yield (self.stemmer(word), pos)
-            else:
-                yield (word, self.stemmer(word), pos)
+            yield word, stemmer(word)
 
```
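After the refactoring, tokenize() always yields (word, stemmed) pairs, and the stem slot is `u''` both when stemming is disabled and when the stem equals the word itself, presumably so callers can tell "no extra term to index" apart from a real stem without comparing strings again. A minimal self-contained sketch of that wrapper logic, with a made-up `toy_stem` standing in for the Xapian language stemmer:

```python
# Toy re-implementation of the stemmer wrapper from the diff above;
# `toy_stem` and `make_stemmer` are illustrative names, not MoinMoin API.

def make_stemmer(stem=None):
    """Return a function mapping a word to its stem, or '' when no stemmer
    is configured or the stem is identical to the word (the new behaviour)."""
    if stem is not None:
        def stemmer(word):
            stemmed = stem(word)
            return stemmed if stemmed != word else ''
        return stemmer
    return lambda word: ''

def tokenize(words, stem=None):
    """Yield (word, stemmed) pairs, like the refactored WikiAnalyzer.tokenize()."""
    stemmer = make_stemmer(stem)
    for word in words:
        yield word, stemmer(word)

# Crude stemmer that strips a trailing "ing", enough to mirror the
# u'Testing' -> u'Test' expectation in the tests above.
toy_stem = lambda w: w[:-3] if w.endswith('ing') else w

print(list(tokenize([u'Testing', u'Moin'], stem=toy_stem)))
# [(u'Testing', u'Test'), (u'Moin', u'')]
```

This also explains why the test dictionaries use `u''` so heavily: a word whose stem adds no information simply has an empty expected stem.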