changeset 3856:6aeb3f0af92c

Xapian indexer/tokenizer: tokenize CamelCase parts of non-wikiwords
author Thomas Waldmann <tw AT waldmann-edv DOT de>
date Mon, 14 Jul 2008 22:44:33 +0200
parents f40bd4c68aa2
children ee74cf49c1ca
files MoinMoin/search/Xapian.py
diffstat 1 files changed, 8 insertions(+), 3 deletions(-) [+]
line wrap: on
line diff
--- a/MoinMoin/search/Xapian.py	Mon Jul 14 13:03:42 2008 +0200
+++ b/MoinMoin/search/Xapian.py	Mon Jul 14 22:44:33 2008 +0200
@@ -103,16 +103,21 @@
 
     def raw_tokenize_word(self, word, pos):
         """ try to further tokenize some word starting at pos """
+        yield (word, pos)
         if self.wikiword_re.match(word):
-            yield (word, pos)
             # if it is a CamelCaseWord, we additionally try to tokenize Camel, Case and Word
             for m in re.finditer(self.singleword_re, word):
-                for w, p in self.raw_tokenize_word(m.group(), pos + m.start()):
+                mw, mp = m.group(), pos + m.start()
+                for w, p in self.raw_tokenize_word(mw, mp):
                     yield (w, p)
         else:
             # if we have Foo42, yield Foo and 42
             for m in re.finditer(self.alpha_num_re, word):
-                yield (m.group(), pos + m.start())
+                mw, mp = m.group(), pos + m.start()
+                if mw != word:
+                    for w, p in self.raw_tokenize_word(mw, mp):
+                        yield (w, p)
+
 
     def raw_tokenize(self, value):
         """ Yield a stream of words from a string.