Mercurial > moin > 1.9
changeset 3647:b3747c0e81ae
Xapian search: improve analyzer to tokenize Foo42Bar23 into Foo, 42, Bar, 23
author | Thomas Waldmann <tw AT waldmann-edv DOT de> |
---|---|
date | Thu, 29 May 2008 23:42:28 +0200 |
parents | 68da15c7eeec |
children | 8352dcd5a282 |
files | MoinMoin/search/Xapian.py |
diffstat | 1 files changed, 23 insertions(+), 18 deletions(-) [+] |
line wrap: on
line diff
--- a/MoinMoin/search/Xapian.py Mon May 26 17:41:41 2008 +0200 +++ b/MoinMoin/search/Xapian.py Thu May 29 23:42:28 2008 +0200 @@ -87,6 +87,7 @@ dot_re = re.compile(r"[-_/,.]") mail_re = re.compile(r"[-_/,.]|(@)") + alpha_num_re = re.compile(r"\d+|\D+") # XXX limit stuff above to xapdoc.MAX_KEY_LEN # WORD_RE = re.compile('\\w{1,%i}' % MAX_KEY_LEN, re.U) @@ -104,46 +105,49 @@ # lang is not stemmable or not available pass + def raw_tokenize_word(self, word, pos): + """ try to further tokenize some word starting at pos """ + if self.wikiword_re.match(word): + yield (word, pos) + # if it is a CamelCaseWord, we additionally try to tokenize Camel, Case and Word + for m in re.finditer(self.singleword_re, word): + for w, p in self.raw_tokenize_word(m.group(), pos + m.start()): + yield (w, p) + else: + # if we have Foo42, yield Foo and 42 + for m in re.finditer(self.alpha_num_re, word): + yield (m.group(), pos + m.start()) + def raw_tokenize(self, value): - """ Yield a stream of lower cased raw and stemmed words from a string. + """ Yield a stream of words from a string. @param value: string to split, must be an unicode object or a list of unicode objects """ - def enc(uc): - """ 'encode' unicode results into whatever xapian wants """ - lower = uc.lower() - return lower - if isinstance(value, list): # used for page links for v in value: - yield (enc(v), 0) + yield (v, 0) else: tokenstream = re.finditer(self.token_re, value) for m in tokenstream: if m.group("acronym"): - yield (enc(m.group("acronym").replace('.', '')), - m.start()) + yield (m.group("acronym").replace('.', ''), m.start()) elif m.group("company"): - yield (enc(m.group("company")), m.start()) + yield (m.group("company"), m.start()) elif m.group("email"): displ = 0 for word in self.mail_re.split(m.group("email")): if word: - yield (enc(word), m.start() + displ) + yield (word, m.start() + displ) displ += len(word) + 1 elif m.group("hostname"): displ = 0 for word in self.dot_re.split(m.group("hostname")): - yield (enc(word), m.start() + displ) + yield (word, m.start() + displ) displ += len(word) + 1 elif m.group("word"): - word = m.group("word") - yield (enc(word), m.start()) - # if it is a CamelCaseWord, we additionally yield Camel, Case and Word - if self.wikiword_re.match(word): - for sm in re.finditer(self.singleword_re, word): - yield (enc(sm.group()), m.start() + sm.start()) + for word, pos in self.raw_tokenize_word(m.group("word"), m.start()): + yield word, pos def tokenize(self, value, flat_stemming=True): """ Yield a stream of lower cased raw and stemmed words from a string. @@ -155,6 +159,7 @@ yield both at once as a tuple (False) """ for word, pos in self.raw_tokenize(value): + word = word.lower() # transform it into what xapian wants if flat_stemming: yield (word, pos) if self.stemmer: