view MoinMoin/search/Xapian/ @ 5032:7e9b7149c95e

Xapian2009: WikiAnalyzer.tokenize() returns lowercased words.
author Dmitrijs Milajevs <>
date Sat, 22 Aug 2009 12:42:33 +0200
parents deb2e2d5326e
children 4b2ef153ad4f
line wrap: on
line source
# -*- coding: iso-8859-1 -*-
"""
    MoinMoin - A text analyzer for wiki syntax

    @copyright: 2006-2008 MoinMoin:ThomasWaldmann,
                2006 MoinMoin:FranzPletz
    @license: GNU GPL, see COPYING for details.
"""

import re
import xapian

from MoinMoin.parser.text_moin_wiki import Parser as WikiParser
from MoinMoin import config

class WikiAnalyzer(object):
    """ A text analyzer for wiki syntax

    The purpose of this class is to anaylze texts/pages in wiki syntax
    and yield single terms to feed into the xapian database.

    singleword = r"[%(u)s][%(l)s]+" % {
                     'u': config.chars_upper,
                     'l': config.chars_lower,

    singleword_re = re.compile(singleword, re.U)
    wikiword_re = re.compile(WikiParser.word_rule, re.UNICODE|re.VERBOSE)

    token_re = re.compile(
        r"(?P<company>\w+[&@]\w+)|" + # company names like AT&T and Excite@Home.
        r"(?P<email>\w+([.-]\w+)*@\w+([.-]\w+)*)|" +    # email addresses
        r"(?P<acronym>(\w\.)+)|" +          # acronyms: U.S.A., I.B.M., etc.
        r"(?P<word>\w+)",                   # words (including WikiWords)

    dot_re = re.compile(r"[-_/,.]")
    mail_re = re.compile(r"[-_/,.]|(@)")
    alpha_num_re = re.compile(r"\d+|\D+")

    def __init__(self, request=None, language=None):
        @param request: current request
        @param language: if given, the language in which to stem words
        self.stemmer = None
        if request and request.cfg.xapian_stemming and language:
                stemmer = xapian.Stem(language)
                # we need this wrapper because the stemmer returns a utf-8
                # encoded string even when it gets fed with unicode objects:
                self.stemmer = lambda word: stemmer(word).decode('utf-8')
            except xapian.InvalidArgumentError:
                # lang is not stemmable or not available

    def raw_tokenize_word(self, word, pos):
        """ try to further tokenize some word starting at pos """
        yield (word, pos)
        if self.wikiword_re.match(word):
            # if it is a CamelCaseWord, we additionally try to tokenize Camel, Case and Word
            for m in re.finditer(self.singleword_re, word):
                mw, mp =, pos + m.start()
                for w, p in self.raw_tokenize_word(mw, mp):
                    yield (w, p)
            # if we have Foo42, yield Foo and 42
            for m in re.finditer(self.alpha_num_re, word):
                mw, mp =, pos + m.start()
                if mw != word:
                    for w, p in self.raw_tokenize_word(mw, mp):
                        yield (w, p)

    def raw_tokenize(self, value):
        """ Yield a stream of words from a string.

        @param value: string to split, must be an unicode object or a list of
                      unicode objects
        if isinstance(value, list): # used for page links
            for v in value:
                yield (v, 0)
            tokenstream = re.finditer(self.token_re, value)
            for m in tokenstream:
                    yield ("acronym").replace('.', ''), m.start())
                    yield ("company"), m.start())
                    displ = 0
                    for word in self.mail_re.split("email")):
                        if word:
                            yield (word, m.start() + displ)
                            displ += len(word) + 1
                    for word, pos in self.raw_tokenize_word("word"), m.start()):
                        yield word, pos

    def tokenize(self, value):
        Yield a stream of raw lower cased and stemmed words from a string.

        @param value: string to split, must be an unicode object or a list of
                      unicode objects
        if self.stemmer:
            def stemmer(value):
                stemmed = self.stemmer(value)
                if stemmed != value:
                    return stemmed
                    return ''
            stemmer = lambda v: ''

        for word, pos in self.raw_tokenize(value):
            # Xapian stemmer expects lowercase input
            word = word.lower()
            yield word, stemmer(word)