view MoinMoin/support/lupy/search/phrase.py @ 0:77665d8e2254

tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0 (automatically generated log message) imported from: moin--main--1.5--base-0
author Thomas Waldmann <tw-public@gmx.de>
date Thu, 22 Sep 2005 15:09:50 +0000
parents
children
line wrap: on
line source
# This module is part of the Lupy project and is Copyright 2003 Amir
# Bakhtiar (amir@divmod.org). This is free software; you can redistribute
# it and/or modify it under the terms of version 2.1 of the GNU Lesser
# General Public License as published by the Free Software Foundation.


from bisect import insort
from MoinMoin.support.lupy.search import term, similarity
import sys

class PhraseQuery:
    """A query that matches documents containing a particular
    sequence of terms. This may be combined with other terms
    with a L{lupy.search.boolean.BooleanQuery}.
    """

    def __init__(self):
        """Constructs an empty phrase query."""
        
        self.idf = 0.0
        self.slop = 0
        self.terms = []
        self.weight = 0.0
        self.boost = 1.0

    def add(self, term):
        """Adds a term to the end of the query phrase."""
        if len(self.terms) == 0:
            self.field = term.field()

        elif term.field() != self.field:
            raise Exception, 'All phrase terms must be in the same field: ' + str(term)

        self.terms.append(term)


    def getSlop(self):
        """Returns the slop.  See setSlop()."""
        return self.slop


    def normalize(self, norm):
        # normalize for query
        self.weight *= norm
        # factor from document
        self.weight *= self.idf


    def scorer(self, reader):
        # optimize zero-term case
        if len(self.terms) == 0:
            return None

        # optimize one-term case
        if len(self.terms) == 1:
            t = self.terms[0]
            docs = reader.termDocsTerm(t)
            if docs is None:
                return None
            return term.TermScorer(docs, reader.normsField(t.field()), self.weight)

        tps = [] 
        
        for t in self.terms:
            p = reader.termPositionsTerm(t)
            if p is None:
                # I am not sure how this is ever reached?
                return None
            tps.append(p)

        if self.slop == 0:
            return ExactPhraseScorer(tps, reader.normsField(self.field),
                                     self.weight)
        else:
            return SloppyPhraseScorer(tps, reader.norms(self.field),
                                      self.weight)


    def sumOfSquaredWeights(self, searcher):
        # sum term IDFs
        for term in self.terms:
            self.idf += similarity.idfTerm(term, searcher)
            
        self.weight = self.idf * self.boost
        # square term weights
        return self.weight * self.weight


    def toString(self, f):
        """Prints a user-readable version of this query"""

        buffer = ''
        if not self.field == f :
            buffer += f + ':'
        buffer += '\\'

        for term in self.terms[:-1]:
            buffer += term.text() + ' '
            
        buffer += self.terms[-1].text() + '\\'

        if self.slop != 0:
            buffer += '~' + str(self.slop)

        if self.boost != 1.0:
            buffer += '^' + str(self.boost)

        return buffer


class PhraseScorer:
    
    def __init__(self, tps, n, w):
        self.norms = n
        self.weight = w
        
        self.pps = [PhrasePositions(tp, i) for i, tp in enumerate(tps)]
        self.pps.sort()
                        
    def phraseQuery(self):
        """Subclass responsibility"""

    def score(self, end):
        # find doc w/ all the terms
        while self.pps[-1].doc < end:
            while self.pps[0].doc < self.pps[-1].doc:
                self.pps[0].advance()
                while self.pps[0].doc < self.pps[-1].doc:
                    self.pps[0].advance()
                self.pps.append(self.pps.pop(0))
                if self.pps[-1].doc >= end:
                    return
                
            # found doc with all terms
            # check for phrase
            freq = self.phraseFreq()
            
            if freq > 0.0:
                # compute score
                score = similarity.tf(freq) * self.weight
                # normalize
                score *= similarity.normByte(self.norms[self.pps[0].doc])
                # add to results
                yield (self.pps[0].doc, score)
            # resume scanning
            self.pps[-1].advance()
                
                
        

class ExactPhraseScorer(PhraseScorer):
    
    def phraseFreq(self):
        for pp in self.pps:
            pp.firstPosition()
        self.pps.sort()
        freq = 0.0
        
        init = 0
        # the 'init' bits are to simulate a do-while loop :-/
        while init == 0 or self.pps[-1].nextPosition():
            while self.pps[0].position < self.pps[-1].position:
                # scan forward in first
                init2 = 0
                while init2 == 0 or self.pps[0].position < self.pps[-1].position:
                    if not self.pps[0].nextPosition():
                        return freq
                    init2 = 1
                    
                self.pps.append(self.pps.pop(0))
            # all equal: a match
            freq += 1
            init = 1
            
        return freq
        

class PhrasePositions(object):

    def __init__(self, t, o):
        self.tp = t
        self.offset = o
        
        self.position = 0
        self.count = 0
        self.doc = 0
        self.tpiter = iter(t)
        self.advance()
        
        
    def firstPosition(self):
        self.count = self.tp.frq
        self.nextPosition()
        
        
    def advance(self):
        """Increments to next doc"""
        
        for doc, frq, nextPos in self.tpiter:
            self.doc = doc
            self.frq = frq
            self._nextPos = nextPos
            self.position = 0
            return
        else:
            # close stream
            self.tp.close()
            # sentinel value
            self.doc = sys.maxint
            return
        
        
    def nextPosition(self):
        if self.count > 0:
            self.count -= 1
            # read subsequent positions
            self.position = self._nextPos.next() - self.offset
            return True
        else:
            self.count -= 1
            return False
        
                
    def __repr__(self):
        res = '<pp>d:' + str(self.doc) + ' p:' + str(self.position) + ' o:' + str(self.offset)
        return res

    def __lt__(this, that):
        if this.doc == that.doc:
            return this.position < that.position
        else:
            return this.doc < that.doc