diff MoinMoin/support/lupy/search/phrase.py @ 0:77665d8e2254

tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0 (automatically generated log message) imported from: moin--main--1.5--base-0
author Thomas Waldmann <tw-public@gmx.de>
date Thu, 22 Sep 2005 15:09:50 +0000
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/MoinMoin/support/lupy/search/phrase.py	Thu Sep 22 15:09:50 2005 +0000
@@ -0,0 +1,232 @@
+# This module is part of the Lupy project and is Copyright 2003 Amir
+# Bakhtiar (amir@divmod.org). This is free software; you can redistribute
+# it and/or modify it under the terms of version 2.1 of the GNU Lesser
+# General Public License as published by the Free Software Foundation.
+
+
+from bisect import insort
+from MoinMoin.support.lupy.search import term, similarity
+import sys
+
+class PhraseQuery:
+    """A query that matches documents containing a particular
+    sequence of terms. This may be combined with other terms
+    with a L{lupy.search.boolean.BooleanQuery}.
+    """
+
+    def __init__(self):
+        """Constructs an empty phrase query."""
+        
+        self.idf = 0.0
+        self.slop = 0
+        self.terms = []
+        self.weight = 0.0
+        self.boost = 1.0
+
+    def add(self, term):
+        """Adds a term to the end of the query phrase."""
+        if len(self.terms) == 0:
+            self.field = term.field()
+
+        elif term.field() != self.field:
+            raise Exception, 'All phrase terms must be in the same field: ' + str(term)
+
+        self.terms.append(term)
+
+
+    def getSlop(self):
+        """Returns the slop.  See setSlop()."""
+        return self.slop
+
+
+    def normalize(self, norm):
+        # normalize for query
+        self.weight *= norm
+        # factor from document
+        self.weight *= self.idf
+
+
+    def scorer(self, reader):
+        # optimize zero-term case
+        if len(self.terms) == 0:
+            return None
+
+        # optimize one-term case
+        if len(self.terms) == 1:
+            t = self.terms[0]
+            docs = reader.termDocsTerm(t)
+            if docs is None:
+                return None
+            return term.TermScorer(docs, reader.normsField(t.field()), self.weight)
+
+        tps = [] 
+        
+        for t in self.terms:
+            p = reader.termPositionsTerm(t)
+            if p is None:
+                # I am not sure how this is ever reached?
+                return None
+            tps.append(p)
+
+        if self.slop == 0:
+            return ExactPhraseScorer(tps, reader.normsField(self.field),
+                                     self.weight)
+        else:
+            return SloppyPhraseScorer(tps, reader.norms(self.field),
+                                      self.weight)
+
+
+    def sumOfSquaredWeights(self, searcher):
+        # sum term IDFs
+        for term in self.terms:
+            self.idf += similarity.idfTerm(term, searcher)
+            
+        self.weight = self.idf * self.boost
+        # square term weights
+        return self.weight * self.weight
+
+
+    def toString(self, f):
+        """Prints a user-readable version of this query"""
+
+        buffer = ''
+        if not self.field == f :
+            buffer += f + ':'
+        buffer += '\\'
+
+        for term in self.terms[:-1]:
+            buffer += term.text() + ' '
+            
+        buffer += self.terms[-1].text() + '\\'
+
+        if self.slop != 0:
+            buffer += '~' + str(self.slop)
+
+        if self.boost != 1.0:
+            buffer += '^' + str(self.boost)
+
+        return buffer
+
+
+class PhraseScorer:
+    
+    def __init__(self, tps, n, w):
+        self.norms = n
+        self.weight = w
+        
+        self.pps = [PhrasePositions(tp, i) for i, tp in enumerate(tps)]
+        self.pps.sort()
+                        
+    def phraseQuery(self):
+        """Subclass responsibility"""
+
+    def score(self, end):
+        # find doc w/ all the terms
+        while self.pps[-1].doc < end:
+            while self.pps[0].doc < self.pps[-1].doc:
+                self.pps[0].advance()
+                while self.pps[0].doc < self.pps[-1].doc:
+                    self.pps[0].advance()
+                self.pps.append(self.pps.pop(0))
+                if self.pps[-1].doc >= end:
+                    return
+                
+            # found doc with all terms
+            # check for phrase
+            freq = self.phraseFreq()
+            
+            if freq > 0.0:
+                # compute score
+                score = similarity.tf(freq) * self.weight
+                # normalize
+                score *= similarity.normByte(self.norms[self.pps[0].doc])
+                # add to results
+                yield (self.pps[0].doc, score)
+            # resume scanning
+            self.pps[-1].advance()
+                
+                
+        
+
+class ExactPhraseScorer(PhraseScorer):
+    
+    def phraseFreq(self):
+        for pp in self.pps:
+            pp.firstPosition()
+        self.pps.sort()
+        freq = 0.0
+        
+        init = 0
+        # the 'init' bits are to simulate a do-while loop :-/
+        while init == 0 or self.pps[-1].nextPosition():
+            while self.pps[0].position < self.pps[-1].position:
+                # scan forward in first
+                init2 = 0
+                while init2 == 0 or self.pps[0].position < self.pps[-1].position:
+                    if not self.pps[0].nextPosition():
+                        return freq
+                    init2 = 1
+                    
+                self.pps.append(self.pps.pop(0))
+            # all equal: a match
+            freq += 1
+            init = 1
+            
+        return freq
+        
+
+class PhrasePositions(object):
+
+    def __init__(self, t, o):
+        self.tp = t
+        self.offset = o
+        
+        self.position = 0
+        self.count = 0
+        self.doc = 0
+        self.tpiter = iter(t)
+        self.advance()
+        
+        
+    def firstPosition(self):
+        self.count = self.tp.frq
+        self.nextPosition()
+        
+        
+    def advance(self):
+        """Increments to next doc"""
+        
+        for doc, frq, nextPos in self.tpiter:
+            self.doc = doc
+            self.frq = frq
+            self._nextPos = nextPos
+            self.position = 0
+            return
+        else:
+            # close stream
+            self.tp.close()
+            # sentinel value
+            self.doc = sys.maxint
+            return
+        
+        
+    def nextPosition(self):
+        if self.count > 0:
+            self.count -= 1
+            # read subsequent positions
+            self.position = self._nextPos.next() - self.offset
+            return True
+        else:
+            self.count -= 1
+            return False
+        
+                
+    def __repr__(self):
+        res = '<pp>d:' + str(self.doc) + ' p:' + str(self.position) + ' o:' + str(self.offset)
+        return res
+
+    def __lt__(this, that):
+        if this.doc == that.doc:
+            return this.position < that.position
+        else:
+            return this.doc < that.doc