comparison MoinMoin/support/lupy/search/phrase.py @ 0:77665d8e2254

tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0 (automatically generated log message) imported from: moin--main--1.5--base-0
author Thomas Waldmann <tw-public@gmx.de>
date Thu, 22 Sep 2005 15:09:50 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:77665d8e2254
1 # This module is part of the Lupy project and is Copyright 2003 Amir
2 # Bakhtiar (amir@divmod.org). This is free software; you can redistribute
3 # it and/or modify it under the terms of version 2.1 of the GNU Lesser
4 # General Public License as published by the Free Software Foundation.
5
6
7 from bisect import insort
8 from MoinMoin.support.lupy.search import term, similarity
9 import sys
10
11 class PhraseQuery:
12 """A query that matches documents containing a particular
13 sequence of terms. This may be combined with other terms
14 with a L{lupy.search.boolean.BooleanQuery}.
15 """
16
17 def __init__(self):
18 """Constructs an empty phrase query."""
19
20 self.idf = 0.0
21 self.slop = 0
22 self.terms = []
23 self.weight = 0.0
24 self.boost = 1.0
25
26 def add(self, term):
27 """Adds a term to the end of the query phrase."""
28 if len(self.terms) == 0:
29 self.field = term.field()
30
31 elif term.field() != self.field:
32 raise Exception, 'All phrase terms must be in the same field: ' + str(term)
33
34 self.terms.append(term)
35
36
37 def getSlop(self):
38 """Returns the slop. See setSlop()."""
39 return self.slop
40
41
42 def normalize(self, norm):
43 # normalize for query
44 self.weight *= norm
45 # factor from document
46 self.weight *= self.idf
47
48
49 def scorer(self, reader):
50 # optimize zero-term case
51 if len(self.terms) == 0:
52 return None
53
54 # optimize one-term case
55 if len(self.terms) == 1:
56 t = self.terms[0]
57 docs = reader.termDocsTerm(t)
58 if docs is None:
59 return None
60 return term.TermScorer(docs, reader.normsField(t.field()), self.weight)
61
62 tps = []
63
64 for t in self.terms:
65 p = reader.termPositionsTerm(t)
66 if p is None:
67 # I am not sure how this is ever reached?
68 return None
69 tps.append(p)
70
71 if self.slop == 0:
72 return ExactPhraseScorer(tps, reader.normsField(self.field),
73 self.weight)
74 else:
75 return SloppyPhraseScorer(tps, reader.norms(self.field),
76 self.weight)
77
78
79 def sumOfSquaredWeights(self, searcher):
80 # sum term IDFs
81 for term in self.terms:
82 self.idf += similarity.idfTerm(term, searcher)
83
84 self.weight = self.idf * self.boost
85 # square term weights
86 return self.weight * self.weight
87
88
89 def toString(self, f):
90 """Prints a user-readable version of this query"""
91
92 buffer = ''
93 if not self.field == f :
94 buffer += f + ':'
95 buffer += '\\'
96
97 for term in self.terms[:-1]:
98 buffer += term.text() + ' '
99
100 buffer += self.terms[-1].text() + '\\'
101
102 if self.slop != 0:
103 buffer += '~' + str(self.slop)
104
105 if self.boost != 1.0:
106 buffer += '^' + str(self.boost)
107
108 return buffer
109
110
111 class PhraseScorer:
112
113 def __init__(self, tps, n, w):
114 self.norms = n
115 self.weight = w
116
117 self.pps = [PhrasePositions(tp, i) for i, tp in enumerate(tps)]
118 self.pps.sort()
119
120 def phraseQuery(self):
121 """Subclass responsibility"""
122
123 def score(self, end):
124 # find doc w/ all the terms
125 while self.pps[-1].doc < end:
126 while self.pps[0].doc < self.pps[-1].doc:
127 self.pps[0].advance()
128 while self.pps[0].doc < self.pps[-1].doc:
129 self.pps[0].advance()
130 self.pps.append(self.pps.pop(0))
131 if self.pps[-1].doc >= end:
132 return
133
134 # found doc with all terms
135 # check for phrase
136 freq = self.phraseFreq()
137
138 if freq > 0.0:
139 # compute score
140 score = similarity.tf(freq) * self.weight
141 # normalize
142 score *= similarity.normByte(self.norms[self.pps[0].doc])
143 # add to results
144 yield (self.pps[0].doc, score)
145 # resume scanning
146 self.pps[-1].advance()
147
148
149
150
151 class ExactPhraseScorer(PhraseScorer):
152
153 def phraseFreq(self):
154 for pp in self.pps:
155 pp.firstPosition()
156 self.pps.sort()
157 freq = 0.0
158
159 init = 0
160 # the 'init' bits are to simulate a do-while loop :-/
161 while init == 0 or self.pps[-1].nextPosition():
162 while self.pps[0].position < self.pps[-1].position:
163 # scan forward in first
164 init2 = 0
165 while init2 == 0 or self.pps[0].position < self.pps[-1].position:
166 if not self.pps[0].nextPosition():
167 return freq
168 init2 = 1
169
170 self.pps.append(self.pps.pop(0))
171 # all equal: a match
172 freq += 1
173 init = 1
174
175 return freq
176
177
178 class PhrasePositions(object):
179
180 def __init__(self, t, o):
181 self.tp = t
182 self.offset = o
183
184 self.position = 0
185 self.count = 0
186 self.doc = 0
187 self.tpiter = iter(t)
188 self.advance()
189
190
191 def firstPosition(self):
192 self.count = self.tp.frq
193 self.nextPosition()
194
195
196 def advance(self):
197 """Increments to next doc"""
198
199 for doc, frq, nextPos in self.tpiter:
200 self.doc = doc
201 self.frq = frq
202 self._nextPos = nextPos
203 self.position = 0
204 return
205 else:
206 # close stream
207 self.tp.close()
208 # sentinel value
209 self.doc = sys.maxint
210 return
211
212
213 def nextPosition(self):
214 if self.count > 0:
215 self.count -= 1
216 # read subsequent positions
217 self.position = self._nextPos.next() - self.offset
218 return True
219 else:
220 self.count -= 1
221 return False
222
223
224 def __repr__(self):
225 res = '<pp>d:' + str(self.doc) + ' p:' + str(self.position) + ' o:' + str(self.offset)
226 return res
227
228 def __lt__(this, that):
229 if this.doc == that.doc:
230 return this.position < that.position
231 else:
232 return this.doc < that.doc