comparison MoinMoin/support/lupy/index/documentwriter.py @ 0:77665d8e2254

tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0 (automatically generated log message) imported from: moin--main--1.5--base-0
author Thomas Waldmann <tw-public@gmx.de>
date Thu, 22 Sep 2005 15:09:50 +0000
parents
children dbb7c93f21b7
comparison
equal deleted inserted replaced
-1:000000000000 0:77665d8e2254
1 # This module is part of the Lupy project and is Copyright 2003 Amir
2 # Bakhtiar (amir@divmod.org). This is free software; you can redistribute
3 # it and/or modify it under the terms of version 2.1 of the GNU Lesser
4 # General Public License as published by the Free Software Foundation.
5
6 from StringIO import StringIO
7 from array import array
8 import re, unicodedata
9 from MoinMoin.support.lupy.search import similarity
10 from MoinMoin.support.lupy.index import field, term
11
# Compiled once at import time; ``re.U`` makes ``\w`` match Unicode word
# characters, not only ASCII ones.
_WORD_RE = re.compile(r"\w+", re.U)


def standardTokenizer(string):
    """Yield a stream of downcased words from a string.

    A "word" is a maximal run of ``\\w`` characters (letters, digits and
    underscore, Unicode-aware).  Everything else acts as a separator and
    is dropped.  Tokens are produced lazily, in order of appearance.
    """
    for match in _WORD_RE.finditer(string):
        yield match.group().lower()
18
class DocumentWriter(object):
    """Invert one document and write it to ``directory`` as a segment.

    Attributes set by ``__init__``:
        directory: object providing ``createFile(name)``; also handed to
            the field and term writers.
        maxFieldLength: optional limit checked while tokenizing a field
            (``None``/``0`` disables the check).
        postingTable: dict mapping ``term.Term`` -> ``Posting`` for the
            document currently being inverted.
        termBuffer: a single reusable ``term.Term`` used only as a lookup
            key into ``postingTable``.
        analyzer: callable taking a string and returning an iterable of
            tokens; defaults to ``standardTokenizer``.

    ``fieldInfos`` and ``fieldLengths`` are created by ``addDocument`` and
    are required by the other methods.
    """

    def __init__(self, directory, analyzer=None, mfl=None):
        self.directory = directory
        self.maxFieldLength = mfl
        self.postingTable = {}
        self.termBuffer = term.Term('', '')
        self.analyzer = analyzer or standardTokenizer

    def addDocument(self, segment, doc):
        """Write ``doc`` to the files of the segment named ``segment``."""
        # Write field names
        fi = self.fieldInfos = field.FieldInfos()
        fi.add(doc)
        fi.writeDir(self.directory, segment + '.fnm')

        # Write field values
        fieldsWriter = field.FieldsWriter(self.directory,
                                          segment,
                                          self.fieldInfos)
        try:
            fieldsWriter.addDocument(doc)
        finally:
            fieldsWriter.close()

        # Invert doc into postingTable
        self.postingTable = {}
        self.fieldLengths = [0] * (len(self.fieldInfos))
        self.invertDocument(doc)

        # Sort postingTable into an array
        postings = self.sortPostingTable()

        # Write postings
        self.writePostings(postings, segment)

        # Write norms of indexed fields
        self.writeNorms(doc, segment)

    def invertDocument(self, doc):
        """Fill ``postingTable`` and ``fieldLengths`` from ``doc``'s fields.

        Raises:
            Exception: if an indexed, tokenized field has neither a reader
                value nor a string value.
        """
        # ``fld`` (not ``field``) so the imported ``field`` module is not
        # shadowed inside this method.
        for fld in doc.fields():
            fieldName = fld.name()
            fieldNumber = self.fieldInfos.fieldNumber(fieldName)

            # Start from the length recorded so far for this field number,
            # so repeated fields with the same name keep counting upwards.
            position = self.fieldLengths[fieldNumber]

            if fld.isIndexed:
                if not fld.isTokenized:
                    # Untokenized: the whole string value is a single term
                    self.addPosition(fieldName, fld.stringValue(), position)
                    position += 1
                else:
                    # Find the text to tokenize: reader first, then string
                    if fld.readerValue() is not None:
                        val = fld.readerValue().read()
                    elif fld.stringValue() is not None:
                        val = fld.stringValue()
                    else:
                        raise Exception('Field must have either a String or Reader value')

                    for tok in self.analyzer(val):
                        self.addPosition(fieldName, tok, position)
                        position += 1

                        # Checked after the increment, so the token that
                        # crosses the limit has already been recorded.
                        if self.maxFieldLength and (position > self.maxFieldLength):
                            break

            # No-op for non-indexed fields (position is unchanged there)
            self.fieldLengths[fieldNumber] = position

    def addPosition(self, field, text, position):
        """Record one occurrence of the term ``(field, text)`` at ``position``."""
        # Reuse termBuffer as the lookup key to avoid allocating a Term
        # for every token that has already been seen.
        self.termBuffer.set(field, text)
        ti = self.postingTable.get(self.termBuffer, None)

        if ti is not None:
            ti.positions.append(position)
            ti.freq += 1
        else:
            # First occurrence: the key must be a fresh Term because
            # termBuffer is overwritten on the next call.
            trm = term.Term(field, text, False)
            self.postingTable[trm] = Posting(trm, position)

    def sortPostingTable(self):
        """Return the postings of ``postingTable`` as a new sorted list."""
        # list(...) keeps this working whether values() returns a list or
        # a view; the table itself is left untouched.
        arr = list(self.postingTable.values())
        arr.sort()
        return arr

    def writePostings(self, postings, segment):
        """Write ``postings`` to the ``.frq``/``.prx`` files and term infos.

        ``postings`` is iterated in the given order; ``addDocument`` passes
        the sorted list from ``sortPostingTable``.
        """
        freq = None
        prox = None
        tis = None

        try:
            freq = self.directory.createFile(segment + '.frq')
            prox = self.directory.createFile(segment + '.prx')

            tis = term.TermInfosWriter(self.directory,
                                       segment,
                                       self.fieldInfos)
            ti = term.TermInfo()

            for posting in postings:
                # Add entry to the dictionary with pointers to prox and
                # freq files.
                # NOTE(review): the leading 1 is presumably the document
                # frequency (one document per segment here) - confirm
                # against term.TermInfo.set.
                ti.set(1, freq.getFilePointer(), prox.getFilePointer())
                tis.add(posting.term, ti)

                # Add an entry to the freq file
                f = posting.freq
                if f == 1:                  # optimize freq == 1
                    freq.writeVInt(1)       # set low bit of doc num
                else:
                    freq.writeVInt(0)       # the document number
                    freq.writeVInt(f)       # frequency in doc

                # Positions are written as deltas from the previous one
                lastPosition = 0
                for position in posting.positions:
                    prox.writeVInt(position - lastPosition)
                    lastPosition = position
        finally:
            # Nested so that a failing close() does not prevent the
            # remaining outputs from being closed.
            try:
                if freq is not None:
                    freq.close()
            finally:
                try:
                    if prox is not None:
                        prox.close()
                finally:
                    if tis is not None:
                        tis.close()

    def writeNorms(self, doc, segment):
        """Write one norm byte per indexed field to ``<segment>.f<number>``."""
        for fld in doc.fields():
            if fld.isIndexed:
                fieldNumber = self.fieldInfos.fieldNumber(fld.name())
                norm = self.directory.createFile(segment +
                                                 '.f' + str(fieldNumber))
                try:
                    norm.writeByte(similarity.normInt(self.fieldLengths[fieldNumber]))
                finally:
                    norm.close()
165
166
class Posting(object):
    """Occurrences of one term within a single document.

    Attributes:
        term: the term this posting describes; also the sort key.
        freq: number of occurrences; starts at 1 and is updated by
            whoever appends further positions.
        positions: ``array('i')`` of positions, seeded with the first one.
    """

    def __init__(self, t, position):
        self.term = t
        self.freq = 1
        self.positions = array('i', [position])

    def __repr__(self):
        return '<Posting:' + str(self.term) + '>'

    def __lt__(self, other):
        # list.sort() only needs "<"; defining it keeps postings sortable
        # on interpreters that ignore __cmp__.
        return self.term < other.term

    def __cmp__(self, other):
        # Same sign convention as the cmp() builtin (-1, 0 or 1), written
        # without cmp() so an explicit call works where cmp() is missing.
        return (self.term > other.term) - (self.term < other.term)