annotate MoinMoin/support/lupy/index/documentwriter.py @ 0:77665d8e2254

tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0 (automatically generated log message) imported from: moin--main--1.5--base-0
author Thomas Waldmann <tw-public@gmx.de>
date Thu, 22 Sep 2005 15:09:50 +0000
parents
children dbb7c93f21b7
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
1 # This module is part of the Lupy project and is Copyright 2003 Amir
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
2 # Bakhtiar (amir@divmod.org). This is free software; you can redistribute
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
3 # it and/or modify it under the terms of version 2.1 of the GNU Lesser
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
4 # General Public License as published by the Free Software Foundation.
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
5
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
6 from StringIO import StringIO
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
7 from array import array
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
8 import re, unicodedata
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
9 from MoinMoin.support.lupy.search import similarity
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
10 from MoinMoin.support.lupy.index import field, term
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
11
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
12 def standardTokenizer(string):
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
13 """Yield a stream of downcased words from a string."""
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
14 r = re.compile("\\w+", re.U)
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
15 tokenstream = re.finditer(r, string)
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
16 for m in tokenstream:
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
17 yield m.group().lower()
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
18
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
19 class DocumentWriter(object):
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
20
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
21 def __init__(self, directory, analyzer=None, mfl=None):
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
22 self.directory = directory
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
23 self.maxFieldLength = mfl
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
24 self.postingTable = {}
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
25 self.termBuffer = term.Term('','')
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
26 self.analyzer=analyzer or standardTokenizer
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
27
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
28 def addDocument(self, segment, doc):
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
29 # Write field names
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
30 fi = self.fieldInfos = field.FieldInfos()
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
31 fi.add(doc)
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
32 fi.writeDir(self.directory, segment + '.fnm')
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
33
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
34 # Write field values
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
35 fieldsWriter = field.FieldsWriter(self.directory,
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
36 segment,
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
37 self.fieldInfos)
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
38 try:
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
39 fieldsWriter.addDocument(doc)
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
40 finally:
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
41 fieldsWriter.close()
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
42
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
43 # Invert doc into postingTable
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
44 self.postingTable = {}
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
45 self.fieldLengths = [0] * (len(self.fieldInfos))
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
46 self.invertDocument(doc)
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
47
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
48 # Sort postingTable into an array
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
49 postings = self.sortPostingTable()
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
50
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
51
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
52 # Write postings
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
53 self.writePostings(postings, segment)
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
54
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
55 # Write noms of indexed files
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
56 self.writeNorms(doc, segment)
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
57
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
58
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
59 def invertDocument(self, doc):
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
60 fields = doc.fields()
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
61 for field in doc.fields():
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
62 fieldName = field.name()
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
63 fieldNumber = self.fieldInfos.fieldNumber(fieldName)
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
64
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
65 position = self.fieldLengths[fieldNumber] # Position in field
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
66
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
67 if field.isIndexed:
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
68 if not field.isTokenized:
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
69 # Untokenized
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
70 self.addPosition(fieldName, field.stringValue(), position)
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
71 position += 1
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
72 else:
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
73 # Find or make a reader
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
74 if field.readerValue() is not None:
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
75 val = field.readerValue().read()
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
76 elif field.stringValue() is not None:
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
77 val = field.stringValue()
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
78 else:
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
79 raise Exception, 'Field must have either a String or Reader value'
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
80
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
81 for tok in self.analyzer(val):
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
82 self.addPosition(fieldName, tok, position)
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
83 position += 1
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
84
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
85 if self.maxFieldLength and (position > self.maxFieldLength):
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
86 break
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
87
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
88 self.fieldLengths[fieldNumber] = position
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
89
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
90
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
91 def addPosition(self, field, text, position):
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
92 self.termBuffer.set(field, text)
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
93
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
94 ti = self.postingTable.get(self.termBuffer, None)
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
95
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
96 if ti is not None:
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
97 freq = ti.freq
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
98 ti.positions.append(position)
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
99 ti.freq = freq + 1
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
100 else:
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
101 trm = term.Term(field, text, False)
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
102 self.postingTable[trm] = Posting(trm, position)
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
103
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
104
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
105 def sortPostingTable(self):
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
106 arr = self.postingTable.values()
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
107 arr.sort()
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
108 return arr
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
109
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
110
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
111 def writePostings(self, postings, segment):
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
112 freq = None
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
113 prox = None
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
114 tis = None
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
115
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
116 try:
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
117 freq = self.directory.createFile(segment + '.frq')
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
118 prox = self.directory.createFile(segment + '.prx')
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
119
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
120 tis = term.TermInfosWriter(self.directory,
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
121 segment,
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
122 self.fieldInfos)
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
123 ti = term.TermInfo()
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
124
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
125 for posting in postings:
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
126 # print 'writing', posting, posting.term
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
127 # Add entry to the dictionary with pointers to prox and freq files
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
128 ti.set(1, freq.getFilePointer(), prox.getFilePointer())
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
129 tis.add(posting.term, ti)
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
130
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
131 # Add an entry to the freq file
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
132 f = posting.freq
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
133 if f == 1: # optimize freq == 1
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
134 freq.writeVInt(1) # set low bit of doc num
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
135 else:
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
136 freq.writeVInt(0) # the document number
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
137 freq.writeVInt(f) # frequency in doc
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
138
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
139 lastPosition = 0
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
140 positions = posting.positions
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
141
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
142 for position in positions:
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
143 prox.writeVInt(position - lastPosition)
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
144 lastPosition = position
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
145
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
146 finally:
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
147 if freq is not None:
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
148 freq.close()
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
149 if prox is not None:
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
150 prox.close()
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
151 if tis is not None:
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
152 tis.close()
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
153
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
154
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
155 def writeNorms(self, doc, segment):
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
156 for field in doc.fields():
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
157 if field.isIndexed:
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
158 fieldNumber = self.fieldInfos.fieldNumber(field.name())
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
159 norm = self.directory.createFile(segment +
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
160 '.f' + str(fieldNumber))
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
161 try:
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
162 norm.writeByte(similarity.normInt(self.fieldLengths[fieldNumber]))
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
163 finally:
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
164 norm.close()
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
165
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
166
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
167 class Posting(object):
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
168
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
169 def __init__(self, t, position):
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
170 self.term = t
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
171 self.freq = 1
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
172 self.positions = array('i',[1])
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
173 self.positions[0] = position
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
174
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
175 def __repr__(self):
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
176 s = '<Posting:'
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
177 s += str(self.term) + '>'
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
178 return s
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
179
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
180 def __cmp__(self, other):
77665d8e2254 tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
Thomas Waldmann <tw-public@gmx.de>
parents:
diff changeset
181 return cmp(self.term, other.term)