view MoinMoin/support/lupy/index/documentwriter.py @ 0:77665d8e2254

tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0 (automatically generated log message) imported from: moin--main--1.5--base-0
author Thomas Waldmann <tw-public@gmx.de>
date Thu, 22 Sep 2005 15:09:50 +0000
parents
children dbb7c93f21b7
line wrap: on
line source
# This module is part of the Lupy project and is Copyright 2003 Amir
# Bakhtiar (amir@divmod.org). This is free software; you can redistribute
# it and/or modify it under the terms of version 2.1 of the GNU Lesser
# General Public License as published by the Free Software Foundation.

from StringIO import StringIO
from array import array
import re, unicodedata
from MoinMoin.support.lupy.search import similarity
from MoinMoin.support.lupy.index import field, term

def standardTokenizer(string):
    """Lazily generate the lowercased word tokens contained in a string.

    Tokens are the successive non-overlapping matches of the word
    pattern (compiled with the re.U flag), yielded in order of
    appearance after being converted with lower().
    """
    word_pattern = re.compile("\\w+", re.U)
    for match in word_pattern.finditer(string):
        token = match.group()
        yield token.lower()
        
class DocumentWriter(object):
    """Invert one document and write it to a directory as a segment.

    For a given segment name the writer emits the field names
    (``.fnm``), the field values, the term dictionary, the frequency
    (``.frq``) and position (``.prx``) files, and one norm file per
    indexed field (``.f<fieldNumber>``).
    """

    def __init__(self, directory, analyzer=None, mfl=None):
        """Set up a writer bound to ``directory``.

        directory -- object used to create the segment files; it must
                     provide ``createFile(name)``.
        analyzer  -- callable taking the field text and returning an
                     iterable of tokens; falls back to
                     ``standardTokenizer`` when omitted or falsy.
        mfl       -- maximum field length used to stop tokenizing a
                     field early; a falsy value disables the limit.
        """
        self.directory = directory
        self.maxFieldLength = mfl
        self.postingTable = {}
        # Reusable term used as the lookup key in addPosition
        self.termBuffer = term.Term('', '')
        self.analyzer = analyzer or standardTokenizer

    def addDocument(self, segment, doc):
        """Write ``doc`` to the directory under the name ``segment``."""
        # Write field names
        fi = self.fieldInfos = field.FieldInfos()
        fi.add(doc)
        fi.writeDir(self.directory, segment + '.fnm')

        # Write field values
        fieldsWriter = field.FieldsWriter(self.directory,
                                          segment,
                                          self.fieldInfos)
        try:
            fieldsWriter.addDocument(doc)
        finally:
            fieldsWriter.close()

        # Invert doc into postingTable
        self.postingTable = {}
        self.fieldLengths = [0] * len(self.fieldInfos)
        self.invertDocument(doc)

        # Sort postingTable into an array
        postings = self.sortPostingTable()

        # Write postings
        self.writePostings(postings, segment)

        # Write norms of indexed fields
        self.writeNorms(doc, segment)

    def invertDocument(self, doc):
        """Fill the posting table with the terms of every indexed field.

        Untokenized fields contribute their string value as a single
        term; tokenized fields are passed through the analyzer.  The
        running position of each field number is kept in
        ``self.fieldLengths``.

        Raises Exception when a tokenized field has neither a reader
        value nor a string value.
        """
        # The loop variable is deliberately not called ``field`` so that it
        # does not shadow the imported ``field`` module.
        for fld in doc.fields():
            fieldName = fld.name()
            fieldNumber = self.fieldInfos.fieldNumber(fieldName)

            # Resume where an earlier field with the same number stopped
            position = self.fieldLengths[fieldNumber]

            if fld.isIndexed:
                if not fld.isTokenized:
                    # Untokenized: the whole value is one term
                    self.addPosition(fieldName, fld.stringValue(), position)
                    position += 1
                else:
                    # Prefer the reader value, fall back to the string value
                    reader = fld.readerValue()
                    if reader is not None:
                        val = reader.read()
                    else:
                        val = fld.stringValue()
                        if val is None:
                            raise Exception('Field must have either a String or Reader value')

                    for tok in self.analyzer(val):
                        self.addPosition(fieldName, tok, position)
                        position += 1

                        # NOTE(review): the check runs after the increment,
                        # so a field starting at position 0 gets up to
                        # maxFieldLength + 1 tokens - confirm this is intended.
                        if self.maxFieldLength and (position > self.maxFieldLength):
                            break

            self.fieldLengths[fieldNumber] = position

    def addPosition(self, field, text, position):
        """Record one occurrence of the term (field, text) at ``position``.

        An existing posting gets the position appended and its frequency
        incremented; otherwise a new Posting is stored under a new Term.
        """
        self.termBuffer.set(field, text)

        ti = self.postingTable.get(self.termBuffer)

        if ti is not None:
            ti.positions.append(position)
            ti.freq += 1
        else:
            # The reusable buffer must not be stored as a key, so build
            # a separate Term for the table entry.
            trm = term.Term(field, text, False)
            self.postingTable[trm] = Posting(trm, position)

    def sortPostingTable(self):
        """Return the values of the posting table as a sorted list."""
        # sorted() works whether values() returns a list or a view
        return sorted(self.postingTable.values())

    def writePostings(self, postings, segment):
        """Write the term dictionary, ``.frq`` and ``.prx`` files.

        postings -- iterable of postings, written in the given order.
        segment  -- segment name used as the file name prefix.
        """
        freq = None
        prox = None
        tis = None

        try:
            freq = self.directory.createFile(segment + '.frq')
            prox = self.directory.createFile(segment + '.prx')

            tis = term.TermInfosWriter(self.directory,
                                       segment,
                                       self.fieldInfos)
            ti = term.TermInfo()

            for posting in postings:
                # Add entry to the dictionary with pointers to prox and freq files
                ti.set(1, freq.getFilePointer(), prox.getFilePointer())
                tis.add(posting.term, ti)

                # Add an entry to the freq file
                f = posting.freq
                if f == 1:                  # optimize freq == 1
                    freq.writeVInt(1)       # set low bit of doc num
                else:
                    freq.writeVInt(0)       # the document number
                    freq.writeVInt(f)       # frequency in doc

                # Positions are written as deltas from the previous one
                lastPosition = 0
                for position in posting.positions:
                    prox.writeVInt(position - lastPosition)
                    lastPosition = position

        finally:
            # Attempt every close even when an earlier one raises, so that
            # one failing file does not leave the others open.
            try:
                if freq is not None:
                    freq.close()
            finally:
                try:
                    if prox is not None:
                        prox.close()
                finally:
                    if tis is not None:
                        tis.close()

    def writeNorms(self, doc, segment):
        """Write one norm byte per indexed field of ``doc``.

        Each indexed field gets a file named ``segment + '.f' +
        str(fieldNumber)`` holding ``similarity.normInt`` of the length
        recorded for that field number.
        """
        for fld in doc.fields():
            if fld.isIndexed:
                fieldNumber = self.fieldInfos.fieldNumber(fld.name())
                norm = self.directory.createFile(segment +
                                                 '.f' + str(fieldNumber))
                try:
                    norm.writeByte(similarity.normInt(self.fieldLengths[fieldNumber]))
                finally:
                    norm.close()


class Posting(object):
    """Occurrences of one term within the document being inverted.

    Attributes:
        term      -- the term this posting belongs to
        freq      -- number of occurrences recorded so far
        positions -- array of type 'i' holding the recorded positions
    """

    def __init__(self, t, position):
        """Create a posting for term ``t`` with a first occurrence at ``position``."""
        self.term = t
        self.freq = 1
        self.positions = array('i', [position])

    def __repr__(self):
        return '<Posting:' + str(self.term) + '>'

    def __lt__(self, other):
        """Order postings by their terms.

        Sorting relies on ``<``, so this keeps postings sortable on
        interpreters that do not consult __cmp__.
        """
        return self.term < other.term

    def __cmp__(self, other):
        """Three-way comparison by term, written without the cmp builtin."""
        return (self.term > other.term) - (self.term < other.term)