view MoinMoin/support/lupy/index/term.py @ 0:77665d8e2254

tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0 (automatically generated log message) imported from: moin--main--1.5--base-0
author Thomas Waldmann <tw-public@gmx.de>
date Thu, 22 Sep 2005 15:09:50 +0000
parents
children
line wrap: on
line source
# This module is part of the Lupy project and is Copyright 2003 Amir
# Bakhtiar (amir@divmod.org). This is free software; you can redistribute
# it and/or modify it under the terms of version 2.1 of the GNU Lesser
# General Public License as published by the Free Software Foundation.

class Term(object):
    
    def __init__(self, fld, txt, intern=False):
        self.set(fld, txt)

    def __cmp__(self, other):
        """Compares two terms, returning an integer which is less than zero iff this
        term belongs after the argument, equal zero iff this term is equal to the
        argument, and greater than zero iff this term belongs after the argument.

        The ordering of terms is first by field, then by text."""

        if self.fld == other.fld:
            # fields are interned
            return cmp(self.txt, other.txt)
        else:
            return cmp(self.fld, other.fld)

    def __hash__(self):
        return self._hash
    
    def field(self):
        return self.fld
    
    def readObject(self, inp):
        inp.defaultReadObject()

    def set(self, fld, txt):
        self.fld = fld
        self.txt = txt
        self._hash = hash(fld + txt)

    def text(self):
        return self.txt

    def __repr__(self):
        return 'Term<'+self.fld.encode('utf8')+':'+self.txt.encode('utf8')+'>'

class TermInfo(object):

    def __init__(self):
        self.docFreq = 0
        self.freqPointer = 0
        self.proxPointer = 0

    def set(self, df, fp, pp):
        self.docFreq = df
        self.freqPointer = fp
        self.proxPointer = pp

    def setTo(self, ti):
        self.docFreq = ti.docFreq
        self.freqPointer = ti.freqPointer
        self.proxPointer = ti.proxPointer

    def __repr__(self):
        return '<TermInfo:d:' + str(self.docFreq)+ ' f:' + str(self.freqPointer) +\
               ' p:' + str(self.proxPointer) + '>'


class TermInfosWriter(object):
    INDEX_INTERVAL = 128


    def __init__(self, d, seg, fis, isIndex = False):
        
        self.initialize(d, seg, fis, isIndex)
        
        self.size = 0
        self.lastIndexPointer = 0
        self.lastTerm = Term('','')
        self.lastTi = TermInfo()
        
        if isIndex is False:
            self.other = TermInfosWriter(d, seg, fis, True)
            self.other.other = self

            
    def initialize(self, d, seg, fis, isi):
        self.fieldInfos = fis
        self.isIndex = isi
        if isi is True:
            ext = '.tii'
        else:
            ext = '.tis'
            
        self.output=d.createFile(seg + ext)
        # leave space for size
        self.output.writeInt(0)


    def stringDifference(self, s1, s2):
        prefixLength = min(len(s1), len(s2))
        for i in range(prefixLength):
            if s1[i] != s2[i]:
                return i
        
        return prefixLength


    def add(self, term, ti):
        if not self.isIndex and term <= self.lastTerm:
            raise Exception, "term out of order: " + str(term) + str(self.lastTerm)
        if ti.freqPointer < self.lastTi.freqPointer:
            raise Exception, "freqPointer out of order"
        if ti.proxPointer < self.lastTi.proxPointer:
            raise Exception, "proxPointer out of order"

        if (not self.isIndex and self.size % self.INDEX_INTERVAL == 0):
            # add an index term
            self.other.add(self.lastTerm, self.lastTi)

        # write term
        self.writeTerm(term)
        # write doc freq
        self.output.writeVInt(ti.docFreq)
        # write pointers
        self.output.writeVLong(ti.freqPointer - self.lastTi.freqPointer)
        self.output.writeVLong(ti.proxPointer - self.lastTi.proxPointer)

        if self.isIndex:
            self.output.writeVLong(self.other.output.getFilePointer() - self.lastIndexPointer)
            self.lastIndexPointer = self.other.output.getFilePointer()

        self.lastTi.setTo(ti)
        self.size += 1


    def close(self):
        self.output.seek(0)
        self.output.writeInt(self.size)
        self.output.close()

        if self.isIndex is not True:
            self.other.close()


    def writeTerm(self, term):
        a, b = self.lastTerm.text(), term.text()
        start = self.stringDifference(a, b)
        delta = term.text()[start:]
        # write shared prefix length
        self.output.writeVInt(start)
        # write delta chars
        self.output.writeString(delta)
        # write field num
        i = self.fieldInfos.fieldNumber(term.field())
        self.output.writeVInt(i)
        self.lastTerm = term