Mercurial > moin > 1.9
view MoinMoin/support/lupy/search/similarity.py @ 0:77665d8e2254
tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0
(automatically generated log message)
imported from: moin--main--1.5--base-0
author | Thomas Waldmann <tw-public@gmx.de> |
---|---|
date | Thu, 22 Sep 2005 15:09:50 +0000 |
parents | |
children |
line wrap: on
line source
# This module is part of the Lupy project and is Copyright 2003 Amir
# Bakhtiar (amir@divmod.org). This is free software; you can redistribute
# it and/or modify it under the terms of version 2.1 of the GNU Lesser
# General Public License as published by the Free Software Foundation.

import math

# Lookup table mapping a norm byte (0..255) to the float value byte/255.0.
# It is built as a real list (not a bare map() call) so that it can be
# indexed on Python 3 as well, where map() returns a one-shot iterator.
# The loop variable is underscore-prefixed so that it is not re-exported by
# "from ... import *" on Python 2, where comprehension variables leak.
NORM_TABLE = [_byte / 255.0 for _byte in range(256)]


def coord(overlap, maxOverlap):
    """Return the ratio overlap / maxOverlap as a float.

    Raises ZeroDivisionError if maxOverlap is zero.
    """
    return overlap / float(maxOverlap)


def idf(docFreq, numDocs):
    """Return the inverse document frequency factor.

    Computed as log(numDocs / (docFreq + 1)) + 1. When the ratio is zero
    (numDocs == 0) it is replaced by 1.0 so that log() is never called
    with zero; the result is then 1.0.
    """
    ratio = numDocs / (docFreq + 1.0)
    return math.log(ratio or 1.0) + 1.0


def idfTerm(term, searcher):
    """Return idf() for term, using counts obtained from searcher.

    Use maxDoc() instead of numDocs() because it is proportional to
    docFreq(), i.e., when one is inaccurate, so is the other, and in
    the same way.
    """
    return idf(searcher.docFreq(term), searcher.maxDoc())


def normByte(normByte):
    """Un-scale the byte encoding of a norm into a float.

    The result is approximately 1/sqrt(numTerms). Only the low 8 bits
    of the argument are used to index NORM_TABLE.
    """
    return NORM_TABLE[normByte & 0xFF]


def normInt(numTerms):
    """Compute the normalization byte for a document.

    numTerms is the total number of terms contained in the document.
    These values are stored in an index and used by the search code.

    Scales 1/sqrt(numTerms) into a byte, i.e. 255/sqrt(numTerms).
    math.ceil is used to ensure that even very long documents don't get
    a zero norm byte, as that is reserved for zero-length documents and
    deleted documents.
    """
    if numTerms == 0:
        return 0
    # int() is kept because math.ceil returns a float on Python 2.
    return int(math.ceil(255.0 / math.sqrt(numTerms))) & 0xFF


def tf(freq):
    """Return the term-frequency factor: the square root of freq."""
    # math.sqrt already returns a float, so no extra conversion is needed.
    return math.sqrt(freq)