Mercurial repository moin/1.9 - changeset 953:3abaabbe631e
Merge with main.
author:   Alexander Schremmer <alex AT alexanderweb DOT de>
date:     Thu, 29 Jun 2006 01:05:53 +0200
parents:  cd3019c751e6 (current diff) 715171e93d79 (diff)
children: a7e98fd10e97
files:    MoinMoin/action/tm.py MoinMoin/lupy.py MoinMoin/script/lupy/__init__.py MoinMoin/script/lupy/build.py MoinMoin/script/lupy/optimize.py MoinMoin/support/lupy/__init__.py MoinMoin/support/lupy/document.py MoinMoin/support/lupy/index/__init__.py MoinMoin/support/lupy/index/documentwriter.py MoinMoin/support/lupy/index/field.py MoinMoin/support/lupy/index/indexwriter.py MoinMoin/support/lupy/index/segment.py MoinMoin/support/lupy/index/segmentmerger.py MoinMoin/support/lupy/index/term.py MoinMoin/support/lupy/index/terminfo.py MoinMoin/support/lupy/indexer.py MoinMoin/support/lupy/search/__init__.py MoinMoin/support/lupy/search/boolean.py MoinMoin/support/lupy/search/camelcase.py MoinMoin/support/lupy/search/fuzzy.py MoinMoin/support/lupy/search/hits.py MoinMoin/support/lupy/search/indexsearcher.py MoinMoin/support/lupy/search/phrase.py MoinMoin/support/lupy/search/prefix.py MoinMoin/support/lupy/search/regularexpression.py MoinMoin/support/lupy/search/similarity.py MoinMoin/support/lupy/search/term.py MoinMoin/support/lupy/store.py MoinMoin/support/lupy/util.py docs/Lupy-0.2.1/LICENSE docs/Lupy-0.2.1/README.txt docs/Lupy-0.2.1/releasenotes.txt
diffstat: 72 files changed, 2783 insertions(+), 5622 deletions(-)
--- a/MoinMoin/Page.py  Sun Jun 18 13:58:08 2006 +0200
+++ b/MoinMoin/Page.py  Thu Jun 29 01:05:53 2006 +0200
@@ -1540,7 +1540,8 @@
                 links = self.parsePageLinks(request)
                 cache.update('\n'.join(links) + '\n', True)
                 return links
-            return cache.content(True).split('\n')
+            links = cache.content(True).split('\n')
+            return [link for link in links if link]
 
     def parsePageLinks(self, request):
         """ Parse page links by formatting with a pagelinks formatter
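A minimal sketch (not MoinMoin code) of why Page.py now filters the cached link list: getPageLinks writes the cache as one link per line plus a trailing newline, so a plain split('\n') always ends with an empty string, which the new list comprehension drops.

    cached = "FrontPage\nHelpContents\n"        # pagelinks cache content, one link per line
    links = cached.split('\n')                  # ['FrontPage', 'HelpContents', '']
    links = [link for link in links if link]    # ['FrontPage', 'HelpContents']
    assert links == ['FrontPage', 'HelpContents']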
--- a/MoinMoin/PageEditor.py  Sun Jun 18 13:58:08 2006 +0200
+++ b/MoinMoin/PageEditor.py  Thu Jun 29 01:05:53 2006 +0200
@@ -967,9 +967,9 @@
         if self.request.cfg.mail_enabled:
             msg = msg + self._notifySubscribers(comment, trivial)
 
-        if self.request.cfg.lupy_search:
-            from MoinMoin import lupy
-            index = lupy.Index(self.request)
+        if self.request.cfg.xapian_search:
+            from MoinMoin import Xapian
+            index = Xapian.Index(self.request)
             # When we have automatic index building, we can add to
             # the queue even if the index is missing.
             if index.exists():
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/MoinMoin/Xapian.py Thu Jun 29 01:05:53 2006 +0200 @@ -0,0 +1,708 @@ +# -*- coding: iso-8859-1 -*- +""" + MoinMoin - xapian indexing search engine + + @copyright: 2006 MoinMoin:ThomasWaldmann, + 2006 MoinMoin:FranzPletz + @license: GNU GPL, see COPYING for details. +""" +debug = True + +import sys, os, re, codecs, errno, time +from pprint import pprint + +import xapian +from MoinMoin.support.xapwrap import document as xapdoc +from MoinMoin.support.xapwrap import index as xapidx +from MoinMoin.parser.text_moin_wiki import Parser as WikiParser + +from MoinMoin.Page import Page +from MoinMoin import config, wikiutil +from MoinMoin.util import filesys, lock + + +class UnicodeQuery(xapian.Query): + def __init__(self, *args, **kwargs): + self.encoding = kwargs.get('encoding', config.charset) + + nargs = [] + for i in args: + if isinstance(i, unicode): + i = i.encode(self.encoding) + nargs.append(i) + + xapian.Query.__init__(self, *nargs, **kwargs) + + +############################################################################## +### Tokenizer +############################################################################## + +class WikiAnalyzer: + singleword = r"[%(u)s][%(l)s]+" % { + 'u': config.chars_upper, + 'l': config.chars_lower, + } + + singleword_re = re.compile(singleword, re.U) + wikiword_re = re.compile(WikiParser.word_rule, re.U) + + token_re = re.compile( + r"(?P<company>\w+[&@]\w+)|" + # company names like AT&T and Excite@Home. + r"(?P<email>\w+([.-]\w+)*@\w+([.-]\w+)*)|" + # email addresses + r"(?P<hostname>\w+(\.\w+)+)|" + # hostnames + r"(?P<num>(\w+[-/.,])*\w*\d\w*([-/.,]\w+)*)|" + # version numbers + r"(?P<acronym>(\w\.)+)|" + # acronyms: U.S.A., I.B.M., etc. + r"(?P<word>\w+)", # words (including WikiWords) + re.U) + + dot_re = re.compile(r"[-_/,.]") + mail_re = re.compile(r"[-_/,.]|(@)") + + # XXX limit stuff above to xapdoc.MAX_KEY_LEN + # WORD_RE = re.compile('\\w{1,%i}' % MAX_KEY_LEN, re.U) + + def tokenize(self, value): + """Yield a stream of lower cased words from a string. 
+ value must be an UNICODE object or a list of unicode objects + """ + def enc(uc): + """ 'encode' unicode results into whatever xapian / xapwrap wants """ + lower = uc.lower() + return lower + + if isinstance(value, list): # used for page links + for v in value: + yield enc(v) + else: + tokenstream = re.finditer(self.token_re, value) + for m in tokenstream: + if m.group("acronym"): + yield enc(m.group("acronym").replace('.', '')) + elif m.group("company"): + yield enc(m.group("company")) + elif m.group("email"): + for word in self.mail_re.split(m.group("email")): + if word: + yield enc(word) + elif m.group("hostname"): + for word in self.dot_re.split(m.group("hostname")): + yield enc(word) + elif m.group("num"): + for word in self.dot_re.split(m.group("num")): + yield enc(word) + elif m.group("word"): + word = m.group("word") + yield enc(word) + # if it is a CamelCaseWord, we additionally yield Camel, Case and Word + if self.wikiword_re.match(word): + for sm in re.finditer(self.singleword_re, word): + yield enc(sm.group()) + + +############################################################################# +### Indexing +############################################################################# + +class UpdateQueue: + def __init__(self, file, lock_dir): + self.file = file + self.writeLock = lock.WriteLock(lock_dir, timeout=10.0) + self.readLock = lock.ReadLock(lock_dir, timeout=10.0) + + def exists(self): + return os.path.exists(self.file) + + def append(self, pagename): + """ Append a page to queue """ + if not self.writeLock.acquire(60.0): + request.log("can't add %r to xapian update queue: can't lock queue" % + pagename) + return + try: + f = codecs.open(self.file, 'a', config.charset) + try: + f.write(pagename + "\n") + finally: + f.close() + finally: + self.writeLock.release() + + def pages(self): + """ Return list of pages in the queue """ + if self.readLock.acquire(1.0): + try: + return self._decode(self._read()) + finally: + self.readLock.release() + return [] + + def remove(self, pages): + """ Remove pages from the queue + + When the queue is empty, the queue file is removed, so exists() + can tell if there is something waiting in the queue. + """ + if self.writeLock.acquire(30.0): + try: + queue = self._decode(self._read()) + for page in pages: + try: + queue.remove(page) + except ValueError: + pass + if queue: + self._write(queue) + else: + self._removeFile() + return True + finally: + self.writeLock.release() + return False + + # Private ------------------------------------------------------- + + def _decode(self, data): + """ Decode queue data """ + pages = data.splitlines() + return self._filterDuplicates(pages) + + def _filterDuplicates(self, pages): + """ Filter duplicates in page list, keeping the order """ + unique = [] + seen = {} + for name in pages: + if not name in seen: + unique.append(name) + seen[name] = 1 + return unique + + def _read(self): + """ Read and return queue data + + This does not do anything with the data so we can release the + lock as soon as possible, enabling others to update the queue. + """ + try: + f = codecs.open(self.file, 'r', config.charset) + try: + return f.read() + finally: + f.close() + except (OSError, IOError), err: + if err.errno != errno.ENOENT: + raise + return '' + + def _write(self, pages): + """ Write pages to queue file + + Requires queue write locking. 
+ """ + # XXX use tmpfile/move for atomic replace on real operating systems + data = '\n'.join(pages) + '\n' + f = codecs.open(self.file, 'w', config.charset) + try: + f.write(data) + finally: + f.close() + + def _removeFile(self): + """ Remove queue file + + Requires queue write locking. + """ + try: + os.remove(self.file) + except OSError, err: + if err.errno != errno.ENOENT: + raise + + +class Index: + indexValueMap = { + # mapping the value names we can easily fetch from the index to + # integers required by xapian. 0 and 1 are reserved by xapwrap! + 'pagename': 2, + 'attachment': 3, + 'mtime': 4, + 'wikiname': 5, + } + prefixMap = { + # http://svn.xapian.org/*checkout*/trunk/xapian-applications/omega/docs/termprefixes.txt + 'author': 'A', + 'date': 'D', # numeric format: YYYYMMDD or "latest" - e.g. D20050224 or Dlatest + #G newsGroup (or similar entity - e.g. a web forum name) + 'hostname': 'H', + 'keyword': 'K', + 'lang': 'L', # ISO Language code + #M Month (numeric format: YYYYMM) + #N ISO couNtry code (or domaiN name) + #P Pathname + #Q uniQue id + #R Raw (i.e. unstemmed) term + 'title': 'S', # Subject (or title) + 'mimetype': 'T', + 'url': 'U', # full URL of indexed document - if the resulting term would be > 240 + # characters, a hashing scheme is used to prevent overflowing + # the Xapian term length limit (see omindex for how to do this). + #W "weak" (approximately 10 day intervals, taken as YYYYMMD from + # the D term, and changing the last digit to a '2' if it's a '3') + #X longer prefix for user-defined use + 'linkto': 'XLINKTO', # this document links to that document + #Y year (four digits) + } + + + + class LockedException(Exception): + pass + + def __init__(self, request): + self.request = request + cache_dir = request.cfg.cache_dir + self.main_dir = os.path.join(cache_dir, 'xapian') + self.dir = os.path.join(self.main_dir, 'index') + filesys.makeDirs(self.dir) + self.sig_file = os.path.join(self.main_dir, 'complete') + lock_dir = os.path.join(self.main_dir, 'index-lock') + self.lock = lock.WriteLock(lock_dir, + timeout=3600.0, readlocktimeout=60.0) + self.read_lock = lock.ReadLock(lock_dir, timeout=3600.0) + self.queue = UpdateQueue(os.path.join(self.main_dir, "update-queue"), + os.path.join(self.main_dir, 'update-queue-lock')) + + # Disabled until we have a sane way to build the index with a + # queue in small steps. 
+ ## if not self.exists(): + ## self.indexPagesInNewThread(request) + + def exists(self): + """ Check if index exists """ + return os.path.exists(self.sig_file) + + def mtime(self): + return os.path.getmtime(self.dir) + + def _search(self, query): + """ read lock must be acquired """ + while True: + try: + searcher, timestamp = self.request.cfg.xapian_searchers.pop() + if timestamp != self.mtime(): + searcher.close() + else: + break + except IndexError: + searcher = xapidx.ReadOnlyIndex(self.dir) + searcher.configure(self.prefixMap, self.indexValueMap) + timestamp = self.mtime() + break + + hits = searcher.search(query, valuesWanted=['pagename', 'attachment', 'mtime', 'wikiname']) + self.request.cfg.xapian_searchers.append((searcher, timestamp)) + return hits + + def search(self, query): + if not self.read_lock.acquire(1.0): + raise self.LockedException + try: + hits = self._search(query) + finally: + self.read_lock.release() + return hits + + def update_page(self, page): + self.queue.append(page.page_name) + self._do_queued_updates_InNewThread() + + def indexPages(self, files=None, mode='update'): + """ Index all pages (and files, if given) + + Can be called only from a script. To index pages during a user + request, use indexPagesInNewThread. + @arg files: iterator or list of files to index additionally + """ + if not self.lock.acquire(1.0): + self.request.log("can't index: can't acquire lock") + return + try: + request = self._indexingRequest(self.request) + self._index_pages(request, None, files, mode) + finally: + self.lock.release() + + def indexPagesInNewThread(self, files=None, mode='update'): + """ Index all pages in a new thread + + Should be called from a user request. From a script, use indexPages. + """ + if not self.lock.acquire(1.0): + self.request.log("can't index: can't acquire lock") + return + try: + # Prevent rebuilding the index just after it was finished + if self.exists(): + self.lock.release() + return + from threading import Thread + indexThread = Thread(target=self._index_pages, + args=(self._indexingRequest(self.request), self.lock, files, mode)) + indexThread.setDaemon(True) + + # Join the index thread after current request finish, prevent + # Apache CGI from killing the process. + def joinDecorator(finish): + def func(): + finish() + indexThread.join() + return func + + self.request.finish = joinDecorator(self.request.finish) + indexThread.start() + except: + self.lock.release() + raise + + def optimize(self): + pass + + # Private ---------------------------------------------------------------- + + def _do_queued_updates_InNewThread(self): + """ do queued index updates in a new thread + + Should be called from a user request. From a script, use indexPages. + """ + if not self.lock.acquire(1.0): + self.request.log("can't index: can't acquire lock") + return + try: + from threading import Thread + indexThread = Thread(target=self._do_queued_updates, + args=(self._indexingRequest(self.request), self.lock)) + indexThread.setDaemon(True) + + # Join the index thread after current request finish, prevent + # Apache CGI from killing the process. 
+ def joinDecorator(finish): + def func(): + finish() + indexThread.join() + return func + + self.request.finish = joinDecorator(self.request.finish) + indexThread.start() + except: + self.lock.release() + raise + + def _do_queued_updates(self, request, lock=None, amount=5): + """ Assumes that the write lock is acquired """ + try: + writer = xapidx.Index(self.dir, True) + writer.configure(self.prefixMap, self.indexValueMap) + pages = self.queue.pages()[:amount] + for name in pages: + p = Page(request, name) + self._index_page(writer, p, mode='update') + self.queue.remove([name]) + finally: + writer.close() + if lock: + lock.release() + + def contentfilter(self, filename): + """ Get a filter for content of filename and return unicode content. """ + request = self.request + mt = wikiutil.MimeType(filename=filename) + for modulename in mt.module_name(): + try: + execute = wikiutil.importPlugin(request.cfg, 'filter', modulename) + break + except wikiutil.PluginMissingError: + pass + #else: + # raise "Cannot load filter for mimetype." + modulename # XXX + try: + data = execute(self, filename) + if debug: + request.log("Filter %s returned %d characters for file %s" % (modulename, len(data), filename)) + except (OSError, IOError), err: + data = '' + request.log("Filter %s threw error '%s' for file %s" % (modulename, str(err), filename)) + return mt.mime_type(), data + + def test(self, request): + idx = xapidx.ReadOnlyIndex(self.dir) + idx.configure(self.prefixMap, self.indexValueMap) + print idx.search("is") + #for d in docs: + # request.log("%r %r %r" % (d, d.get('attachment'), d.get('pagename'))) + + def _index_file(self, request, writer, filename, mode='update'): + """ index a file as it were a page named pagename + Assumes that the write lock is acquired + """ + fs_rootpage = 'FS' # XXX FS hardcoded + try: + wikiname = request.cfg.interwikiname or 'Self' + itemid = "%s:%s" % (wikiname, os.path.join(fs_rootpage, filename)) + mtime = os.path.getmtime(filename) + mtime = wikiutil.timestamp2version(mtime) + if mode == 'update': + query = xapidx.RawQuery(xapdoc.makePairForWrite('itemid', itemid)) + docs = writer.search(query, valuesWanted=['pagename', 'attachment', 'mtime', 'wikiname', ]) + if docs: + doc = docs[0] # there should be only one + uid = doc['uid'] + docmtime = long(doc['values']['mtime']) + updated = mtime > docmtime + if debug: request.log("uid %r: mtime %r > docmtime %r == updated %r" % (uid, mtime, docmtime, updated)) + else: + uid = None + updated = True + elif mode == 'add': + updated = True + if debug: request.log("%s %r" % (filename, updated)) + if updated: + xitemid = xapdoc.Keyword('itemid', itemid) + mimetype, file_content = self.contentfilter(filename) + xwname = xapdoc.SortKey('wikiname', request.cfg.interwikiname or "Self") + xpname = xapdoc.SortKey('pagename', fs_rootpage) + xattachment = xapdoc.SortKey('attachment', filename) # XXX we should treat files like real pages, not attachments + xmtime = xapdoc.SortKey('mtime', mtime) + title = " ".join(os.path.join(fs_rootpage, filename).split("/")) + xtitle = xapdoc.Keyword('title', title) + xmimetype = xapdoc.TextField('mimetype', mimetype, True) + xcontent = xapdoc.TextField('content', file_content) + doc = xapdoc.Document(textFields=(xcontent, xmimetype, ), + keywords=(xtitle, xitemid, ), + sortFields=(xpname, xattachment, xmtime, xwname, ), + ) + doc.analyzerFactory = WikiAnalyzer + if mode == 'update': + if debug: request.log("%s (replace %r)" % (filename, uid)) + doc.uid = uid + id = writer.index(doc) + elif mode == 
'add': + if debug: request.log("%s (add)" % (filename,)) + id = writer.index(doc) + except (OSError, IOError), err: + pass + + def _index_page(self, writer, page, mode='update'): + """ Index a page - assumes that the write lock is acquired + @arg writer: the index writer object + @arg page: a page object + @arg mode: 'add' = just add, no checks + 'update' = check if already in index and update if needed (mtime) + + """ + request = page.request + wikiname = request.cfg.interwikiname or "Self" + pagename = page.page_name + mtime = page.mtime_usecs() + itemid = "%s:%s" % (wikiname, pagename) + updated = False + + if mode == 'update': + # from #xapian: if you generate a special "unique id" term, + # you can just call database.replace_document(uid_term, doc) + # -> done in xapwrap.index.Index.index() + query = xapidx.RawQuery(xapdoc.makePairForWrite('itemid', itemid)) + docs = writer.search(query, valuesWanted=['pagename', 'attachment', 'mtime', 'wikiname', ]) + if docs: + doc = docs[0] # there should be only one + uid = doc['uid'] + docmtime = long(doc['values']['mtime']) + updated = mtime > docmtime + if debug: request.log("uid %r: mtime %r > docmtime %r == updated %r" % (uid, mtime, docmtime, updated)) + else: + uid = None + updated = True + elif mode == 'add': + updated = True + if debug: request.log("%s %r" % (pagename, updated)) + if updated: + xwname = xapdoc.SortKey('wikiname', request.cfg.interwikiname or "Self") + xpname = xapdoc.SortKey('pagename', pagename) + xattachment = xapdoc.SortKey('attachment', '') # this is a real page, not an attachment + xmtime = xapdoc.SortKey('mtime', mtime) + xtitle = xapdoc.TextField('title', pagename, True) # prefixed + xkeywords = [xapdoc.Keyword('itemid', itemid)] + for pagelink in page.getPageLinks(request): + xkeywords.append(xapdoc.Keyword('linkto', pagelink)) + xcontent = xapdoc.TextField('content', page.get_raw_body()) + doc = xapdoc.Document(textFields=(xcontent, xtitle), + keywords=xkeywords, + sortFields=(xpname, xattachment, xmtime, xwname, ), + ) + doc.analyzerFactory = WikiAnalyzer + #search_db_language = "english" + #stemmer = xapian.Stem(search_db_language) + #pagetext = page.get_raw_body().lower() + #words = re.finditer(r"\w+", pagetext) + #count = 0 + #for wordmatch in words: + # count += 1 + # word = wordmatch.group().encode(config.charset) + # document.add_posting('R' + stemmer.stem_word(word), count) # count should be term position in document (starting at 1) + + if mode == 'update': + if debug: request.log("%s (replace %r)" % (pagename, uid)) + doc.uid = uid + id = writer.index(doc) + elif mode == 'add': + if debug: request.log("%s (add)" % (pagename,)) + id = writer.index(doc) + + from MoinMoin.action import AttachFile + + attachments = AttachFile._get_files(request, pagename) + for att in attachments: + filename = AttachFile.getFilename(request, pagename, att) + att_itemid = "%s//%s" % (itemid, att) + mtime = wikiutil.timestamp2version(os.path.getmtime(filename)) + if mode == 'update': + query = xapidx.RawQuery(xapdoc.makePairForWrite('itemid', att_itemid)) + docs = writer.search(query, valuesWanted=['pagename', 'attachment', 'mtime', ]) + if debug: request.log("##%r %r" % (filename, docs)) + if docs: + doc = docs[0] # there should be only one + uid = doc['uid'] + docmtime = long(doc['values']['mtime']) + updated = mtime > docmtime + if debug: request.log("uid %r: mtime %r > docmtime %r == updated %r" % (uid, mtime, docmtime, updated)) + else: + uid = None + updated = True + elif mode == 'add': + updated = True + if debug: 
request.log("%s %s %r" % (pagename, att, updated)) + if updated: + xatt_itemid = xapdoc.Keyword('itemid', att_itemid) + xpname = xapdoc.SortKey('pagename', pagename) + xattachment = xapdoc.SortKey('attachment', att) # this is an attachment, store its filename + xmtime = xapdoc.SortKey('mtime', mtime) + xtitle = xapdoc.Keyword('title', '%s/%s' % (pagename, att)) + mimetype, att_content = self.contentfilter(filename) + xmimetype = xapdoc.TextField('mimetype', mimetype, True) + xcontent = xapdoc.TextField('content', att_content) + doc = xapdoc.Document(textFields=(xcontent, xmimetype, ), + keywords=(xatt_itemid, xtitle, ), + sortFields=(xpname, xattachment, xmtime, xwname, ), + ) + doc.analyzerFactory = WikiAnalyzer + if mode == 'update': + if debug: request.log("%s (replace %r)" % (pagename, uid)) + doc.uid = uid + id = writer.index(doc) + elif mode == 'add': + if debug: request.log("%s (add)" % (pagename,)) + id = writer.index(doc) + #writer.flush() + + + def _index_pages(self, request, lock=None, files=None, mode='update'): + """ Index all pages (and all given files) + + This should be called from indexPages or indexPagesInNewThread only! + + This may take some time, depending on the size of the wiki and speed + of the machine. + + When called in a new thread, lock is acquired before the call, + and this method must release it when it finishes or fails. + """ + try: + self._unsign() + start = time.time() + writer = xapidx.Index(self.dir, True) + writer.configure(self.prefixMap, self.indexValueMap) + pages = request.rootpage.getPageList(user='', exists=1) + request.log("indexing all (%d) pages..." % len(pages)) + for pagename in pages: + p = Page(request, pagename) + self._index_page(writer, p, mode) + if files: + request.log("indexing all files...") + for fname in files: + fname = fname.strip() + self._index_file(request, writer, fname, mode) + writer.close() + request.log("indexing completed successfully in %0.2f seconds." % + (time.time() - start)) + self._sign() + finally: + writer.__del__() + if lock: + lock.release() + + def _optimize(self, request): + """ Optimize the index """ + pass + + def _indexingRequest(self, request): + """ Return a new request that can be used for index building. + + This request uses a security policy that lets the current user + read any page. Without this policy some pages will not render, + which will create broken pagelinks index. + """ + from MoinMoin.request.CLI import Request + from MoinMoin.security import Permissions + request = Request(request.url) + class SecurityPolicy(Permissions): + def read(*args, **kw): + return True + request.user.may = SecurityPolicy(request.user) + return request + + def _unsign(self): + """ Remove sig file - assume write lock acquired """ + try: + os.remove(self.sig_file) + except OSError, err: + if err.errno != errno.ENOENT: + raise + + def _sign(self): + """ Add sig file - assume write lock acquired """ + f = file(self.sig_file, 'w') + try: + f.write('') + finally: + f.close() + + +def run_query(query, db): + enquire = xapian.Enquire(db) + parser = xapian.QueryParser() + query = parser.parse_query(query, xapian.QueryParser.FLAG_WILDCARD) + print query.get_description() + enquire.set_query(query) + return enquire.get_mset(0, 10) + +def run(request): + pass + #print "Begin" + #db = xapian.WritableDatabase(xapian.open('test.db', + # xapian.DB_CREATE_OR_OPEN)) + # + # index_data(db) ??? 
+ #del db + #mset = run_query(sys.argv[1], db) + #print mset.get_matches_estimated() + #iterator = mset.begin() + #while iterator != mset.end(): + # print iterator.get_document().get_data() + # iterator.next() + #for i in xrange(1,170): + # doc = db.get_document(i) + # print doc.get_data() + +if __name__ == '__main__': + run() + +
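A hypothetical smoke test for the new Xapian module, assuming Python 2.x, a MoinMoin source tree on sys.path and importable xapian/xapwrap bindings; the class names come from the code added above, while the input string and the expected token stream are only rough illustrations.

    from MoinMoin.Xapian import WikiAnalyzer, Index

    analyzer = WikiAnalyzer()
    # lower-cased tokens; CamelCase words additionally yield their parts,
    # version numbers and hostnames are split on the dot_re pattern
    print list(analyzer.tokenize(u"WikiName version 1.5.3"))
    # roughly: ['wikiname', 'wiki', 'name', 'version', '1', '5', '3']

    # indexing and searching need a live request object:
    # index = Index(request)
    # index.update_page(page)       # queue the page, then index it in a background thread
    # hits = index.search(query)    # raises Index.LockedException if the read lock is busy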
--- a/MoinMoin/action/AttachFile.py Sun Jun 18 13:58:08 2006 +0200 +++ b/MoinMoin/action/AttachFile.py Thu Jun 29 01:05:53 2006 +0200 @@ -26,7 +26,7 @@ @license: GNU GPL, see COPYING for details. """ -import os, mimetypes, time, zipfile +import os, time, zipfile from MoinMoin import config, user, util, wikiutil, packages from MoinMoin.Page import Page from MoinMoin.util import filesys @@ -163,15 +163,15 @@ target = wikiutil.taintfilename(target) # set mimetype from extension, or from given mimetype - #type, encoding = mimetypes.guess_type(target) + #type, encoding = wikiutil.guess_type(target) #if not type: # ext = None # if request.form.has_key('mime'): - # ext = mimetypes.guess_extension(request.form['mime'][0]) + # ext = wikiutil.guess_extension(request.form['mime'][0]) # if not ext: - # type, encoding = mimetypes.guess_type(filename) + # type, encoding = wikiutil.guess_type(filename) # if type: - # ext = mimetypes.guess_extension(type) + # ext = wikiutil.guess_extension(type) # else: # ext = '' # target = target + ext @@ -193,7 +193,8 @@ _addLogEntry(request, 'ATTNEW', pagename, target) return target - + + ############################################################################# ### Internal helpers ############################################################################# @@ -645,16 +646,14 @@ import shutil filename, fpath = _access_file(pagename, request) - if not filename: return # error msg already sent in _access_file + if not filename: + return # error msg already sent in _access_file - # get mimetype - type, enc = mimetypes.guess_type(filename) - if not type: - type = "application/octet-stream" + mt = wikiutil.MimeType(filename=filename) # send header request.http_headers([ - "Content-Type: %s" % type, + "Content-Type: %s" % mt.content_type(), "Content-Length: %d" % os.path.getsize(fpath), # TODO: fix the encoding here, plain 8 bit is not allowed according to the RFCs # There is no solution that is compatible to IE except stripping non-ascii chars @@ -778,24 +777,23 @@ request.write('<h2>' + _("Attachment '%(filename)s'") % {'filename': filename} + '</h2>') - type, enc = mimetypes.guess_type(filename) - if type: - if type[:5] == 'image': - timestamp = htdocs_access(request) and "?%s" % time.time() or '' - request.write('<img src="%s%s" alt="%s">' % ( - getAttachUrl(pagename, filename, request, escaped=1), timestamp, wikiutil.escape(filename, 1))) - return - elif type[:4] == 'text': - # TODO: should use formatter here! - request.write("<pre>") - # Try to decode file contents. It may return junk, but we - # don't have enough information on attachments. - content = open(fpath, 'r').read() - content = wikiutil.decodeUnknownInput(content) - content = wikiutil.escape(content) - request.write(content) - request.write("</pre>") - return + mt = wikiutil.MimeType(filename=filename) + if mt.major == 'image': + timestamp = htdocs_access(request) and "?%s" % time.time() or '' + request.write('<img src="%s%s" alt="%s">' % ( + getAttachUrl(pagename, filename, request, escaped=1), timestamp, wikiutil.escape(filename, 1))) + return + elif mt.major == 'text': + # TODO: should use formatter here! + request.write("<pre>") + # Try to decode file contents. It may return junk, but we + # don't have enough information on attachments. + content = open(fpath, 'r').read() + content = wikiutil.decodeUnknownInput(content) + content = wikiutil.escape(content) + request.write(content) + request.write("</pre>") + return package = packages.ZipPackage(request, fpath) if package.isPackage():
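A short, hypothetical illustration of the wikiutil.MimeType helper that replaces the stdlib mimetypes calls in AttachFile.py; the attribute and method names are taken from the hunks above, the filename is made up.

    from MoinMoin import wikiutil

    mt = wikiutil.MimeType(filename="diagram.png")
    print mt.content_type()    # used for the Content-Type header when sending the file
    print mt.major             # 'image' -> attachment is inlined as an <img> tag,
                               # 'text'  -> it is decoded, escaped and shown in a <pre> block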
--- a/MoinMoin/action/MyPages.py  Sun Jun 18 13:58:08 2006 +0200
+++ b/MoinMoin/action/MyPages.py  Thu Jun 29 01:05:53 2006 +0200
@@ -24,7 +24,7 @@
     userhomewiki = request.cfg.user_homewiki
     if userhomewiki != 'Self' and userhomewiki != request.cfg.interwikiname:
         interwiki = wikiutil.getInterwikiHomePage(request, username=username)
-        wikitag, wikiurl, wikitail, wikitag_bad = wikiutil.resolve_wiki(request, '%s:%s' % interwiki)
+        wikitag, wikiurl, wikitail, wikitag_bad = wikiutil.resolve_wiki(request, '%s:"%s"' % interwiki)
         wikiurl = wikiutil.mapURL(request, wikiurl)
         homepageurl = wikiutil.join_wiki(wikiurl, wikitail)
         request.http_redirect('%s?action=MyPages' % homepageurl)
--- a/MoinMoin/action/fckdialog.py  Sun Jun 18 13:58:08 2006 +0200
+++ b/MoinMoin/action/fckdialog.py  Thu Jun 29 01:05:53 2006 +0200
@@ -230,8 +230,8 @@
     page_list = ""
 
     # list of interwiki names
-    wikiutil.resolve_wiki(request, "Self:FrontPage")
-    interwiki = request.cfg._interwiki_list.keys()
+    interwiki_list = wikiutil.load_wikimap(request)
+    interwiki = interwiki_list.keys()
     interwiki.sort()
     iwpreferred = request.cfg.interwiki_preferred
     if not iwpreferred or iwpreferred and iwpreferred[-1] != None:
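Sketch of the new interwiki lookup used above: load_wikimap replaces the old trick of calling resolve_wiki only for its side effect of filling cfg._interwiki_list (request is assumed to be a live request object).

    from MoinMoin import wikiutil

    interwiki_list = wikiutil.load_wikimap(request)   # maps interwiki tag -> base URL
    interwiki = interwiki_list.keys()
    interwiki.sort()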
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/MoinMoin/action/thread_monitor.py Thu Jun 29 01:05:53 2006 +0200 @@ -0,0 +1,52 @@ +# -*- coding: iso-8859-1 -*- +""" + MoinMoin - Thread monitor action + + Shows the current traceback of all threads. + + @copyright: 2006 by MoinMoin:AlexanderSchremmer + @license: GNU GPL, see COPYING for details. +""" +import os, time +from StringIO import StringIO + +from MoinMoin import wikiutil +from MoinMoin.util import thread_monitor + +def execute_fs(pagename, request): + if thread_monitor.hook_enabled: + s = StringIO() + thread_monitor.trigger_dump(s) + time.sleep(5) # allow for all threads to dump to request + data = s.getvalue() + timestamp = time.time() + dump_fname = os.path.join(request.cfg.data_dir, "tm_%d.log" % timestamp) + f = file(dump_fname, "w") + f.write(data) + f.close() + else: + dump_fname = "nowhere" + + request.http_headers() + request.write('<html><body>A dump has been saved to %s.</body></html>' % dump_fname) + +def execute_wiki(pagename, request): + request.http_headers() + + request.theme.send_title("Thread monitor") + request.write('<pre>') + + if not thread_monitor.hook_enabled: + request.write("Hook is not enabled.") + else: + s = StringIO() + thread_monitor.trigger_dump(s) + time.sleep(5) # allow for all threads to dump to request + request.write(wikiutil.escape(s.getvalue())) + + request.write('</pre>') + request.theme.send_footer(pagename) + request.theme.send_closing_html() + +execute = execute_fs +
--- a/MoinMoin/action/tm.py Sun Jun 18 13:58:08 2006 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,32 +0,0 @@ -# -*- coding: iso-8859-1 -*- -""" - MoinMoin - Thread monitor action - - Shows the current traceback of all threads. - - @copyright: 2006 by MoinMoin:AlexanderSchremmer - @license: GNU GPL, see COPYING for details. -""" - -from MoinMoin.wikiutil import escape -from MoinMoin.util import thread_monitor -from StringIO import StringIO -from time import sleep - -def execute(pagename, request): - request.http_headers() - - request.theme.send_title("Thread monitor") - request.write('<pre>') - - if not thread_monitor.hook_enabled: - request.write("Hook is not enabled.") - else: - s = StringIO() - thread_monitor.trigger_dump(s) - sleep(5) # allow for all threads to dump to request - request.write(escape(s.getvalue())) - - request.write('</pre>') - request.theme.send_footer(pagename) - request.theme.send_closing_html()
--- a/MoinMoin/converter/text_html_text_moin_wiki.py Sun Jun 18 13:58:08 2006 +0200 +++ b/MoinMoin/converter/text_html_text_moin_wiki.py Thu Jun 29 01:05:53 2006 +0200 @@ -392,6 +392,18 @@ ]> ''' +def pagename_from_url(url_frag): + """ url is a fragment of an URL we extract the pagename from by URL-unqouting + and possible adding quotes around the pagename if we detect blanks in it. + """ + pagename = qpagename = wikiutil.url_unquote(url_frag) + if " " in pagename: + if not '"' in pagename: + qpagename = '"%s"' % pagename + elif not "'" in pagename: + qpagename = "'%s'" % pagename + return pagename, qpagename + class ConvertError(error.FatalError): """ Raise when html to wiki conversion fails """ name = "MoinMoin Convert Error" @@ -1088,20 +1100,20 @@ wikitag, wikiurl, wikitail, err = wikiutil.resolve_wiki( self.request, title + ":") if not err and href.startswith(wikiurl): - pagename = href[len(wikiurl):].lstrip('/') - interwikiname = "%s:%s" % (wikitag, pagename) + pagename, qpagename = pagename_from_url(href[len(wikiurl):].lstrip('/')) + interwikiname = "%s:%s" % (wikitag, qpagename) else: raise ConvertError("Invalid InterWiki link: '%s'" % href) elif class_ == "badinterwiki" and title: if href == "/": # we used this as replacement for empty href href = "" - pagename = href - interwikiname = "%s:%s" % (title, href) + pagename, qpagename = pagename_from_url(href) + interwikiname = "%s:%s" % (title, qpagename) if interwikiname and pagename == text: self.text.append("%s" % interwikiname) return elif title == 'Self': - self.text.append("[:%s:%s]" % (href, text)) + self.text.append('["%s" %s]' % (href, text)) return elif interwikiname: self.text.append("[wiki:%s %s]" % (interwikiname, text)) @@ -1115,14 +1127,14 @@ # Attachments if title and title.startswith("attachment:"): - url = wikiutil.url_unquote(title[len("attachment:"):]) - if url != text: - self.text.append("[%s %s]" % (title, text)) + attname, qattname = pagename_from_url(title[len("attachment:"):]) + if attname != text: + self.text.append('[attachment:%s %s]' % (qattname, text)) else: - self.text.extend([self.white_space, title, self.white_space]) + self.text.extend([self.white_space, 'attachment:%s' % qattname, self.white_space]) # wiki link elif href.startswith(scriptname): - pagename = href[len(scriptname):].replace('_', ' ') + pagename = href[len(scriptname):] pagename = pagename.lstrip('/') # XXX temp fix for generated pagenames starting with / if text == pagename: self.text.append(wikiutil.pagelinkmarkup(pagename)) @@ -1137,7 +1149,7 @@ self.text.append(wikiutil.pagelinkmarkup(text)) # labeled link else: - self.text.append("[:%s:%s]" % (pagename, text)) + self.text.append('["%s" %s]' % (pagename, text)) # mailto link elif href.startswith("mailto:"): if href[len("mailto:"):] == text:
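The converter's new pagename_from_url helper, exercised standalone; the behaviour follows its definition in the hunk above (URL-unquote, then add quotes when the result contains blanks).

    from MoinMoin.converter.text_html_text_moin_wiki import pagename_from_url

    print pagename_from_url("FrontPage")    # (u'FrontPage', u'FrontPage')
    print pagename_from_url("My%20Page")    # (u'My Page', u'"My Page"'), quoted because of the blank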
--- a/MoinMoin/filter/application_msword.py  Sun Jun 18 13:58:08 2006 +0200
+++ b/MoinMoin/filter/application_msword.py  Thu Jun 29 01:05:53 2006 +0200
@@ -11,5 +11,5 @@
 
 from MoinMoin import filter
 
 def execute(indexobj, filename):
-    return filter.execfilter("antiword %s", filename)
+    return filter.execfilter("HOME=/tmp antiword %s", filename) # no HOME makes antiword complain
--- /dev/null  Thu Jan 01 00:00:00 1970 +0000
+++ b/MoinMoin/filter/application_vnd_oasis_opendocument.py  Thu Jun 29 01:05:53 2006 +0200
@@ -0,0 +1,25 @@
+# -*- coding: iso-8859-1 -*-
+"""
+    MoinMoin - OpenOffice.org 2.0 *.od? Filter (OpenDocument)
+
+    Depends on: nothing (only python with zlib)
+
+    @copyright: 2006 by ThomasWaldmann MoinMoin:ThomasWaldmann
+    @license: GNU GPL, see COPYING for details.
+"""
+
+import re, zipfile
+
+rx_stripxml = re.compile("<[^>]*?>", re.DOTALL|re.MULTILINE)
+
+def execute(indexobj, filename):
+    try:
+        zf = zipfile.ZipFile(filename, "r")
+        data = zf.read("content.xml")
+        zf.close()
+        data = " ".join(rx_stripxml.sub(" ", data).split())
+    except RuntimeError, err:
+        indexobj.request.log(str(err))
+        data = ""
+    return data.decode('utf-8')
+
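A hypothetical standalone call of the new OpenDocument filter; indexobj is only touched on the error path (for logging), so passing None works for a quick test on a well-formed file. The filename is made up.

    from MoinMoin.filter.application_vnd_oasis_opendocument import execute

    text = execute(None, "report.odt")   # content.xml with all XML tags stripped, as unicode
    print text[:200]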
--- /dev/null  Thu Jan 01 00:00:00 1970 +0000
+++ b/MoinMoin/filter/application_vnd_oasis_opendocument_presentation.py  Thu Jun 29 01:05:53 2006 +0200
@@ -0,0 +1,15 @@
+# -*- coding: iso-8859-1 -*-
+"""
+    MoinMoin - OpenOffice.org 2.x Presenter Filter (OpenDocument Presentation)
+
+    Depends on: nothing (only python with zlib)
+
+    @copyright: 2006 by ThomasWaldmann MoinMoin:ThomasWaldmann
+    @license: GNU GPL, see COPYING for details.
+"""
+
+from MoinMoin.filter.application_vnd_oasis_opendocument import execute as odfilter
+
+def execute(indexobj, filename):
+    return odfilter(indexobj, filename)
+
--- /dev/null  Thu Jan 01 00:00:00 1970 +0000
+++ b/MoinMoin/filter/application_vnd_oasis_opendocument_spreadsheet.py  Thu Jun 29 01:05:53 2006 +0200
@@ -0,0 +1,15 @@
+# -*- coding: iso-8859-1 -*-
+"""
+    MoinMoin - OpenOffice.org 2.x Calc Filter (OpenDocument Spreadsheet)
+
+    Depends on: nothing (only python with zlib)
+
+    @copyright: 2006 by ThomasWaldmann MoinMoin:ThomasWaldmann
+    @license: GNU GPL, see COPYING for details.
+"""
+
+from MoinMoin.filter.application_vnd_oasis_opendocument import execute as odfilter
+
+def execute(indexobj, filename):
+    return odfilter(indexobj, filename)
+
--- /dev/null  Thu Jan 01 00:00:00 1970 +0000
+++ b/MoinMoin/filter/application_vnd_oasis_opendocument_text.py  Thu Jun 29 01:05:53 2006 +0200
@@ -0,0 +1,15 @@
+# -*- coding: iso-8859-1 -*-
+"""
+    MoinMoin - OpenOffice.org 2.x Writer Filter (OpenDocument Text)
+
+    Depends on: nothing (only python with zlib)
+
+    @copyright: 2006 by ThomasWaldmann MoinMoin:ThomasWaldmann
+    @license: GNU GPL, see COPYING for details.
+"""
+
+from MoinMoin.filter.application_vnd_oasis_opendocument import execute as odfilter
+
+def execute(indexobj, filename):
+    return odfilter(indexobj, filename)
+
--- a/MoinMoin/filter/text.py  Sun Jun 18 13:58:08 2006 +0200
+++ b/MoinMoin/filter/text.py  Thu Jun 29 01:05:53 2006 +0200
@@ -11,7 +11,7 @@
 import codecs
 
 def execute(indexobj, filename):
-    for enc in ('utf-8', 'iso-8859-15', 'iso-8859-1', ):
+    for enc in ('utf-8', 'iso-8859-15', ):
         try:
             f = codecs.open(filename, "r", enc)
             data = f.read()
--- a/MoinMoin/formatter/__init__.py  Sun Jun 18 13:58:08 2006 +0200
+++ b/MoinMoin/formatter/__init__.py  Thu Jun 29 01:05:53 2006 +0200
@@ -94,7 +94,7 @@
         IMPORTANT: on and off must be called with same parameters, see
                    also the text_html formatter.
         """
-        wikitag, wikiurl, wikitail, wikitag_bad = wikiutil.resolve_wiki(self.request, '%s:%s' % (interwiki, pagename))
+        wikitag, wikiurl, wikitail, wikitag_bad = wikiutil.resolve_wiki(self.request, '%s:"%s"' % (interwiki, pagename))
         if wikitag == 'Self' or wikitag == self.request.cfg.interwikiname:
             if wikitail.find('#') > -1:
                 wikitail, kw['anchor'] = wikitail.split('#', 1)
--- a/MoinMoin/formatter/text_docbook.py  Sun Jun 18 13:58:08 2006 +0200
+++ b/MoinMoin/formatter/text_docbook.py  Thu Jun 29 01:05:53 2006 +0200
@@ -367,7 +367,7 @@
         if not on:
             return self.url(on,kw)
 
-        wikitag, wikiurl, wikitail, wikitag_bad = wikiutil.resolve_wiki(self.request, '%s:%s' % (interwiki, pagename))
+        wikitag, wikiurl, wikitail, wikitag_bad = wikiutil.resolve_wiki(self.request, '%s:"%s"' % (interwiki, pagename))
         wikiurl = wikiutil.mapURL(self.request, wikiurl)
         href = wikiutil.join_wiki(wikiurl, wikitail)
 
--- a/MoinMoin/formatter/text_html.py Sun Jun 18 13:58:08 2006 +0200 +++ b/MoinMoin/formatter/text_html.py Thu Jun 29 01:05:53 2006 +0200 @@ -502,7 +502,7 @@ """ @keyword title: override using the interwiki wikiname as title """ - wikitag, wikiurl, wikitail, wikitag_bad = wikiutil.resolve_wiki(self.request, '%s:%s' % (interwiki, pagename)) + wikitag, wikiurl, wikitail, wikitag_bad = wikiutil.resolve_wiki(self.request, '%s:"%s"' % (interwiki, pagename)) wikiurl = wikiutil.mapURL(self.request, wikiurl) if wikitag == 'Self': # for own wiki, do simple links if on: @@ -626,6 +626,7 @@ def attachment_link(self, url, text, **kw): _ = self.request.getText pagename, filename = AttachFile.absoluteName(url, self.page.page_name) + #self.request.log("attachment_link: url %s pagename %s filename %s" % (url, pagename, filename)) fname = wikiutil.taintfilename(filename) fpath = AttachFile.getFilename(self.request, pagename, fname) if not os.path.exists(fpath): @@ -694,8 +695,7 @@ # check for map file if os.path.exists(mappath): - # we have a image map. inline it and add a map ref - # to the img tag + # we have a image map. inline it and add a map ref to the img tag try: map = file(mappath, 'r').read() except IOError:
--- a/MoinMoin/i18n/__init__.py Sun Jun 18 13:58:08 2006 +0200 +++ b/MoinMoin/i18n/__init__.py Thu Jun 29 01:05:53 2006 +0200 @@ -224,11 +224,19 @@ if dicts.has_dict(dictpagename): userdict = dicts.dict(dictpagename) translated = userdict[original] + else: + raise KeyError + else: + raise KeyError except KeyError: - # do not simply return trans with orig, but recursively call - # to get english translation, maybe formatted + # do not simply return trans with str, but recursively call + # to get english translation, maybe formatted. + # if we don't find an english "translation", we just format it + # on the fly (this is needed for cfg.editor_quickhelp). if lang != 'en': translated = getText(original, request, 'en', formatted) + elif formatted: + translated = translations[lang].formatMarkup(request, original) return translated
--- a/MoinMoin/logfile/editlog.py Sun Jun 18 13:58:08 2006 +0200 +++ b/MoinMoin/logfile/editlog.py Thu Jun 29 01:05:53 2006 +0200 @@ -143,17 +143,19 @@ If `host` is None, it's read from request vars. """ - import socket - if host is None: host = request.remote_addr - - try: - hostname = socket.gethostbyaddr(host)[0] - hostname = unicode(hostname, config.charset) - except (socket.error, UnicodeError), err: + + if request.cfg.log_reverse_dns_lookups: + import socket + try: + hostname = socket.gethostbyaddr(host)[0] + hostname = unicode(hostname, config.charset) + except (socket.error, UnicodeError), err: + hostname = host + else: hostname = host - + remap_chars = {u'\t': u' ', u'\r': u' ', u'\n': u' ',} comment = comment.translate(remap_chars) user_id = request.user.valid and request.user.id or ''
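The editlog change in plain form, as a hedged sketch: the reverse DNS lookup now only runs when the new cfg.log_reverse_dns_lookups option is enabled, otherwise the raw address is logged (request is assumed to be a live request object).

    import socket
    from MoinMoin import config

    host = request.remote_addr
    if request.cfg.log_reverse_dns_lookups:
        try:
            hostname = socket.gethostbyaddr(host)[0]
            hostname = unicode(hostname, config.charset)
        except (socket.error, UnicodeError):
            hostname = host
    else:
        hostname = host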
--- a/MoinMoin/lupy.py Sun Jun 18 13:58:08 2006 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,588 +0,0 @@ -# -*- coding: iso-8859-1 -*- -""" - MoinMoin - lupy indexing search engine - - @copyright: 2005 by Florian Festi, Nir Soffer, Thomas Waldmann - @license: GNU GPL, see COPYING for details. -""" - -import os, re, codecs, errno, time - -from MoinMoin.Page import Page -from MoinMoin import config, wikiutil -from MoinMoin.util import filesys, lock -from MoinMoin.support.lupy.index.term import Term -from MoinMoin.support.lupy import document -from MoinMoin.support.lupy.index.indexwriter import IndexWriter -from MoinMoin.support.lupy.search.indexsearcher import IndexSearcher - -from MoinMoin.support.lupy.index.term import Term -from MoinMoin.support.lupy.search.term import TermQuery -from MoinMoin.support.lupy.search.boolean import BooleanQuery - -############################################################################## -### Tokenizer -############################################################################## - -singleword = r"[%(u)s][%(l)s]+" % { - 'u': config.chars_upper, - 'l': config.chars_lower, - } - -singleword_re = re.compile(singleword, re.U) -wikiword_re = re.compile(r"^(%s){2,}$" % singleword, re.U) - -token_re = re.compile( - r"(?P<company>\w+[&@]\w+)|" + # company names like AT&T and Excite@Home. - r"(?P<email>\w+([.-]\w+)*@\w+([.-]\w+)*)|" + # email addresses - r"(?P<hostname>\w+(\.\w+)+)|" + # hostnames - r"(?P<num>(\w+[-/.,])*\w*\d\w*([-/.,]\w+)*)|" + # version numbers - r"(?P<acronym>(\w\.)+)|" + # acronyms: U.S.A., I.B.M., etc. - r"(?P<word>\w+)", # words - re.U) - -dot_re = re.compile(r"[-_/,.]") -mail_re = re.compile(r"[-_/,.]|(@)") - -def tokenizer(value): - """Yield a stream of lower cased words from a string.""" - if isinstance(value, list): # used for page links - for v in value: - yield v - else: - tokenstream = re.finditer(token_re, value) - for m in tokenstream: - if m.group("acronym"): - yield m.group("acronym").replace('.', '').lower() - elif m.group("company"): - yield m.group("company").lower() - elif m.group("email"): - for word in mail_re.split(m.group("email").lower()): - if word: - yield word - elif m.group("hostname"): - for word in dot_re.split(m.group("hostname").lower()): - yield word - elif m.group("num"): - for word in dot_re.split(m.group("num").lower()): - yield word - elif m.group("word"): - word = m.group("word") - yield word.lower() - # if it is a CamelCaseWord, we additionally yield Camel, Case and Word - if wikiword_re.match(word): - for sm in re.finditer(singleword_re, word): - yield sm.group().lower() - - -############################################################################# -### Indexing -############################################################################# - -class UpdateQueue: - def __init__(self, file, lock_dir): - self.file = file - self.writeLock = lock.WriteLock(lock_dir, timeout=10.0) - self.readLock = lock.ReadLock(lock_dir, timeout=10.0) - - def exists(self): - return os.path.exists(self.file) - - def append(self, pagename): - """ Append a page to queue """ - if not self.writeLock.acquire(60.0): - request.log("can't add %r to lupy update queue: can't lock queue" % - pagename) - return - try: - f = codecs.open(self.file, 'a', config.charset) - try: - f.write(pagename + "\n") - finally: - f.close() - finally: - self.writeLock.release() - - def pages(self): - """ Return list of pages in the queue """ - if self.readLock.acquire(1.0): - try: - return self._decode(self._read()) - finally: - 
self.readLock.release() - return [] - - def remove(self, pages): - """ Remove pages from the queue - - When the queue is empty, the queue file is removed, so exists() - can tell if there is something waiting in the queue. - """ - if self.writeLock.acquire(30.0): - try: - queue = self._decode(self._read()) - for page in pages: - try: - queue.remove(page) - except ValueError: - pass - if queue: - self._write(queue) - else: - self._removeFile() - return True - finally: - self.writeLock.release() - return False - - # Private ------------------------------------------------------- - - def _decode(self, data): - """ Decode queue data """ - pages = data.splitlines() - return self._filterDuplicates(pages) - - def _filterDuplicates(self, pages): - """ Filter duplicates in page list, keeping the order """ - unique = [] - seen = {} - for name in pages: - if not name in seen: - unique.append(name) - seen[name] = 1 - return unique - - def _read(self): - """ Read and return queue data - - This does not do anything with the data so we can release the - lock as soon as possible, enabling others to update the queue. - """ - try: - f = codecs.open(self.file, 'r', config.charset) - try: - return f.read() - finally: - f.close() - except (OSError, IOError), err: - if err.errno != errno.ENOENT: - raise - return '' - - def _write(self, pages): - """ Write pages to queue file - - Requires queue write locking. - """ - # XXX use tmpfile/move for atomic replace on real operating systems - data = '\n'.join(pages) + '\n' - f = codecs.open(self.file, 'w', config.charset) - try: - f.write(data) - finally: - f.close() - - def _removeFile(self): - """ Remove queue file - - Requires queue write locking. - """ - try: - os.remove(self.file) - except OSError, err: - if err.errno != errno.ENOENT: - raise - - -class Index: - class LockedException(Exception): - pass - - def __init__(self, request): - self.request = request - cache_dir = request.cfg.cache_dir - self.main_dir = os.path.join(cache_dir, 'lupy') - self.dir = os.path.join(self.main_dir, 'index') - filesys.makeDirs(self.dir) - self.sig_file = os.path.join(self.main_dir, 'complete') - self.segments_file = os.path.join(self.dir, 'segments') - lock_dir = os.path.join(self.main_dir, 'index-lock') - self.lock = lock.WriteLock(lock_dir, - timeout=3600.0, readlocktimeout=60.0) - self.read_lock = lock.ReadLock(lock_dir, timeout=3600.0) - self.queue = UpdateQueue(os.path.join(self.main_dir, "update-queue"), - os.path.join(self.main_dir, 'update-queue-lock')) - - # Disabled until we have a sane way to build the index with a - # queue in small steps. 
- ## if not self.exists(): - ## self.indexPagesInNewThread(request) - - def exists(self): - """ Check if index exists """ - return os.path.exists(self.sig_file) - - def mtime(self): - return os.path.getmtime(self.segments_file) - - def _search(self, query): - """ read lock must be acquired """ - while True: - try: - searcher, timestamp = self.request.cfg.lupy_searchers.pop() - if timestamp != self.mtime(): - searcher.close() - else: - break - except IndexError: - searcher = IndexSearcher(self.dir) - timestamp = self.mtime() - break - - hits = list(searcher.search(query)) - self.request.cfg.lupy_searchers.append((searcher, timestamp)) - return hits - - def search(self, query): - if not self.read_lock.acquire(1.0): - raise self.LockedException - try: - hits = self._search(query) - finally: - self.read_lock.release() - return hits - - def update_page(self, page): - self.queue.append(page.page_name) - self._do_queued_updates_InNewThread() - - def _do_queued_updates_InNewThread(self): - """ do queued index updates in a new thread - - Should be called from a user request. From a script, use indexPages. - """ - if not self.lock.acquire(1.0): - self.request.log("can't index: can't acquire lock") - return - try: - from threading import Thread - indexThread = Thread(target=self._do_queued_updates, - args=(self._indexingRequest(self.request), self.lock)) - indexThread.setDaemon(True) - - # Join the index thread after current request finish, prevent - # Apache CGI from killing the process. - def joinDecorator(finish): - def func(): - finish() - indexThread.join() - return func - - self.request.finish = joinDecorator(self.request.finish) - indexThread.start() - except: - self.lock.release() - raise - - def indexPages(self, files=None, update=True): - """ Index all pages (and files, if given) - - Can be called only from a script. To index pages during a user - request, use indexPagesInNewThread. - @arg files: iterator or list of files to index additionally - @arg update: True = update an existing index, False = reindex everything - """ - if not self.lock.acquire(1.0): - self.request.log("can't index: can't acquire lock") - return - try: - request = self._indexingRequest(self.request) - self._index_pages(request, None, files, update) - finally: - self.lock.release() - - def indexPagesInNewThread(self, files=None, update=True): - """ Index all pages in a new thread - - Should be called from a user request. From a script, use indexPages. - """ - if not self.lock.acquire(1.0): - self.request.log("can't index: can't acquire lock") - return - try: - # Prevent rebuilding the index just after it was finished - if self.exists(): - self.lock.release() - return - from threading import Thread - indexThread = Thread(target=self._index_pages, - args=(self._indexingRequest(self.request), self.lock, files, update)) - indexThread.setDaemon(True) - - # Join the index thread after current request finish, prevent - # Apache CGI from killing the process. - def joinDecorator(finish): - def func(): - finish() - indexThread.join() - return func - - self.request.finish = joinDecorator(self.request.finish) - indexThread.start() - except: - self.lock.release() - raise - - def optimize(self): - """ Optimize the index - - This may take from few seconds to few hours, depending on the - size of the wiki. Currently it's usable only from a script. - - TODO: needs special locking, so the index is readable until the - optimization is finished. 
- """ - if not self.exists(): - raise RuntimeError("Index does not exist or is not finished") - if not self.lock.acquire(1.0): - self.request.log("can't lock the index for optimization") - return - try: - self._optimize(self.request) - finally: - self.lock.release() - - # ------------------------------------------------------------------- - # Private - - def _do_queued_updates(self, request, lock=None, amount=5): - """ Assumes that the write lock is acquired """ - try: - pages = self.queue.pages()[:amount] - for name in pages: - p = Page(request, name) - self._update_page(p) - self.queue.remove([name]) - finally: - if lock: - lock.release() - - def _update_page(self, page): - """ Assumes that the write lock is acquired """ - reader = IndexSearcher(self.dir) - reader.reader.deleteTerm(Term('pagename', page.page_name)) - reader.close() - if page.exists(): - writer = IndexWriter(self.dir, False, tokenizer) - self._index_page(writer, page, False) # we don't need to check whether it is updated - writer.close() - - def contentfilter(self, filename): - """ Get a filter for content of filename and return unicode content. """ - import mimetypes - from MoinMoin import wikiutil - request = self.request - mimetype, encoding = mimetypes.guess_type(filename) - if mimetype is None: - mimetype = 'application/octet-stream' - def mt2mn(mt): # mimetype to modulename - return mt.replace("/", "_").replace("-","_").replace(".", "_") - try: - _filter = mt2mn(mimetype) - execute = wikiutil.importPlugin(request.cfg, 'filter', _filter) - except wikiutil.PluginMissingError: - try: - _filter = mt2mn(mimetype.split("/", 1)[0]) - execute = wikiutil.importPlugin(request.cfg, 'filter', _filter) - except wikiutil.PluginMissingError: - try: - _filter = mt2mn('application/octet-stream') - execute = wikiutil.importPlugin(request.cfg, 'filter', _filter) - except wikiutil.PluginMissingError: - raise ImportError("Cannot load filter %s" % binaryfilter) - try: - data = execute(self, filename) - request.log("Filter %s returned %d characters for file %s" % (_filter, len(data), filename)) - except (OSError, IOError), err: - data = '' - request.log("Filter %s threw error '%s' for file %s" % (_filter, str(err), filename)) - return data - - def test(self, request): - query = BooleanQuery() - query.add(TermQuery(Term("text", 'suchmich')), True, False) - docs = self._search(query) - for d in docs: - request.log("%r %r %r" % (d, d.get('attachment'), d.get('pagename'))) - - def _index_file(self, request, writer, filename, update): - """ index a file as it were a page named pagename - Assumes that the write lock is acquired - """ - fs_rootpage = 'FS' # XXX FS hardcoded - try: - mtime = os.path.getmtime(filename) - mtime = wikiutil.timestamp2version(mtime) - if update: - query = BooleanQuery() - query.add(TermQuery(Term("pagename", fs_rootpage)), True, False) - query.add(TermQuery(Term("attachment", filename)), True, False) - docs = self._search(query) - updated = len(docs) == 0 or mtime > int(docs[0].get('mtime')) - else: - updated = True - request.log("%s %r" % (filename, updated)) - if updated: - file_content = self.contentfilter(filename) - d = document.Document() - d.add(document.Keyword('pagename', fs_rootpage)) - d.add(document.Keyword('mtime', str(mtime))) - d.add(document.Keyword('attachment', filename)) # XXX we should treat files like real pages, not attachments - pagename = " ".join(os.path.join(fs_rootpage, filename).split("/")) - d.add(document.Text('title', pagename, store=False)) - d.add(document.Text('text', file_content, 
store=False)) - writer.addDocument(d) - except (OSError, IOError), err: - pass - - def _index_page(self, writer, page, update): - """ Index a page - assumes that the write lock is acquired - @arg writer: the index writer object - @arg page: a page object - @arg update: False = index in any case, True = index only when changed - """ - pagename = page.page_name - request = page.request - mtime = page.mtime_usecs() - if update: - query = BooleanQuery() - query.add(TermQuery(Term("pagename", pagename)), True, False) - query.add(TermQuery(Term("attachment", "")), True, False) - docs = self._search(query) - updated = len(docs) == 0 or mtime > int(docs[0].get('mtime')) - else: - updated = True - request.log("%s %r" % (pagename, updated)) - if updated: - d = document.Document() - d.add(document.Keyword('pagename', pagename)) - d.add(document.Keyword('mtime', str(mtime))) - d.add(document.Keyword('attachment', '')) # this is a real page, not an attachment - d.add(document.Text('title', pagename, store=False)) - d.add(document.Text('text', page.get_raw_body(), store=False)) - - links = page.getPageLinks(request) - t = document.Text('links', '', store=False) - t.stringVal = links - d.add(t) - d.add(document.Text('link_text', ' '.join(links), store=False)) - - writer.addDocument(d) - - from MoinMoin.action import AttachFile - - attachments = AttachFile._get_files(request, pagename) - for att in attachments: - filename = AttachFile.getFilename(request, pagename, att) - mtime = wikiutil.timestamp2version(os.path.getmtime(filename)) - if update: - query = BooleanQuery() - query.add(TermQuery(Term("pagename", pagename)), True, False) - query.add(TermQuery(Term("attachment", att)), True, False) - docs = self._search(query) - updated = len(docs) == 0 or mtime > int(docs[0].get('mtime')) - else: - updated = True - request.log("%s %s %r" % (pagename, att, updated)) - if updated: - att_content = self.contentfilter(filename) - d = document.Document() - d.add(document.Keyword('pagename', pagename)) - d.add(document.Keyword('mtime', str(mtime))) - d.add(document.Keyword('attachment', att)) # this is an attachment, store its filename - d.add(document.Text('title', att, store=False)) # the filename is the "title" of an attachment - d.add(document.Text('text', att_content, store=False)) - writer.addDocument(d) - - - def _index_pages(self, request, lock=None, files=None, update=True): - """ Index all pages (and all given files) - - This should be called from indexPages or indexPagesInNewThread only! - - This may take few minutes up to few hours, depending on the size of - the wiki. - - When called in a new thread, lock is acquired before the call, - and this method must release it when it finishes or fails. - """ - try: - self._unsign() - start = time.time() - writer = IndexWriter(self.dir, not update, tokenizer) - writer.mergeFactor = 50 - pages = request.rootpage.getPageList(user='', exists=1) - request.log("indexing all (%d) pages..." % len(pages)) - for pagename in pages: - p = Page(request, pagename) - # code does NOT seem to assume request.page being set any more - #request.page = p - self._index_page(writer, p, update) - if files: - request.log("indexing all files...") - for fname in files: - fname = fname.strip() - self._index_file(request, writer, fname, update) - writer.close() - request.log("indexing completed successfully in %0.2f seconds." 
% - (time.time() - start)) - self._optimize(request) - self._sign() - finally: - if lock: - lock.release() - - def _optimize(self, request): - """ Optimize the index """ - self._unsign() - start = time.time() - request.log("optimizing index...") - writer = IndexWriter(self.dir, False, tokenizer) - writer.optimize() - writer.close() - request.log("optimizing completed successfully in %0.2f seconds." % - (time.time() - start)) - self._sign() - - def _indexingRequest(self, request): - """ Return a new request that can be used for index building. - - This request uses a security policy that lets the current user - read any page. Without this policy some pages will not render, - which will create broken pagelinks index. - """ - from MoinMoin.request import RequestCLI - from MoinMoin.security import Permissions - request = RequestCLI(request.url) - class SecurityPolicy(Permissions): - def read(*args, **kw): - return True - request.user.may = SecurityPolicy(request.user) - return request - - def _unsign(self): - """ Remove sig file - assume write lock acquired """ - try: - os.remove(self.sig_file) - except OSError, err: - if err.errno != errno.ENOENT: - raise - - def _sign(self): - """ Add sig file - assume write lock acquired """ - f = file(self.sig_file, 'w') - try: - f.write('') - finally: - f.close() -
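The deleted lupy indexer above extracts attachment and file text through filter plugins looked up by mimetype: the full mimetype is tried first, then the major type, then application/octet-stream. A small worked example of that name mapping, distilled from the deleted contentfilter() (illustration only, not part of the changeset):

    # mimetype -> filter plugin module name, as in contentfilter() above
    def mt2mn(mt):
        return mt.replace("/", "_").replace("-", "_").replace(".", "_")

    assert mt2mn("text/x-python") == "text_x_python"
    assert mt2mn("text") == "text"                      # major-type fallback
    assert mt2mn("application/octet-stream") == "application_octet_stream"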
--- a/MoinMoin/macro/SystemInfo.py Sun Jun 18 13:58:08 2006 +0200 +++ b/MoinMoin/macro/SystemInfo.py Thu Jun 29 01:05:53 2006 +0200 @@ -112,7 +112,7 @@ ', '.join(wikiutil.wikiPlugins('parser', Macro.cfg)) or nonestr) state = (_('Disabled'), _('Enabled')) - row(_('Lupy search'), state[request.cfg.lupy_search]) + row(_('Xapian search'), state[request.cfg.xapian_search]) row(_('Active threads'), t_count or 'N/A') buf.write(u'</dl>')
--- a/MoinMoin/macro/__init__.py Sun Jun 18 13:58:08 2006 +0200 +++ b/MoinMoin/macro/__init__.py Thu Jun 29 01:05:53 2006 +0200 @@ -335,13 +335,10 @@ def _macro_InterWiki(self, args): from StringIO import StringIO - - # load interwiki list - dummy = wikiutil.resolve_wiki(self.request, '') - + interwiki_list = wikiutil.load_wikimap(self.request) buf = StringIO() buf.write('<dl>') - list = self.cfg._interwiki_list.items() # this is where we cached it + list = interwiki_list.items() # this is where we cached it list.sort() for tag, url in list: buf.write('<dt><tt><a href="%s">%s</a></tt></dt>' % ( @@ -351,7 +348,6 @@ else: buf.write('<dd><tt>%s</tt></dd>' % url) buf.write('</dl>') - return self.formatter.rawHTML(buf.getvalue()) def _macro_PageCount(self, args):
--- a/MoinMoin/multiconfig.py Sun Jun 18 13:58:08 2006 +0200 +++ b/MoinMoin/multiconfig.py Thu Jun 29 01:05:53 2006 +0200 @@ -272,7 +272,10 @@ language_ignore_browser = False # ignore browser settings, use language_default # or user prefs - lupy_search = False # disabled until lupy is finished + log_reverse_dns_lookups = True # if we do reverse dns lookups for logging hostnames + # instead of just IPs + + xapian_search = False # disabled until xapian is finished mail_login = None # or "user pwd" if you need to use SMTP AUTH mail_sendmail = None # "/usr/sbin/sendmail -t -i" to not use SMTP, but sendmail @@ -520,8 +523,8 @@ self.navi_bar = [elem % self for elem in self.navi_bar] self.backup_exclude = [elem % self for elem in self.backup_exclude] - # list to cache lupy searcher objects - self.lupy_searchers = [] + # list to cache xapian searcher objects + self.xapian_searchers = [] # check if mail is possible and set flag: self.mail_enabled = (self.mail_smarthost is not None or self.mail_sendmail is not None) and self.mail_from
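With this rename, wikis opt in through the new xapian_search option instead of lupy_search. A minimal wikiconfig.py sketch, assuming the usual DefaultConfig subclass layout (the class body and sitename here are placeholders, not part of this changeset):

    from MoinMoin.multiconfig import DefaultConfig

    class Config(DefaultConfig):
        sitename = u'My Wiki'
        xapian_search = True   # needs Xapian/xapwrap installed and an index built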
--- a/MoinMoin/parser/text_moin_wiki.py Sun Jun 18 13:58:08 2006 +0200 +++ b/MoinMoin/parser/text_moin_wiki.py Thu Jun 29 01:05:53 2006 +0200 @@ -30,8 +30,12 @@ # some common strings PARENT_PREFIX = wikiutil.PARENT_PREFIX + sq_string = ur"('.*?')" # single quoted string + dq_string = ur"(\".*?\")" # double quoted string + q_string = ur"(%s|%s)" % (sq_string, dq_string) # quoted string attachment_schemas = ["attachment", "inline", "drawing"] punct_pattern = re.escape(u'''"\'}]|:,.)?!''') + punct_no_quote_pattern = re.escape(u'''\}]|:,.)?!''') url_pattern = (u'http|https|ftp|nntp|news|mailto|telnet|wiki|file|irc|' + u'|'.join(attachment_schemas) + (config.url_schemas and u'|' + u'|'.join(config.url_schemas) or '')) @@ -43,10 +47,12 @@ 'subpages': wikiutil.CHILD_PREFIX + '?', 'parent': ur'(?:%s)?' % re.escape(PARENT_PREFIX), } - url_rule = ur'%(url_guard)s(%(url)s)\:([^\s\<%(punct)s]|([%(punct)s][^\s\<%(punct)s]))+' % { + url_rule = ur'%(url_guard)s(%(url)s)\:(([^\s\<%(punct)s]|([%(punctnq)s][^\s\<%(punct)s]))+|%(q_string)s)' % { 'url_guard': u'(^|(?<!\w))', 'url': url_pattern, 'punct': punct_pattern, + 'punctnq': punct_no_quote_pattern, + 'q_string': q_string, } ol_rule = ur"^\s+(?:[0-9]+|[aAiI])\.(?:#\d+)?\s" @@ -78,7 +84,7 @@ (?P<tableZ>\|\| $) (?P<table>(?:\|\|)+(?:<[^>]*?>)?(?!\|? $)) (?P<heading>^\s*(?P<hmarker>=+)\s.*\s(?P=hmarker) $) -(?P<interwiki>[A-Z][a-zA-Z]+\:[^\s'\"\:\<\|]([^\s%(punct)s]|([%(punct)s][^\s%(punct)s]))+) +(?P<interwiki>[A-Z][a-zA-Z]+\:(%(q_string)s|([^\s'\"\:\<\|]([^\s%(punct)s]|([%(punct)s][^\s%(punct)s]))+))) (?P<word>%(word_rule)s) (?P<url_bracket>\[((%(url)s)\:|#|\:)[^\s\]]+(\s[^\]]+)?\]) (?P<url>%(url_rule)s) @@ -87,11 +93,12 @@ (?P<smileyA>^(%(smiley)s)(?=\s)) (?P<ent_symbolic>&[a-zA-Z]+;) (?P<ent>[<>&]) -(?P<wikiname_bracket>\[".*?"\]) +(?P<wikiname_bracket>\[%(q_string)s.*?\]) (?P<tt_bt>`.*?`)""" % { 'url': url_pattern, 'punct': punct_pattern, + 'q_string': q_string, 'ol_rule': ol_rule, 'dl_rule': dl_rule, 'url_rule': url_rule, @@ -154,81 +161,48 @@ #result.append("<!-- close item end -->\n") - def interwiki(self, url_and_text, **kw): + def interwiki(self, target_and_text, **kw): # TODO: maybe support [wiki:Page http://wherever/image.png] ? - if len(url_and_text) == 1: - url = url_and_text[0] - text = None - else: - url, text = url_and_text + scheme, rest = target_and_text.split(':', 1) + wikiname, pagename, text = wikiutil.split_wiki(rest) + if not text: + text = pagename + #self.request.log("interwiki: split_wiki -> %s.%s.%s" % (wikiname,pagename,text)) - # keep track of whether this is a self-reference, so links - # are always shown even the page doesn't exist. 
- is_self_reference = 0 - url2 = url.lower() - if url2.startswith('wiki:self:'): - url = url[10:] # remove "wiki:self:" - is_self_reference = 1 - elif url2.startswith('wiki:'): - url = url[5:] # remove "wiki:" - - tag, tail = wikiutil.split_wiki(url) - if text is None: - if tag: - text = tail - else: - text = url - url = "" - elif (url.startswith(wikiutil.CHILD_PREFIX) or # fancy link to subpage [wiki:/SubPage text] - is_self_reference or # [wiki:Self:LocalPage text] or [:LocalPage:text] - Page(self.request, url).exists()): # fancy link to local page [wiki:LocalPage text] - return self._word_repl(url, text) - - wikitag, wikiurl, wikitail, wikitag_bad = wikiutil.resolve_wiki(self.request, url) - href = wikiutil.join_wiki(wikiurl, wikitail) + if wikiname.lower() == 'self': # [wiki:Self:LocalPage text] or [:LocalPage:text] + return self._word_repl(pagename, text) # check for image URL, and possibly return IMG tag - if not kw.get('pretty_url', 0) and wikiutil.isPicture(wikitail): + if not kw.get('pretty_url', 0) and wikiutil.isPicture(pagename): + dummy, wikiurl, dummy, wikitag_bad = wikiutil.resolve_wiki(self.request, rest) + href = wikiutil.join_wiki(wikiurl, pagename) + #self.request.log("interwiki: join_wiki -> %s.%s.%s" % (wikiurl,pagename,href)) return self.formatter.image(src=href) - # link to self? - if wikitag is None: - return self._word_repl(wikitail) - - return (self.formatter.interwikilink(1, tag, tail) + + return (self.formatter.interwikilink(1, wikiname, pagename) + self.formatter.text(text) + - self.formatter.interwikilink(0, tag, tail)) + self.formatter.interwikilink(0, wikiname, pagename)) - def attachment(self, url_and_text, **kw): - """ This gets called on attachment URLs. - """ + def attachment(self, target_and_text, **kw): + """ This gets called on attachment URLs """ _ = self._ - if len(url_and_text) == 1: - url = url_and_text[0] - text = None - else: - url, text = url_and_text + #self.request.log("attachment: target_and_text %s" % target_and_text) + scheme, fname, text = wikiutil.split_wiki(target_and_text) + if not text: + text = fname - inline = url[0] == 'i' - drawing = url[0] == 'd' - url = url.split(":", 1)[1] - url = wikiutil.url_unquote(url, want_unicode=True) - text = text or url + if scheme == 'drawing': + return self.formatter.attachment_drawing(fname, text) - from MoinMoin.action import AttachFile - if drawing: - return self.formatter.attachment_drawing(url, text) - - # check for image URL, and possibly return IMG tag - # (images are always inlined, just like for other URLs) - if not kw.get('pretty_url', 0) and wikiutil.isPicture(url): - return self.formatter.attachment_image(url) + # check for image, and possibly return IMG tag (images are always inlined) + if not kw.get('pretty_url', 0) and wikiutil.isPicture(fname): + return self.formatter.attachment_image(fname) # inline the attachment - if inline: - return self.formatter.attachment_inlined(url, text) + if scheme == 'inline': + return self.formatter.attachment_inlined(fname, text) - return self.formatter.attachment_link(url, text) + return self.formatter.attachment_link(fname, text) def _u_repl(self, word): """Handle underline.""" @@ -365,17 +339,17 @@ if wikitag_bad: return self.formatter.text(word) else: - return self.interwiki(["wiki:" + word]) - + return self.interwiki("wiki:" + word) def _url_repl(self, word): """Handle literal URLs including inline images.""" scheme = word.split(":", 1)[0] if scheme == "wiki": - return self.interwiki([word]) + return self.interwiki(word) + if scheme in 
self.attachment_schemas: - return self.attachment([word]) + return self.attachment(word) if wikiutil.isPicture(word): word = wikiutil.mapURL(self.request, word) @@ -389,43 +363,58 @@ self.formatter.url(0)) - def _wikiname_bracket_repl(self, word): - """Handle special-char wikinames.""" - wikiname = word[2:-2] - if wikiname: - return self._word_repl(wikiname) + def _wikiname_bracket_repl(self, text): + """Handle special-char wikinames with link text, like: + ["Jim O'Brian" Jim's home page] or ['Hello "world"!' a page with doublequotes]i + """ + word = text[1:-1] # strip brackets + first_char = word[0] + if first_char in "'\"": # this is quoted + # split on closing quote + target, linktext = word[1:].split(first_char, 1) + else: # not quoted + # split on whitespace + target, linktext = word.split(None, 1) + if target: + linktext = linktext.strip() + return self._word_repl(target, linktext) else: - return self.formatter.text(word) + return self.formatter.text(text) def _url_bracket_repl(self, word): """Handle bracketed URLs.""" + word = word[1:-1] # strip brackets + + # Local extended link? [:page name:link text] XXX DEPRECATED + if word[0] == ':': + words = word[1:].split(':', 1) + if len(words) == 1: + words = words * 2 + target_and_text = 'wiki:Self:"%s" %s' % tuple(words) + return self.interwiki(target_and_text, pretty_url=1) - # Local extended link? - if word[1] == ':': - words = word[2:-1].split(':', 1) + scheme_and_rest = word.split(":", 1) + if len(scheme_and_rest) == 1: # no scheme + # Traditional split on space + words = word.split(None, 1) if len(words) == 1: words = words * 2 - words[0] = 'wiki:Self:%s' % words[0] - return self.interwiki(words, pretty_url=1) - #return self._word_repl(words[0], words[1]) - # Traditional split on space - words = word[1:-1].split(None, 1) - if len(words) == 1: - words = words * 2 - - if words[0][0] == '#': - # anchor link - return (self.formatter.url(1, words[0]) + - self.formatter.text(words[1]) + - self.formatter.url(0)) - - scheme = words[0].split(":", 1)[0] - if scheme == "wiki": - return self.interwiki(words, pretty_url=1) - if scheme in self.attachment_schemas: - return self.attachment(words, pretty_url=1) + if words[0].startswith('#'): # anchor link + return (self.formatter.url(1, words[0]) + + self.formatter.text(words[1]) + + self.formatter.url(0)) + else: + scheme, rest = scheme_and_rest + if scheme == "wiki": + return self.interwiki(word, pretty_url=1) + if scheme in self.attachment_schemas: + return self.attachment(word, pretty_url=1) + + words = word.split(None, 1) + if len(words) == 1: + words = words * 2 if wikiutil.isPicture(words[1]) and re.match(self.url_rule, words[1]): return (self.formatter.url(1, words[0], css='external', do_escape=0) + @@ -888,7 +877,7 @@ """ Replace match using type name """ result = [] for type, hit in match.groupdict().items(): - if hit is not None and type != "hmarker": + if hit is not None and not type in ["hmarker", ]: ###result.append(u'<span class="info">[replace: %s: "%s"]</span>' % (type, hit)) if self.in_pre and type not in ['pre', 'ent']:
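The rewritten link handlers above pass a single target string around and rely on wikiutil.split_wiki() returning a (wikiname, pagename, linktext) triple, where a quoted pagename may contain spaces. A hypothetical sketch of that contract (the real split_wiki() lives in wikiutil and is not shown in this diff):

    def split_wiki_sketch(target):
        # u'Self:"Page with spaces" link text'
        #   -> (u'Self', u'Page with spaces', u'link text')
        wikiname, rest = target.split(':', 1)
        if rest and rest[0] in '"\'':
            pagename, text = rest[1:].split(rest[0], 1)
        else:
            parts = rest.split(None, 1)
            pagename = parts[0]
            text = len(parts) > 1 and parts[1] or u''
        return wikiname, pagename, text.strip()

    assert split_wiki_sketch(u'Self:"Page with spaces" link text') == \
           (u'Self', u'Page with spaces', u'link text')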
--- a/MoinMoin/request/TWISTED.py Sun Jun 18 13:58:08 2006 +0200 +++ b/MoinMoin/request/TWISTED.py Thu Jun 29 01:05:53 2006 +0200 @@ -31,7 +31,6 @@ self.is_ssl = self.twistd.isSecure() self.path_info = '/' + '/'.join([pagename] + self.twistd.postpath) self.request_method = self.twistd.method - self.remote_host = self.twistd.getClient() self.remote_addr = self.twistd.getClientIP() self.request_uri = self.twistd.uri self.script_name = "/" + '/'.join(self.twistd.prepath[:-1])
--- a/MoinMoin/request/__init__.py Sun Jun 18 13:58:08 2006 +0200 +++ b/MoinMoin/request/__init__.py Thu Jun 29 01:05:53 2006 +0200 @@ -803,9 +803,6 @@ def normalizePagename(self, name): """ Normalize page name - Convert '_' to spaces - allows using nice URLs with spaces, with no - need to quote. - Prevent creating page names with invisible characters or funny whitespace that might confuse the users or abuse the wiki, or just does not make sense. @@ -816,9 +813,6 @@ @rtype: unicode @return: decoded and sanitized page name """ - # Replace underscores with spaces - name = name.replace(u'_', u' ') - # Strip invalid characters name = config.page_invalid_chars_regex.sub(u'', name)
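Page names therefore keep their underscores from now on; the dropped rule is easy to reproduce in isolation if the old behaviour is needed for comparison (standalone illustration, not the actual method):

    def old_underscore_rule(name):
        return name.replace(u'_', u' ')    # this is what the hunk removes

    assert old_underscore_rule(u'Main_Page') == u'Main Page'
    # after this change, normalizePagename() leaves u'Main_Page' unchanged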
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/MoinMoin/script/index/__init__.py Thu Jun 29 01:05:53 2006 +0200 @@ -0,0 +1,17 @@ +# -*- coding: iso-8859-1 -*- +""" + MoinMoin - Fullsearch Index Script Package + + TODO: rename this module back to xapian when script framework is + fixed to not confuse it with the xapian.org "xapian" module. + + @copyright: 2006 by Thomas Waldmann + @license: GNU GPL, see COPYING for details. +""" + +from MoinMoin.util import pysupport + +# create a list of extension scripts from the subpackage directory +index_scripts = pysupport.getPackageModules(__file__) +modules = index_scripts +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/MoinMoin/script/index/build.py Thu Jun 29 01:05:53 2006 +0200 @@ -0,0 +1,44 @@ +# -*- coding: iso-8859-1 -*- +""" + MoinMoin - build xapian search engine's index + + You must run this script as owner of the wiki files, usually this is the + web server user. + + @copyright: 2006 by MoinMoin:ThomasWaldmann + @license: GNU GPL, see COPYING for details. +""" + +from MoinMoin.script import MoinScript + +class IndexScript(MoinScript): + """ Xapian general index script class """ + + def __init__(self, argv, def_values): + MoinScript.__init__(self, argv, def_values) + self.parser.add_option( + "--files", metavar="FILES", dest="file_list", + help="filename of file list, e.g. files.lst (one file per line)" + ) + self.parser.add_option( + "--mode", metavar="MODE", dest="mode", + help="either add (unconditionally add to index) or update (update an existing index)" + ) + + def mainloop(self): + self.init_request() + # Do we have additional files to index? + if self.options.file_list: + self.files = file(self.options.file_list) + else: + self.files = None + self.command() + +class PluginScript(IndexScript): + """ Xapian index build script class """ + + def command(self): + from MoinMoin.Xapian import Index + Index(self.request).indexPages(self.files, self.options.mode) + #Index(self.request).test(self.request) +
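The new script plugs the Xapian indexer into the moin script framework; once MoinScript.init_request() has set up a request, the command boils down to a single call. A minimal sketch using only names visible in the diff (the helper function itself is hypothetical):

    from MoinMoin.Xapian import Index

    def build_index(request, files=None, mode='add'):
        # mode: 'add' = unconditionally add to the index,
        #       'update' = only index changed pages (matches --mode above)
        Index(request).indexPages(files, mode)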
--- a/MoinMoin/script/lupy/__init__.py Sun Jun 18 13:58:08 2006 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,14 +0,0 @@ -# -*- coding: iso-8859-1 -*- -""" - MoinMoin - Fullsearch Index Script Package - - @copyright: 2006 by Thomas Waldmann - @license: GNU GPL, see COPYING for details. -""" - -from MoinMoin.util import pysupport - -# create a list of extension scripts from the subpackage directory -index_scripts = pysupport.getPackageModules(__file__) -modules = index_scripts -
--- a/MoinMoin/script/lupy/build.py Sun Jun 18 13:58:08 2006 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,49 +0,0 @@ -# -*- coding: iso-8859-1 -*- -""" - MoinMoin - build lupy search engine's index - - You must run this script as owner of the wiki files, usually this is the - web server user. - - @copyright: 2005 by Florian Festi, Nir Soffer - @license: GNU GPL, see COPYING for details. -""" - -import os - -from MoinMoin.script import MoinScript -from MoinMoin.request import RequestCLI -from MoinMoin.lupy import Index - - -class IndexScript(MoinScript): - """ Lupy general index script class """ - - def __init__(self, argv, def_values): - MoinScript.__init__(self, argv, def_values) - self.parser.add_option( - "--files", metavar="FILES", dest="file_list", - help="filename of file list, e.g. files.lst (one file per line)" - ) - self.parser.add_option( - "--update", action="store_true", dest="update", - help="when given, update an existing index" - ) - - def mainloop(self): - self.init_request() - # Do we have additional files to index? - if self.options.file_list: - self.files = file(self.options.file_list) - else: - self.files = None - self.command() - -class PluginScript(IndexScript): - """ Lupy index build script class """ - - def command(self): - Index(self.request).indexPages(self.files, self.options.update) - #Index(self.request).test(self.request) - -
--- a/MoinMoin/script/lupy/optimize.py Sun Jun 18 13:58:08 2006 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,23 +0,0 @@ -# -*- coding: iso-8859-1 -*- -""" - MoinMoin - optimize lupy search engine's index - - You must run this script as owner of the wiki files, usually this is the - web server user. - - @copyright: 2005 by Florian Festi, Nir Soffer, - 2006 by Thomas Waldmann - @license: GNU GPL, see COPYING for details. -""" -doit = 0 - -from MoinMoin.script.lupy.build import IndexScript -from MoinMoin.lupy import Index - -class PluginScript(IndexScript): - def command(self): - if doit: - Index(self.request).optimize() - else: - print "See http://moinmoin.wikiwikiweb.de/MoinMoinBugs/LupyOptimizeBreaksIndex !" -
--- a/MoinMoin/search.py Sun Jun 18 13:58:08 2006 +0200 +++ b/MoinMoin/search.py Thu Jun 29 01:05:53 2006 +0200 @@ -2,25 +2,21 @@ """ MoinMoin - search engine - @copyright: 2005 MoinMoin:FlorianFesti - @copyright: 2005 MoinMoin:NirSoffer - @copyright: 2005 MoinMoin:AlexanderSchremmer + @copyright: 2005 MoinMoin:FlorianFesti, + 2005 MoinMoin:NirSoffer, + 2005 MoinMoin:AlexanderSchremmer, + 2006 MoinMoin:ThomasWaldmann, + 2006 MoinMoin:FranzPletz @license: GNU GPL, see COPYING for details """ -import re, time, sys, StringIO +import re, time, sys, StringIO, string from MoinMoin import wikiutil, config from MoinMoin.Page import Page -from MoinMoin.support.lupy.search.term import TermQuery -from MoinMoin.support.lupy.search.phrase import PhraseQuery -from MoinMoin.support.lupy.search.boolean import BooleanQuery, BooleanScorer -from MoinMoin.support.lupy.search.prefix import PrefixQuery -from MoinMoin.support.lupy.search.camelcase import CamelCaseQuery -from MoinMoin.support.lupy.search.regularexpression import RegularExpressionQuery -from MoinMoin.support.lupy.index.term import Term - -from MoinMoin.lupy import Index, tokenizer +import Xapian +from xapian import Query +from Xapian import UnicodeQuery ############################################################################# ### query objects @@ -152,7 +148,7 @@ def sortByCost(self): tmp = [(term.costs(), term) for term in self._subterms] tmp.sort() - self._subterms = [item[1] for item in tmp] + self._subterms = [item[1] for item in tmp] def search(self, page): """ Search for each term, cheap searches first """ @@ -173,12 +169,40 @@ return '|'.join(result) - def lupy_term(self): - required = self.operator== " " - lupy_term = BooleanQuery() + def xapian_wanted(self): + wanted = True + for term in self._subterms: + wanted = wanted and term.xapian_wanted() + return wanted + + def xapian_term(self): + # sort negated terms + terms = [] + not_terms = [] for term in self._subterms: - lupy_term.add(term.lupy_term(), required, term.negated) - return lupy_term + if not term.negated: + terms.append(term.xapian_term()) + else: + not_terms.append(term.xapian_term()) + + # prepare query for not negated terms + if len(terms) == 1: + t1 = Query(terms[0]) + else: + t1 = Query(Query.OP_AND, terms) + + # negated terms? + if not not_terms: + # no, just return query for not negated terms + return t1 + + # yes, link not negated and negated terms' query with a AND_NOT query + if len(not_terms) == 1: + t2 = Query(not_terms[0]) + else: + t2 = Query(Query.OP_OR, not_terms) + + return Query(Query.OP_AND_NOT, t1, t2) class OrExpression(AndExpression): @@ -200,6 +224,10 @@ matches.extend(result) return matches + def xapian_term(self): + # XXX: negated terms managed by _moinSearch? + return Query(Query.OP_OR, [term.xapian_term() for term in self._subterms]) + class TextSearch(BaseExpression): """ A term that does a normal text search @@ -255,34 +283,29 @@ # XXX why not return None or empty list? 
return [Match()] - def lupy_term(self): - or_term = BooleanQuery() - term = self.titlesearch.lupy_term() - or_term.add(term, False, False) - pattern = self._pattern.lower() + def xapian_wanted(self): + return not self.use_re + + def xapian_term(self): if self.use_re: - if pattern[0] == '^': - pattern = pattern[1:] - if pattern[:2] == '\b': - pattern = pattern[2:] - term = RegularExpressionQuery(Term("text", pattern)) + return None # xapian can't do regex search else: - terms = pattern.lower().split() - terms = [list(tokenizer(t)) for t in terms] - term = BooleanQuery() + analyzer = Xapian.WikiAnalyzer() + terms = self._pattern.split() + + # all parsed wikiwords, AND'ed + queries = [] for t in terms: - if len(t) == 1: - term.add(CamelCaseQuery(Term("text", t[0])), True, False) + t = [i.encode(config.charset) for i in list(analyzer.tokenize(t))] + if len(t) < 2: + queries.append(UnicodeQuery(t[0])) else: - phrase = PhraseQuery() - for w in t: - phrase.add(Term("text", w)) - term.add(phrase, True, False) - #term = CamelCaseQuery(Term("text", pattern)) - #term = PrefixQuery(Term("text", pattern), 3) - #term = TermQuery(Term("text", pattern)) - or_term.add(term, False, False) - return or_term + queries.append(UnicodeQuery(Query.OP_AND, t)) + + # titlesearch OR parsed wikiwords + return Query(Query.OP_OR, + (self.titlesearch.xapian_term(), + Query(Query.OP_AND, queries))) class TitleSearch(BaseExpression): @@ -309,7 +332,7 @@ return u'%s!"%s"' % (neg, unicode(self._pattern)) def highlight_re(self): - return u"(%s)" % self._pattern + return u"(%s)" % self._pattern def pageFilter(self): """ Page filter function for single title search """ @@ -336,16 +359,28 @@ # XXX why not return None or empty list? return [Match()] - def lupy_term(self): - pattern = self._pattern.lower() + def xapian_wanted(self): + return not self.use_re + + def xapian_term(self): if self.use_re: - if pattern[0] == '^': - pattern = pattern[1:] - term = RegularExpressionQuery(Term("title", pattern)) + return None # xapian doesn't support regex search else: - term = PrefixQuery(Term("title", pattern), 1000000) # number of chars which are ignored behind the match - #term.boost = 100.0 - return term + analyzer = Xapian.WikiAnalyzer() + terms = self._pattern.split() + terms = [list(analyzer.tokenize(t)) for t in terms] + + # all parsed wikiwords, AND'ed + queries = [] + for t in terms: + t = ['%s%s' % (Xapian.Index.prefixMap['title'], i) + for i in list(analyzer.tokenize(t))] + if len(t) < 2: + queries.append(UnicodeQuery(t[0])) + else: + queries.append(UnicodeQuery(Query.OP_AND, t)) + + return Query(Query.OP_AND, queries) class LinkSearch(BaseExpression): @@ -358,7 +393,6 @@ @param use_re: treat pattern as re of plain text, bool @param case: do case sensitive search, bool """ - pattern = pattern.replace("_", " ") # used for search in links self._pattern = pattern # used for search in text @@ -389,7 +423,7 @@ return u'%s!"%s"' % (neg, unicode(self._pattern)) def highlight_re(self): - return u"(%s)" % self._textpattern + return u"(%s)" % self._textpattern def search(self, page): # Get matches in page name @@ -403,7 +437,7 @@ break else: Found = False - + if Found: # Search in page text results = self.textsearch.search(page) @@ -422,16 +456,16 @@ # XXX why not return None or empty list? 
return [Match()] - def lupy_term(self): + def xapian_wanted(self): + return not self.use_re + + def xapian_term(self): pattern = self.pattern if self.use_re: - if pattern[0] == "^": - pattern = pattern[1:] - term = RegularExpressionQuery(Term("links", pattern)) + return None # xapian doesnt support regex search else: - term = TermQuery(Term("links", pattern)) - term.boost = 10.0 - return term + return UnicodeQuery('%s:%s' % + (Xapian.Index.prefixMap['linkto'], pattern)) ############################################################################ ### Results @@ -625,10 +659,33 @@ return [] +class FoundRemote(FoundPage): + """ Represent an attachment in search results """ + + def __init__(self, wikiname, page_name, attachment, matches=None, page=None): + self.wikiname = wikiname + self.page_name = page_name + self.attachment = attachment + self.page = page + if matches is None: + matches = [] + self._matches = matches + + def weight(self, unique=1): + return 1 + + def get_matches(self, unique=1, sort='start', type=Match): + return [] + + def _unique_matches(self, type=Match): + return [] + + ############################################################################## ### Parse Query ############################################################################## + class QueryParser: """ Converts a String into a tree of Query objects @@ -646,13 +703,15 @@ self.regex = kw.get('regex', 0) def parse_query(self, query): - """ transform an string into a tree of Query objects""" + """ transform an string into a tree of Query objects """ + if isinstance(query, str): + query = query.decode(config.charset) self._query = query result = self._or_expression() if result is None: result = BaseExpression() return result - + def _or_expression(self): result = self._and_expression() if self._query: @@ -683,7 +742,7 @@ r'(?P<OPS>\(|\)|(or\b(?!$)))|' + # or, (, ) r'(?P<MOD>(\w+:)*)' + r'(?P<TERM>("[^"]+")|' + - r"('[^']+')|(\S+)))") # search word itself + r"('[^']+')|(\S+)))") # search word itself self._query = self._query.strip() match = re.match(regex, self._query, re.U) if not match: @@ -727,7 +786,7 @@ if match.group("NEG"): obj.negate() - return obj + return obj def isQuoted(self, text): # Empty string '' is not considered quoted @@ -837,7 +896,7 @@ matchInfo, f.listitem(0), ] - write(''.join(item)) + write(''.join(item)) write(list(0)) return self.getvalue() @@ -1162,8 +1221,8 @@ def run(self): """ Perform search and return results object """ start = time.time() - if self.request.cfg.lupy_search: - hits = self._lupySearch() + if self.request.cfg.xapian_search: + hits = self._xapianSearch() else: hits = self._moinSearch() @@ -1172,12 +1231,14 @@ hits = self._filter(hits) result_hits = [] - for page, attachment, match in hits: - if attachment: - result_hits.append(FoundAttachment(page.page_name, attachment)) + for wikiname, page, attachment, match in hits: + if wikiname in (self.request.cfg.interwikiname, 'Self'): # a local match + if attachment: + result_hits.append(FoundAttachment(page.page_name, attachment)) + else: + result_hits.append(FoundPage(page.page_name, match)) else: - result_hits.append(FoundPage(page.page_name, match)) - + result_hits.append(FoundRemote(wikiname, page, attachment, match)) elapsed = time.time() - start count = self.request.rootpage.getPageCount() return SearchResults(self.query, result_hits, count, elapsed) @@ -1185,22 +1246,34 @@ # ---------------------------------------------------------------- # Private! 
- def _lupySearch(self): - """ Search using lupy + def _xapianSearch(self): + """ Search using Xapian - Get a list of pages using fast lupy search and return moin - search in those pages. + Get a list of pages using fast xapian search and + return moin search in those pages. """ pages = None - index = Index(self.request) - if index.exists(): - self.request.clock.start('_lupySearch') + index = Xapian.Index(self.request) + if index.exists() and self.query.xapian_wanted(): + self.request.clock.start('_xapianSearch') try: - hits = index.search(self.query.lupy_term()) - pages = [(hit.get('pagename'), hit.get('attachment')) for hit in hits] + from MoinMoin.support import xapwrap + query = self.query.xapian_term() + self.request.log("xapianSearch: query = %r" % + query.get_description()) + query = xapwrap.index.QObjQuery(query) + hits = index.search(query) + self.request.log("xapianSearch: finds: %r" % hits) + def dict_decode(d): + """ decode dict values to unicode """ + for k, v in d.items(): + d[k] = d[k].decode(config.charset) + return d + pages = [dict_decode(hit['values']) for hit in hits] + self.request.log("xapianSearch: finds pages: %r" % pages) except index.LockedException: pass - self.request.clock.stop('_lupySearch') + self.request.clock.stop('_xapianSearch') return self._moinSearch(pages) def _moinSearch(self, pages=None): @@ -1212,23 +1285,29 @@ self.request.clock.start('_moinSearch') from MoinMoin.Page import Page if pages is None: - # if we are not called from _lupySearch, we make a full pagelist, + # if we are not called from _xapianSearch, we make a full pagelist, # but don't search attachments (thus attachment name = '') - pages = [(p, '') for p in self._getPageList()] + pages = [{'pagename': p, 'attachment': '', 'wikiname': 'Self', } for p in self._getPageList()] hits = [] fs_rootpage = self.fs_rootpage - for pagename, attachment in pages: - page = Page(self.request, pagename) - if attachment: - if pagename == fs_rootpage: # not really an attachment - page = Page(self.request, "%s%s" % (fs_rootpage, attachment)) - hits.append((page, None, None)) + for valuedict in pages: + wikiname = valuedict['wikiname'] + pagename = valuedict['pagename'] + attachment = valuedict['attachment'] + if wikiname in (self.request.cfg.interwikiname, 'Self'): # THIS wiki + page = Page(self.request, pagename) + if attachment: + if pagename == fs_rootpage: # not really an attachment + page = Page(self.request, "%s%s" % (fs_rootpage, attachment)) + hits.append((wikiname, page, None, None)) + else: + hits.append((wikiname, page, attachment, None)) else: - hits.append((page, attachment, None)) - else: - match = self.query.search(page) - if match: - hits.append((page, attachment, match)) + match = self.query.search(page) + if match: + hits.append((wikiname, page, attachment, match)) + else: # other wiki + hits.append((wikiname, pagename, attachment, None)) self.request.clock.stop('_moinSearch') return hits @@ -1252,8 +1331,11 @@ """ Filter out deleted or acl protected pages """ userMayRead = self.request.user.may.read fs_rootpage = self.fs_rootpage + "/" - filtered = [(page, attachment, match) for page, attachment, match in hits - if page.exists() and userMayRead(page.page_name) or page.page_name.startswith(fs_rootpage)] + thiswiki = (self.request.cfg.interwikiname, 'Self') + filtered = [(wikiname, page, attachment, match) for wikiname, page, attachment, match in hits + if not wikiname in thiswiki or + page.exists() and userMayRead(page.page_name) or + page.page_name.startswith(fs_rootpage)] return filtered
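For AND expressions the new xapian_term() joins the non-negated terms with OP_AND, collects the negated ones under OP_OR, and subtracts them with OP_AND_NOT. A standalone sketch of that query shape with the stock xapian Python bindings (the term strings are made up for illustration):

    from xapian import Query

    positive = Query(Query.OP_AND, [Query('moin'), Query('search')])
    negated = Query(Query.OP_OR, [Query('lupy')])
    combined = Query(Query.OP_AND_NOT, positive, negated)
    print combined.get_description()    # shows the nested AND_NOT structure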
--- a/MoinMoin/support/lupy/__init__.py Sun Jun 18 13:58:08 2006 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,8 +0,0 @@ -# This module is part of the Lupy project and is Copyright 2003 Amir -# Bakhtiar (amir@divmod.org). This is free software; you can redistribute -# it and/or modify it under the terms of version 2.1 of the GNU Lesser -# General Public License as published by the Free Software Foundation. - -"""Lupy Package""" - -__version__ = '0.2.1'
--- a/MoinMoin/support/lupy/document.py Sun Jun 18 13:58:08 2006 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,110 +0,0 @@ -# -*- test-case-name: lupy.test.test_document -*- -"""Documents and Fields""" -# This module is part of the Lupy project and is Copyright 2003 Amir -# Bakhtiar (amir@divmod.org). This is free software; you can redistribute -# it and/or modify it under the terms of version 2.1 of the GNU Lesser -# General Public License as published by the Free Software Foundation. - -class Document(object): - """Documents are the unit of indexing and search. - - A Document is a set of fields. Each field has a name and a textual - value. A field may be stored with the document, in which case it is - returned with search hits on the document. Thus each document should - typically contain stored fields which uniquely identify it. - """ - - def __init__(self): - self._fields = {} - self.fieldNames = [] - - def add(self, field): - """Adds a field to a document.""" - name = field.name() - self._fields[name] = field - if name not in self.fieldNames: - self.fieldNames.append(name) - - def getField(self, name): - """Returns a field with the given name, or None if none exist.""" - return self._fields.get(name, None) - - def get(self, name): - """Returns the string value of a field, or None.""" - field = self.getField(name) - if field is not None: - return field.stringValue() - else: - return None - - def fields(self): - """Return Python iterator over fields.""" - return [self._fields[name] for name in self.fieldNames] - - def __repr__(self): - return '<Document[%s]>' % ("|".join(self.fieldNames),) - - -class Field(object): - """A field is a section of a Document. - - Each field has two parts, a name and a value. Values may be free - text, provided as a string or as a file, or they may be atomic - keywords, which are not further processed. Such keywords may be used - to represent dates, urls, etc. Fields are optionally stored in the - index, so that they may be returned with hits on the document. - """ - - def __init__(self, name, string, store=False, index=True, token=True): - self.nom = name - self.stringVal = string - self.readerVal = None - self.isStored = store - self.isIndexed = index - self.isTokenized = token - - def __repr__(self): - if self.isStored and self.isIndexed and not self.isTokenized: - return '<Keyword<' + self.nom + ':' + self.stringVal + '>>' - elif self.isStored and not self.isIndexed and not self.isTokenized: - return '<Unindexed<' + self.nom + ':' + self.stringVal + '>>' - elif self.isStored and self.isIndexed and self.isTokenized and self.stringVal is not None: - return '<Text<' + self.nom + ':' + self.stringVal + '>>' - elif self.isStored and self.isIndexed and self.isTokenized and self.stringVal is not None: - return '<Text<' + self.nom + ':' + self.readerVal + '>>' - else: - return '<Field<???>' - - def name(self): - return self.nom - - def stringValue(self): - return self.stringVal - - def readerValue(self): - return self.readerVal - - -def Keyword(name, value): - "An untokenized field that is included in the index and returned with search results." - return Field(name, value, True, True, False) - - -def Text(name, strOrFile, store=True): - """A tokenized field that is included in the index and returned - with search results. 
Accepts string or file-like object.""" - if isinstance(strOrFile, (str, unicode)): - res = Field(name, strOrFile, store, True, True) - else: - res = Field(name, None) - res.readerVal = strOrFile - res.stringVal = None - return res - - -def UnIndexed(name, value): - return Field(name, value, True, False, False) - - -def UnStored(name, value): - return Field(name, value, False, True, True)
--- a/MoinMoin/support/lupy/index/__init__.py Sun Jun 18 13:58:08 2006 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,6 +0,0 @@ -# This module is part of the Lupy project and is Copyright 2003 Amir -# Bakhtiar (amir@divmod.org). This is free software; you can redistribute -# it and/or modify it under the terms of version 2.1 of the GNU Lesser -# General Public License as published by the Free Software Foundation. - -"""indexing classes"""
--- a/MoinMoin/support/lupy/index/documentwriter.py Sun Jun 18 13:58:08 2006 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,181 +0,0 @@ -# This module is part of the Lupy project and is Copyright 2003 Amir -# Bakhtiar (amir@divmod.org). This is free software; you can redistribute -# it and/or modify it under the terms of version 2.1 of the GNU Lesser -# General Public License as published by the Free Software Foundation. - -from StringIO import StringIO -from array import array -import re -from MoinMoin.support.lupy.search import similarity -from MoinMoin.support.lupy.index import field, term - -def standardTokenizer(string): - """Yield a stream of downcased words from a string.""" - r = re.compile("\\w+", re.U) - tokenstream = re.finditer(r, string) - for m in tokenstream: - yield m.group().lower() - -class DocumentWriter(object): - - def __init__(self, directory, analyzer=None, mfl=None): - self.directory = directory - self.maxFieldLength = mfl - self.postingTable = {} - self.termBuffer = term.Term('','') - self.analyzer=analyzer or standardTokenizer - - def addDocument(self, segment, doc): - # Write field names - fi = self.fieldInfos = field.FieldInfos() - fi.add(doc) - fi.writeDir(self.directory, segment + '.fnm') - - # Write field values - fieldsWriter = field.FieldsWriter(self.directory, - segment, - self.fieldInfos) - try: - fieldsWriter.addDocument(doc) - finally: - fieldsWriter.close() - - # Invert doc into postingTable - self.postingTable = {} - self.fieldLengths = [0] * (len(self.fieldInfos)) - self.invertDocument(doc) - - # Sort postingTable into an array - postings = self.sortPostingTable() - - - # Write postings - self.writePostings(postings, segment) - - # Write noms of indexed files - self.writeNorms(doc, segment) - - - def invertDocument(self, doc): - fields = doc.fields() - for field in doc.fields(): - fieldName = field.name() - fieldNumber = self.fieldInfos.fieldNumber(fieldName) - - position = self.fieldLengths[fieldNumber] # Position in field - - if field.isIndexed: - if not field.isTokenized: - # Untokenized - self.addPosition(fieldName, field.stringValue(), position) - position += 1 - else: - # Find or make a reader - if field.readerValue() is not None: - val = field.readerValue().read() - elif field.stringValue() is not None: - val = field.stringValue() - else: - raise Exception, 'Field must have either a String or Reader value' - - for tok in self.analyzer(val): - self.addPosition(fieldName, tok, position) - position += 1 - - if self.maxFieldLength and (position > self.maxFieldLength): - break - - self.fieldLengths[fieldNumber] = position - - - def addPosition(self, field, text, position): - self.termBuffer.set(field, text) - - ti = self.postingTable.get(self.termBuffer, None) - - if ti is not None: - freq = ti.freq - ti.positions.append(position) - ti.freq = freq + 1 - else: - trm = term.Term(field, text, False) - self.postingTable[trm] = Posting(trm, position) - - - def sortPostingTable(self): - arr = self.postingTable.values() - arr.sort() - return arr - - - def writePostings(self, postings, segment): - freq = None - prox = None - tis = None - - try: - freq = self.directory.createFile(segment + '.frq') - prox = self.directory.createFile(segment + '.prx') - - tis = term.TermInfosWriter(self.directory, - segment, - self.fieldInfos) - ti = term.TermInfo() - - for posting in postings: - # print 'writing', posting, posting.term - # Add entry to the dictionary with pointers to prox and freq files - ti.set(1, freq.getFilePointer(), prox.getFilePointer()) - 
tis.add(posting.term, ti) - - # Add an entry to the freq file - f = posting.freq - if f == 1: # optimize freq == 1 - freq.writeVInt(1) # set low bit of doc num - else: - freq.writeVInt(0) # the document number - freq.writeVInt(f) # frequency in doc - - lastPosition = 0 - positions = posting.positions - - for position in positions: - prox.writeVInt(position - lastPosition) - lastPosition = position - - finally: - if freq is not None: - freq.close() - if prox is not None: - prox.close() - if tis is not None: - tis.close() - - - def writeNorms(self, doc, segment): - for field in doc.fields(): - if field.isIndexed: - fieldNumber = self.fieldInfos.fieldNumber(field.name()) - norm = self.directory.createFile(segment + - '.f' + str(fieldNumber)) - try: - norm.writeByte(similarity.normInt(self.fieldLengths[fieldNumber])) - finally: - norm.close() - - -class Posting(object): - - def __init__(self, t, position): - self.term = t - self.freq = 1 - self.positions = array('i',[1]) - self.positions[0] = position - - def __repr__(self): - s = '<Posting:' - s += str(self.term) + '>' - return s - - def __cmp__(self, other): - return cmp(self.term, other.term)
--- a/MoinMoin/support/lupy/index/field.py Sun Jun 18 13:58:08 2006 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,173 +0,0 @@ -# This module is part of the Lupy project and is Copyright 2003 Amir -# Bakhtiar (amir@divmod.org). This is free software; you can redistribute -# it and/or modify it under the terms of version 2.1 of the GNU Lesser -# General Public License as published by the Free Software Foundation. - -from MoinMoin.support.lupy import document - -class FieldInfo(object): - - def __init__(self, na, tk, nu): - self.name = na - self.isIndexed = tk - self.number = nu - - -class FieldInfos(object): - - def __init__(self, d=None, name=None): - self.byNumber = [] - self.byName = {} - if d is None and name is None: - self.addString('',False) - else: - input = d.openFile(name) - try: - self.read(input) - finally: - input.close() - - def add(self, doc): - """Adds field info for a Document""" - for field in doc.fields(): - self.addString(field.name(), field.isIndexed) - - def addString(self, name, isIndxd): - fi = self.fieldInfo(name) - if fi is None: - self.addInternal(name, isIndxd) - elif fi.isIndexed is not isIndxd: - fi.isIndexed = True - - def addFieldInfos(self, other): - """Merges in information from another FieldInfos""" - for i in range(len(other)): - fi = other.fieldInfoInt(i) - self.addString(fi.name, fi.isIndexed) - - def addInternal(self, name, isIndexed): - fi = FieldInfo(name, isIndexed, len(self.byNumber)) - - self.byNumber.append(fi) - self.byName[name]=fi - - def fieldNumber(self, fieldName): - fi = self.fieldInfo(fieldName) - if fi is not None: - return fi.number - else: - return -1 - - def fieldInfo(self, fieldName): - return self.byName.get(fieldName, None) - - def fieldName(self, fieldNumber): - return self.byNumber[fieldNumber].name - - def fieldInfoInt(self, fieldNumber): - return self.byNumber[fieldNumber] - - def __len__(self): - return len(self.byNumber) - - def writeDir(self, d, name): - output = d.createFile(name) - try: - self.write(output) - finally: - output.close() - - def write(self, output): - output.writeVInt(len(self)) - - for i in range(len(self)): - fi = self.fieldInfoInt(i) - output.writeString(fi.name) - if fi.isIndexed: - output.writeByte(1) - else: - output.writeByte(0) - - def read(self, input): - size = input.readVInt() - for i in range(size): - self.addInternal(input.readString(), (input.readByte() != 0)) - - def fieldNames(self): - # Experimental for auto-queries - return self.byName.keys() - -class FieldsWriter(object): - - def __init__(self, d, segment, fn): - self.fieldInfos = fn - self.fieldsStream = d.createFile(segment + '.fdt') - self.indexStream = d.createFile(segment + '.fdx') - - - def addDocument(self, doc): - self.indexStream.writeLong(self.fieldsStream.getFilePointer()) - storedCount = 0 - for field in doc.fields(): - if field.isStored: - storedCount += 1 - - self.fieldsStream.writeVInt(storedCount) - - for field in doc.fields(): - if field.isStored: - self.fieldsStream.writeVInt(self.fieldInfos.fieldNumber(field.name())) - - bits = 0 - if field.isTokenized: - bits |= 1 - self.fieldsStream.writeByte(bits) - - self.fieldsStream.writeString(field.stringValue()) - - - def close(self): - self.fieldsStream.close() - self.indexStream.close() - - -class FieldsReader(object): - - def __init__(self, d, segment, fn): - self.fieldInfos = fn - - self.fieldsStream = d.openFile(segment + '.fdt') - self.indexStream = d.openFile(segment + '.fdx') - - self.sze = self.indexStream.length / 8 - - - def close(self): - 
self.fieldsStream.close() - self.indexStream.close() - - - def size(self): - return self.sze - - - def doc(self, n): - self.indexStream.seek(n * 8L) - position = self.indexStream.readLong() - self.fieldsStream.seek(position) - - doc = document.Document() - numFields = self.fieldsStream.readVInt() - for i in range(numFields): - fieldNumber = self.fieldsStream.readVInt() - fi = self.fieldInfos.fieldInfoInt(fieldNumber) - - bits = self.fieldsStream.readByte() - tokenized = ((bits & 1) != 0) - - doc.add(document.Field(fi.name, self.fieldsStream.readString(), - True, fi.isIndexed, tokenized)) - - return doc - -
--- a/MoinMoin/support/lupy/index/indexwriter.py Sun Jun 18 13:58:08 2006 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,228 +0,0 @@ -# This module is part of the Lupy project and is Copyright 2003 Amir -# Bakhtiar (amir@divmod.org). This is free software; you can redistribute -# it and/or modify it under the terms of version 2.1 of the GNU Lesser -# General Public License as published by the Free Software Foundation. - -import sys - -from MoinMoin.support.lupy import store -from MoinMoin.support.lupy.index import segmentmerger, segment, documentwriter - -class IndexWriter(object): - - def __init__(self, path, create=False, analyzer=None): - if path is None: - if create is True: - self.directory = store.RAMDirectory() - else: - self.directory = path - else: - self.directory = store.FSDirectory(path, create) - - self.infoStream = None - self.analyzer = analyzer - self.maxMergeDocs = sys.maxint - self.mergeFactor = 20 # Never < 2 - self.segmentInfos = segment.SegmentInfos() - self.ramDirectory = store.RAMDirectory() - # self.writeLock = open("write.lock", "wb") - # locker.lock(self.writeLock, locker.LOCK_EX) - - if create is True: - self.segmentInfos.write(self.directory) - else: - self.segmentInfos.read(self.directory) - - - def close(self): - self.flushRamSegments() - self.ramDirectory.close() - # self.writeLock.close() - self.directory.close() - - - def docCount(self): - count = 0 - for si in self.segmentInfos: - count += si.docCount - return count - - - def addDocument(self, doc): - dw = documentwriter.DocumentWriter(self.ramDirectory, self.analyzer) - segmentName = self.newSegmentName() - dw.addDocument(segmentName, doc) - self.segmentInfos.append(segment.SegmentInfo(segmentName, 1, self.ramDirectory)) - self.maybeMergeSegments() - - - def newSegmentName(self): - res = '_' + str(self.segmentInfos.counter) - self.segmentInfos.counter += 1 - return res - - - def optimize(self): - self.flushRamSegments() - while ((len(self.segmentInfos) > 1) or (len(self.segmentInfos) == 1 and - (segmentmerger.SegmentReader.hasDeletions(self.segmentInfos[0]) or - self.segmentInfos[0].dir != self.directory))): - minSegment = (len(self.segmentInfos) - self.mergeFactor) - if minSegment < 0: - self.mergeSegments(0) - else: - self.mergeSegments(minSegment) - - - def addIndexes(self, dirs): - """Merges all segments from an array of indexes into this index. - - This may be used to parallelize batch indexing. A large document - collection can be broken into sub-collections. Each sub-collection can be - indexed in parallel, on a different thread, process or machine. The - complete index can then be created by merging sub-collection indexes - with this method. 
- - After this completes, the index is optimized.""" - #### UNTESTED #### - self.optimize() - for d in dirs: - sis = segment.SegmentInfos() - sis.read(d) - for j in range(len(sis)): - self.segmentInfos.append(sis[j]) - self.optimize() - - - def flushRamSegments(self): - """Merges all RAM-resident segments.""" - - sis = self.segmentInfos - minSegment = len(sis) - 1 - docCount = 0 - - while minSegment >= 0 and ((sis[minSegment]).dir == self.ramDirectory): - docCount += sis[minSegment].docCount - minSegment -= 1 - - if (minSegment < 0 or (docCount + sis[minSegment].docCount) > self.mergeFactor or - not (sis[len(sis)-1].dir == self.ramDirectory)): - minSegment += 1 - - if minSegment >= len(sis): - return - self.mergeSegments(minSegment) - - - def maybeMergeSegments(self): - """Incremental segment merger""" - - targetMergeDocs = self.mergeFactor - while targetMergeDocs <= self.maxMergeDocs: - # Find segment smaller than the current target size - minSegment = len(self.segmentInfos) - mergeDocs = 0 - minSegment -= 1 - while minSegment >= 0: - si = self.segmentInfos[minSegment] - if si.docCount >= targetMergeDocs: - break - mergeDocs += si.docCount - minSegment -= 1 - if mergeDocs >= targetMergeDocs: #found a merge to do - self.mergeSegments(minSegment + 1) - else: - break - targetMergeDocs *= self.mergeFactor # increase target size - - - def mergeSegments(self, minSegment): - """Pops segments off of segmentInfos stack down to minSegment, - merges them, and pushes the merged index onto the top of the - segmentInfos stack""" - - mergedName = self.newSegmentName() - mergedDocCount = 0 - merger = segmentmerger.SegmentMerger(self.directory, mergedName) - segmentsToDelete = [] - - for i in range(minSegment, len(self.segmentInfos)): - si = self.segmentInfos[i] - reader = segmentmerger.SegmentReader(si) - merger.add(reader) - if reader.directory is self.directory or reader.directory is self.ramDirectory: - segmentsToDelete.append(reader) - mergedDocCount += si.docCount - merger.merge() - - self.segmentInfos = self.segmentInfos[:minSegment] - self.segmentInfos.append(segment.SegmentInfo(mergedName, - mergedDocCount, - self.directory)) - - # TODO some locking here - self.segmentInfos.write(self.directory) # commit before deleting - self.deleteSegments(segmentsToDelete) # delete now-unused segments - - - def deleteSegments(self, segs): - """Some operating systems (e.g. Windows) don't permit a file to be deleted - while it is opened for read (e.g. by another process or thread). 
So we - assume that when a delete fails it is because the file is open in another - process, and queue the file for subsequent deletion.""" - - deletable = [] - - self.deleteFilesList(self.readDeleteableFiles(), deletable) # try to delete deletable - - for reader in segs: - if reader.directory is self.directory: - self.deleteFilesList(reader.files(), deletable) # try to delete our files - else: - self.deleteFilesDir(reader.files(), reader.directory) # delete, eg, RAM files - self.writeDeleteableFiles(deletable) # note files we can't delete - - - def deleteFilesDir(self, files, dir): - for file in files: - dir.deleteFile(file) - - - def deleteFilesList(self, files, deletable): - for file in files: - try: - self.directory.deleteFile(file) - except OSError: - # this occurs on windows where sometimes - # win reports a file to be in use - # in reality it is windows that is fiddling - # with the file and locking it temporarily - if self.directory.fileExists(file): - # schedule the file for later deletion - deletable.append(file) - - - def readDeleteableFiles(self): - result = [] - if not self.directory.fileExists('deletable'): - return result - input = self.directory.openFile('deletable') - try: - i = input.readInt() - while i > 0: - result.append(input.readString()) - i -= 1 - finally: - input.close() - return result - - - def writeDeleteableFiles(self, files): - output = self.directory.createFile('deletable.new') - try: - output.writeInt(len(files)) - for file in files: - output.writeString(file) - finally: - output.close() - self.directory.renameFile('deletable.new','deletable')
--- a/MoinMoin/support/lupy/index/segment.py Sun Jun 18 13:58:08 2006 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,167 +0,0 @@ -# -*- test-case-name: lupy.test -*- -# This module is part of the Lupy project and is Copyright 2003 Amir -# Bakhtiar (amir@divmod.org). This is free software; you can redistribute -# it and/or modify it under the terms of version 2.1 of the GNU Lesser -# General Public License as published by the Free Software Foundation. - -from MoinMoin.support.lupy.index import term - -#import copy #broken, see comments at top of this file: -from MoinMoin.support import copy - -class SegmentTermEnum: - - def __init__(self, i, fis, isi): - self.input = i - self.fieldInfos = fis - self.size = self.input.readInt() - self.isIndex = isi - - self.indexPointer = 0 - self.position = -1 - self.prev = None - self.prevTxt = '' - self.term = term.Term('','') - self.trmInfo = term.TermInfo() - - - def clone(self): - """Return a copy of self. - """ - - # TODO: implement as __copy__ - clone = copy.copy(self) - clone.input = self.input.clone() - - clone.trmInfo = term.TermInfo() - clone.trmInfo.setTo(self.trmInfo) - #clone.prevTxt = self.term.text() - return clone - - - def close(self): - self.input.close() - - - def docFreq(self): - return self.trmInfo.docFreq - - - def freqPointer(self): - return self.trmInfo.freqPointer - - - def next(self): - self.position += 1 - - if self.position > self.size -1: - self.position += 1 - self.term = None - raise StopIteration - - self.prev = self.term - self.term = self.readTerm() - - self.trmInfo.docFreq = self.input.readVInt() - self.trmInfo.freqPointer += self.input.readVLong() - self.trmInfo.proxPointer += self.input.readVLong() - - if self.isIndex: - self.indexPointer += self.input.readVLong() - - return self.term, self.indexPointer - - def __iter__(self): - return self - - def proxPointer(self): - return self.trmInfo.proxPointer - - - def readTerm(self): - # this bit is a mite tricky. in the java version they use a - # buffer for reading and just use 'start' as the offset for - # putting the read string into the buffer; when strings with - # common prefixes were read in, the offset would preserve the - # prefix. So here we just remember the last string and slice - # the common prefix from it. 
- start = self.input.readVInt() - self.prevTxt = txt = self.prevTxt[:start] + self.input.readString() - fi = self.input.readVInt() - fld = self.fieldInfos.fieldName(fi) - t = term.Term(fld,txt,False) - return t - - - def seek(self, pointer, p, t, ti): - self.input.seek(pointer) - self.position = p - self.term = t - self.prev = None - self.trmInfo.setTo(ti) - self.prevTxt = self.term.text() - - def termInfo(self, ti=None): - if ti is None: - nti = term.TermInfo() - nti.setTo(self.trmInfo) - return nti - else: - ti.setTo(self.trmInfo) - - def __cmp__(a, b): - return cmp(a.term, b.term) - - -class SegmentInfo(object): - - def __init__(self, name, docCount, d): - self.name = name - self.docCount = docCount - self.dir = d - - -class SegmentInfos(list): - - def __init__(self, lst = None): - self.counter = 0 - if lst is not None: - self.extend(lst) - - def __getslice__(self, lo, hi): - res = SegmentInfos(list.__getslice__(self, lo, hi)) - res.counter = self.counter - return res - - def read(self, directory): - input = directory.openFile('segments') - try: - self.counter = input.readInt() # read counter - i = input.readInt() - while i > 0: # read segment infos - si = SegmentInfo(input.readString(), - input.readInt(), - directory) - self.append(si) - i -= 1 - finally: - input.close() - - def write(self, directory): - output = directory.createFile('segments.new') - try: - output.writeInt(self.counter) - output.writeInt(len(self)) - for si in self: - output.writeString(si.name) - output.writeInt(si.docCount) - finally: - output.close() - - # Install new segment info - directory.renameFile('segments.new','segments') - - def __repr__(self): - return 'SegInfo' + list.__repr__(self) - -
--- a/MoinMoin/support/lupy/index/segmentmerger.py Sun Jun 18 13:58:08 2006 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,1078 +0,0 @@ -# -*- test-case-name: lupy.test -*- -# This module is part of the Lupy project and is Copyright 2003 Amir -# Bakhtiar (amir@divmod.org). This is free software; you can redistribute -# it and/or modify it under the terms of version 2.1 of the GNU Lesser -# General Public License as published by the Free Software Foundation. - -import sys - -from array import array - -from MoinMoin.support.lupy.util import BitVector - -from MoinMoin.support.lupy.index import field, term, segment - -#import copy #broken, see comments at top of this file: -from MoinMoin.support import copy - -from bisect import insort -import os - -class IndexReader(object): - - """IndexReader is an abstract class, providing an interface for - accessing an index. Search of an index is done entirely through this abstract - interface, so that any subclass which implements it is searchable. - - Concrete subclasses of IndexReader are usually constructed with a call to L{lupy.search.indexsearcher.open}C{(path)}. - - For efficiency, in this API documents are often referred to via document - numbers, non-negative integers which each name a unique document in the index. - These document numbers are ephemeral--they may change as documents are added - to and deleted from an index. Clients should thus not rely on a given document - having the same number between sessions. """ - - def __init__(self, d): - self.directory = d - - def indexExists(self, d): - """Returns True if an index exists at the specified directory.""" - return self.directory.fileExists('segments') - - def isLocked(self): - # return self.directory.fileExists('write.lock') - return False - - def lastModified(self, d): - """Returns the time the index in this directory was last modified.""" - return self.directory.fileModified('segments') - - def lastModifiedString(self, d): - return self.lastModified(d) - - - #def unlock(self, directory): - # """Forcibly unlocks the index in the named directory. - # - # Caution: this should only be used by failure recovery code, - # when it is known that no other process nor thread is in fact - # currently accessing this index.""" - # - # directory.deleteFile('write.lock') - # directory.deleteFile('commit.lock') - - - def close(self): - """Closes files associated with this index. - Also saves any new deletions to disk. - No other methods should be called after this has been called.""" - - self.doClose() - - - def doClose(self): - pass - - - def delete(self, docNum): - - """Deletes the document numbered C{docNum}. Once a document - is deleted it will not appear in TermDocs or TermPositions - enumerations. Attempts to read its field with the L{document} - method will result in an error. The presence of this document - may still be reflected in the C{docFreq} statistic, though - this will be corrected eventually as the index is further - modified. """ - self.doDelete(docNum) - - - def deleteTerm(self, term): - """ Deletes all documents containing C{term}. - This is useful if one uses a document field to hold a unique ID string for - the document. Then to delete such a document, one merely constructs a - term with the appropriate field and the unique ID string as its text and - passes it to this method. Returns the number of documents deleted. 
- """ - docs = self.termDocsTerm(term) - try: - return len([self.delete(doc) for doc,freq in docs]) - finally: - docs.close() - - - - def termDocs(self): - """Returns an unpositioned TermDocs enumerator. - """ - - - def termDocsTerm(self, term): - """ Returns an enumeration of all the documents which contain - C{term}. For each document, the document number, the frequency of - the term in that document is also provided, for use in search scoring. - Thus, this method implements the mapping: - - Term S{->} <docNum, freq>* - - The enumeration is ordered by document number. Each document number - is greater than all that precede it in the enumeration.""" - - termDocs = self.termDocs() - termDocs.seekTerm(term) - return termDocs - - - def termPositionsTerm(self, term): - - """Returns an enumeration of all the documents which contain - C{term}. For each document, in addition to the document - number and frequency of the term in that document, a list of - all of the ordinal positions of the term in the document is - available. Thus, this method implements the mapping: - - M{Term S{->} <docNum, freq, <pos(1), pos(2), ... , pos(freq-1)>>*} - - This positional information faciliates phrase and proximity searching. - - The enumeration is ordered by document number. Each document - number is greater than all that precede it in the - enumeration.""" - - termPositions = self.termPositions() - termPositions.seekTerm(term) - return termPositions - -class SegmentTermDocs(object): - - def __init__(self, parent): - self.parent = parent - self.freqStream = parent.freqStream.clone() - self.deletedDocs = parent.deletedDocs - - self.docu = 0 - self.frq = 0 - - def close(self): - self.freqStream.close() - - def __iter__(self): - return self - - def next(self): - while True: - if self.freqCount == 0: - raise StopIteration - - docCode = self.freqStream.readVInt() - self.docu += docCode >> 1 - if (docCode & 1): - self.frq = 1 - else: - self.frq = self.freqStream.readVInt() - - self.freqCount -= 1 - - if self.deletedDocs is None or (not self.deletedDocs.get(self.docu)): - return self.docu, self.frq - self.skippingDoc() - - - def read(self): - return list(self) - - def skippingDoc(self): - pass - - def seekTerm(self, term): - ti = self.parent.tis.getTerm(term) - self.seekTi(ti) - - - def seekTi(self, ti): - if ti is None: - self.freqCount = 0 - else: - self.freqCount = ti.docFreq - self.docu = 0 - self.freqStream.seek(ti.freqPointer) - - -class SegmentTermPositions(SegmentTermDocs): - - def __init__(self, p): - self.proxCount = 0 - self.position = 0 - SegmentTermDocs.__init__(self, p) - - self.proxStream = self.parent.proxStream.clone() - - def close(self): - SegmentTermDocs.close(self) - self.proxStream.close() - - - def next(self): - #generator for accessing positions in the current doc - #kinda lame since it utterly breaks after next iteration - def nextPosition(freq): - for i in range(freq): - self.proxCount -= 1 - self.position += self.proxStream.readVInt() - yield self.position - - #skip unused positions - for i in range(self.proxCount): - - self.proxStream.readVInt() - - self.doc, self.frq = SegmentTermDocs.next(self) - self.proxCount = self.frq - self.position = 0 - return self.doc, self.frq, nextPosition(self.frq) - - - def skippingDoc(self): - # skip all positions - for f in range(self.frq, 0, -1): - self.proxStream.readVInt() - - def seekTi(self, ti): - SegmentTermDocs.seekTi(self, ti) - - if ti is not None: - self.proxStream.seek(ti.proxPointer) - else: - self.proxCount = 0 - - def __repr__(self): - s = 
'<stp>' + str(self.position) - return s - -class SegmentMergeInfo(object): - - def __init__(self, b, te, r): - self.base = b - self.reader = r - self.termEnum = te - self.term = te.term - self.docMap = None - self.postings = SegmentTermPositions(r) - - if self.reader.deletedDocs is not None: - # build array with maps document numbers around deletions - deletedDocs = self.reader.deletedDocs - maxDoc = self.reader.maxDoc() - self.docMap = [0] * maxDoc - j = 0 - for i in range(maxDoc): - if deletedDocs.get(i): - self.docMap[i] = -1 - else: - self.docMap[i] += 1 - - - def close(self): - self.termEnum.close() - self.postings.close() - - - def advance(self): - #I don't see a reasonable way out of this one. - try: - self.term, self.indexPointer= self.termEnum.next() - self.trmInfo = self.termEnum.termInfo() - return True - except StopIteration: - self.term = None - return False - - def __repr__(self): - return '<SegMergInfo' + str(self.term) +'>' - - def __lt__(a, b): - if a.term == b.term: - return a.base < b.base - else: - return a.termEnum < b.termEnum - - - -class SegmentMerger(object): - - def __init__(self, dir, name): - self.directory = dir - self.segment = name - self.freqOutput = None - self.proxOutput = None - self.termInfosWriter = None - self.readers = [] - self.termInfo = term.TermInfo() - self.smis = [] - - def add(self, reader): - self.readers.append(reader) - - - def appendPostings(self, smis, n): - lastDoc = 0 - df = 0 # number of with term - - for i in range(n): - smi = smis[i] - postings = smi.postings - base = smi.base - docMap = smi.docMap - smi.termEnum.termInfo(self.termInfo) - postings.seekTi(self.termInfo) - - for doc, freq, nextPos in postings: - if docMap is None: - # no deletions - d = base + doc - else: - # re-map around deletions - d = base + docMap[postings.doc] - if d < lastDoc: - raise RuntimeException, 'docs out of order' - - # use low bit ot flag freq = 1 - docCode = (d - lastDoc) << 1 - lastDoc = d - - if freq == 1: - # write doc & freq=1 - self.freqOutput.writeVInt(docCode | 1) - else: - # write doc - self.freqOutput.writeVInt(docCode) - # write frequency in doc - self.freqOutput.writeVInt(freq) - - lastPosition = 0 - for position in nextPos: - self.proxOutput.writeVInt(position - lastPosition) - lastPosition = position - - df += 1 - - return df - - - def merge(self): - try: - self.mergeFields() - self.mergeTerms() - self.mergeNorms() - finally: - for reader in self.readers: - reader.close() - - - def mergeFields(self): - # merge field names - self.fieldInfos = field.FieldInfos() - for reader in self.readers: - self.fieldInfos.addFieldInfos(reader.fieldInfos) - self.fieldInfos.writeDir(self.directory, self.segment + '.fnm') - - # merge field values - fieldsWriter = field.FieldsWriter(self.directory, - self.segment, - self.fieldInfos) - - try: - for reader in self.readers: - deletedDocs = reader.deletedDocs - maxDoc = reader.maxDoc() - for j in range(maxDoc): - if deletedDocs is None or not deletedDocs.get(j): - # skip deleted docs - fieldsWriter.addDocument(reader.document(j)) - finally: - fieldsWriter.close() - - - - def mergeNorms(self): - for i in range(len(self.fieldInfos)): - fi = self.fieldInfos.fieldInfoInt(i) - if fi.isIndexed: - output = self.directory.createFile(self.segment + '.f' + str(i)) - try: - for reader in self.readers: - deletedDocs = reader.deletedDocs - input = reader.normStream(fi.name) - maxDoc = reader.maxDoc() - try: - for k in range(maxDoc): - if input is None: - norm = 0 - else: - norm = input.readByte() - output.writeByte(norm) - 
finally: - if input is not None: - input.close() - finally: - output.close() - - - def mergeTermInfo(self, smis, n): - freqPointer = self.freqOutput.getFilePointer() - proxPointer = self.proxOutput.getFilePointer() - - # Append posting data - df = self.appendPostings(smis, n) - - if df > 0: - # add an entry to the dictionary with pointers to prox and freq files - self.termInfo.set(df, freqPointer, proxPointer) - self.termInfosWriter.add(smis[0].term, self.termInfo) - - - - def mergeTermInfos(self): - smis = self.smis - base = 0 - - - for reader in self.readers: - termEnum = reader.terms() - smi = SegmentMergeInfo(base, termEnum, reader) - base += reader.numDocs() - if smi.advance(): - insort(smis, smi) - else: - smi.close() - - match = [0] * len(self.readers) - while len(smis) > 0: - # pop matching terms - matchSize = 0 - match[matchSize] = smis.pop(0) - matchSize += 1 - term = match[0].term - top = smis and smis[0] or None - - while top is not None and cmp(term,top.term) == 0: - match[matchSize] = smis.pop(0) - matchSize += 1 - top = smis and smis[0] or None - - # add new TermInfo - self.mergeTermInfo(match, matchSize) - - while matchSize > 0: - matchSize -= 1 - smi = match[matchSize] - if smi.advance(): - insort(smis, smi) - else: - smi.close() - - - def mergeTerms(self): - try: - self.freqOutput = self.directory.createFile(self.segment + '.frq') - self.proxOutput = self.directory.createFile(self.segment + '.prx') - self.termInfosWriter = term.TermInfosWriter(self.directory, - self.segment, - self.fieldInfos) - self.mergeTermInfos() - finally: - if self.freqOutput is not None: - self.freqOutput.close() - if self.proxOutput is not None: - self.proxOutput.close() - if self.termInfosWriter is not None: - self.termInfosWriter.close() - for smi in self.smis: - smi.close() - - def segmentReader(self, i): - return self.readers[i] - - -class SegmentReader(IndexReader): - - # Class methods - def hasDeletions(cls, si): - return si.dir.fileExists(si.name + '.del') - - hasDeletions = classmethod(hasDeletions) - - - # instance methods - def __init__(self, si, closeDir=False): - self.directory = si.dir - self.closeDirectory = closeDir - self.segment = si.name - self.nrms = {} - self.deletedDocsDirty = False - - self.fieldInfos = field.FieldInfos(self.directory, - self.segment + '.fnm') - self.fieldsReader = field.FieldsReader(self.directory, - self.segment, - self.fieldInfos) - - self.tis = TermInfosReader(self.directory, - self.segment, - self.fieldInfos) - - if SegmentReader.hasDeletions(si): - self.deletedDocs = BitVector(self.directory, - self.segment + '.del') - else: - self.deletedDocs = None - - # makes sure that all index files have been read or are kept open - # so that if an index update removes them we'll still have them - self.freqStream = self.directory.openFile(self.segment + '.frq') - self.proxStream = self.directory.openFile(self.segment + '.prx') - - self.openNorms() - - - def closeNorms(self): - for v in self.nrms.values(): - norm = v - v.inStream.close() - - - def docFreq(self, t): - ti = self.tis.getTerm(t) - if ti is None: - return 0 - else: - return ti.docFreq - - - def doClose(self): - if self.deletedDocsDirty: - self.deletedDocs.write(self.directory, self.segment + ".tmp") - self.directory.renameFile(self.segment + ".tmp", - self.segment + ".del") - self.deletedDocsDirty = False - - self.fieldsReader.close() - self.tis.close() - - if self.freqStream is not None: - self.freqStream.close() - if self.proxStream is not None: - self.proxStream.close() - - self.closeNorms() - - if 
self.closeDirectory: - self.directory.close() - - - def document(self, n): - if self.isDeleted(n): - raise Exception, 'attempt to access deleted document' - return self.fieldsReader.doc(n) - - - def doDelete(self, docNum): - if self.deletedDocs is None: - self.deletedDocs = BitVector(self.maxDoc()) - self.deletedDocsDirty = True - self.deletedDocs.set(docNum) - - - def files(self): - suffix = ['.fnm','.fdx','.fdt','.tii','.tis','.frq','.prx'] - files = map((lambda x: self.segment + x), suffix) - - if self.directory.fileExists(self.segment + '.del'): - files.append(self.segment + '.del') - - for i in range(len(self.fieldInfos)): - fi = self.fieldInfos.fieldInfoInt(i) - if fi.isIndexed: - files.append(self.segment + '.f' + str(i)) - - return files - - - def isDeleted(self, n): - return (self.deletedDocs is not None and self.deletedDocs.get(n)) - - - def maxDoc(self): - return self.fieldsReader.size() - - - def normsField(self, field): - norm = self.nrms.get(field, None) - if norm is None: - return None - if norm.bytes is None: - bytes = array('B',[0x00]*self.maxDoc()) - self.norms(field, bytes, 0) - norm.bytes = bytes - - return norm.bytes - - - def norms(self, field, bytes, offset): - normStream = self.normStream(field) - if normStream is None: - return - try: - normStream.readBytes(bytes, offset, self.maxDoc()) - finally: - normStream.close() - - - def normStream(self, field): - norm = self.nrms.get(field, None) - if norm is None: - return None - # Cloning???? - result = norm.inStream.clone() - result.seek(0) - return result - - - def numDocs(self): - n = self.maxDoc() - if self.deletedDocs is not None: - n -= self.deletedDocs.count() - return n - - def openNorms(self): - for i in range(len(self.fieldInfos)): - fi = self.fieldInfos.fieldInfoInt(i) - if fi.isIndexed: - self.nrms[fi.name]=Norm(self.directory.openFile( - (self.segment + '.f' + str(fi.number)))) - - - def termDocs(self): - return SegmentTermDocs(self) - - - - def termPositions(self): - return SegmentTermPositions(self) - - - def terms(self, t = None): - return self.tis.terms(t) - - def fieldNames(self): - # Experimental for auto-queries - # Return a sorted list of all the field names - fNames = self.fieldInfos.fieldNames() - if not fNames: - return [] - # Remove the field with no name - fNames.remove('') - return fNames - - - -class Norm(object): - - def __init__(self, inStream): - self.inStream = inStream - self.bytes = None - - -class SegmentsReader(IndexReader): - - def __init__(self, directory, r): - IndexReader.__init__(self, directory) - self.readers = r - self.maxiDoc = 0 - self.normsCache = {} - self.numiDocs = -1 - self.starts = [0] - - i = 0 - for reader in self.readers: - self.maxiDoc += reader.maxDoc() - self.starts.append(self.maxiDoc) - - def docFreq(self, t): - total = 0 - for r in self.readers: - total += r.docFreq(t) - return total - - - def doClose(self): - for r in self.readers: - r.close() - - - def document(self, n): - # find segment num - i = self.readerIndex(n) - # dispatch to segment reader - return self.readers[i].document(n - self.starts[i]) - - - def doDelete(self, n): - # invalidate cache - self.numiDocs = -1 - # find seg num - i = self.readerIndex(n) - # dispatch to seg reader - self.readers[i].doDelete(n - self.starts[i]) - - - def isDeleted(self, n): - # find segment num - i = self.readerIndex(n) - # dispatch to segment reader - return self.readers[i].isDeleted(n - self.starts[i]) - - - def maxDoc(self): - return self.maxiDoc - - - def normsField(self, field): - bytes = self.normsCache.get(field, 
None) - if bytes is not None: - # cache hit - return bytes - - bytes = array('B',[0x00] * self.maxDoc()) - for i in range(len(self.readers)): - self.readers[i].norms(field, bytes, self.starts[i]) - # update cache - self.normsCache[field]=bytes - return bytes - - - #def numDocs(self): - # # check cache - # if numiDocs == -1: - # # cache miss - recompute - # n = 0 - # for r in self.readers: - # # sum from readers - # n += r.numDocs() - # self.numiDocs = n - # return self.numiDocs - - - def readerIndex(self, n): - # Search starts array for first element less than n - lo = 0 - hi = len(self.readers) - 1 - - while hi >= lo: - mid = (lo + hi) >> 1 - midValue = self.starts[mid] - if n < midValue: - hi = mid - 1 - elif n > midValue: - lo = mid + 1 - else: - return mid - return hi - - - def termDocs(self): - return SegmentsTermDocs(self.readers, self.starts) - - - def termPositions(self): - return SegmentsTermPositions(self.readers, self.starts) - - def terms(self, t = None): - return SegmentsTermEnum(self, t) - - def fieldNames(self): - # Experimental for auto-queries - if self.readers: - return self.readers[0].fieldInfos.fieldNames() - else: - return [] - -class SegmentsTermEnum(segment.SegmentTermEnum): - - def __init__(self, segmentsreader, term=None): - self.enums = [sr.terms(term) for sr in segmentsreader.readers] - self.prev = None - min = self.enums[0] - for enum in self.enums: - if enum.term is not None and enum < min: - min = enum - self.term = min.term - - def close(self): - for e in self.enums: e.close() - - def next(self): - min = self.enums[0] - for enum in self.enums: - if enum.term is not None and enum<min: - min = enum - if min.term is None: - raise StopIteration - else: - self.prev = self.term - self.term = min.term - try: - min.next() - except StopIteration: - pass - - -class SegmentsTermDocs(object): - - def __init__(self, r, s): - self.readers = r - self.starts = s - - self.base = 0 - self.pointer = 0 - self.current = None - self.term = None - - self.segTermDocs = [None] * len(r) - - - def close(self): - for segtdoc in self.segTermDocs: - if segtdoc is not None: - segtdoc.close() - - def freq(self): - return self.current.frq - frq = property(freq) # what can i say? 
API in transition - - def __iter__(self): - def x(): - if self.current is not None: - for item in self.current: - yield item - for ptr, reader in list(enumerate(self.readers))[self.pointer:]: - self.pointer = ptr - self.base = self.starts[self.pointer] - self.current = self.termDocsInt(self.pointer) - for item in self.current: - yield (item[0]+self.base,) + item[1:] - return x() - - - def read(self): - dfs = [] - while True: - while self.current is None: - if self.pointer < len(self.readers): - # try next segment - self.base = self.starts[self.pointer] - self.current = self.termDocsInt(self.pointer) - self.pointer += 1 - else: - return dfs - segmentDFs = self.current.read() - if segmentDFs: - b = self.base - for i, (d, f) in enumerate(segmentDFs): - segmentDFs[i] = d + b, f - dfs.extend(segmentDFs) - else: - self.current = None - - - def seekTerm(self, term): - self.term = term - self.base = 0 - self.pointer = 0 - self.current = None - - def termDocsInt(self, i): - if self.term is None: - return None - result = self.segTermDocs[i] - if result is None: - result = self.termDocsReader(self.readers[i]) - self.segTermDocs[i] = result - result.seekTerm(self.term) - return result - - - def termDocsReader(self, reader): - return reader.termDocs() - - - -class SegmentsTermPositions(SegmentsTermDocs): - - def termDocsReader(self, reader): - return reader.termPositions() - - - #def nextPosition(self): - # return self.current.nextPosition() - -class TermInfosReader(object): - - def __init__(self, d, seg, fis): - self.directory = d - self.segment = seg - self.fieldInfos = fis - - self.indexTerms = None - - self.enum = segment.SegmentTermEnum( - self.directory.openFile(self.segment + '.tis'), - self.fieldInfos, - False) - - self.sze = self.enum.size - self.readIndex() - - - def close(self): - if self.enum is not None: - self.enum.close() - - - - def getInt(self, position): - if self.sze == 0: - return None - - if (self.enum is not None and self.enum.term() is not None and - position > self.enum.position and - position < (self.enum.position + term.TermInfosWriter.INDEX_INTERVAL)): - # can avoid seek - return self.scanEnum(position) - - # must seek - self.seekEnum(position/term.TermInfosWriter.INDEX_INTERVAL) - return self.scanEnum(position) - - - def getIndexOffset(self, term): - #TODO - use bisect module? 
- - lo = 0 - hi = len(self.indexTerms) - 1 - - while hi >= lo: - mid = (lo + hi) >> 1 - delta = cmp(term, self.indexTerms[mid]) - if delta < 0: - hi = mid - 1 - elif delta > 0: - lo = mid + 1 - else: - return mid - - return hi - - - def getTerm(self, t): - if self.sze == 0: - return None - - # Optimize sequential access: first try scanning - # cached enum w/o seeking - - if (self.enum.term is not None and - ((self.enum.prev is not None and cmp(t,self.enum.prev) > 0) or - cmp(t,self.enum.term) >= 0)): - # term is at or past current - enumOffset = (self.enum.position/term.TermInfosWriter.INDEX_INTERVAL)+1 - - if (len(self.indexTerms) == enumOffset or - cmp(t, self.indexTerms[enumOffset]) < 0): - # but before end of block - # no need to seek - return self.scanEnum(t) - - # random-access: must seek - self.seekEnum(self.getIndexOffset(t)) - return self.scanEnum(t) - - - - def getPosition(self, term): - if size == 0: - return -1 - - indexOffset = self.getIndexOffest(term) - self.seekEnum(indexOffset) - - while (term > self.enum.term()) and self.enum.advance(): - pass - - if term == self.enum.term(): - return self.enum.position - else: - return -1 - - - def readIndex(self): - indexEnum = segment.SegmentTermEnum( - self.directory.openFile(self.segment + '.tii'), - self.fieldInfos, - True) - - try: - indexSize = indexEnum.size - - self.indexTerms = [] - self.indexInfos = [] - self.indexPointers = [] - - for term, indexPointer in indexEnum: - self.indexTerms.append(indexEnum.term) - self.indexInfos.append(indexEnum.termInfo()) - self.indexPointers.append(indexEnum.indexPointer) - - finally: - indexEnum.close() - - - def scanEnum(self, position): - while(self.enum.position < position): - if not enum.next(): - return None - return self.enum.term() - - - def scanEnum(self, term): - # Scans within block for matching term. - t = self.enum.term - while (cmp(term, t) > 0): - try: - #ugh ugh it is 7am make it stop - t = self.enum.next()[0] - except StopIteration: - break - if (self.enum.term is not None and cmp(term, self.enum.term) == 0): - return self.enum.termInfo() - else: - return None - - - def seekEnum(self, indexOffset): - self.enum.seek(self.indexPointers[indexOffset], - (indexOffset * term.TermInfosWriter.INDEX_INTERVAL) - 1, - self.indexTerms[indexOffset], self.indexInfos[indexOffset]) - - def terms(self, term = None): - if term is None: - # Returns an enumeration of all the Terms and TermInfos in the set - if (self.enum.position != -1): - # if not at start - # reset to start - self.seekEnum(0) - else: - self.getTerm(term) - - res = self.enum.clone() - return res -
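appendPostings above stores each posting as the gap to the previous document number, shifted left one bit so the low bit can flag the very common freq == 1 case and skip a separate frequency value; SegmentTermDocs.next reverses the same scheme when reading. A self-contained sketch of that encoding, using plain Python ints in place of the VInt stream:

    def encode_postings(postings):
        # postings: sorted list of (doc, freq) pairs for one term
        out = []
        last_doc = 0
        for doc, freq in postings:
            doc_code = (doc - last_doc) << 1    # gap to the previous doc, low bit kept free
            last_doc = doc
            if freq == 1:
                out.append(doc_code | 1)        # low bit set: frequency is implicitly 1
            else:
                out.append(doc_code)            # low bit clear: explicit frequency follows
                out.append(freq)
        return out

    def decode_postings(codes):
        result, doc, i = [], 0, 0
        while i < len(codes):
            code = codes[i]
            i += 1
            doc += code >> 1
            if code & 1:
                freq = 1
            else:
                freq = codes[i]
                i += 1
            result.append((doc, freq))
        return result

    sample = [(3, 1), (7, 4), (8, 1)]
    assert decode_postings(encode_postings(sample)) == sample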
--- a/MoinMoin/support/lupy/index/term.py Sun Jun 18 13:58:08 2006 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,157 +0,0 @@ -# This module is part of the Lupy project and is Copyright 2003 Amir -# Bakhtiar (amir@divmod.org). This is free software; you can redistribute -# it and/or modify it under the terms of version 2.1 of the GNU Lesser -# General Public License as published by the Free Software Foundation. - -class Term(object): - - def __init__(self, fld, txt, intern=False): - self.set(fld, txt) - - def __cmp__(self, other): - """Compares two terms, returning an integer which is less than zero iff this - term belongs after the argument, equal zero iff this term is equal to the - argument, and greater than zero iff this term belongs after the argument. - - The ordering of terms is first by field, then by text.""" - - if self.fld == other.fld: - # fields are interned - return cmp(self.txt, other.txt) - else: - return cmp(self.fld, other.fld) - - def __hash__(self): - return self._hash - - def field(self): - return self.fld - - def readObject(self, inp): - inp.defaultReadObject() - - def set(self, fld, txt): - self.fld = fld - self.txt = txt - self._hash = hash(fld + txt) - - def text(self): - return self.txt - - def __repr__(self): - return 'Term<'+self.fld.encode('utf8')+':'+self.txt.encode('utf8')+'>' - -class TermInfo(object): - - def __init__(self): - self.docFreq = 0 - self.freqPointer = 0 - self.proxPointer = 0 - - def set(self, df, fp, pp): - self.docFreq = df - self.freqPointer = fp - self.proxPointer = pp - - def setTo(self, ti): - self.docFreq = ti.docFreq - self.freqPointer = ti.freqPointer - self.proxPointer = ti.proxPointer - - def __repr__(self): - return '<TermInfo:d:' + str(self.docFreq)+ ' f:' + str(self.freqPointer) +\ - ' p:' + str(self.proxPointer) + '>' - - -class TermInfosWriter(object): - INDEX_INTERVAL = 128 - - - def __init__(self, d, seg, fis, isIndex = False): - - self.initialize(d, seg, fis, isIndex) - - self.size = 0 - self.lastIndexPointer = 0 - self.lastTerm = Term('','') - self.lastTi = TermInfo() - - if isIndex is False: - self.other = TermInfosWriter(d, seg, fis, True) - self.other.other = self - - - def initialize(self, d, seg, fis, isi): - self.fieldInfos = fis - self.isIndex = isi - if isi is True: - ext = '.tii' - else: - ext = '.tis' - - self.output=d.createFile(seg + ext) - # leave space for size - self.output.writeInt(0) - - - def stringDifference(self, s1, s2): - prefixLength = min(len(s1), len(s2)) - for i in range(prefixLength): - if s1[i] != s2[i]: - return i - - return prefixLength - - - def add(self, term, ti): - if not self.isIndex and term <= self.lastTerm: - raise Exception, "term out of order: " + str(term) + str(self.lastTerm) - if ti.freqPointer < self.lastTi.freqPointer: - raise Exception, "freqPointer out of order" - if ti.proxPointer < self.lastTi.proxPointer: - raise Exception, "proxPointer out of order" - - if (not self.isIndex and self.size % self.INDEX_INTERVAL == 0): - # add an index term - self.other.add(self.lastTerm, self.lastTi) - - # write term - self.writeTerm(term) - # write doc freq - self.output.writeVInt(ti.docFreq) - # write pointers - self.output.writeVLong(ti.freqPointer - self.lastTi.freqPointer) - self.output.writeVLong(ti.proxPointer - self.lastTi.proxPointer) - - if self.isIndex: - self.output.writeVLong(self.other.output.getFilePointer() - self.lastIndexPointer) - self.lastIndexPointer = self.other.output.getFilePointer() - - self.lastTi.setTo(ti) - self.size += 1 - - - def close(self): - 
self.output.seek(0) - self.output.writeInt(self.size) - self.output.close() - - if self.isIndex is not True: - self.other.close() - - - def writeTerm(self, term): - a, b = self.lastTerm.text(), term.text() - start = self.stringDifference(a, b) - delta = term.text()[start:] - # write shared prefix length - self.output.writeVInt(start) - # write delta chars - self.output.writeString(delta) - # write field num - i = self.fieldInfos.fieldNumber(term.field()) - self.output.writeVInt(i) - self.lastTerm = term - - -
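writeTerm above front-codes the sorted term dictionary: each term is stored as the length of the prefix it shares with the previous term (computed by stringDifference) plus the remaining suffix, and the reader in segment.py rebuilds the text as prevTxt[:start] + suffix. A self-contained sketch of the same front coding:

    def shared_prefix_len(a, b):
        # what stringDifference computes: length of the common prefix of a and b
        n = min(len(a), len(b))
        for i in range(n):
            if a[i] != b[i]:
                return i
        return n

    def front_code(sorted_terms):
        # each term becomes (shared-prefix length, remaining suffix)
        coded, prev = [], ''
        for t in sorted_terms:
            start = shared_prefix_len(prev, t)
            coded.append((start, t[start:]))
            prev = t
        return coded

    def front_decode(coded):
        # same reconstruction the segment term enumerator uses: prev[:start] + suffix
        terms, prev = [], ''
        for start, suffix in coded:
            prev = prev[:start] + suffix
            terms.append(prev)
        return terms

    terms = ['wiki', 'wikipage', 'wikiword', 'word']
    print(front_code(terms))    # [(0, 'wiki'), (4, 'page'), (4, 'word'), (1, 'ord')]
    assert front_decode(front_code(terms)) == terms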
--- a/MoinMoin/support/lupy/index/terminfo.py Sun Jun 18 13:58:08 2006 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,27 +0,0 @@ -# This module is part of the Lupy project and is Copyright 2003 Amir -# Bakhtiar (amir@divmod.org). This is free software; you can redistribute -# it and/or modify it under the terms of version 2.1 of the GNU Lesser -# General Public License as published by the Free Software Foundation. - -class TermInfo(object): - - def __init__(self): - self.docFreq = 0 - self.freqPointer = 0 - self.proxPointer = 0 - - def set(self, df, fp, pp): - self.docFreq = df - self.freqPointer = fp - self.proxPointer = pp - - def setTo(self, ti): - self.docFreq = ti.docFreq - self.freqPointer = ti.freqPointer - self.proxPointer = ti.proxPointer - - def __repr__(self): - return '<TermInfo:d:' + str(self.docFreq)+ ' f:' + str(self.freqPointer) +\ - ' p:' + str(self.proxPointer) + '>' - -
--- a/MoinMoin/support/lupy/indexer.py Sun Jun 18 13:58:08 2006 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,262 +0,0 @@ -# This module is part of the Lupy project and is Copyright 2003 Amir -# Bakhtiar (amir@divmod.org). This is free software; you can redistribute -# it and/or modify it under the terms of version 2.1 of the GNU Lesser -# General Public License as published by the Free Software Foundation. - -"""A simple interface to indexing and searching. -""" -import os, sys, re - -from MoinMoin.support.lupy.index.indexwriter import IndexWriter -from MoinMoin.support.lupy.index.documentwriter import standardTokenizer -from MoinMoin.support.lupy.index.term import Term - -from MoinMoin.support.lupy import document - -from MoinMoin.support.lupy.search import indexsearcher -from MoinMoin.support.lupy.search.term import TermQuery -from MoinMoin.support.lupy.search.phrase import PhraseQuery -from MoinMoin.support.lupy.search.boolean import BooleanQuery - - -class Index: - - def __init__(self, name, create=False, analyzer=None): - """ - @param name: Name of the directory for this index. - @param create: Whether to create this directory or not. - @type create: boolean - """ - - self.name = name - self.analyzer = analyzer or standardTokenizer - # Create the index if we need to. From here on we assume - # that the index exists - self.indexer = IndexWriter(self.name, create, analyzer) - # Remember the default merge factor - self.mergeFactor = self.indexer.mergeFactor - # Clean up - self.indexer.close() - self.indexer = self.searcher = None - - def index(self, **kw): - """Add a document to the index. - - **kw contains the name and values of each Field in the - Document that we are creating. - - If the key in **kw starts with '_' the field will be created - as a Keyword. If it starts with '__', it is created as a - stored Text field (e.g. tokenized and stored), otherwise it - will be created as a Text field. The leading '_' are removed - before field creation. - - Text fields will have their value tokenized before - indexing. The value is not stored in the index. This is the - usual type of field that you need for plain text. - - Keyword fields will not have their value tokenized. The value - is stored in the index and is returned with search hits on the - Document. If you wanted to store the path to a document along - with each document, you would use a Keyword field. The path - would not be tokenized and its value would be returned in the - query results, so you could easily open and display the file. - """ - self._setupIndexer() - - # create document - d = document.Document() - - # TODO - Please find another way of defining fields - # than magic field names!!! 
- - # add a file field containing the path to this file - for key, value in kw.items(): - if key[:2] == '__': - key = key[2:] - # Tokenized and stored - f = document.Text(key, value, True) - elif key[0] == '_': - # Not tokenized and stored - key = key[1:] - # keyword - f = document.Keyword(key, value) - else: - # Tokenized and not stored - f = document.Text(key, value, False) - d.add(f) - self.indexer.addDocument(d) - - def _setupIndexer(self): - if self.searcher is not None: - self.searcher.close() - self.searcher = None - if self.indexer is None: - self.indexer = IndexWriter(self.name, False, self.analyzer) - self.indexer.mergeFactor = self.mergeFactor - - def _setupSearcher(self): - if self.indexer is not None: - self.indexer.close() - self.indexer = None - if self.searcher is None: - self.searcher = indexsearcher.IndexSearcher(self.name) - - def delete(self, **kw): - "Delete the first document containing the specified term. See also L{deleteAll}." - # Not very efficient for bulk deletes - # Use deleteAll for bulk deletes - self._setupSearcher() - if len(kw) != 1: - raise RuntimeError, 'one and only one field for the moment' - field, value = kw.items()[0] - t = Term(field, value) - self.searcher.reader.deleteTerm(t) - - def deleteAll(self, **kw): - "Remove all documents containing this field and value." - self.close() - reader = indexsearcher.open(self.name) - if len(kw) != 1: - raise RuntimeError, 'one and only one field for the moment' - field, values = kw.items()[0] - for value in values: - t = Term(field, value) - reader.deleteTerm(t) - # commit the deletes - reader.close() - - def close(self): - # Indexer and Searchers are different - # and we have to open the right kind - # for the operation we are performing. - # The actual creation is done in the index and find - # methods. Here we close whatever is open. - if self.searcher is not None: - self.searcher.close() - self.searcher = None - if self.indexer is not None: - self.indexer.close() - self.indexer = None - - def flush(self): - """Flush outstanding indexes to disk. - - This makes sure we are searching the latest stuff. - """ - if self.indexer is not None: - self.indexer.flushRamSegments() - - def optimize(self): - """Merge all on-disk segments into a single segment. Saves space and can speed up queries.""" - self._setupIndexer() - self.indexer.optimize() - - def parse(self, field, qString): - if qString.startswith('"'): - qString = qString.strip('"') - #qWords = qString.strip('"').split() - qWords = self._tokenize(qString) - return self.phraseSearch(field, qWords) - else: - qWords = self._tokenize(qString) - if len(qWords) == 1: - return self.termSearch(field, qWords[0]) - else: - return self.boolSearch(field, qWords) - - def _tokenize(self, qString): - return list(self.analyzer(qString)) - - def find(self, qStr): - """Perform a search in any field in this index. 
- - If the search string is enclosed in double quotes, a phrase - search will be run; otherwise, the search will be for - documents containing all words specified.""" - - self._setupSearcher() - - fields = self.searcher.fieldNames() - if not fields: - return [] - all = [self.parse(field, qStr) for field in fields] - if len(all) is 1: - # simple case - return self.searcher.search(all[0]) - - q = BooleanQuery() - for query in all: - # OR all of the field queries - q.add(query, False, False) - hits = self.searcher.search(q) - return hits - - def findInField(self, **kw): - """Search only in a single field.""" - # eg index.findInField(text='flute') - if len(kw) != 1: - raise RuntimeError, 'one and only one field for the moment' - self._setupSearcher() - field, query = kw.items()[0] - q = self.parse(field, query) - hits = self.searcher.search(q) - return hits - - def termSearch(self, field, term): - "Search for a single C{term} in a C{field}." - t = Term(field, term) - q = TermQuery(t) - return q - - def phraseSearch(self, field, words): - "Search for a phrase (given as a list of words) in C{field}." - q = PhraseQuery() - for word in words: - t = Term(field, word) - q.add(t) - return q - - def boolSearch(self, field, ands=[], ors=[], nots=[]): - """Build a simple boolean query. - - Each word in C{ands} is equiv to +word - Each word in C{ors} is equiv to word - Each word in C{nots} is equiv to -word - - E.g. C{boolSearch(['spam'], ['eggs'], ['parrot', 'cheese'])} is - equiv to C{+spam eggs -parrot -cheese} in Google/Lucene syntax. - """ - q = BooleanQuery() - - for a in ands: - t = Term(field, a) - tq = TermQuery(t) - q.add(tq, True, False) - - for a in ors: - t = Term(field, a) - tq = TermQuery(t) - q.add(tq, False, False) - - for a in nots: - t = Term(field, a) - tq = TermQuery(t) - q.add(tq, False, True) - - return q - - def printHits(self, hits): - if len(hits) == 0: - print 'Nothing found!' - else: - for i in range(len(hits)): - print hits.doc(i), hits.score(i) - - def setMergeFactor(self, anInt): - "Set how many documents will be processed before the indexes will be merged. Never less than 2." - # Never less than 2 - if anInt >= 2: - self.mergeFactor = anInt - -
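The Index facade removed above hid all of this behind keyword arguments: index() picks the field type from the key's leading underscores as its docstring describes, find() runs a phrase search when the query is double-quoted, and delete() drops a document by one of its Keyword fields. A usage sketch against that removed API; the index path, page names and text are made up:

    from MoinMoin.support.lupy.indexer import Index

    idx = Index('/tmp/pages-index', create=True)

    # '__key' -> tokenized and stored, '_key' -> Keyword (stored, not tokenized),
    # plain 'key' -> tokenized Text that is searchable but not stored
    idx.index(_pagename='FrontPage', __title='Front Page', text='Welcome to the wiki')
    idx.index(_pagename='HelpIndex', __title='Help Index', text='help about searching')
    idx.flush()

    hits = idx.find('"help about"')        # double quotes trigger a phrase search
    idx.printHits(hits)

    hits = idx.findInField(text='wiki')    # restrict the query to one field
    print(len(hits))

    idx.delete(pagename='FrontPage')       # delete by the unique Keyword field
    idx.close()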
--- a/MoinMoin/support/lupy/search/__init__.py Sun Jun 18 13:58:08 2006 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,6 +0,0 @@ -# This module is part of the Lupy project and is Copyright 2003 Amir -# Bakhtiar (amir@divmod.org). This is free software; you can redistribute -# it and/or modify it under the terms of version 2.1 of the GNU Lesser -# General Public License as published by the Free Software Foundation. - -"""indexing classes"""
--- a/MoinMoin/support/lupy/search/boolean.py Sun Jun 18 13:58:08 2006 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,211 +0,0 @@ -# This module is part of the Lupy project and is Copyright 2003 Amir -# Bakhtiar (amir@divmod.org). This is free software; you can redistribute -# it and/or modify it under the terms of version 2.1 of the GNU Lesser -# General Public License as published by the Free Software Foundation. - -import itertools -import similarity -import traceback - -class BooleanQuery: - """A Query that matches documents matching boolean combinations of - other queries, typically L{lupy.search.term.TermQuery}s or L{lupy.search.phrase.PhraseQuery}s.""" - - - def __init__(self): - """Constructs an empty boolean query.""" - - self.clauses = [] - self.boost = 1.0 - - def addClause(self, clause): - """Adds a BooleanClause to this query.""" - self.clauses.append(clause) - - - def add(self, query, required, prohibited): - """Adds a clause to a boolean query. Clauses may be: - C{required} which means that documents which I{do not} - match this sub-query will I{not} match the boolean query; - C{prohibited} which means that documents which I{do} - match this sub-query will I{not} match the boolean query; or - neither, in which case matched documents are neither prohibited from - nor required to match the sub-query. - - It is an error to specify a clause as both C{required} and - C{prohibited}.""" - - self.clauses.append(BooleanClause(query, - required, - prohibited)) - - - def normalize(self, norm): - for c in self.clauses: - if not c.prohibited: - c.query.normalize(norm) - - def scorer(self, reader): - # optimize zero-term case - if len(self.clauses) == 1: - # just return term scorer - c = self.clauses[0] - if not c.prohibited: - return c.query.scorer(reader) - - result = BooleanScorer() - - for c in self.clauses: - subScorer = c.query.scorer(reader) - if subScorer is not None: - result.add(subScorer, c.required, c.prohibited) - elif c.required: - return None - - return result - - - def sumOfSquaredWeights(self, searcher): - sum = 0.0 - - for c in self.clauses: - if not c.prohibited: - # sum sub-query weights - sum += c.query.sumOfSquaredWeights(searcher) - else: - # allow complex queries to initialize themself - c.query.sumOfSquaredWeights(searcher) - return sum - - - def toString(self, field): - """Prints a user-readable version of this query""" - - buffer = '' - - for c in self.clauses: - if c.prohibited: - buffer += '-' - elif c.required: - buffer += '+' - - subQuery = c.query - if isinstance(subQuery, BooleanQuery): - # wrap sub-bools in parens - buffer += '(' - buffer += c.query.toString(field) - buffer += ')' - else: - buffer += c.query.toString(field) - - return buffer - -class BooleanClause(object): - """A clause in a BooleanQuery""" - - def __init__(self, q, r, p): - self.query = q - self.required = r - self.prohibited = p - -class BooleanScorer: - - def __init__(self): - self.coordFactors = None - self.maxCoord = 1 - self.nextMask = 1 - self.prohibitedMask = 0 - self.requiredMask = 0 - self.scorers = [] - self.currentDoc = 0 - self.validList = [] - self.table = {} - - def add(self, scorer, required, prohibited): - mask = 0 - if required or prohibited: - if self.nextMask == 0: - raise Exception, 'More than 32 required/prohibited clauses in a query.' - mask = self.nextMask - self.nextMask = self.nextMask << 1 - else: - '???' 
- mask = 0 - - if not prohibited: - self.maxCoord += 1 - - if prohibited: - # Update prohibited mask - self.prohibitedMask |= mask - elif required: - # Update required mask - self.requiredMask |= mask - - self.scorers.append(SubScorer(scorer, required, prohibited, mask)) - - - def computeCoordFactors(self): - self.coordFactors = [] - for i in range(self.maxCoord): - self.coordFactors.append(similarity.coord(i, self.maxCoord)) - - - def collect(self, doc, score, mask): - bucket = self.table.get(doc, None) - if bucket is None: - #doc, score, bits, coord - bucket = [-1, 0, 0, 0] - self.table[doc] = bucket - if bucket[0] != doc: - # invalid doc - # initialize fields - bucket[:] = [doc, score, mask, 1] - self.validList.append(bucket) - else: - # valid bucket - # increment score - bucket[1] += score - # add bits in mask - bucket[2] |= mask - # increment coord - bucket[3] += 1 # XXX - #print doc, score, mask, bucket - - - def score(self, maxDoc): - if self.coordFactors is None: - self.computeCoordFactors() - for t in self.scorers: - #print "SCORER %r" % t.scorer - for d,score in t.scorer.score(maxDoc): - #print "DOCUMENT %r %r" % (d, score) - self.collect(d,score,t.mask) - return self.collectHits() - - def collectHits(self): - for bucket in self.validList: - doc, score, bits, coord = bucket - if (bits & self.prohibitedMask) == 0 and (bits & self.requiredMask) == self.requiredMask: - # if prohibited and required check out - # add to results - #print "CollectHits:", doc, score, self.coordFactors, coord - try: - scorecf = score * self.coordFactors[coord] - except IndexError, err: # XXX ugly way to avoid it crashing 8( - scorecf = 0.0 - yield (doc, scorecf) - del self.validList[:] - - -class SubScorer(object): - - def __init__(self, scorer, required, prohibited, mask): - self.scorer = scorer - self.required = required - self.prohibited = prohibited - self.mask = mask - - - -
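BooleanQuery.add() above takes a sub-query plus required and prohibited flags; this is how Index.boolSearch translated '+spam eggs -parrot' style queries. A sketch building the same combination by hand (the 'text' field name is illustrative):

    from MoinMoin.support.lupy.index.term import Term
    from MoinMoin.support.lupy.search.term import TermQuery
    from MoinMoin.support.lupy.search.boolean import BooleanQuery

    # +spam eggs -parrot : spam required, eggs optional, parrot prohibited
    q = BooleanQuery()
    q.add(TermQuery(Term('text', 'spam')), True, False)     # required
    q.add(TermQuery(Term('text', 'eggs')), False, False)    # neither flag: optional, affects ranking
    q.add(TermQuery(Term('text', 'parrot')), False, True)   # prohibited

    # q can now be handed to a searcher like any other query object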
--- a/MoinMoin/support/lupy/search/camelcase.py Sun Jun 18 13:58:08 2006 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,62 +0,0 @@ -# This module is part of the Lupy project and is Copyright 2005 Florian -# Festi. This is free software; you can redistribute -# it and/or modify it under the terms of version 2.1 of the GNU Lesser -# General Public License as published by the Free Software Foundation. - -from term import TermQuery -from boolean import BooleanQuery, BooleanScorer -from phrase import PhraseQuery -from MoinMoin.support.lupy.index.term import Term - -class CamelCaseQuery(TermQuery): - """ - XXX write new comment - A Query that matches documents that contains words - the term starts with. This is usefull for CamelCase - words. You need to filter the results to make shure - the camel case words are really contained within the - document. - """ - def sumOfSquaredWeights(self, searcher): - self.query = BooleanQuery() - self.reader = searcher.reader - self.splitToWords(self.term, self.reader, []) - return self.query.sumOfSquaredWeights(searcher) - - def scorer(self, reader): - return self.query.scorer(reader) - - def _add_phrase(self, terms): - phrase = PhraseQuery() - for term in terms: - phrase.add(term) - self.query.add(phrase, False, False) - - def splitToWords(self, term, reader, terms): - text = term.text() - field = term.field() - for l in xrange(2, len(text)+1): - prefix = text[:l] - ts = reader.terms(Term(field, prefix)) - if ((ts.term.text()==prefix and - ts.term.field()==field)): - t = terms[:] - t.append(ts.term) - self.splitToWords(Term(field, text[l:]), reader, t) - else: - ts = reader.terms(term) - - # check for end words - if len(text): - return - max_length = len(text) + 3 - while ts.term.text().startswith(text): - if (len(ts.term.text()) < max_length and - ts.term.field()==field): - self._add_phrase(terms+[ts.term]) - try: - ts.next() - except StopIteration: - break - else: - self._add_phrase(terms)
--- a/MoinMoin/support/lupy/search/fuzzy.py Sun Jun 18 13:58:08 2006 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,92 +0,0 @@ -# This module is part of the Lupy project and is Copyright 2005 Florian -# Festi. This is free software; you can redistribute -# it and/or modify it under the terms of version 2.1 of the GNU Lesser -# General Public License as published by the Free Software Foundation. - -from term import TermQuery -from boolean import BooleanScorer -from MoinMoin.support.lupy.index.term import Term - -def min(*l): - m = l[0] - for v in l: - if v<m: m = v - return m - -class FuzzyQuery(TermQuery): - """Port of the Lucene FuzzyQuery - Still untested, use on your own risk... - """ - WORD_SIZE = 50 - - def __init__(self, term, similarity, prefix_length): - TermQuery.__init__(self, term) - #self.term = term - self.prefix = term.text()[:prefix_length] - self.text = term.text()[len(self.prefix):] - self.min_similarity = similarity - self.d = [] - for i in xrange(self.WORD_SIZE): - self.d.append([0]* self.WORD_SIZE) - - def scorer(self, reader): - prefix = self.prefix - lprefix = len(prefix) - field = self.term.field() - terms = [] - - ts = reader.terms(Term(field, self.prefix)) - scorer = BooleanScorer() - - while True: - text = ts.term.text() - if not text.startswith(prefix): - break - sim = self.similarity(text[lprefix:]) - if (ts.term.field()==field and - sim > self.min_similarity): - tq = TermQuery(ts.term) - tq.weight=1.0 - scorer.add(tq.scorer(reader), False, False) - terms.append(ts.term) - try: - ts.next() - except StopIteration: - break - - if terms is None: - return None - - return scorer - - def initialize_array(self, n, m): - d = self.d - if len(d)<n+1: - l = len(d[0]) - for i in xrange(len(d), n+1): - d.append([0] * l) - if len(d[0])<m+1: - l = [0] * (m - len(d[0]) + 1) - for i in xrange(len(d)): - d[i].extend(l) - - for i in xrange(n+1): d[i][0] = i - for i in xrange(m+1): d[0][i] = i - - - def similarity(self, target): - n = len(self.text) - m = len(target) - d = self.d - - self.initialize_array(n, m) - - for i in xrange(n): - s_i = self.text[i] - for j in xrange(m): - if s_i != target[j]: - d[i+1][j+1] = min(d[i][j+1], d[i+1][j], d[i][j]) + 1 - else: - d[i+1][j+1] = min(d[i][j+1]+1, d[i+1][j]+1, d[i][j]) - return 1.0 - (d[n][m]/ float(len(self.prefix) + min(m,n))) -
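FuzzyQuery.similarity above is a Levenshtein edit-distance computation over the part of the term after the fixed prefix, scaled into the 0..1 range by the combined length. A self-contained sketch of the same measure using the standard dynamic-programming recurrence:

    def edit_distance(a, b):
        # classic Levenshtein distance, the recurrence FuzzyQuery fills its table with
        d = [[0] * (len(b) + 1) for _ in range(len(a) + 1)]
        for i in range(len(a) + 1):
            d[i][0] = i
        for j in range(len(b) + 1):
            d[0][j] = j
        for i in range(1, len(a) + 1):
            for j in range(1, len(b) + 1):
                cost = 0 if a[i - 1] == b[j - 1] else 1
                d[i][j] = min(d[i - 1][j] + 1,         # deletion
                              d[i][j - 1] + 1,         # insertion
                              d[i - 1][j - 1] + cost)  # substitution or match
        return d[len(a)][len(b)]

    def fuzzy_similarity(text, target, prefix_len=0):
        # 1.0 for identical strings, falling towards 0.0 as the edit distance grows
        dist = edit_distance(text, target)
        return 1.0 - dist / float(prefix_len + min(len(text), len(target)))

    print(fuzzy_similarity('searching', 'serching'))   # one deletion -> 0.875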
--- a/MoinMoin/support/lupy/search/hits.py Sun Jun 18 13:58:08 2006 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,98 +0,0 @@ -# This module is part of the Lupy project and is Copyright 2003 Amir -# Bakhtiar (amir@divmod.org). This is free software; you can redistribute -# it and/or modify it under the terms of version 2.1 of the GNU Lesser -# General Public License as published by the Free Software Foundation. - -class Hits(object): - """A ranked list of documents, used to hold search results.""" - def __init__(self, s, q, f): - """Initialize scoreDocs and totalHits. - """ - self.query = q - self.searcher = s - self.filter = f - self.hitDocs = [] - self._cache = [] - self.maxDocs = 200 - self.length = 0 - # retrieve 100 initially - self.getMoreDocs(50) - - def __len__(self): - return self.length - - def __getitem__(self, indexOrSlice): - # NB - Does not handle hits[:-1] - # there has to be a better way than isinstance - if isinstance(indexOrSlice, int): - return self.doc(indexOrSlice) - else: - slyce = indexOrSlice - start = slyce.start or 0 - stop = min(slyce.stop or len(self), len(self)) - step = slyce.step or 1 - return [self[i] for i in range(start, stop, step)] - - def doc(self, n): - if n > len(self.hitDocs): - self.getMoreDocs(n) - elif n >= self.length: - raise IndexError, 'Not a valid hit number ' + str(n) - hitDoc = self.hitDocs[n] - - # update LRU cache of documents - # remove from list, if there - if hitDoc in self._cache: - self._cache.remove(hitDoc) - # add to front of list - self._cache.insert(0,hitDoc) - - if len(self._cache) > self.maxDocs: - oldLast = self._cache[-1] - del self._cache[-1] - # let doc get gc'd - oldLast['doc'] = None - - if hitDoc['doc'] is None: - # cache miss: read document - hitDoc['doc'] = self.searcher.doc(hitDoc['id']) - - return hitDoc['doc'] - - def getMoreDocs(self, minDoc): - """Tries to add new documents to hitDocs. - Ensures that the hit numbered C{minDoc} has been retrieved. - """ - minDoc = max(len(self.hitDocs), minDoc) - - # double number retrieved - n = minDoc * 2 - - topDocs = self.searcher.search(self.query, self.filter, n) - scoreDocs = topDocs.scoreDocs - self.length = topDocs.totalHits - - scoreNorm = 1.0 - if self.length > 0 and scoreDocs[0].score > 1.0: - scoreNorm = 1.0 / scoreDocs[0].score - - if len(scoreDocs) < self.length: - end = len(scoreDocs) - else: - end = self.length - - for i in range(len(self.hitDocs),end): - self.hitDocs.append({'score': scoreDocs[i].score * scoreNorm, 'id': scoreDocs[i].doc, 'doc': None}) - - def score(self, n): - """ Returns the score for the C{n}th document in the set. - """ - return self.hitDocs[n]['score'] - - def __repr__(self): - s= '<' + str(len(self)) + ' Hit' - if len(self) == 1: - s += '>' - else: - s += 's>' - return s
--- a/MoinMoin/support/lupy/search/indexsearcher.py Sun Jun 18 13:58:08 2006 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,155 +0,0 @@ -# This module is part of the Lupy project and is Copyright 2003 Amir -# Bakhtiar (amir@divmod.org). This is free software; you can redistribute -# it and/or modify it under the terms of version 2.1 of the GNU Lesser -# General Public License as published by the Free Software Foundation. - -import math, itertools - -import similarity, hits - -from bisect import insort -from MoinMoin.support.lupy.index import segment, segmentmerger -from MoinMoin.support.lupy import store - -def openDir(directory): - infos = segment.SegmentInfos() - infos.read(directory) - if len(infos) == 1: # index is optimized - return segmentmerger.SegmentReader(infos[0], True) - elif len(infos) == 0: - readers = [] - else: - readers = [segmentmerger.SegmentReader(info,False) for info in infos[:-1]] - readers.append(segmentmerger.SegmentReader(infos[-1],True)) - return segmentmerger.SegmentsReader(directory, readers) - -def open(path): - """Returns an IndexReader reading the index in an FSDirectory in - the named path.""" - - return openDir(store.getDirectory(path, False)) - - -class IndexSearcher: - - """The base class for search implementations. - Implements search over a single index. - - Subclasses may implement search over multiple indices, and over - indices on remote servers.""" - - def __init__(self, dirOrPath): - """Creates a searcher searching the provided index. - """ - if isinstance(dirOrPath, basestring): - self.reader = open(dirOrPath) - else: - self.reader = openDir(dirOrPath) - - def close(self): - """Frees resources associated with this Searcher.""" - self.reader.close() - - def docFreq(self, term): - return self.reader.docFreq(term) - - def maxDoc(self): - return self.reader.maxDoc() - - def doc(self, i): - """For use by L{lupy.search.hits.Hits}.""" - return self.reader.document(i) - - def searchAll(self, query, filter): - """Lower-level search API. - - Returns a generator that yields all non-zero scoring documents - for this query that pass the filter. - - Applications should only use this if they need I{all} of the - matching documents. The high-level search API - (L{search(Query)}) is usually more efficient, as it skips - non-high-scoring hits. - - - C{query} to match documents - - C{filter} if non-null, a bitset used to eliminate some documents - """ - scorer = getScorer(query, self, self.reader) - if filter is not None: - bits = filter.bits(reader) - - if scorer is None: - return - - return itertools.imap(lambda doc, score: doc, - itertools.ifilter(lambda doc, score: score > 0 and (bits is None or bits.get(doc)), - scorer.score(self.reader.maxDoc()))) - - def search(self, query, filter=None, nDocs=None): - - """Search this index for documents matching C{query} and - (optionally) passing the C{filter} bitvector. 
If C{nDocs} is - specified then only the top C{nDocs} hits will be returned.""" - - if nDocs is None: - return hits.Hits(self, query, filter) - - scorer = getScorer(query, self, self.reader) - if scorer is None: - return TopDocs(0, []) - - if filter is not None: - bits = filter.bits(reader) - else: - bits = None - - scoreDocs = [] - totalHits = [0] - minScore = 0.0 - - for doc, scr in scorer.score(self.reader.maxDoc()): - if scr > 0.0 and (bits is None or bits.get(doc)): - # ignore zeroed buckets and docs not in bits - totalHits[0] += 1 - if scr >= minScore: - # update hit queue - insort(scoreDocs, ScoreDoc(doc, scr)) - if len(scoreDocs) > nDocs: - # if hit queue overfull - # remove lowest in hit queue - scoreDocs.pop() - # reset minimum score - minScore = scoreDocs[0].score - - return TopDocs(totalHits[0], scoreDocs) - - def fieldNames(self): - # Experimental for auto queries - return self.reader.fieldNames() - - -def getScorer(query, searcher, reader): - sum = query.sumOfSquaredWeights(searcher) - norm = 1.0/(math.sqrt(sum) or 1.0) - query.normalize(norm) - return query.scorer(reader) - -class ScoreDoc(object): - - def __init__(self, d, s): - self.doc = d - self.score = s - - def __lt__(a, b): - if a.score == b.score: - return a.doc > b.doc - else: - return a.score < b.score - - -class TopDocs(object): - - def __init__(self, th, sds): - self.totalHits = th - self.scoreDocs = sds -
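IndexSearcher above is the lower-level entry point the Index facade builds on: search() returns a lazily filled Hits list by default, or a fixed-size TopDocs when nDocs is given. A usage sketch; the index path and the query term are illustrative:

    from MoinMoin.support.lupy.search.indexsearcher import IndexSearcher
    from MoinMoin.support.lupy.search.term import TermQuery
    from MoinMoin.support.lupy.index.term import Term

    searcher = IndexSearcher('/tmp/pages-index')    # accepts a path or a Directory

    query = TermQuery(Term('text', 'wiki'))

    hits = searcher.search(query)                   # Hits: ranked, documents fetched lazily
    for n in range(len(hits)):
        print(hits.score(n), hits.doc(n))

    top = searcher.search(query, nDocs=10)          # TopDocs: only the ten best ScoreDocs
    print(top.totalHits, [sd.doc for sd in top.scoreDocs])

    searcher.close()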
--- a/MoinMoin/support/lupy/search/phrase.py Sun Jun 18 13:58:08 2006 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,232 +0,0 @@ -# This module is part of the Lupy project and is Copyright 2003 Amir -# Bakhtiar (amir@divmod.org). This is free software; you can redistribute -# it and/or modify it under the terms of version 2.1 of the GNU Lesser -# General Public License as published by the Free Software Foundation. - - -from bisect import insort -from MoinMoin.support.lupy.search import term, similarity -import sys - -class PhraseQuery: - """A query that matches documents containing a particular - sequence of terms. This may be combined with other terms - with a L{lupy.search.boolean.BooleanQuery}. - """ - - def __init__(self): - """Constructs an empty phrase query.""" - - self.idf = 0.0 - self.slop = 0 - self.terms = [] - self.weight = 0.0 - self.boost = 1.0 - - def add(self, term): - """Adds a term to the end of the query phrase.""" - if len(self.terms) == 0: - self.field = term.field() - - elif term.field() != self.field: - raise Exception, 'All phrase terms must be in the same field: ' + str(term) - - self.terms.append(term) - - - def getSlop(self): - """Returns the slop. See setSlop().""" - return self.slop - - - def normalize(self, norm): - # normalize for query - self.weight *= norm - # factor from document - self.weight *= self.idf - - - def scorer(self, reader): - # optimize zero-term case - if len(self.terms) == 0: - return None - - # optimize one-term case - if len(self.terms) == 1: - t = self.terms[0] - docs = reader.termDocsTerm(t) - if docs is None: - return None - return term.TermScorer(docs, reader.normsField(t.field()), self.weight) - - tps = [] - - for t in self.terms: - p = reader.termPositionsTerm(t) - if p is None: - # I am not sure how this is ever reached? 
- return None - tps.append(p) - - if self.slop == 0: - return ExactPhraseScorer(tps, reader.normsField(self.field), - self.weight) - else: - return SloppyPhraseScorer(tps, reader.norms(self.field), - self.weight) - - - def sumOfSquaredWeights(self, searcher): - # sum term IDFs - for term in self.terms: - self.idf += similarity.idfTerm(term, searcher) - - self.weight = self.idf * self.boost - # square term weights - return self.weight * self.weight - - - def toString(self, f): - """Prints a user-readable version of this query""" - - buffer = '' - if not self.field == f : - buffer += f + ':' - buffer += '\\' - - for term in self.terms[:-1]: - buffer += term.text() + ' ' - - buffer += self.terms[-1].text() + '\\' - - if self.slop != 0: - buffer += '~' + str(self.slop) - - if self.boost != 1.0: - buffer += '^' + str(self.boost) - - return buffer - - -class PhraseScorer: - - def __init__(self, tps, n, w): - self.norms = n - self.weight = w - - self.pps = [PhrasePositions(tp, i) for i, tp in enumerate(tps)] - self.pps.sort() - - def phraseQuery(self): - """Subclass responsibility""" - - def score(self, end): - # find doc w/ all the terms - while self.pps[-1].doc < end: - while self.pps[0].doc < self.pps[-1].doc: - self.pps[0].advance() - while self.pps[0].doc < self.pps[-1].doc: - self.pps[0].advance() - self.pps.append(self.pps.pop(0)) - if self.pps[-1].doc >= end: - return - - # found doc with all terms - # check for phrase - freq = self.phraseFreq() - - if freq > 0.0: - # compute score - score = similarity.tf(freq) * self.weight - # normalize - score *= similarity.normByte(self.norms[self.pps[0].doc]) - # add to results - yield (self.pps[0].doc, score) - # resume scanning - self.pps[-1].advance() - - - - -class ExactPhraseScorer(PhraseScorer): - - def phraseFreq(self): - for pp in self.pps: - pp.firstPosition() - self.pps.sort() - freq = 0.0 - - init = 0 - # the 'init' bits are to simulate a do-while loop :-/ - while init == 0 or self.pps[-1].nextPosition(): - while self.pps[0].position < self.pps[-1].position: - # scan forward in first - init2 = 0 - while init2 == 0 or self.pps[0].position < self.pps[-1].position: - if not self.pps[0].nextPosition(): - return freq - init2 = 1 - - self.pps.append(self.pps.pop(0)) - # all equal: a match - freq += 1 - init = 1 - - return freq - - -class PhrasePositions(object): - - def __init__(self, t, o): - self.tp = t - self.offset = o - - self.position = 0 - self.count = 0 - self.doc = 0 - self.tpiter = iter(t) - self.advance() - - - def firstPosition(self): - self.count = self.tp.frq - self.nextPosition() - - - def advance(self): - """Increments to next doc""" - - for doc, frq, nextPos in self.tpiter: - self.doc = doc - self.frq = frq - self._nextPos = nextPos - self.position = 0 - return - else: - # close stream - self.tp.close() - # sentinel value - self.doc = sys.maxint - return - - - def nextPosition(self): - if self.count > 0: - self.count -= 1 - # read subsequent positions - self.position = self._nextPos.next() - self.offset - return True - else: - self.count -= 1 - return False - - - def __repr__(self): - res = '<pp>d:' + str(self.doc) + ' p:' + str(self.position) + ' o:' + str(self.offset) - return res - - def __lt__(this, that): - if this.doc == that.doc: - return this.position < that.position - else: - return this.doc < that.doc
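PhraseQuery above matches an exact sequence of terms in a single field; the PhrasePositions machinery below it walks per-document position lists to verify adjacency. A short sketch building and running one (field, words and index path are illustrative):

    from MoinMoin.support.lupy.search.indexsearcher import IndexSearcher
    from MoinMoin.support.lupy.search.phrase import PhraseQuery
    from MoinMoin.support.lupy.index.term import Term

    q = PhraseQuery()
    for word in ('free', 'software'):
        q.add(Term('text', word))       # every term must be in the same field

    searcher = IndexSearcher('/tmp/pages-index')
    hits = searcher.search(q)           # only documents containing the exact phrase
    print(len(hits))
    searcher.close()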
--- a/MoinMoin/support/lupy/search/prefix.py Sun Jun 18 13:58:08 2006 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,44 +0,0 @@ -# This module is part of the Lupy project and is Copyright 2005 Florian -# Festi. This is free software; you can redistribute -# it and/or modify it under the terms of version 2.1 of the GNU Lesser -# General Public License as published by the Free Software Foundation. - -from term import TermQuery -from boolean import BooleanQuery -from MoinMoin.support.lupy.index.term import Term - -class PrefixQuery(TermQuery): - """A Query that matches documents that contains the term and terms - that start with the term and have upto max_addon additional chars. - This allows to have better matching especially if no stemming is used""" - def __init__(self, term, max_addon=10000): - TermQuery.__init__(self, term) - self.term = term - self.max_length = len(term.text()) + max_addon - self.weight = 0.0 - self.boost = 1.0 - - def sumOfSquaredWeights(self, searcher): - self.query = BooleanQuery() - reader = searcher.reader - - text = self.term.text() - field = self.term.field() - - ts = reader.terms(self.term) - - while True: - if not ts.term.text().startswith(text): - break - if ((len(ts.term.text()) <= self.max_length) and - ts.term.field()==field): - self.query.add(TermQuery(ts.term), False, False) - try: - ts.next() - except StopIteration: - break - - return self.query.sumOfSquaredWeights(searcher) - - def scorer(self, reader): - return self.query.scorer(reader)
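PrefixQuery above expands into a BooleanQuery over every indexed term that starts with the given text, with max_addon bounding how much longer a matching term may be. A construction sketch (field name and limit are illustrative):

    from MoinMoin.support.lupy.search.prefix import PrefixQuery
    from MoinMoin.support.lupy.index.term import Term

    # matches 'search', 'searcher', 'searches', ... but nothing more than
    # three characters longer than the prefix; searched like any other query
    q = PrefixQuery(Term('text', 'search'), max_addon=3)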
--- a/MoinMoin/support/lupy/search/regularexpression.py Sun Jun 18 13:58:08 2006 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,86 +0,0 @@ -# This module is part of the Lupy project and is Copyright 2005 Florian -# Festi. This is free software; you can redistribute -# it and/or modify it under the terms of version 2.1 of the GNU Lesser -# General Public License as published by the Free Software Foundation. - -import re - -def re_prefix(regex): - """ - return a string that the beginning of regex that will always match - Assumes the regex is a valid regular expression!!! - """ - match = re.search(r"[[({\\.$*+?|]", regex) - if not match: return regex - - if regex.find("|") != -1: - # XXXX use string or RE to group non special chars - # States - plain = 0 - escape = 1 - charset = 2 - charsetfirst = 3 - charsetescape = 4 - - state = plain - parenthesis = 0 - for c in regex: - if state == plain: - if c == "\\": state = escape - elif c == "(": parenthesis += 1 - elif c == ")": parenthesis -= 1 - elif c == "[": state = charsetfirst - elif c == "|": - if parenthesis == 0: - # | on toplevel - return "" - elif state == charset: - if c == "]": state = plain - elif c == "\\": state = charsetescape - elif state == charsetfirst: - if c == "\\": state = charsetescape - else: state = charset - elif state == charsetescape: state = charset - elif state == escape: - state = plain - - end = match.start() - if match.group() in "*{?": end -= 1 # RE element refere to last char - return regex[:end] - -from term import TermQuery -from boolean import BooleanQuery -from MoinMoin.support.lupy.index.term import Term - -class RegularExpressionQuery(TermQuery): - """Matches all documents that contain a word match the - regular expression (RE) handed over as text of the term. - This query is reasonably fast if the RE starts with normal chars. - If the RE starts with RE special chars the whole index is searched! - The RE is MATCHED against the terms in the documents! - """ - def sumOfSquaredWeights(self, searcher): - self.query = BooleanQuery() - reader = searcher.reader - - needle = self.term.text() - prefix = re_prefix(needle) - reg_ex = re.compile(needle, re.U) - field = self.term.field() - - ts = reader.terms(Term(field, prefix)) - - while True: - if reg_ex.match(ts.term.text()) and ts.term.field()==field: - self.query.add(TermQuery(ts.term), False, False) - if not ts.term.text().startswith(prefix): - break - try: - ts.next() - except StopIteration: - break - return self.query.sumOfSquaredWeights(searcher) - - def scorer(self, reader): - return self.query.scorer(reader) -
--- a/MoinMoin/support/lupy/search/similarity.py Sun Jun 18 13:58:08 2006 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,47 +0,0 @@ -# This module is part of the Lupy project and is Copyright 2003 Amir -# Bakhtiar (amir@divmod.org). This is free software; you can redistribute -# it and/or modify it under the terms of version 2.1 of the GNU Lesser -# General Public License as published by the Free Software Foundation. - -import math - - -NORM_TABLE = map(lambda x: x/255.0, range(0,256)) - -def coord(overlap, maxOverlap): - return overlap/float(maxOverlap) - -def idf(docFreq, numDocs): - return math.log((numDocs/(docFreq + 1.0)) or 1.0) + 1.0 - -def idfTerm(term, searcher): - """Use maxDoc() instead of numDocs() because its proportional to docFreq(), - i.e., when one is inaccurate, so is the other, and in the same way.""" - - return idf(searcher.docFreq(term), searcher.maxDoc()) - -def normByte(normByte): - """Un-scales from the byte encoding of a norm into a float, i.e., - approximately 1/sqrt(numTerms).""" - - return NORM_TABLE[normByte & 0xFF] - -def normInt(numTerms): - """Computes the normalization byte for a document given the total number of - terms contained in the document. These values are stored in an index and - used by the search code - - Scales 1/sqrt(numTerms) into a byte, i.e. 256/sqrt(numTerms). - Math.ceil is used to ensure that even very long documents don't get a - zero norm byte, as that is reserved for zero-lengthed documents and - deleted documents.""" - - if numTerms == 0: - return 0 - return int((math.ceil(255.0 / math.sqrt(numTerms)))) & 0xFF - - -def tf(freq): - return float(math.sqrt(freq)) - -
--- a/MoinMoin/support/lupy/search/term.py Sun Jun 18 13:58:08 2006 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,96 +0,0 @@ -# This module is part of the Lupy project and is Copyright 2003 Amir -# Bakhtiar (amir@divmod.org). This is free software; you can redistribute -# it and/or modify it under the terms of version 2.1 of the GNU Lesser -# General Public License as published by the Free Software Foundation. - -from itertools import islice -import sys -import similarity - -class TermQuery: - """A Query that matches documents containing a term. - This may be combined with other terms with a L{lupy.search.boolean.BooleanQuery}.""" - - def __init__(self, t): - """Constructs a query for the term B{t}.""" - - self.term = t - self.idf = 0.0 - self.weight = 0.0 - self.boost = 1.0 - - def normalize(self, norm): - # normalize for query - self.weight *= norm - # factor from document - self.weight *= self.idf - - - def scorer(self, reader): - termDocs = reader.termDocsTerm(self.term) - if termDocs is None: - return None - - return TermScorer(termDocs, - reader.normsField(self.term.field()), - self.weight) - - - def sumOfSquaredWeights(self, searcher): - self.idf = similarity.idfTerm(self.term, searcher) - self.weight = self.idf * self.boost - # square term weights - return self.weight * self.weight - - - def toString(self, field): - """Prints a user-readable version of this query""" - - buffer = '' - if not self.term.field() == field: - buffer += self.term.field() + ':' - - buffer += self.term.text() - - if self.boost != 1.0: - buffer += '^' + str(self.boost) - - return buffer - - - -class TermScorer: - - """Scorer for L{TermQuery}s.""" - - SCORE_CACHE_SIZE = 32 - - - def __init__(self, td, n, w): - self.termDocs = td - self.norms = n - self.weight = w - self.scoreCache = [similarity.tf(i) * self.weight for i in range(self.SCORE_CACHE_SIZE)] - #self.docs, self.freqs = zip(*list(islice(self.termDocs, 128))) - - def score(self, end): - - for d, f in self.termDocs.read(): - if d >= end: - break - if f < self.SCORE_CACHE_SIZE: - score = self.scoreCache[f] - else: - # cache miss - score = similarity.tf(f) * self.weight - - # normalize for field - score *= similarity.normByte(self.norms[d]) - # collect score - yield (d, score) - else: - # close stream - self.termDocs.close() - # set to sentinel value - self.doc = sys.maxint -
--- a/MoinMoin/support/lupy/store.py Sun Jun 18 13:58:08 2006 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,308 +0,0 @@ -"""Interface to directories and files, both in memory and on disk""" -from array import array -import weakref -import os, stat, struct - -from StringIO import StringIO - - -DIRECTORIES = weakref.WeakValueDictionary() - - -def getDirectory(path, gen): - dir = DIRECTORIES.get(path, None) - - if dir is None: - dir = FSDirectory(path, gen) - DIRECTORIES[path]=dir - elif gen is True: - dir.create() - - return dir - -class FSDirectory: - - def __init__(self, path, gen): - self.directory = path - if gen is True: - self.create() - DIRECTORIES[path]=self - - def fpath(self, fname): - return os.path.join(self.directory, fname) - - def create(self): - path = self.directory - if not os.path.exists(path): - os.mkdir(path) - try: - files = os.listdir(path) - except IOError: - files = [] - - for file in files: - os.remove(os.path.join(path,file)) - - def fileExists(self, name): - return os.path.exists(self.fpath(name)) - - def fileModified(self, name, path=None): - return os.stat(self.fpath(name))[stat.ST_MTIME] - - def fileLength(self, name): - return os.stat(self.fpath(name))[stat.ST_SIZE] - - def deleteFile(self, name): - os.remove(self.fpath(name)) - - def renameFile(self, frm, to): - if os.path.exists(self.fpath(to)): - os.remove(self.fpath(to)) - - os.rename(self.fpath(frm),self.fpath(to)) - - def createFile(self, name): - #print "creating " + name - f = FileStream(self.fpath(name), 'wb') - f._name = name - return f - def openFile(self, name): - #print "opening " + name - f = FileStream(self.fpath(name), 'rb') - f._name = name - return f - def close(self): - pass - #del(DIRECTORIES[self.directory]) - # breaks if object is used several times - # and should not be needed as DIRECTORIES is a weakref dict - - def __str__(self): - return 'FSDirectory:' + self.directory - -class RAMDirectory: - - def __init__(self): - self.files = {} - - - def list(self): - return self.files.keys() - - - def fileExists(self, name): - return (self.files.get(name, None) is not None) - - - def fileModified(self, name): - file = self.files[name] - return file.lastModified - - - def fileLength(self, name): - file=self.files[name] - return len(file) - - - def deleteFile(self, name): - del(self.files[name]) - - - def renameFile(self, name, newName): - file = self.files[name] - del(self.files[name]) - self.files[newName]=file - - - def createFile(self, name): - #print "creating RAM file " + name - file = RAMStream() - file._name = name - self.files[name]=file - return file - - - def openFile(self, name): - x = self.files[name] - #print "opening RAM file " + name - x.seek(0) - return x - - def makeLock(self, name): - """TBC""" - - - def close(self): - """Do nothing""" - -class Stream(object): - - def writeByte(self, b): - self.write(chr(b)) - - def writeBytes(self, b, length): - b[:length].tofile(self._getfile()) - - def writeInt(self, i): - self.write(struct.pack("!I",i)) - - def writeVInt(self, i): - while (i & ~0x7F) != 0: - self.writeByte((i & 0x7F) | 0x80) - i = i >> 7 - self.writeByte(i) - - writeVLong = writeVInt - - def writeLong(self, i): - self.writeInt((i >> 32) & 0xFFFFFFFF) - self.writeInt(i & 0xFFFFFFFF) - - def writeString(self, s): - length = len(s) - self.writeVInt(length) - #print "WRITING: %r" % s - self.write(s.encode("utf8")) - - def getFilePointer(self): - return self.tell() - - def readByte(self): - return ord(self.read(1)) - - def readBytes(self, b, offset, len): - a = 
array('B') - a.fromfile(self._getfile(), len) - b[offset:offset+len] = a - - def readInt(self): - return struct.unpack("!I",self.read(4))[0] - - - def readVInt(self): - b = self.readByte() - i = b & 0x7F - - shift = 7 - while b & 0x80 != 0: - b = self.readByte() - i |= (b & 0x7F) << shift - shift += 7 - return i - - - def readLong(self): - return(self.readInt() << 32 | (self.readInt() & 0xFFFFFFFFL)) - - - def readVLong(self): - b = self.readByte() - i = b & 0x7F - - shift = 7 - while b & 0x80 != 0: - b = self.readByte() - i |= (b & 0x7FL) << shift - shift += 7 - - return i - - - def readString(self): - length = self.readVInt() - return self.readChars(length) - - def readChars(self, length): - buffer = [] - for i in range(length): - b = self.readByte() - if (b & 0x80) == 0: - buffer.append(unichr(b & 0x7F)) - elif (b & 0xE0) != 0xE0: - tmpInt = (((b & 0x1F) << 6)|(self.readByte() & 0x3F)) - buffer.append(unichr(tmpInt)) - else: - buffer.append(unichr((((b & 0x0f) << 12) | - ((self.readByte() & 0x3F) << 6) | - (self.readByte() & 0x3F)))) - x = u''.join(buffer) - #print "READING: %r" % x - return x - -class FileStream(Stream): - - def __init__(self, name, mode='rb', clone=0): - if not clone: - self.f = file(name, mode) - self.length = os.stat(name).st_size - self.isClone = 0 - self._position = 0 - else: - self.f = name - self.isClone = 1 - - def close(self): - pass - #print "!!!@#! Closing " + self._name - if not self.isClone: - self.f.close() - - def seek(self, pos): - self._position = pos - self.f.seek(pos) - - def tell(self): - return self._position - - def read(self, n): - p = self.f.tell() - if p != self._position: - #print "!!!position mismatch in %s (at %s, wants to be at %s)" % (self._name, p, self._position) - self.seek(self._position) - s = self.f.read(n) - self._position += len(s) - return s - - - def write(self, v): - p = self.f.tell() - if p != self._position: - #print "!!!position mismatch in %s (at %s, wants to be at %s)" % (self._name, p, self._position) - self.seek(self._position) - self.f.write(v) - self._position += len(v) - - - def clone(self): - g = FileStream(self.f, clone=1) - g._name = self._name + " <clone>" - g._position = self._position - return g - - def _getfile(self): - return self.f - - def __getattr__(self, attr): - return getattr(self.f, attr) - -class RAMStream(Stream, StringIO): - def __init__(self, *args): - StringIO.__init__(self, *args) - self.isClone = 0 - - def close(self): - pass - - def _getfile(self): - return self - - def get_size(self): - return len(self.getvalue()) - length = property(get_size) - - def clone(self): - r = RAMStream(self.getvalue()) - r._name = self._name + " <clone>" - r.isClone = 1 - r.seek(self.tell()) - return r
--- a/MoinMoin/support/lupy/util.py Sun Jun 18 13:58:08 2006 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,108 +0,0 @@ -# This module is part of the Lupy project and is Copyright 2003 Amir -# Bakhtiar (amir@divmod.org). This is free software; you can redistribute -# it and/or modify it under the terms of version 2.1 of the GNU Lesser -# General Public License as published by the Free Software Foundation. - -from array import array -import os - -# Table of bits/byte -BYTE_COUNTS = array('B',[ - 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, - 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, - 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, - 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, - 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, - 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, - 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, - 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, - 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, - 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, - 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, - 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, - 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, - 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, - 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, - 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8]) - -class BitVector(object): - - def __init__(self, dirOrInt, name = None): - # some low fi type dispatch - if name is None: - # create a new vector of dirOrInt length - self.len = dirOrInt - self.bits = array('B', ([0x00]*((self.len >> 3) + 1))) - self.bcount = -1 - else: - # read a BitVector from a file - input = dirOrInt.openFile(name) - try: - self.len = input.readInt() # read size - self.bcount = input.readInt() # read count - self.bits = array('B', [0x00]*((self.len >> 3) + 1)) # allocate bits - input.readBytes(self.bits, 0, len(self.bits)) - finally: - input.close() - - - def init__(self, n): - self.len = n - self.bits = array('B', ([0x00]*((self.len >> 3) + 1))) - - - def clear(self, bit): - # Set value of bit to zero - self.bits[bit >> 3] &= ~(1 << (bit & 7)) - self.bcount = -1 - - - def count(self): - """Returns the total number of one bits in this vector. - This is efficiently computed and cached, so that, if the - vector is not changed, no recomputation is done for - repeated calls.""" - - if self.bcount == -1: - c = 0 - for b in self.bits: - c += BYTE_COUNTS[b & 0xFF] # sum bits per byte - - self.bcount = c - - return self.bcount - - - def get(self, bit): - # Returns True if bit is one and False if it is zero - return(self.bits[bit >> 3] & (1 << (bit & 7)) != 0) - - - def set(self, bit): - # Sets the value of bit to one - self.bits[bit >> 3] |= 1 << (bit & 7) - self.bcount = -1 - - - def __len__(self): - return self.len - - - def write(self, d, name): - output = d.createFile(name) - try: - output.writeInt(len(self)) # write size - output.writeInt(self.count()) # write count - output.writeBytes(self.bits, len(self.bits)) - finally: - output.close() - -def sibpath(path, sibling): - """Return the path to a sibling of a file in the filesystem. - - This is useful in conjunction with the special __file__ attribute - that Python provides for modules, so modules can load associated - resource files. - """ - return os.path.join(os.path.dirname(os.path.abspath(path)), sibling) -
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/MoinMoin/support/xapwrap/__init__.py Thu Jun 29 01:05:53 2006 +0200 @@ -0,0 +1,2 @@ +""" xapwrap version 0.3.1 """ +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/MoinMoin/support/xapwrap/document.py Thu Jun 29 01:05:53 2006 +0200 @@ -0,0 +1,319 @@ +""" + xapwrap.document - Pythonic wrapper around Xapian's Document API +""" +import string +import datetime +import re +import cPickle +import xapian + +MAX_KEY_LEN = 240 # this comes from xapian's btree.h, Btree::max_key_len +# NOTE: xapian's btree.h file says that its actually 252, but due to +# xapian's implementation details, the actual limit is closer to 245 +# bytes. See http://thread.gmane.org/gmane.comp.search.xapian.cvs/329 +# for more info, especially the second message. + +# The limit described above only holds true assuming keys that do not +# contain any NULL bytes. Since xapian internally escapes \0 bytes, +# xapian sees the key length as (2*N + 2) where N is the number of +# embedded NULL characters. + +INTER_FIELD_POSITION_GAP = 100 + +UNICODE_ENCODING = "UTF-8" # XXX this should not be hardcoded on module level +UNICODE_ERROR_POLICY = "replace" + +class StandardAnalyzer: + WORD_RE = re.compile('\\w{1,%i}' % MAX_KEY_LEN, re.U) + + def tokenize(self, unknownText): + originalText = cleanInputText(unknownText, True) + # we want to perform lower() and the re search using a unicode + # object. if we try to perform those operations on regular + # string object that happens to represent unicode text encoded + # with UTF-8, we'll get garbage, or at least an + # OS/libc/$LC_CTYPE dependant result + text = originalText.lower() + for match in self.WORD_RE.finditer(text): + # we yield unicode ONLY + yield match.group() + + +class TextField(object): + __slots__ = ('name', 'text', 'prefix') + + def __init__(self, name, text = '', prefix = False): + if name and not text: + assert not prefix # it makes no sense to use a prefixed + # field without a name + self.text = name + self.name = '' + else: + self.name = name + self.text = text + self.prefix = prefix + + def __len__(self): + return len(self.text) + +class SortKey(object): + __slots__ = ('name', 'value', 'index', 'flattener') + + def __init__(self, name, value, index = None, flattener = None): + self.name = name + self.value = value + self.index = index + assert (name is None) ^ (index is None) + self.flattener = flattener + +class Value(SortKey): + pass + +class Term(object): + __slots__ = ('value') + + def __init__(self, value): + self.value = value + + def __len__(self): + return len(self.value) + +class Keyword(object): + __slots__ = ('name', 'value') + + def __init__(self, name, value): + self.name = name + self.value = value + + def __len__(self): + return len(self.value) + + +class Document: + """ + @ivar keywords: sequence of Keyword objects + @ivar sortFields: sequence of SortKey objects + @ivar textFields: sequence of TextField objects + + @cvar analyzerFactory: factory object for constructing analyzers + @cvar _picklerProtocol: protocol used in pickling data attributes + @cvar _noObject: dummy object used to indicate that there is no + data attribute + @cvar source: this is an optional argument to point at the + original text/object that this document represents + """ + _noObject = object() + _picklerProtocol = -1 + analyzerFactory = StandardAnalyzer + + # XXX TODO: add a fromXapianDoc classmethod that can be used by + # indices when returning documents from the db + + def __init__(self, textFields = (), sortFields = (), keywords = (), + terms = (), values = (), uid = None, data = _noObject, source = None): + """ + sortFields and values are really the same thing as far as + xapian is 
concerned. We differentiate them in the hope of + making the API easier to understand. + """ + for fields in ('textFields', 'sortFields', 'keywords', 'terms', 'values'): + arg = vars()[fields] + if not isinstance(arg, (list, tuple)): + arg = (arg,) + setattr(self, fields, list(arg)) + # copy the list so we can modify without affecting the original + self.uid = uid + self.data = data + self.source = source + # sortFields and values are really the same thing as far as xapian is concerned + self.sortFields += self.values + + def __len__(self): + length = 0 + for fieldList in (self.textFields, self.keywords): + length += sum(map(len, fieldList)) + + if self.data != self._noObject: + length += len(cPickle.dumps(self.data, self._picklerProtocol)) + + return length + + def toXapianDocument(self, indexValueMap, prefixMap=None): + d = xapian.Document() + position = 1 + analyzer = self.analyzerFactory() + + # add text fields + for field in self.textFields: + for token in analyzer.tokenize(field.text): + # the xapian swig bindings don't like unicode objects, so we + # decode terms to UTF-8 before indexing. this is fine as + # long as all data that goes into the db (whether for + # indexing or search) is converted to UTF-8 string and all + # data coming from the db (.get_value(), .get_data()) is + # decoded as UTF-8. + token = token.encode(UNICODE_ENCODING, UNICODE_ERROR_POLICY) + # the tokenizer cannot guarantee that token length is + # below MAX_KEY_LEN since the regexp is done with + # unicode and the result is later converted to UTF-8. In + # the process, the string length could expand, so we + # need to check here as well. + d.add_posting(checkKeyLen(token), position) + position += 1 + position += INTER_FIELD_POSITION_GAP + + if field.prefix: + prefix = field.name + for token in analyzer.tokenize(field.text): + # token is unicode, but gets converted to UTF-8 + # by makePairForWrite: + term = makePairForWrite(prefix, token, prefixMap) + d.add_posting(term, position) + position += 1 + position += INTER_FIELD_POSITION_GAP + + # add keyword fields + for field in self.keywords: + term = makePairForWrite(field.name, field.value, prefixMap) + d.add_term(term) + + # add non positional terms + for term in self.terms: + d.add_term(term.value) + + # add sort keys + for field in self.sortFields: + self.addSortField(d, field, indexValueMap) + + # serialize and add the data object if present + if self.data is not self._noObject: + dataStr = cPickle.dumps(self.data, self._picklerProtocol) + d.set_data(dataStr) + + return d + + def addSortField(self, doc, field, indexValueMap): + if field.index is None: + valueIndex = indexValueMap.get(field.name, None) + if valueIndex is None: + from index import NoIndexValueFound + raise NoIndexValueFound(field.name, indexValueMap) + else: + valueIndex = field.index + assert isinstance(valueIndex, int) + + if field.flattener: + flatValue = field.flattener(field.value) + else: + flatValue = self.flatten(field.value) + # xapian has no limit on value length + cleanValue = cleanInputText(flatValue) + doc.add_value(valueIndex, cleanValue) + + _flatteners = {} + + def flatten(self, value): + t = type(value) + if t == str: + return value + elif t in self._flatteners: + flattener = self._flatteners[t] + flatVal = flattener(value) + return flatVal + else: + raise ValueError("Cannot flatten %r into a string. Perhaps you " + "should register a flattener for type %r." 
+ % (value, type(value))) + + def registerFlattener(klass, typeToFlatten, flattener): + if typeToFlatten in klass._flatteners: + raise ValueError("A sort field flattener for type %s has already" + "been registered (%s) but you are attempting to" + "register a new flattener: %s" + % (typeToFlatten, klass._flatteners[typeToFlatten], + flattener)) + assert callable(flattener) + klass._flatteners[typeToFlatten] = flattener + registerFlattener = classmethod(registerFlattener) + + def unregisterFlattener(klass, typeToFlatten): + if typeToFlatten in klass._flatteners: + del klass._flatteners[typeToFlatten] + unregisterFlattener = classmethod(unregisterFlattener) + +# common flatteners: + +def flattenNumeric(value, numDigits = 10): + return ''.join(('%', str(numDigits), '.d')) % value + +Document.registerFlattener(int, flattenNumeric) + +def flattenLong(value): + return flattenNumeric(value, numDigits=20) + +Document.registerFlattener(long, flattenLong) + +def flattenDate(value): + return value.isoformat() + +for dt in (datetime.date, datetime.time, datetime.datetime): + Document.registerFlattener(dt, flattenDate) + +def flattenUnicode(value): + return value.encode(UNICODE_ENCODING) + +Document.registerFlattener(unicode, flattenUnicode) + + +def cleanInputText(unknownText, returnUnicode = False): + if isinstance(unknownText, str): + originalText = unknownText.decode(UNICODE_ENCODING, UNICODE_ERROR_POLICY) # XXX hardcoded UTF-8, make param XXX + elif isinstance(unknownText, unicode): + originalText = unknownText + else: + raise ValueError("Only strings and unicode objects can be indexed.") + # be very careful about lowercasing the text here: since the API we + # expose to higher levels doesn't allow searchup.py to call + # findInField directly, searches for INDEXERVERSION:4 have to be + # sent as regular queries. lowercasing all queries here will break + # keyword searches. + if returnUnicode: + return originalText + else: + return originalText.encode(UNICODE_ENCODING, UNICODE_ERROR_POLICY) + + +def makePairForWrite(prefix, token, prefixMap=None): + # prefixes must be uppercase; if the prefix given to us is a str + # that happens to be UTF-8 encoded, bad things will happen when we + # uppercase it, so we convert everything to unicode first + if isinstance(prefix, str): + prefix = prefix.decode(UNICODE_ENCODING, UNICODE_ERROR_POLICY) + if isinstance(token, str): + token = token.decode(UNICODE_ENCODING, UNICODE_ERROR_POLICY) # XXX hardcoded UTF-8, make param + + if prefixMap is None: + prefix = prefix.upper() + else: # we have a map, so first translate it using the map (e.g. 'title' -> 'S') + prefix = prefixMap.get(prefix, prefix.upper()) + + result = '%s%s%s' % (prefix, prefix[0] == 'X' and ':' or '', token) + # since return value is going into the db, it must be encoded as UTF-8 + result = result.encode(UNICODE_ENCODING, UNICODE_ERROR_POLICY) + return checkKeyLen(result) + +def checkKeyLen(s): + if not s: + return ' ' + numNullBytes = s.count('\0') + 1 + xapianLen = numNullBytes + len(s) + 1 # that last one is for the + # terminating \0 + if xapianLen < MAX_KEY_LEN: + return s + else: + # doing nothing seems preferable to mangling an overly large + # token that we don't know how to handle. we use a space + # instead of an empty string because xapian doesn't like + # getting empty strings added as terms + return ' ' +
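The xapwrap/document.py hunk above adds the Document/TextField/SortKey/Keyword wrapper around xapian.Document. A minimal usage sketch follows; the field names ('title', 'content', 'mtime', 'lang'), the value-slot numbers and the prefix map entries are illustrative assumptions, not MoinMoin's actual indexing schema::

    from datetime import datetime
    from MoinMoin.support.xapwrap.document import Document, TextField, SortKey, Keyword

    doc = Document(textFields=[TextField('title', u'HelloPage', prefix=True),
                               TextField('content', u'Some wiki page text')],
                   sortFields=[SortKey('mtime', datetime(2006, 6, 29))],
                   keywords=[Keyword('lang', 'en')],
                   uid=42)

    # value slots 0/1 are claimed for uid/uidREV by the index code; the
    # even/odd pairing for 'mtime'/'mtimeREV' mirrors what SmartIndex does
    indexValueMap = {'uid': 0, 'uidREV': 1, 'mtime': 2, 'mtimeREV': 3}
    prefixMap = {'title': 'S', 'lang': 'XLANG'}   # hypothetical prefixes

    # tokens are lower-cased and UTF-8 encoded; the datetime sort key is
    # flattened via the registered flattenDate flattener (isoformat)
    xapdoc = doc.toXapianDocument(indexValueMap, prefixMap)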
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/MoinMoin/support/xapwrap/index.py Thu Jun 29 01:05:53 2006 +0200 @@ -0,0 +1,1030 @@ +# Copyright (c) 2005 Divmod Inc. See LICENSE file for details. +""" +Xapwrap provides an improved interface to the Xapian text indexing +library (see http://www.xapian.org/ for more information on +Xapian). Xapwrap provides a layered approach offering ample +opportunities for customization. + +Example +------- +:: + + from xapwrap import SmartIndex, Document, TextField, SortKey + from datetime import date + + idx = SmartIndex('/tmp/index', True) + d1 = Document(TextField('hi there bob'), + sortFields = [SortKey('date', date(2004, 1, 1)), + SortKey('author', 'Bob'), + SortKey('size', 450)]) + idx.index(d1) + idx.close() + + idx = SmartIndex('/tmp/index') + print idx.search('there', 'date', sortAscending = True) + + + +Indices +------- + +Important methods for C{ReadOnlyIndex}: + __init__(self, *pathnames) + close(self) + configure(self, prefixMap = None, indexValueMap = None) + flush(self) + search(self, query, sortKeyt = None, + startingIndex = 0, batchSize = MAX_DOCS_TO_RETURN, + sortIndex = None, sortAscending = True, + sortByRelevence = False) + count(self, query) + checkIndex(self, maxID) + get_doccount(self, uid) + +Important methods for C{Index}: + (all methods in ReadOnlyIndex) + __init__(self, pathname, create) + index(self, doc) + add_document(self, doc) + replace_document(self, uid, doc) + delete_document(self, uid) + +C{SmartIndex} and C{SmartReadOnlyIndex} define the same methods as their +dumb counterparts. + +The primary way to interact with a Xapian index is to use either the +C{Index} or C{ReadOnlyIndex} class. In addition to offering read only +access without the inconveniance of lock files, C{ReadOnlyIndex} offers +the ability to merge several xapian indices into one super index with +only a small performance impediement. + +In addition to C{Index} and C{ReadOnlyIndex}, Xapwrap also offers +C{SmartIndex} and C{SmartReadOnlyIndex} classes. These classes +automatically store and manage the index value map and the prefix map in +the index. There are two caveats to using them however. First, one +cannot index documents that have a xapian ID of 1. Secondly, when using +C{SmartReadOnlyIndex} to combine multiple indices together, the indices +must have consistent value index maps. Indices where all documents have +the same index value map are always consistent. The problem only emerges +when indices can have different types of documents with different sets +of sort keys. More specifically, the problem can only emerge if one +indices documents in such a way that sort keys are added to different +indices in different orders. + + +Documents +--------- + +In order to add new data to an index, one asks a C{Index} or +C{SmartIndex} instance to index a C{Document} instance. Documents take a +sequence of text fields, a sequence of sort keys and a sequence of +keywords as constructor arguments. They also take optional universal +identifiers and an arbitrary serializable object. The first three +sequences can be created using the C{TextField}, C{SortKey}, and +C{Keyword} classes defined below. C{TextField} instances contain a chunk +of text and an optional name as well as a boolean indicating whether the +field is to be prefixed. Prefixed fields are effectively indexed twice: +after being indexed normally, each token is indexed again with the field +name. 
This allows the user to perform fielded searches and is primarily +useful for small text fields, such as the subject of an email or a list +of author names. C{Keyword} instances denote individual prefixed tokens +that are indexed with no positional information. C{SortKey} instances +denote arbitrary fields that are used for sorting documents. They +include a sort field name and the sort key value. Since Xapian only +accepts strings as sort keys, sort key values must be flattened into +strings before entering the index. + +Xapwrap defines flattener functions that automatically flatten integer, +date, time, and datetime instances into strings that sort properly. You +can define your own flatteners for custom data types by using the +C{registerFlattener} class method of the C{Document} class. + + +Error Handling +-------------- +Internal Xapian error conditions should generate normal python +exceptions defined in this file that inherit from xapwrap.XapianError. + + +Logging +------- +Xapwrap will use twisted's logging facilities if available. In any +event, a custom logging function can be supplied by setting xapwrap.log. + + +Future Work +----------- +Xapwrap currently does not support stemming or stop words, although a +future version will. + +""" +import cPickle, sets, glob, os +import xapian +from document import makePairForWrite, StandardAnalyzer, Document, SortKey, Keyword +from document import UNICODE_ENCODING, UNICODE_ERROR_POLICY + +try: + from atop.tpython import FilesystemLock +except ImportError: + from os import symlink, readlink, remove as rmlink + import errno + + class FilesystemLock: + """A mutex. + + This relies on the filesystem property that creating + a symlink is an atomic operation and that it will + fail if the symlink already exists. Deleting the + symlink will release the lock. + + @ivar name: The name of the file associated with this lock. + @ivar clean: Indicates whether this lock was released cleanly by its + last owner. Only meaningful after C{lock} has been called and returns + True. + """ + + clean = None + locked = False + + def __init__(self, name): + self.name = name + + def lock(self): + """Acquire this lock. + + @rtype: C{bool} + @return: True if the lock is acquired, false otherwise. + + @raise: Any exception os.symlink() may raise, other than + EEXIST. + """ + try: + pid = readlink(self.name) + except (OSError, IOError), e: + if e.errno != errno.ENOENT: + raise + self.clean = True + else: + if not hasattr(os, 'kill'): + return False + try: + os.kill(int(pid), 0) + except (OSError, IOError), e: + if e.errno != errno.ESRCH: + raise + rmlink(self.name) + self.clean = False + else: + return False + + symlink(str(os.getpid()), self.name) + self.locked = True + return True + + def unlock(self): + """Release this lock. + + This deletes the directory with the given name. + + @raise: Any exception os.readlink() may raise, or + ValueError if the lock is not owned by this process. + """ + pid = readlink(self.name) + if int(pid) != os.getpid(): + raise ValueError("Lock %r not owned by this process" % (self.name,)) + rmlink(self.name) + self.locked = False + +try: + from twisted.python.log import msg as log +except ImportError: + def log(*args): + pass + + +# max number of bytes that can be indexed without forcing an index +# flush. 
this limits memory consumption +MAX_DATA_INDEXED_BETWEEN_FLUSHES = 200 * 1000 + +MAX_DOCS_TO_RETURN = 1000 * 1000 + +XAPIAN_LOCK_FILENAME = "db_lock" +XAPWRAP_LOCK_FILENAME = "xapian_lock" + +# Xapian error handling is somewhat weak: all errors trigger either an +# IOError, a RuntimeError, or a ValueError. The exception's args +# attribute is a singleton tuple containing an explanation +# string. Possible errors include 'DatabaseCorruptError: Quartz metafile +# /tmp/foo/meta is invalid: magic string not found.' and +# 'DatabaseLockError: Unable to acquire database write lock +# /tmp/foo/db_lock'. Instead of looking inside exception error strings +# everywhere, I made a wrapper for xapian database operations that +# catches exceptions and translates them into the more meaningful +# exceptions shown below. + +class XapianError(StandardError): + pass +class XapianRuntimeError(XapianError): + pass +class XapianLogicError(XapianError): + pass +class XapianDatabaseError(XapianError): + pass + +class XapianAssertionError(XapianLogicError): + pass +class InvalidOperationError(XapianLogicError): + pass +class InvalidArgumentError(XapianLogicError): + pass +class UnimplementedError(XapianLogicError): + pass + +class DocNotFoundError(XapianRuntimeError): + pass +class RangeError(XapianRuntimeError): + pass +class InternalError(XapianRuntimeError): + pass +class FeatureUnavalableError(XapianRuntimeError): + pass +class XapianNetworkError(XapianRuntimeError): + pass + +class NetworkTimeoutError(XapianNetworkError): + pass + +class DatabaseCorruptionError(XapianDatabaseError): + pass +class DatabaseCreationError(XapianDatabaseError): + pass +class DatabaseOpeningError(XapianDatabaseError): + pass +class DatabaseLockError(XapianDatabaseError): + pass +class DatabaseModifiedError(XapianDatabaseError): + pass + +# these exceptions are not Xapian errors +class UnknownDatabaseError(XapianError): + pass + +class NoIndexValueFound(XapianError): + pass + +class InconsistantIndex(XapianError): + pass + +class InconsistantIndexCombination(XapianError): + pass + + +def makeTranslatedMethod(methodName): + def translatedMethod(self, *args, **kwargs): + try: + return getattr(self.db, methodName)(*args, **kwargs) + except (IOError, RuntimeError, ValueError), e: + errorMsg = e.args[0] + for subString, exceptionClass in self.exceptionStrMap.iteritems(): + if subString in errorMsg: + raise exceptionClass(e) + else: + raise UnknownDatabaseError(e) + except: + raise + return translatedMethod + +class ExceptionTranslater: + def __init__(self, db): + self.db = db + + def openIndex(klass, readOnly, *args, **kwargs): + try: + if readOnly: + assert len(kwargs) == 0 + # assume all args are db paths + db = xapian.Database(args[0]) + for path in args[1:]: + db.add_database(xapian.Database(path)) + return klass(db) + else: + return klass(xapian.open(*args, **kwargs)) + except (IOError, RuntimeError, ValueError), e: + errorMsg = e.args[0] + for subString, exceptionClass in klass.exceptionStrMap.iteritems(): + if subString in errorMsg: + raise exceptionClass(e) + else: + raise UnknownDatabaseError(e) + except Exception, e: + raise UnknownDatabaseError(e) + + openIndex = classmethod(openIndex) + + # possible exceptions are taken from the list at + # http://www.xapian.org/docs/apidoc/html/errortypes_8h.html + exceptionStrMap = { + # exceptions whose names differ between xapwrap and Xapian + 'DatabaseCorruptError': DatabaseCorruptionError, + 'AssertionError': XapianAssertionError, + 'DatabaseCreateError': DatabaseCreationError, + + # 
exceptions translated with the same name + 'DatabaseLockError': DatabaseLockError, + 'DatabaseOpeningError': DatabaseOpeningError, + 'DatabaseModifiedError': DatabaseModifiedError, + 'FeatureUnavalableError': FeatureUnavalableError, + 'DocNotFoundError': DocNotFoundError, + 'InvalidOperationError': InvalidOperationError, + 'InvalidArgumentError': InvalidArgumentError, + 'UnimplementedError': UnimplementedError, + 'NetworkError': XapianNetworkError, + 'NetworkTimeoutError': NetworkTimeoutError, + 'DatabaseError': XapianDatabaseError, + 'InternalError': InternalError, + 'RangeError': RangeError, + 'RuntimeError': XapianRuntimeError, + 'LogicError': XapianLogicError + } + + get_doccount = makeTranslatedMethod('get_doccount') + add_document = makeTranslatedMethod('add_document') + replace_document = makeTranslatedMethod('replace_document') + delete_document = makeTranslatedMethod('delete_document') + flush = makeTranslatedMethod('flush') + term_exists = makeTranslatedMethod('term_exists') + reopen = makeTranslatedMethod('reopen') + begin_transaction = makeTranslatedMethod('begin_transaction') + commit_transaction = makeTranslatedMethod('commit_transaction') + cancel_transaction = makeTranslatedMethod('cancel_transaction') + get_lastdocid = makeTranslatedMethod('get_lastdocid') + get_avlength = makeTranslatedMethod('get_avlength') + get_termfreq = makeTranslatedMethod('get_termfreq') + get_collection_freq = makeTranslatedMethod('get_collection_freq') + get_doclength = makeTranslatedMethod('get_doclength') + get_document = makeTranslatedMethod('get_document') + + postlist_begin = makeTranslatedMethod('postlist_begin') + postlist_end = makeTranslatedMethod('postlist_end') + termlist_begin = makeTranslatedMethod('termlist_begin') + termlist_end = makeTranslatedMethod('termlist_end') + positionlist_begin = makeTranslatedMethod('positionlist_begin') + positionlist_end = makeTranslatedMethod('positionlist_end') + allterms_begin = makeTranslatedMethod('allterms_begin') + allterms_end = makeTranslatedMethod('allterms_end') + + +def makeProtectedDBMethod(method, setupDB = True): + def protectedMethod(self, *args, **kwargs): + if setupDB: + self.setupDB() + try: + return method(self, *args, **kwargs) +## # test that this works and doesn't recurse infinitely +## except DatabaseModifiedError: +## self.reopen() +## return protectedMethod(self, *args, **kwargs) + except XapianError, e: + #log("error encountered while performing xapian index operation %s: %s" + # % (method.__name__, e)) + self.close() + raise + return protectedMethod + + +# there are lots of places below where we write code like: +# enq = mset = None +# try: +# enq = self.enquire(foo) +# mset = enq.get_mset(0, 10) +# return mset[0][flimflam] +# except: +# del enq, mset +# raise + +# the purpose of this code is to ensure that no references to enquire +# objects or msets will outlive the function call. msets and enquire +# objsects hold a reference to the xapian db, and thus prevent it from +# being properly gc'd. if we fail to delete enq and mset on exception, +# then they can be kept around for arbitrarily long periods of time as +# part of the exception state + + +# be extremely careful about keeping a db object in local scope; +# once its there, an unhandled exception could create a traceback +# containing a frame object that holds a copy of the locals dict, +# including the db object. 
if that frame/traceback object is kept +# around forever (which parts of twisted/quotient seem to do, +# especially deferreds), then the db object will never be deleted +# and the indexer lock will never go away. + +# in order to prevent that from happening, we maintain two invariants: + +# 1. the db is only accessed as an instance attribute and is never +# copied into a local variable. i.e., we always say self.db and +# never ever say db = self.db. this keeps the db object from ever +# getting captured by a frame/traceback. + +# 2. the db is only accessed from within an exception handler that +# calls self.close() in the event of *any* failure. this ensures +# that the instance loses all references to the db on failure, so, +# even if the instance object is captured by a frame object (or +# something else), the db will already have been freed. + + +class ReadOnlyIndex: + """ + I represent a Xapian index that is read only by wrapping the + xapian.Database class. Because I provide read only access, I can be + used to combine several Xapian indices into one index with + performance only slightly lower than when using only one index. + + @cvar DEFAULT_QUERY_COMBINER_OP: the operation used by the query parser to combine query terms + + @cvar STEMMING_LANGUAGE: the language used by the query parser for + stemming. this is of little use since Xapwrap does not yet support + stemming when indexing. + + @ivar names: a sequence of file names representing paths to Xapian + indices + + Please use the configure method to modify C{prefixMap} and C{indexValueMap} + + @ivar prefixMap: a map of prefixes used by named fields in the index + and the name they should be referred to by the query parser + + @ivar indexValueMap: a map from sort field names to value integer + + @ivar amountIndexedSinceLastFlush: the number of bytes indexed since + the last flush + + The following instance attributes should never be modified or + accessed directly: + + @ivar db: the xapian index object + @ivar qp: the xapian query parser object + @ivar _searchSessions: a map from query description string to + (enquire, lastIndexSortedBy) + """ + + DEFAULT_QUERY_COMBINER_OP = xapian.Query.OP_AND + STEMMING_LANGUAGE = 'none' + + def __init__(self, *names): + if len(names) < 1: + raise ValueError("No index directory supplied to Index constructor") + self.names = names + self.db = None + self.qp = None + self._searchSessions = {} + self.prefixMap = {} + self.indexValueMap = {} + self.amountIndexedSinceLastFlush = 0 + + def setupDB(self): + # we hide the db so that methods always access it only through + # this method since db objects can be silently reaped when not + # in use. db objects consume 5 file descriptors. + + if self.db is None: + self._setupDB() + + #self.qp = xapian.QueryParser() + # this is vital: these options specify no language for + # stemming (""), disable stemming (False), and specify an + # empty stop word object (None). 
we need this because by + # default, xapian's query parser does english stemming + #s = xapian.Stem(self.STEMMING_LANGUAGE) + #self.qp.set_stemmer(s) + + # we want query terms to be ANDed together by default + #self.qp.set_default_op(self.DEFAULT_QUERY_COMBINER_OP) + self._configure() + + log("Index %s contains %s documents" % + (self.names, self.get_doccount())) + + def _setupDB(self): + self.db = ExceptionTranslater.openIndex(True, *self.names) + + def close(self): + log("closing xapian index %s" % self.names) + for query in self._searchSessions.keys(): + del self._searchSessions[query] + self.qp = None + self.db = None + + def _configure(self): + if 'uid' not in self.indexValueMap: + # this a gross hack... + self.indexValueMap['uid'] = 0 + self.indexValueMap['uidREV'] = 1 + if self.qp is not None: + for k, v in self.prefixMap.iteritems(): + # check for unicode encoding? + if v: + V = v.upper() + else: + V = k.upper() + self.qp.add_prefix(k, V) + + def configure(self, prefixMap = None, indexValueMap = None): + if prefixMap is not None: + self.prefixMap = prefixMap + if indexValueMap is not None: + self.indexValueMap = indexValueMap + self._configure() + + def get_doccount(self): + return self.db.get_doccount() + get_doccount = makeProtectedDBMethod(get_doccount) + + def enquire(self, query): + searchSession = None + try: + searchSession = xapian.Enquire(self.db.db) + searchSession.set_query(query) + return searchSession + except: + del query, searchSession + raise + enquire = makeProtectedDBMethod(enquire) + + def flush(self): + if self.db is not None: + self.db.flush() + self.amountIndexedSinceLastFlush = 0 + flush = makeProtectedDBMethod(flush) + + def search(self, query, + sortKey = None, + startingIndex = 0, + batchSize = MAX_DOCS_TO_RETURN, + sortIndex = None, sortAscending = True, + sortByRelevence = False, + valuesWanted = None): + """ + Search an index. + + @ivar valuesWanted: a list of Values that will be returned as part + of the result dictionary. + """ + + # TODO - allow a simple way to get Keywords out + self.setupDB() + if isinstance(query, (str, unicode)): + query = ParsedQuery(query) + elif not(isinstance(query, Query)): + raise ValueError("query %s must be either a string or a " + "subclass of xapwrap.Query" % query) + + q = query.prepare(self.qp) + # uggg. this mess is due to the fact that xapain Query objects + # don't hash in a sane way. + qString = q.get_description() + + # the only thing we use sortKey for is to set sort index + if sortKey is not None: + sortIndex = self.indexValueMap[sortKey] + + # once you call set_sorting on an Enquire instance, there is no + # way to resort it by relevence, so we have to open a new + # session instead. + + # ignore sortAscending since there's no easy way to implement + # ascending relevancy sorts and it's tough to imagine a case + # where you'd want to see the worst results. in any event, the + # user can always sort by relevancy and go to the last page of + # results. 
+ + enq = mset = None + if qString not in self._searchSessions: + self._searchSessions[qString] = (self.enquire(q), None) + try: + enq, lastIndexSortedBy = self._searchSessions[qString] + + # if we don't set sortIndex, the results will be returned + # sorted by relevance, assuming that we have never called + # set_sorting on this session + if sortByRelevence and lastIndexSortedBy is not None: + sortIndex = sortKey = None + if lastIndexSortedBy is not None: + del self._searchSessions[qString] + self._searchSessions[qString] = (self.enquire(q), None) + enq, lastIndexSortedBy = self._searchSessions[qString] + if sortIndex is not None: + # It seems that we have the opposite definition of sort ascending + # than Xapian so we invert the ascending flag! + enq.set_sort_by_value(sortIndex, not sortAscending) + + self._searchSessions[qString] = (enq, sortIndex) + + mset = enq.get_mset(startingIndex, batchSize) + results = [] + for m in mset: + thisResult = {} + thisResult['uid'] = m[xapian.MSET_DID] + thisResult['score'] = m[xapian.MSET_PERCENT] + if valuesWanted: + xapDoc = m[4] + valRes = {} + for valName in valuesWanted: + valueIndex = self.indexValueMap.get(valName, None) + if valueIndex is None: + raise NoIndexValueFound(valName, self.indexValueMap) + valRes[valName] = xapDoc.get_value(valueIndex) + thisResult['values'] = valRes + results.append(thisResult) + return results + except: + del enq, mset + raise + search = makeProtectedDBMethod(search) + + def count(self, query): + enq = mset = None + try: + enq = self.enquire(query) + # get_matches_estimated does not return accurate results if + # given a small ending number like 0 or 1 + mset = enq.get_mset(0, MAX_DOCS_TO_RETURN) + sizeEstimate = mset.get_matches_estimated() + return sizeEstimate, self.get_doccount() + except: + del enq, mset + raise + count = makeProtectedDBMethod(count) + + def checkIndex(self, maxID): + """Compute a list of all UIDs less than or equal to maxID that + are not in the db. + """ + # I had originally suspected that the performance hit of + # returning a huge list in the case of empty indexes would be + # substantial, but testing with a 120,000 msg index indicates + # that performance is fine and that the space overhead is quite + # reasonable. If that were not the case, this could be optimized + # by calculating the maximum document ID in the index and only + # scanning up to the minimum of maxID and the max ID in the + # index, assuming that were using the same document IDs in the + # index as in atop. + + missingUIDs = [] + for uid in xrange(maxID + 1): + term = makePairForWrite('UID', str(uid)) + if not self.db.term_exists(term): + missingUIDs.append(uid) + return missingUIDs + checkIndex = makeProtectedDBMethod(checkIndex) + + def get_documents(self, uid): + """ return a list of remapped UIDs corresponding to the actual UID given + """ + docTerm = makePairForWrite('UID', str(uid)) + candidates = self.search(RawQuery(docTerm)) + return [int(c['uid']) for c in candidates] + + def get_document(self, uid): + # we cannot simply use db.get_document since doc ids get + # remapped when combining multiple databases + candidates = self.get_documents(uid) + if len(candidates) == 0: + raise DocNotFoundError(uid) + elif len(candidates) == 1: + return self._get_document(candidates[0]) + else: + raise InconsistantIndex( + "Something has gone horribly wrong. 
I tried " + "retrieving document id %s but found %i documents " + "with that document ID term" % (uid, len(candidates))) + + def _get_document(self, uid): + assert isinstance(uid, int) + return self.db.get_document(uid) + _get_document = makeProtectedDBMethod(_get_document) + + def term_exists(self, term): + assert isinstance(term, str) + return self.db.term_exists(term) + term_exists = makeProtectedDBMethod(term_exists) + + def get_lastdocid(self): + return self.db.get_lastdocid() + get_lastdocid = makeProtectedDBMethod(get_lastdocid) + +# XXX FIXME: we should consider deleting all searchSessions whenever we +# add a document, or we should reopen the db + + +class Index(ReadOnlyIndex): + + def __init__(self, name, create = False, analyzer = None): + # XXX FIXME: we should really try opening the db here, so that + # any errors are caught immediately rather than waiting for the + # first time we try to do something... + ReadOnlyIndex.__init__(self, name) + self.name = name + if create: + self.flags = xapian.DB_CREATE_OR_OPEN + else: + self.flags = xapian.DB_OPEN + self.analyzer = analyzer or StandardAnalyzer() + self.lockFile = FilesystemLock( + os.path.join(self.name, XAPWRAP_LOCK_FILENAME)) + + def _setupDB(self): + """ really get a xapian database object """ + + # xapian expects directories! self.name should refer to a + # directory. if it doesn't exist, we'll make one. + if not os.path.exists(self.name): + os.mkdir(self.name) + + # try to acquire a lock file + if not self.lockFile.lock(): + owningPid = os.readlink(self.lockFile.name) + errorMsg = ("cannot acquire lock file for xapian index %s" + "because it is owned by process %s" % + (self.name, owningPid)) + log(errorMsg) + raise DatabaseLockError(errorMsg) + xapLockFilePath = os.path.join(self.name, XAPIAN_LOCK_FILENAME) + if os.path.exists(xapLockFilePath): + log("Stale database lock found in %s. Deleting it now." % xapLockFilePath) + os.remove(xapLockFilePath) + + # actually try to open a xapian DB + try: + try: + self.db = ExceptionTranslater.openIndex(False, self.name, self.flags) + except DatabaseCorruptionError, e: + # the index is trashed, so there's no harm in blowing it + # away and starting from scratch + log("Xapian index at %s is corrupted and will be destroyed" + % self.name) + if self.lockFile.locked: + self.lockFile.unlock() + for idxFname in glob.glob(os.path.join(self.name, '*')): + os.remove(idxFname) + self.db = ExceptionTranslater.openIndex(False, self.name, self.flags) + finally: + if self.db is None and self.lockFile.locked: + self.lockFile.unlock() + + def __del__(self): + self.close() + + def close(self): + # this is important! the only way to get xapian to release the + # db lock is to call the db object's destructor. that won't + # happen until nobody is holding a reference to the db + # object. unfortunately, the query parser holds a reference to + # it, so the query parser must also go away. do not hold + # references to these objects anywhere but here. + + # enquire objects and mset objects hold a reference to the db, + # so if any of them are left alive, the db will not be reclaimed + + if self.db is not None: + ReadOnlyIndex.close(self) + # the islink test is needed in case the index directory has + # been deleted before we close was called. 
+ if self.lockFile.locked and os.path.islink(self.lockFile.name): + self.lockFile.unlock() + # there is no point in checking if the lock file is still + # around right here: it will only be deleted when xapian's + # destructor runs, but python defers running destructors + # until after exception handling is complete. since this + # code will often get called from an exception handler, we + # have to assume that the lock file's removal will be + # delayed at least until after this method exits + + def get_document(self, uid): + return self._get_document(uid) + + # methods that modify db state + + def index(self, doc): + self.setupDB() + if hasattr(doc, 'uid') and doc.uid: + uid = int(doc.uid) + doc.sortFields.append(SortKey('uid', uid)) + doc.keywords.append(Keyword('uid', str(uid))) + xapDoc = doc.toXapianDocument(self.indexValueMap, self.prefixMap) + self.replace_document(uid, xapDoc) + else: + # We need to know the uid of the doc we're going to add + # before we add it so we can setup appropriate uid sorting + # values. But, another thread could potentially insert a + # document at that uid after we determine the last uid, but + # before we manage the insertion. Yay race conditions! So we + # try to add the document and then check that it ended up at + # the right uid. If it did not, we update it with the + # correct uid sort values. + uid = self.get_lastdocid() + 1 + doc.sortFields.append(SortKey('uid', uid)) + doc.keywords.append(Keyword('uid', str(uid))) + xapDoc = doc.toXapianDocument(self.indexValueMap, self.prefixMap) + newUID = self.add_document(xapDoc) + if newUID != uid: + doc.sortFields.append(SortKey('uid', newUID)) + doc.keywords.append(Keyword('uid', str(newUID))) + xapDoc = doc.toXapianDocument(self.indexValueMap, self.prefixMap) + self.replace_document(newUID, xapDoc) + + # a simpler alternative would be to add an empty document + # and then replace it. the problem with that strategy is + # that it kills performance since xapian performs an + # implicit flush when you replace a document that was added + # but not yet committed to disk. + + self.amountIndexedSinceLastFlush += len(doc) + if self.amountIndexedSinceLastFlush > MAX_DATA_INDEXED_BETWEEN_FLUSHES: + self.flush() + return uid + + def add_document(self, doc): + return self.db.add_document(doc) + add_document = makeProtectedDBMethod(add_document) + + def replace_document(self, uid, doc): + return self.db.replace_document(uid, doc) + replace_document = makeProtectedDBMethod(replace_document) + + def delete_document(self, docID): + return self.db.delete_document(docID) + delete_document = makeProtectedDBMethod(delete_document) + +class Query: + pass + +class ParsedQuery(Query): + def __init__(self, queryString): + if isinstance(queryString, unicode): + queryString = queryString.encode(UNICODE_ENCODING, UNICODE_ERROR_POLICY) + # as of xapian 0.9.5 the query parser makes trouble with utf-8. but it + # also doesnt work with iso-8859-15, so we just live with ascii-only search + # for now... - a utf8 fix seems to be planned for the near future! 
+ self.queryString = queryString + + def prepare(self, queryParser): + return queryParser.parse_query(self.queryString) + +class RawQuery(Query): + def __init__(self, queryString): + if isinstance(queryString, unicode): + queryString = queryString.encode('utf-8') + + assert isinstance(queryString, str) + self.queryString = queryString + + def prepare(self, queryParser): + return xapian.Query(self.queryString) + +class QObjQuery(Query): + def __init__(self, query): + assert isinstance(query, xapian.Query) + self.query = query + + def prepare(self, queryParser): + return self.query + +class SmartIndex(Index): + documentFactory = Document + + def __init__(self, *args, **kwargs): + Index.__init__(self, *args, **kwargs) + self.fetchState() + + def saveState(self): + self.setupDB() + state = {'indexValueMap': self.indexValueMap, + 'prefixMap': self.prefixMap} + d = self.documentFactory(uid = 1, data = state) + self.index(d, checkID = False) + self.flush() + + def fetchState(self): + self.setupDB() + if self.get_doccount() == 0: + # Don't rely on the try:except: for this case + self.saveState() + try: + doc = self.get_document(1) + except DocNotFoundError: + newState = {'indexValueMap': {}, 'prefixMap': {}} + self.saveState() + else: + dataStr = doc.get_data() + newState = cPickle.loads(dataStr) + self.indexValueMap.update(newState['indexValueMap']) + self.prefixMap.update(newState['prefixMap']) + + def index(self, doc, checkID = True): + if hasattr(doc, 'uid') and doc.uid == 1 and checkID: + raise InvalidArgumentError( + "document UIDs must be greater than one when using SmartIndex") + + docSortKeys = sets.Set([sk.name for sk in doc.sortFields if sk.name is not None]) + indexSortKeys = sets.Set(self.indexValueMap.keys()) + if not docSortKeys.issubset(indexSortKeys): + nextValueIndex = 1 + max(self.indexValueMap.itervalues()) + # we sort the sortKeys in order to improve the odds that two + # indices that are indexed with the same documents in the + # same order will always end up with the same + # indexValueMaps, even if different versions of python are + # used with different hash functions + sortKeys = list(docSortKeys) + sortKeys.sort() + for sortKey in sortKeys: + if sortKey not in self.indexValueMap: + assert nextValueIndex % 2 == 0 + self.indexValueMap[sortKey] = nextValueIndex + self.indexValueMap[sortKey + 'REV'] = nextValueIndex + 1 + nextValueIndex += 2 + self.saveState() + + docKeywords = sets.Set([tf.name for tf in doc.textFields if tf.prefix] + + [kw.name for kw in doc.keywords]) + indexKeyWords = sets.Set(self.prefixMap.keys()) + if not docKeywords.issubset(indexKeyWords): + for k in docKeywords - indexKeyWords: + self.prefixMap[k] = k.upper() + self.saveState() + + return Index.index(self, doc) + + +class SmartReadOnlyIndex(ReadOnlyIndex): + + def __init__(self, *args, **kwargs): + ReadOnlyIndex.__init__(self, *args, **kwargs) + self.fetchState() + + def fetchState(self): + stateDocIDs = self.get_documents(1) + stateDocs = map(self._get_document, stateDocIDs) + states = [cPickle.loads(s.get_data()) for s in stateDocs] + + # should we issue a warning when the number of states that we + # retrieve is less than the number of indices we opened? the + # only problem is that some indices may be empty, but there's no + # easy way to check how many documents are in a subindex without + # opening it explicitly using xapian.Database and that seems + # rather expensive for this code path. 
+ + # merge all the states into a master state + master = {'prefixMap': self.prefixMap, + 'indexValueMap': self.indexValueMap} + # note that if there are conflicts, there is no guarantee on who + # will win, but it doesn't matter since we'll die on conflicts + # later anyway + for s in states: + for substate in ('prefixMap', 'indexValueMap'): + sub = s.get(substate, {}) + mSub = master[substate] + for k, v in sub.iteritems(): + mSub[k] = v + + # ensure that states are compatible (check for conflicts) + conflicts = [] + for s in states: + for substate in ('prefixMap', 'indexValueMap'): + sub = s.get(substate, {}) + mSub = master[substate] + for k, v in sub.iteritems(): + if k in mSub and mSub[k] != v: + # we defer error reporting so that the user sees + # as much info on the error as possible + conflicts.append((substate, k, v, mSub[k])) + + # the only way states can be incompatible is if two states have + # different values for the same keys in the same substate + + if conflicts: + raise InconsistantIndexCombination( + "The SmartReadOnlyIndex opened on %s cannot recconcile " + "the following conflicts in the subindices' states:\n%s" + % (self.names, + '\n'.join(["%s[%r] is %r in one index but %r in another" + % c for c in conflicts]))) + + self.prefixMap = master['prefixMap'] + self.indexValueMap = master['indexValueMap'] + + def search(self, query, sortKey = None, + startingIndex = 0, + batchSize = MAX_DOCS_TO_RETURN, + sortIndex = None, sortAscending = True, + sortByRelevence = False): + # if the appropriate index value string is not in + # self.indexValueMap, fetchState() before calling + # ReadOnlyIndex.search. if it still isn't there, let + # ReadOnlyIndex.search take care of throwing an error + if sortKey is not None and sortKey not in self.indexValueMap: + self.fetchState() + return ReadOnlyIndex.search(self, query, sortKey, + startingIndex, batchSize, + sortIndex, sortAscending, + sortByRelevence) +
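The index() method shown above deals with a small race: it guesses the uid it will get, stamps that uid into the document's sort fields and keywords, adds the document, and then repairs the document if another writer grabbed that uid first. A minimal, runnable sketch of the same pattern, using a toy in-memory store in place of the Xapian database (illustrative only, not the xapwrap API):

    class ToyStore:
        """Stand-in for the backing database: it hands out document ids on add."""
        def __init__(self):
            self.docs = {}
            self.lastdocid = 0

        def add_document(self, doc):
            self.lastdocid += 1
            self.docs[self.lastdocid] = doc
            return self.lastdocid

        def replace_document(self, uid, doc):
            self.docs[uid] = doc

    def index(store, payload):
        # guess the uid we will get and stamp it into the document ...
        uid = store.lastdocid + 1
        doc = {'uid': uid, 'payload': payload}
        # ... then add it and check whether another writer got there first
        newuid = store.add_document(doc)
        if newuid != uid:
            doc['uid'] = newuid                  # repair the stale uid value
            store.replace_document(newuid, doc)
        return newuid

    store = ToyStore()
    assert index(store, 'first page') == 1
    assert index(store, 'second page') == 2

Adding first and fixing up afterwards avoids the implicit flush that replacing a not-yet-committed document would trigger, as the comment above explains.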
--- a/MoinMoin/theme/__init__.py Sun Jun 18 13:58:08 2006 +0200 +++ b/MoinMoin/theme/__init__.py Thu Jun 29 01:05:53 2006 +0200 @@ -347,10 +347,6 @@ except ValueError: pass - # Normalize page names, replace '_' with ' '. Usually - # all names use spaces internally, but for - # [name_with_spaces label] we must save the underscores - # until this point. pagename = request.normalizePagename(pagename) link = Page(request, pagename).link_to(request, title) @@ -555,9 +551,6 @@ for pagename in trail: try: interwiki, page = pagename.split(":", 1) - # Items in trail are saved as valid interwiki - # links, using _ for spaces. - page = page.replace('_', ' ') if request.cfg.interwikiname != interwiki: link = (self.request.formatter.interwikilink(True, interwiki, page) + self.shortenPagename(page) +
--- a/MoinMoin/user.py Sun Jun 18 13:58:08 2006 +0200 +++ b/MoinMoin/user.py Thu Jun 29 01:05:53 2006 +0200 @@ -136,9 +136,9 @@ @rtype: unicode @return: user name that can be used in acl lines """ - name = name.replace('_', ' ') # we treat _ as a blank - username_allowedchars = "'@.-" # ' for names like O'Brian or email addresses. - # "," and ":" must not be allowed (ACL delimiters). + username_allowedchars = "'@.-_" # ' for names like O'Brian or email addresses. + # "," and ":" must not be allowed (ACL delimiters). + # We also allow _ in usernames for nicer URLs. # Strip non alpha numeric characters (except username_allowedchars), keep white space name = ''.join([c for c in name if c.isalnum() or c.isspace() or c in username_allowedchars]) @@ -155,7 +155,6 @@ @param name: user name, unicode """ normalized = normalizeName(name) - name = name.replace('_', ' ') # we treat _ as a blank return (name == normalized) and not wikiutil.isGroupPage(request, name) @@ -812,8 +811,6 @@ if not self._cfg.interwikiname: return None - # Interwiki links must use _ e.g Wiki:Main_Page - pagename = pagename.replace(" ", "_") return "%s:%s" % (self._cfg.interwikiname, pagename) # ----------------------------------------------------------------- @@ -936,7 +933,7 @@ else: markup = pagename else: - markup = '%s:%s' % (wikiname, pagename.replace(" ","_")) + markup = '%s:%s' % (wikiname, pagename) return markup def mailAccountData(self, cleartext_passwd=None):
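The net effect of the user.py hunk above is that underscores now survive name normalization instead of being treated as blanks. A simplified sketch of just the character-filter step (the real normalizeName() also collapses surplus whitespace and group pages are rejected separately):

    username_allowedchars = "'@.-_"    # as in the hunk above

    def keep_allowed(name):
        # only the filtering step shown above, nothing else
        return u''.join([c for c in name
                         if c.isalnum() or c.isspace() or c in username_allowedchars])

    assert keep_allowed(u'Jane_Doe') == u'Jane_Doe'      # no longer rewritten to u'Jane Doe'
    assert keep_allowed(u'Bad:Name,Here') == u'BadNameHere'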
--- a/MoinMoin/userform.py Sun Jun 18 13:58:08 2006 +0200 +++ b/MoinMoin/userform.py Thu Jun 29 01:05:53 2006 +0200 @@ -38,10 +38,6 @@ Each line is a page name, empty lines ignored. - Items can use '_' as spaces, needed by [name_with_spaces label] - format used in quicklinks. We do not touch those names here, the - underscores are handled later by the theme code. - @param key: the form key to get @rtype: list of unicode strings @return: list of normalized names @@ -53,10 +49,6 @@ item = item.strip() if not item: continue - # Normalize names - except [name_with_spaces label] - # Commented out to allow URLs - #if not (item.startswith('[') and item.endswith(']')): - # item = self.request.normalizePagename(item) items.append(item) return items @@ -408,7 +400,7 @@ options = [] users = user.getUserList(self.request) for uid in users: - name = user.User(self.request, id=uid).name # + '_' + uid # for debugging + name = user.User(self.request, id=uid).name # + '/' + uid # for debugging options.append((name, name)) options.sort()
--- a/MoinMoin/util/thread_monitor.py Sun Jun 18 13:58:08 2006 +0200 +++ b/MoinMoin/util/thread_monitor.py Thu Jun 29 01:05:53 2006 +0200 @@ -80,3 +80,4 @@ trigger_dump() threading.Thread(target=background_dumper, args=(seconds, )).start() +
--- a/MoinMoin/wikiutil.py Sun Jun 18 13:58:08 2006 +0200 +++ b/MoinMoin/wikiutil.py Thu Jun 29 01:05:53 2006 +0200 @@ -208,7 +208,6 @@ @rtype: string @return: the quoted filename, all unsafe characters encoded """ - pagename = pagename.replace(u' ', u'_') pagename = pagename.encode(charset) return urllib.quote(pagename) @@ -295,7 +294,6 @@ @rtype: string @return: quoted name, safe for any file system """ - wikiname = wikiname.replace(u' ', u'_') # " " -> "_" filename = wikiname.encode(charset) quoted = [] @@ -376,7 +374,6 @@ # raise InvalidFileNameError(filename) wikiname = decodeUserInput(wikiname, charsets) - wikiname = wikiname.replace(u'_', u' ') # "_" -> " " return wikiname # time scaling @@ -481,51 +478,8 @@ ############################################################################# ### InterWiki ############################################################################# - -def split_wiki(wikiurl): - """ - Split a wiki url. - - @param wikiurl: the url to split - @rtype: tuple - @return: (tag, tail) - """ - # !!! use a regex here! - try: - wikitag, tail = wikiurl.split(":", 1) - except ValueError: - try: - wikitag, tail = wikiurl.split("/", 1) - except ValueError: - wikitag, tail = 'Self', wikiurl - return wikitag, tail - - -def join_wiki(wikiurl, wikitail): - """ - Add a page name to an interwiki url. - - @param wikiurl: wiki url, maybe including a $PAGE placeholder - @param wikitail: page name - @rtype: string - @return: generated URL of the page in the other wiki - """ - if wikiurl.find('$PAGE') == -1: - return wikiurl + wikitail - else: - return wikiurl.replace('$PAGE', wikitail) - - -def resolve_wiki(request, wikiurl): - """ - Resolve an interwiki link. - - @param request: the request object - @param wikiurl: the InterWiki:PageName link - @rtype: tuple - @return: (wikitag, wikiurl, wikitail, err) - """ - # load map (once, and only on demand) +def load_wikimap(request): + """ load interwiki map (once, and only on demand) """ try: _interwiki_list = request.cfg._interwiki_list except AttributeError: @@ -565,15 +519,75 @@ # save for later request.cfg._interwiki_list = _interwiki_list + + return _interwiki_list + +def split_wiki(wikiurl): + """ Split a wiki url, e.g: + + 'MoinMoin:FrontPage' -> "MoinMoin", "FrontPage", "" + 'FrontPage' -> "Self", "FrontPage", "" + 'MoinMoin:"Page with blanks" link title' -> "MoinMoin", "Page with blanks", "link title" - # split wiki url - wikitag, tail = split_wiki(wikiurl) + can also be used for: + + 'attachment:"filename with blanks.txt" other title' -> "attachment", "filename with blanks.txt", "other title" - # return resolved url - if _interwiki_list.has_key(wikitag): - return (wikitag, _interwiki_list[wikitag], tail, False) + @param wikiurl: the url to split + @rtype: tuple + @return: (wikiname, pagename, linktext) + """ + try: + wikiname, rest = wikiurl.split(":", 1) # e.g. MoinMoin:FrontPage + except ValueError: + try: + wikiname, rest = wikiurl.split("/", 1) # for what is this used? 
+ except ValueError: + wikiname, rest = 'Self', wikiurl + first_char = rest[0] + if first_char in "'\"": # quoted pagename + pagename_linktext = rest[1:].split(first_char, 1) + else: # not quoted, split on whitespace + pagename_linktext = rest.split(None, 1) + if len(pagename_linktext) == 1: + pagename, linktext = pagename_linktext[0], "" else: - return (wikitag, request.getScriptname(), "/InterWiki", True) + pagename, linktext = pagename_linktext + linktext = linktext.strip() + return wikiname, pagename, linktext + +def resolve_wiki(request, wikiurl): + """ Resolve an interwiki link. + + @param request: the request object + @param wikiurl: the InterWiki:PageName link + @rtype: tuple + @return: (wikitag, wikiurl, wikitail, err) + """ + _interwiki_list = load_wikimap(request) + wikiname, pagename, linktext = split_wiki(wikiurl) + if _interwiki_list.has_key(wikiname): + return (wikiname, _interwiki_list[wikiname], pagename, False) + else: + return (wikiname, request.getScriptname(), "/InterWiki", True) + +def join_wiki(wikiurl, wikitail): + """ + Add a (url_quoted) page name to an interwiki url. + + Note: We can't know what kind of URL quoting a remote wiki expects. + We just use a utf-8 encoded string with standard URL quoting. + + @param wikiurl: wiki url, maybe including a $PAGE placeholder + @param wikitail: page name + @rtype: string + @return: generated URL of the page in the other wiki + """ + wikitail = url_quote(wikitail) + if '$PAGE' in wikiurl: + return wikiurl.replace('$PAGE', wikitail) + else: + return wikiurl + wikitail ############################################################################# @@ -761,33 +775,57 @@ else: return u'["%s"]' % pagename +############################################################################# +### mimetype support +############################################################################# +import mimetypes + +MIMETYPES_MORE = { + # OpenOffice 2.x & other open document stuff + '.odt': 'application/vnd.oasis.opendocument.text', + '.ods': 'application/vnd.oasis.opendocument.spreadsheet', + '.odp': 'application/vnd.oasis.opendocument.presentation', + '.odg': 'application/vnd.oasis.opendocument.graphics', + '.odc': 'application/vnd.oasis.opendocument.chart', + '.odf': 'application/vnd.oasis.opendocument.formula', + '.odb': 'application/vnd.oasis.opendocument.database', + '.odi': 'application/vnd.oasis.opendocument.image', + '.odm': 'application/vnd.oasis.opendocument.text-master', + '.ott': 'application/vnd.oasis.opendocument.text-template', + '.ots': 'application/vnd.oasis.opendocument.spreadsheet-template', + '.otp': 'application/vnd.oasis.opendocument.presentation-template', + '.otg': 'application/vnd.oasis.opendocument.graphics-template', +} +[mimetypes.add_type(mimetype, ext, True) for ext, mimetype in MIMETYPES_MORE.items()] + +MIMETYPES_sanitize_mapping = { + # this stuff is text, but got application/* for unknown reasons + ('application', 'docbook+xml'): ('text', 'docbook'), + ('application', 'x-latex'): ('text', 'latex'), + ('application', 'x-tex'): ('text', 'tex'), + ('application', 'javascript'): ('text', 'javascript'), +} + +MIMETYPES_spoil_mapping = {} # inverse mapping of above +for key, value in MIMETYPES_sanitize_mapping.items(): + MIMETYPES_spoil_mapping[value] = key + + # mimetype stuff ------------------------------------------------------------ class MimeType(object): """ represents a mimetype like text/plain """ - sanitize_mapping = { - # this stuff is text, but got application/* for unknown reasons - ('application', 
'docbook+xml'): ('text', 'docbook'), - ('application', 'x-latex'): ('text', 'latex'), - ('application', 'x-tex'): ('text', 'tex'), - ('application', 'javascript'): ('text', 'javascript'), - } - spoil_mapping = {} # inverse mapping of above def __init__(self, mimestr=None, filename=None): self.major = self.minor = None # sanitized mime type and subtype self.params = {} # parameters like "charset" or others self.charset = None # this stays None until we know for sure! - for key, value in self.sanitize_mapping.items(): - self.spoil_mapping[value] = key - if mimestr: self.parse_mimetype(mimestr) elif filename: self.parse_filename(filename) def parse_filename(self, filename): - import mimetypes mtype, encoding = mimetypes.guess_type(filename) if mtype is None: mtype = 'application/octet-stream' @@ -844,13 +882,13 @@ readable text, we will return some text/* mimetype, not application/*, because we need text/plain as fallback and not application/octet-stream. """ - self.major, self.minor = self.sanitize_mapping.get((self.major, self.minor), (self.major, self.minor)) + self.major, self.minor = MIMETYPES_sanitize_mapping.get((self.major, self.minor), (self.major, self.minor)) def spoil(self): """ this returns something conformant to /etc/mime.type or IANA as a string, kind of inverse operation of sanitize(), but doesn't change self """ - major, minor = self.spoil_mapping.get((self.major, self.minor), (self.major, self.minor)) + major, minor = MIMETYPES_spoil_mapping.get((self.major, self.minor), (self.major, self.minor)) return self.content_type(major, minor) def content_type(self, major=None, minor=None, charset=None, params=None):
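For reference, the reworked interwiki helpers behave roughly like the standalone sketch below (simplified: it drops the '/' fallback split and calls urllib.quote directly where the real join_wiki() goes through MoinMoin's url_quote):

    import urllib

    def split_wiki(wikiurl):
        try:
            wikiname, rest = wikiurl.split(":", 1)
        except ValueError:
            wikiname, rest = 'Self', wikiurl
        if rest and rest[0] in "'\"":            # quoted pagename
            parts = rest[1:].split(rest[0], 1)
        else:                                    # not quoted, split on whitespace
            parts = rest.split(None, 1)
        if len(parts) == 1:
            return wikiname, parts[0], ""
        return wikiname, parts[0], parts[1].strip()

    def join_wiki(wikiurl, wikitail):
        wikitail = urllib.quote(wikitail.encode('utf-8'))
        if '$PAGE' in wikiurl:
            return wikiurl.replace('$PAGE', wikitail)
        return wikiurl + wikitail

    # split_wiki(u'MoinMoin:"Page with blanks" link title')
    #     -> (u'MoinMoin', u'Page with blanks', u'link title')
    # join_wiki(u'http://example.org/$PAGE', u'Page with blanks')
    #     -> u'http://example.org/Page%20with%20blanks'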
--- a/docs/CHANGES Sun Jun 18 13:58:08 2006 +0200
+++ b/docs/CHANGES Thu Jun 29 01:05:53 2006 +0200
@@ -28,6 +28,33 @@
   and improving it and after having made a backup with some other, proven
   method. USE BOTH ON YOUR OWN RISK!
 
+Branch moin-1.6-xapian:
+  New Features:
+    * Added Xapian (see http://xapian.org/) based indexed search code.
+      Our implementation is still buggy; only use it if you want to help
+      debug it or to implement / test indexing filters (see
+      MoinMoin/filter/). To use this:
+      * Install xapian-core and xapian-bindings on your machine.
+        We used 0.9.4, but newer code should hopefully work, too.
+      * cfg.xapian_search = True
+      * Execute this to build the index:
+        $ moin ... index build # indexes pages and attachments
+        $ moin ... index build --files=files.lst # same plus a list of files
+        You should run those commands as the same user you use for your wiki,
+        usually the webserver userid, e.g.:
+        $ sudo -u www-data moin --config=... --wiki-url=wiki.example.org/ \
+          index build --files=files.lst
+  ToDo:
+    * fix/improve query parsing (xapian_term member functions)
+    * fix/improve evaluation of search results
+    * maybe add some "xapian native query" mode (can we make it work without
+      the _moinSearch post-processing? not possible, as it uses the same query)
+
+  Other Changes:
+    * Removed Lupy based indexed search code. If you were brave enough to
+      use cfg.lupy_search, you may want to try cfg.xapian_search instead.
+
+
 Version 1.6.current:
     This is the active development branch. All changes get done here and
     critical stuff gets committed with -m "... (backport to 1.5)" and then
@@ -72,16 +99,35 @@
       types because the official ones suck)
     * renamed parsers to module names representing sane mimetypes, e.g.:
       parser.wiki -> parser.text_moin_wiki
 
-    * Added thread_monitor. It can be activated using:
+    * Added thread_monitor debugging aid. It can be activated using:
       from MoinMoin.util import thread_monitor; thread_monitor.activate_hook()
+      and then triggered by requesting URL ...?action=thread_monitor - please
+      be aware that monitoring threads has a big performance impact on its own,
+      so you only want to enable this temporarily for debugging.
+      By default, it dumps its output to the data_dir as tm_<timestamp>.log;
+      you can change this at the bottom of action/thread_monitor.py if you want
+      to see the output in your browser.
     * Introduced scope parameter to CacheEntry() - if you specify 'farm', it
       will cache into a common directory for all wikis in the same farm, if
       you specify 'wiki', it will use a cache directory per wiki and if you
       specify 'item', it will use a cache directory per item (== per page).
       Creating a CacheEntry without explicit scope is DEPRECATED.
     * smileys moved from MoinMoin.config to MoinMoin.theme
+    * removed all _ magic in URLs and filenames
+      TODO: write mig script for data_dir
+      TODO: make blanks in interwiki pagelinks possible
 
     New Features:
+    * Removed the "underscore in URL" == "blank in pagename" magic - it caused
+      more trouble than it was worth. If you still want a _ in the URL, just
+      put a _ into the pagename.
+    * Introduced quoting for pagenames and new, easier link markup:
+      * ["Page with blanks" but different link text]
+        NOTE: using [:Page with blanks:but different link text] is DEPRECATED.
+      * ["/Sub Page" with different link text]
+      * MoinMoin:"Page with blanks"
+      * [wiki:MoinMoin:"Page with blanks" different link text]
+      * attachment:"blanks are evil.txt"
     * FeatureRequests/WikiEmailIntegration TODO:make some help page when stable
     * HTML parser (called "html") that allows you to use HTML on the page.
@@ -94,6 +140,9 @@
       To use TLS/SSL support you must also install the TLSLite library
       (http://trevp.net/tlslite/). Version 0.3.8 was used for development
       and testing.
+    * cfg.log_reverse_dns_lookups [default: True] - you can set this to False
+      if reverse DNS lookups are broken in your network (leading to long delays
+      on page saves). With False, edit-log will only contain the IP, not hostname.
 
   Bugfixes:
     * on action "info" page, "revert" link will not be displayed for empty page
@@ -106,6 +155,8 @@
     * fixed smiley caching bug (smileys didn't change theme)
     * fixed backtrace when user removed css_url entry from user_form_fields
     * Fixed the output of macro and "attachment:" usages of the rst parser.
+    * Removed Twisted request object reverse DNS lookup
+    * cfg.editor_quickhelp was not parsed with the wiki parser when customized
 
   Other changes:
     * we use (again) the same browser compatibility check as FCKeditor uses
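The cfg.xapian_search switch described above goes into the wiki's Config class like any other option; a sketch, assuming the usual DefaultConfig base class of this branch (the module path and all other values here are assumptions / placeholders):

    # wikiconfig.py (sketch)
    from MoinMoin.multiconfig import DefaultConfig

    class Config(DefaultConfig):
        sitename = u'Example Wiki'           # placeholder values
        data_dir = '/srv/wiki/data'
        data_underlay_dir = '/srv/wiki/underlay'
        xapian_search = True                 # enable the indexed search described above
        #log_reverse_dns_lookups = False     # optional, see the entry above

With that in place, the index is built with the "moin ... index build" commands quoted in the entry above.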
--- a/docs/CHANGES.fpletz Sun Jun 18 13:58:08 2006 +0200
+++ b/docs/CHANGES.fpletz Thu Jun 29 01:05:53 2006 +0200
@@ -1,6 +1,3 @@
-Please use your CHANGES.$yourname for recording your changes you do while
-Google Summer of Code.
-
 Branch moin/1.6-xapian-fpletz
 =============================
 
@@ -8,10 +5,12 @@
     * ...
 
   ToDo:
-    * ...
+    * Manually parse prefixes (e.g. title:) in MoinMoin.Xapian.Index
+      right before searching
+    * Mock up the new search UI
 
   New Features:
-    * ...
+    * TBD
 
   Bugfixes (only stuff that is buggy in moin/1.6 main branch):
     * ...
@@ -25,9 +24,12 @@
 Diary
 =====
 
-Please make at least one entry per day (and commit it) about what your work was about.
-2006-05-29 ...
-2006-05-30 ...
-2006-05-31 ...
+2006-06-10 Changed xapian_term() functions to return xapian.Query objects
+but without touching the prefixes as we don't have a prefixMap yet. Will
+implement this in MoinMoin.Xapian.Index. AndExpression needed some more
+tweaking to use AND_NOT because Xapian doesn't provide a pure NOT. Should
+be no issue with OrExpression as _moinSearch handles this correctly.
+2006-06-11
+
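The AND_NOT remark in the diary entry can be pictured with the Xapian bindings directly (sketch; needs xapian-bindings installed, operator spelling as in the 0.9.x Python API):

    import xapian

    linux = xapian.Query("linux")
    windows = xapian.Query("windows")
    # Xapian has no pure NOT operator; exclusion is expressed as "a AND_NOT b"
    query = xapian.Query(xapian.Query.OP_AND_NOT, linux, windows)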
--- a/docs/Lupy-0.2.1/LICENSE Sun Jun 18 13:58:08 2006 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,504 +0,0 @@ - GNU LESSER GENERAL PUBLIC LICENSE - Version 2.1, February 1999 - - Copyright (C) 1991, 1999 Free Software Foundation, Inc. - 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - Everyone is permitted to copy and distribute verbatim copies - of this license document, but changing it is not allowed. - -[This is the first released version of the Lesser GPL. It also counts - as the successor of the GNU Library Public License, version 2, hence - the version number 2.1.] - - Preamble - - The licenses for most software are designed to take away your -freedom to share and change it. By contrast, the GNU General Public -Licenses are intended to guarantee your freedom to share and change -free software--to make sure the software is free for all its users. - - This license, the Lesser General Public License, applies to some -specially designated software packages--typically libraries--of the -Free Software Foundation and other authors who decide to use it. You -can use it too, but we suggest you first think carefully about whether -this license or the ordinary General Public License is the better -strategy to use in any particular case, based on the explanations below. - - When we speak of free software, we are referring to freedom of use, -not price. Our General Public Licenses are designed to make sure that -you have the freedom to distribute copies of free software (and charge -for this service if you wish); that you receive source code or can get -it if you want it; that you can change the software and use pieces of -it in new free programs; and that you are informed that you can do -these things. - - To protect your rights, we need to make restrictions that forbid -distributors to deny you these rights or to ask you to surrender these -rights. These restrictions translate to certain responsibilities for -you if you distribute copies of the library or if you modify it. - - For example, if you distribute copies of the library, whether gratis -or for a fee, you must give the recipients all the rights that we gave -you. You must make sure that they, too, receive or can get the source -code. If you link other code with the library, you must provide -complete object files to the recipients, so that they can relink them -with the library after making changes to the library and recompiling -it. And you must show them these terms so they know their rights. - - We protect your rights with a two-step method: (1) we copyright the -library, and (2) we offer you this license, which gives you legal -permission to copy, distribute and/or modify the library. - - To protect each distributor, we want to make it very clear that -there is no warranty for the free library. Also, if the library is -modified by someone else and passed on, the recipients should know -that what they have is not the original version, so that the original -author's reputation will not be affected by problems that might be -introduced by others. - - Finally, software patents pose a constant threat to the existence of -any free program. We wish to make sure that a company cannot -effectively restrict the users of a free program by obtaining a -restrictive license from a patent holder. Therefore, we insist that -any patent license obtained for a version of the library must be -consistent with the full freedom of use specified in this license. 
- - Most GNU software, including some libraries, is covered by the -ordinary GNU General Public License. This license, the GNU Lesser -General Public License, applies to certain designated libraries, and -is quite different from the ordinary General Public License. We use -this license for certain libraries in order to permit linking those -libraries into non-free programs. - - When a program is linked with a library, whether statically or using -a shared library, the combination of the two is legally speaking a -combined work, a derivative of the original library. The ordinary -General Public License therefore permits such linking only if the -entire combination fits its criteria of freedom. The Lesser General -Public License permits more lax criteria for linking other code with -the library. - - We call this license the "Lesser" General Public License because it -does Less to protect the user's freedom than the ordinary General -Public License. It also provides other free software developers Less -of an advantage over competing non-free programs. These disadvantages -are the reason we use the ordinary General Public License for many -libraries. However, the Lesser license provides advantages in certain -special circumstances. - - For example, on rare occasions, there may be a special need to -encourage the widest possible use of a certain library, so that it becomes -a de-facto standard. To achieve this, non-free programs must be -allowed to use the library. A more frequent case is that a free -library does the same job as widely used non-free libraries. In this -case, there is little to gain by limiting the free library to free -software only, so we use the Lesser General Public License. - - In other cases, permission to use a particular library in non-free -programs enables a greater number of people to use a large body of -free software. For example, permission to use the GNU C Library in -non-free programs enables many more people to use the whole GNU -operating system, as well as its variant, the GNU/Linux operating -system. - - Although the Lesser General Public License is Less protective of the -users' freedom, it does ensure that the user of a program that is -linked with the Library has the freedom and the wherewithal to run -that program using a modified version of the Library. - - The precise terms and conditions for copying, distribution and -modification follow. Pay close attention to the difference between a -"work based on the library" and a "work that uses the library". The -former contains code derived from the library, whereas the latter must -be combined with the library in order to run. - - GNU LESSER GENERAL PUBLIC LICENSE - TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION - - 0. This License Agreement applies to any software library or other -program which contains a notice placed by the copyright holder or -other authorized party saying it may be distributed under the terms of -this Lesser General Public License (also called "this License"). -Each licensee is addressed as "you". - - A "library" means a collection of software functions and/or data -prepared so as to be conveniently linked with application programs -(which use some of those functions and data) to form executables. - - The "Library", below, refers to any such software library or work -which has been distributed under these terms. 
A "work based on the -Library" means either the Library or any derivative work under -copyright law: that is to say, a work containing the Library or a -portion of it, either verbatim or with modifications and/or translated -straightforwardly into another language. (Hereinafter, translation is -included without limitation in the term "modification".) - - "Source code" for a work means the preferred form of the work for -making modifications to it. For a library, complete source code means -all the source code for all modules it contains, plus any associated -interface definition files, plus the scripts used to control compilation -and installation of the library. - - Activities other than copying, distribution and modification are not -covered by this License; they are outside its scope. The act of -running a program using the Library is not restricted, and output from -such a program is covered only if its contents constitute a work based -on the Library (independent of the use of the Library in a tool for -writing it). Whether that is true depends on what the Library does -and what the program that uses the Library does. - - 1. You may copy and distribute verbatim copies of the Library's -complete source code as you receive it, in any medium, provided that -you conspicuously and appropriately publish on each copy an -appropriate copyright notice and disclaimer of warranty; keep intact -all the notices that refer to this License and to the absence of any -warranty; and distribute a copy of this License along with the -Library. - - You may charge a fee for the physical act of transferring a copy, -and you may at your option offer warranty protection in exchange for a -fee. - - 2. You may modify your copy or copies of the Library or any portion -of it, thus forming a work based on the Library, and copy and -distribute such modifications or work under the terms of Section 1 -above, provided that you also meet all of these conditions: - - a) The modified work must itself be a software library. - - b) You must cause the files modified to carry prominent notices - stating that you changed the files and the date of any change. - - c) You must cause the whole of the work to be licensed at no - charge to all third parties under the terms of this License. - - d) If a facility in the modified Library refers to a function or a - table of data to be supplied by an application program that uses - the facility, other than as an argument passed when the facility - is invoked, then you must make a good faith effort to ensure that, - in the event an application does not supply such function or - table, the facility still operates, and performs whatever part of - its purpose remains meaningful. - - (For example, a function in a library to compute square roots has - a purpose that is entirely well-defined independent of the - application. Therefore, Subsection 2d requires that any - application-supplied function or table used by this function must - be optional: if the application does not supply it, the square - root function must still compute square roots.) - -These requirements apply to the modified work as a whole. If -identifiable sections of that work are not derived from the Library, -and can be reasonably considered independent and separate works in -themselves, then this License, and its terms, do not apply to those -sections when you distribute them as separate works. 
But when you -distribute the same sections as part of a whole which is a work based -on the Library, the distribution of the whole must be on the terms of -this License, whose permissions for other licensees extend to the -entire whole, and thus to each and every part regardless of who wrote -it. - -Thus, it is not the intent of this section to claim rights or contest -your rights to work written entirely by you; rather, the intent is to -exercise the right to control the distribution of derivative or -collective works based on the Library. - -In addition, mere aggregation of another work not based on the Library -with the Library (or with a work based on the Library) on a volume of -a storage or distribution medium does not bring the other work under -the scope of this License. - - 3. You may opt to apply the terms of the ordinary GNU General Public -License instead of this License to a given copy of the Library. To do -this, you must alter all the notices that refer to this License, so -that they refer to the ordinary GNU General Public License, version 2, -instead of to this License. (If a newer version than version 2 of the -ordinary GNU General Public License has appeared, then you can specify -that version instead if you wish.) Do not make any other change in -these notices. - - Once this change is made in a given copy, it is irreversible for -that copy, so the ordinary GNU General Public License applies to all -subsequent copies and derivative works made from that copy. - - This option is useful when you wish to copy part of the code of -the Library into a program that is not a library. - - 4. You may copy and distribute the Library (or a portion or -derivative of it, under Section 2) in object code or executable form -under the terms of Sections 1 and 2 above provided that you accompany -it with the complete corresponding machine-readable source code, which -must be distributed under the terms of Sections 1 and 2 above on a -medium customarily used for software interchange. - - If distribution of object code is made by offering access to copy -from a designated place, then offering equivalent access to copy the -source code from the same place satisfies the requirement to -distribute the source code, even though third parties are not -compelled to copy the source along with the object code. - - 5. A program that contains no derivative of any portion of the -Library, but is designed to work with the Library by being compiled or -linked with it, is called a "work that uses the Library". Such a -work, in isolation, is not a derivative work of the Library, and -therefore falls outside the scope of this License. - - However, linking a "work that uses the Library" with the Library -creates an executable that is a derivative of the Library (because it -contains portions of the Library), rather than a "work that uses the -library". The executable is therefore covered by this License. -Section 6 states terms for distribution of such executables. - - When a "work that uses the Library" uses material from a header file -that is part of the Library, the object code for the work may be a -derivative work of the Library even though the source code is not. -Whether this is true is especially significant if the work can be -linked without the Library, or if the work is itself a library. The -threshold for this to be true is not precisely defined by law. 
- - If such an object file uses only numerical parameters, data -structure layouts and accessors, and small macros and small inline -functions (ten lines or less in length), then the use of the object -file is unrestricted, regardless of whether it is legally a derivative -work. (Executables containing this object code plus portions of the -Library will still fall under Section 6.) - - Otherwise, if the work is a derivative of the Library, you may -distribute the object code for the work under the terms of Section 6. -Any executables containing that work also fall under Section 6, -whether or not they are linked directly with the Library itself. - - 6. As an exception to the Sections above, you may also combine or -link a "work that uses the Library" with the Library to produce a -work containing portions of the Library, and distribute that work -under terms of your choice, provided that the terms permit -modification of the work for the customer's own use and reverse -engineering for debugging such modifications. - - You must give prominent notice with each copy of the work that the -Library is used in it and that the Library and its use are covered by -this License. You must supply a copy of this License. If the work -during execution displays copyright notices, you must include the -copyright notice for the Library among them, as well as a reference -directing the user to the copy of this License. Also, you must do one -of these things: - - a) Accompany the work with the complete corresponding - machine-readable source code for the Library including whatever - changes were used in the work (which must be distributed under - Sections 1 and 2 above); and, if the work is an executable linked - with the Library, with the complete machine-readable "work that - uses the Library", as object code and/or source code, so that the - user can modify the Library and then relink to produce a modified - executable containing the modified Library. (It is understood - that the user who changes the contents of definitions files in the - Library will not necessarily be able to recompile the application - to use the modified definitions.) - - b) Use a suitable shared library mechanism for linking with the - Library. A suitable mechanism is one that (1) uses at run time a - copy of the library already present on the user's computer system, - rather than copying library functions into the executable, and (2) - will operate properly with a modified version of the library, if - the user installs one, as long as the modified version is - interface-compatible with the version that the work was made with. - - c) Accompany the work with a written offer, valid for at - least three years, to give the same user the materials - specified in Subsection 6a, above, for a charge no more - than the cost of performing this distribution. - - d) If distribution of the work is made by offering access to copy - from a designated place, offer equivalent access to copy the above - specified materials from the same place. - - e) Verify that the user has already received a copy of these - materials or that you have already sent this user a copy. - - For an executable, the required form of the "work that uses the -Library" must include any data and utility programs needed for -reproducing the executable from it. 
However, as a special exception, -the materials to be distributed need not include anything that is -normally distributed (in either source or binary form) with the major -components (compiler, kernel, and so on) of the operating system on -which the executable runs, unless that component itself accompanies -the executable. - - It may happen that this requirement contradicts the license -restrictions of other proprietary libraries that do not normally -accompany the operating system. Such a contradiction means you cannot -use both them and the Library together in an executable that you -distribute. - - 7. You may place library facilities that are a work based on the -Library side-by-side in a single library together with other library -facilities not covered by this License, and distribute such a combined -library, provided that the separate distribution of the work based on -the Library and of the other library facilities is otherwise -permitted, and provided that you do these two things: - - a) Accompany the combined library with a copy of the same work - based on the Library, uncombined with any other library - facilities. This must be distributed under the terms of the - Sections above. - - b) Give prominent notice with the combined library of the fact - that part of it is a work based on the Library, and explaining - where to find the accompanying uncombined form of the same work. - - 8. You may not copy, modify, sublicense, link with, or distribute -the Library except as expressly provided under this License. Any -attempt otherwise to copy, modify, sublicense, link with, or -distribute the Library is void, and will automatically terminate your -rights under this License. However, parties who have received copies, -or rights, from you under this License will not have their licenses -terminated so long as such parties remain in full compliance. - - 9. You are not required to accept this License, since you have not -signed it. However, nothing else grants you permission to modify or -distribute the Library or its derivative works. These actions are -prohibited by law if you do not accept this License. Therefore, by -modifying or distributing the Library (or any work based on the -Library), you indicate your acceptance of this License to do so, and -all its terms and conditions for copying, distributing or modifying -the Library or works based on it. - - 10. Each time you redistribute the Library (or any work based on the -Library), the recipient automatically receives a license from the -original licensor to copy, distribute, link with or modify the Library -subject to these terms and conditions. You may not impose any further -restrictions on the recipients' exercise of the rights granted herein. -You are not responsible for enforcing compliance by third parties with -this License. - - 11. If, as a consequence of a court judgment or allegation of patent -infringement or for any other reason (not limited to patent issues), -conditions are imposed on you (whether by court order, agreement or -otherwise) that contradict the conditions of this License, they do not -excuse you from the conditions of this License. If you cannot -distribute so as to satisfy simultaneously your obligations under this -License and any other pertinent obligations, then as a consequence you -may not distribute the Library at all. 
For example, if a patent -license would not permit royalty-free redistribution of the Library by -all those who receive copies directly or indirectly through you, then -the only way you could satisfy both it and this License would be to -refrain entirely from distribution of the Library. - -If any portion of this section is held invalid or unenforceable under any -particular circumstance, the balance of the section is intended to apply, -and the section as a whole is intended to apply in other circumstances. - -It is not the purpose of this section to induce you to infringe any -patents or other property right claims or to contest validity of any -such claims; this section has the sole purpose of protecting the -integrity of the free software distribution system which is -implemented by public license practices. Many people have made -generous contributions to the wide range of software distributed -through that system in reliance on consistent application of that -system; it is up to the author/donor to decide if he or she is willing -to distribute software through any other system and a licensee cannot -impose that choice. - -This section is intended to make thoroughly clear what is believed to -be a consequence of the rest of this License. - - 12. If the distribution and/or use of the Library is restricted in -certain countries either by patents or by copyrighted interfaces, the -original copyright holder who places the Library under this License may add -an explicit geographical distribution limitation excluding those countries, -so that distribution is permitted only in or among countries not thus -excluded. In such case, this License incorporates the limitation as if -written in the body of this License. - - 13. The Free Software Foundation may publish revised and/or new -versions of the Lesser General Public License from time to time. -Such new versions will be similar in spirit to the present version, -but may differ in detail to address new problems or concerns. - -Each version is given a distinguishing version number. If the Library -specifies a version number of this License which applies to it and -"any later version", you have the option of following the terms and -conditions either of that version or of any later version published by -the Free Software Foundation. If the Library does not specify a -license version number, you may choose any version ever published by -the Free Software Foundation. - - 14. If you wish to incorporate parts of the Library into other free -programs whose distribution conditions are incompatible with these, -write to the author to ask for permission. For software which is -copyrighted by the Free Software Foundation, write to the Free -Software Foundation; we sometimes make exceptions for this. Our -decision will be guided by the two goals of preserving the free status -of all derivatives of our free software and of promoting the sharing -and reuse of software generally. - - NO WARRANTY - - 15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO -WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW. -EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR -OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY -KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE -LIBRARY IS WITH YOU. 
SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME -THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. - - 16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN -WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY -AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU -FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR -CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE -LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING -RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A -FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF -SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH -DAMAGES. - - END OF TERMS AND CONDITIONS - - How to Apply These Terms to Your New Libraries - - If you develop a new library, and you want it to be of the greatest -possible use to the public, we recommend making it free software that -everyone can redistribute and change. You can do so by permitting -redistribution under these terms (or, alternatively, under the terms of the -ordinary General Public License). - - To apply these terms, attach the following notices to the library. It is -safest to attach them to the start of each source file to most effectively -convey the exclusion of warranty; and each file should have at least the -"copyright" line and a pointer to where the full notice is found. - - <one line to give the library's name and a brief idea of what it does.> - Copyright (C) <year> <name of author> - - This library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - This library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with this library; if not, write to the Free Software - Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - -Also add information on how to contact you by electronic and paper mail. - -You should also get your employer (if you work as a programmer) or your -school, if any, to sign a "copyright disclaimer" for the library, if -necessary. Here is a sample; alter the names: - - Yoyodyne, Inc., hereby disclaims all copyright interest in the - library `Frob' (a library for tweaking knobs) written by James Random Hacker. - - <signature of Ty Coon>, 1 April 1990 - Ty Coon, President of Vice - -That's all there is to it! - -
--- a/docs/Lupy-0.2.1/README.txt Sun Jun 18 13:58:08 2006 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,64 +0,0 @@ -Lupy full text indexer r0.2.1 ------------------------------ - -**What is Lupy?** - Lupy is a port of the excellent Jakarta Lucene 1.2 into - Python. - -**What can I do with Lupy?** - Lupy is a full text indexer and search engine. It can be used to - index text documents such as web pages, source code, email, etc. - -**What is in this release?** - Most of Lucene 1.2 is in Lupy 0.2. Lupy supports text indexing - producing files that are binary compatible with Lucene. Index - creation, update and searching are supported. - - This release supports TermQuery, PhraseQuery and BooleanQuery. - -**What is not in this release?** - There is no locking or synchronization. - - The query parser has not been ported, nor all of the analysis/doc - parsing classes. Queries can be built using the basic building blocks. - - Tokenization is done with a simple regexp; there is no stop-lists, - Porter stemming, StandardAnalyzer or German analyzer. - - This release does not contain the following queries: - - - QueryParser - - MultiTermQuery - - FuzzyQuery - - WildCardQuery - - PrefixQuery - - RangeQuery - - Sloppy phrase queries - - DateField has not been ported. - - Merging of multiple multi-segment indices is not supported. - -**How do I get started?** - Look in the examples directory. - - Most of the Lucene documentation is relevant to Lupy: - - - http://jakarta.apache.org/lucene - - http://www.onjava.com/pub/a/onjava/2003/01/15/lucene.html - - http://darksleep.com/lucene/ - -**Performance** - Java is faster. - - -**Acknowledgements** - Many thanks to Doug Cutting and the Jakarta Lucene team for building - and enhancing such a high quality piece of open source software. - - Glyph Lefkowitz for serving as my language guru for Python and Java. - - Allen Short did the refactoring for the 0.2 release. - - I hope you find what you are searching for ;-) - amir@divmod.org
--- a/docs/Lupy-0.2.1/releasenotes.txt Sun Jun 18 13:58:08 2006 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,22 +0,0 @@ -0.2.0 Release notes - -This release brings major reorganization of the code, grouping classes -into larger modules instead of the original Java style, as well as -rewriting several of the classes to be more Pythonic, removing -extraneous data structures and so forth; overall, the code has been -reduced by 20%. The public interface, indexer.py, has not changed; -other classes have not been changed significantly, other than being -moved to new modules. - -Also, this release changes the interface for analyzers: they are now -iterable objects that take one argument, the string to be tokenized, -and produce tokens, rather than the analysis classes ported from -Lucene. This improves performance while simplifying the code. If an -analyzer is not specified, lupy.index.documentwriter.standardTokenizer -is used. The regex used by that generator is re.compile("\\w+", re.U), -and the tokens are downcased before being stored. - -Along with this improvement in tokenization comes better Unicode -support; all text is now handled as Unicode strings. There is a -simple test for the indexing and retrieval of documents containing -non-ASCII data.
--- a/setup.py Sun Jun 18 13:58:08 2006 +0200 +++ b/setup.py Thu Jun 29 01:05:53 2006 +0200 @@ -216,9 +216,9 @@ 'MoinMoin.script.cli', 'MoinMoin.script.export', 'MoinMoin.script.import', + 'MoinMoin.script.index', 'MoinMoin.script.maint', 'MoinMoin.script.migration', - 'MoinMoin.script.lupy', 'MoinMoin.script.old', 'MoinMoin.script.old.migration', 'MoinMoin.script.old.xmlrpc-tools', @@ -226,9 +226,7 @@ 'MoinMoin.server', 'MoinMoin.stats', 'MoinMoin.support', - 'MoinMoin.support.lupy', - 'MoinMoin.support.lupy.index', - 'MoinMoin.support.lupy.search', + 'MoinMoin.support.xapwrap', 'MoinMoin.theme', 'MoinMoin.util', 'MoinMoin.widget',
--- a/wiki/config/more_samples/ldap_smb_farmconfig.py Sun Jun 18 13:58:08 2006 +0200 +++ b/wiki/config/more_samples/ldap_smb_farmconfig.py Thu Jun 29 01:05:53 2006 +0200 @@ -87,7 +87,8 @@ #ldap_bindpw = 'secret' #or we can use the username and password we got from the user: - ldap_binddn = '%(username)s@example.org' # DN we use for first bind + ldap_binddn = '%(username)s@example.org' # DN we use for first bind (AD) + #ldap_binddn = 'cn=admin,dc=example,dc=org' # DN we use for first bind (OpenLDAP) ldap_bindpw = '%(password)s' # password we use for first bind ldap_base = 'ou=SOMEUNIT,dc=example,dc=org' # base DN we use for searching
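As the comments suggest, the %(username)s and %(password)s placeholders are filled with the values entered at login; the substitution itself is plain Python %-formatting (illustrative sketch, values hypothetical):

    binddn_template = '%(username)s@example.org'      # the AD-style template above
    binddn = binddn_template % {'username': 'jdoe', 'password': 'secret'}
    # binddn == 'jdoe@example.org'; ldap_bindpw = '%(password)s' is handled the same way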