# view MoinMoin/script/maint/ @ 808:4411b124f647 pytest2
#
# moin index --action build --procs 1 --> use simple writer, not MultiSegmentWriter
# author Thomas Waldmann <tw AT waldmann-edv DOT de>
# date Wed, 31 Aug 2011 00:38:43 +0200
# parents 2dfaef39854f
# children b38ec86abed0
# line wrap: on
# line source
# Copyright: 2011 MoinMoin:MichaelMayorov
# License: GNU GPL v2 (or any later version), see LICENSE.txt for details.

"""
MoinMoin - Manage whoosh indexes
"""

import os, datetime

from flask import current_app as app
from flask import g as flaskg
from flaskext.script import Command, Option
from whoosh.filedb.multiproc import MultiSegmentWriter
from whoosh.index import open_dir, create_in, exists_in
from whoosh.index import EmptyIndexError

from import WhooshIndex
from MoinMoin.config import MTIME, NAME, CONTENTTYPE
from MoinMoin.error import FatalError
from import NoSuchItemError, NoSuchRevisionError
from MoinMoin.util.mime import Type
from import backend_to_index
from import convert_to_indexable

from MoinMoin import log
logging = log.getLogger(__name__)

# (index name, schema name) pairs: one for the index of latest revisions,
# one for the index of all revisions, plus a list holding both pairs.
all_indexname_schema = "all_revisions_index", "all_revisions_schema"
latest_indexname_schema = "latest_revisions_index", "latest_revisions_schema"
both_indexnames_schemas = [
    latest_indexname_schema,
    all_indexname_schema,
]

class IndexOperations(Command):
    """
    Script command that builds, updates, cleans, moves or shows the whoosh
    indexes of latest revisions and of all revisions.
    """
    description = 'Build indexes'

    option_list = (
        Option('--for', required=True, dest='indexname', type=str, choices=("all-revs", "latest-revs", "both"),
            help='For what type of indexes we will use action'),
        Option('--action', required=True, dest='action', type=str, choices=("build", "update", "clean", "move", "show"),
            help="""
                  Action for given indexes:
                  build -- Build in index_dir_tmp
                  update -- Update in index_dir
                  clean -- Clean index_dir
                  move  -- Move index files from index_dir_tmp to index_dir
                  show -- Show index contents for the given index.
                 """
               ),
        Option('--procs', '-p', required=False, dest='procs', type=int, default=None,
            help='Number of processors the writer will use.'),
        Option('--limitmb', '-l', required=False, dest='limitmb', type=int, default=10,
            help='Maximum memory (in megabytes) each index-writer will use for the indexing pool.'),
    )

    def run(self, indexname, action, procs, limitmb):
        """
        Run the requested action on the selected index(es).

        :param indexname: "all-revs", "latest-revs" or "both" (value of --for)
        :param action: "build", "update", "clean", "move" or "show"
        :param procs: number of processors for the writer; 1 selects the
                      simple (non-multiprocessing) writer when building
        :param limitmb: memory limit in MB handed to MultiSegmentWriter
        :raises FatalError: if index_dir and index_dir_tmp are the same
                            directory, if an index to move is missing, or if
                            an index to show can not be opened
        """

        def build_index(indexnames_schemas):
            """
            Building in app.cfg.index_dir_tmp
            """
            indexnames = [name for name, schema in indexnames_schemas]
            if procs == 1:
                # MultiSegmentWriter sometimes has issues and is pointless for procs == 1,
                # so use the simple writer when --procs 1 is given:
                _all_rev_writer = all_rev_index.writer()
                _latest_rev_writer = latest_rev_index.writer()
            else:
                _all_rev_writer = MultiSegmentWriter(all_rev_index, procs, limitmb)
                _latest_rev_writer = MultiSegmentWriter(latest_rev_index, procs, limitmb)
            with _all_rev_writer as all_rev_writer:
                with _latest_rev_writer as latest_rev_writer:
                    for item in backend.iter_items_noindex():
                        try:
                            rev_no = None
                            if "all_revisions_index" in indexnames:
                                for rev_no in item.list_revisions():
                                    revision = item.get_revision(rev_no)
                                    rev_content = convert_to_indexable(revision)
                                    metadata = backend_to_index(revision, rev_no, all_rev_schema, rev_content, interwikiname)
                                    # NOTE(review): reconstructed - the converted fields must be
                                    # handed to the writer, otherwise nothing gets indexed.
                                    all_rev_writer.add_document(**metadata)
                            else:
                                revision = item.get_revision(-1)
                                rev_no = revision.revno
                                rev_content = convert_to_indexable(revision)
                        except NoSuchRevisionError: # item has no such revision
                            continue
                        # revision is now the latest revision of this item
                        # Compare against the None sentinel so that a revision
                        # number of 0 is not mistaken for "no revision found".
                        if "latest_revisions_index" in indexnames and rev_no is not None:
                            metadata = backend_to_index(revision, rev_no, latest_rev_schema, rev_content, interwikiname)
                            # NOTE(review): reconstructed, see above.
                            latest_rev_writer.add_document(**metadata)

        def update_index(indexnames_schemas):
            """
            Updating index in app.cfg.index_dir_tmp

            Compares the revisions each item has in the backend with the
            revisions found in the all-revisions index and adds / removes
            index documents accordingly.
            """
            indexnames = [name for name, schema in indexnames_schemas]
            create_documents = []  # (item, revision numbers missing in the index)
            delete_documents = []  # (item name, revision numbers only in the index)
            latest_documents = []  # (item, item name, highest missing revision number)
            for item in backend.iter_items_noindex():
                backend_rev_list = item.list_revisions()
                if not backend_rev_list: # If item hasn't revisions, skipping it
                    continue
                name = item.get_revision(-1)[NAME]
                index_rev_list = item_index_revs(all_rev_searcher, name)
                add_rev_nos = set(backend_rev_list) - set(index_rev_list)
                if add_rev_nos:
                    if "all_revisions_index" in indexnames:
                        create_documents.append((item, add_rev_nos))
                    if "latest_revisions_index" in indexnames:
                        latest_documents.append((item, name, max(add_rev_nos))) # Add latest revision
                remove_rev_nos = set(index_rev_list) - set(backend_rev_list)
                if remove_rev_nos:
                    if "all_revisions_index" in indexnames:
                        delete_documents.append((name, remove_rev_nos))

            # NOTE(review): the keyword arguments of the searcher lookups below
            # were reconstructed; they use the same name_exact / wikiname keys
            # as item_index_revs() - confirm against the index schema.
            if "latest_revisions_index" in indexnames and latest_documents:
                with latest_rev_index.writer() as latest_rev_writer:
                    for item, name, rev_no in latest_documents:
                        revision = item.get_revision(rev_no)
                        rev_content = convert_to_indexable(revision)
                        converted_rev = backend_to_index(revision, rev_no, latest_rev_schema, rev_content, interwikiname)
                        found = latest_rev_searcher.document(name_exact=name, wikiname=interwikiname)
                        if not found:
                            latest_rev_writer.add_document(**converted_rev)
                        # Checking that last revision is the latest
                        elif found["rev_no"] < converted_rev["rev_no"]:
                            doc_number = latest_rev_searcher.document_number(name_exact=name, wikiname=interwikiname)
                            # replace the outdated document by the newer revision
                            latest_rev_writer.delete_document(doc_number)
                            latest_rev_writer.add_document(**converted_rev)

            if "all_revisions_index" in indexnames and delete_documents:
                with all_rev_index.writer() as all_rev_writer:
                    for name, rev_nos in delete_documents:
                        for rev_no in rev_nos:
                            doc_number = all_rev_searcher.document_number(rev_no=rev_no,
                                                                          name_exact=name,
                                                                          wikiname=interwikiname)
                            # document number 0 is valid, so test for None
                            # rather than for truthiness
                            if doc_number is not None:
                                all_rev_writer.delete_document(doc_number)

            if "all_revisions_index" in indexnames and create_documents:
                with all_rev_index.writer() as all_rev_writer:
                    for item, rev_nos in create_documents:
                        for rev_no in rev_nos:
                            revision = item.get_revision(rev_no)
                            rev_content = convert_to_indexable(revision)
                            converted_rev = backend_to_index(revision, rev_no, all_rev_schema, rev_content, interwikiname)
                            all_rev_writer.add_document(**converted_rev)

        def clean_index(indexnames_schemas):
            """
            Clean given index in app.cfg.index_dir
            """
            # NOTE(review): the loop body was reconstructed. Creating the index
            # anew with whoosh's create_in() leaves an empty index of that name;
            # confirm this matches the intended way of cleaning.
            schema_for = {
                "latest_revisions_index": latest_rev_schema,
                "all_revisions_index": all_rev_schema,
            }
            for indexname, schema in indexnames_schemas:
                create_in(app.cfg.index_dir, schema_for[indexname], indexname=indexname)

        def move_index(indexnames_schemas):
            """
            Move given indexes from index_dir_tmp to index_dir
            """
            for indexname, schema in indexnames_schemas:
                if not exists_in(app.cfg.index_dir_tmp, indexname=indexname):
                    raise FatalError(u"Can't find %s in %s" % (indexname, app.cfg.index_dir_tmp))
                # NOTE(review): iterable reconstructed from the paths joined below.
                for filename in os.listdir(app.cfg.index_dir_tmp):
                    src_file = os.path.join(app.cfg.index_dir_tmp, filename)
                    dst_file = os.path.join(app.cfg.index_dir, filename)
                    # only files whose name contains the index name are moved
                    if indexname in filename and os.path.exists(src_file):
                        os.rename(src_file, dst_file)

        def show_index(indexnames_schemas):
            """
            Print documents in given index to stdout
            """
            for indexname, schema in indexnames_schemas:
                try:
                    ix = open_dir(app.cfg.index_dir, indexname=indexname)
                    # print(...) with one argument gives the same output with
                    # the print statement and with the print function
                    print("*** Revisions in %s" % indexname)
                    with ix.searcher() as searcher:
                        for rev in searcher.all_stored_fields():
                            name = rev.pop("name", u"")
                            content = rev.pop("content", u"")
                            # name first, content last, other fields sorted in between
                            for field, value in [("name", name), ] + sorted(rev.items()) + [("content", content), ]:
                                print("%s: %s" % (field, repr(value)[:70]))
                            print("\n")
                except (IOError, OSError, EmptyIndexError) as err:
                    # both values must go into one tuple, else the two-placeholder
                    # format string raises TypeError instead of reporting the error
                    raise FatalError("%s [Can not open %s index" % (str(err), indexname))

        def item_index_revs(searcher, name):
            """
            Return list of found documents for given name using index searcher
            """
            revs_found = searcher.documents(name_exact=name, wikiname=interwikiname)
            return [rev["rev_no"] for rev in revs_found]

        def do_action(action, indexnames_schemas):
            """
            Call the helper that implements the given action.
            """
            # NOTE(review): the branch bodies were reconstructed from the
            # helper names; unknown actions are ignored.
            handlers = {
                "build": build_index,
                "update": update_index,
                "clean": clean_index,
                "move": move_index,
                "show": show_index,
            }
            handler = handlers.get(action)
            if handler is not None:
                handler(indexnames_schemas)

        backend = flaskg.unprotected_storage = app.unprotected_storage
        index_object = WhooshIndex(index_dir=app.cfg.index_dir_tmp)
        interwikiname = app.cfg.interwikiname
        if os.path.samefile(app.cfg.index_dir_tmp, app.cfg.index_dir):
            raise FatalError(u"cfg.index_dir and cfg.index_dir_tmp must point to different directories.")

        latest_rev_index = index_object.latest_revisions_index
        all_rev_index = index_object.all_revisions_index

        latest_rev_schema = latest_rev_index.schema
        all_rev_schema = all_rev_index.schema

        latest_rev_searcher = latest_rev_index.searcher()
        all_rev_searcher = all_rev_index.searcher()

        if indexname == "both":
            do_action(action, both_indexnames_schemas)
        elif indexname == "all-revs":
            do_action(action, (all_indexname_schema, ))
        elif indexname == "latest-revs":
            do_action(action, (latest_indexname_schema, ))