view MoinMoin/parser/ @ 0:77665d8e2254

tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0 (automatically generated log message) imported from: moin--main--1.5--base-0
author Thomas Waldmann <>
date Thu, 22 Sep 2005 15:09:50 +0000
line wrap: on
line source

# -*- coding: iso-8859-1 -*-
    MoinMoin - DocBook-XML Parser

    This code was tested with 4Suite 1.0a4 and 1.0b1

    @copyright: 2005 by Henry Ho <henryho167 AT hotmail DOT com>
    @copyright: 2005 by MoinMoin:AlexanderSchremmer
    @license: GNU GPL, see COPYING for details.

    DOCBOOK Parser:

    - image support through Attachment
    - internal Wikilinks if a word is a strict wikiname
    - image alt is preserved
    - works with compiled xslt stylesheet for optimized performance

    - make sure you have installed the DocBook XSLT files
    - set the path to the html directory of the DocBook XSLT files in your
      wiki or farm configuration:
      docbook_html_dir = r"/usr/share/xml/docbook/stylesheet/nwalsh/html/"
      Note that this directory needs to be writable because a cache file will
      be created there.

    >How can I use Ft API for DTD validation?
    If you have PyXML installed, you can use ValidatingReader rather than
    NonvalidatingReader.  See:

import StringIO
import os.path
import cPickle
import re

from MoinMoin import caching, config, wikiutil, Page
from MoinMoin.parser.xslt import Parser as XsltParser
from MoinMoin.parser.wiki import Parser as WikiParser

# Module-level dependency list, re-exported as the Parser.Dependencies class
# attribute below. It is empty here.
# NOTE(review): presumably read by MoinMoin's caching machinery to decide
# what the rendered output depends on -- confirm against the caller.
Dependencies = []

class Parser(XsltParser):
    """
        Send XML file formatted via XSLT.

        DocBook flavour of the XSLT parser: it plugs the (pickled, pre-compiled)
        DocBook stylesheet into the processor and post-processes the HTML
        produced by the transformation (see parse_result).
    """

    caching = 1
    Dependencies = Dependencies

    def __init__(self, raw, request, **kw):
        """
        Set up stylesheet paths and the helper wiki parser.

        @param raw: raw page text, passed on to XsltParser and WikiParser
        @param request: current request; request.cfg.docbook_html_dir must
                        point to the html directory of the DocBook XSLT files
        @param kw: accepted for interface compatibility, not used here
        """
        XsltParser.__init__(self, raw, request)

        # relative path to docbook.xsl and compiled_xsl
        docbook_html_directory = request.cfg.docbook_html_dir
        self.db_xsl = os.path.join(docbook_html_directory, 'docbook.xsl')
        self.db_compiled_xsl = os.path.join(docbook_html_directory, 'db_compiled.dat')

        # NOTE(review): self.raw / self.request are presumably set by
        # XsltParser.__init__ -- they are not assigned in this class.
        self.wikiParser = WikiParser(raw = self.raw, request = self.request, pretty_url=1)
        self.key = 'docbook'

    def format(self, formatter):
        """
        Format the page; the helper wiki parser gets the same formatter.

        @param formatter: formatter handed to the wiki parser and to
                          XsltParser.format
        """
        self.wikiParser.formatter = formatter
        XsltParser.format(self, formatter)

    def append_stylesheet(self):
        """
            virtual function, for docbook parser

            Makes sure the compiled (pickled) DocBook stylesheet exists,
            compiling it on first use, then loads it and appends it to
            self.processor.
        """
        abs_db_xsl = os.path.abspath(self.db_xsl)
        abs_db_compiled_xsl = os.path.abspath(self.db_compiled_xsl)

        # same as path.exists, but also test if it is a file
        if not os.path.isfile(abs_db_compiled_xsl):
            _compile_xsl(abs_db_xsl, abs_db_compiled_xsl)

        assert os.path.isfile(abs_db_compiled_xsl)

        # SECURITY NOTE: unpickling executes whatever the cache file contains.
        # The cache lives in docbook_html_dir, so that directory must only be
        # writable by trusted users.
        compiled_file = open(abs_db_compiled_xsl, 'rb')
        try:
            stylesheet_root = cPickle.load(compiled_file)
        finally:
            # close explicitly instead of leaking the handle
            compiled_file.close()

        self.processor.appendStylesheetInstance(stylesheet_root)

    def parse_result(self, result):
        """
        additional parsing to the resulting XSLT'ed result (resultString) before saving

        will do:
            BASIC CLEAN UP   : remove unnecessary HTML tags
            RESOLVE IMG SRC  : fix src to find attachment
            RESOLVE WikiNames: if a word is a valid wikiname & a valid wikipage,
                               replace word with hyperlink

        @param result: HTML string produced by the XSLT transformation
        @return: the cleaned-up HTML string
        """

        # BASIC CLEAN UP
        # remove from beginning until end of body tag
        found = re.search('<body.*?>', result)
        if found:
            result = result[found.end():]

        # remove everything after & including </body>
        found = result.rfind('</body>')
        if found != -1:
            result = result[:found]

        # RESOLVE IMG SRC
        #    replace every <img> tag by the attachment markup of its src
        splitResult = _splitResult(re.finditer('<img.*?>', result), result)
        for index, fragment in enumerate(splitResult):
            if not fragment.startswith('<img'):
                continue

            found = re.search('src="(?P<source>.*?)"', fragment)
            if not found:
                # an <img> without src cannot be mapped to an attachment,
                # leave the tag untouched instead of failing on None
                continue
            imageSrc = found.group('source')

            imageAlt = None # save alt
            found = re.search('alt="(?P<alt>.*?)"', fragment)
            if found:
                imageAlt = found.group('alt')

            replacement = self.wikiParser.attachment( ('attachment:' + imageSrc, "") )
            if imageAlt: # restore alt
                # function replacement: the alt text is inserted literally,
                # backslashes in it are not treated as re.sub escapes
                replacement = re.sub('alt=".*?"',
                                     lambda match: 'alt="%s"' % imageAlt,
                                     replacement)
            splitResult[index] = replacement

        result = ''.join(splitResult)

        # RESOLVE WikiNames
        #    if a word is a valid wikiname & a valid wikipage,
        #    replace word with hyperlink
        word_rule = self.wikiParser.word_rule
        splitResult = _splitResult(re.finditer(word_rule, result), result)
        for index, fragment in enumerate(splitResult):
            if (re.match(word_rule, fragment)
                and Page.Page(self.request, fragment).exists()):
                splitResult[index] = self.wikiParser._word_repl(fragment)
        result = ''.join(splitResult)

        # remove stuff that fail HTML 4.01 Strict verification

        # remove unsupported attributes
        result = re.sub(' target=".*?"| type=".*?"', '', result)
        result = re.sub('<hr .*?>', '<hr>', result)

        # remove <p>...</p> inside <a>...</a> or <caption>...</caption>
        found = re.finditer('<a href=".*?</a>|<caption>.*?</caption>', result) # XXX re.DOTALL)
        splitResult = _splitResult(found, result)
        for index, fragment in enumerate(splitResult):
            if (fragment.startswith('<a href="')
                or fragment.startswith('<caption>')):
                splitResult[index] = fragment.replace('<p>', '').replace('</p>', '')
        result = ''.join(splitResult)

        return result

def _compile_xsl(XSLT_FILE, XSLT_COMPILED_FILE):
    """
        compiling docbook stylesheet

        Loads the stylesheet XSLT_FILE into a fresh 4Suite XSLT processor and
        pickles the root of the resulting stylesheet to XSLT_COMPILED_FILE.

        @param XSLT_FILE: OS path of the DocBook stylesheet (docbook.xsl)
        @param XSLT_COMPILED_FILE: OS path of the cache file to write
    """
    from Ft.Xml.Xslt.Processor import Processor
    # NOTE(review): Stylesheet is not referenced below; the import is kept
    # in case pickling relies on the module being loaded -- confirm.
    from Ft.Xml.Xslt import Stylesheet
    from Ft.Xml import InputSource
    from Ft.Lib import Uri

    # New docbook processor
    db_processor = Processor()

    # Docbook Stylesheet
    my_sheet_uri = Uri.OsPathToUri(XSLT_FILE, 1)
    sty_isrc = InputSource.DefaultFactory.fromUri(my_sheet_uri)

    # Append Stylesheet
    db_processor.appendStylesheet(sty_isrc)

    # Pickled stylesheet will be self.abs_db_compiled_xsl file
    db_root = db_processor.stylesheet.root
    fw = open(XSLT_COMPILED_FILE, 'wb')
    try:
        cPickle.dump(db_root, fw) # , protocol=2)
    finally:
        # always close, so the cache file is flushed and the handle released
        fw.close()

def _splitResult(iterator, result):
    startpos = 0
    splitResult = []

    for f in iterator:
        start, end = f.span()
        startpos = end

    return splitResult