comparison MoinMoin/parser/docbook.py @ 0:77665d8e2254

tag of nonpublic@localhost--archive/moin--enterprise--1.5--base-0 (automatically generated log message) imported from: moin--main--1.5--base-0
author Thomas Waldmann <tw-public@gmx.de>
date Thu, 22 Sep 2005 15:09:50 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:77665d8e2254
1 # -*- coding: iso-8859-1 -*-
2 """
3 MoinMoin - DocBook-XML Parser
4
5 This code was tested with 4Suite 1.0a4 and 1.0b1
6
7 @copyright: 2005 by Henry Ho <henryho167 AT hotmail DOT com>
8 @copyright: 2005 by MoinMoin:AlexanderSchremmer
9 @license: GNU GPL, see COPYING for details.
10
11 DOCBOOK Parser:
12
13 Features:
14 - image support through Attachment
15 - internal Wikilinks if a word is a strict wikiname
16 - image alt is preserved
17 - works with compiled xslt stylesheet for optimized performance
18
19 Configuration:
20 - make sure you have installed the DocBook XSLT files
21 - set the path to the html directory of the DocBook XSLT files in your
22 wiki or farm configuration:
23 docbook_html_dir = r"/usr/share/xml/docbook/stylesheet/nwalsh/html/"
24 Note that this directory needs to be writable because a cache file will
25 be created there.
26
27 >How can I use Ft API for DTD validation?
28 If you have PyXML installed, you can use ValidatingReader rather than
29 NonvalidatingReader. See:
30 http://uche.ogbuji.net/tech/akara/nodes/2003-01-01/domlettes
31 """
32
33 import StringIO
34 import os.path
35 import cPickle
36 import re
37
38 from MoinMoin import caching, config, wikiutil, Page
39 from MoinMoin.parser.xslt import Parser as XsltParser
40 from MoinMoin.parser.wiki import Parser as WikiParser
41
# Module-level dependency list. It is empty, and the Parser class in this
# file re-exports it as its own ``Dependencies`` class attribute.
Dependencies = []
43
class Parser(XsltParser):
    """
    Send XML file formatted via XSLT.

    DocBook flavour of the XSLT parser: it feeds a pickled ("compiled")
    copy of the DocBook ``docbook.xsl`` stylesheet to the XSLT processor
    and post-processes the transformed HTML (see parse_result).
    """

    # NOTE(review): presumably tells MoinMoin that the rendered output may
    # be cached -- confirm against the caller of this attribute.
    caching = 1
    # Re-export the module-level dependency list on the class.
    Dependencies = Dependencies

    def __init__(self, raw, request, **kw):
        """
        Set up the stylesheet paths and a helper wiki parser.

        @param raw: page source, handed on to XsltParser and WikiParser
        @param request: request object; request.cfg.docbook_html_dir must
                        name the html directory of the DocBook XSLT files
        @param kw: accepted for call compatibility, not used here
        """
        XsltParser.__init__(self, raw, request)

        # relative path to docbook.xsl and compiled_xsl
        docbook_html_directory = request.cfg.docbook_html_dir
        self.db_xsl = os.path.join(docbook_html_directory, 'docbook.xsl')
        self.db_compiled_xsl = os.path.join(docbook_html_directory, 'db_compiled.dat')

        # The wiki parser is only used as a helper by parse_result
        # (attachment(), word_rule and _word_repl()).
        self.wikiParser = WikiParser(raw=self.raw, request=self.request, pretty_url=1)
        # NOTE(review): presumably consumed by XsltParser -- confirm.
        self.key = 'docbook'

    def format(self, formatter):
        """
        Hand the formatter to the helper wiki parser, then let XsltParser
        do the actual formatting.
        """
        self.wikiParser.formatter = formatter
        XsltParser.format(self, formatter)

    def append_stylesheet(self):
        """
        virtual function, for docbook parser

        Append the compiled DocBook stylesheet to self.processor, compiling
        and pickling it first if the cache file does not exist yet.
        """
        abs_db_xsl = os.path.abspath(self.db_xsl)
        abs_db_compiled_xsl = os.path.abspath(self.db_compiled_xsl)

        # same as path.exists, but also test if it is a file
        if not os.path.isfile(abs_db_compiled_xsl):
            _compile_xsl(abs_db_xsl, abs_db_compiled_xsl)

        assert os.path.isfile(abs_db_compiled_xsl)

        # NOTE(security): unpickling executes whatever the cache file
        # contains, and the cache lives in a directory that has to be
        # writable. Make sure only trusted users can write there.
        compiled_file = open(abs_db_compiled_xsl, 'rb')
        try:
            # Load first and close the handle explicitly instead of leaving
            # the open file to the garbage collector.
            stylesheet = cPickle.load(compiled_file)
        finally:
            compiled_file.close()

        self.processor.appendStylesheetInstance(stylesheet)

    def parse_result(self, result):
        """
        additional parsing to the resulting XSLT'ed result (resultString) before saving

        will do:
            BASIC CLEAN UP   : remove unnecessary HTML tags
            RESOLVE IMG SRC  : fix src to find attachment
            RESOLVE WikiNames: if a word is a valid wikiname & a valid wikipage,
                               replace word with hyperlink

        @param result: the HTML string produced by the XSLT transformation
        @return: the cleaned-up HTML string
        """

        # BASIC CLEAN UP
        # remove from beginning until end of body tag
        found = re.search('<body.*?>', result)
        if found:
            result = result[found.end():]

        # remove everything after & including </body>
        found = result.rfind('</body>')
        if found != -1:
            result = result[:found]

        # RESOLVE IMG SRC
        # _splitResult cuts the text into alternating non-match / match
        # pieces, so the <img ...> tags can be replaced one at a time.
        splitResult = _splitResult(re.finditer('<img.*?>', result), result)
        for index, piece in enumerate(splitResult):
            if not piece.startswith('<img'):
                continue
            found = re.search('src="(?P<source>.*?)"', piece)
            if not found:
                # no src="..." attribute: nothing to resolve, keep the tag
                continue
            imageSrc = found.group('source')
            imageAlt = None  # save alt
            found = re.search('alt="(?P<alt>.*?)"', piece)
            if found:
                imageAlt = found.group('alt')
            piece = self.wikiParser.attachment(('attachment:' + imageSrc, ""))
            if imageAlt:  # restore alt
                # Use a function as replacement so that backslashes in the
                # alt text are inserted literally instead of being read as
                # regex escapes / group references.
                altAttribute = 'alt="%s"' % imageAlt
                piece = re.sub('alt=".*?"',
                               lambda match, text=altAttribute: text,
                               piece)
            splitResult[index] = piece
        result = ''.join(splitResult)

        # RESOLVE WikiNames
        #    if a word is a valid wikiname & a valid wikipage,
        #    replace word with hyperlink
        # NOTE(review): the rule is applied to the whole HTML string, so a
        # WikiName inside a tag or attribute value is replaced as well.
        word_rule = self.wikiParser.word_rule
        splitResult = _splitResult(re.finditer(word_rule, result), result)
        for index, piece in enumerate(splitResult):
            if (re.match(word_rule, piece)
                and Page.Page(self.request, piece).exists()):
                splitResult[index] = self.wikiParser._word_repl(piece)
        result = ''.join(splitResult)

        # remove stuff that fail HTML 4.01 Strict verification

        # remove unsupported attributes
        result = re.sub(' target=".*?"| type=".*?"', '', result)
        result = re.sub('<hr .*?>', '<hr>', result)

        # remove <p>...</p> inside <a>...</a> or <caption>...</caption>
        # (without re.DOTALL only elements that sit on a single line match)
        found = re.finditer('<a href=".*?</a>|<caption>.*?</caption>', result) # XXX re.DOTALL)
        splitResult = _splitResult(found, result)
        for index, piece in enumerate(splitResult):
            if piece.startswith('<a href="') or piece.startswith('<caption>'):
                splitResult[index] = piece.replace('<p>', '').replace('</p>', '')
        result = ''.join(splitResult)

        return result
154
155
156
def _compile_xsl(XSLT_FILE, XSLT_COMPILED_FILE):
    """
    compiling docbook stylesheet

    Loads the stylesheet XSLT_FILE into a 4Suite XSLT processor and pickles
    the root of the resulting stylesheet to XSLT_COMPILED_FILE.

    If pickling fails, the partially written file is removed again before
    the exception propagates; a truncated file would otherwise be taken
    for a valid cache by anyone who only checks that the file exists.

    reference: http://155.210.85.193:8010/ccia/nodes/2005-03-18/compileXslt?xslt=/akara/akara.xslt

    @param XSLT_FILE: OS path of the stylesheet to compile
    @param XSLT_COMPILED_FILE: OS path the pickled stylesheet is written to
    """
    # 4Suite is imported here, i.e. only when a stylesheet really has to be
    # compiled.
    from Ft.Xml.Xslt.Processor import Processor
    # NOTE(review): Stylesheet is not referenced below; the import is kept
    # in case it is needed for its side effects -- confirm before removing.
    from Ft.Xml.Xslt import Stylesheet
    from Ft.Xml import InputSource
    from Ft.Lib import Uri

    # New docbook processor
    db_processor = Processor()

    # Docbook Stylesheet
    my_sheet_uri = Uri.OsPathToUri(XSLT_FILE, 1)
    sty_isrc = InputSource.DefaultFactory.fromUri(my_sheet_uri)

    # Append Stylesheet
    db_processor.appendStylesheet(sty_isrc)

    # Pickled stylesheet will be self.abs_db_compiled_xsl file
    db_root = db_processor.stylesheet.root
    fw = open(XSLT_COMPILED_FILE, 'wb')
    written = False
    try:
        cPickle.dump(db_root, fw) # , protocol=2)
        written = True
    finally:
        # always release the handle, also when pickling raised
        fw.close()
        if not written and os.path.isfile(XSLT_COMPILED_FILE):
            # do not leave a truncated cache file behind
            os.remove(XSLT_COMPILED_FILE)
183
184
185 def _splitResult(iterator, result):
186 startpos = 0
187 splitResult = []
188
189 for f in iterator:
190 start, end = f.span()
191 splitResult.append(result[startpos:start])
192 splitResult.append(result[start:end])
193 startpos = end
194 splitResult.append(result[startpos:])
195
196 return splitResult
197