simplify auto scroll initialization; fix bug in IE init discovered when using IE7 on pages with wide tables
2 MoinMoin - convert from html to wiki markup
4 @copyright: 2005-2006 Bastian Blank, Florian Festi, Reimar Bauer,
6 @license: GNU GPL, see COPYING for details.
10 import xml.dom.minidom # HINT: the nodes in parse result tree need .has_key(), "x in ..." does not work
12 from xml.dom import Node
14 from MoinMoin import config, wikiutil
15 from MoinMoin.error import ConvertError
16 from werkzeug import url_decode
17 from MoinMoin.parser.text_moin_wiki import Parser as WikiParser
18 interwiki_re = re.compile(WikiParser.interwiki_rule, re.VERBOSE|re.UNICODE)
21 # Portions (C) International Organization for Standardization 1986
22 # Permission to copy in any form is granted for use with
23 # conforming SGML systems and applications as defined in
24 # ISO 8879, provided this notice is included in all copies.
27 <!ENTITY nbsp " "> <!-- no-break space = non-breaking space, U+00A0, convert to U+0020 -->
28 <!ENTITY iexcl "¡"> <!-- inverted exclamation mark, U+00A1 ISOnum -->
29 <!ENTITY cent "¢"> <!-- cent sign, U+00A2 ISOnum -->
30 <!ENTITY pound "£"> <!-- pound sign, U+00A3 ISOnum -->
31 <!ENTITY curren "¤"> <!-- currency sign, U+00A4 ISOnum -->
32 <!ENTITY yen "¥"> <!-- yen sign = yuan sign, U+00A5 ISOnum -->
33 <!ENTITY brvbar "¦"> <!-- broken bar = broken vertical bar, U+00A6 ISOnum -->
34 <!ENTITY sect "§"> <!-- section sign, U+00A7 ISOnum -->
35 <!ENTITY uml "¨"> <!-- diaeresis = spacing diaeresis, U+00A8 ISOdia -->
36 <!ENTITY copy "©"> <!-- copyright sign, U+00A9 ISOnum -->
37 <!ENTITY ordf "ª"> <!-- feminine ordinal indicator, U+00AA ISOnum -->
38 <!ENTITY laquo "«"> <!-- left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum -->
39 <!ENTITY not "¬"> <!-- not sign = angled dash, U+00AC ISOnum -->
40 <!ENTITY shy "­"> <!-- soft hyphen = discretionary hyphen, U+00AD ISOnum -->
41 <!ENTITY reg "®"> <!-- registered sign = registered trade mark sign, U+00AE ISOnum -->
42 <!ENTITY macr "¯"> <!-- macron = spacing macron = overline = APL overbar, U+00AF ISOdia -->
43 <!ENTITY deg "°"> <!-- degree sign, U+00B0 ISOnum -->
44 <!ENTITY plusmn "±"> <!-- plus-minus sign = plus-or-minus sign, U+00B1 ISOnum -->
45 <!ENTITY sup2 "²"> <!-- superscript two = superscript digit two = squared, U+00B2 ISOnum -->
46 <!ENTITY sup3 "³"> <!-- superscript three = superscript digit three = cubed, U+00B3 ISOnum -->
47 <!ENTITY acute "´"> <!-- acute accent = spacing acute, U+00B4 ISOdia -->
48 <!ENTITY micro "µ"> <!-- micro sign, U+00B5 ISOnum -->
49 <!ENTITY para "¶"> <!-- pilcrow sign = paragraph sign, U+00B6 ISOnum -->
50 <!ENTITY middot "·"> <!-- middle dot = Georgian comma = Greek middle dot, U+00B7 ISOnum -->
51 <!ENTITY cedil "¸"> <!-- cedilla = spacing cedilla, U+00B8 ISOdia -->
52 <!ENTITY sup1 "¹"> <!-- superscript one = superscript digit one, U+00B9 ISOnum -->
53 <!ENTITY ordm "º"> <!-- masculine ordinal indicator, U+00BA ISOnum -->
54 <!ENTITY raquo "»"> <!-- right-pointing double angle quotation mark = right pointing guillemet, U+00BB ISOnum -->
55 <!ENTITY frac14 "¼"> <!-- vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum -->
56 <!ENTITY frac12 "½"> <!-- vulgar fraction one half = fraction one half, U+00BD ISOnum -->
57 <!ENTITY frac34 "¾"> <!-- vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum -->
58 <!ENTITY iquest "¿"> <!-- inverted question mark = turned question mark, U+00BF ISOnum -->
59 <!ENTITY Agrave "À"> <!-- latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1 -->
60 <!ENTITY Aacute "Á"> <!-- latin capital letter A with acute, U+00C1 ISOlat1 -->
61 <!ENTITY Acirc "Â"> <!-- latin capital letter A with circumflex, U+00C2 ISOlat1 -->
62 <!ENTITY Atilde "Ã"> <!-- latin capital letter A with tilde, U+00C3 ISOlat1 -->
63 <!ENTITY Auml "Ä"> <!-- latin capital letter A with diaeresis, U+00C4 ISOlat1 -->
64 <!ENTITY Aring "Å"> <!-- latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1 -->
65 <!ENTITY AElig "Æ"> <!-- latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1 -->
66 <!ENTITY Ccedil "Ç"> <!-- latin capital letter C with cedilla, U+00C7 ISOlat1 -->
67 <!ENTITY Egrave "È"> <!-- latin capital letter E with grave, U+00C8 ISOlat1 -->
68 <!ENTITY Eacute "É"> <!-- latin capital letter E with acute, U+00C9 ISOlat1 -->
69 <!ENTITY Ecirc "Ê"> <!-- latin capital letter E with circumflex, U+00CA ISOlat1 -->
70 <!ENTITY Euml "Ë"> <!-- latin capital letter E with diaeresis, U+00CB ISOlat1 -->
71 <!ENTITY Igrave "Ì"> <!-- latin capital letter I with grave, U+00CC ISOlat1 -->
72 <!ENTITY Iacute "Í"> <!-- latin capital letter I with acute, U+00CD ISOlat1 -->
73 <!ENTITY Icirc "Î"> <!-- latin capital letter I with circumflex, U+00CE ISOlat1 -->
74 <!ENTITY Iuml "Ï"> <!-- latin capital letter I with diaeresis, U+00CF ISOlat1 -->
75 <!ENTITY ETH "Ð"> <!-- latin capital letter ETH, U+00D0 ISOlat1 -->
76 <!ENTITY Ntilde "Ñ"> <!-- latin capital letter N with tilde, U+00D1 ISOlat1 -->
77 <!ENTITY Ograve "Ò"> <!-- latin capital letter O with grave, U+00D2 ISOlat1 -->
78 <!ENTITY Oacute "Ó"> <!-- latin capital letter O with acute, U+00D3 ISOlat1 -->
79 <!ENTITY Ocirc "Ô"> <!-- latin capital letter O with circumflex, U+00D4 ISOlat1 -->
80 <!ENTITY Otilde "Õ"> <!-- latin capital letter O with tilde, U+00D5 ISOlat1 -->
81 <!ENTITY Ouml "Ö"> <!-- latin capital letter O with diaeresis, U+00D6 ISOlat1 -->
82 <!ENTITY times "×"> <!-- multiplication sign, U+00D7 ISOnum -->
83 <!ENTITY Oslash "Ø"> <!-- latin capital letter O with stroke = latin capital letter O slash, U+00D8 ISOlat1 -->
84 <!ENTITY Ugrave "Ù"> <!-- latin capital letter U with grave, U+00D9 ISOlat1 -->
85 <!ENTITY Uacute "Ú"> <!-- latin capital letter U with acute, U+00DA ISOlat1 -->
86 <!ENTITY Ucirc "Û"> <!-- latin capital letter U with circumflex, U+00DB ISOlat1 -->
87 <!ENTITY Uuml "Ü"> <!-- latin capital letter U with diaeresis, U+00DC ISOlat1 -->
88 <!ENTITY Yacute "Ý"> <!-- latin capital letter Y with acute, U+00DD ISOlat1 -->
89 <!ENTITY THORN "Þ"> <!-- latin capital letter THORN, U+00DE ISOlat1 -->
90 <!ENTITY szlig "ß"> <!-- latin small letter sharp s = ess-zed, U+00DF ISOlat1 -->
91 <!ENTITY agrave "à"> <!-- latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1 -->
92 <!ENTITY aacute "á"> <!-- latin small letter a with acute, U+00E1 ISOlat1 -->
93 <!ENTITY acirc "â"> <!-- latin small letter a with circumflex, U+00E2 ISOlat1 -->
94 <!ENTITY atilde "ã"> <!-- latin small letter a with tilde, U+00E3 ISOlat1 -->
95 <!ENTITY auml "ä"> <!-- latin small letter a with diaeresis, U+00E4 ISOlat1 -->
96 <!ENTITY aring "å"> <!-- latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1 -->
97 <!ENTITY aelig "æ"> <!-- latin small letter ae = latin small ligature ae, U+00E6 ISOlat1 -->
98 <!ENTITY ccedil "ç"> <!-- latin small letter c with cedilla, U+00E7 ISOlat1 -->
99 <!ENTITY egrave "è"> <!-- latin small letter e with grave, U+00E8 ISOlat1 -->
100 <!ENTITY eacute "é"> <!-- latin small letter e with acute, U+00E9 ISOlat1 -->
101 <!ENTITY ecirc "ê"> <!-- latin small letter e with circumflex, U+00EA ISOlat1 -->
102 <!ENTITY euml "ë"> <!-- latin small letter e with diaeresis, U+00EB ISOlat1 -->
103 <!ENTITY igrave "ì"> <!-- latin small letter i with grave, U+00EC ISOlat1 -->
104 <!ENTITY iacute "í"> <!-- latin small letter i with acute, U+00ED ISOlat1 -->
105 <!ENTITY icirc "î"> <!-- latin small letter i with circumflex, U+00EE ISOlat1 -->
106 <!ENTITY iuml "ï"> <!-- latin small letter i with diaeresis, U+00EF ISOlat1 -->
107 <!ENTITY eth "ð"> <!-- latin small letter eth, U+00F0 ISOlat1 -->
108 <!ENTITY ntilde "ñ"> <!-- latin small letter n with tilde, U+00F1 ISOlat1 -->
109 <!ENTITY ograve "ò"> <!-- latin small letter o with grave, U+00F2 ISOlat1 -->
110 <!ENTITY oacute "ó"> <!-- latin small letter o with acute, U+00F3 ISOlat1 -->
111 <!ENTITY ocirc "ô"> <!-- latin small letter o with circumflex, U+00F4 ISOlat1 -->
112 <!ENTITY otilde "õ"> <!-- latin small letter o with tilde, U+00F5 ISOlat1 -->
113 <!ENTITY ouml "ö"> <!-- latin small letter o with diaeresis, U+00F6 ISOlat1 -->
114 <!ENTITY divide "÷"> <!-- division sign, U+00F7 ISOnum -->
115 <!ENTITY oslash "ø"> <!-- latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1 -->
116 <!ENTITY ugrave "ù"> <!-- latin small letter u with grave, U+00F9 ISOlat1 -->
117 <!ENTITY uacute "ú"> <!-- latin small letter u with acute, U+00FA ISOlat1 -->
118 <!ENTITY ucirc "û"> <!-- latin small letter u with circumflex, U+00FB ISOlat1 -->
119 <!ENTITY uuml "ü"> <!-- latin small letter u with diaeresis, U+00FC ISOlat1 -->
120 <!ENTITY yacute "ý"> <!-- latin small letter y with acute, U+00FD ISOlat1 -->
121 <!ENTITY thorn "þ"> <!-- latin small letter thorn, U+00FE ISOlat1 -->
122 <!ENTITY yuml "ÿ"> <!-- latin small letter y with diaeresis, U+00FF ISOlat1 -->
124 <!-- Latin Extended-B -->
125 <!ENTITY fnof "ƒ"> <!-- latin small f with hook = function = florin, U+0192 ISOtech -->
128 <!ENTITY Alpha "Α"> <!-- greek capital letter alpha, U+0391 -->
129 <!ENTITY Beta "Β"> <!-- greek capital letter beta, U+0392 -->
130 <!ENTITY Gamma "Γ"> <!-- greek capital letter gamma,
132 <!ENTITY Delta "Δ"> <!-- greek capital letter delta,
134 <!ENTITY Epsilon "Ε"> <!-- greek capital letter epsilon, U+0395 -->
135 <!ENTITY Zeta "Ζ"> <!-- greek capital letter zeta, U+0396 -->
136 <!ENTITY Eta "Η"> <!-- greek capital letter eta, U+0397 -->
137 <!ENTITY Theta "Θ"> <!-- greek capital letter theta,
139 <!ENTITY Iota "Ι"> <!-- greek capital letter iota, U+0399 -->
140 <!ENTITY Kappa "Κ"> <!-- greek capital letter kappa, U+039A -->
141 <!ENTITY Lambda "Λ"> <!-- greek capital letter lambda,
143 <!ENTITY Mu "Μ"> <!-- greek capital letter mu, U+039C -->
144 <!ENTITY Nu "Ν"> <!-- greek capital letter nu, U+039D -->
145 <!ENTITY Xi "Ξ"> <!-- greek capital letter xi, U+039E ISOgrk3 -->
146 <!ENTITY Omicron "Ο"> <!-- greek capital letter omicron, U+039F -->
147 <!ENTITY Pi "Π"> <!-- greek capital letter pi, U+03A0 ISOgrk3 -->
148 <!ENTITY Rho "Ρ"> <!-- greek capital letter rho, U+03A1 -->
149 <!-- there is no Sigmaf, and no U+03A2 character either -->
150 <!ENTITY Sigma "Σ"> <!-- greek capital letter sigma,
152 <!ENTITY Tau "Τ"> <!-- greek capital letter tau, U+03A4 -->
153 <!ENTITY Upsilon "Υ"> <!-- greek capital letter upsilon,
155 <!ENTITY Phi "Φ"> <!-- greek capital letter phi,
157 <!ENTITY Chi "Χ"> <!-- greek capital letter chi, U+03A7 -->
158 <!ENTITY Psi "Ψ"> <!-- greek capital letter psi,
160 <!ENTITY Omega "Ω"> <!-- greek capital letter omega,
163 <!ENTITY alpha "α"> <!-- greek small letter alpha,
165 <!ENTITY beta "β"> <!-- greek small letter beta, U+03B2 ISOgrk3 -->
166 <!ENTITY gamma "γ"> <!-- greek small letter gamma,
168 <!ENTITY delta "δ"> <!-- greek small letter delta,
170 <!ENTITY epsilon "ε"> <!-- greek small letter epsilon,
172 <!ENTITY zeta "ζ"> <!-- greek small letter zeta, U+03B6 ISOgrk3 -->
173 <!ENTITY eta "η"> <!-- greek small letter eta, U+03B7 ISOgrk3 -->
174 <!ENTITY theta "θ"> <!-- greek small letter theta,
176 <!ENTITY iota "ι"> <!-- greek small letter iota, U+03B9 ISOgrk3 -->
177 <!ENTITY kappa "κ"> <!-- greek small letter kappa,
179 <!ENTITY lambda "λ"> <!-- greek small letter lambda,
181 <!ENTITY mu "μ"> <!-- greek small letter mu, U+03BC ISOgrk3 -->
182 <!ENTITY nu "ν"> <!-- greek small letter nu, U+03BD ISOgrk3 -->
183 <!ENTITY xi "ξ"> <!-- greek small letter xi, U+03BE ISOgrk3 -->
184 <!ENTITY omicron "ο"> <!-- greek small letter omicron, U+03BF NEW -->
185 <!ENTITY pi "π"> <!-- greek small letter pi, U+03C0 ISOgrk3 -->
186 <!ENTITY rho "ρ"> <!-- greek small letter rho, U+03C1 ISOgrk3 -->
187 <!ENTITY sigmaf "ς"> <!-- greek small letter final sigma,
189 <!ENTITY sigma "σ"> <!-- greek small letter sigma,
191 <!ENTITY tau "τ"> <!-- greek small letter tau, U+03C4 ISOgrk3 -->
192 <!ENTITY upsilon "υ"> <!-- greek small letter upsilon,
194 <!ENTITY phi "φ"> <!-- greek small letter phi, U+03C6 ISOgrk3 -->
195 <!ENTITY chi "χ"> <!-- greek small letter chi, U+03C7 ISOgrk3 -->
196 <!ENTITY psi "ψ"> <!-- greek small letter psi, U+03C8 ISOgrk3 -->
197 <!ENTITY omega "ω"> <!-- greek small letter omega,
199 <!ENTITY thetasym "ϑ"> <!-- greek small letter theta symbol,
201 <!ENTITY upsih "ϒ"> <!-- greek upsilon with hook symbol,
203 <!ENTITY piv "ϖ"> <!-- greek pi symbol, U+03D6 ISOgrk3 -->
205 <!-- General Punctuation -->
206 <!ENTITY bull "•"> <!-- bullet = black small circle,
208 <!-- bullet is NOT the same as bullet operator, U+2219 -->
209 <!ENTITY hellip "…"> <!-- horizontal ellipsis = three dot leader,
211 <!ENTITY prime "′"> <!-- prime = minutes = feet, U+2032 ISOtech -->
212 <!ENTITY Prime "″"> <!-- double prime = seconds = inches,
214 <!ENTITY oline "‾"> <!-- overline = spacing overscore,
216 <!ENTITY frasl "⁄"> <!-- fraction slash, U+2044 NEW -->
218 <!-- Letterlike Symbols -->
219 <!ENTITY weierp "℘"> <!-- script capital P = power set
220 = Weierstrass p, U+2118 ISOamso -->
221 <!ENTITY image "ℑ"> <!-- blackletter capital I = imaginary part,
223 <!ENTITY real "ℜ"> <!-- blackletter capital R = real part symbol,
225 <!ENTITY trade "™"> <!-- trade mark sign, U+2122 ISOnum -->
226 <!ENTITY alefsym "ℵ"> <!-- alef symbol = first transfinite cardinal,
228 <!-- alef symbol is NOT the same as hebrew letter alef,
229 U+05D0 although the same glyph could be used to depict both characters -->
232 <!ENTITY larr "←"> <!-- leftwards arrow, U+2190 ISOnum -->
233 <!ENTITY uarr "↑"> <!-- upwards arrow, U+2191 ISOnum-->
234 <!ENTITY rarr "→"> <!-- rightwards arrow, U+2192 ISOnum -->
235 <!ENTITY darr "↓"> <!-- downwards arrow, U+2193 ISOnum -->
236 <!ENTITY harr "↔"> <!-- left right arrow, U+2194 ISOamsa -->
237 <!ENTITY crarr "↵"> <!-- downwards arrow with corner leftwards
238 = carriage return, U+21B5 NEW -->
239 <!ENTITY lArr "⇐"> <!-- leftwards double arrow, U+21D0 ISOtech -->
240 <!-- ISO 10646 does not say that lArr is the same as the 'is implied by' arrow
241 but also does not have any other character for that function. So ? lArr can
242 be used for 'is implied by' as ISOtech suggests -->
243 <!ENTITY uArr "⇑"> <!-- upwards double arrow, U+21D1 ISOamsa -->
244 <!ENTITY rArr "⇒"> <!-- rightwards double arrow,
246 <!-- ISO 10646 does not say this is the 'implies' character but does not have
247 another character with this function so ?
248 rArr can be used for 'implies' as ISOtech suggests -->
249 <!ENTITY dArr "⇓"> <!-- downwards double arrow, U+21D3 ISOamsa -->
250 <!ENTITY hArr "⇔"> <!-- left right double arrow,
253 <!-- Mathematical Operators -->
254 <!ENTITY forall "∀"> <!-- for all, U+2200 ISOtech -->
255 <!ENTITY part "∂"> <!-- partial differential, U+2202 ISOtech -->
256 <!ENTITY exist "∃"> <!-- there exists, U+2203 ISOtech -->
257 <!ENTITY empty "∅"> <!-- empty set = null set = diameter,
259 <!ENTITY nabla "∇"> <!-- nabla = backward difference,
261 <!ENTITY isin "∈"> <!-- element of, U+2208 ISOtech -->
262 <!ENTITY notin "∉"> <!-- not an element of, U+2209 ISOtech -->
263 <!ENTITY ni "∋"> <!-- contains as member, U+220B ISOtech -->
264 <!-- should there be a more memorable name than 'ni'? -->
265 <!ENTITY prod "∏"> <!-- n-ary product = product sign,
267 <!-- prod is NOT the same character as U+03A0 'greek capital letter pi' though
268 the same glyph might be used for both -->
269 <!ENTITY sum "∑"> <!-- n-ary sumation, U+2211 ISOamsb -->
270 <!-- sum is NOT the same character as U+03A3 'greek capital letter sigma'
271 though the same glyph might be used for both -->
272 <!ENTITY minus "−"> <!-- minus sign, U+2212 ISOtech -->
273 <!ENTITY lowast "∗"> <!-- asterisk operator, U+2217 ISOtech -->
274 <!ENTITY radic "√"> <!-- square root = radical sign,
276 <!ENTITY prop "∝"> <!-- proportional to, U+221D ISOtech -->
277 <!ENTITY infin "∞"> <!-- infinity, U+221E ISOtech -->
278 <!ENTITY ang "∠"> <!-- angle, U+2220 ISOamso -->
279 <!ENTITY and "∧"> <!-- logical and = wedge, U+2227 ISOtech -->
280 <!ENTITY or "∨"> <!-- logical or = vee, U+2228 ISOtech -->
281 <!ENTITY cap "∩"> <!-- intersection = cap, U+2229 ISOtech -->
282 <!ENTITY cup "∪"> <!-- union = cup, U+222A ISOtech -->
283 <!ENTITY int "∫"> <!-- integral, U+222B ISOtech -->
284 <!ENTITY there4 "∴"> <!-- therefore, U+2234 ISOtech -->
285 <!ENTITY sim "∼"> <!-- tilde operator = varies with = similar to,
287 <!-- tilde operator is NOT the same character as the tilde, U+007E,
288 although the same glyph might be used to represent both -->
289 <!ENTITY cong "≅"> <!-- approximately equal to, U+2245 ISOtech -->
290 <!ENTITY asymp "≈"> <!-- almost equal to = asymptotic to,
292 <!ENTITY ne "≠"> <!-- not equal to, U+2260 ISOtech -->
293 <!ENTITY equiv "≡"> <!-- identical to, U+2261 ISOtech -->
294 <!ENTITY le "≤"> <!-- less-than or equal to, U+2264 ISOtech -->
295 <!ENTITY ge "≥"> <!-- greater-than or equal to,
297 <!ENTITY sub "⊂"> <!-- subset of, U+2282 ISOtech -->
298 <!ENTITY sup "⊃"> <!-- superset of, U+2283 ISOtech -->
299 <!-- note that nsup, 'not a superset of, U+2283' is not covered by the Symbol
300 font encoding and is not included. Should it be, for symmetry?
302 <!ENTITY nsub "⊄"> <!-- not a subset of, U+2284 ISOamsn -->
303 <!ENTITY sube "⊆"> <!-- subset of or equal to, U+2286 ISOtech -->
304 <!ENTITY supe "⊇"> <!-- superset of or equal to,
306 <!ENTITY oplus "⊕"> <!-- circled plus = direct sum,
308 <!ENTITY otimes "⊗"> <!-- circled times = vector product,
310 <!ENTITY perp "⊥"> <!-- up tack = orthogonal to = perpendicular,
312 <!ENTITY sdot "⋅"> <!-- dot operator, U+22C5 ISOamsb -->
313 <!-- dot operator is NOT the same character as U+00B7 middle dot -->
315 <!-- Miscellaneous Technical -->
316 <!ENTITY lceil "⌈"> <!-- left ceiling = apl upstile,
318 <!ENTITY rceil "⌉"> <!-- right ceiling, U+2309 ISOamsc -->
319 <!ENTITY lfloor "⌊"> <!-- left floor = apl downstile,
321 <!ENTITY rfloor "⌋"> <!-- right floor, U+230B ISOamsc -->
322 <!ENTITY lang "〈"> <!-- left-pointing angle bracket = bra,
324 <!-- lang is NOT the same character as U+003C 'less than'
325 or U+2039 'single left-pointing angle quotation mark' -->
326 <!ENTITY rang "〉"> <!-- right-pointing angle bracket = ket,
328 <!-- rang is NOT the same character as U+003E 'greater than'
329 or U+203A 'single right-pointing angle quotation mark' -->
331 <!-- Geometric Shapes -->
332 <!ENTITY loz "◊"> <!-- lozenge, U+25CA ISOpub -->
334 <!-- Miscellaneous Symbols -->
335 <!ENTITY spades "♠"> <!-- black spade suit, U+2660 ISOpub -->
336 <!-- black here seems to mean filled as opposed to hollow -->
337 <!ENTITY clubs "♣"> <!-- black club suit = shamrock,
339 <!ENTITY hearts "♥"> <!-- black heart suit = valentine,
341 <!ENTITY diams "♦"> <!-- black diamond suit, U+2666 ISOpub -->
343 <!-- C0 Controls and Basic Latin -->
344 <!ENTITY quot """> <!-- quotation mark = APL quote,
346 <!ENTITY amp "&"> <!-- ampersand, U+0026 ISOnum -->
347 <!ENTITY lt "<"> <!-- less-than sign, U+003C ISOnum -->
348 <!ENTITY gt ">"> <!-- greater-than sign, U+003E ISOnum -->
350 <!-- Latin Extended-A -->
351 <!ENTITY OElig "Œ"> <!-- latin capital ligature OE,
353 <!ENTITY oelig "œ"> <!-- latin small ligature oe, U+0153 ISOlat2 -->
354 <!-- ligature is a misnomer, this is a separate character in some languages -->
355 <!ENTITY Scaron "Š"> <!-- latin capital letter S with caron,
357 <!ENTITY scaron "š"> <!-- latin small letter s with caron,
359 <!ENTITY Yuml "Ÿ"> <!-- latin capital letter Y with diaeresis,
362 <!-- Spacing Modifier Letters -->
363 <!ENTITY circ "ˆ"> <!-- modifier letter circumflex accent,
365 <!ENTITY tilde "˜"> <!-- small tilde, U+02DC ISOdia -->
367 <!-- General Punctuation -->
368 <!ENTITY ensp " "> <!-- en space, U+2002 ISOpub -->
369 <!ENTITY emsp " "> <!-- em space, U+2003 ISOpub -->
370 <!ENTITY thinsp " "> <!-- thin space, U+2009 ISOpub -->
371 <!ENTITY zwnj "‌"> <!-- zero width non-joiner,
372 U+200C NEW RFC 2070 -->
373 <!ENTITY zwj "‍"> <!-- zero width joiner, U+200D NEW RFC 2070 -->
374 <!ENTITY lrm "‎"> <!-- left-to-right mark, U+200E NEW RFC 2070 -->
375 <!ENTITY rlm "‏"> <!-- right-to-left mark, U+200F NEW RFC 2070 -->
376 <!ENTITY ndash "–"> <!-- en dash, U+2013 ISOpub -->
377 <!ENTITY mdash "—"> <!-- em dash, U+2014 ISOpub -->
378 <!ENTITY lsquo "‘"> <!-- left single quotation mark,
380 <!ENTITY rsquo "’"> <!-- right single quotation mark,
382 <!ENTITY sbquo "‚"> <!-- single low-9 quotation mark, U+201A NEW -->
383 <!ENTITY ldquo "“"> <!-- left double quotation mark,
385 <!ENTITY rdquo "”"> <!-- right double quotation mark,
387 <!ENTITY bdquo "„"> <!-- double low-9 quotation mark, U+201E NEW -->
388 <!ENTITY dagger "†"> <!-- dagger, U+2020 ISOpub -->
389 <!ENTITY Dagger "‡"> <!-- double dagger, U+2021 ISOpub -->
390 <!ENTITY permil "‰"> <!-- per mille sign, U+2030 ISOtech -->
391 <!ENTITY lsaquo "‹"> <!-- single left-pointing angle quotation mark,
392 U+2039 ISO proposed -->
393 <!-- lsaquo is proposed but not yet ISO standardized -->
394 <!ENTITY rsaquo "›"> <!-- single right-pointing angle quotation mark,
395 U+203A ISO proposed -->
396 <!-- rsaquo is proposed but not yet ISO standardized -->
397 <!ENTITY euro "€"> <!-- euro sign, U+20AC NEW -->
402 class visitor(object):
404 self.visit_node_list(tree.childNodes)
406 def visit_node_list(self, nodelist):
407 for node in nodelist:
def visit(self, node):
    """Dispatch ``node`` to the handler matching its DOM node type.

    Element, attribute, text and CDATA-section nodes are forwarded to
    ``visit_element``, ``visit_attribute``, ``visit_text`` and
    ``visit_cdata_section`` respectively, and the handler's return value
    is passed back to the caller.

    Nodes of any other type are ignored and ``None`` is returned.
    """
    # Look the type up once and compare against the local.
    node_type = node.nodeType
    if node_type == Node.ELEMENT_NODE:
        return self.visit_element(node)
    if node_type == Node.ATTRIBUTE_NODE:
        return self.visit_attribute(node)
    if node_type == Node.TEXT_NODE:
        return self.visit_text(node)
    if node_type == Node.CDATA_SECTION_NODE:
        return self.visit_cdata_section(node)
    # Unhandled node types are deliberately skipped.
    return None
def visit_element(self, node):
    """Default element handler: walk the element's children, if any.

    Nothing happens for an element without child nodes.
    """
    children = node.childNodes
    if len(children) == 0:
        return
    self.visit_node_list(children)
425 def visit_attribute(self, node):
428 def visit_text(self, node):
431 def visit_cdata_section(self, node):
435 class strip_whitespace(visitor):
437 def visit_element(self, node):
438 if node.localName == 'p':
439 # XXX: our formatter adds a whitespace at the end of each paragraph
440 if node.hasChildNodes() and node.childNodes[-1].nodeType == Node.TEXT_NODE:
441 data = node.childNodes[-1].data.rstrip('\n ')
444 node.removeChild(node.childNodes[-1])
446 node.childNodes[-1].data = data
447 # Remove empty paragraphs
448 if not node.hasChildNodes():
449 node.parentNode.removeChild(node)
451 if node.hasChildNodes():
452 self.visit_node_list(node.childNodes)
455 class convert_tree(visitor):
456 white_space = object()
458 new_line_dont_remove = object()
460 def __init__(self, request, pagename):
461 self.request = request
462 self.pagename = pagename
467 self.visit(tree.documentElement)
468 self.check_whitespace()
469 return ''.join(self.text)
471 def check_whitespace(self):
475 if text[i] is self.white_space:
476 if i == 0 or i == len(text)-1:
478 elif text[i-1].endswith(" ") or text[i-1].endswith("\n"):
479 # last char of previous element is whitespace
481 elif (text[i+1] is self.white_space or
482 # next element is white_space
483 text[i+1] is self.new_line):
486 elif text[i+1].startswith(" ") or text[i+1].startswith("\n"):
487 # first char of next element is whitespace
492 elif text[i] is self.new_line:
495 elif i == len(text) - 1:
498 elif text[i-1].endswith("\n") or (
499 isinstance(text[i+1], str) and text[i+1].startswith("\n")):
504 elif text[i] is self.new_line_dont_remove:
def visit_text(self, node):
    """Append the character data of a text node to the output, unchanged."""
    chunk = node.data
    self.text.append(chunk)
513 def visit_element(self, node):
514 name = node.localName
515 if name is None: # not sure this can happen here (DOM comment node), but just for the case
517 func = getattr(self, "process_%s" % name, None)
521 self.process_inline(node)
def visit_node_list_element_only(self, nodelist):
    """Run ``visit_element`` on every element node in ``nodelist``.

    Nodes of any other type (text, comments, ...) are skipped.
    """
    for child in nodelist:
        if child.nodeType != Node.ELEMENT_NODE:
            continue
        self.visit_element(child)
528 def node_list_text_only(self, nodelist):
530 for node in nodelist:
531 if node.nodeType == Node.TEXT_NODE:
532 result.append(node.data)
534 result.extend(self.node_list_text_only(node.childNodes))
535 return "".join(result)
537 def get_desc(self, nodelist):
538 """ links can have either text or an image as description - we extract
539 this from the child nodelist and return wiki markup.
542 text = self.node_list_text_only(nodelist).replace("\n", " ").strip()
547 # search for an img / object
548 for node in nodelist:
549 if node.nodeType == Node.ELEMENT_NODE:
550 name = node.localName
552 markup = self._process_img(node) # XXX problem: markup containts auto-generated alt text with link target
554 elif name == 'object':
555 markup = self._process_object(node)
559 def process_page(self, node):
560 for i in node.childNodes:
561 if i.nodeType == Node.ELEMENT_NODE:
562 self.visit_element(i)
563 elif i.nodeType == Node.TEXT_NODE: # if this is missing, all std text under a headline is dropped!
564 txt = i.data.strip() # IMPORTANT: don't leave this unstripped or there will be wrong blanks
566 self.text.append(txt)
567 #we use <pre class="comment"> now, so this is currently unused:
568 #elif i.nodeType == Node.COMMENT_NODE:
569 # self.text.append(i.data)
570 # self.text.append("\n")
def process_br(self, node):
    """Handle a <br> element by queueing the ``new_line`` marker.

    Without this, standard multi-line text below a heading loses a
    whitespace when it is merged into flowing text, giving output like
    "word word wordword word word".
    """
    marker = self.new_line
    self.text.append(marker)
576 def process_heading(self, node):
577 text = self.node_list_text_only(node.childNodes).strip()
579 depth = int(node.localName[1])
581 self.text.append(self.new_line)
582 self.text.append("%s %s %s" % (hstr, text.replace("\n", " "), hstr))
583 self.text.append(self.new_line)
585 process_h1 = process_heading
586 process_h2 = process_heading
587 process_h3 = process_heading
588 process_h4 = process_heading
589 process_h5 = process_heading
590 process_h6 = process_heading
592 def _get_list_item_markup(self, list, listitem):
594 #indent = str(self.depth) * self.depth # nice for debugging :)
595 indent = " " * self.depth
597 name = list.localName
599 class_ = listitem.getAttribute("class")
601 before = self.new_line_dont_remove
602 if list.hasAttribute("type"):
603 type = list.getAttribute("type")
606 markup = "%s. " % type
608 class_ = listitem.getAttribute("class")
610 before = self.new_line_dont_remove
611 style = listitem.getAttribute("style")
612 if re.match(ur"list-style-type:\s*none", style, re.I):
614 # set markup with white space when list element containes table
615 for i in listitem.childNodes:
616 if i.nodeType == Node.ELEMENT_NODE:
617 if i.localName == 'table':
624 raise ConvertError("Illegal list type %s" % name)
625 return before, indent, markup
627 def process_dl(self, node):
629 markup = ":: " # can there be a dl dd without dt?
630 for i in node.childNodes:
631 if i.nodeType == Node.ELEMENT_NODE:
634 before, indent, markup = self._get_list_item_markup(node, i)
635 self.text.extend([before, indent])
636 text = self.node_list_text_only(i.childNodes)
637 self.text.append(text.replace("\n", " "))
639 self.text.append(markup)
640 self.process_list_item(i, indent) # XXX no dt -> indent is undefined!!!
642 raise ConvertError("Illegal list element %s" % i.localName)
645 self.text.append(self.new_line_dont_remove)
647 def process_list(self, node):
649 for i in node.childNodes:
650 if i.nodeType == Node.ELEMENT_NODE:
653 before, indent, markup = self._get_list_item_markup(node, i)
654 self.text.extend([before, indent, markup])
655 self.process_list_item(i, indent)
656 elif name in ('ol', 'ul', ):
661 raise ConvertError("Illegal list element %s" % i.localName)
664 self.text.append(self.new_line_dont_remove)
666 process_ul = process_list
667 process_ol = process_list
669 def empty_paragraph_queue(self, nodelist, indent, need_indent):
671 self.text.append(indent)
673 if i.nodeType == Node.ELEMENT_NODE:
674 if i.localName == 'br':
675 self.text.append('<<BR>>')
677 self.process_inline(i)
678 elif i.nodeType == Node.TEXT_NODE:
679 self.text.append(i.data.strip('\n').replace('\n', ' '))
680 self.text.append(self.new_line)
683 def process_list_item(self, node, indent):
688 # If this is a empty list item, we just terminate the line
689 if node.childNodes.length == 0:
690 self.text.append(self.new_line)
693 for i in node.childNodes:
696 if name in ('p', 'pre', 'ol', 'ul', 'dl', 'table', ) and pending:
697 self.empty_paragraph_queue(pending, indent, need_indent)
702 self.text.append(indent)
703 self.process_paragraph_item(i)
704 self.text.append(self.new_line)
708 self.text.append(indent)
709 self.process_preformatted_item(i)
711 elif name in ('ol', 'ul', ):
717 elif name == 'table':
719 self.text.append(indent)
720 self.process_table(i)
731 self.empty_paragraph_queue(pending, indent, need_indent)
733 def process_blockquote(self, node):
734 # XXX this does not really work. e.g.:
739 for i in node.childNodes:
740 if i.nodeType == Node.ELEMENT_NODE:
743 self.text.append(self.new_line)
744 self.text.append(" " * self.depth)
747 self.text.append(self.new_line)
748 self.text.append(" " * self.depth)
750 elif name in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', ):
751 self.process_heading(i)
752 elif name in ('ol', 'ul', ):
761 self.visit_node_list_element_only(i.childNodes)
762 elif name == 'blockquote':
763 self.process_blockquote(i)
769 raise ConvertError("process_blockquote: Don't support %s element" % name)
772 def process_inline(self, node):
773 if node.nodeType == Node.TEXT_NODE:
774 self.text.append(node.data.strip('\n').replace('\n', ' '))
777 # do we need to check for Node.ELEMENT_NODE and return (do nothing)?
778 name = node.localName # can be None for DOM Comment nodes
783 if name in (u'title', u'meta', u'style'):
786 if name in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', ): # headers are not allowed here (e.g. inside a ul li),
787 text = self.node_list_text_only(node.childNodes).strip() # but can be inserted via the editor
788 self.text.append(text) # so we just drop the header markup and keep the text
791 func = getattr(self, "process_%s" % name, None)
797 if name in ('em', 'i', ):
799 elif name in ('strong', 'b', ):
806 elif name == 'small':
809 elif name == 'strike':
811 command_close = ")--"
816 elif name in ('area', 'center', 'code', 'embed', 'fieldset', 'font', 'form', 'iframe', 'input', 'label', 'link', 'map',
817 'meta', 'noscript', 'option', 'script', 'select', 'textarea', 'wbr'):
818 command = "" # just throw away unsupported elements
820 raise ConvertError("process_inline: Don't support %s element" % name)
822 self.text.append(command)
823 for i in node.childNodes:
824 # lonly childnodes checked if they are only 'br'
825 if command and len(node.childNodes) == 1:
826 # formatted br alone is not wanted (who wants a bold br?)
827 if i.localName != 'br':
828 self.process_inline(i)
830 if i.localName == 'br':
831 # dont make a real \n because that breaks tables
832 self.text.append('<<BR>>')
834 self.process_inline(i)
836 command = command_close
837 self.text.append(command)
839 def process_span(self, node):
840 # process span tag for firefox3
841 node_style = node.getAttribute("style")
843 is_strike = node.getAttribute("class") == "strike"
844 is_strike = is_strike or "line-through" in node_style
845 is_strong = "bold" in node_style
846 is_italic = "italic" in node_style
847 is_underline = "underline" in node_style
848 is_comment = node.getAttribute("class") == "comment"
852 self.text.append("/* ")
854 self.text.append("--(")
856 self.text.append("'''")
858 self.text.append("''")
860 self.text.append("__")
863 for i in node.childNodes:
864 self.process_inline(i)
868 self.text.append("__")
870 self.text.append("''")
872 self.text.append("'''")
874 self.text.append(")--")
876 self.text.append(" */")
878 def process_div(self, node):
880 self._process_indent(node)
882 # ignore div tags - just descend
883 for i in node.childNodes:
# Render a <tt> element as inline code.
# The text is collected with node_list_text_only and newlines are replaced
# by blanks so the result stays on one line.
886 def process_tt(self, node):
887 text = self.node_list_text_only(node.childNodes).replace("\n", " ")
# class="backtick" selects the `...` form.
888 if node.getAttribute("class") == "backtick":
889 self.text.append("`%s`" % text)
# NOTE(review): the line introducing this alternative (presumably "else:")
# is not visible in this extract; this is the {{{...}}} form.
891 self.text.append("{{{%s}}}" % text)
# Render an <hr> as a line of dashes surrounded by self.new_line.
# NOTE(review): the fallback assignments of class_ and length are not visible
# in this extract.
893 def process_hr(self, node):
894 if node.hasAttribute("class"):
895 class_ = node.getAttribute("class")
# Classes "hr1".."hr6" select a longer rule: digit + 4 dashes.
# NOTE(review): class_[2] raises IndexError when the class is exactly "hr".
898 if class_.startswith("hr") and class_[2] in "123456":
899 length = int(class_[2]) + 4
902 self.text.extend([self.new_line, "-" * length, self.new_line])
# Convert a <p> element: emit indentation markup, then the inline content,
# then a blank line that terminates the paragraph.
904 def process_p(self, node):
906 self._process_indent(node)
907 self.process_paragraph_item(node)
908 self.text.append("\n\n") # do not use self.new_line here!
# Emit the ' . ' indentation marker for nodes whose style carries a pixel
# left margin.
# NOTE(review): the guard lines (match test, depth test) are not visible in
# this extract.
910 def _process_indent(self, node):
912 node_style = node.getAttribute("style")
# re.match anchors at the start of the string, so "margin-left" is only
# recognised when it is the first thing in the style attribute.
913 match = re.match(r"margin-left:\s*(\d+)px", node_style)
915 left_margin = int(match.group(1))
# The margin in pixels is divided by 40 and truncated to an integer depth.
916 indent_depth = int(left_margin / 40)
918 self.text.append(' . ')
def process_paragraph_item(self, node):
    """Emit the inline content of a paragraph-like node.

    Element children are handed to process_inline.  Text children are
    appended to self.text with leading/trailing newlines removed and the
    remaining newlines turned into blanks.  Any other node type is ignored.
    """
    for child in node.childNodes:
        kind = child.nodeType
        if kind == Node.ELEMENT_NODE:
            self.process_inline(child)
            continue
        if kind == Node.TEXT_NODE:
            flattened = child.data.strip('\n').replace('\n', ' ')
            self.text.append(flattened)
def process_pre(self, node):
    """Convert a <pre> element and finish it with a line break."""
    self.process_preformatted_item(node)
    line_end = self.new_line
    self.text.append(line_end)
# Convert the content of a <pre> element into either raw comment lines
# (class="comment") or a {{{ ... }}} preformatted block.
# NOTE(review): many short lines of this method (else branches, the
# initialisation of delimiters / content_buffer / bang_args, guards) are not
# visible in this extract; the comments below describe the visible lines only.
931 def process_preformatted_item(self, node):
932 if node.hasAttribute("class"):
933 class_ = node.getAttribute("class")
# Comment blocks are emitted verbatim: text with its newlines removed,
# <br> children become self.new_line.
936 if class_ == "comment": # we currently use this for stuff like ## or #acl
937 for i in node.childNodes:
938 if i.nodeType == Node.TEXT_NODE:
939 self.text.append(i.data.replace('\n', ''))
940 elif i.localName == 'br':
941 self.text.append(self.new_line)
946 longest_inner_formater = ''
952 this has problem when outer delimiter has two more { than inside one
953 e.g. {{{{{{ {{{ foo }}} }}}}}} --> {{{{ {{{ foo }}} }}}}
954 {{{foo {{{ }}} foo}}} --> {{{{ {{{ }}} }}}}
# Collect the brace runs found in the text and buffer the content lines.
957 for i in node.childNodes:
958 if i.nodeType == Node.TEXT_NODE:
959 # get longest pre tag({{{ or }}}) from content
960 delimiters.extend(re.compile("((?u){+)").findall(i.data))
961 delimiters.extend(re.compile("((?u)}+)").findall(i.data))
962 # when first line is empty, start iteration second line of i.data
963 data_lines = i.data.rstrip().split('\n')
964 if data_lines[0].strip() == '':
965 data_lines = data_lines[1:]
966 for line in data_lines:
# A line starting with '#!' is kept, stripped, as the arguments that follow
# the opening braces.
967 if line.strip().startswith('#!'):
969 bang_args = line.strip()
971 content_buffer.extend([line, self.new_line])
973 content_buffer.extend([line, self.new_line])
974 elif i.localName == 'br':
975 content_buffer.append(self.new_line_dont_remove)
# NOTE(review): max() on strings compares lexicographically, not by length
# ('}}}' > '{{{{'), so this is not necessarily the longest brace run --
# consider max(delimiters, key=len).
980 longest_inner_formater = max(delimiters)
# When the content itself contains a run of three or more braces, the
# wrapper uses one brace more than that run; otherwise plain {{{ }}}.
982 if (len(longest_inner_formater) >= 3):
983 self.text.extend([("{" * (len(longest_inner_formater) + 1)) + bang_args, \
985 self.text.extend(content_buffer)
986 self.text.extend(["}" * (len(longest_inner_formater) + 1), \
989 self.text.extend(["{{{"+bang_args, self.new_line])
990 self.text.extend(content_buffer)
991 self.text.extend(["}}}", self.new_line])
993 _alignment = {"left": "(",
# NOTE(review): only the signature of this helper is visible in this extract.
# The callers in this file pass it the raw "width" / "height" attribute value
# and interpolate the result into a quoted attribute string.
999 def _check_length(self, value):
# Build the background colour part of a table/row/cell style from the
# node's "bgcolor" attribute; prefix is 'table', 'row' or '' at the call
# sites in this file.
# NOTE(review): the branch lines and the return are not visible in this
# extract.
1006 def _get_color(self, node, prefix):
1007 if node.hasAttribute("bgcolor"):
1008 value = node.getAttribute("bgcolor")
# Accept the "rgb(r, g, b)" notation and turn it into a "#..." hex value.
1009 match = re.match(r"rgb\((\d+),\s*(\d+),\s*(\d+)\)", value)
# NOTE(review): %X does not zero-pad, so rgb(0, 0, 255) becomes '#00FF'
# instead of '#0000FF'; '%02X' would give two digits per component.
1011 value = '#%X%X%X' % (int(match.group(1)), int(match.group(2)), int(match.group(3)))
# re.match only anchors at the start: six hex digits after '#' are enough,
# trailing characters are not rejected.
1013 match = re.match(r"#[0-9A-Fa-f]{6}", value)
1014 if not prefix and match:
1017 result = '%sbgcolor="%s"' % (prefix, value)
# Collect the table-level attributes of a node as a blank-separated string
# of tablexxx="value" items.
# NOTE(review): the initialisation of `result` is not visible in this
# extract.
1022 def _table_style(self, node):
1023 # TODO: attrs = get_attrs(node)
1025 result.append(self._get_color(node, 'table'))
# width and height go through _check_length; align, style and class are
# copied unchanged.
1026 if node.hasAttribute("width"):
1027 value = node.getAttribute("width")
1028 result.append('tablewidth="%s"' % self._check_length(value))
1029 if node.hasAttribute("height"):
1030 value = node.getAttribute("height")
1031 result.append('tableheight="%s"' % self._check_length(value))
1032 if node.hasAttribute("align"):
1033 value = node.getAttribute("align")
1034 result.append('tablealign="%s"' % value)
1035 if node.hasAttribute("style"):
1036 result.append('tablestyle="%s"' % node.getAttribute("style"))
1037 if node.hasAttribute("class"):
1038 result.append('tableclass="%s"' % node.getAttribute("class"))
# strip() removes the blanks left at the ends by empty items.
1039 return " ".join(result).strip()
# Collect the row-level attributes of a node as a blank-separated string of
# rowxxx="value" items.
# NOTE(review): the initialisation of `result` is not visible in this
# extract.
1041 def _row_style(self, node):
1042 # TODO: attrs = get_attrs(node)
1044 result.append(self._get_color(node, 'row'))
1045 if node.hasAttribute("style"):
1046 result.append('rowstyle="%s"' % node.getAttribute("style"))
1047 if node.hasAttribute("class"):
1048 result.append('rowclass="%s"' % node.getAttribute("class"))
# strip() removes the blanks left at the ends by empty items.
1049 return " ".join(result).strip()
# Collect the cell-level attributes of a <td>/<th> node as a blank-separated
# style string: alignment markers first, attribute items in the middle, the
# rowspan marker last.
# NOTE(review): the default assignments (rowspan, colspan, align, result) and
# several guard lines are not visible in this extract.
1051 def _cell_style(self, node):
1052 # TODO: attrs = get_attrs(node)
1053 if node.hasAttribute("rowspan"):
1054 rowspan = ("|%s" % node.getAttribute("rowspan"))
# int() raises ValueError when the colspan attribute is not numeric.
1058 if node.hasAttribute("colspan"):
1059 colspan = int(node.getAttribute("colspan"))
1063 spanning = rowspan or colspan > 1
1067 result.append(self._get_color(node, ''))
# align / valign values are translated through the _alignment table;
# unknown values contribute nothing.
1068 if node.hasAttribute("align"):
1069 value = node.getAttribute("align")
1070 if not spanning or value != "center":
1071 # ignore "center" in spanning cells
1072 align += self._alignment.get(value, "")
1073 if node.hasAttribute("valign"):
1074 value = node.getAttribute("valign")
1075 if not spanning or value != "center":
1076 # ignore "center" in spanning cells
1077 align += self._alignment.get(value, "")
1078 if node.hasAttribute("width"):
1079 value = node.getAttribute("width")
# Percentage widths are detected here; what happens to them is on lines not
# visible in this extract.
1080 if value and value[-1] == "%":
1083 result.append('width="%s"' % self._check_length(value))
1084 if node.hasAttribute("height"):
1085 value = node.getAttribute("height")
1086 result.append('height="%s"' % self._check_length(value))
1087 if node.hasAttribute("class"):
1088 result.append('class="%s"' % node.getAttribute("class"))
1089 if node.hasAttribute("id"):
1090 result.append('id="%s"' % node.getAttribute("id"))
1091 if node.hasAttribute("style"):
1092 result.append('style="%s"' % node.getAttribute("style"))
1095 result.insert(0, "%s" % align)
1096 result.append(rowspan)
1097 return " ".join(result).strip()
# Convert a <table> (or one of its thead/tbody/tfoot sections) into wiki
# table markup.  `style` carries the style string accumulated so far.
# NOTE(review): the line assigning `name` and some branch lines are not
# visible in this extract.
1099 def process_table(self, node, style=""):
1101 self.text.append(self.new_line)
# new_table is read by process_table_record.
1102 self.new_table = True
1103 style += self._table_style(node)
1104 for i in node.childNodes:
1105 if i.nodeType == Node.ELEMENT_NODE:
1108 self.process_table_record(i, style)
# Section elements are handled by recursion, so the line break, the
# new_table flag and the _table_style lookup above run again for them.
1110 elif name in ('thead', 'tbody', 'tfoot'):
1111 self.process_table(i, style)
# The caption gets the table node as well; process_caption walks the
# table's children to sum up column spans.
1112 elif name == 'caption':
1113 self.process_caption(node, i, style)
1115 elif name in ('col', 'colgroup', 'strong', ):
1116 pass # we don't support these, but we just ignore them
1118 raise ConvertError("process_table: Don't support %s element" % name)
1120 # raise ConvertError("Unexpected node: %r" % i)
1121 self.text.append(self.new_line_dont_remove)
# Render a table <caption> as one wiki table row containing the caption
# text in bold, preceded by '||' repeated `colspan` times.
# NOTE(review): the initialisation of colspan, the break/continue lines and
# the guards are not visible in this extract; presumably the loop locates a
# <tr> of the table and sums the colspans of its cells -- confirm.
1123 def process_caption(self, table, node, style=""):
1125 for i in table.childNodes:
1126 if i.localName in ('thead', 'tbody', 'tfoot'): # XXX is this correct?
1127 #if i.localName == 'tbody': (old version)
# The inner loop reuses the name `i`, shadowing the outer loop variable.
1128 for i in i.childNodes:
1129 if i.localName == 'tr':
1132 elif i.localName == 'tr':
1135 if i.localName == 'tr':
1137 for td in i.childNodes:
1138 if not td.nodeType == Node.ELEMENT_NODE:
1140 span = td.getAttribute('colspan')
1142 colspan += int(span)
# The caption text is flattened to a single line.
1147 text = self.node_list_text_only(node.childNodes).replace('\n', ' ').strip()
1150 style = '<%s>' % style
1151 self.text.extend(["%s%s'''%s'''||" % ('||' * colspan, style, text), self.new_line_dont_remove])
# Convert one table cell: the opening '||' (repeated for a column span), an
# optional <style> block, then the cell content.
# NOTE(review): default assignments, guards and branch lines are not visible
# in this extract; in particular the condition choosing between the two
# child loops below cannot be seen here.
1153 def process_table_data(self, node, style=""):
# int() raises ValueError when the colspan attribute is not numeric.
1154 if node.hasAttribute("colspan"):
1155 colspan = int(node.getAttribute("colspan"))
1158 self.text.append("||" * colspan)
1160 style += self._cell_style(node)
1162 self.text.append("<%s>" % style)
# First child loop: children are rendered as paragraph items, each followed
# by self.white_space.
1165 for i in node.childNodes:
1168 self.process_paragraph_item(i)
1169 self.text.append(self.white_space)
# Second child loop: elements and text nodes are handled individually.
1172 for i in node.childNodes:
1174 if i.nodeType == Node.ELEMENT_NODE:
1176 # if we get a br for a cell from e.g. cut and paste from OOo
1177 # or if someone simulates a list by enter in a cell
1178 # it should be appended as macro BR.
1179 self.text.append('<<BR>>')
1183 self.process_inline(i)
1185 elif i.nodeType == Node.TEXT_NODE:
# Text is kept on one line: outer newlines dropped, inner ones become blanks.
1186 data = i.data.strip('\n').replace('\n', ' ')
1189 self.text.append(data)
1191 self.text.append(" ")
# Convert one <tr>: every td/th child is rendered through
# process_table_data and the row is closed with '||'.
# NOTE(review): the line assigning `name` and the else lines are not visible
# in this extract.
1193 def process_table_record(self, node, style=""):
# Rows after the first one of a table are prefixed with self.depth blanks;
# the flag is cleared once a row has been started.
1194 if not self.new_table:
1195 self.text.append(" " * self.depth)
1197 self.new_table = False
1198 style += self._row_style(node)
1199 for i in node.childNodes:
1200 if i.nodeType == Node.ELEMENT_NODE:
1202 if name in ('td', 'th', ):
1203 self.process_table_data(i, style=style)
1206 raise ConvertError("process_table_record: Don't support %s element" % name)
1207 self.text.extend(["||", self.new_line_dont_remove])
# Convert an <a> element into wiki link markup.  The visible branches cover
# interwiki links, attachment links, links into this wiki (href starting
# with the script root), mailto: links and other hrefs.
# NOTE(review): many short lines (else branches, guards, returns, the
# assignment of `parms`) are not visible in this extract; the comments below
# describe the visible lines only.
1209 def process_a(self, node):
1210 attrs = get_attrs(node)
# title and href are removed from attrs; class is only read.
1212 title = attrs.pop('title', '')
1213 href = attrs.pop('href', None)
1214 css_class = attrs.get('class')
1216 scriptname = self.request.script_root
1217 if scriptname == "":
1220 # can either be a link (with href) or an anchor (with e.g. id)
1221 # we don't need to support anchors here as we currently handle them as <<Anchor(id)>> macro
1223 href = wikiutil.url_unquote(href)
1225 interwikiname = None
1226 desc = self.get_desc(node.childNodes)
# Interwiki links: the wiki name is taken from the title attribute and the
# page name is the part of href after the resolved wiki URL.
1229 if css_class == "interwiki":
1230 wikitag, wikiurl, wikitail, err = wikiutil.resolve_interwiki(
1231 self.request, title, "") # the title has the wiki name, page = ""
1232 if not err and href.startswith(wikiurl):
1233 pagename = wikiutil.url_unquote(href[len(wikiurl):].lstrip('/'))
1234 interwikiname = "%s:%s" % (wikitag, pagename)
1236 raise ConvertError("Invalid InterWiki link: '%s'" % href)
1237 elif css_class == "badinterwiki" and title:
1238 if href == "/": # we used this as replacement for empty href
1240 pagename = wikiutil.url_unquote(href)
1241 interwikiname = "%s:%s" % (title, pagename)
# When the link text equals the page name, no explicit description is
# written.
1242 if interwikiname and pagename == desc:
1243 if interwiki_re.match(interwikiname+' '): # the blank is needed by interwiki_re to match
1244 # this is valid as a free interwiki link
1245 self.text.append("%s" % interwikiname)
1247 self.text.append("[[%s]]" % interwikiname)
1249 elif title == 'Self':
1250 self.text.append('[[%s|%s]]' % (href, desc))
1253 self.text.append("[[%s|%s]]" % (interwikiname, desc))
1256 # fix links generated by a broken copy & paste of gecko based browsers
1257 brokenness = '../../../..'
1258 if href.startswith(brokenness):
1259 href = href[len(brokenness):] # just strip it away!
1260 # TODO: IE pastes complete http://server/Page/SubPage as href and as text, too
# Attachment links carry the attachment name in the title attribute.
1263 if title.startswith("attachment:"):
1264 attname = wikiutil.url_unquote(title[len("attachment:"):])
1265 if 'do=get' in href: # quick&dirty fix for not dropping &do=get param
1275 self.text.append('[[attachment:%s%s%s]]' % (attname, desc, parms))
# Links into this wiki: the page name is what follows the script root.
1277 elif href.startswith(scriptname):
1278 pagename = href[len(scriptname):]
1279 pagename = pagename.lstrip('/') # XXX temp fix for generated pagenames starting with /
1280 if desc == pagename:
1281 self.text.append(wikiutil.pagelinkmarkup(pagename))
1282 # relative link /SubPage
1283 elif desc.startswith('/') and href.endswith(desc):
1284 if pagename.startswith(self.pagename): # is this a subpage of us?
1285 self.text.append(wikiutil.pagelinkmarkup(pagename[len(self.pagename):]))
1287 self.text.append(wikiutil.pagelinkmarkup(pagename))
1289 elif desc.startswith('../') and href.endswith(desc[3:]):
1290 self.text.append(wikiutil.pagelinkmarkup(desc))
1291 # internal link #internal
1292 elif '#' in href and pagename.startswith(self.pagename):
1293 self.text.append(wikiutil.pagelinkmarkup(href[href.index('#'):], desc))
1296 self.text.append(wikiutil.pagelinkmarkup(pagename, desc))
# mailto: links whose text is the address itself are written as bare text
# padded with self.white_space.
1298 elif href.startswith("mailto:"):
1299 if href == desc or href[len("mailto:"):] == desc:
1300 self.text.extend([self.white_space, desc, self.white_space])
1302 self.text.append("[[%s|%s]]" % (href, desc)) # XXX use a (renamed) pagelinkmarkup
# Remaining hrefs: blanks are percent-encoded before the href is written.
1306 href = href.replace(" ", "%20")
1307 self.text.append(href)
1309 href = href.replace(" ", "%20")
1312 self.text.append("[[%s%s]]" % (href, desc))
def process_img(self, node):
    """Append the wiki markup for an <img>, padded with white space on both sides."""
    image_markup = self._process_img(node)
    padded = [self.white_space, image_markup, self.white_space]
    self.text.extend(padded)
# Build the wiki markup for an <img> element: smiley text, or a
# {{target|desc|params}} transclusion.
# NOTE(review): the return lines, the else branches and the assignments of
# `target` / `desc` for the remaining cases are not visible in this extract.
1318 def _process_img(self, node):
1319 attrs = get_attrs(node)
1321 title = attrs.pop('title', '')
# Smileys are stored as title="smiley:<text>"; the markup is that text.
1322 if title.startswith("smiley:"):
1323 markup = title[len("smiley:"):]
# alt and src are removed from attrs; class is only read.
1326 alt = attrs.pop('alt', None)
1327 src = attrs.pop('src', None)
1328 css_class = attrs.get('class')
# For attachments and drawings the whole title (scheme included) is the
# target; an alt text equal to the bare name is treated as auto-generated.
1331 if title.startswith("attachment:"):
1332 target = wikiutil.url_unquote(title)
1333 if alt == title[len("attachment:"):]:
1334 # kill auto-generated alt
1336 elif title.startswith("drawing:"):
1337 target = wikiutil.url_unquote(title)
1338 if alt == title[len("drawing:"):]:
1339 # kill auto-generated alt
1342 if css_class == 'external_image':
1343 # kill auto-generated alt and class
# Every attribute still left in attrs becomes a key="value" parameter.
1353 params = ','.join(['%s="%s"' % (k, v) for k, v in attrs.items()])
1354 # if k in ('width', 'height', )])
1356 params = '|' + params
1360 markup = "{{%s%s%s}}" % (target, desc, params)
def process_object(self, node):
    """Append the wiki markup for an <object> element to the output."""
    object_markup = self._process_object(node)
    self.text.append(object_markup)
# Build the {{data|desc|params}} markup for an <object> element.  A data
# URL that points at an AttachFile action on this server is rewritten to
# the attachment: scheme.
# NOTE(review): guards, the final return and the handling of `desc` are not
# visible in this extract.
1367 def _process_object(self, node):
1368 attrs = get_attrs(node)
1370 data = attrs.pop('data', None)
# Split the URL and decode its query string to find the action and the
# attachment name.
1372 scheme, netloc, path, params, query, fragment = urlparse.urlparse(data)
1373 args = url_decode(query)
1374 action = args.get("action")
1375 attachname = args.get("target")
1377 if (not scheme and not netloc # same server (local attachment!)
1378 and path and action == 'AttachFile' and attachname):
# The page name is the URL path without the script root.
1379 scriptname = self.request.script_root or "/"
1380 pagename = path[len(scriptname):].lstrip("/")
1381 pagename = wikiutil.url_unquote(pagename)
# Attachments of other pages are qualified with their page name.
1383 if pagename != self.request.page.page_name:
1384 attachname = "%s/%s" % (pagename, attachname)
1385 data = "attachment:%s" % attachname
1387 desc = self.get_desc(node.childNodes)
1391 # Exclude the 'type' attribute because it generates a 'key already present' error.
1392 params = ','.join(['%s="%s"' % (k, v) for k, v in attrs.items() if not k in ('type', )])
1394 params = '|' + params
1397 markup = "{{%s%s%s}}" % (data, desc, params)
1399 # TODO: for target PAGES, use some code from process_a to get the pagename from URL
1400 # TODO: roundtrip attachment: correctly
1401 # TODO: handle object's content better?
# Flatten the attributes of a DOM node into a dict.  The "style" attribute
# is not stored as such: each "name: value" declaration in it becomes its
# own entry.
# NOTE(review): the initialisation of `attrs`, the else line and the return
# are not visible in this extract.
1403 def get_attrs(node):
1404 """ get the attributes of <node> into an easy-to-use dict """
1406 for attr_name in node.attributes.keys():
1407 # get attributes of style element
1408 if attr_name == "style":
1409 for style_element in node.attributes.get(attr_name).nodeValue.split(';'):
1410 if style_element.strip() != '':
# Declarations that do not split into exactly two parts on ':' (for example
# values that contain a colon themselves) are dropped.
1411 style_elements = style_element.split(':')
1412 if len(style_elements) == 2:
1413 attrs[style_elements[0].strip()] = style_elements[1].strip()
1414 # get attributes without style element
1416 attrs[attr_name] = node.attributes.get(attr_name).nodeValue
# Parse `text` into a minidom tree.  Raises ConvertError when expat rejects
# the input.
# NOTE(review): the "try:" line and two lines of the error handler are not
# visible in this extract.
# NOTE(review): "except X, msg" and file() are Python 2 only constructs.
1420 def parse(request, text):
# The XML declaration and the module-level `dtd` are put in front of the
# text, which is then encoded with config.charset before parsing.
1421 text = u'<?xml version="1.0"?>%s%s' % (dtd, text)
1422 text = text.encode(config.charset)
1424 return xml.dom.minidom.parseString(text)
1425 except xml.parsers.expat.ExpatError, msg:
1426 # this sometimes crashes when it should not, so save the stuff to analyze it:
# Mode "w" means each failure overwrites the previous dump file.
1427 logname = os.path.join(request.cfg.data_dir, "expaterror.log")
1428 f = file(logname, "w")
1430 f.write("\n" + "-"*80 + "\n" + str(msg))
1432 raise ConvertError('ExpatError: %s (see dump in %s)' % (msg, logname))
# Convert an HTML fragment to wiki markup: wrap it in a <page> root element,
# parse it, strip whitespace from the tree and run convert_tree over it.
# NOTE(review): the final return is not visible in this extract.
1434 def convert(request, pagename, text):
1435 # Due to expat needing explicitly set namespaces, we set these here to allow pasting
1436 # from Word / Excel without issues.
1437 # If you encounter 'ExpatError: unbound prefix', try adding the namespace to the list.
1438 namespace = [u'xmlns:o="urn:schemas-microsoft-com:office:office"',
1439 u'xmlns:x="urn:schemas-microsoft-com:office:excel"',
1440 u'xmlns:ss="urn:schemas-microsoft-com:office:spreadsheet"',
1441 u'xmlns:c="urn:schemas-microsoft-com:office:component:spreadsheet"',
1442 u'xmlns:s="uuid:BDC6E3F0-6DA3-11d1-A2A3-00AA00C14882"',
1443 u'xmlns:dt="uuid:C2F41010-65B3-11d1-A29F-00AA00C14882"',
1444 u'xmlns:rs="urn:schemas-microsoft-com:rowset"',
1445 u'xmlns:z="#RowsetSchema"',
1446 u'xmlns:x2="http://schemas.microsoft.com/office/excel/2003/xml"',
1447 u'xmlns:sl="http://schemas.microsoft.com/schemaLibrary/2003/core"',
1448 u'xmlns:aml="http://schemas.microsoft.com/aml/2001/core"',
1449 u'xmlns:w="http://schemas.microsoft.com/office/word/2003/wordml"',
1450 u'xmlns:wx="http://schemas.microsoft.com/office/word/2003/auxHint"',
1451 u'xmlns:w10="urn:schemas-microsoft-com:office:word"',
1452 u'xmlns:v="urn:schemas-microsoft-com:office:vml"']
# The namespace declarations become attributes of the wrapping <page> root.
1453 text = u'<page %s>%s</page>' % (' '.join(namespace), text)
1454 tree = parse(request, text)
1455 strip_whitespace().do(tree)
1456 text = convert_tree(request, pagename).do(tree)
# Every line is right-stripped; the extra '' makes the result end with a
# newline.
1457 text = '\n'.join([s.rstrip() for s in text.splitlines()] + ['']) # remove trailing blanks