MoinMoin/converter/text_html_text_moin_wiki.py
author Thomas Waldmann <tw AT waldmann-edv DOT de>
Wed, 11 Feb 2009 02:38:25 +0100
changeset 4570 e86a7b66eb0e
parent 4498 910474dded06
child 4948 068c47fc2c3a
permissions -rw-r--r--
Despam: must use request.values because it also does GET requests, add logging/debug code
     1 """
     2     MoinMoin - convert from html to wiki markup
     3 
     4     @copyright: 2005-2006 Bastian Blank, Florian Festi, Reimar Bauer,
     5                 2005-2007 MoinMoin:ThomasWaldmann
     6     @license: GNU GPL, see COPYING for details.
     7 """
     8 
     9 import re, os
    10 import xml.dom.minidom # HINT: the nodes in parse result tree need .has_key(), "x in ..." does not work
    11 from xml.dom import Node
    12 
    13 from MoinMoin import config, wikiutil
    14 from MoinMoin.error import ConvertError
    15 
    16 from MoinMoin.parser.text_moin_wiki import Parser as WikiParser
    17 interwiki_re = re.compile(WikiParser.interwiki_rule, re.VERBOSE|re.UNICODE)
    18 
    19 # Portions (C) International Organization for Standardization 1986
    20 # Permission to copy in any form is granted for use with
    21 # conforming SGML systems and applications as defined in
    22 # ISO 8879, provided this notice is included in all copies.
    23 dtd = ur'''
    24 <!DOCTYPE html [
    25 <!ENTITY nbsp   "&#32;">  <!-- no-break space = non-breaking space, U+00A0, convert to U+0020 -->
    26 <!ENTITY iexcl  "&#161;"> <!-- inverted exclamation mark, U+00A1 ISOnum -->
    27 <!ENTITY cent   "&#162;"> <!-- cent sign, U+00A2 ISOnum -->
    28 <!ENTITY pound  "&#163;"> <!-- pound sign, U+00A3 ISOnum -->
    29 <!ENTITY curren "&#164;"> <!-- currency sign, U+00A4 ISOnum -->
    30 <!ENTITY yen    "&#165;"> <!-- yen sign = yuan sign, U+00A5 ISOnum -->
    31 <!ENTITY brvbar "&#166;"> <!-- broken bar = broken vertical bar, U+00A6 ISOnum -->
    32 <!ENTITY sect   "&#167;"> <!-- section sign, U+00A7 ISOnum -->
    33 <!ENTITY uml    "&#168;"> <!-- diaeresis = spacing diaeresis, U+00A8 ISOdia -->
    34 <!ENTITY copy   "&#169;"> <!-- copyright sign, U+00A9 ISOnum -->
    35 <!ENTITY ordf   "&#170;"> <!-- feminine ordinal indicator, U+00AA ISOnum -->
    36 <!ENTITY laquo  "&#171;"> <!-- left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum -->
    37 <!ENTITY not    "&#172;"> <!-- not sign = angled dash, U+00AC ISOnum -->
    38 <!ENTITY shy    "&#173;"> <!-- soft hyphen = discretionary hyphen, U+00AD ISOnum -->
    39 <!ENTITY reg    "&#174;"> <!-- registered sign = registered trade mark sign, U+00AE ISOnum -->
    40 <!ENTITY macr   "&#175;"> <!-- macron = spacing macron = overline = APL overbar, U+00AF ISOdia -->
    41 <!ENTITY deg    "&#176;"> <!-- degree sign, U+00B0 ISOnum -->
    42 <!ENTITY plusmn "&#177;"> <!-- plus-minus sign = plus-or-minus sign, U+00B1 ISOnum -->
    43 <!ENTITY sup2   "&#178;"> <!-- superscript two = superscript digit two = squared, U+00B2 ISOnum -->
    44 <!ENTITY sup3   "&#179;"> <!-- superscript three = superscript digit three = cubed, U+00B3 ISOnum -->
    45 <!ENTITY acute  "&#180;"> <!-- acute accent = spacing acute, U+00B4 ISOdia -->
    46 <!ENTITY micro  "&#181;"> <!-- micro sign, U+00B5 ISOnum -->
    47 <!ENTITY para   "&#182;"> <!-- pilcrow sign = paragraph sign, U+00B6 ISOnum -->
    48 <!ENTITY middot "&#183;"> <!-- middle dot = Georgian comma = Greek middle dot, U+00B7 ISOnum -->
    49 <!ENTITY cedil  "&#184;"> <!-- cedilla = spacing cedilla, U+00B8 ISOdia -->
    50 <!ENTITY sup1   "&#185;"> <!-- superscript one = superscript digit one, U+00B9 ISOnum -->
    51 <!ENTITY ordm   "&#186;"> <!-- masculine ordinal indicator, U+00BA ISOnum -->
    52 <!ENTITY raquo  "&#187;"> <!-- right-pointing double angle quotation mark = right pointing guillemet, U+00BB ISOnum -->
    53 <!ENTITY frac14 "&#188;"> <!-- vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum -->
    54 <!ENTITY frac12 "&#189;"> <!-- vulgar fraction one half = fraction one half, U+00BD ISOnum -->
    55 <!ENTITY frac34 "&#190;"> <!-- vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum -->
    56 <!ENTITY iquest "&#191;"> <!-- inverted question mark = turned question mark, U+00BF ISOnum -->
    57 <!ENTITY Agrave "&#192;"> <!-- latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1 -->
    58 <!ENTITY Aacute "&#193;"> <!-- latin capital letter A with acute, U+00C1 ISOlat1 -->
    59 <!ENTITY Acirc  "&#194;"> <!-- latin capital letter A with circumflex, U+00C2 ISOlat1 -->
    60 <!ENTITY Atilde "&#195;"> <!-- latin capital letter A with tilde, U+00C3 ISOlat1 -->
    61 <!ENTITY Auml   "&#196;"> <!-- latin capital letter A with diaeresis, U+00C4 ISOlat1 -->
    62 <!ENTITY Aring  "&#197;"> <!-- latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1 -->
    63 <!ENTITY AElig  "&#198;"> <!-- latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1 -->
    64 <!ENTITY Ccedil "&#199;"> <!-- latin capital letter C with cedilla, U+00C7 ISOlat1 -->
    65 <!ENTITY Egrave "&#200;"> <!-- latin capital letter E with grave, U+00C8 ISOlat1 -->
    66 <!ENTITY Eacute "&#201;"> <!-- latin capital letter E with acute, U+00C9 ISOlat1 -->
    67 <!ENTITY Ecirc  "&#202;"> <!-- latin capital letter E with circumflex, U+00CA ISOlat1 -->
    68 <!ENTITY Euml   "&#203;"> <!-- latin capital letter E with diaeresis, U+00CB ISOlat1 -->
    69 <!ENTITY Igrave "&#204;"> <!-- latin capital letter I with grave, U+00CC ISOlat1 -->
    70 <!ENTITY Iacute "&#205;"> <!-- latin capital letter I with acute, U+00CD ISOlat1 -->
    71 <!ENTITY Icirc  "&#206;"> <!-- latin capital letter I with circumflex, U+00CE ISOlat1 -->
    72 <!ENTITY Iuml   "&#207;"> <!-- latin capital letter I with diaeresis, U+00CF ISOlat1 -->
    73 <!ENTITY ETH    "&#208;"> <!-- latin capital letter ETH, U+00D0 ISOlat1 -->
    74 <!ENTITY Ntilde "&#209;"> <!-- latin capital letter N with tilde, U+00D1 ISOlat1 -->
    75 <!ENTITY Ograve "&#210;"> <!-- latin capital letter O with grave, U+00D2 ISOlat1 -->
    76 <!ENTITY Oacute "&#211;"> <!-- latin capital letter O with acute, U+00D3 ISOlat1 -->
    77 <!ENTITY Ocirc  "&#212;"> <!-- latin capital letter O with circumflex, U+00D4 ISOlat1 -->
    78 <!ENTITY Otilde "&#213;"> <!-- latin capital letter O with tilde, U+00D5 ISOlat1 -->
    79 <!ENTITY Ouml   "&#214;"> <!-- latin capital letter O with diaeresis, U+00D6 ISOlat1 -->
    80 <!ENTITY times  "&#215;"> <!-- multiplication sign, U+00D7 ISOnum -->
    81 <!ENTITY Oslash "&#216;"> <!-- latin capital letter O with stroke = latin capital letter O slash, U+00D8 ISOlat1 -->
    82 <!ENTITY Ugrave "&#217;"> <!-- latin capital letter U with grave, U+00D9 ISOlat1 -->
    83 <!ENTITY Uacute "&#218;"> <!-- latin capital letter U with acute, U+00DA ISOlat1 -->
    84 <!ENTITY Ucirc  "&#219;"> <!-- latin capital letter U with circumflex, U+00DB ISOlat1 -->
    85 <!ENTITY Uuml   "&#220;"> <!-- latin capital letter U with diaeresis, U+00DC ISOlat1 -->
    86 <!ENTITY Yacute "&#221;"> <!-- latin capital letter Y with acute, U+00DD ISOlat1 -->
    87 <!ENTITY THORN  "&#222;"> <!-- latin capital letter THORN, U+00DE ISOlat1 -->
    88 <!ENTITY szlig  "&#223;"> <!-- latin small letter sharp s = ess-zed, U+00DF ISOlat1 -->
    89 <!ENTITY agrave "&#224;"> <!-- latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1 -->
    90 <!ENTITY aacute "&#225;"> <!-- latin small letter a with acute, U+00E1 ISOlat1 -->
    91 <!ENTITY acirc  "&#226;"> <!-- latin small letter a with circumflex, U+00E2 ISOlat1 -->
    92 <!ENTITY atilde "&#227;"> <!-- latin small letter a with tilde, U+00E3 ISOlat1 -->
    93 <!ENTITY auml   "&#228;"> <!-- latin small letter a with diaeresis, U+00E4 ISOlat1 -->
    94 <!ENTITY aring  "&#229;"> <!-- latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1 -->
    95 <!ENTITY aelig  "&#230;"> <!-- latin small letter ae = latin small ligature ae, U+00E6 ISOlat1 -->
    96 <!ENTITY ccedil "&#231;"> <!-- latin small letter c with cedilla, U+00E7 ISOlat1 -->
    97 <!ENTITY egrave "&#232;"> <!-- latin small letter e with grave, U+00E8 ISOlat1 -->
    98 <!ENTITY eacute "&#233;"> <!-- latin small letter e with acute, U+00E9 ISOlat1 -->
    99 <!ENTITY ecirc  "&#234;"> <!-- latin small letter e with circumflex, U+00EA ISOlat1 -->
   100 <!ENTITY euml   "&#235;"> <!-- latin small letter e with diaeresis, U+00EB ISOlat1 -->
   101 <!ENTITY igrave "&#236;"> <!-- latin small letter i with grave, U+00EC ISOlat1 -->
   102 <!ENTITY iacute "&#237;"> <!-- latin small letter i with acute, U+00ED ISOlat1 -->
   103 <!ENTITY icirc  "&#238;"> <!-- latin small letter i with circumflex, U+00EE ISOlat1 -->
   104 <!ENTITY iuml   "&#239;"> <!-- latin small letter i with diaeresis, U+00EF ISOlat1 -->
   105 <!ENTITY eth    "&#240;"> <!-- latin small letter eth, U+00F0 ISOlat1 -->
   106 <!ENTITY ntilde "&#241;"> <!-- latin small letter n with tilde, U+00F1 ISOlat1 -->
   107 <!ENTITY ograve "&#242;"> <!-- latin small letter o with grave, U+00F2 ISOlat1 -->
   108 <!ENTITY oacute "&#243;"> <!-- latin small letter o with acute, U+00F3 ISOlat1 -->
   109 <!ENTITY ocirc  "&#244;"> <!-- latin small letter o with circumflex, U+00F4 ISOlat1 -->
   110 <!ENTITY otilde "&#245;"> <!-- latin small letter o with tilde, U+00F5 ISOlat1 -->
   111 <!ENTITY ouml   "&#246;"> <!-- latin small letter o with diaeresis, U+00F6 ISOlat1 -->
   112 <!ENTITY divide "&#247;"> <!-- division sign, U+00F7 ISOnum -->
   113 <!ENTITY oslash "&#248;"> <!-- latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1 -->
   114 <!ENTITY ugrave "&#249;"> <!-- latin small letter u with grave, U+00F9 ISOlat1 -->
   115 <!ENTITY uacute "&#250;"> <!-- latin small letter u with acute, U+00FA ISOlat1 -->
   116 <!ENTITY ucirc  "&#251;"> <!-- latin small letter u with circumflex, U+00FB ISOlat1 -->
   117 <!ENTITY uuml   "&#252;"> <!-- latin small letter u with diaeresis, U+00FC ISOlat1 -->
   118 <!ENTITY yacute "&#253;"> <!-- latin small letter y with acute, U+00FD ISOlat1 -->
   119 <!ENTITY thorn  "&#254;"> <!-- latin small letter thorn, U+00FE ISOlat1 -->
   120 <!ENTITY yuml   "&#255;"> <!-- latin small letter y with diaeresis, U+00FF ISOlat1 -->
   121 
   122 <!-- Latin Extended-B -->
   123 <!ENTITY fnof     "&#402;"> <!-- latin small f with hook = function                                    = florin, U+0192 ISOtech -->
   124 
   125 <!-- Greek -->
   126 <!ENTITY Alpha    "&#913;"> <!-- greek capital letter alpha, U+0391 -->
   127 <!ENTITY Beta     "&#914;"> <!-- greek capital letter beta, U+0392 -->
   128 <!ENTITY Gamma    "&#915;"> <!-- greek capital letter gamma,
   129                                     U+0393 ISOgrk3 -->
   130 <!ENTITY Delta    "&#916;"> <!-- greek capital letter delta,
   131                                     U+0394 ISOgrk3 -->
   132 <!ENTITY Epsilon  "&#917;"> <!-- greek capital letter epsilon, U+0395 -->
   133 <!ENTITY Zeta     "&#918;"> <!-- greek capital letter zeta, U+0396 -->
   134 <!ENTITY Eta      "&#919;"> <!-- greek capital letter eta, U+0397 -->
   135 <!ENTITY Theta    "&#920;"> <!-- greek capital letter theta,
   136                                     U+0398 ISOgrk3 -->
   137 <!ENTITY Iota     "&#921;"> <!-- greek capital letter iota, U+0399 -->
   138 <!ENTITY Kappa    "&#922;"> <!-- greek capital letter kappa, U+039A -->
   139 <!ENTITY Lambda   "&#923;"> <!-- greek capital letter lambda,
   140                                     U+039B ISOgrk3 -->
   141 <!ENTITY Mu       "&#924;"> <!-- greek capital letter mu, U+039C -->
   142 <!ENTITY Nu       "&#925;"> <!-- greek capital letter nu, U+039D -->
   143 <!ENTITY Xi       "&#926;"> <!-- greek capital letter xi, U+039E ISOgrk3 -->
   144 <!ENTITY Omicron  "&#927;"> <!-- greek capital letter omicron, U+039F -->
   145 <!ENTITY Pi       "&#928;"> <!-- greek capital letter pi, U+03A0 ISOgrk3 -->
   146 <!ENTITY Rho      "&#929;"> <!-- greek capital letter rho, U+03A1 -->
   147 <!-- there is no Sigmaf, and no U+03A2 character either -->
   148 <!ENTITY Sigma    "&#931;"> <!-- greek capital letter sigma,
   149                                     U+03A3 ISOgrk3 -->
   150 <!ENTITY Tau      "&#932;"> <!-- greek capital letter tau, U+03A4 -->
   151 <!ENTITY Upsilon  "&#933;"> <!-- greek capital letter upsilon,
   152                                     U+03A5 ISOgrk3 -->
   153 <!ENTITY Phi      "&#934;"> <!-- greek capital letter phi,
   154                                     U+03A6 ISOgrk3 -->
   155 <!ENTITY Chi      "&#935;"> <!-- greek capital letter chi, U+03A7 -->
   156 <!ENTITY Psi      "&#936;"> <!-- greek capital letter psi,
   157                                     U+03A8 ISOgrk3 -->
   158 <!ENTITY Omega    "&#937;"> <!-- greek capital letter omega,
   159                                     U+03A9 ISOgrk3 -->
   160 
   161 <!ENTITY alpha    "&#945;"> <!-- greek small letter alpha,
   162                                     U+03B1 ISOgrk3 -->
   163 <!ENTITY beta     "&#946;"> <!-- greek small letter beta, U+03B2 ISOgrk3 -->
   164 <!ENTITY gamma    "&#947;"> <!-- greek small letter gamma,
   165                                     U+03B3 ISOgrk3 -->
   166 <!ENTITY delta    "&#948;"> <!-- greek small letter delta,
   167                                     U+03B4 ISOgrk3 -->
   168 <!ENTITY epsilon  "&#949;"> <!-- greek small letter epsilon,
   169                                     U+03B5 ISOgrk3 -->
   170 <!ENTITY zeta     "&#950;"> <!-- greek small letter zeta, U+03B6 ISOgrk3 -->
   171 <!ENTITY eta      "&#951;"> <!-- greek small letter eta, U+03B7 ISOgrk3 -->
   172 <!ENTITY theta    "&#952;"> <!-- greek small letter theta,
   173                                     U+03B8 ISOgrk3 -->
   174 <!ENTITY iota     "&#953;"> <!-- greek small letter iota, U+03B9 ISOgrk3 -->
   175 <!ENTITY kappa    "&#954;"> <!-- greek small letter kappa,
   176                                     U+03BA ISOgrk3 -->
   177 <!ENTITY lambda   "&#955;"> <!-- greek small letter lambda,
   178                                     U+03BB ISOgrk3 -->
   179 <!ENTITY mu       "&#956;"> <!-- greek small letter mu, U+03BC ISOgrk3 -->
   180 <!ENTITY nu       "&#957;"> <!-- greek small letter nu, U+03BD ISOgrk3 -->
   181 <!ENTITY xi       "&#958;"> <!-- greek small letter xi, U+03BE ISOgrk3 -->
   182 <!ENTITY omicron  "&#959;"> <!-- greek small letter omicron, U+03BF NEW -->
   183 <!ENTITY pi       "&#960;"> <!-- greek small letter pi, U+03C0 ISOgrk3 -->
   184 <!ENTITY rho      "&#961;"> <!-- greek small letter rho, U+03C1 ISOgrk3 -->
   185 <!ENTITY sigmaf   "&#962;"> <!-- greek small letter final sigma,
   186                                     U+03C2 ISOgrk3 -->
   187 <!ENTITY sigma    "&#963;"> <!-- greek small letter sigma,
   188                                     U+03C3 ISOgrk3 -->
   189 <!ENTITY tau      "&#964;"> <!-- greek small letter tau, U+03C4 ISOgrk3 -->
   190 <!ENTITY upsilon  "&#965;"> <!-- greek small letter upsilon,
   191                                     U+03C5 ISOgrk3 -->
   192 <!ENTITY phi      "&#966;"> <!-- greek small letter phi, U+03C6 ISOgrk3 -->
   193 <!ENTITY chi      "&#967;"> <!-- greek small letter chi, U+03C7 ISOgrk3 -->
   194 <!ENTITY psi      "&#968;"> <!-- greek small letter psi, U+03C8 ISOgrk3 -->
   195 <!ENTITY omega    "&#969;"> <!-- greek small letter omega,
   196                                     U+03C9 ISOgrk3 -->
   197 <!ENTITY thetasym "&#977;"> <!-- greek small letter theta symbol,
   198                                     U+03D1 NEW -->
   199 <!ENTITY upsih    "&#978;"> <!-- greek upsilon with hook symbol,
   200                                     U+03D2 NEW -->
   201 <!ENTITY piv      "&#982;"> <!-- greek pi symbol, U+03D6 ISOgrk3 -->
   202 
   203 <!-- General Punctuation -->
   204 <!ENTITY bull     "&#8226;"> <!-- bullet = black small circle,
   205                                      U+2022 ISOpub  -->
   206 <!-- bullet is NOT the same as bullet operator, U+2219 -->
   207 <!ENTITY hellip   "&#8230;"> <!-- horizontal ellipsis = three dot leader,
   208                                      U+2026 ISOpub  -->
   209 <!ENTITY prime    "&#8242;"> <!-- prime = minutes = feet, U+2032 ISOtech -->
   210 <!ENTITY Prime    "&#8243;"> <!-- double prime = seconds = inches,
   211                                      U+2033 ISOtech -->
   212 <!ENTITY oline    "&#8254;"> <!-- overline = spacing overscore,
   213                                      U+203E NEW -->
   214 <!ENTITY frasl    "&#8260;"> <!-- fraction slash, U+2044 NEW -->
   215 
   216 <!-- Letterlike Symbols -->
   217 <!ENTITY weierp   "&#8472;"> <!-- script capital P = power set
   218                                      = Weierstrass p, U+2118 ISOamso -->
   219 <!ENTITY image    "&#8465;"> <!-- blackletter capital I = imaginary part,
   220                                      U+2111 ISOamso -->
   221 <!ENTITY real     "&#8476;"> <!-- blackletter capital R = real part symbol,
   222                                      U+211C ISOamso -->
   223 <!ENTITY trade    "&#8482;"> <!-- trade mark sign, U+2122 ISOnum -->
   224 <!ENTITY alefsym  "&#8501;"> <!-- alef symbol = first transfinite cardinal,
   225                                      U+2135 NEW -->
   226 <!-- alef symbol is NOT the same as hebrew letter alef,
   227      U+05D0 although the same glyph could be used to depict both characters -->
   228 
   229 <!-- Arrows -->
   230 <!ENTITY larr     "&#8592;"> <!-- leftwards arrow, U+2190 ISOnum -->
   231 <!ENTITY uarr     "&#8593;"> <!-- upwards arrow, U+2191 ISOnum-->
   232 <!ENTITY rarr     "&#8594;"> <!-- rightwards arrow, U+2192 ISOnum -->
   233 <!ENTITY darr     "&#8595;"> <!-- downwards arrow, U+2193 ISOnum -->
   234 <!ENTITY harr     "&#8596;"> <!-- left right arrow, U+2194 ISOamsa -->
   235 <!ENTITY crarr    "&#8629;"> <!-- downwards arrow with corner leftwards
   236                                      = carriage return, U+21B5 NEW -->
   237 <!ENTITY lArr     "&#8656;"> <!-- leftwards double arrow, U+21D0 ISOtech -->
   238 <!-- ISO 10646 does not say that lArr is the same as the 'is implied by' arrow
   239     but also does not have any other character for that function. So ? lArr can
   240     be used for 'is implied by' as ISOtech suggests -->
   241 <!ENTITY uArr     "&#8657;"> <!-- upwards double arrow, U+21D1 ISOamsa -->
   242 <!ENTITY rArr     "&#8658;"> <!-- rightwards double arrow,
   243                                      U+21D2 ISOtech -->
   244 <!-- ISO 10646 does not say this is the 'implies' character but does not have
   245      another character with this function so ?
   246      rArr can be used for 'implies' as ISOtech suggests -->
   247 <!ENTITY dArr     "&#8659;"> <!-- downwards double arrow, U+21D3 ISOamsa -->
   248 <!ENTITY hArr     "&#8660;"> <!-- left right double arrow,
   249                                      U+21D4 ISOamsa -->
   250 
   251 <!-- Mathematical Operators -->
   252 <!ENTITY forall   "&#8704;"> <!-- for all, U+2200 ISOtech -->
   253 <!ENTITY part     "&#8706;"> <!-- partial differential, U+2202 ISOtech  -->
   254 <!ENTITY exist    "&#8707;"> <!-- there exists, U+2203 ISOtech -->
   255 <!ENTITY empty    "&#8709;"> <!-- empty set = null set = diameter,
   256                                      U+2205 ISOamso -->
   257 <!ENTITY nabla    "&#8711;"> <!-- nabla = backward difference,
   258                                      U+2207 ISOtech -->
   259 <!ENTITY isin     "&#8712;"> <!-- element of, U+2208 ISOtech -->
   260 <!ENTITY notin    "&#8713;"> <!-- not an element of, U+2209 ISOtech -->
   261 <!ENTITY ni       "&#8715;"> <!-- contains as member, U+220B ISOtech -->
   262 <!-- should there be a more memorable name than 'ni'? -->
   263 <!ENTITY prod     "&#8719;"> <!-- n-ary product = product sign,
   264                                      U+220F ISOamsb -->
   265 <!-- prod is NOT the same character as U+03A0 'greek capital letter pi' though
   266      the same glyph might be used for both -->
   267 <!ENTITY sum      "&#8721;"> <!-- n-ary sumation, U+2211 ISOamsb -->
   268 <!-- sum is NOT the same character as U+03A3 'greek capital letter sigma'
   269      though the same glyph might be used for both -->
   270 <!ENTITY minus    "&#8722;"> <!-- minus sign, U+2212 ISOtech -->
   271 <!ENTITY lowast   "&#8727;"> <!-- asterisk operator, U+2217 ISOtech -->
   272 <!ENTITY radic    "&#8730;"> <!-- square root = radical sign,
   273                                      U+221A ISOtech -->
   274 <!ENTITY prop     "&#8733;"> <!-- proportional to, U+221D ISOtech -->
   275 <!ENTITY infin    "&#8734;"> <!-- infinity, U+221E ISOtech -->
   276 <!ENTITY ang      "&#8736;"> <!-- angle, U+2220 ISOamso -->
   277 <!ENTITY and      "&#8743;"> <!-- logical and = wedge, U+2227 ISOtech -->
   278 <!ENTITY or       "&#8744;"> <!-- logical or = vee, U+2228 ISOtech -->
   279 <!ENTITY cap      "&#8745;"> <!-- intersection = cap, U+2229 ISOtech -->
   280 <!ENTITY cup      "&#8746;"> <!-- union = cup, U+222A ISOtech -->
   281 <!ENTITY int      "&#8747;"> <!-- integral, U+222B ISOtech -->
   282 <!ENTITY there4   "&#8756;"> <!-- therefore, U+2234 ISOtech -->
   283 <!ENTITY sim      "&#8764;"> <!-- tilde operator = varies with = similar to,
   284                                      U+223C ISOtech -->
   285 <!-- tilde operator is NOT the same character as the tilde, U+007E,
   286      although the same glyph might be used to represent both  -->
   287 <!ENTITY cong     "&#8773;"> <!-- approximately equal to, U+2245 ISOtech -->
   288 <!ENTITY asymp    "&#8776;"> <!-- almost equal to = asymptotic to,
   289                                      U+2248 ISOamsr -->
   290 <!ENTITY ne       "&#8800;"> <!-- not equal to, U+2260 ISOtech -->
   291 <!ENTITY equiv    "&#8801;"> <!-- identical to, U+2261 ISOtech -->
   292 <!ENTITY le       "&#8804;"> <!-- less-than or equal to, U+2264 ISOtech -->
   293 <!ENTITY ge       "&#8805;"> <!-- greater-than or equal to,
   294                                      U+2265 ISOtech -->
   295 <!ENTITY sub      "&#8834;"> <!-- subset of, U+2282 ISOtech -->
   296 <!ENTITY sup      "&#8835;"> <!-- superset of, U+2283 ISOtech -->
   297 <!-- note that nsup, 'not a superset of, U+2283' is not covered by the Symbol
   298      font encoding and is not included. Should it be, for symmetry?
   299      It is in ISOamsn  -->
   300 <!ENTITY nsub     "&#8836;"> <!-- not a subset of, U+2284 ISOamsn -->
   301 <!ENTITY sube     "&#8838;"> <!-- subset of or equal to, U+2286 ISOtech -->
   302 <!ENTITY supe     "&#8839;"> <!-- superset of or equal to,
   303                                      U+2287 ISOtech -->
   304 <!ENTITY oplus    "&#8853;"> <!-- circled plus = direct sum,
   305                                      U+2295 ISOamsb -->
   306 <!ENTITY otimes   "&#8855;"> <!-- circled times = vector product,
   307                                      U+2297 ISOamsb -->
   308 <!ENTITY perp     "&#8869;"> <!-- up tack = orthogonal to = perpendicular,
   309                                      U+22A5 ISOtech -->
   310 <!ENTITY sdot     "&#8901;"> <!-- dot operator, U+22C5 ISOamsb -->
   311 <!-- dot operator is NOT the same character as U+00B7 middle dot -->
   312 
   313 <!-- Miscellaneous Technical -->
   314 <!ENTITY lceil    "&#8968;"> <!-- left ceiling = apl upstile,
   315                                      U+2308 ISOamsc  -->
   316 <!ENTITY rceil    "&#8969;"> <!-- right ceiling, U+2309 ISOamsc  -->
   317 <!ENTITY lfloor   "&#8970;"> <!-- left floor = apl downstile,
   318                                      U+230A ISOamsc  -->
   319 <!ENTITY rfloor   "&#8971;"> <!-- right floor, U+230B ISOamsc  -->
   320 <!ENTITY lang     "&#9001;"> <!-- left-pointing angle bracket = bra,
   321                                      U+2329 ISOtech -->
   322 <!-- lang is NOT the same character as U+003C 'less than'
   323      or U+2039 'single left-pointing angle quotation mark' -->
   324 <!ENTITY rang     "&#9002;"> <!-- right-pointing angle bracket = ket,
   325                                      U+232A ISOtech -->
   326 <!-- rang is NOT the same character as U+003E 'greater than'
   327      or U+203A 'single right-pointing angle quotation mark' -->
   328 
   329 <!-- Geometric Shapes -->
   330 <!ENTITY loz      "&#9674;"> <!-- lozenge, U+25CA ISOpub -->
   331 
   332 <!-- Miscellaneous Symbols -->
   333 <!ENTITY spades   "&#9824;"> <!-- black spade suit, U+2660 ISOpub -->
   334 <!-- black here seems to mean filled as opposed to hollow -->
   335 <!ENTITY clubs    "&#9827;"> <!-- black club suit = shamrock,
   336                                      U+2663 ISOpub -->
   337 <!ENTITY hearts   "&#9829;"> <!-- black heart suit = valentine,
   338                                      U+2665 ISOpub -->
   339 <!ENTITY diams    "&#9830;"> <!-- black diamond suit, U+2666 ISOpub -->
   340 
   341 <!-- C0 Controls and Basic Latin -->
   342 <!ENTITY quot    "&#34;"> <!-- quotation mark = APL quote,
   343                                     U+0022 ISOnum -->
   344 <!ENTITY amp     "&#38;"> <!-- ampersand, U+0026 ISOnum -->
   345 <!ENTITY lt      "&#60;"> <!-- less-than sign, U+003C ISOnum -->
   346 <!ENTITY gt      "&#62;"> <!-- greater-than sign, U+003E ISOnum -->
   347 
   348 <!-- Latin Extended-A -->
   349 <!ENTITY OElig   "&#338;"> <!-- latin capital ligature OE,
   350                                     U+0152 ISOlat2 -->
   351 <!ENTITY oelig   "&#339;"> <!-- latin small ligature oe, U+0153 ISOlat2 -->
   352 <!-- ligature is a misnomer, this is a separate character in some languages -->
   353 <!ENTITY Scaron  "&#352;"> <!-- latin capital letter S with caron,
   354                                     U+0160 ISOlat2 -->
   355 <!ENTITY scaron  "&#353;"> <!-- latin small letter s with caron,
   356                                     U+0161 ISOlat2 -->
   357 <!ENTITY Yuml    "&#376;"> <!-- latin capital letter Y with diaeresis,
   358                                     U+0178 ISOlat2 -->
   359 
   360 <!-- Spacing Modifier Letters -->
   361 <!ENTITY circ    "&#710;"> <!-- modifier letter circumflex accent,
   362                                     U+02C6 ISOpub -->
   363 <!ENTITY tilde   "&#732;"> <!-- small tilde, U+02DC ISOdia -->
   364 
   365 <!-- General Punctuation -->
   366 <!ENTITY ensp    "&#8194;"> <!-- en space, U+2002 ISOpub -->
   367 <!ENTITY emsp    "&#8195;"> <!-- em space, U+2003 ISOpub -->
   368 <!ENTITY thinsp  "&#8201;"> <!-- thin space, U+2009 ISOpub -->
   369 <!ENTITY zwnj    "&#8204;"> <!-- zero width non-joiner,
   370                                     U+200C NEW RFC 2070 -->
   371 <!ENTITY zwj     "&#8205;"> <!-- zero width joiner, U+200D NEW RFC 2070 -->
   372 <!ENTITY lrm     "&#8206;"> <!-- left-to-right mark, U+200E NEW RFC 2070 -->
   373 <!ENTITY rlm     "&#8207;"> <!-- right-to-left mark, U+200F NEW RFC 2070 -->
   374 <!ENTITY ndash   "&#8211;"> <!-- en dash, U+2013 ISOpub -->
   375 <!ENTITY mdash   "&#8212;"> <!-- em dash, U+2014 ISOpub -->
   376 <!ENTITY lsquo   "&#8216;"> <!-- left single quotation mark,
   377                                     U+2018 ISOnum -->
   378 <!ENTITY rsquo   "&#8217;"> <!-- right single quotation mark,
   379                                     U+2019 ISOnum -->
   380 <!ENTITY sbquo   "&#8218;"> <!-- single low-9 quotation mark, U+201A NEW -->
   381 <!ENTITY ldquo   "&#8220;"> <!-- left double quotation mark,
   382                                     U+201C ISOnum -->
   383 <!ENTITY rdquo   "&#8221;"> <!-- right double quotation mark,
   384                                     U+201D ISOnum -->
   385 <!ENTITY bdquo   "&#8222;"> <!-- double low-9 quotation mark, U+201E NEW -->
   386 <!ENTITY dagger  "&#8224;"> <!-- dagger, U+2020 ISOpub -->
   387 <!ENTITY Dagger  "&#8225;"> <!-- double dagger, U+2021 ISOpub -->
   388 <!ENTITY permil  "&#8240;"> <!-- per mille sign, U+2030 ISOtech -->
   389 <!ENTITY lsaquo  "&#8249;"> <!-- single left-pointing angle quotation mark,
   390                                     U+2039 ISO proposed -->
   391 <!-- lsaquo is proposed but not yet ISO standardized -->
   392 <!ENTITY rsaquo  "&#8250;"> <!-- single right-pointing angle quotation mark,
   393                                     U+203A ISO proposed -->
   394 <!-- rsaquo is proposed but not yet ISO standardized -->
   395 <!ENTITY euro   "&#8364;"> <!-- euro sign, U+20AC NEW -->
   396 
   397 ]>
   398 '''
   399 
   400 class visitor(object):
   401     def do(self, tree):
   402         self.visit_node_list(tree.childNodes)
   403 
   404     def visit_node_list(self, nodelist):
   405         for node in nodelist:
   406             self.visit(node)
   407 
   408     def visit(self, node):
   409         nodeType = node.nodeType
   410         if node.nodeType == Node.ELEMENT_NODE:
   411             return self.visit_element(node)
   412         elif node.nodeType == Node.ATTRIBUTE_NODE:
   413             return self.visit_attribute(node)
   414         elif node.nodeType == Node.TEXT_NODE:
   415             return self.visit_text(node)
   416         elif node.nodeType == Node.CDATA_SECTION_NODE:
   417             return self.visit_cdata_section(node)
   418 
   419     def visit_element(self, node):
   420         if len(node.childNodes):
   421             self.visit_node_list(node.childNodes)
   422 
   423     def visit_attribute(self, node):
   424         pass
   425 
   426     def visit_text(self, node):
   427         pass
   428 
   429     def visit_cdata_section(self, node):
   430         pass
   431 
   432 
   433 class strip_whitespace(visitor):
   434 
   435     def visit_element(self, node):
   436         if node.localName == 'p':
   437             # XXX: our formatter adds a whitespace at the end of each paragraph
   438             if node.hasChildNodes() and node.childNodes[-1].nodeType == Node.TEXT_NODE:
   439                 data = node.childNodes[-1].data.rstrip('\n ')
   440                 # Remove it if empty
   441                 if data == '':
   442                     node.removeChild(node.childNodes[-1])
   443                 else:
   444                     node.childNodes[-1].data = data
   445             # Remove empty paragraphs
   446             if not node.hasChildNodes():
   447                 node.parentNode.removeChild(node)
   448 
   449         if node.hasChildNodes():
   450             self.visit_node_list(node.childNodes)
   451 
   452 
   453 class convert_tree(visitor):
   454     white_space = object()
   455     new_line = object()
   456     new_line_dont_remove = object()
   457 
   458     def __init__(self, request, pagename):
   459         self.request = request
   460         self.pagename = pagename
   461 
   462     def do(self, tree):
   463         self.depth = 0
   464         self.text = []
   465         self.visit(tree.documentElement)
   466         self.check_whitespace()
   467         return ''.join(self.text)
   468 
   469     def check_whitespace(self):
   470         i = 0
   471         text = self.text
   472         while i < len(text):
   473             if text[i] is self.white_space:
   474                 if i == 0 or i == len(text)-1:
   475                     del text[i]
   476                 elif text[i-1].endswith(" ") or text[i-1].endswith("\n"):
   477                     # last char of previous element is whitespace
   478                     del text[i]
   479                 elif (text[i+1] is self.white_space or
   480                       # next element is white_space
   481                       text[i+1] is self.new_line):
   482                       # or new_line
   483                     del text[i]
   484                 elif text[i+1].startswith(" ") or text[i+1].startswith("\n"):
   485                     # first char of next element is whitespace
   486                     del text[i]
   487                 else:
   488                     text[i] = " "
   489                     i += 1
   490             elif text[i] is self.new_line:
   491                 if i == 0:
   492                     del text[i]
   493                 elif i == len(text) - 1:
   494                     text[i] = "\n"
   495                     i += 1
   496                 elif text[i-1].endswith("\n") or (
   497                       isinstance(text[i+1], str) and text[i+1].startswith("\n")):
   498                     del text[i]
   499                 else:
   500                     text[i] = "\n"
   501                     i += 1
   502             elif text[i] is self.new_line_dont_remove:
   503                 text[i] = "\n"
   504                 i += 1
   505             else:
   506                 i += 1
   507 
   508     def visit_text(self, node):
   509         self.text.append(node.data)
   510 
   511     def visit_element(self, node):
   512         name = node.localName
   513         if name is None: # not sure this can happen here (DOM comment node), but just for the case
   514             return
   515         func = getattr(self, "process_%s" % name, None)
   516         if func:
   517             func(node)
   518         else:
   519             self.process_inline(node)
   520 
   521     def visit_node_list_element_only(self, nodelist):
   522         for node in nodelist:
   523             if node.nodeType == Node.ELEMENT_NODE:
   524                 self.visit_element(node)
   525 
   526     def node_list_text_only(self, nodelist):
   527         result = []
   528         for node in nodelist:
   529             if node.nodeType == Node.TEXT_NODE:
   530                 result.append(node.data)
   531             else:
   532                 result.extend(self.node_list_text_only(node.childNodes))
   533         return "".join(result)
   534 
   535     def get_desc(self, nodelist):
   536         """ links can have either text or an image as description - we extract
   537             this from the child nodelist and return wiki markup.
   538         """
   539         markup = ''
   540         text = self.node_list_text_only(nodelist).replace("\n", " ").strip()
   541         if text:
   542             # found some text
   543             markup = text
   544         else:
   545             # search for an img / object
   546             for node in nodelist:
   547                 if node.nodeType == Node.ELEMENT_NODE:
   548                     name = node.localName
   549                     if name == 'img':
   550                         markup = self._process_img(node) # XXX problem: markup containts auto-generated alt text with link target
   551                         break
   552                     elif name == 'object':
   553                         markup = self._process_object(node)
   554                         break
   555         return markup
   556 
   557     def process_page(self, node):
   558         for i in node.childNodes:
   559             if i.nodeType == Node.ELEMENT_NODE:
   560                 self.visit_element(i)
   561             elif i.nodeType == Node.TEXT_NODE: # if this is missing, all std text under a headline is dropped!
   562                 txt = i.data.strip() # IMPORTANT: don't leave this unstripped or there will be wrong blanks
   563                 if txt:
   564                     self.text.append(txt)
   565             #we use <pre class="comment"> now, so this is currently unused:
   566             #elif i.nodeType == Node.COMMENT_NODE:
   567             #    self.text.append(i.data)
   568             #    self.text.append("\n")
   569 
   570     def process_br(self, node):
   571         self.text.append(self.new_line) # without this, std multi-line text below some heading misses a whitespace
   572                                         # when it gets merged to float text, like word word wordword word word
   573 
   574     def process_heading(self, node):
   575         text = self.node_list_text_only(node.childNodes).strip()
   576         if text:
   577             depth = int(node.localName[1])
   578             hstr = "=" * depth
   579             self.text.append(self.new_line)
   580             self.text.append("%s %s %s" % (hstr, text.replace("\n", " "), hstr))
   581             self.text.append(self.new_line)
   582 
    # all heading levels share one handler, which reads the level from the
    # digit in the tag name
    process_h1 = process_heading
    process_h2 = process_heading
    process_h3 = process_heading
    process_h4 = process_heading
    process_h5 = process_heading
    process_h6 = process_heading
   589 
   590     def _get_list_item_markup(self, list, listitem):
   591         before = ""
   592         #indent = str(self.depth) * self.depth # nice for debugging :)
   593         indent = " " * self.depth
   594         markup = ""
   595         name = list.localName
   596         if name == 'ol':
   597             class_ = listitem.getAttribute("class")
   598             if class_ == "gap":
   599                 before = self.new_line_dont_remove
   600             if list.hasAttribute("type"):
   601                 type = list.getAttribute("type")
   602             else:
   603                 type = "1"
   604             markup = "%s. " % type
   605         elif name == 'ul':
   606             class_ = listitem.getAttribute("class")
   607             if class_ == "gap":
   608                 before = self.new_line_dont_remove
   609             style = listitem.getAttribute("style")
   610             if re.match(ur"list-style-type:\s*none", style, re.I):
   611                 markup = ". "
   612                 # set markup with white space when list element containes table
   613                 for i in listitem.childNodes:
   614                     if i.nodeType == Node.ELEMENT_NODE:
   615                         if i.localName == 'table':
   616                             markup = ""
   617             else:
   618                 markup = "* "
   619         elif name == 'dl':
   620             markup = ":: "
   621         else:
   622             raise ConvertError("Illegal list type %s" % name)
   623         return before, indent, markup
   624 
   625     def process_dl(self, node):
   626         self.depth += 1
   627         markup = ":: " # can there be a dl dd without dt?
   628         for i in node.childNodes:
   629             if i.nodeType == Node.ELEMENT_NODE:
   630                 name = i.localName
   631                 if name == 'dt':
   632                     before, indent, markup = self._get_list_item_markup(node, i)
   633                     self.text.extend([before, indent])
   634                     text = self.node_list_text_only(i.childNodes)
   635                     self.text.append(text.replace("\n", " "))
   636                 elif name == 'dd':
   637                     self.text.append(markup)
   638                     self.process_list_item(i, indent) # XXX no dt -> indent is undefined!!!
   639                 else:
   640                     raise ConvertError("Illegal list element %s" % i.localName)
   641         self.depth -= 1
   642         if self.depth == 0:
   643             self.text.append(self.new_line_dont_remove)
   644 
   645     def process_list(self, node):
   646         self.depth += 1
   647         for i in node.childNodes:
   648             if i.nodeType == Node.ELEMENT_NODE:
   649                 name = i.localName
   650                 if name == 'li':
   651                     before, indent, markup = self._get_list_item_markup(node, i)
   652                     self.text.extend([before, indent, markup])
   653                     self.process_list_item(i, indent)
   654                 elif name in ('ol', 'ul', ):
   655                     self.process_list(i)
   656                 elif name == 'dl':
   657                     self.process_dl(i)
   658                 else:
   659                     raise ConvertError("Illegal list element %s" % i.localName)
   660         self.depth -= 1
   661         if self.depth == 0:
   662             self.text.append(self.new_line_dont_remove)
   663 
    # ordered and unordered lists share one handler; the bullet markup is
    # chosen per item by _get_list_item_markup
    process_ul = process_list
    process_ol = process_list
   666 
   667     def empty_paragraph_queue(self, nodelist, indent, need_indent):
   668         if need_indent:
   669             self.text.append(indent)
   670         for i in nodelist:
   671             if i.nodeType == Node.ELEMENT_NODE:
   672                 if i.localName == 'br':
   673                     self.text.append('<<BR>>')
   674                 else:
   675                     self.process_inline(i)
   676             elif i.nodeType == Node.TEXT_NODE:
   677                 self.text.append(i.data.strip('\n').replace('\n', ' '))
   678         self.text.append(self.new_line)
   679         del nodelist[:]
   680 
   681     def process_list_item(self, node, indent):
   682         found = False
   683         need_indent = False
   684         pending = []
   685         for i in node.childNodes:
   686             name = i.localName
   687 
   688             if name in ('p', 'pre', 'ol', 'ul', 'dl', 'table', ) and pending:
   689                 self.empty_paragraph_queue(pending, indent, need_indent)
   690                 need_indent = True
   691 
   692             if name == 'p':
   693                 if need_indent:
   694                     self.text.append(indent)
   695                 self.process_paragraph_item(i)
   696                 self.text.append(self.new_line)
   697                 found = True
   698             elif name == 'pre':
   699                 if need_indent:
   700                     self.text.append(indent)
   701                 self.process_preformatted_item(i)
   702                 found = True
   703             elif name in ('ol', 'ul', ):
   704                 self.process_list(i)
   705                 found = True
   706             elif name == 'dl':
   707                 self.process_dl(i)
   708                 found = True
   709             elif name == 'table':
   710                 if need_indent:
   711                     self.text.append(indent)
   712                 self.process_table(i)
   713                 found = True
   714             elif name == 'br':
   715                 pending.append(i)
   716             else:
   717                 pending.append(i)
   718 
   719             if found:
   720                 need_indent = True
   721 
   722         if pending:
   723             self.empty_paragraph_queue(pending, indent, need_indent)
   724 
   725     def process_blockquote(self, node):
   726         # XXX this does not really work. e.g.:
   727         # <bq>aaaaaa
   728         # <hr---------->
   729         # <bq>bbbbbb
   730         self.depth += 1
   731         for i in node.childNodes:
   732             if i.nodeType == Node.ELEMENT_NODE:
   733                 name = i.localName
   734                 if name == 'p':
   735                     self.text.append(self.new_line)
   736                     self.text.append(" " * self.depth)
   737                     self.process_p(i)
   738                 elif name == 'pre':
   739                     self.text.append(self.new_line)
   740                     self.text.append(" " * self.depth)
   741                     self.process_pre(i)
   742                 elif name in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', ):
   743                     self.process_heading(i)
   744                 elif name in ('ol', 'ul', ):
   745                     self.process_list(i)
   746                 elif name == 'dl':
   747                     self.process_dl(i)
   748                 elif name == 'a':
   749                     self.process_a(i)
   750                 elif name == 'img':
   751                     self.process_img(i)
   752                 elif name == 'div':
   753                     self.visit_node_list_element_only(i.childNodes)
   754                 elif name == 'blockquote':
   755                     self.process_blockquote(i)
   756                 elif name == 'hr':
   757                     self.process_hr(i)
   758                 elif name == 'br':
   759                     self.process_br(i)
   760                 else:
   761                     raise ConvertError("process_blockquote: Don't support %s element" % name)
   762         self.depth -= 1
   763 
   764     def process_inline(self, node):
   765         if node.nodeType == Node.TEXT_NODE:
   766             self.text.append(node.data.strip('\n').replace('\n', ' '))
   767             return
   768 
   769         # do we need to check for Node.ELEMENT_NODE and return (do nothing)?
   770         name = node.localName # can be None for DOM Comment nodes
   771         if name is None:
   772             return
   773 
   774         # unsupported tags
   775         if name in (u'title', u'meta', u'style'):
   776             return
   777 
   778         if name in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', ): # headers are not allowed here (e.g. inside a ul li),
   779             text = self.node_list_text_only(node.childNodes).strip() # but can be inserted via the editor
   780             self.text.append(text)                          # so we just drop the header markup and keep the text
   781             return
   782 
   783         func = getattr(self, "process_%s" % name, None)
   784         if func:
   785             func(node)
   786             return
   787 
   788         command_close = None
   789         if name in ('em', 'i', ):
   790             command = "''"
   791         elif name in ('strong', 'b', ):
   792             command = "'''"
   793         elif name == 'u':
   794             command = "__"
   795         elif name == 'big':
   796             command = "~+"
   797             command_close = "+~"
   798         elif name == 'small':
   799             command = "~-"
   800             command_close = "-~"
   801         elif name == 'strike':
   802             command = "--("
   803             command_close = ")--"
   804         elif name == 'sub':
   805             command = ",,"
   806         elif name == 'sup':
   807             command = "^"
   808         elif name in ('area', 'center', 'code', 'embed', 'fieldset', 'font', 'form', 'iframe', 'input', 'label', 'link', 'map',
   809                       'meta', 'noscript', 'option', 'script', 'select', 'textarea', 'wbr'):
   810             command = "" # just throw away unsupported elements
   811         else:
   812             raise ConvertError("process_inline: Don't support %s element" % name)
   813 
   814         self.text.append(command)
   815         for i in node.childNodes:
   816             # lonly childnodes checked if they are only 'br'
   817             if command and len(node.childNodes) == 1:
   818                 # formatted br alone is not wanted (who wants a bold br?)
   819                 if i.localName != 'br':
   820                     self.process_inline(i)
   821             else:
   822                 if i.localName == 'br':
   823                     # dont make a real \n because that breaks tables
   824                     self.text.append('<<BR>>')
   825                 else:
   826                     self.process_inline(i)
   827         if command_close:
   828             command = command_close
   829         self.text.append(command)
   830 
   831     def process_span(self, node):
   832         # process span tag for firefox3
   833         node_style = node.getAttribute("style")
   834 
   835         is_strike = node.getAttribute("class") == "strike"
   836         is_strike = is_strike or "line-through" in node_style
   837         is_strong = "bold" in node_style
   838         is_italic = "italic" in node_style
   839         is_underline = "underline" in node_style
   840         is_comment = node.getAttribute("class") == "comment"
   841 
   842         # start tag
   843         if is_comment:
   844             self.text.append("/* ")
   845         if is_strike:
   846             self.text.append("--(")
   847         if is_strong:
   848             self.text.append("'''")
   849         if is_italic:
   850             self.text.append("''")
   851         if is_underline:
   852             self.text.append("__")
   853 
   854         # body
   855         for i in node.childNodes:
   856             self.process_inline(i)
   857 
   858         # end tag
   859         if is_underline:
   860             self.text.append("__")
   861         if is_italic:
   862             self.text.append("''")
   863         if is_strong:
   864             self.text.append("'''")
   865         if is_strike:
   866             self.text.append(")--")
   867         if is_comment:
   868             self.text.append(" */")
   869 
   870     def process_div(self, node):
   871         # process indent
   872         self._process_indent(node)
   873 
   874         # ignore div tags - just descend
   875         for i in node.childNodes:
   876             self.visit(i)
   877 
   878     def process_tt(self, node):
   879         text = self.node_list_text_only(node.childNodes).replace("\n", " ")
   880         if node.getAttribute("class") == "backtick":
   881             self.text.append("`%s`" % text)
   882         else:
   883             self.text.append("{{{%s}}}" % text)
   884 
   885     def process_hr(self, node):
   886         if node.hasAttribute("class"):
   887             class_ = node.getAttribute("class")
   888         else:
   889             class_ = "hr0"
   890         if class_.startswith("hr") and class_[2] in "123456":
   891             length = int(class_[2]) + 4
   892         else:
   893             length = 4
   894         self.text.extend([self.new_line, "-" * length, self.new_line])
   895 
   896     def process_p(self, node):
   897         # process indent
   898         self._process_indent(node)
   899         self.process_paragraph_item(node)
   900         self.text.append("\n\n") # do not use self.new_line here!
   901 
   902     def _process_indent(self, node):
   903         # process indent
   904         node_style = node.getAttribute("style")
   905         match = re.match(r"margin-left:\s*(\d+)px", node_style)
   906         if match:
   907             left_margin = int(match.group(1))
   908             indent_depth = int(left_margin / 40)
   909             if indent_depth > 0:
   910                 self.text.append(' . ')
   911 
   912     def process_paragraph_item(self, node):
   913         for i in node.childNodes:
   914             if i.nodeType == Node.ELEMENT_NODE:
   915                 self.process_inline(i)
   916             elif i.nodeType == Node.TEXT_NODE:
   917                 self.text.append(i.data.strip('\n').replace('\n', ' '))
   918 
   919     def process_pre(self, node):
   920         self.process_preformatted_item(node)
   921         self.text.append(self.new_line)
   922 
   923     def process_preformatted_item(self, node):
   924         if node.hasAttribute("class"):
   925             class_ = node.getAttribute("class")
   926         else:
   927             class_ = None
   928         if class_ == "comment": # we currently use this for stuff like ## or #acl
   929             for i in node.childNodes:
   930                 if i.nodeType == Node.TEXT_NODE:
   931                     self.text.append(i.data.replace('\n', ''))
   932                 elif i.localName == 'br':
   933                     self.text.append(self.new_line)
   934                 else:
   935                     pass
   936         else:
   937             content_buffer = []
   938             longest_inner_formater = ''
   939             bang_args = ''
   940             delimiters = []
   941 
   942             """
   943             below code fixed for MoinMoinBugs/GuiEditorCantNest bug
   944             this has problem when outer delimiter has two more { than inside one
   945             e.g. {{{{{{ {{{ foo }}} }}}}}}  --> {{{{ {{{ foo }}} }}}}
   946                    {{{foo {{{ }}} foo}}} --> {{{{ {{{ }}} }}}}
   947             """
   948 
   949             for i in node.childNodes:
   950                 if i.nodeType == Node.TEXT_NODE:
   951                     # get longest pre tag({{{ or }}}) from content
   952                     delimiters.extend(re.compile("((?u){+)").findall(i.data))
   953                     delimiters.extend(re.compile("((?u)}+)").findall(i.data))
   954                     # when first line is empty, start iteration second line of i.data
   955                     data_lines = i.data.rstrip().split('\n')
   956                     if data_lines[0].strip() == '':
   957                         data_lines = data_lines[1:]
   958                     for line in data_lines:
   959                         if line.strip().startswith('#!'):
   960                             if bang_args == '':
   961                                 bang_args = line.strip()
   962                             else:
   963                                 content_buffer.extend([line, self.new_line])
   964                         else:
   965                             content_buffer.extend([line, self.new_line])
   966                 elif i.localName == 'br':
   967                     content_buffer.append(self.new_line_dont_remove)
   968                 else:
   969                     pass
   970 
   971             if delimiters:
   972                 longest_inner_formater = max(delimiters)
   973 
   974             if (len(longest_inner_formater) >= 3):
   975                 self.text.extend([("{" * (len(longest_inner_formater) + 1)) + bang_args, \
   976                                       self.new_line])
   977                 self.text.extend(content_buffer)
   978                 self.text.extend(["}" * (len(longest_inner_formater) + 1), \
   979                                       self.new_line])
   980             else:
   981                 self.text.extend(["{{{"+bang_args, self.new_line])
   982                 self.text.extend(content_buffer)
   983                 self.text.extend(["}}}", self.new_line])
   984 
    # maps the values of the html align / valign attributes to the alignment
    # characters _cell_style() puts into the cell attributes
    _alignment = {"left": "(",
                  "center": ":",
                  "right": ")",
                  "top": "^",
                  "bottom": "v"}
   990 
   991     def _check_length(self, value):
   992         try:
   993             int(value)
   994             return value + 'px'
   995         except ValueError:
   996             return value
   997 
   998     def _get_color(self, node, prefix):
   999         if node.hasAttribute("bgcolor"):
  1000             value = node.getAttribute("bgcolor")
  1001             match = re.match(r"rgb\((\d+),\s*(\d+),\s*(\d+)\)", value)
  1002             if match:
  1003                 value = '#%X%X%X' % (int(match.group(1)), int(match.group(2)), int(match.group(3)))
  1004             else:
  1005                 match = re.match(r"#[0-9A-Fa-f]{6}", value)
  1006             if not prefix and match:
  1007                 result = value
  1008             else:
  1009                 result = '%sbgcolor="%s"' % (prefix, value)
  1010         else:
  1011             result = ''
  1012         return result
  1013 
  1014     def _table_style(self, node):
  1015         # TODO: attrs = get_attrs(node)
  1016         result = []
  1017         result.append(self._get_color(node, 'table'))
  1018         if node.hasAttribute("width"):
  1019             value = node.getAttribute("width")
  1020             result.append('tablewidth="%s"' % self._check_length(value))
  1021         if node.hasAttribute("height"):
  1022             value = node.getAttribute("height")
  1023             result.append('tableheight="%s"' % self._check_length(value))
  1024         if node.hasAttribute("align"):
  1025             value = node.getAttribute("align")
  1026             result.append('tablealign="%s"' % value)
  1027         if node.hasAttribute("style"):
  1028             result.append('tablestyle="%s"' % node.getAttribute("style"))
  1029         if node.hasAttribute("class"):
  1030             result.append('tableclass="%s"' % node.getAttribute("class"))
  1031         return " ".join(result).strip()
  1032 
  1033     def _row_style(self, node):
  1034         # TODO: attrs = get_attrs(node)
  1035         result = []
  1036         result.append(self._get_color(node, 'row'))
  1037         if node.hasAttribute("style"):
  1038             result.append('rowstyle="%s"' % node.getAttribute("style"))
  1039         if node.hasAttribute("class"):
  1040             result.append('rowclass="%s"' % node.getAttribute("class"))
  1041         return " ".join(result).strip()
  1042 
  1043     def _cell_style(self, node):
  1044         # TODO: attrs = get_attrs(node)
  1045         if node.hasAttribute("rowspan"):
  1046             rowspan = ("|%s" % node.getAttribute("rowspan"))
  1047         else:
  1048             rowspan = ""
  1049 
  1050         if node.hasAttribute("colspan"):
  1051             colspan = int(node.getAttribute("colspan"))
  1052         else:
  1053             colspan = 1
  1054 
  1055         spanning = rowspan or colspan > 1
  1056 
  1057         align = ""
  1058         result = []
  1059         result.append(self._get_color(node, ''))
  1060         if node.hasAttribute("align"):
  1061             value = node.getAttribute("align")
  1062             if not spanning or value != "center":
  1063                 # ignore "center" in spanning cells
  1064                 align += self._alignment.get(value, "")
  1065         if node.hasAttribute("valign"):
  1066             value = node.getAttribute("valign")
  1067             if not spanning or value != "center":
  1068                 # ignore "center" in spanning cells
  1069                 align += self._alignment.get(value, "")
  1070         if node.hasAttribute("width"):
  1071             value = node.getAttribute("width")
  1072             if value[-1] == "%":
  1073                 align += value
  1074             else:
  1075                 result.append('width="%s"' % self._check_length(value))
  1076         if node.hasAttribute("height"):
  1077             value = node.getAttribute("height")
  1078             result.append('height="%s"' % self._check_length(value))
  1079         if node.hasAttribute("class"):
  1080             result.append('class="%s"' % node.getAttribute("class"))
  1081         if node.hasAttribute("id"):
  1082             result.append('id="%s"' % node.getAttribute("id"))
  1083         if node.hasAttribute("style"):
  1084             result.append('style="%s"' % node.getAttribute("style"))
  1085 
  1086         if align:
  1087             result.insert(0, "%s" % align)
  1088         result.append(rowspan)
  1089         return " ".join(result).strip()
  1090 
  1091     def process_table(self, node, style=""):
  1092         if self.depth == 0:
  1093             self.text.append(self.new_line)
  1094         self.new_table = True
  1095         style += self._table_style(node)
  1096         for i in node.childNodes:
  1097             if i.nodeType == Node.ELEMENT_NODE:
  1098                 name = i.localName
  1099                 if name == 'tr':
  1100                     self.process_table_record(i, style)
  1101                     style = ""
  1102                 elif name in ('thead', 'tbody', 'tfoot'):
  1103                     self.process_table(i, style)
  1104                 elif name == 'caption':
  1105                     self.process_caption(node, i, style)
  1106                     style = ''
  1107                 elif name in ('col', 'colgroup', 'strong', ):
  1108                     pass # we don't support these, but we just ignore them
  1109                 else:
  1110                     raise ConvertError("process_table: Don't support %s element" % name)
  1111             #else:
  1112             #    raise ConvertError("Unexpected node: %r" % i)
  1113         self.text.append(self.new_line_dont_remove)
  1114 
  1115     def process_caption(self, table, node, style=""):
  1116         # get first row
  1117         for i in table.childNodes:
  1118             if i.localName in ('thead', 'tbody', 'tfoot'): # XXX is this correct?
  1119             #if i.localName == 'tbody': (old version)
  1120                 for i in i.childNodes:
  1121                     if i.localName == 'tr':
  1122                         break
  1123                 break
  1124             elif i.localName == 'tr':
  1125                 break
  1126         # count columns
  1127         if i.localName == 'tr':
  1128             colspan = 0
  1129             for td in i.childNodes:
  1130                 if not td.nodeType == Node.ELEMENT_NODE:
  1131                     continue
  1132                 span = td.getAttribute('colspan')
  1133                 try:
  1134                     colspan += int(span)
  1135                 except ValueError:
  1136                     colspan += 1
  1137         else:
  1138             colspan = 1
  1139         text = self.node_list_text_only(node.childNodes).replace('\n', ' ').strip()
  1140         if text:
  1141             if style:
  1142                 style = '<%s>' % style
  1143             self.text.extend(["%s%s'''%s'''||" % ('||' * colspan, style, text), self.new_line_dont_remove])
  1144 
  1145     def process_table_data(self, node, style=""):
  1146         if node.hasAttribute("colspan"):
  1147             colspan = int(node.getAttribute("colspan"))
  1148         else:
  1149             colspan = 1
  1150         self.text.append("||" * colspan)
  1151 
  1152         style += self._cell_style(node)
  1153         if style:
  1154             self.text.append("<%s>" % style)
  1155 
  1156         found = False
  1157         for i in node.childNodes:
  1158             name = i.localName
  1159             if name == 'p':
  1160                 self.process_paragraph_item(i)
  1161                 self.text.append(self.white_space)
  1162                 found = True
  1163         if not found:
  1164             for i in node.childNodes:
  1165                 name = i.localName
  1166                 if i.nodeType == Node.ELEMENT_NODE:
  1167                     if name == 'br':
  1168                         # if we get a br for a cell from e.g. cut and paste from OOo
  1169                         # or if someone simulates a list by enter in a cell
  1170                         # it should be appended as macro BR.
  1171                         self.text.append('<<BR>>')
  1172                         found = True
  1173                         continue
  1174                     else:
  1175                         self.process_inline(i)
  1176                         found = True
  1177                 elif i.nodeType == Node.TEXT_NODE:
  1178                     data = i.data.strip('\n').replace('\n', ' ')
  1179                     if data:
  1180                         found = True
  1181                         self.text.append(data)
  1182         if not found:
  1183             self.text.append(" ")
  1184 
  1185     def process_table_record(self, node, style=""):
  1186         if not self.new_table:
  1187             self.text.append(" " * self.depth)
  1188         else:
  1189             self.new_table = False
  1190         style += self._row_style(node)
  1191         for i in node.childNodes:
  1192             if i.nodeType == Node.ELEMENT_NODE:
  1193                 name = i.localName
  1194                 if name in ('td', 'th', ):
  1195                     self.process_table_data(i, style=style)
  1196                     style = ""
  1197                 else:
  1198                     raise ConvertError("process_table_record: Don't support %s element" % name)
  1199         self.text.extend(["||", self.new_line_dont_remove])
  1200 
  1201     def process_a(self, node):
  1202         attrs = get_attrs(node)
  1203 
  1204         title = attrs.pop('title', '')
  1205         href = attrs.pop('href', None)
  1206         css_class = attrs.get('class')
  1207 
  1208         scriptname = self.request.script_root
  1209         if scriptname == "":
  1210             scriptname = "/"
  1211 
  1212         # can either be a link (with href) or an anchor (with e.g. id)
  1213         # we don't need to support anchors here as we currently handle them as <<Anchor(id)>> macro
  1214         if href:
  1215             href = wikiutil.url_unquote(href)
  1216 
  1217             interwikiname = None
  1218             desc = self.get_desc(node.childNodes)
  1219 
  1220             # interwiki link
  1221             if css_class == "interwiki":
  1222                 wikitag, wikiurl, wikitail, err = wikiutil.resolve_interwiki(
  1223                     self.request, title, "") # the title has the wiki name, page = ""
  1224                 if not err and href.startswith(wikiurl):
  1225                     pagename = wikiutil.url_unquote(href[len(wikiurl):].lstrip('/'))
  1226                     interwikiname = "%s:%s" % (wikitag, pagename)
  1227                 else:
  1228                     raise ConvertError("Invalid InterWiki link: '%s'" % href)
  1229             elif css_class == "badinterwiki" and title:
  1230                 if href == "/": # we used this as replacement for empty href
  1231                     href = ""
  1232                 pagename = wikiutil.url_unquote(href)
  1233                 interwikiname = "%s:%s" % (title, pagename)
  1234             if interwikiname and pagename == desc:
  1235                 if interwiki_re.match(interwikiname+' '): # the blank is needed by interwiki_re to match
  1236                     # this is valid as a free interwiki link
  1237                     self.text.append("%s" % interwikiname)
  1238                 else:
  1239                     self.text.append("[[%s]]" % interwikiname)
  1240                 return
  1241             elif title == 'Self':
  1242                 self.text.append('[[%s|%s]]' % (href, desc))
  1243                 return
  1244             elif interwikiname:
  1245                 self.text.append("[[%s|%s]]" % (interwikiname, desc))
  1246                 return
  1247 
  1248             # fix links generated by a broken copy & paste of gecko based browsers
  1249             brokenness = '../../../..'
  1250             if href.startswith(brokenness):
  1251                 href = href[len(brokenness):] # just strip it away!
  1252             # TODO: IE pastes complete http://server/Page/SubPage as href and as text, too
  1253 
  1254             # Attachments
  1255             if title.startswith("attachment:"):
  1256                 attname = wikiutil.url_unquote(title[len("attachment:"):])
  1257                 if 'do=get' in href: # quick&dirty fix for not dropping &do=get param
  1258                     parms = '|&do=get'
  1259                 else:
  1260                     parms = ''
  1261                 if attname != desc:
  1262                     desc = '|%s' % desc
  1263                 elif parms:
  1264                     desc = '|'
  1265                 else:
  1266                     desc = ''
  1267                 self.text.append('[[attachment:%s%s%s]]' % (attname, desc, parms))
  1268             # wiki link
  1269             elif href.startswith(scriptname):
  1270                 pagename = href[len(scriptname):]
  1271                 pagename = pagename.lstrip('/')    # XXX temp fix for generated pagenames starting with /
  1272                 if desc == pagename:
  1273                     self.text.append(wikiutil.pagelinkmarkup(pagename))
  1274                 # relative link /SubPage
  1275                 elif desc.startswith('/') and href.endswith(desc):
  1276                     if pagename.startswith(self.pagename): # is this a subpage of us?
  1277                         self.text.append(wikiutil.pagelinkmarkup(pagename[len(self.pagename):]))
  1278                     else:
  1279                         self.text.append(wikiutil.pagelinkmarkup(pagename))
  1280                 # relative link ../
  1281                 elif desc.startswith('../') and href.endswith(desc[3:]):
  1282                     self.text.append(wikiutil.pagelinkmarkup(desc))
  1283                 # internal link #internal
  1284                 elif '#' in href and pagename.startswith(self.pagename):
  1285                     self.text.append(wikiutil.pagelinkmarkup(href[href.index('#'):], desc))
  1286                 # labeled link
  1287                 else:
  1288                     self.text.append(wikiutil.pagelinkmarkup(pagename, desc))
  1289             # mailto link
  1290             elif href.startswith("mailto:"):
  1291                 if href == desc or href[len("mailto:"):] == desc:
  1292                     self.text.extend([self.white_space, desc, self.white_space])
  1293                 else:
  1294                     self.text.append("[[%s|%s]]" % (href, desc)) # XXX use a (renamed) pagelinkmarkup
  1295             # link
  1296             else:
  1297                 if href == desc:
  1298                     href = href.replace(" ", "%20")
  1299                     self.text.append(href)
  1300                 else:
  1301                     href = href.replace(" ", "%20")
  1302                     if desc:
  1303                         desc = '|' + desc
  1304                     self.text.append("[[%s%s]]" % (href, desc))
  1305 
  1306     def process_img(self, node):
  1307         markup = self._process_img(node)
  1308         self.text.extend([self.white_space, markup, self.white_space])
  1309 
  1310     def _process_img(self, node):
  1311         attrs = get_attrs(node)
  1312 
  1313         title = attrs.pop('title', '')
  1314         if title.startswith("smiley:"):
  1315             markup = title[len("smiley:"):]
  1316             return markup
  1317 
  1318         alt = attrs.pop('alt', None)
  1319         src = attrs.pop('src', None)
  1320         css_class = attrs.get('class')
  1321 
  1322         target = src
  1323         if title.startswith("attachment:"):
  1324             target = wikiutil.url_unquote(title)
  1325             if alt == title[len("attachment:"):]:
  1326                 # kill auto-generated alt
  1327                 alt = None
  1328         elif title.startswith("drawing:"):
  1329             target = wikiutil.url_unquote(title)
  1330             if alt == title[len("drawing:"):]:
  1331                 # kill auto-generated alt
  1332                 alt = None
  1333         else:
  1334             if css_class == 'external_image':
  1335                 # kill auto-generated alt and class
  1336                 if src == alt:
  1337                     alt = None
  1338                 del attrs['class']
  1339 
  1340         if alt:
  1341             desc = '|' + alt
  1342         else:
  1343             desc = ''
  1344 
  1345         params = ','.join(['%s="%s"' % (k, v) for k, v in attrs.items()])
  1346                            # if k in ('width', 'height', )])
  1347         if params:
  1348             params = '|' + params
  1349             if not desc:
  1350                 desc = '|'
  1351 
  1352         markup = "{{%s%s%s}}" % (target, desc, params)
  1353         return markup
  1354 
  1355     def process_object(self, node):
  1356         markup = self._process_object(node)
  1357         self.text.append(markup)
  1358 
  1359     def _process_object(self, node):
  1360         attrs = get_attrs(node)
  1361         markup = ''
  1362         data = attrs.pop('data', None)
  1363         if data:
  1364             data = wikiutil.url_unquote(data)
  1365 
  1366             desc = self.get_desc(node.childNodes)
  1367             if desc:
  1368                 desc = '|' + desc
  1369 
  1370             params = ','.join(['%s="%s"' % (k, v) for k, v in attrs.items()])
  1371                                # if k in ('width', 'height', )])
  1372             if params:
  1373                 params = '|' + params
  1374                 if not desc:
  1375                     desc = '|'
  1376             markup = "{{%s%s%s}}" % (data, desc, params)
  1377         return markup
  1378         # TODO: for target PAGES, use some code from process_a to get the pagename from URL
  1379         # TODO: roundtrip attachment: correctly
  1380         # TODO: handle object's content better?
  1381 
  1382 def get_attrs(node):
  1383     """ get the attributes of <node> into an easy-to-use dict """
  1384     attrs = {}
  1385     for attr_name in node.attributes.keys():
  1386         # get attributes of style element
  1387         if attr_name == "style":
  1388             for style_element in node.attributes.get(attr_name).nodeValue.split(';'):
  1389                 if style_element.strip() != '':
  1390                     style_elements = style_element.split(':')
  1391                     if len(style_elements) == 2:
  1392                         attrs[style_elements[0].strip()] = style_elements[1].strip()
  1393         # get attributes without style element
  1394         else:
  1395             attrs[attr_name] = node.attributes.get(attr_name).nodeValue
  1396     return attrs
  1397 
  1398 
  1399 def parse(request, text):
  1400     text = u'<?xml version="1.0"?>%s%s' % (dtd, text)
  1401     text = text.encode(config.charset)
  1402     try:
  1403         return xml.dom.minidom.parseString(text)
  1404     except xml.parsers.expat.ExpatError, msg:
  1405         # this sometimes crashes when it should not, so save the stuff to analyze it:
  1406         logname = os.path.join(request.cfg.data_dir, "expaterror.log")
  1407         f = file(logname, "w")
  1408         f.write(text)
  1409         f.write("\n" + "-"*80 + "\n" + str(msg))
  1410         f.close()
  1411         raise ConvertError('ExpatError: %s (see dump in %s)' % (msg, logname))
  1412 
  1413 def convert(request, pagename, text):
  1414     # Due to expat needing explicitly set namespaces, we set these here to allow pasting
  1415     # from Word / Excel without issues.
  1416     # If you encounter 'ExpatError: unbound prefix', try adding the namespace to the list.
  1417     namespace = [u'xmlns:o="urn:schemas-microsoft-com:office:office"',
  1418                  u'xmlns:x="urn:schemas-microsoft-com:office:excel"',
  1419                  u'xmlns:ss="urn:schemas-microsoft-com:office:spreadsheet"',
  1420                  u'xmlns:c="urn:schemas-microsoft-com:office:component:spreadsheet"',
  1421                  u'xmlns:s="uuid:BDC6E3F0-6DA3-11d1-A2A3-00AA00C14882"',
  1422                  u'xmlns:dt="uuid:C2F41010-65B3-11d1-A29F-00AA00C14882"',
  1423                  u'xmlns:rs="urn:schemas-microsoft-com:rowset"',
  1424                  u'xmlns:z="#RowsetSchema"',
  1425                  u'xmlns:x2="http://schemas.microsoft.com/office/excel/2003/xml"',
  1426                  u'xmlns:sl="http://schemas.microsoft.com/schemaLibrary/2003/core"',
  1427                  u'xmlns:aml="http://schemas.microsoft.com/aml/2001/core"',
  1428                  u'xmlns:w="http://schemas.microsoft.com/office/word/2003/wordml"',
  1429                  u'xmlns:wx="http://schemas.microsoft.com/office/word/2003/auxHint"',
  1430                  u'xmlns:w10="urn:schemas-microsoft-com:office:word"',
  1431                  u'xmlns:v="urn:schemas-microsoft-com:office:vml"']
  1432     text = u'<page %s>%s</page>' % (' '.join(namespace), text)
  1433     tree = parse(request, text)
  1434     strip_whitespace().do(tree)
  1435     text = convert_tree(request, pagename).do(tree)
  1436     text = '\n'.join([s.rstrip() for s in text.splitlines()] + ['']) # remove trailing blanks
  1437     return text
  1438