MoinMoin/converter/text_html_text_moin_wiki.py
author Roger Haase <crosseyedpenquin@yahoo.com>
Fri, 23 Jul 2010 09:23:59 -0700
changeset 5712 7a83cc907f68
parent 5537 8ee65df9cbe9
permissions -rw-r--r--
simplify auto scroll initialization; fix bug in IE init discovered when using IE7 on pages with wide tables
     1 """
     2     MoinMoin - convert from html to wiki markup
     3 
     4     @copyright: 2005-2006 Bastian Blank, Florian Festi, Reimar Bauer,
     5                 2005-2007 MoinMoin:ThomasWaldmann
     6     @license: GNU GPL, see COPYING for details.
     7 """
     8 
     9 import re, os
    10 import xml.dom.minidom # HINT: the nodes in parse result tree need .has_key(), "x in ..." does not work
    11 import urlparse
    12 from xml.dom import Node
    13 
    14 from MoinMoin import config, wikiutil
    15 from MoinMoin.error import ConvertError
    16 from werkzeug import url_decode
    17 from MoinMoin.parser.text_moin_wiki import Parser as WikiParser
    18 interwiki_re = re.compile(WikiParser.interwiki_rule, re.VERBOSE|re.UNICODE)
    19 
    20 
    21 # Portions (C) International Organization for Standardization 1986
    22 # Permission to copy in any form is granted for use with
    23 # conforming SGML systems and applications as defined in
    24 # ISO 8879, provided this notice is included in all copies.
    25 dtd = ur'''
    26 <!DOCTYPE html [
    27 <!ENTITY nbsp   "&#32;">  <!-- no-break space = non-breaking space, U+00A0, convert to U+0020 -->
    28 <!ENTITY iexcl  "&#161;"> <!-- inverted exclamation mark, U+00A1 ISOnum -->
    29 <!ENTITY cent   "&#162;"> <!-- cent sign, U+00A2 ISOnum -->
    30 <!ENTITY pound  "&#163;"> <!-- pound sign, U+00A3 ISOnum -->
    31 <!ENTITY curren "&#164;"> <!-- currency sign, U+00A4 ISOnum -->
    32 <!ENTITY yen    "&#165;"> <!-- yen sign = yuan sign, U+00A5 ISOnum -->
    33 <!ENTITY brvbar "&#166;"> <!-- broken bar = broken vertical bar, U+00A6 ISOnum -->
    34 <!ENTITY sect   "&#167;"> <!-- section sign, U+00A7 ISOnum -->
    35 <!ENTITY uml    "&#168;"> <!-- diaeresis = spacing diaeresis, U+00A8 ISOdia -->
    36 <!ENTITY copy   "&#169;"> <!-- copyright sign, U+00A9 ISOnum -->
    37 <!ENTITY ordf   "&#170;"> <!-- feminine ordinal indicator, U+00AA ISOnum -->
    38 <!ENTITY laquo  "&#171;"> <!-- left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum -->
    39 <!ENTITY not    "&#172;"> <!-- not sign = angled dash, U+00AC ISOnum -->
    40 <!ENTITY shy    "&#173;"> <!-- soft hyphen = discretionary hyphen, U+00AD ISOnum -->
    41 <!ENTITY reg    "&#174;"> <!-- registered sign = registered trade mark sign, U+00AE ISOnum -->
    42 <!ENTITY macr   "&#175;"> <!-- macron = spacing macron = overline = APL overbar, U+00AF ISOdia -->
    43 <!ENTITY deg    "&#176;"> <!-- degree sign, U+00B0 ISOnum -->
    44 <!ENTITY plusmn "&#177;"> <!-- plus-minus sign = plus-or-minus sign, U+00B1 ISOnum -->
    45 <!ENTITY sup2   "&#178;"> <!-- superscript two = superscript digit two = squared, U+00B2 ISOnum -->
    46 <!ENTITY sup3   "&#179;"> <!-- superscript three = superscript digit three = cubed, U+00B3 ISOnum -->
    47 <!ENTITY acute  "&#180;"> <!-- acute accent = spacing acute, U+00B4 ISOdia -->
    48 <!ENTITY micro  "&#181;"> <!-- micro sign, U+00B5 ISOnum -->
    49 <!ENTITY para   "&#182;"> <!-- pilcrow sign = paragraph sign, U+00B6 ISOnum -->
    50 <!ENTITY middot "&#183;"> <!-- middle dot = Georgian comma = Greek middle dot, U+00B7 ISOnum -->
    51 <!ENTITY cedil  "&#184;"> <!-- cedilla = spacing cedilla, U+00B8 ISOdia -->
    52 <!ENTITY sup1   "&#185;"> <!-- superscript one = superscript digit one, U+00B9 ISOnum -->
    53 <!ENTITY ordm   "&#186;"> <!-- masculine ordinal indicator, U+00BA ISOnum -->
    54 <!ENTITY raquo  "&#187;"> <!-- right-pointing double angle quotation mark = right pointing guillemet, U+00BB ISOnum -->
    55 <!ENTITY frac14 "&#188;"> <!-- vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum -->
    56 <!ENTITY frac12 "&#189;"> <!-- vulgar fraction one half = fraction one half, U+00BD ISOnum -->
    57 <!ENTITY frac34 "&#190;"> <!-- vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum -->
    58 <!ENTITY iquest "&#191;"> <!-- inverted question mark = turned question mark, U+00BF ISOnum -->
    59 <!ENTITY Agrave "&#192;"> <!-- latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1 -->
    60 <!ENTITY Aacute "&#193;"> <!-- latin capital letter A with acute, U+00C1 ISOlat1 -->
    61 <!ENTITY Acirc  "&#194;"> <!-- latin capital letter A with circumflex, U+00C2 ISOlat1 -->
    62 <!ENTITY Atilde "&#195;"> <!-- latin capital letter A with tilde, U+00C3 ISOlat1 -->
    63 <!ENTITY Auml   "&#196;"> <!-- latin capital letter A with diaeresis, U+00C4 ISOlat1 -->
    64 <!ENTITY Aring  "&#197;"> <!-- latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1 -->
    65 <!ENTITY AElig  "&#198;"> <!-- latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1 -->
    66 <!ENTITY Ccedil "&#199;"> <!-- latin capital letter C with cedilla, U+00C7 ISOlat1 -->
    67 <!ENTITY Egrave "&#200;"> <!-- latin capital letter E with grave, U+00C8 ISOlat1 -->
    68 <!ENTITY Eacute "&#201;"> <!-- latin capital letter E with acute, U+00C9 ISOlat1 -->
    69 <!ENTITY Ecirc  "&#202;"> <!-- latin capital letter E with circumflex, U+00CA ISOlat1 -->
    70 <!ENTITY Euml   "&#203;"> <!-- latin capital letter E with diaeresis, U+00CB ISOlat1 -->
    71 <!ENTITY Igrave "&#204;"> <!-- latin capital letter I with grave, U+00CC ISOlat1 -->
    72 <!ENTITY Iacute "&#205;"> <!-- latin capital letter I with acute, U+00CD ISOlat1 -->
    73 <!ENTITY Icirc  "&#206;"> <!-- latin capital letter I with circumflex, U+00CE ISOlat1 -->
    74 <!ENTITY Iuml   "&#207;"> <!-- latin capital letter I with diaeresis, U+00CF ISOlat1 -->
    75 <!ENTITY ETH    "&#208;"> <!-- latin capital letter ETH, U+00D0 ISOlat1 -->
    76 <!ENTITY Ntilde "&#209;"> <!-- latin capital letter N with tilde, U+00D1 ISOlat1 -->
    77 <!ENTITY Ograve "&#210;"> <!-- latin capital letter O with grave, U+00D2 ISOlat1 -->
    78 <!ENTITY Oacute "&#211;"> <!-- latin capital letter O with acute, U+00D3 ISOlat1 -->
    79 <!ENTITY Ocirc  "&#212;"> <!-- latin capital letter O with circumflex, U+00D4 ISOlat1 -->
    80 <!ENTITY Otilde "&#213;"> <!-- latin capital letter O with tilde, U+00D5 ISOlat1 -->
    81 <!ENTITY Ouml   "&#214;"> <!-- latin capital letter O with diaeresis, U+00D6 ISOlat1 -->
    82 <!ENTITY times  "&#215;"> <!-- multiplication sign, U+00D7 ISOnum -->
    83 <!ENTITY Oslash "&#216;"> <!-- latin capital letter O with stroke = latin capital letter O slash, U+00D8 ISOlat1 -->
    84 <!ENTITY Ugrave "&#217;"> <!-- latin capital letter U with grave, U+00D9 ISOlat1 -->
    85 <!ENTITY Uacute "&#218;"> <!-- latin capital letter U with acute, U+00DA ISOlat1 -->
    86 <!ENTITY Ucirc  "&#219;"> <!-- latin capital letter U with circumflex, U+00DB ISOlat1 -->
    87 <!ENTITY Uuml   "&#220;"> <!-- latin capital letter U with diaeresis, U+00DC ISOlat1 -->
    88 <!ENTITY Yacute "&#221;"> <!-- latin capital letter Y with acute, U+00DD ISOlat1 -->
    89 <!ENTITY THORN  "&#222;"> <!-- latin capital letter THORN, U+00DE ISOlat1 -->
    90 <!ENTITY szlig  "&#223;"> <!-- latin small letter sharp s = ess-zed, U+00DF ISOlat1 -->
    91 <!ENTITY agrave "&#224;"> <!-- latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1 -->
    92 <!ENTITY aacute "&#225;"> <!-- latin small letter a with acute, U+00E1 ISOlat1 -->
    93 <!ENTITY acirc  "&#226;"> <!-- latin small letter a with circumflex, U+00E2 ISOlat1 -->
    94 <!ENTITY atilde "&#227;"> <!-- latin small letter a with tilde, U+00E3 ISOlat1 -->
    95 <!ENTITY auml   "&#228;"> <!-- latin small letter a with diaeresis, U+00E4 ISOlat1 -->
    96 <!ENTITY aring  "&#229;"> <!-- latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1 -->
    97 <!ENTITY aelig  "&#230;"> <!-- latin small letter ae = latin small ligature ae, U+00E6 ISOlat1 -->
    98 <!ENTITY ccedil "&#231;"> <!-- latin small letter c with cedilla, U+00E7 ISOlat1 -->
    99 <!ENTITY egrave "&#232;"> <!-- latin small letter e with grave, U+00E8 ISOlat1 -->
   100 <!ENTITY eacute "&#233;"> <!-- latin small letter e with acute, U+00E9 ISOlat1 -->
   101 <!ENTITY ecirc  "&#234;"> <!-- latin small letter e with circumflex, U+00EA ISOlat1 -->
   102 <!ENTITY euml   "&#235;"> <!-- latin small letter e with diaeresis, U+00EB ISOlat1 -->
   103 <!ENTITY igrave "&#236;"> <!-- latin small letter i with grave, U+00EC ISOlat1 -->
   104 <!ENTITY iacute "&#237;"> <!-- latin small letter i with acute, U+00ED ISOlat1 -->
   105 <!ENTITY icirc  "&#238;"> <!-- latin small letter i with circumflex, U+00EE ISOlat1 -->
   106 <!ENTITY iuml   "&#239;"> <!-- latin small letter i with diaeresis, U+00EF ISOlat1 -->
   107 <!ENTITY eth    "&#240;"> <!-- latin small letter eth, U+00F0 ISOlat1 -->
   108 <!ENTITY ntilde "&#241;"> <!-- latin small letter n with tilde, U+00F1 ISOlat1 -->
   109 <!ENTITY ograve "&#242;"> <!-- latin small letter o with grave, U+00F2 ISOlat1 -->
   110 <!ENTITY oacute "&#243;"> <!-- latin small letter o with acute, U+00F3 ISOlat1 -->
   111 <!ENTITY ocirc  "&#244;"> <!-- latin small letter o with circumflex, U+00F4 ISOlat1 -->
   112 <!ENTITY otilde "&#245;"> <!-- latin small letter o with tilde, U+00F5 ISOlat1 -->
   113 <!ENTITY ouml   "&#246;"> <!-- latin small letter o with diaeresis, U+00F6 ISOlat1 -->
   114 <!ENTITY divide "&#247;"> <!-- division sign, U+00F7 ISOnum -->
   115 <!ENTITY oslash "&#248;"> <!-- latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1 -->
   116 <!ENTITY ugrave "&#249;"> <!-- latin small letter u with grave, U+00F9 ISOlat1 -->
   117 <!ENTITY uacute "&#250;"> <!-- latin small letter u with acute, U+00FA ISOlat1 -->
   118 <!ENTITY ucirc  "&#251;"> <!-- latin small letter u with circumflex, U+00FB ISOlat1 -->
   119 <!ENTITY uuml   "&#252;"> <!-- latin small letter u with diaeresis, U+00FC ISOlat1 -->
   120 <!ENTITY yacute "&#253;"> <!-- latin small letter y with acute, U+00FD ISOlat1 -->
   121 <!ENTITY thorn  "&#254;"> <!-- latin small letter thorn, U+00FE ISOlat1 -->
   122 <!ENTITY yuml   "&#255;"> <!-- latin small letter y with diaeresis, U+00FF ISOlat1 -->
   123 
   124 <!-- Latin Extended-B -->
   125 <!ENTITY fnof     "&#402;"> <!-- latin small f with hook = function                                    = florin, U+0192 ISOtech -->
   126 
   127 <!-- Greek -->
   128 <!ENTITY Alpha    "&#913;"> <!-- greek capital letter alpha, U+0391 -->
   129 <!ENTITY Beta     "&#914;"> <!-- greek capital letter beta, U+0392 -->
   130 <!ENTITY Gamma    "&#915;"> <!-- greek capital letter gamma,
   131                                     U+0393 ISOgrk3 -->
   132 <!ENTITY Delta    "&#916;"> <!-- greek capital letter delta,
   133                                     U+0394 ISOgrk3 -->
   134 <!ENTITY Epsilon  "&#917;"> <!-- greek capital letter epsilon, U+0395 -->
   135 <!ENTITY Zeta     "&#918;"> <!-- greek capital letter zeta, U+0396 -->
   136 <!ENTITY Eta      "&#919;"> <!-- greek capital letter eta, U+0397 -->
   137 <!ENTITY Theta    "&#920;"> <!-- greek capital letter theta,
   138                                     U+0398 ISOgrk3 -->
   139 <!ENTITY Iota     "&#921;"> <!-- greek capital letter iota, U+0399 -->
   140 <!ENTITY Kappa    "&#922;"> <!-- greek capital letter kappa, U+039A -->
   141 <!ENTITY Lambda   "&#923;"> <!-- greek capital letter lambda,
   142                                     U+039B ISOgrk3 -->
   143 <!ENTITY Mu       "&#924;"> <!-- greek capital letter mu, U+039C -->
   144 <!ENTITY Nu       "&#925;"> <!-- greek capital letter nu, U+039D -->
   145 <!ENTITY Xi       "&#926;"> <!-- greek capital letter xi, U+039E ISOgrk3 -->
   146 <!ENTITY Omicron  "&#927;"> <!-- greek capital letter omicron, U+039F -->
   147 <!ENTITY Pi       "&#928;"> <!-- greek capital letter pi, U+03A0 ISOgrk3 -->
   148 <!ENTITY Rho      "&#929;"> <!-- greek capital letter rho, U+03A1 -->
   149 <!-- there is no Sigmaf, and no U+03A2 character either -->
   150 <!ENTITY Sigma    "&#931;"> <!-- greek capital letter sigma,
   151                                     U+03A3 ISOgrk3 -->
   152 <!ENTITY Tau      "&#932;"> <!-- greek capital letter tau, U+03A4 -->
   153 <!ENTITY Upsilon  "&#933;"> <!-- greek capital letter upsilon,
   154                                     U+03A5 ISOgrk3 -->
   155 <!ENTITY Phi      "&#934;"> <!-- greek capital letter phi,
   156                                     U+03A6 ISOgrk3 -->
   157 <!ENTITY Chi      "&#935;"> <!-- greek capital letter chi, U+03A7 -->
   158 <!ENTITY Psi      "&#936;"> <!-- greek capital letter psi,
   159                                     U+03A8 ISOgrk3 -->
   160 <!ENTITY Omega    "&#937;"> <!-- greek capital letter omega,
   161                                     U+03A9 ISOgrk3 -->
   162 
   163 <!ENTITY alpha    "&#945;"> <!-- greek small letter alpha,
   164                                     U+03B1 ISOgrk3 -->
   165 <!ENTITY beta     "&#946;"> <!-- greek small letter beta, U+03B2 ISOgrk3 -->
   166 <!ENTITY gamma    "&#947;"> <!-- greek small letter gamma,
   167                                     U+03B3 ISOgrk3 -->
   168 <!ENTITY delta    "&#948;"> <!-- greek small letter delta,
   169                                     U+03B4 ISOgrk3 -->
   170 <!ENTITY epsilon  "&#949;"> <!-- greek small letter epsilon,
   171                                     U+03B5 ISOgrk3 -->
   172 <!ENTITY zeta     "&#950;"> <!-- greek small letter zeta, U+03B6 ISOgrk3 -->
   173 <!ENTITY eta      "&#951;"> <!-- greek small letter eta, U+03B7 ISOgrk3 -->
   174 <!ENTITY theta    "&#952;"> <!-- greek small letter theta,
   175                                     U+03B8 ISOgrk3 -->
   176 <!ENTITY iota     "&#953;"> <!-- greek small letter iota, U+03B9 ISOgrk3 -->
   177 <!ENTITY kappa    "&#954;"> <!-- greek small letter kappa,
   178                                     U+03BA ISOgrk3 -->
   179 <!ENTITY lambda   "&#955;"> <!-- greek small letter lambda,
   180                                     U+03BB ISOgrk3 -->
   181 <!ENTITY mu       "&#956;"> <!-- greek small letter mu, U+03BC ISOgrk3 -->
   182 <!ENTITY nu       "&#957;"> <!-- greek small letter nu, U+03BD ISOgrk3 -->
   183 <!ENTITY xi       "&#958;"> <!-- greek small letter xi, U+03BE ISOgrk3 -->
   184 <!ENTITY omicron  "&#959;"> <!-- greek small letter omicron, U+03BF NEW -->
   185 <!ENTITY pi       "&#960;"> <!-- greek small letter pi, U+03C0 ISOgrk3 -->
   186 <!ENTITY rho      "&#961;"> <!-- greek small letter rho, U+03C1 ISOgrk3 -->
   187 <!ENTITY sigmaf   "&#962;"> <!-- greek small letter final sigma,
   188                                     U+03C2 ISOgrk3 -->
   189 <!ENTITY sigma    "&#963;"> <!-- greek small letter sigma,
   190                                     U+03C3 ISOgrk3 -->
   191 <!ENTITY tau      "&#964;"> <!-- greek small letter tau, U+03C4 ISOgrk3 -->
   192 <!ENTITY upsilon  "&#965;"> <!-- greek small letter upsilon,
   193                                     U+03C5 ISOgrk3 -->
   194 <!ENTITY phi      "&#966;"> <!-- greek small letter phi, U+03C6 ISOgrk3 -->
   195 <!ENTITY chi      "&#967;"> <!-- greek small letter chi, U+03C7 ISOgrk3 -->
   196 <!ENTITY psi      "&#968;"> <!-- greek small letter psi, U+03C8 ISOgrk3 -->
   197 <!ENTITY omega    "&#969;"> <!-- greek small letter omega,
   198                                     U+03C9 ISOgrk3 -->
   199 <!ENTITY thetasym "&#977;"> <!-- greek small letter theta symbol,
   200                                     U+03D1 NEW -->
   201 <!ENTITY upsih    "&#978;"> <!-- greek upsilon with hook symbol,
   202                                     U+03D2 NEW -->
   203 <!ENTITY piv      "&#982;"> <!-- greek pi symbol, U+03D6 ISOgrk3 -->
   204 
   205 <!-- General Punctuation -->
   206 <!ENTITY bull     "&#8226;"> <!-- bullet = black small circle,
   207                                      U+2022 ISOpub  -->
   208 <!-- bullet is NOT the same as bullet operator, U+2219 -->
   209 <!ENTITY hellip   "&#8230;"> <!-- horizontal ellipsis = three dot leader,
   210                                      U+2026 ISOpub  -->
   211 <!ENTITY prime    "&#8242;"> <!-- prime = minutes = feet, U+2032 ISOtech -->
   212 <!ENTITY Prime    "&#8243;"> <!-- double prime = seconds = inches,
   213                                      U+2033 ISOtech -->
   214 <!ENTITY oline    "&#8254;"> <!-- overline = spacing overscore,
   215                                      U+203E NEW -->
   216 <!ENTITY frasl    "&#8260;"> <!-- fraction slash, U+2044 NEW -->
   217 
   218 <!-- Letterlike Symbols -->
   219 <!ENTITY weierp   "&#8472;"> <!-- script capital P = power set
   220                                      = Weierstrass p, U+2118 ISOamso -->
   221 <!ENTITY image    "&#8465;"> <!-- blackletter capital I = imaginary part,
   222                                      U+2111 ISOamso -->
   223 <!ENTITY real     "&#8476;"> <!-- blackletter capital R = real part symbol,
   224                                      U+211C ISOamso -->
   225 <!ENTITY trade    "&#8482;"> <!-- trade mark sign, U+2122 ISOnum -->
   226 <!ENTITY alefsym  "&#8501;"> <!-- alef symbol = first transfinite cardinal,
   227                                      U+2135 NEW -->
   228 <!-- alef symbol is NOT the same as hebrew letter alef,
   229      U+05D0 although the same glyph could be used to depict both characters -->
   230 
   231 <!-- Arrows -->
   232 <!ENTITY larr     "&#8592;"> <!-- leftwards arrow, U+2190 ISOnum -->
   233 <!ENTITY uarr     "&#8593;"> <!-- upwards arrow, U+2191 ISOnum-->
   234 <!ENTITY rarr     "&#8594;"> <!-- rightwards arrow, U+2192 ISOnum -->
   235 <!ENTITY darr     "&#8595;"> <!-- downwards arrow, U+2193 ISOnum -->
   236 <!ENTITY harr     "&#8596;"> <!-- left right arrow, U+2194 ISOamsa -->
   237 <!ENTITY crarr    "&#8629;"> <!-- downwards arrow with corner leftwards
   238                                      = carriage return, U+21B5 NEW -->
   239 <!ENTITY lArr     "&#8656;"> <!-- leftwards double arrow, U+21D0 ISOtech -->
   240 <!-- ISO 10646 does not say that lArr is the same as the 'is implied by' arrow
   241     but also does not have any other character for that function. So ? lArr can
   242     be used for 'is implied by' as ISOtech suggests -->
   243 <!ENTITY uArr     "&#8657;"> <!-- upwards double arrow, U+21D1 ISOamsa -->
   244 <!ENTITY rArr     "&#8658;"> <!-- rightwards double arrow,
   245                                      U+21D2 ISOtech -->
   246 <!-- ISO 10646 does not say this is the 'implies' character but does not have
   247      another character with this function so ?
   248      rArr can be used for 'implies' as ISOtech suggests -->
   249 <!ENTITY dArr     "&#8659;"> <!-- downwards double arrow, U+21D3 ISOamsa -->
   250 <!ENTITY hArr     "&#8660;"> <!-- left right double arrow,
   251                                      U+21D4 ISOamsa -->
   252 
   253 <!-- Mathematical Operators -->
   254 <!ENTITY forall   "&#8704;"> <!-- for all, U+2200 ISOtech -->
   255 <!ENTITY part     "&#8706;"> <!-- partial differential, U+2202 ISOtech  -->
   256 <!ENTITY exist    "&#8707;"> <!-- there exists, U+2203 ISOtech -->
   257 <!ENTITY empty    "&#8709;"> <!-- empty set = null set = diameter,
   258                                      U+2205 ISOamso -->
   259 <!ENTITY nabla    "&#8711;"> <!-- nabla = backward difference,
   260                                      U+2207 ISOtech -->
   261 <!ENTITY isin     "&#8712;"> <!-- element of, U+2208 ISOtech -->
   262 <!ENTITY notin    "&#8713;"> <!-- not an element of, U+2209 ISOtech -->
   263 <!ENTITY ni       "&#8715;"> <!-- contains as member, U+220B ISOtech -->
   264 <!-- should there be a more memorable name than 'ni'? -->
   265 <!ENTITY prod     "&#8719;"> <!-- n-ary product = product sign,
   266                                      U+220F ISOamsb -->
   267 <!-- prod is NOT the same character as U+03A0 'greek capital letter pi' though
   268      the same glyph might be used for both -->
   269 <!ENTITY sum      "&#8721;"> <!-- n-ary sumation, U+2211 ISOamsb -->
   270 <!-- sum is NOT the same character as U+03A3 'greek capital letter sigma'
   271      though the same glyph might be used for both -->
   272 <!ENTITY minus    "&#8722;"> <!-- minus sign, U+2212 ISOtech -->
   273 <!ENTITY lowast   "&#8727;"> <!-- asterisk operator, U+2217 ISOtech -->
   274 <!ENTITY radic    "&#8730;"> <!-- square root = radical sign,
   275                                      U+221A ISOtech -->
   276 <!ENTITY prop     "&#8733;"> <!-- proportional to, U+221D ISOtech -->
   277 <!ENTITY infin    "&#8734;"> <!-- infinity, U+221E ISOtech -->
   278 <!ENTITY ang      "&#8736;"> <!-- angle, U+2220 ISOamso -->
   279 <!ENTITY and      "&#8743;"> <!-- logical and = wedge, U+2227 ISOtech -->
   280 <!ENTITY or       "&#8744;"> <!-- logical or = vee, U+2228 ISOtech -->
   281 <!ENTITY cap      "&#8745;"> <!-- intersection = cap, U+2229 ISOtech -->
   282 <!ENTITY cup      "&#8746;"> <!-- union = cup, U+222A ISOtech -->
   283 <!ENTITY int      "&#8747;"> <!-- integral, U+222B ISOtech -->
   284 <!ENTITY there4   "&#8756;"> <!-- therefore, U+2234 ISOtech -->
   285 <!ENTITY sim      "&#8764;"> <!-- tilde operator = varies with = similar to,
   286                                      U+223C ISOtech -->
   287 <!-- tilde operator is NOT the same character as the tilde, U+007E,
   288      although the same glyph might be used to represent both  -->
   289 <!ENTITY cong     "&#8773;"> <!-- approximately equal to, U+2245 ISOtech -->
   290 <!ENTITY asymp    "&#8776;"> <!-- almost equal to = asymptotic to,
   291                                      U+2248 ISOamsr -->
   292 <!ENTITY ne       "&#8800;"> <!-- not equal to, U+2260 ISOtech -->
   293 <!ENTITY equiv    "&#8801;"> <!-- identical to, U+2261 ISOtech -->
   294 <!ENTITY le       "&#8804;"> <!-- less-than or equal to, U+2264 ISOtech -->
   295 <!ENTITY ge       "&#8805;"> <!-- greater-than or equal to,
   296                                      U+2265 ISOtech -->
   297 <!ENTITY sub      "&#8834;"> <!-- subset of, U+2282 ISOtech -->
   298 <!ENTITY sup      "&#8835;"> <!-- superset of, U+2283 ISOtech -->
   299 <!-- note that nsup, 'not a superset of, U+2283' is not covered by the Symbol
   300      font encoding and is not included. Should it be, for symmetry?
   301      It is in ISOamsn  -->
   302 <!ENTITY nsub     "&#8836;"> <!-- not a subset of, U+2284 ISOamsn -->
   303 <!ENTITY sube     "&#8838;"> <!-- subset of or equal to, U+2286 ISOtech -->
   304 <!ENTITY supe     "&#8839;"> <!-- superset of or equal to,
   305                                      U+2287 ISOtech -->
   306 <!ENTITY oplus    "&#8853;"> <!-- circled plus = direct sum,
   307                                      U+2295 ISOamsb -->
   308 <!ENTITY otimes   "&#8855;"> <!-- circled times = vector product,
   309                                      U+2297 ISOamsb -->
   310 <!ENTITY perp     "&#8869;"> <!-- up tack = orthogonal to = perpendicular,
   311                                      U+22A5 ISOtech -->
   312 <!ENTITY sdot     "&#8901;"> <!-- dot operator, U+22C5 ISOamsb -->
   313 <!-- dot operator is NOT the same character as U+00B7 middle dot -->
   314 
   315 <!-- Miscellaneous Technical -->
   316 <!ENTITY lceil    "&#8968;"> <!-- left ceiling = apl upstile,
   317                                      U+2308 ISOamsc  -->
   318 <!ENTITY rceil    "&#8969;"> <!-- right ceiling, U+2309 ISOamsc  -->
   319 <!ENTITY lfloor   "&#8970;"> <!-- left floor = apl downstile,
   320                                      U+230A ISOamsc  -->
   321 <!ENTITY rfloor   "&#8971;"> <!-- right floor, U+230B ISOamsc  -->
   322 <!ENTITY lang     "&#9001;"> <!-- left-pointing angle bracket = bra,
   323                                      U+2329 ISOtech -->
   324 <!-- lang is NOT the same character as U+003C 'less than'
   325      or U+2039 'single left-pointing angle quotation mark' -->
   326 <!ENTITY rang     "&#9002;"> <!-- right-pointing angle bracket = ket,
   327                                      U+232A ISOtech -->
   328 <!-- rang is NOT the same character as U+003E 'greater than'
   329      or U+203A 'single right-pointing angle quotation mark' -->
   330 
   331 <!-- Geometric Shapes -->
   332 <!ENTITY loz      "&#9674;"> <!-- lozenge, U+25CA ISOpub -->
   333 
   334 <!-- Miscellaneous Symbols -->
   335 <!ENTITY spades   "&#9824;"> <!-- black spade suit, U+2660 ISOpub -->
   336 <!-- black here seems to mean filled as opposed to hollow -->
   337 <!ENTITY clubs    "&#9827;"> <!-- black club suit = shamrock,
   338                                      U+2663 ISOpub -->
   339 <!ENTITY hearts   "&#9829;"> <!-- black heart suit = valentine,
   340                                      U+2665 ISOpub -->
   341 <!ENTITY diams    "&#9830;"> <!-- black diamond suit, U+2666 ISOpub -->
   342 
   343 <!-- C0 Controls and Basic Latin -->
   344 <!ENTITY quot    "&#34;"> <!-- quotation mark = APL quote,
   345                                     U+0022 ISOnum -->
   346 <!ENTITY amp     "&#38;"> <!-- ampersand, U+0026 ISOnum -->
   347 <!ENTITY lt      "&#60;"> <!-- less-than sign, U+003C ISOnum -->
   348 <!ENTITY gt      "&#62;"> <!-- greater-than sign, U+003E ISOnum -->
   349 
   350 <!-- Latin Extended-A -->
   351 <!ENTITY OElig   "&#338;"> <!-- latin capital ligature OE,
   352                                     U+0152 ISOlat2 -->
   353 <!ENTITY oelig   "&#339;"> <!-- latin small ligature oe, U+0153 ISOlat2 -->
   354 <!-- ligature is a misnomer, this is a separate character in some languages -->
   355 <!ENTITY Scaron  "&#352;"> <!-- latin capital letter S with caron,
   356                                     U+0160 ISOlat2 -->
   357 <!ENTITY scaron  "&#353;"> <!-- latin small letter s with caron,
   358                                     U+0161 ISOlat2 -->
   359 <!ENTITY Yuml    "&#376;"> <!-- latin capital letter Y with diaeresis,
   360                                     U+0178 ISOlat2 -->
   361 
   362 <!-- Spacing Modifier Letters -->
   363 <!ENTITY circ    "&#710;"> <!-- modifier letter circumflex accent,
   364                                     U+02C6 ISOpub -->
   365 <!ENTITY tilde   "&#732;"> <!-- small tilde, U+02DC ISOdia -->
   366 
   367 <!-- General Punctuation -->
   368 <!ENTITY ensp    "&#8194;"> <!-- en space, U+2002 ISOpub -->
   369 <!ENTITY emsp    "&#8195;"> <!-- em space, U+2003 ISOpub -->
   370 <!ENTITY thinsp  "&#8201;"> <!-- thin space, U+2009 ISOpub -->
   371 <!ENTITY zwnj    "&#8204;"> <!-- zero width non-joiner,
   372                                     U+200C NEW RFC 2070 -->
   373 <!ENTITY zwj     "&#8205;"> <!-- zero width joiner, U+200D NEW RFC 2070 -->
   374 <!ENTITY lrm     "&#8206;"> <!-- left-to-right mark, U+200E NEW RFC 2070 -->
   375 <!ENTITY rlm     "&#8207;"> <!-- right-to-left mark, U+200F NEW RFC 2070 -->
   376 <!ENTITY ndash   "&#8211;"> <!-- en dash, U+2013 ISOpub -->
   377 <!ENTITY mdash   "&#8212;"> <!-- em dash, U+2014 ISOpub -->
   378 <!ENTITY lsquo   "&#8216;"> <!-- left single quotation mark,
   379                                     U+2018 ISOnum -->
   380 <!ENTITY rsquo   "&#8217;"> <!-- right single quotation mark,
   381                                     U+2019 ISOnum -->
   382 <!ENTITY sbquo   "&#8218;"> <!-- single low-9 quotation mark, U+201A NEW -->
   383 <!ENTITY ldquo   "&#8220;"> <!-- left double quotation mark,
   384                                     U+201C ISOnum -->
   385 <!ENTITY rdquo   "&#8221;"> <!-- right double quotation mark,
   386                                     U+201D ISOnum -->
   387 <!ENTITY bdquo   "&#8222;"> <!-- double low-9 quotation mark, U+201E NEW -->
   388 <!ENTITY dagger  "&#8224;"> <!-- dagger, U+2020 ISOpub -->
   389 <!ENTITY Dagger  "&#8225;"> <!-- double dagger, U+2021 ISOpub -->
   390 <!ENTITY permil  "&#8240;"> <!-- per mille sign, U+2030 ISOtech -->
   391 <!ENTITY lsaquo  "&#8249;"> <!-- single left-pointing angle quotation mark,
   392                                     U+2039 ISO proposed -->
   393 <!-- lsaquo is proposed but not yet ISO standardized -->
   394 <!ENTITY rsaquo  "&#8250;"> <!-- single right-pointing angle quotation mark,
   395                                     U+203A ISO proposed -->
   396 <!-- rsaquo is proposed but not yet ISO standardized -->
   397 <!ENTITY euro   "&#8364;"> <!-- euro sign, U+20AC NEW -->
   398 
   399 ]>
   400 '''
   401 
   402 class visitor(object):
   403     def do(self, tree):
   404         self.visit_node_list(tree.childNodes)
   405 
   406     def visit_node_list(self, nodelist):
   407         for node in nodelist:
   408             self.visit(node)
   409 
   410     def visit(self, node):
   411         nodeType = node.nodeType
   412         if node.nodeType == Node.ELEMENT_NODE:
   413             return self.visit_element(node)
   414         elif node.nodeType == Node.ATTRIBUTE_NODE:
   415             return self.visit_attribute(node)
   416         elif node.nodeType == Node.TEXT_NODE:
   417             return self.visit_text(node)
   418         elif node.nodeType == Node.CDATA_SECTION_NODE:
   419             return self.visit_cdata_section(node)
   420 
   421     def visit_element(self, node):
   422         if len(node.childNodes):
   423             self.visit_node_list(node.childNodes)
   424 
   425     def visit_attribute(self, node):
   426         pass
   427 
   428     def visit_text(self, node):
   429         pass
   430 
   431     def visit_cdata_section(self, node):
   432         pass
   433 
   434 
   435 class strip_whitespace(visitor):
   436 
   437     def visit_element(self, node):
   438         if node.localName == 'p':
   439             # XXX: our formatter adds a whitespace at the end of each paragraph
   440             if node.hasChildNodes() and node.childNodes[-1].nodeType == Node.TEXT_NODE:
   441                 data = node.childNodes[-1].data.rstrip('\n ')
   442                 # Remove it if empty
   443                 if data == '':
   444                     node.removeChild(node.childNodes[-1])
   445                 else:
   446                     node.childNodes[-1].data = data
   447             # Remove empty paragraphs
   448             if not node.hasChildNodes():
   449                 node.parentNode.removeChild(node)
   450 
   451         if node.hasChildNodes():
   452             self.visit_node_list(node.childNodes)
   453 
   454 
   455 class convert_tree(visitor):
   456     white_space = object()
   457     new_line = object()
   458     new_line_dont_remove = object()
   459 
   460     def __init__(self, request, pagename):
   461         self.request = request
   462         self.pagename = pagename
   463 
   464     def do(self, tree):
   465         self.depth = 0
   466         self.text = []
   467         self.visit(tree.documentElement)
   468         self.check_whitespace()
   469         return ''.join(self.text)
   470 
   471     def check_whitespace(self):
   472         i = 0
   473         text = self.text
   474         while i < len(text):
   475             if text[i] is self.white_space:
   476                 if i == 0 or i == len(text)-1:
   477                     del text[i]
   478                 elif text[i-1].endswith(" ") or text[i-1].endswith("\n"):
   479                     # last char of previous element is whitespace
   480                     del text[i]
   481                 elif (text[i+1] is self.white_space or
   482                       # next element is white_space
   483                       text[i+1] is self.new_line):
   484                       # or new_line
   485                     del text[i]
   486                 elif text[i+1].startswith(" ") or text[i+1].startswith("\n"):
   487                     # first char of next element is whitespace
   488                     del text[i]
   489                 else:
   490                     text[i] = " "
   491                     i += 1
   492             elif text[i] is self.new_line:
   493                 if i == 0:
   494                     del text[i]
   495                 elif i == len(text) - 1:
   496                     text[i] = "\n"
   497                     i += 1
   498                 elif text[i-1].endswith("\n") or (
   499                       isinstance(text[i+1], str) and text[i+1].startswith("\n")):
   500                     del text[i]
   501                 else:
   502                     text[i] = "\n"
   503                     i += 1
   504             elif text[i] is self.new_line_dont_remove:
   505                 text[i] = "\n"
   506                 i += 1
   507             else:
   508                 i += 1
   509 
   510     def visit_text(self, node):
   511         self.text.append(node.data)
   512 
   513     def visit_element(self, node):
   514         name = node.localName
   515         if name is None: # not sure this can happen here (DOM comment node), but just for the case
   516             return
   517         func = getattr(self, "process_%s" % name, None)
   518         if func:
   519             func(node)
   520         else:
   521             self.process_inline(node)
   522 
   523     def visit_node_list_element_only(self, nodelist):
   524         for node in nodelist:
   525             if node.nodeType == Node.ELEMENT_NODE:
   526                 self.visit_element(node)
   527 
   528     def node_list_text_only(self, nodelist):
   529         result = []
   530         for node in nodelist:
   531             if node.nodeType == Node.TEXT_NODE:
   532                 result.append(node.data)
   533             else:
   534                 result.extend(self.node_list_text_only(node.childNodes))
   535         return "".join(result)
   536 
   537     def get_desc(self, nodelist):
   538         """ links can have either text or an image as description - we extract
   539             this from the child nodelist and return wiki markup.
   540         """
   541         markup = ''
   542         text = self.node_list_text_only(nodelist).replace("\n", " ").strip()
   543         if text:
   544             # found some text
   545             markup = text
   546         else:
   547             # search for an img / object
   548             for node in nodelist:
   549                 if node.nodeType == Node.ELEMENT_NODE:
   550                     name = node.localName
   551                     if name == 'img':
   552                         markup = self._process_img(node) # XXX problem: markup containts auto-generated alt text with link target
   553                         break
   554                     elif name == 'object':
   555                         markup = self._process_object(node)
   556                         break
   557         return markup
   558 
   559     def process_page(self, node):
   560         for i in node.childNodes:
   561             if i.nodeType == Node.ELEMENT_NODE:
   562                 self.visit_element(i)
   563             elif i.nodeType == Node.TEXT_NODE: # if this is missing, all std text under a headline is dropped!
   564                 txt = i.data.strip() # IMPORTANT: don't leave this unstripped or there will be wrong blanks
   565                 if txt:
   566                     self.text.append(txt)
   567             #we use <pre class="comment"> now, so this is currently unused:
   568             #elif i.nodeType == Node.COMMENT_NODE:
   569             #    self.text.append(i.data)
   570             #    self.text.append("\n")
   571 
   572     def process_br(self, node):
   573         self.text.append(self.new_line) # without this, std multi-line text below some heading misses a whitespace
   574                                         # when it gets merged to float text, like word word wordword word word
   575 
   576     def process_heading(self, node):
   577         text = self.node_list_text_only(node.childNodes).strip()
   578         if text:
   579             depth = int(node.localName[1])
   580             hstr = "=" * depth
   581             self.text.append(self.new_line)
   582             self.text.append("%s %s %s" % (hstr, text.replace("\n", " "), hstr))
   583             self.text.append(self.new_line)
   584 
   585     process_h1 = process_heading
   586     process_h2 = process_heading
   587     process_h3 = process_heading
   588     process_h4 = process_heading
   589     process_h5 = process_heading
   590     process_h6 = process_heading
   591 
   592     def _get_list_item_markup(self, list, listitem):
   593         before = ""
   594         #indent = str(self.depth) * self.depth # nice for debugging :)
   595         indent = " " * self.depth
   596         markup = ""
   597         name = list.localName
   598         if name == 'ol':
   599             class_ = listitem.getAttribute("class")
   600             if class_ == "gap":
   601                 before = self.new_line_dont_remove
   602             if list.hasAttribute("type"):
   603                 type = list.getAttribute("type")
   604             else:
   605                 type = "1"
   606             markup = "%s. " % type
   607         elif name == 'ul':
   608             class_ = listitem.getAttribute("class")
   609             if class_ == "gap":
   610                 before = self.new_line_dont_remove
   611             style = listitem.getAttribute("style")
   612             if re.match(ur"list-style-type:\s*none", style, re.I):
   613                 markup = ". "
   614                 # set markup with white space when list element containes table
   615                 for i in listitem.childNodes:
   616                     if i.nodeType == Node.ELEMENT_NODE:
   617                         if i.localName == 'table':
   618                             markup = ""
   619             else:
   620                 markup = "* "
   621         elif name == 'dl':
   622             markup = ":: "
   623         else:
   624             raise ConvertError("Illegal list type %s" % name)
   625         return before, indent, markup
   626 
   627     def process_dl(self, node):
   628         self.depth += 1
   629         markup = ":: " # can there be a dl dd without dt?
   630         for i in node.childNodes:
   631             if i.nodeType == Node.ELEMENT_NODE:
   632                 name = i.localName
   633                 if name == 'dt':
   634                     before, indent, markup = self._get_list_item_markup(node, i)
   635                     self.text.extend([before, indent])
   636                     text = self.node_list_text_only(i.childNodes)
   637                     self.text.append(text.replace("\n", " "))
   638                 elif name == 'dd':
   639                     self.text.append(markup)
   640                     self.process_list_item(i, indent) # XXX no dt -> indent is undefined!!!
   641                 else:
   642                     raise ConvertError("Illegal list element %s" % i.localName)
   643         self.depth -= 1
   644         if self.depth == 0:
   645             self.text.append(self.new_line_dont_remove)
   646 
   647     def process_list(self, node):
   648         self.depth += 1
   649         for i in node.childNodes:
   650             if i.nodeType == Node.ELEMENT_NODE:
   651                 name = i.localName
   652                 if name == 'li':
   653                     before, indent, markup = self._get_list_item_markup(node, i)
   654                     self.text.extend([before, indent, markup])
   655                     self.process_list_item(i, indent)
   656                 elif name in ('ol', 'ul', ):
   657                     self.process_list(i)
   658                 elif name == 'dl':
   659                     self.process_dl(i)
   660                 else:
   661                     raise ConvertError("Illegal list element %s" % i.localName)
   662         self.depth -= 1
   663         if self.depth == 0:
   664             self.text.append(self.new_line_dont_remove)
   665 
   666     process_ul = process_list
   667     process_ol = process_list
   668 
   669     def empty_paragraph_queue(self, nodelist, indent, need_indent):
   670         if need_indent:
   671             self.text.append(indent)
   672         for i in nodelist:
   673             if i.nodeType == Node.ELEMENT_NODE:
   674                 if i.localName == 'br':
   675                     self.text.append('<<BR>>')
   676                 else:
   677                     self.process_inline(i)
   678             elif i.nodeType == Node.TEXT_NODE:
   679                 self.text.append(i.data.strip('\n').replace('\n', ' '))
   680         self.text.append(self.new_line)
   681         del nodelist[:]
   682 
   683     def process_list_item(self, node, indent):
   684         found = False
   685         need_indent = False
   686         pending = []
   687 
   688         # If this is a empty list item, we just terminate the line
   689         if node.childNodes.length == 0:
   690             self.text.append(self.new_line)
   691             return
   692 
   693         for i in node.childNodes:
   694             name = i.localName
   695 
   696             if name in ('p', 'pre', 'ol', 'ul', 'dl', 'table', ) and pending:
   697                 self.empty_paragraph_queue(pending, indent, need_indent)
   698                 need_indent = True
   699 
   700             if name == 'p':
   701                 if need_indent:
   702                     self.text.append(indent)
   703                 self.process_paragraph_item(i)
   704                 self.text.append(self.new_line)
   705                 found = True
   706             elif name == 'pre':
   707                 if need_indent:
   708                     self.text.append(indent)
   709                 self.process_preformatted_item(i)
   710                 found = True
   711             elif name in ('ol', 'ul', ):
   712                 self.process_list(i)
   713                 found = True
   714             elif name == 'dl':
   715                 self.process_dl(i)
   716                 found = True
   717             elif name == 'table':
   718                 if need_indent:
   719                     self.text.append(indent)
   720                 self.process_table(i)
   721                 found = True
   722             elif name == 'br':
   723                 pending.append(i)
   724             else:
   725                 pending.append(i)
   726 
   727             if found:
   728                 need_indent = True
   729 
   730         if pending:
   731             self.empty_paragraph_queue(pending, indent, need_indent)
   732 
   733     def process_blockquote(self, node):
   734         # XXX this does not really work. e.g.:
   735         # <bq>aaaaaa
   736         # <hr---------->
   737         # <bq>bbbbbb
   738         self.depth += 1
   739         for i in node.childNodes:
   740             if i.nodeType == Node.ELEMENT_NODE:
   741                 name = i.localName
   742                 if name == 'p':
   743                     self.text.append(self.new_line)
   744                     self.text.append(" " * self.depth)
   745                     self.process_p(i)
   746                 elif name == 'pre':
   747                     self.text.append(self.new_line)
   748                     self.text.append(" " * self.depth)
   749                     self.process_pre(i)
   750                 elif name in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', ):
   751                     self.process_heading(i)
   752                 elif name in ('ol', 'ul', ):
   753                     self.process_list(i)
   754                 elif name == 'dl':
   755                     self.process_dl(i)
   756                 elif name == 'a':
   757                     self.process_a(i)
   758                 elif name == 'img':
   759                     self.process_img(i)
   760                 elif name == 'div':
   761                     self.visit_node_list_element_only(i.childNodes)
   762                 elif name == 'blockquote':
   763                     self.process_blockquote(i)
   764                 elif name == 'hr':
   765                     self.process_hr(i)
   766                 elif name == 'br':
   767                     self.process_br(i)
   768                 else:
   769                     raise ConvertError("process_blockquote: Don't support %s element" % name)
   770         self.depth -= 1
   771 
   772     def process_inline(self, node):
   773         if node.nodeType == Node.TEXT_NODE:
   774             self.text.append(node.data.strip('\n').replace('\n', ' '))
   775             return
   776 
   777         # do we need to check for Node.ELEMENT_NODE and return (do nothing)?
   778         name = node.localName # can be None for DOM Comment nodes
   779         if name is None:
   780             return
   781 
   782         # unsupported tags
   783         if name in (u'title', u'meta', u'style'):
   784             return
   785 
   786         if name in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', ): # headers are not allowed here (e.g. inside a ul li),
   787             text = self.node_list_text_only(node.childNodes).strip() # but can be inserted via the editor
   788             self.text.append(text)                          # so we just drop the header markup and keep the text
   789             return
   790 
   791         func = getattr(self, "process_%s" % name, None)
   792         if func:
   793             func(node)
   794             return
   795 
   796         command_close = None
   797         if name in ('em', 'i', ):
   798             command = "''"
   799         elif name in ('strong', 'b', ):
   800             command = "'''"
   801         elif name == 'u':
   802             command = "__"
   803         elif name == 'big':
   804             command = "~+"
   805             command_close = "+~"
   806         elif name == 'small':
   807             command = "~-"
   808             command_close = "-~"
   809         elif name == 'strike':
   810             command = "--("
   811             command_close = ")--"
   812         elif name == 'sub':
   813             command = ",,"
   814         elif name == 'sup':
   815             command = "^"
   816         elif name in ('area', 'center', 'code', 'embed', 'fieldset', 'font', 'form', 'iframe', 'input', 'label', 'link', 'map',
   817                       'meta', 'noscript', 'option', 'script', 'select', 'textarea', 'wbr'):
   818             command = "" # just throw away unsupported elements
   819         else:
   820             raise ConvertError("process_inline: Don't support %s element" % name)
   821 
   822         self.text.append(command)
   823         for i in node.childNodes:
   824             # lonly childnodes checked if they are only 'br'
   825             if command and len(node.childNodes) == 1:
   826                 # formatted br alone is not wanted (who wants a bold br?)
   827                 if i.localName != 'br':
   828                     self.process_inline(i)
   829             else:
   830                 if i.localName == 'br':
   831                     # dont make a real \n because that breaks tables
   832                     self.text.append('<<BR>>')
   833                 else:
   834                     self.process_inline(i)
   835         if command_close:
   836             command = command_close
   837         self.text.append(command)
   838 
   839     def process_span(self, node):
   840         # process span tag for firefox3
   841         node_style = node.getAttribute("style")
   842 
   843         is_strike = node.getAttribute("class") == "strike"
   844         is_strike = is_strike or "line-through" in node_style
   845         is_strong = "bold" in node_style
   846         is_italic = "italic" in node_style
   847         is_underline = "underline" in node_style
   848         is_comment = node.getAttribute("class") == "comment"
   849 
   850         # start tag
   851         if is_comment:
   852             self.text.append("/* ")
   853         if is_strike:
   854             self.text.append("--(")
   855         if is_strong:
   856             self.text.append("'''")
   857         if is_italic:
   858             self.text.append("''")
   859         if is_underline:
   860             self.text.append("__")
   861 
   862         # body
   863         for i in node.childNodes:
   864             self.process_inline(i)
   865 
   866         # end tag
   867         if is_underline:
   868             self.text.append("__")
   869         if is_italic:
   870             self.text.append("''")
   871         if is_strong:
   872             self.text.append("'''")
   873         if is_strike:
   874             self.text.append(")--")
   875         if is_comment:
   876             self.text.append(" */")
   877 
   878     def process_div(self, node):
   879         # process indent
   880         self._process_indent(node)
   881 
   882         # ignore div tags - just descend
   883         for i in node.childNodes:
   884             self.visit(i)
   885 
   886     def process_tt(self, node):
   887         text = self.node_list_text_only(node.childNodes).replace("\n", " ")
   888         if node.getAttribute("class") == "backtick":
   889             self.text.append("`%s`" % text)
   890         else:
   891             self.text.append("{{{%s}}}" % text)
   892 
   893     def process_hr(self, node):
   894         if node.hasAttribute("class"):
   895             class_ = node.getAttribute("class")
   896         else:
   897             class_ = "hr0"
   898         if class_.startswith("hr") and class_[2] in "123456":
   899             length = int(class_[2]) + 4
   900         else:
   901             length = 4
   902         self.text.extend([self.new_line, "-" * length, self.new_line])
   903 
   904     def process_p(self, node):
   905         # process indent
   906         self._process_indent(node)
   907         self.process_paragraph_item(node)
   908         self.text.append("\n\n") # do not use self.new_line here!
   909 
   910     def _process_indent(self, node):
   911         # process indent
   912         node_style = node.getAttribute("style")
   913         match = re.match(r"margin-left:\s*(\d+)px", node_style)
   914         if match:
   915             left_margin = int(match.group(1))
   916             indent_depth = int(left_margin / 40)
   917             if indent_depth > 0:
   918                 self.text.append(' . ')
   919 
   920     def process_paragraph_item(self, node):
   921         for i in node.childNodes:
   922             if i.nodeType == Node.ELEMENT_NODE:
   923                 self.process_inline(i)
   924             elif i.nodeType == Node.TEXT_NODE:
   925                 self.text.append(i.data.strip('\n').replace('\n', ' '))
   926 
   927     def process_pre(self, node):
   928         self.process_preformatted_item(node)
   929         self.text.append(self.new_line)
   930 
   931     def process_preformatted_item(self, node):
   932         if node.hasAttribute("class"):
   933             class_ = node.getAttribute("class")
   934         else:
   935             class_ = None
   936         if class_ == "comment": # we currently use this for stuff like ## or #acl
   937             for i in node.childNodes:
   938                 if i.nodeType == Node.TEXT_NODE:
   939                     self.text.append(i.data.replace('\n', ''))
   940                 elif i.localName == 'br':
   941                     self.text.append(self.new_line)
   942                 else:
   943                     pass
   944         else:
   945             content_buffer = []
   946             longest_inner_formater = ''
   947             bang_args = ''
   948             delimiters = []
   949 
   950             """
   951             below code fixed for MoinMoinBugs/GuiEditorCantNest bug
   952             this has problem when outer delimiter has two more { than inside one
   953             e.g. {{{{{{ {{{ foo }}} }}}}}}  --> {{{{ {{{ foo }}} }}}}
   954                    {{{foo {{{ }}} foo}}} --> {{{{ {{{ }}} }}}}
   955             """
   956 
   957             for i in node.childNodes:
   958                 if i.nodeType == Node.TEXT_NODE:
   959                     # get longest pre tag({{{ or }}}) from content
   960                     delimiters.extend(re.compile("((?u){+)").findall(i.data))
   961                     delimiters.extend(re.compile("((?u)}+)").findall(i.data))
   962                     # when first line is empty, start iteration second line of i.data
   963                     data_lines = i.data.rstrip().split('\n')
   964                     if data_lines[0].strip() == '':
   965                         data_lines = data_lines[1:]
   966                     for line in data_lines:
   967                         if line.strip().startswith('#!'):
   968                             if bang_args == '':
   969                                 bang_args = line.strip()
   970                             else:
   971                                 content_buffer.extend([line, self.new_line])
   972                         else:
   973                             content_buffer.extend([line, self.new_line])
   974                 elif i.localName == 'br':
   975                     content_buffer.append(self.new_line_dont_remove)
   976                 else:
   977                     pass
   978 
   979             if delimiters:
   980                 longest_inner_formater = max(delimiters)
   981 
   982             if (len(longest_inner_formater) >= 3):
   983                 self.text.extend([("{" * (len(longest_inner_formater) + 1)) + bang_args, \
   984                                       self.new_line])
   985                 self.text.extend(content_buffer)
   986                 self.text.extend(["}" * (len(longest_inner_formater) + 1), \
   987                                       self.new_line])
   988             else:
   989                 self.text.extend(["{{{"+bang_args, self.new_line])
   990                 self.text.extend(content_buffer)
   991                 self.text.extend(["}}}", self.new_line])
   992 
    # Maps the value of an HTML align / valign attribute to the character
    # that _cell_style() puts at the start of the cell style for it.
    # Values not listed here are ignored by _cell_style().
    _alignment = {"left": "(",
                  "center": ":",
                  "right": ")",
                  "top": "^",
                  "bottom": "v"}
   998 
   999     def _check_length(self, value):
  1000         try:
  1001             int(value)
  1002             return value + 'px'
  1003         except ValueError:
  1004             return value
  1005 
  1006     def _get_color(self, node, prefix):
  1007         if node.hasAttribute("bgcolor"):
  1008             value = node.getAttribute("bgcolor")
  1009             match = re.match(r"rgb\((\d+),\s*(\d+),\s*(\d+)\)", value)
  1010             if match:
  1011                 value = '#%X%X%X' % (int(match.group(1)), int(match.group(2)), int(match.group(3)))
  1012             else:
  1013                 match = re.match(r"#[0-9A-Fa-f]{6}", value)
  1014             if not prefix and match:
  1015                 result = value
  1016             else:
  1017                 result = '%sbgcolor="%s"' % (prefix, value)
  1018         else:
  1019             result = ''
  1020         return result
  1021 
  1022     def _table_style(self, node):
  1023         # TODO: attrs = get_attrs(node)
  1024         result = []
  1025         result.append(self._get_color(node, 'table'))
  1026         if node.hasAttribute("width"):
  1027             value = node.getAttribute("width")
  1028             result.append('tablewidth="%s"' % self._check_length(value))
  1029         if node.hasAttribute("height"):
  1030             value = node.getAttribute("height")
  1031             result.append('tableheight="%s"' % self._check_length(value))
  1032         if node.hasAttribute("align"):
  1033             value = node.getAttribute("align")
  1034             result.append('tablealign="%s"' % value)
  1035         if node.hasAttribute("style"):
  1036             result.append('tablestyle="%s"' % node.getAttribute("style"))
  1037         if node.hasAttribute("class"):
  1038             result.append('tableclass="%s"' % node.getAttribute("class"))
  1039         return " ".join(result).strip()
  1040 
  1041     def _row_style(self, node):
  1042         # TODO: attrs = get_attrs(node)
  1043         result = []
  1044         result.append(self._get_color(node, 'row'))
  1045         if node.hasAttribute("style"):
  1046             result.append('rowstyle="%s"' % node.getAttribute("style"))
  1047         if node.hasAttribute("class"):
  1048             result.append('rowclass="%s"' % node.getAttribute("class"))
  1049         return " ".join(result).strip()
  1050 
  1051     def _cell_style(self, node):
  1052         # TODO: attrs = get_attrs(node)
  1053         if node.hasAttribute("rowspan"):
  1054             rowspan = ("|%s" % node.getAttribute("rowspan"))
  1055         else:
  1056             rowspan = ""
  1057 
  1058         if node.hasAttribute("colspan"):
  1059             colspan = int(node.getAttribute("colspan"))
  1060         else:
  1061             colspan = 1
  1062 
  1063         spanning = rowspan or colspan > 1
  1064 
  1065         align = ""
  1066         result = []
  1067         result.append(self._get_color(node, ''))
  1068         if node.hasAttribute("align"):
  1069             value = node.getAttribute("align")
  1070             if not spanning or value != "center":
  1071                 # ignore "center" in spanning cells
  1072                 align += self._alignment.get(value, "")
  1073         if node.hasAttribute("valign"):
  1074             value = node.getAttribute("valign")
  1075             if not spanning or value != "center":
  1076                 # ignore "center" in spanning cells
  1077                 align += self._alignment.get(value, "")
  1078         if node.hasAttribute("width"):
  1079             value = node.getAttribute("width")
  1080             if value and value[-1] == "%":
  1081                 align += value
  1082             else:
  1083                 result.append('width="%s"' % self._check_length(value))
  1084         if node.hasAttribute("height"):
  1085             value = node.getAttribute("height")
  1086             result.append('height="%s"' % self._check_length(value))
  1087         if node.hasAttribute("class"):
  1088             result.append('class="%s"' % node.getAttribute("class"))
  1089         if node.hasAttribute("id"):
  1090             result.append('id="%s"' % node.getAttribute("id"))
  1091         if node.hasAttribute("style"):
  1092             result.append('style="%s"' % node.getAttribute("style"))
  1093 
  1094         if align:
  1095             result.insert(0, "%s" % align)
  1096         result.append(rowspan)
  1097         return " ".join(result).strip()
  1098 
  1099     def process_table(self, node, style=""):
  1100         if self.depth == 0:
  1101             self.text.append(self.new_line)
  1102         self.new_table = True
  1103         style += self._table_style(node)
  1104         for i in node.childNodes:
  1105             if i.nodeType == Node.ELEMENT_NODE:
  1106                 name = i.localName
  1107                 if name == 'tr':
  1108                     self.process_table_record(i, style)
  1109                     style = ""
  1110                 elif name in ('thead', 'tbody', 'tfoot'):
  1111                     self.process_table(i, style)
  1112                 elif name == 'caption':
  1113                     self.process_caption(node, i, style)
  1114                     style = ''
  1115                 elif name in ('col', 'colgroup', 'strong', ):
  1116                     pass # we don't support these, but we just ignore them
  1117                 else:
  1118                     raise ConvertError("process_table: Don't support %s element" % name)
  1119             #else:
  1120             #    raise ConvertError("Unexpected node: %r" % i)
  1121         self.text.append(self.new_line_dont_remove)
  1122 
  1123     def process_caption(self, table, node, style=""):
  1124         # get first row
  1125         for i in table.childNodes:
  1126             if i.localName in ('thead', 'tbody', 'tfoot'): # XXX is this correct?
  1127             #if i.localName == 'tbody': (old version)
  1128                 for i in i.childNodes:
  1129                     if i.localName == 'tr':
  1130                         break
  1131                 break
  1132             elif i.localName == 'tr':
  1133                 break
  1134         # count columns
  1135         if i.localName == 'tr':
  1136             colspan = 0
  1137             for td in i.childNodes:
  1138                 if not td.nodeType == Node.ELEMENT_NODE:
  1139                     continue
  1140                 span = td.getAttribute('colspan')
  1141                 try:
  1142                     colspan += int(span)
  1143                 except ValueError:
  1144                     colspan += 1
  1145         else:
  1146             colspan = 1
  1147         text = self.node_list_text_only(node.childNodes).replace('\n', ' ').strip()
  1148         if text:
  1149             if style:
  1150                 style = '<%s>' % style
  1151             self.text.extend(["%s%s'''%s'''||" % ('||' * colspan, style, text), self.new_line_dont_remove])
  1152 
  1153     def process_table_data(self, node, style=""):
  1154         if node.hasAttribute("colspan"):
  1155             colspan = int(node.getAttribute("colspan"))
  1156         else:
  1157             colspan = 1
  1158         self.text.append("||" * colspan)
  1159 
  1160         style += self._cell_style(node)
  1161         if style:
  1162             self.text.append("<%s>" % style)
  1163 
  1164         found = False
  1165         for i in node.childNodes:
  1166             name = i.localName
  1167             if name == 'p':
  1168                 self.process_paragraph_item(i)
  1169                 self.text.append(self.white_space)
  1170                 found = True
  1171         if not found:
  1172             for i in node.childNodes:
  1173                 name = i.localName
  1174                 if i.nodeType == Node.ELEMENT_NODE:
  1175                     if name == 'br':
  1176                         # if we get a br for a cell from e.g. cut and paste from OOo
  1177                         # or if someone simulates a list by enter in a cell
  1178                         # it should be appended as macro BR.
  1179                         self.text.append('<<BR>>')
  1180                         found = True
  1181                         continue
  1182                     else:
  1183                         self.process_inline(i)
  1184                         found = True
  1185                 elif i.nodeType == Node.TEXT_NODE:
  1186                     data = i.data.strip('\n').replace('\n', ' ')
  1187                     if data:
  1188                         found = True
  1189                         self.text.append(data)
  1190         if not found:
  1191             self.text.append(" ")
  1192 
  1193     def process_table_record(self, node, style=""):
  1194         if not self.new_table:
  1195             self.text.append(" " * self.depth)
  1196         else:
  1197             self.new_table = False
  1198         style += self._row_style(node)
  1199         for i in node.childNodes:
  1200             if i.nodeType == Node.ELEMENT_NODE:
  1201                 name = i.localName
  1202                 if name in ('td', 'th', ):
  1203                     self.process_table_data(i, style=style)
  1204                     style = ""
  1205                 else:
  1206                     raise ConvertError("process_table_record: Don't support %s element" % name)
  1207         self.text.extend(["||", self.new_line_dont_remove])
  1208 
    def process_a(self, node):
        """ Convert an <a> element into wiki link markup appended to self.text

        The kind of link is decided from the element's class, title and href
        attributes, checked in this order: interwiki / badinterwiki links,
        links titled 'Self', attachment links (title starts with
        "attachment:"), links below the wiki's script root, mailto links and
        finally any other link. An element without (or with empty) href
        produces no output.

        @param node: the DOM element of the link
        @raise ConvertError: if the element has class "interwiki", but the
                             wiki named in its title can not be resolved or
                             the href does not start with the resolved wiki url
        """
        attrs = get_attrs(node)

        title = attrs.pop('title', '')
        href = attrs.pop('href', None)
        css_class = attrs.get('class')

        # prefix used below to recognize links pointing into this wiki
        scriptname = self.request.script_root
        if scriptname == "":
            scriptname = "/"

        # can either be a link (with href) or an anchor (with e.g. id)
        # we don't need to support anchors here as we currently handle them as <<Anchor(id)>> macro
        if href:
            href = wikiutil.url_unquote(href)

            interwikiname = None
            # presumably the link text as it should appear in the markup - see get_desc
            desc = self.get_desc(node.childNodes)

            # interwiki link
            if css_class == "interwiki":
                wikitag, wikiurl, wikitail, err = wikiutil.resolve_interwiki(
                    self.request, title, "") # the title has the wiki name, page = ""
                if not err and href.startswith(wikiurl):
                    pagename = wikiutil.url_unquote(href[len(wikiurl):].lstrip('/'))
                    interwikiname = "%s:%s" % (wikitag, pagename)
                else:
                    raise ConvertError("Invalid InterWiki link: '%s'" % href)
            elif css_class == "badinterwiki" and title:
                if href == "/": # we used this as replacement for empty href
                    href = ""
                pagename = wikiutil.url_unquote(href)
                interwikiname = "%s:%s" % (title, pagename)
            # NOTE: pagename is only bound if one of the two branches above
            # was taken, which is exactly when interwikiname is set - so the
            # "and" below never evaluates an unbound pagename.
            if interwikiname and pagename == desc:
                if interwiki_re.match(interwikiname+' '): # the blank is needed by interwiki_re to match
                    # this is valid as a free interwiki link
                    self.text.append("%s" % interwikiname)
                else:
                    self.text.append("[[%s]]" % interwikiname)
                return
            elif title == 'Self':
                self.text.append('[[%s|%s]]' % (href, desc))
                return
            elif interwikiname:
                # interwiki link with a description differing from the page name
                self.text.append("[[%s|%s]]" % (interwikiname, desc))
                return

            # fix links generated by a broken copy & paste of gecko based browsers
            brokenness = '../../../..'
            if href.startswith(brokenness):
                href = href[len(brokenness):] # just strip it away!
            # TODO: IE pastes complete http://server/Page/SubPage as href and as text, too

            # Attachments
            if title.startswith("attachment:"):
                attname = wikiutil.url_unquote(title[len("attachment:"):])
                if 'do=get' in href: # quick&dirty fix for not dropping &do=get param
                    parms = '|&do=get'
                else:
                    parms = ''
                # only emit a description if it differs from the attachment
                # name; an empty description slot ('|') is still needed when
                # parms follow
                if attname != desc:
                    desc = '|%s' % desc
                elif parms:
                    desc = '|'
                else:
                    desc = ''
                self.text.append('[[attachment:%s%s%s]]' % (attname, desc, parms))
            # wiki link
            elif href.startswith(scriptname):
                pagename = href[len(scriptname):]
                pagename = pagename.lstrip('/')    # XXX temp fix for generated pagenames starting with /
                if desc == pagename:
                    self.text.append(wikiutil.pagelinkmarkup(pagename))
                # relative link /SubPage
                elif desc.startswith('/') and href.endswith(desc):
                    if pagename.startswith(self.pagename): # is this a subpage of us?
                        self.text.append(wikiutil.pagelinkmarkup(pagename[len(self.pagename):]))
                    else:
                        self.text.append(wikiutil.pagelinkmarkup(pagename))
                # relative link ../
                elif desc.startswith('../') and href.endswith(desc[3:]):
                    self.text.append(wikiutil.pagelinkmarkup(desc))
                # internal link #internal
                elif '#' in href and pagename.startswith(self.pagename):
                    self.text.append(wikiutil.pagelinkmarkup(href[href.index('#'):], desc))
                # labeled link
                else:
                    self.text.append(wikiutil.pagelinkmarkup(pagename, desc))
            # mailto link
            elif href.startswith("mailto:"):
                # a bare address is written as free text, surrounded by white space
                if href == desc or href[len("mailto:"):] == desc:
                    self.text.extend([self.white_space, desc, self.white_space])
                else:
                    self.text.append("[[%s|%s]]" % (href, desc)) # XXX use a (renamed) pagelinkmarkup
            # link
            else:
                # blanks in the target are written as %20 in both cases
                if href == desc:
                    href = href.replace(" ", "%20")
                    self.text.append(href)
                else:
                    href = href.replace(" ", "%20")
                    if desc:
                        desc = '|' + desc
                    self.text.append("[[%s%s]]" % (href, desc))
  1313 
  1314     def process_img(self, node):
  1315         markup = self._process_img(node)
  1316         self.text.extend([self.white_space, markup, self.white_space])
  1317 
  1318     def _process_img(self, node):
  1319         attrs = get_attrs(node)
  1320 
  1321         title = attrs.pop('title', '')
  1322         if title.startswith("smiley:"):
  1323             markup = title[len("smiley:"):]
  1324             return markup
  1325 
  1326         alt = attrs.pop('alt', None)
  1327         src = attrs.pop('src', None)
  1328         css_class = attrs.get('class')
  1329 
  1330         target = src
  1331         if title.startswith("attachment:"):
  1332             target = wikiutil.url_unquote(title)
  1333             if alt == title[len("attachment:"):]:
  1334                 # kill auto-generated alt
  1335                 alt = None
  1336         elif title.startswith("drawing:"):
  1337             target = wikiutil.url_unquote(title)
  1338             if alt == title[len("drawing:"):]:
  1339                 # kill auto-generated alt
  1340                 alt = None
  1341         else:
  1342             if css_class == 'external_image':
  1343                 # kill auto-generated alt and class
  1344                 if src == alt:
  1345                     alt = None
  1346                 del attrs['class']
  1347 
  1348         if alt:
  1349             desc = '|' + alt
  1350         else:
  1351             desc = ''
  1352 
  1353         params = ','.join(['%s="%s"' % (k, v) for k, v in attrs.items()])
  1354                            # if k in ('width', 'height', )])
  1355         if params:
  1356             params = '|' + params
  1357             if not desc:
  1358                 desc = '|'
  1359 
  1360         markup = "{{%s%s%s}}" % (target, desc, params)
  1361         return markup
  1362 
  1363     def process_object(self, node):
  1364         markup = self._process_object(node)
  1365         self.text.append(markup)
  1366 
  1367     def _process_object(self, node):
  1368         attrs = get_attrs(node)
  1369         markup = ''
  1370         data = attrs.pop('data', None)
  1371         if data:
  1372             scheme, netloc, path, params, query, fragment = urlparse.urlparse(data)
  1373             args = url_decode(query)
  1374             action = args.get("action")
  1375             attachname = args.get("target")
  1376 
  1377             if (not scheme and not netloc # same server (local attachment!)
  1378                 and path and action == 'AttachFile' and attachname):
  1379                 scriptname = self.request.script_root or "/"
  1380                 pagename = path[len(scriptname):].lstrip("/")
  1381                 pagename = wikiutil.url_unquote(pagename)
  1382 
  1383                 if pagename != self.request.page.page_name:
  1384                     attachname = "%s/%s" % (pagename, attachname)
  1385                 data = "attachment:%s" % attachname
  1386 
  1387             desc = self.get_desc(node.childNodes)
  1388             if desc:
  1389                 desc = '|' + desc
  1390 
  1391             # Exlude 'type' attribute cause it generates a 'key already present' error.
  1392             params = ','.join(['%s="%s"' % (k, v) for k, v in attrs.items() if not k in ('type', )])
  1393             if params:
  1394                 params = '|' + params
  1395                 if not desc:
  1396                     desc = '|'
  1397             markup = "{{%s%s%s}}" % (data, desc, params)
  1398         return markup
  1399         # TODO: for target PAGES, use some code from process_a to get the pagename from URL
  1400         # TODO: roundtrip attachment: correctly
  1401         # TODO: handle object's content better?
  1402 
  1403 def get_attrs(node):
  1404     """ get the attributes of <node> into an easy-to-use dict """
  1405     attrs = {}
  1406     for attr_name in node.attributes.keys():
  1407         # get attributes of style element
  1408         if attr_name == "style":
  1409             for style_element in node.attributes.get(attr_name).nodeValue.split(';'):
  1410                 if style_element.strip() != '':
  1411                     style_elements = style_element.split(':')
  1412                     if len(style_elements) == 2:
  1413                         attrs[style_elements[0].strip()] = style_elements[1].strip()
  1414         # get attributes without style element
  1415         else:
  1416             attrs[attr_name] = node.attributes.get(attr_name).nodeValue
  1417     return attrs
  1418 
  1419 
  1420 def parse(request, text):
  1421     text = u'<?xml version="1.0"?>%s%s' % (dtd, text)
  1422     text = text.encode(config.charset)
  1423     try:
  1424         return xml.dom.minidom.parseString(text)
  1425     except xml.parsers.expat.ExpatError, msg:
  1426         # this sometimes crashes when it should not, so save the stuff to analyze it:
  1427         logname = os.path.join(request.cfg.data_dir, "expaterror.log")
  1428         f = file(logname, "w")
  1429         f.write(text)
  1430         f.write("\n" + "-"*80 + "\n" + str(msg))
  1431         f.close()
  1432         raise ConvertError('ExpatError: %s (see dump in %s)' % (msg, logname))
  1433 
  1434 def convert(request, pagename, text):
  1435     # Due to expat needing explicitly set namespaces, we set these here to allow pasting
  1436     # from Word / Excel without issues.
  1437     # If you encounter 'ExpatError: unbound prefix', try adding the namespace to the list.
  1438     namespace = [u'xmlns:o="urn:schemas-microsoft-com:office:office"',
  1439                  u'xmlns:x="urn:schemas-microsoft-com:office:excel"',
  1440                  u'xmlns:ss="urn:schemas-microsoft-com:office:spreadsheet"',
  1441                  u'xmlns:c="urn:schemas-microsoft-com:office:component:spreadsheet"',
  1442                  u'xmlns:s="uuid:BDC6E3F0-6DA3-11d1-A2A3-00AA00C14882"',
  1443                  u'xmlns:dt="uuid:C2F41010-65B3-11d1-A29F-00AA00C14882"',
  1444                  u'xmlns:rs="urn:schemas-microsoft-com:rowset"',
  1445                  u'xmlns:z="#RowsetSchema"',
  1446                  u'xmlns:x2="http://schemas.microsoft.com/office/excel/2003/xml"',
  1447                  u'xmlns:sl="http://schemas.microsoft.com/schemaLibrary/2003/core"',
  1448                  u'xmlns:aml="http://schemas.microsoft.com/aml/2001/core"',
  1449                  u'xmlns:w="http://schemas.microsoft.com/office/word/2003/wordml"',
  1450                  u'xmlns:wx="http://schemas.microsoft.com/office/word/2003/auxHint"',
  1451                  u'xmlns:w10="urn:schemas-microsoft-com:office:word"',
  1452                  u'xmlns:v="urn:schemas-microsoft-com:office:vml"']
  1453     text = u'<page %s>%s</page>' % (' '.join(namespace), text)
  1454     tree = parse(request, text)
  1455     strip_whitespace().do(tree)
  1456     text = convert_tree(request, pagename).do(tree)
  1457     text = '\n'.join([s.rstrip() for s in text.splitlines()] + ['']) # remove trailing blanks
  1458     return text
  1459