MoinMoin/script/migration/_conv160_wiki.py
author Thomas Waldmann <tw AT waldmann-edv DOT de>
Wed, 11 Feb 2009 02:34:33 +0100
changeset 4569 3caaa8c74c41
parent 3119 2a380f99afa8
child 5067 10965bc1ee3c
permissions -rw-r--r--
wikiutil: replace moin's cgi/urllib wrappers by calls to werkzeug.utils code
     1 # -*- coding: iso-8859-1 -*-
     2 """
     3     MoinMoin - convert content in 1.5.8 wiki markup to 1.6.0 style
     4                by using a modified 1.5.8 parser as translator.
     5 
     6     Assuming we have this "renames" map:
     7     -------------------------------------------------------
     8     'PAGE', 'some_page'        -> 'some page'
     9     'FILE', 'with%20blank.txt' -> 'with blank.txt'
    10 
    11     Markup transformations needed:
    12     -------------------------------------------------------
    13     ["some_page"]           -> [[some page]] # renamed
    14     [:some_page:some text]  -> [[some page|some text]]
    15     [:page:text]            -> [[page|text]]
    16                                (with a page not being renamed)
    17 
    18     attachment:with%20blank.txt -> [[attachment:with blank.txt]]
    19     attachment:some_page/with%20blank.txt -> [[attachment:some page/with blank.txt]]
    20     The attachment processing should also urllib.unquote the filename (or at
    21     least replace %20 by space) and put it into "quotes" if it contains spaces.
    22 
    23     @copyright: 2007 MoinMoin:JohannesBerg,
    24                 2007 MoinMoin:ThomasWaldmann
    25     @license: GNU GPL, see COPYING for details.
    26 """
    27 
    28 import re
    29 
    30 from MoinMoin import i18n
    31 i18n.wikiLanguages = lambda: {}
    32 
    33 from MoinMoin import config, wikiutil, macro
    34 from MoinMoin.action import AttachFile
    35 from MoinMoin.Page import Page
    36 from MoinMoin.support.python_compatibility import rsplit
    37 
    38 from text_moin158_wiki import Parser
    39 
    40 def convert_wiki(request, pagename, intext, renames):
    41     """ Convert content written in wiki markup """
    42     noeol = False
    43     if not intext.endswith('\r\n'):
    44         intext += '\r\n'
    45         noeol = True
    46     c = Converter(request, pagename, intext, renames)
    47     result = request.redirectedOutput(c.convert, request)
    48     if noeol and result.endswith('\r\n'):
    49         result = result[:-2]
    50     return result
    51 
    52 
    53 STONEAGE_IMAGELINK = False # True for ImageLink(target,image), False for ImageLink(image,target)
    54 
    55 # copied from moin 1.6.0 macro/ImageLink.py (to be safe in case we remove ImageLink some day)
    56 # ... and slightly modified/refactored for our needs here.
    57 # hint: using parse_quoted_separated from wikiutil does NOT work here, because we do not have
    58 #       quoted urls when they contain a '=' char in the 1.5 data input.
    59 def explore_args(args):
    60     """ explore args for positional and keyword parameters """
    61     if args:
    62         args = args.split(',')
    63         args = [arg.strip() for arg in args]
    64     else:
    65         args = []
    66 
    67     kw_count = 0
    68     kw = {} # keyword args
    69     pp = [] # positional parameters
    70 
    71     kwAllowed = ('width', 'height', 'alt')
    72 
    73     for arg in args:
    74         if '=' in arg:
    75             key, value = arg.split('=', 1)
    76             key_lowerstr = str(key.lower())
    77             # avoid that urls with "=" are interpreted as keyword
    78             if key_lowerstr in kwAllowed:
    79                 kw_count += 1
    80                 kw[key_lowerstr] = value
    81             elif not kw_count and '://' in arg:
    82                 # assuming that this is the image
    83                 pp.append(arg)
    84         else:
    85             pp.append(arg)
    86 
    87     if STONEAGE_IMAGELINK and len(pp) >= 2:
    88         pp[0], pp[1] = pp[1], pp[0]
    89 
    90     return pp, kw
    91 
    92 
    93 class Converter(Parser):
    def __init__(self, request, pagename, raw, renames):
        """ Set up the converter state (the base class __init__ is not called).

            @param request: request object (its cfg is used to get the macro names)
            @param pagename: name of the page that gets converted
            @param raw: page content in old markup
            @param renames: rename map, see _replace for the key format
        """
        self.request = request
        self.pagename = pagename
        self.raw = raw
        self.renames = renames
        self._ = None
        self.in_pre = 0

        # fill the macro names into the rules template; ImageLink is always
        # listed, no matter whether it is among the names moin reports.
        macro_names = ['ImageLink', ] + macro.getNames(self.request.cfg)
        self.formatting_rules = self.formatting_rules % {
            'macronames': u'|'.join(macro_names),
        }
   103 
   104     # no change
    def return_word(self, word):
        """ Identity handler: give back the matched text unchanged.

            @param word: the text matched by a markup rule
            @return: word, unmodified
        """
        return word

    # All the markup types below are copied through verbatim by binding their
    # replace() handler name ('_' + group name + '_repl') to return_word.
    _emph_repl = return_word
    _emph_ibb_repl = return_word
    _emph_ibi_repl = return_word
    _emph_ib_or_bi_repl = return_word
    _u_repl = return_word
    _strike_repl = return_word
    _sup_repl = return_word
    _sub_repl = return_word
    _small_repl = return_word
    _big_repl = return_word
    _tt_repl = return_word
    _tt_bt_repl = return_word
    _remark_repl = return_word
    _table_repl = return_word
    _tableZ_repl = return_word
    _rule_repl = return_word
    _smiley_repl = return_word
    _smileyA_repl = return_word
    _ent_repl = return_word
    _ent_numeric_repl = return_word
    _ent_symbolic_repl = return_word
    _heading_repl = return_word
    _email_repl = return_word
    _notword_repl = return_word
    _indent_repl = return_word
    _li_none_repl = return_word
    _li_repl = return_word
    _ol_repl = return_word
    _dl_repl = return_word
    _comment_repl = return_word
   137 
   138     # translate pagenames using pagename translation map
   139 
   140     def _replace(self, key):
   141         """ replace a item_name if it is in the renames dict
   142             key is either a 2-tuple ('PAGE', pagename)
   143             or a 3-tuple ('FILE', pagename, filename)
   144         """
   145         current_page = self.pagename
   146         item_type, page_name, file_name = (key + (None, ))[:3]
   147         abs_page_name = wikiutil.AbsPageName(current_page, page_name)
   148         if item_type == 'PAGE':
   149             key = (item_type, abs_page_name)
   150             new_name = self.renames.get(key)
   151             if new_name is None:
   152                 # we don't have an entry in rename map - apply the same magic
   153                 # to the page name as 1.5 did (" " -> "_") and try again:
   154                 abs_magic_name = abs_page_name.replace(u' ', u'_')
   155                 key = (item_type, abs_magic_name)
   156                 new_name = self.renames.get(key)
   157                 if new_name is None:
   158                     # we didn't find it under the magic name either -
   159                     # that means we do not rename it!
   160                     new_name = page_name
   161             if new_name != page_name and abs_page_name != page_name:
   162                 # we have to fix the (absolute) new_name to be a relative name (as it was before)
   163                 new_name = wikiutil.RelPageName(current_page, new_name)
   164         elif item_type == 'FILE':
   165             key = (item_type, abs_page_name, file_name)
   166             new_name = self.renames.get(key)
   167             if new_name is None:
   168                 # we don't have an entry in rename map - apply the same magic
   169                 # to the page name as 1.5 did (" " -> "_") and try again:
   170                 abs_magic_name = abs_page_name.replace(u' ', u'_')
   171                 key = (item_type, abs_magic_name, file_name)
   172                 new_name = self.renames.get(key)
   173                 if new_name is None:
   174                     # we didn't find it under the magic name either -
   175                     # that means we do not rename it!
   176                     new_name = file_name
   177         return new_name
   178 
   179     def _replace_target(self, target):
   180         target_and_anchor = rsplit(target, '#', 1)
   181         if len(target_and_anchor) > 1:
   182             target, anchor = target_and_anchor
   183             target = self._replace(('PAGE', target))
   184             return '%s#%s' % (target, anchor)
   185         else:
   186             target = self._replace(('PAGE', target))
   187             return target
   188 
   189     # markup conversion
   190 
   191     def _macro_repl(self, word):
   192         # we use [[...]] for links now, macros will be <<...>>
   193         macro_rule = ur"""
   194             \[\[
   195             (?P<macro_name>\w+)
   196             (\((?P<macro_args>.*?)\))?
   197             \]\]
   198         """
   199         word = unicode(word) # XXX why is word not unicode before???
   200         m = re.match(macro_rule, word, re.X|re.U)
   201         macro_name = m.group('macro_name')
   202         macro_args = m.group('macro_args')
   203         if macro_name == 'ImageLink':
   204             fixed, kw = explore_args(macro_args)
   205             #print "macro_args=%r" % macro_args
   206             #print "fixed=%r, kw=%r" % (fixed, kw)
   207             image, target = (fixed + ['', ''])[:2]
   208             if image is None:
   209                 image = ''
   210             if target is None:
   211                 target = ''
   212             if '://' not in image:
   213                 # if it is not a URL, it is meant as attachment
   214                 image = u'attachment:%s' % image
   215             if not target:
   216                 target = image
   217             elif target.startswith('inline:'):
   218                 target = 'attachment:' + target[7:] # we don't support inline:
   219             elif target.startswith('wiki:'):
   220                 target = target[5:] # drop wiki:
   221             image_attrs = []
   222             alt = kw.get('alt') or ''
   223             width = kw.get('width')
   224             if width is not None:
   225                 image_attrs.append(u"width=%s" % width)
   226             height = kw.get('height')
   227             if height is not None:
   228                 image_attrs.append(u"height=%s" % height)
   229             image_attrs = u", ".join(image_attrs)
   230             if image_attrs:
   231                 image_attrs = u'|' + image_attrs
   232             if alt or image_attrs:
   233                 alt = u'|' + alt
   234             result = u'[[%s|{{%s%s%s}}]]' % (target, image, alt, image_attrs)
   235         else:
   236             if macro_args:
   237                 macro_args = u"(%s)" % macro_args
   238             else:
   239                 macro_args = u''
   240             result = u"<<%s%s>>" % (macro_name, macro_args)
   241         # XXX later check whether some to be renamed pagename is used as macro param
   242         return result
   243 
   244     def _word_repl(self, word, text=None):
   245         """Handle WikiNames."""
   246         if not text:
   247             return word
   248         else: # internal use:
   249             return '[[%s|%s]]' % (word, text)
   250 
   251     def _wikiname_bracket_repl(self, word):
   252         """Handle special-char wikinames."""
   253         pagename = word[2:-2]
   254         if pagename:
   255             pagename = self._replace(('PAGE', pagename))
   256             return '[[%s]]' % pagename
   257         else:
   258             return word
   259 
   260     def _interwiki_repl(self, word):
   261         """Handle InterWiki links."""
   262         wikitag, wikiurl, wikitail, wikitag_bad = wikiutil.resolve_wiki(self.request, word)
   263         if wikitag_bad:
   264             return word
   265         else:
   266             wikiname, pagename = word.split(':', 1)
   267             pagename = wikiutil.url_unquote(pagename) # maybe someone has used %20 for blanks in pagename
   268             camelcase = wikiutil.isStrictWikiname(pagename)
   269             if wikiname in ('Self', self.request.cfg.interwikiname):
   270                 pagename = self._replace(('PAGE', pagename))
   271                 if camelcase:
   272                     return '%s' % pagename # optimize special case
   273                 else:
   274                     return '[[%s]]' % pagename # optimize special case
   275             else:
   276                 if ' ' in pagename: # we could get a ' '  by urlunquoting
   277                     return '[[%s:%s]]' % (wikiname, pagename)
   278                 else:
   279                     return '%s:%s' % (wikiname, pagename)
   280 
    def interwiki(self, url_and_text):
        """ Convert a wiki: URL, optionally with a link text.

            @param url_and_text: list with 1 item (the url) or 2 items
                                 (url, link text); the url is expected to be
                                 'scheme:rest'
            @return: new markup: a [[...]] link for local pages, and for other
                     wikis either {{...}} (picture), a bare Wiki:Page or a
                     [[Wiki:Page|text]] link
        """
        if len(url_and_text) == 1:
            url = url_and_text[0]
            text = ''
        else:
            url, text = url_and_text
            # the text gets appended right after the target, so prepend the separator
            text = '|' + text

        # drop the scheme, then find out whether this points to a local page
        # (sub page, existing page, our own wiki) - those get plain page links.
        scheme, url = url.split(':', 1)
        wikiname, pagename = wikiutil.split_wiki(url)
        if (url.startswith(wikiutil.CHILD_PREFIX) or # fancy link to subpage [wiki:/SubPage text]
            Page(self.request, url).exists()): # fancy link to local page [wiki:LocalPage text]
            pagename = wikiutil.url_unquote(url)
            pagename = self._replace_target(pagename)
            return '[[%s%s]]' % (pagename, text)
        if wikiname in ('Self', self.request.cfg.interwikiname, ''): # [wiki:Self:LocalPage text] or [:LocalPage:text]
            pagename = wikiutil.url_unquote(pagename)
            pagename = self._replace_target(pagename)
            return '[[%s%s]]' % (pagename, text)

        wikitag, wikiurl, wikitail, wikitag_bad = wikiutil.resolve_wiki(self.request, url)
        if wikitag_bad: # likely we got some /InterWiki as wikitail, we don't want that!
            pagename = wikiutil.url_unquote(pagename)
            pagename = self._replace_target(pagename)
            wikitail = pagename
        else: # good
            wikitail = wikiutil.url_unquote(wikitail)

        # pictures get embedded; everything else gets linked, bare if the
        # target has no blank and there is no link text
        if wikiutil.isPicture(wikitail):
            return '{{%s:%s%s}}' % (wikitag, wikitail, text)
        else:
            if ' ' not in wikitail and not text:
                return '%s:%s' % (wikitag, wikitail)
            else:
                return '[[%s:%s%s]]' % (wikitag, wikitail, text)
   319 
   320     def attachment(self, url_and_text):
   321         """ This gets called on attachment URLs. """
   322         if len(url_and_text) == 1:
   323             url = url_and_text[0]
   324             text = ''
   325         else:
   326             url, text = url_and_text
   327             text = '|' + text
   328 
   329         scheme, fname = url.split(":", 1)
   330         #scheme, fname, text = wikiutil.split_wiki(target_and_text)
   331 
   332         pagename, fname = AttachFile.absoluteName(fname, self.pagename)
   333         from_this_page = pagename == self.pagename
   334         fname = self._replace(('FILE', pagename, fname))
   335         fname = wikiutil.url_unquote(fname)
   336         fname = self._replace(('FILE', pagename, fname))
   337         pagename = self._replace(('PAGE', pagename))
   338         if from_this_page:
   339             name = fname
   340         else:
   341             name = "%s/%s" % (pagename, fname)
   342 
   343         if scheme == 'drawing':
   344             return "{{drawing:%s%s}}" % (name, text)
   345 
   346         # check for image URL, and possibly return IMG tag
   347         # (images are always inlined, just like for other URLs)
   348         if wikiutil.isPicture(name):
   349             return "{{attachment:%s%s}}" % (name, text)
   350 
   351         # inline the attachment
   352         if scheme == 'inline':
   353             return '{{attachment:%s%s}}' % (name, text)
   354         else: # 'attachment'
   355             return '[[attachment:%s%s]]' % (name, text)
   356 
   357     def _url_repl(self, word):
   358         """Handle literal URLs including inline images."""
   359         scheme = word.split(":", 1)[0]
   360 
   361         if scheme == 'wiki':
   362             return self.interwiki([word])
   363         if scheme in self.attachment_schemas:
   364             return '%s' % self.attachment([word])
   365 
   366         if wikiutil.isPicture(word): # magic will go away in 1.6!
   367             return '{{%s}}' % word # new markup for inline images
   368         else:
   369             return word
   370 
   371     def _url_bracket_repl(self, word):
   372         """Handle bracketed URLs."""
   373         word = word[1:-1] # strip brackets
   374 
   375         # Local extended link?
   376         if word[0] == ':':
   377             words = word[1:].split(':', 1)
   378             link, text = (words + ['', ''])[:2]
   379             if link.strip() == text.strip():
   380                 text = ''
   381             link = self._replace_target(link)
   382             if text:
   383                 text = '|' + text
   384             return '[[%s%s]]' % (link, text)
   385 
   386         # Traditional split on space
   387         words = word.split(None, 1)
   388         if words[0][0] == '#':
   389             # anchor link
   390             link, text = (words + ['', ''])[:2]
   391             if link.strip() == text.strip():
   392                 text = ''
   393             #link = self._replace_target(link)
   394             if text:
   395                 text = '|' + text
   396             return '[[%s%s]]' % (link, text)
   397 
   398         scheme = words[0].split(":", 1)[0]
   399         if scheme == "wiki":
   400             return self.interwiki(words)
   401             #scheme, wikiname, pagename, text = self.interwiki(word)
   402             #print "%r %r %r %r" % (scheme, wikiname, pagename, text)
   403             #if wikiname in ('Self', self.request.cfg.interwikiname, ''):
   404             #    if text:
   405             #        text = '|' + text
   406             #    return '[[%s%s]]' % (pagename, text)
   407             #else:
   408             #    if text:
   409             #        text = '|' + text
   410             #    return "[[%s:%s%s]]" % (wikiname, pagename, text)
   411         if scheme in self.attachment_schemas:
   412             m = self.attachment(words)
   413             if m.startswith('{{') and m.endswith('}}'):
   414                 # with url_bracket markup, 1.5.8 parser does not embed, but link!
   415                 m = '[[%s]]' % m[2:-2]
   416             return m
   417 
   418         target, desc = (words + ['', ''])[:2]
   419         if wikiutil.isPicture(desc) and re.match(self.url_rule, desc):
   420             #return '[[%s|{{%s|%s}}]]' % (words[0], words[1], words[0])
   421             return '[[%s|{{%s}}]]' % (target, desc)
   422         else:
   423             if desc:
   424                 desc = '|' + desc
   425             return '[[%s%s]]' % (target, desc)
   426 
   427     def _pre_repl(self, word):
   428         w = word.strip()
   429         if w == '{{{' and not self.in_pre:
   430             self.in_pre = True
   431         elif w == '}}}' and self.in_pre:
   432             self.in_pre = False
   433         return word
   434 
    def _processor_repl(self, word):
        """ Handle a processor markup match: keep it as it is.

            @param word: the matched markup
            @return: word, unmodified
        """
        # from here on we are in a pre section (see the use of in_pre in convert)
        self.in_pre = True
        return word
   438 
   439     def scan(self, scan_re, line):
   440         """ Scans one line - append text before match, invoke replace() with match, and add text after match.  """
   441         result = []
   442         lastpos = 0
   443 
   444         for match in scan_re.finditer(line):
   445             # Add text before the match
   446             if lastpos < match.start():
   447                 result.append(line[lastpos:match.start()])
   448             # Replace match with markup
   449             result.append(self.replace(match))
   450             lastpos = match.end()
   451 
   452         # Add remainder of the line
   453         result.append(line[lastpos:])
   454         return u''.join(result)
   455 
   456 
   457     def replace(self, match):
   458         """ Replace match using type name """
   459         result = []
   460         for _type, hit in match.groupdict().items():
   461             if hit is not None and not _type in ["hmarker", ]:
   462                 # Get replace method and replace hit
   463                 replace = getattr(self, '_' + _type + '_repl')
   464                 # print _type, hit
   465                 result.append(replace(hit))
   466                 return ''.join(result)
   467         else:
   468             # We should never get here
   469             import pprint
   470             raise Exception("Can't handle match %r\n%s\n%s" % (
   471                 match,
   472                 pprint.pformat(match.groupdict()),
   473                 pprint.pformat(match.groups()),
   474             ))
   475 
   476         return ""
   477 
   478     def convert(self, request):
   479         """ For each line, scan through looking for magic
   480             strings, outputting verbatim any intervening text.
   481         """
   482         self.request = request
   483         # prepare regex patterns
   484         rules = self.formatting_rules.replace('\n', '|')
   485         if self.request.cfg.bang_meta:
   486             rules = ur'(?P<notword>!%(word_rule)s)|%(rules)s' % {
   487                 'word_rule': self.word_rule,
   488                 'rules': rules,
   489             }
   490         pre_rules = r'''(?P<pre>\}\}\})'''
   491         pre_scan_re = re.compile(pre_rules, re.UNICODE)
   492         scan_re = re.compile(rules, re.UNICODE)
   493         eol_re = re.compile(r'\r?\n', re.UNICODE)
   494 
   495         rawtext = self.raw
   496 
   497         # remove last item because it's guaranteed to be empty
   498         self.lines = eol_re.split(rawtext)[:-1]
   499         self.in_processing_instructions = True
   500 
   501         # Main loop
   502         for line in self.lines:
   503             # ignore processing instructions
   504             if self.in_processing_instructions:
   505                 found = False
   506                 for pi in ("##", "#format", "#refresh", "#redirect", "#deprecated",
   507                            "#pragma", "#form", "#acl", "#language"):
   508                     if line.lower().startswith(pi):
   509                         self.request.write(line + '\r\n')
   510                         found = True
   511                         break
   512                 if not found:
   513                     self.in_processing_instructions = False
   514                 else:
   515                     continue # do not parse this line
   516             if not line.strip():
   517                 self.request.write(line + '\r\n')
   518             else:
   519                 # Scan line, format and write
   520                 scanning_re = self.in_pre and pre_scan_re or scan_re
   521                 formatted_line = self.scan(scanning_re, line)
   522                 self.request.write(formatted_line + '\r\n')
   523