MoinMoin/script/migration/_conv160a.py
author Thomas Waldmann <tw AT waldmann-edv DOT de>
Wed, 11 Feb 2009 02:34:33 +0100
changeset 4569 3caaa8c74c41
parent 3925 44d856f8a110
child 4576 95dc19f1ea75
permissions -rw-r--r--
wikiutil: replace moin's cgi/urllib wrappers by calls to werkzeug.utils code
     1 # -*- coding: iso-8859-1 -*-
     2 """
     3     MoinMoin - migration from 1.6.0alpha (rev 1844: 58ebb64243cc - used a similar markup as 1.5.8, but with quotes for linking stuff with blanks) to 1.6.0 (creole link style)
     4 
     5     What it does:
     6 
     7     a) reverse underscore == blank stuff in pagenames (introducing this was a fault)
     8 
     9                    pagename            quoted pagename
    10        -----------------------------------------------------
    11        old         MainPage/Sub_Page   MainPage(2f)Sub_Page
    12        new         MainPage/Sub Page   MainPage(2f)Sub(20)Page    or
    13        new         MainPage/Sub_Page   MainPage(2f)Sub_Page       (user has to decide by editing rename1.txt)
    14 
    15 
    16                    markup
    17        ----------------------------------------------------
    18        old         MoinMoin:MainPage/Sub_Page      ../Sub_Page2
    19        new         [[MoinMoin:MainPage/Sub Page]]  [[../Sub Page2]]
    20 
    21 
    22     b) decode url encoded chars in attachment names (and quote the whole fname):
    23 
    24                    markup
    25        ----------------------------------------------------
    26        old         attachment:file%20with%20blanks.txt
    27        new         [[attachment:file with blanks.txt]]
    28 
    29     c) users: move bookmarks from separate files into user profile
    30     d) users: generate new name[] for lists and name{} for dicts
    31 
    32     e) kill all */MoinEditorBackup pages (replaced by drafts functionality)
    33 
    34     @copyright: 2007 by Thomas Waldmann
    35     @license: GNU GPL, see COPYING for details.
    36 """
    37 
    38 import os.path
    39 import re
    40 import time
    41 import codecs, urllib, glob
    42 
    43 from MoinMoin import config, wikiutil
    44 from MoinMoin.script.migration.migutil import opj, listdir, copy_file, move_file, copy_dir
    45 
    46 import mimetypes # this MUST be after wikiutil import!
    47 
    48 from _conv160b_wiki import convert_wiki
    49 
# Controls how the converted markup is stored:
# - True: all existing revisions are copied unchanged and the converted text
#   is written as an additional revision on top (the 'current' file and the
#   page edit-log get a matching new entry, unless the page is deleted).
# - False: no new revision is created, the current revision is converted
#   in place.
create_rev = True
    51 
    52 def markup_converter(request, pagename, text, renames):
    53     """ Convert the <text> content of page <pagename>, using <renames> dict
    54         to rename links correctly. Additionally, convert some changed markup.
    55     """
    56     if text.startswith('<?xml'):
    57         # would be done with xslt processor
    58         return text
    59 
    60     pis, body = wikiutil.get_processing_instructions(text)
    61     for pi, val in pis:
    62         if pi == 'format' and val != 'wiki':
    63             # not wiki page
    64             return text
    65 
    66     text = convert_wiki(request, pagename, text, renames)
    67     return text
    68 
    69 
    70 class EventLog:
    71     def __init__(self, request, fname):
    72         self.request = request
    73         self.fname = fname
    74         self.data = None
    75         self.renames = {}
    76 
    77     def read(self):
    78         """ read complete event-log from disk """
    79         data = []
    80         try:
    81             lineno = 0
    82             f = file(self.fname, 'r')
    83             for line in f:
    84                 lineno += 1
    85                 line = line.replace('\r', '').replace('\n', '')
    86                 if not line.strip(): # skip empty lines
    87                     continue
    88                 fields = line.split('\t')
    89                 try:
    90                     timestamp, action, kvpairs = fields[:3]
    91                     timestamp = int(timestamp)
    92                     kvdict = wikiutil.parseQueryString(kvpairs)
    93                     data.append((timestamp, action, kvdict))
    94                 except ValueError, err:
    95                     # corrupt event log line, log error and skip it
    96                     print "Error: invalid event log (%s) line %d, err: %s, SKIPPING THIS LINE!" % (self.fname, lineno, str(err))
    97             f.close()
    98         except IOError, err:
    99             # no event-log
   100             pass
   101         self.data = data
   102 
   103     def write(self, fname):
   104         """ write complete event-log to disk """
   105         if self.data:
   106             f = file(fname, 'w')
   107             for timestamp, action, kvdict in self.data:
   108                 pagename = kvdict.get('pagename')
   109                 if pagename and ('PAGE', pagename) in self.renames:
   110                     kvdict['pagename'] = self.renames[('PAGE', pagename)]
   111                 kvpairs = wikiutil.makeQueryString(kvdict)
   112                 fields = str(timestamp), action, kvpairs
   113                 line = '\t'.join(fields) + '\n'
   114                 f.write(line)
   115             f.close()
   116 
   117     def copy(self, destfname, renames):
   118         self.renames = renames
   119         self.read()
   120         self.write(destfname)
   121 
   122 
   123 class EditLog:
   124     def __init__(self, request, fname):
   125         self.request = request
   126         self.fname = fname
   127         self.data = None
   128         self.renames = {}
   129 
   130     def read(self):
   131         """ read complete edit-log from disk """
   132         data = {}
   133         try:
   134             f = file(self.fname, 'r')
   135             for line in f:
   136                 line = line.replace('\r', '').replace('\n', '')
   137                 if not line.strip(): # skip empty lines
   138                     continue
   139                 fields = line.split('\t') + [''] * 9
   140                 timestamp, rev, action, pagename, ip, hostname, userid, extra, comment = fields[:9]
   141                 timestamp = int(timestamp)
   142                 rev = int(rev)
   143                 pagename = wikiutil.unquoteWikiname(pagename)
   144                 data[(timestamp, rev, pagename)] = (timestamp, rev, action, pagename, ip, hostname, userid, extra, comment)
   145             f.close()
   146         except IOError, err:
   147             # no edit-log
   148             pass
   149         self.data = data
   150 
   151     def write(self, fname, deleted=False):
   152         """ write complete edit-log to disk """
   153         if self.data:
   154             editlog = self.data.items()
   155             editlog.sort()
   156             f = file(fname, "w")
   157             max_rev = 0
   158             for key, fields in editlog:
   159                 timestamp, rev, action, pagename, ip, hostname, userid, extra, comment = fields
   160                 if action.startswith('ATT'):
   161                     try:
   162                         fname = urllib.unquote(extra).decode('utf-8')
   163                     except UnicodeDecodeError:
   164                         fname = urllib.unquote(extra).decode('iso-8859-1')
   165                     if ('FILE', pagename, fname) in self.renames:
   166                         fname = self.renames[('FILE', pagename, fname)]
   167                     extra = urllib.quote(fname.encode('utf-8'))
   168                 if ('PAGE', pagename) in self.renames:
   169                     pagename = self.renames[('PAGE', pagename)]
   170                 timestamp = str(timestamp)
   171                 if rev != 99999999:
   172                     max_rev = max(rev, max_rev)
   173                 revstr = '%08d' % rev
   174                 pagename = wikiutil.quoteWikinameFS(pagename)
   175                 fields = timestamp, revstr, action, pagename, ip, hostname, userid, extra, comment
   176                 log_str = '\t'.join(fields) + '\n'
   177                 f.write(log_str)
   178             if create_rev and not deleted:
   179                 timestamp = str(wikiutil.timestamp2version(time.time()))
   180                 revstr = '%08d' % (max_rev + 1)
   181                 action = 'SAVE'
   182                 ip = '127.0.0.1'
   183                 hostname = 'localhost'
   184                 userid = ''
   185                 extra = ''
   186                 comment = "converted to 1.6 markup"
   187                 fields = timestamp, revstr, action, pagename, ip, hostname, userid, extra, comment
   188                 log_str = '\t'.join(fields) + '\n'
   189                 f.write(log_str)
   190             f.close()
   191 
   192     def copy(self, destfname, renames, deleted=False):
   193         self.renames = renames
   194         self.read()
   195         self.write(destfname, deleted)
   196 
   197 
   198 class PageRev:
   199     """ a single revision of a page """
   200     def __init__(self, request, pagename, rev_dir, rev):
   201         self.request = request
   202         self.pagename = pagename
   203         self.rev_dir = rev_dir
   204         self.rev = rev
   205 
   206     def read(self):
   207         fname = opj(self.rev_dir, '%08d' % self.rev)
   208         f = file(fname, "rb")
   209         data = f.read()
   210         f.close()
   211         data = data.decode(config.charset)
   212         return data
   213 
   214     def write(self, data, rev_dir, convert, rev=None):
   215         if rev is None:
   216             rev = self.rev
   217         if convert:
   218             data = markup_converter(self.request, self.pagename, data, self.renames)
   219         fname = opj(rev_dir, '%08d' % rev)
   220         data = data.encode(config.charset)
   221         f = file(fname, "wb")
   222         f.write(data)
   223         f.close()
   224 
   225     def copy(self, rev_dir, renames, convert=False, new_rev=None):
   226         self.renames = renames
   227         data = self.read()
   228         self.write(data, rev_dir, convert, new_rev)
   229 
   230 
   231 class Attachment:
   232     """ a single attachment """
   233     def __init__(self, request, attach_dir, attfile):
   234         self.request = request
   235         self.path = opj(attach_dir, attfile)
   236         self.name = attfile.decode('utf-8', 'replace')
   237 
   238     def copy(self, attach_dir):
   239         """ copy attachment file from orig path to new destination """
   240         attfile = self.name.encode('utf-8')
   241         dest = opj(attach_dir, attfile)
   242         copy_file(self.path, dest)
   243 
   244 
   245 class Page:
   246     """ represents a page with all related data """
   247     def __init__(self, request, pages_dir, qpagename):
   248         self.request = request
   249         self.name = wikiutil.unquoteWikiname(qpagename)
   250         self.name_old = self.name # renaming: still original name when self.name has the new name
   251         self.page_dir = opj(pages_dir, qpagename)
   252         self.current = None # int current
   253         self.editlog = None # dict (see read_editlog)
   254         self.revlist = None # list of ints (page text revisions)
   255         self.revisions = None # dict int: pagerev obj
   256         self.attachments = None # dict of unicode fname: full path
   257         self.renames = {} # info for renaming pages/attachments
   258 
   259     def read(self):
   260         """ read a page, including revisions, log, attachments from disk """
   261         page_dir = self.page_dir
   262         # read current file
   263         current_fname = opj(page_dir, 'current')
   264         if os.path.exists(current_fname):
   265             current_file = file(current_fname, "r")
   266             current_rev = current_file.read()
   267             current_file.close()
   268             try:
   269                 self.current = int(current_rev)
   270             except ValueError:
   271                 print "Error: invalid current file %s, SKIPPING THIS PAGE!" % current_fname
   272                 return
   273         # read edit-log
   274         editlog_fname = opj(page_dir, 'edit-log')
   275         if os.path.exists(editlog_fname):
   276             self.editlog = EditLog(self.request, editlog_fname)
   277         # read page revisions
   278         rev_dir = opj(page_dir, 'revisions')
   279         if os.path.exists(rev_dir):
   280             revlist = listdir(rev_dir)
   281             revlist = [int(rev) for rev in revlist]
   282             revlist.sort()
   283             self.revlist = revlist
   284             self.revisions = {}
   285             for rev in revlist:
   286                 self.revisions[rev] = PageRev(self.request, self.name_old, rev_dir, rev)
   287         # set deleted status
   288         self.is_deleted = not self.revisions or self.current not in self.revisions
   289         # read attachment filenames
   290         attach_dir = opj(page_dir, 'attachments')
   291         if os.path.exists(attach_dir):
   292             self.attachments = {}
   293             attlist = listdir(attach_dir)
   294             for attfile in attlist:
   295                 a = Attachment(self.request, attach_dir, attfile)
   296                 self.attachments[a.name] = a
   297 
   298     def write(self, pages_dir):
   299         """ write a page, including revisions, log, attachments to disk """
   300         if ('PAGE', self.name) in self.renames:
   301             name_new = self.renames[('PAGE', self.name)]
   302             if name_new != self.name:
   303                 print "Renaming page %r -> %r" % (self.name, name_new)
   304                 self.name_old = self.name
   305                 self.name = name_new
   306         qpagename = wikiutil.quoteWikinameFS(self.name)
   307         page_dir = opj(pages_dir, qpagename)
   308         os.makedirs(page_dir)
   309         # write current file
   310         current = self.current
   311         if current is not None:
   312             if create_rev and not self.is_deleted:
   313                 current += 1
   314             current_fname = opj(page_dir, 'current')
   315             current_file = file(current_fname, "w")
   316             current_str = '%08d\n' % current
   317             current_file.write(current_str)
   318             current_file.close()
   319         # copy edit-log
   320         if self.editlog is not None:
   321             editlog_fname = opj(page_dir, 'edit-log')
   322             self.editlog.copy(editlog_fname, self.renames, deleted=self.is_deleted)
   323         # copy page revisions
   324         if self.revisions is not None:
   325             rev_dir = opj(page_dir, 'revisions')
   326             os.makedirs(rev_dir)
   327             for rev in self.revlist:
   328                 if create_rev:
   329                     self.revisions[rev].copy(rev_dir, self.renames)
   330                 else:
   331                     if int(rev) == self.current:
   332                         self.revisions[rev].copy(rev_dir, self.renames, convert=True)
   333                     else:
   334                         self.revisions[rev].copy(rev_dir, self.renames)
   335             if create_rev and not self.is_deleted:
   336                 self.revisions[rev].copy(rev_dir, self.renames, convert=True, new_rev=rev+1)
   337 
   338         # copy attachments
   339         if self.attachments is not None:
   340             attach_dir = opj(page_dir, 'attachments')
   341             os.makedirs(attach_dir)
   342             for fn, att in self.attachments.items():
   343                 # we have to check for renames here because we need the (old) pagename, too:
   344                 if ('FILE', self.name_old, fn) in self.renames:
   345                     fn_new = self.renames[('FILE', self.name_old, fn)]
   346                     if fn_new != fn:
   347                         print "Renaming file %r %r -> %r" % (self.name_old, fn, fn_new)
   348                         att.name = fn_new
   349                 att.copy(attach_dir)
   350 
   351     def copy(self, pages_dir, renames):
   352         self.renames = renames
   353         self.read()
   354         self.write(pages_dir)
   355 
   356 
   357 class User:
   358     """ represents a user with all related data """
   359     def __init__(self, request, users_dir, uid):
   360         self.request = request
   361         self.uid = uid
   362         self.users_dir = users_dir
   363         self.profile = None
   364         self.bookmarks = None
   365 
   366     def read(self):
   367         """ read profile and bookmarks data from disk """
   368         self.profile = {}
   369         fname = opj(self.users_dir, self.uid)
   370         # read user profile
   371         f = codecs.open(fname, 'r', config.charset)
   372         for line in f:
   373             line = line.replace(u'\r', '').replace(u'\n', '')
   374             if not line.strip() or line.startswith(u'#'): # skip empty or comment lines
   375                 continue
   376             try:
   377                 key, value = line.split(u'=', 1)
   378             except Exception, err:
   379                 print "Error: User reader can not parse line %r from profile %r (%s)" % (line, fname, str(err))
   380                 continue
   381             self.profile[key] = value
   382         f.close()
   383         # read bookmarks
   384         self.bookmarks = {}
   385         fname_pattern = opj(self.users_dir, "%s.*.bookmark" % self.uid)
   386         for fname in glob.glob(fname_pattern):
   387             f = file(fname, "r")
   388             bookmark = f.read()
   389             f.close()
   390             wiki = fname.replace('.bookmark', '').replace(opj(self.users_dir, self.uid+'.'), '')
   391             self.bookmarks[wiki] = int(bookmark)
   392         # don't care about trail
   393 
   394     def write(self, users_dir):
   395         """ write profile and bookmarks data to disk """
   396         fname = opj(users_dir, self.uid)
   397         f = codecs.open(fname, 'w', config.charset)
   398         for key, value in self.profile.items():
   399             if key in (u'subscribed_pages', u'quicklinks'):
   400                 pages = value.split(u'\t')
   401                 for i in range(len(pages)):
   402                     pagename = pages[i]
   403                     try:
   404                         interwiki, pagename = pagename.split(u':', 1)
   405                     except:
   406                         interwiki, pagename = u'Self', pagename
   407                     if interwiki == u'Self' or interwiki == self.request.cfg.interwikiname:
   408                         if ('PAGE', pagename) in self.renames:
   409                             pagename = self.renames[('PAGE', pagename)]
   410                             pages[i] = u'%s:%s' % (interwiki, pagename)
   411                 key += '[]' # we have lists here
   412                 value = u'\t'.join(pages)
   413                 f.write(u"%s=%s\n" % (key, value))
   414             else:
   415                 f.write(u"%s=%s\n" % (key, value))
   416         bookmark_entries = [u'%s:%s' % item for item in self.bookmarks.items()]
   417         key = u"bookmarks{}"
   418         value = u'\t'.join(bookmark_entries)
   419         f.write(u"%s=%s\n" % (key, value))
   420         f.close()
   421         # don't care about trail
   422 
   423     def copy(self, users_dir, renames):
   424         self.renames = renames
   425         self.read()
   426         self.write(users_dir)
   427 
   428 
   429 class DataConverter(object):
   430     def __init__(self, request, src_data_dir, dest_data_dir):
   431         self.request = request
   432         self.sdata = src_data_dir
   433         self.ddata = dest_data_dir
   434         self.pages = {}
   435         self.users = {}
   436         self.complete = {}
   437         self.renames = {}
   438         self.complete_fname = opj(self.sdata, 'complete.txt')
   439         self.rename_fname1 = opj(self.sdata, 'rename1.txt')
   440         self.rename_fname2 = opj(self.sdata, 'rename2.txt')
   441 
   442     def pass1(self):
   443         """ First create the rename list - the user has to review/edit it as
   444             we can't decide about page/attachment names automatically.
   445         """
   446         self.read_src()
   447         # pages
   448         for pn, p in self.pages.items():
   449             p.read()
   450             if not p.revisions:
   451                 continue # we don't care for pages with no revisions (trash)
   452             if pn.endswith('/MoinEditorBackup'):
   453                 continue # we don't care for old editor backups
   454             self.complete[('PAGE', pn)] = None
   455             if "_" in pn:
   456                 # log all pagenames with underscores
   457                 self.renames[('PAGE', pn)] = None
   458             if p.attachments is not None:
   459                 for fn in p.attachments:
   460                     try:
   461                         fn_str = fn.encode('ascii')
   462                         log = False # pure ascii filenames are no problem
   463                     except UnicodeEncodeError:
   464                         log = True # this file maybe has a strange representation in wiki markup
   465                     else:
   466                         if ' ' in fn_str or '%' in fn_str: # files with blanks need quoting
   467                             log = True
   468                     self.complete[('FILE', pn, fn)] = None
   469                     if log:
   470                         # log all strange attachment filenames
   471                         fn_str = fn.encode('utf-8')
   472                         self.renames[('FILE', pn, fn)] = None
   473         self.save_list(self.complete_fname, self.complete)
   474         self.save_list(self.rename_fname1, self.renames)
   475 
   476     LIST_FIELDSEP = u'|' # in case | makes trouble, one can use \t tab char
   477 
   478     def save_list(self, fname, what):
   479         what_sorted = what.keys()
   480         # make sure we have 3-tuples:
   481         what_sorted = [(k + (None, ))[:3] for k in what_sorted]
   482         # we only have python 2.3, thus no cmp keyword for the sort() call,
   483         # thus we need to do it the more complicated way:
   484         what_sorted = [(pn, fn, rtype) for rtype, pn, fn in what_sorted] # shuffle
   485         what_sorted.sort() # sort
   486         what_sorted = [(rtype, pn, fn) for pn, fn, rtype in what_sorted] # shuffle
   487         f = codecs.open(fname, 'w', 'utf-8')
   488         for rtype, pn, fn in what_sorted:
   489             if rtype == 'PAGE':
   490                 line = (rtype, pn, pn)
   491             elif rtype == 'FILE':
   492                 line = (rtype, pn, fn, fn)
   493             line = self.LIST_FIELDSEP.join(line)
   494             f.write(line + u'\n')
   495         f.close()
   496 
   497     def load_list(self, fname, what):
   498         f = codecs.open(fname, 'r', 'utf-8')
   499         for line in f:
   500             line = line.rstrip()
   501             if not line:
   502                 continue
   503             t = line.split(self.LIST_FIELDSEP)
   504             rtype, p1, p2, p3 = (t + [None]*3)[:4]
   505             if rtype == u'PAGE':
   506                 what[(str(rtype), p1)] = p2
   507             elif rtype == u'FILE':
   508                 what[(str(rtype), p1, p2)] = p3
   509         f.close()
   510 
   511     def pass2(self):
   512         """ Second, read the (user edited) rename list and do the renamings everywhere. """
   513         self.read_src()
   514         #self.load_list(self.complete_fname, self.complete)
   515         self.load_list(self.rename_fname2, self.renames)
   516         self.write_dest()
   517 
   518     def read_src(self):
   519         # create Page objects in memory
   520         pages_dir = opj(self.sdata, 'pages')
   521         pagelist = listdir(pages_dir)
   522         for qpagename in pagelist:
   523             p = Page(self.request, pages_dir, qpagename)
   524             self.pages[p.name] = p
   525 
   526         # create User objects in memory
   527         users_dir = opj(self.sdata, 'user')
   528         user_re = re.compile(r'^\d+\.\d+(\.\d+)?$')
   529         userlist = listdir(users_dir)
   530         userlist = [f for f in userlist if user_re.match(f)]
   531         for userid in userlist:
   532             u = User(self.request, users_dir, userid)
   533             self.users[u.uid] = u
   534 
   535         # create log objects in memory
   536         self.editlog = EditLog(self.request, opj(self.sdata, 'edit-log'))
   537         self.eventlog = EventLog(self.request, opj(self.sdata, 'event-log'))
   538 
   539     def write_dest(self):
   540         self.init_dest()
   541         # copy pages
   542         pages_dir = opj(self.ddata, 'pages')
   543         for pn, page in self.pages.items():
   544             if pn.endswith('/MoinEditorBackup'):
   545                 continue # we don't care for old editor backups
   546             page.copy(pages_dir, self.renames)
   547 
   548         # copy users
   549         users_dir = opj(self.ddata, 'user')
   550         for user in self.users.values():
   551             user.copy(users_dir, self.renames)
   552 
   553         # copy logs
   554         self.editlog.copy(opj(self.ddata, 'edit-log'), self.renames)
   555         self.eventlog.copy(opj(self.ddata, 'event-log'), self.renames)
   556 
   557     def init_dest(self):
   558         try:
   559             os.makedirs(self.ddata)
   560         except:
   561             pass
   562         os.makedirs(opj(self.ddata, 'pages'))
   563         os.makedirs(opj(self.ddata, 'user'))
   564         copy_dir(opj(self.sdata, 'plugin'), opj(self.ddata, 'plugin'))
   565         copy_file(opj(self.sdata, 'intermap.txt'), opj(self.ddata, 'intermap.txt'))
   566 
   567