changeset 960:afb156d4caa5
Merge with main.
author:   Alexander Schremmer <alex AT alexanderweb DOT de>
date:     Fri, 30 Jun 2006 21:35:59 +0200
parents:  5d308092d40e (current diff), d825de2173d5 (diff)
children: 21eb4cb11e2c
diffstat: 38 files changed, 2570 insertions(+), 151 deletions(-)
--- a/MoinMoin/Page.py Fri Jun 30 21:34:45 2006 +0200
+++ b/MoinMoin/Page.py Fri Jun 30 21:35:59 2006 +0200
@@ -1263,7 +1263,7 @@
                            " Check your underlay directory setting.")
             url = '%s?action=edit' % wikiutil.quoteWikinameURL(self.page_name)
             request.write(wikiutil.link_tag(self.request, url, text=_("Create New Page"),
-                                            formatter=self.formatter))
+                                            formatter=self.formatter, rel='nofollow'))
         elif not request.user.may.read(self.page_name):
             request.write("<strong>%s</strong><br>" % _("You are not allowed to view this page."))
         else:
--- a/MoinMoin/PageEditor.py Fri Jun 30 21:34:45 2006 +0200
+++ b/MoinMoin/PageEditor.py Fri Jun 30 21:35:59 2006 +0200
@@ -363,7 +363,7 @@
 
         self.request.write("<p>")
         self.request.write(_("Comment:"),
-            ' <input id="editor-comment" type="text" name="comment" value="%s" maxlength="80"'
+            ' <input id="editor-comment" type="text" name="comment" value="%s" maxlength="200"'
            ' onChange="flgChange = true;" onKeyPress="flgChange = true;">' % (
            wikiutil.escape(kw.get('comment', ''), 1), ))
         self.request.write("</p>")
--- a/MoinMoin/PageGraphicalEditor.py Fri Jun 30 21:34:45 2006 +0200
+++ b/MoinMoin/PageGraphicalEditor.py Fri Jun 30 21:35:59 2006 +0200
@@ -295,7 +295,7 @@
 """)
         self.request.write("<p>")
         self.request.write(_("Comment:"),
-            ' <input id="editor-comment" type="text" name="comment" value="%s" maxlength="80">' % (
+            ' <input id="editor-comment" type="text" name="comment" value="%s" maxlength="200">' % (
            wikiutil.escape(kw.get('comment', ''), 1), ))
         self.request.write("</p>")
--- a/MoinMoin/Xapian.py Fri Jun 30 21:34:45 2006 +0200
+++ b/MoinMoin/Xapian.py Fri Jun 30 21:35:59 2006 +0200
@@ -12,6 +12,7 @@
 from pprint import pprint
 
 import xapian
+from xapian import Query
 from MoinMoin.support.xapwrap import document as xapdoc
 from MoinMoin.support.xapwrap import index as xapidx
 from MoinMoin.parser.text_moin_wiki import Parser as WikiParser
@@ -20,16 +21,24 @@
 from MoinMoin import config, wikiutil
 from MoinMoin.util import filesys, lock
 
+try:
+    # PyStemmer, snowball python bindings from http://snowball.tartarus.org/
+    from Stemmer import Stemmer
+    use_stemming = True
+except ImportError:
+    use_stemming = False
 
 class UnicodeQuery(xapian.Query):
     def __init__(self, *args, **kwargs):
         self.encoding = kwargs.get('encoding', config.charset)
 
         nargs = []
-        for i in args:
-            if isinstance(i, unicode):
-                i = i.encode(self.encoding)
-            nargs.append(i)
+        for term in args:
+            if isinstance(term, unicode):
+                term = term.encode(self.encoding)
+            elif isinstance(term, list) or isinstance(term, tuple):
+                term = [t.encode(self.encoding) for t in term]
+            nargs.append(term)
 
         xapian.Query.__init__(self, *nargs, **kwargs)
 
@@ -38,6 +47,9 @@
 ### Tokenizer
 ##############################################################################
 
+def getWikiAnalyzerFactory(language='en'):
+    return (lambda: WikiAnalyzer(language))
+
 class WikiAnalyzer:
     singleword = r"[%(u)s][%(l)s]+" % {
                      'u': config.chars_upper,
@@ -62,10 +74,13 @@
     # XXX limit stuff above to xapdoc.MAX_KEY_LEN
     # WORD_RE = re.compile('\\w{1,%i}' % MAX_KEY_LEN, re.U)
 
-    def tokenize(self, value):
-        """Yield a stream of lower cased words from a string.
-           value must be an UNICODE object or a list of unicode objects
-        """
+    def __init__(self, language=None):
+        if use_stemming and language:
+            self.stemmer = Stemmer(language)
+        else:
+            self.stemmer = None
+
+    def raw_tokenize(self, value):
         def enc(uc):
             """ 'encode' unicode results into whatever xapian / xapwrap wants """
             lower = uc.lower()
@@ -93,12 +108,24 @@
                 yield enc(word)
             elif m.group("word"):
                 word = m.group("word")
-                yield enc(word)
+                yield enc(word)
                 # if it is a CamelCaseWord, we additionally yield Camel, Case and Word
                 if self.wikiword_re.match(word):
                     for sm in re.finditer(self.singleword_re, word):
                         yield enc(sm.group())
 
+    def tokenize(self, value, flat_stemming=True):
+        """Yield a stream of lower cased raw and stemmed (optional) words from a string.
+           value must be an UNICODE object or a list of unicode objects
+        """
+        for i in self.raw_tokenize(value):
+            if flat_stemming:
+                yield i # XXX: should we really use a prefix for that? Index.prefixMap['raw'] + i
+                if self.stemmer:
+                    yield self.stemmer.stemWord(i)
+            else:
+                yield (i, self.stemmer.stemWord(i))
+
 
 #############################################################################
 ### Indexing
@@ -240,7 +267,7 @@
                        #N   ISO couNtry code (or domaiN name)
                        #P   Pathname
                        #Q   uniQue id
-                       #R   Raw (i.e. unstemmed) term
+        'raw': 'R',    # Raw (i.e. unstemmed) term
         'title': 'S',  # Subject (or title)
         'mimetype': 'T',
         'url': 'U',    # full URL of indexed document - if the resulting term would be > 240
@@ -250,33 +277,39 @@
                        # the D term, and changing the last digit to a '2' if it's a '3')
                        #X   longer prefix for user-defined use
         'linkto': 'XLINKTO', # this document links to that document
+        'stem_lang': 'XSTEMLANG', # ISO Language code this document was stemmed in
                        #Y   year (four digits)
     }
 
-
-
     class LockedException(Exception):
         pass
 
     def __init__(self, request):
         self.request = request
         cache_dir = request.cfg.cache_dir
-        self.main_dir = os.path.join(cache_dir, 'xapian')
-        self.dir = os.path.join(self.main_dir, 'index')
+        main_dir = self._main_dir()
+        self.dir = os.path.join(main_dir, 'index')
         filesys.makeDirs(self.dir)
-        self.sig_file = os.path.join(self.main_dir, 'complete')
-        lock_dir = os.path.join(self.main_dir, 'index-lock')
+        self.sig_file = os.path.join(main_dir, 'complete')
+        lock_dir = os.path.join(main_dir, 'index-lock')
         self.lock = lock.WriteLock(lock_dir, timeout=3600.0, readlocktimeout=60.0)
         self.read_lock = lock.ReadLock(lock_dir, timeout=3600.0)
-        self.queue = UpdateQueue(os.path.join(self.main_dir, "update-queue"),
-                                 os.path.join(self.main_dir, 'update-queue-lock'))
-
+        self.queue = UpdateQueue(os.path.join(main_dir, 'update-queue'),
+                                 os.path.join(main_dir, 'update-queue-lock'))
+
         # Disabled until we have a sane way to build the index with a
         # queue in small steps.
         ## if not self.exists():
         ##     self.indexPagesInNewThread(request)
 
+    def _main_dir(self):
+        if self.request.cfg.xapian_index_dir:
+            return os.path.join(self.request.cfg.xapian_index_dir,
+                                self.request.cfg.siteid)
+        else:
+            return os.path.join(self.request.cfg.cache_dir, 'xapian')
+
     def exists(self):
         """ Check if index exists """
         return os.path.exists(self.sig_file)
@@ -358,7 +391,7 @@
                     indexThread.join()
                 return func
 
-            self.request.finish = joinDecorator(self.request.finish)        
+            self.request.finish = joinDecorator(self.request.finish)
             indexThread.start()
         except:
             self.lock.release()
@@ -391,7 +424,7 @@
                     indexThread.join()
                 return func
 
-            self.request.finish = joinDecorator(self.request.finish)        
+            self.request.finish = joinDecorator(self.request.finish)
             indexThread.start()
         except:
             self.lock.release()
@@ -422,8 +455,8 @@
                 break
             except wikiutil.PluginMissingError:
                 pass
-        #else:
-        #    raise "Cannot load filter for mimetype." + modulename # XXX
+        else:
+            request.log("Cannot load filter for mimetype." + modulename)
         try:
             data = execute(self, filename)
             if debug:
@@ -480,7 +513,7 @@
                               keywords=(xtitle, xitemid, ),
                               sortFields=(xpname, xattachment, xmtime, xwname, ),
                              )
-        doc.analyzerFactory = WikiAnalyzer
+        doc.analyzerFactory = getWikiAnalyzerFactory()
         if mode == 'update':
             if debug: request.log("%s (replace %r)" % (filename, uid))
             doc.uid = uid
@@ -491,6 +524,34 @@
         except (OSError, IOError), err:
             pass
 
+    def _get_languages(self, page):
+        body = page.get_raw_body()
+        default_lang = page.request.cfg.language_default
+
+        lang = ''
+
+        if use_stemming:
+            for line in body.split('\n'):
+                if line.startswith('#language'):
+                    lang = line.split(' ')[1]
+                    try:
+                        Stemmer(lang)
+                    except KeyError:
+                        # lang is not stemmable
+                        break
+                    else:
+                        # lang is stemmable
+                        return (lang, lang)
+                elif not line.startswith('#'):
+                    break
+
+        if not lang:
+            # no lang found at all.. fallback to default language
+            lang = default_lang
+
+        # return actual lang and lang to stem in
+        return (lang, default_lang)
+
     def _index_page(self, writer, page, mode='update'):
         """ Index a page - assumes that the write lock is acquired
             @arg writer: the index writer object
@@ -504,6 +565,8 @@
         pagename = page.page_name
         mtime = page.mtime_usecs()
         itemid = "%s:%s" % (wikiname, pagename)
+        # XXX: Hack until we get proper metadata
+        language, stem_language = self._get_languages(page)
 
         updated = False
         if mode == 'update':
@@ -530,7 +593,9 @@
             xattachment = xapdoc.SortKey('attachment', '') # this is a real page, not an attachment
             xmtime = xapdoc.SortKey('mtime', mtime)
             xtitle = xapdoc.TextField('title', pagename, True) # prefixed
-            xkeywords = [xapdoc.Keyword('itemid', itemid)]
+            xkeywords = [xapdoc.Keyword('itemid', itemid),
+                    xapdoc.Keyword('lang', language),
+                    xapdoc.Keyword('stem_lang', stem_language)]
             for pagelink in page.getPageLinks(request):
                 xkeywords.append(xapdoc.Keyword('linkto', pagelink))
             xcontent = xapdoc.TextField('content', page.get_raw_body())
@@ -538,17 +603,8 @@
                                   keywords=xkeywords,
                                   sortFields=(xpname, xattachment, xmtime, xwname, ),
                                  )
-            doc.analyzerFactory = WikiAnalyzer
-            #search_db_language = "english"
-            #stemmer = xapian.Stem(search_db_language)
-            #pagetext = page.get_raw_body().lower()
-            #words = re.finditer(r"\w+", pagetext)
-            #count = 0
-            #for wordmatch in words:
-            #    count += 1
-            #    word = wordmatch.group().encode(config.charset)
-            #    document.add_posting('R' + stemmer.stem_word(word), count) # count should be term position in document (starting at 1)
-
+            doc.analyzerFactory = getWikiAnalyzerFactory()
+
             if mode == 'update':
                 if debug: request.log("%s (replace %r)" % (pagename, uid))
                 doc.uid = uid
@@ -586,14 +642,15 @@
                 xattachment = xapdoc.SortKey('attachment', att) # this is an attachment, store its filename
                 xmtime = xapdoc.SortKey('mtime', mtime)
                 xtitle = xapdoc.Keyword('title', '%s/%s' % (pagename, att))
+                xlanguage = xapdoc.Keyword('lang', language)
                 mimetype, att_content = self.contentfilter(filename)
                 xmimetype = xapdoc.TextField('mimetype', mimetype, True)
                 xcontent = xapdoc.TextField('content', att_content)
                 doc = xapdoc.Document(textFields=(xcontent, xmimetype, ),
-                                      keywords=(xatt_itemid, xtitle, ),
+                                      keywords=(xatt_itemid, xtitle, xlanguage, ),
                                       sortFields=(xpname, xattachment, xmtime, xwname, ),
                                      )
-                doc.analyzerFactory = WikiAnalyzer
+                doc.analyzerFactory = getWikiAnalyzerFactory()
                 if mode == 'update':
                     if debug: request.log("%s (replace %r)" % (pagename, uid))
                     doc.uid = uid
@@ -631,7 +688,7 @@
                     fname = fname.strip()
                     self._index_file(request, writer, fname, mode)
             writer.close()
-            request.log("indexing completed successfully in %0.2f seconds." % 
+            request.log("indexing completed successfully in %0.2f seconds." %
                         (time.time() - start))
             self._sign()
         finally:
--- a/MoinMoin/__init__.py Fri Jun 30 21:34:45 2006 +0200
+++ b/MoinMoin/__init__.py Fri Jun 30 21:35:59 2006 +0200
@@ -1,6 +1,6 @@
 # -*- coding: iso-8859-1 -*-
 """
-MoinMoin Version 1.6.0alpha 43b158d3cf22+ tip
+MoinMoin Version 1.6.0alpha bf18e19e618d+ tip
 
 @copyright: 2000-2006 by Jürgen Hermann <jh@web.de>
 @license: GNU GPL, see COPYING for details.
--- a/MoinMoin/action/AttachFile.py Fri Jun 30 21:34:45 2006 +0200
+++ b/MoinMoin/action/AttachFile.py Fri Jun 30 21:35:59 2006 +0200
@@ -124,7 +124,8 @@
     attach_icon = request.theme.make_icon('attach', vars={ 'attach_count': attach_count })
     attach_link = wikiutil.link_tag(request,
         "%s?action=AttachFile" % wikiutil.quoteWikinameURL(pagename),
-        attach_icon)
+        attach_icon,
+        request.formatter, rel='nofollow')
 
     return attach_link
--- a/MoinMoin/action/DeletePage.py Fri Jun 30 21:34:45 2006 +0200
+++ b/MoinMoin/action/DeletePage.py Fri Jun 30 21:35:59 2006 +0200
@@ -61,7 +61,7 @@
 <tr>
 <td class="label"><label>%(comment_label)s</label></td>
 <td class="content">
-    <input type="text" name="comment" maxlength="80">
+    <input type="text" name="comment" maxlength="200">
 </td>
 </tr>
 <tr>
--- a/MoinMoin/action/LikePages.py Fri Jun 30 21:34:45 2006 +0200
+++ b/MoinMoin/action/LikePages.py Fri Jun 30 21:35:59 2006 +0200
@@ -12,6 +12,8 @@
 """
 
 import re
+
+from MoinMoin.support import difflib
 from MoinMoin import config, wikiutil
 from MoinMoin.Page import Page
 
@@ -81,7 +83,7 @@
     start, end, matches = wikiMatches(pagename, pages, start_re=s_re,
                                       end_re=e_re)
 
-    # Get the best 10 close matches using difflib
+    # Get the best 10 close matches
     close_matches = {}
     found = 0
     for name in closeMatches(pagename, pages):
@@ -167,7 +169,7 @@
 
 
 def closeMatches(pagename, pages):
-    """ Get close matches using difflib
+    """ Get close matches.
 
     Return all matching pages with rank above cutoff value.
 
@@ -176,8 +178,6 @@
     @rtype: list
     @return: list of matching pages, sorted by rank
     """
-    import difflib
-
     # Match using case insensitive matching
     # Make mapping from lowerpages to pages - pages might have same name
     # with different case (although its stupid).
--- a/MoinMoin/action/PackagePages.py Fri Jun 30 21:34:45 2006 +0200
+++ b/MoinMoin/action/PackagePages.py Fri Jun 30 21:35:59 2006 +0200
@@ -152,7 +152,7 @@
 <tr>
 <td class="label"><label>%(list_label)s</label></td>
 <td class="content">
-    <input type="text" name="pagelist" maxlength="80">
+    <input type="text" name="pagelist" maxlength="200">
 </td>
 </tr>
 <tr>
--- a/MoinMoin/action/RenamePage.py Fri Jun 30 21:34:45 2006 +0200
+++ b/MoinMoin/action/RenamePage.py Fri Jun 30 21:35:59 2006 +0200
@@ -79,7 +79,7 @@
 <tr>
 <td class="label"><label>%(comment_label)s</label></td>
 <td class="content">
-    <input type="text" name="comment" maxlength="80">
+    <input type="text" name="comment" maxlength="200">
 </td>
 </tr>
 <tr>
--- a/MoinMoin/action/__init__.py Fri Jun 30 21:34:45 2006 +0200
+++ b/MoinMoin/action/__init__.py Fri Jun 30 21:35:59 2006 +0200
@@ -594,7 +594,7 @@
         if rev2:
             qstr = '%s&rev2=%s' % (qstr, rev2)
         request.write(Page(request, pagename).link_to(request,
             text=_('Ignore changes in the amount of whitespace'),
-            querystr=qstr) + '<p>')
+            querystr=qstr, rel='nofollow') + '<p>')
 
         request.write('<pre>')
         for line in lines:
@@ -695,24 +695,24 @@
                                               querystr=''))
                 actions = '%s %s' % (actions, page.link_to(request,
                     text=_('raw'),
-                    querystr='action=raw'))
+                    querystr='action=raw', rel='nofollow'))
                 actions = '%s %s' % (actions, page.link_to(request,
                     text=_('print'),
-                    querystr='action=print'))
+                    querystr='action=print', rel='nofollow'))
             else:
                 actions = '%s %s' % (actions, page.link_to(request,
                     text=_('view'),
-                    querystr='action=recall&rev=%d' % rev))
+                    querystr='action=recall&rev=%d' % rev, rel='nofollow'))
                 actions = '%s %s' % (actions, page.link_to(request,
                     text=_('raw'),
-                    querystr='action=raw&rev=%d' % rev))
+                    querystr='action=raw&rev=%d' % rev, rel='nofollow'))
                 actions = '%s %s' % (actions, page.link_to(request,
                     text=_('print'),
-                    querystr='action=print&rev=%d' % rev))
+                    querystr='action=print&rev=%d' % rev, rel='nofollow'))
                 if may_revert and size: # you can only revert to nonempty revisions
                     actions = '%s %s' % (actions, page.link_to(request,
                         text=_('revert'),
-                        querystr='action=revert&rev=%d' % (rev,)))
+                        querystr='action=revert&rev=%d' % rev, rel='nofollow'))
             if count == 0:
                 rchecked=' checked="checked"'
                 lchecked = ''
@@ -744,18 +744,18 @@
             if line.action == 'ATTNEW':
                 actions = '%s %s' % (actions, page.link_to(request,
                     text=_('view'),
-                    querystr='action=AttachFile&do=view&target=%s' % filename))
+                    querystr='action=AttachFile&do=view&target=%s' % filename, rel='nofollow'))
             elif line.action == 'ATTDRW':
                 actions = '%s %s' % (actions, page.link_to(request,
                     text=_('edit'),
-                    querystr='action=AttachFile&drawing=%s' % filename.replace(".draw","")))
+                    querystr='action=AttachFile&drawing=%s' % filename.replace(".draw",""), rel='nofollow'))
             actions = '%s %s' % (actions, page.link_to(request,
                 text=_('get'),
-                querystr='action=AttachFile&do=get&target=%s' % filename))
+                querystr='action=AttachFile&do=get&target=%s' % filename, rel='nofollow'))
             actions = '%s %s' % (actions, page.link_to(request,
                 text=_('del'),
-                querystr='action=AttachFile&do=del&target=%s' % filename))
+                querystr='action=AttachFile&do=del&target=%s' % filename, rel='nofollow'))
             # XXX use?: wikiutil.escape(filename)
 
         history.addRow((
@@ -810,11 +810,11 @@
     request.theme.send_title(_('Info for "%s"') % (title,), pagename=pagename)
 
     historylink = wikiutil.link_tag(request, '%s?action=info' % qpagename,
-        _('Show "%(title)s"') % {'title': _('Revision History')})
+        _('Show "%(title)s"') % {'title': _('Revision History')}, request.formatter, rel='nofollow')
     generallink = wikiutil.link_tag(request, '%s?action=info&general=1' % qpagename,
-        _('Show "%(title)s"') % {'title': _('General Page Infos')})
+        _('Show "%(title)s"') % {'title': _('General Page Infos')}, request.formatter, rel='nofollow')
     hitcountlink = wikiutil.link_tag(request, '%s?action=info&hitcounts=1' % qpagename,
-        _('Show chart "%(title)s"') % {'title': _('Page hits and edits')})
+        _('Show chart "%(title)s"') % {'title': _('Page hits and edits')}, request.formatter, rel='nofollow')
 
     request.write('<div id="content">\n') # start content div
     request.write("<p>[%s] [%s] [%s]</p>" % (historylink, generallink, hitcountlink))
--- a/MoinMoin/action/login.py Fri Jun 30 21:34:45 2006 +0200
+++ b/MoinMoin/action/login.py Fri Jun 30 21:35:59 2006 +0200
@@ -51,11 +51,10 @@
             else:
                 password = form.get('password',[None])[0]
                 if not password:
-                    error = _("Missing password. Please enter user name and"
-                              " password.")
+                    error = _("Missing password. Please enter user name and password.")
                 else:
                     if not request.user.valid:
-                        error = _("Sorry, wrong password.")
+                        error = _("Sorry, login failed.")
 
         return self.page.send_page(request, msg=error)
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/MoinMoin/action/sitemap.py Fri Jun 30 21:35:59 2006 +0200
@@ -0,0 +1,98 @@
+# -*- coding: iso-8859-1 -*-
+"""
+    MoinMoin - "sitemap" action
+
+    Generate a URL list of all your pages (using google's sitemap XML format).
+
+    @copyright: 2006 by Thomas Waldmann, MoinMoin:ThomasWaldmann
+    @license: GNU GPL, see COPYING for details.
+"""
+import time
+from MoinMoin import wikiutil
+
+datetime_fmt = "%Y-%m-%dT%H:%M:%S+00:00"
+
+def now():
+    return time.strftime(datetime_fmt, time.gmtime())
+
+def make_url_xml(vars):
+    """ assemble a single <url> xml fragment """
+    return """\
+<url>
+  <loc>%(base)s%(url)s</loc>
+  <lastmod>%(lastmod)s</lastmod>
+  <changefreq>%(changefreq)s</changefreq>
+  <priority>%(priority)s</priority>
+</url>
+""" % vars
+
+def sitemap_url(request, base, page):
+    """ return a sitemap <url>..</url> fragment for page object <page> """
+    url = page.url(request)
+    pagename = page.page_name
+    lastmod = page.mtime_printable(request)
+    if lastmod == "0": # can happen in case of errors
+        lastmod = now()
+
+    # page's changefreq, priority and lastmod depends on page type / name
+    if pagename in [u"RecentChanges", u"TitleIndex", ]:
+        # important dynamic pages with macros
+        changefreq = "hourly"
+        priority = "0.9"
+        lastmod = now() # the page text mtime never changes, but the macro output DOES
+
+    elif pagename in [request.cfg.page_front_page, ]:
+        # important user edited pages
+        changefreq = "hourly"
+        priority = "1.0"
+
+    elif wikiutil.isSystemPage(request, pagename):
+        # other system pages are rather boring
+        changefreq = "yearly"
+        priority = "0.1"
+
+    else:
+        # these are the content pages:
+        changefreq = "daily"
+        priority = "0.5"
+
+    return make_url_xml(locals())
+
+def execute(pagename, request):
+    _ = request.getText
+    form = request.form
+    request.user.datetime_fmt = datetime_fmt
+    base = request.getBaseURL()
+
+    request.http_headers(["Content-Type: text/xml; charset=UTF-8"])
+
+    # we emit a piece of data so other side doesn't get bored:
+    request.write("""<?xml version="1.0" encoding="UTF-8"?>\r\n""")
+
+    result = []
+    result.append("""<urlset xmlns="http://www.google.com/schemas/sitemap/0.84">\n""")
+
+    # we include the / url as an important and often changed URL
+    result.append(make_url_xml({
+        'base': base,
+        'url': '/',
+        'lastmod': now(), # fake
+        'changefreq': 'hourly',
+        'priority': '1.0',
+    }))
+
+    # Get page dict readable by current user
+    pages = request.rootpage.getPageDict()
+    pagelist = pages.keys()
+    pagelist.sort()
+    for name in pagelist:
+        result.append(sitemap_url(request, base, pages[name]))
+
+    result.append("""</urlset>\n""")
+
+    result = "".join(result)
+    result = result.replace("\n", "\r\n") # text/* requires CR/LF
+
+    # emit all real data
+    request.write(result)
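For illustration, this is the <url> fragment that make_url_xml() above assembles from its vars dict (the values here are invented for the example):

    # Hypothetical values fed through the template used by make_url_xml() above:
    template = ("<url>\n  <loc>%(base)s%(url)s</loc>\n  <lastmod>%(lastmod)s</lastmod>\n"
                "  <changefreq>%(changefreq)s</changefreq>\n  <priority>%(priority)s</priority>\n</url>\n")
    vars = {'base': 'http://example.org/wiki', 'url': '/FrontPage',
            'lastmod': '2006-06-30T21:35:59+00:00',
            'changefreq': 'hourly', 'priority': '1.0'}
    print template % vars
    # <url>
    #   <loc>http://example.org/wiki/FrontPage</loc>
    #   <lastmod>2006-06-30T21:35:59+00:00</lastmod>
    #   <changefreq>hourly</changefreq>
    #   <priority>1.0</priority>
    # </url>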
--- a/MoinMoin/auth/ldap_login.py Fri Jun 30 21:34:45 2006 +0200
+++ b/MoinMoin/auth/ldap_login.py Fri Jun 30 21:35:59 2006 +0200
@@ -47,7 +47,13 @@
         l.simple_bind_s(ldap_binddn.encode(coding), ldap_bindpw.encode(coding))
         if verbose: request.log("LDAP: Bound with binddn %s" % ldap_binddn)
 
-        filterstr = "(%s=%s)" % (cfg.ldap_name_attribute, username)
+        # normal usage: ldap_filter = "(%(ldap_name_attribute)s=%(username)s)"
+        # you can also do more complex filtering like:
+        # "(&(%(ldap_name_attribute)s=%(username)s)(memberOf=CN=WikiUsers,OU=Groups,DC=example,DC=org))"
+        filterstr = cfg.ldap_filter % {
+            'ldap_name_attribute': cfg.ldap_name_attribute,
+            'username': username,
+        }
         if verbose: request.log("LDAP: Searching %s" % filterstr)
         lusers = l.search_st(cfg.ldap_base, cfg.ldap_scope, filterstr.encode(coding),
                              timeout=cfg.ldap_timeout)
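The hunk above replaces the hardcoded "(name_attribute=username)" filter with a configurable cfg.ldap_filter template. A hedged sketch of what the corresponding wiki config could look like (the attribute values are examples, not defaults):

    # Hypothetical wikiconfig.py excerpt; values are examples only.
    from MoinMoin.multiconfig import DefaultConfig

    class Config(DefaultConfig):
        ldap_name_attribute = 'sAMAccountName'
        # equivalent to the old hardcoded behaviour:
        ldap_filter = '(%(ldap_name_attribute)s=%(username)s)'
        # or, as the comment in the hunk suggests, restrict logins to a group:
        #ldap_filter = ('(&(%(ldap_name_attribute)s=%(username)s)'
        #               '(memberOf=CN=WikiUsers,OU=Groups,DC=example,DC=org))')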
--- a/MoinMoin/converter/text_html_text_moin_wiki.py Fri Jun 30 21:34:45 2006 +0200
+++ b/MoinMoin/converter/text_html_text_moin_wiki.py Fri Jun 30 21:35:59 2006 +0200
@@ -465,7 +465,7 @@
 class convert_tree(visitor):
     white_space = object()
     new_line = object()
-    
+
     def __init__(self, request, pagename):
         self.request = request
         self.pagename = pagename
@@ -616,7 +616,7 @@
                     self.text.append(text.replace("\n", " "))
                 elif name == 'dd':
                     self.text.append(markup)
-                    self.process_list_item(i, indent)
+                    self.process_list_item(i, indent) # XXX no dt -> indent is undefined!!!
                 else:
                     raise ConvertError("Illegal list element %s" % i.localName)
             self.depth -= 1
@@ -802,7 +802,7 @@
     def process_div(self, node):
         # ignore div tags - just descend
         for i in node.childNodes:
-            self.visit_element(i)
+            self.visit(i)
 
     def process_tt(self, node):
         text = self.node_list_text_only(node.childNodes).replace("\n", " ")
@@ -972,7 +972,7 @@
             result.append('style="%s"' % node.getAttribute("style"))
 
         if align:
-            result[0:0] = "%s" % align
+            result.insert(0, "%s" % align)
         result.append(rowspan)
 
         return " ".join(result).strip()
--- a/MoinMoin/filter/application_msword.py Fri Jun 30 21:34:45 2006 +0200
+++ b/MoinMoin/filter/application_msword.py Fri Jun 30 21:35:59 2006 +0200
@@ -11,5 +11,5 @@
 from MoinMoin import filter
 
 def execute(indexobj, filename):
-    return filter.execfilter("HOME=/tmp antiword %s", filename) # no HOME makes antiword complain
+    return filter.execfilter("HOME=/tmp antiword '%s'", filename) # no HOME makes antiword complain
--- a/MoinMoin/filter/application_octet_stream.py Fri Jun 30 21:34:45 2006 +0200
+++ b/MoinMoin/filter/application_octet_stream.py Fri Jun 30 21:35:59 2006 +0200
@@ -23,7 +23,7 @@
 blacklist = ('.iso', # CD/DVD images, TODO: add nero/... stuff
              '.zip', '.rar', '.lzh', '.lha',
-             '.tar', '.gz', '.tgz', '.bz2', '.tb2',
+             '.tar', '.gz', '.tgz', '.bz2', '.tb2', '.z',
              '.exe', '.com', '.dll', '.cab', '.msi', '.bin', # windows
              '.rpm', '.deb', # linux
              '.hqx', '.dmg', '.sit', # mac
--- a/MoinMoin/filter/application_pdf.py Fri Jun 30 21:34:45 2006 +0200
+++ b/MoinMoin/filter/application_pdf.py Fri Jun 30 21:35:59 2006 +0200
@@ -11,5 +11,5 @@
 from MoinMoin import filter
 
 def execute(indexobj, filename):
-    return filter.execfilter("pdftotext -enc UTF-8 %s -", filename)
+    return filter.execfilter("pdftotext -enc UTF-8 '%s' -", filename)
--- a/MoinMoin/filter/application_vnd_ms_excel.py Fri Jun 30 21:34:45 2006 +0200
+++ b/MoinMoin/filter/application_vnd_ms_excel.py Fri Jun 30 21:35:59 2006 +0200
@@ -11,7 +11,7 @@
 from MoinMoin import filter
 
 def execute(indexobj, filename):
-    data = filter.execfilter("xls2csv %s", filename)
+    data = filter.execfilter("xls2csv '%s'", filename)
     # xls2csv uses comma as field separator and "field content",
     # we strip both to not confuse the indexer
     data = data.replace(u',', u' ').replace(u'"', u' ')
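The three filter fixes above quote the filename before it is interpolated into the shell command line, so names containing spaces no longer break the command. A standalone sketch of the pattern (this is not MoinMoin's actual filter.execfilter implementation):

    # Standalone sketch; MoinMoin's real helper is filter.execfilter().
    import os

    def run_filter(cmdline, filename):
        # cmdline carries one %s placeholder, e.g. "pdftotext -enc UTF-8 '%s' -"
        pipe = os.popen(cmdline % filename, 'r')
        try:
            return pipe.read()
        finally:
            pipe.close()

    # 'some file.pdf' now survives word splitting; note that single quotes
    # still do not protect against a quote character inside the filename.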
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/MoinMoin/i18n/mo/__init__.py Fri Jun 30 21:35:59 2006 +0200
@@ -0,0 +1,7 @@
+# -*- coding: iso-8859-1 -*-
+"""
+    MoinMoin - *.mo binary i18n files
+
+    @copyright: 2006 MoinMoin:ThomasWaldmann
+    @license: GNU GPL, see COPYING for details.
+"""
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/MoinMoin/i18n/tools/__init__.py Fri Jun 30 21:35:59 2006 +0200
@@ -0,0 +1,7 @@
+# -*- coding: iso-8859-1 -*-
+"""
+    MoinMoin - misc. i18n related scripts
+
+    @copyright: 2006 MoinMoin:ThomasWaldmann
+    @license: GNU GPL, see COPYING for details.
+"""
--- a/MoinMoin/macro/ImageLink.py Fri Jun 30 21:34:45 2006 +0200
+++ b/MoinMoin/macro/ImageLink.py Fri Jun 30 21:35:59 2006 +0200
@@ -79,6 +79,9 @@
     Thomas Waldmann
         2006-03-10 code refactored
 
+    Reimar Bauer
+        2006-05-01 bug fix of image linked to attachment
+
     @copyright: 2001 by Jeff Kunce,
                 2004 by Marcin Zalewski,
@@ -124,6 +127,9 @@
     image = args[0]
     if argc >= 2 and args[1]:
         target = args[1]
+    elif argc == 1:
+        pagename, attname = AttachFile.absoluteName(image, formatter.page.page_name)
+        target = AttachFile.getAttachUrl(pagename, image, request)
     else:
         target = None
@@ -153,6 +159,11 @@
 
     if target is None:
         target = kw['src']
+
+    if argc == 1:
+        return "%s%s%s" % (formatter.url(1, kw['src']),
+                           formatter.image(**kw),
+                           formatter.url(0))
 
     if _is_URL(target):
         return "%s%s%s" % (formatter.url(1, target),
--- a/MoinMoin/macro/RecentChanges.py Fri Jun 30 21:34:45 2006 +0200
+++ b/MoinMoin/macro/RecentChanges.py Fri Jun 30 21:35:59 2006 +0200
@@ -65,25 +65,25 @@
         #img = request.theme.make_icon('help')
         html_link = wikiutil.link_tag(request,
                                       wikiutil.quoteWikinameURL(pagename) + "?action=edit",
-                                      img, formatter=macro.formatter)
+                                      img, formatter=macro.formatter, rel="nofollow")
     elif is_new:
         # show "NEW" icon if page was created after the user's bookmark
         if hilite:
             img = request.theme.make_icon('new')
             html_link = wikiutil.link_tag(request,
                                           wikiutil.quoteWikinameURL(pagename),
-                                          img, formatter=macro.formatter)
+                                          img, formatter=macro.formatter, rel="nofollow")
     elif hilite:
         # show "UPDATED" icon if page was edited after the user's bookmark
         img = request.theme.make_icon('updated')
         html_link = wikiutil.link_tag(request,
                                       wikiutil.quoteWikinameURL(pagename) + "?action=diff&date=%d" % bookmark_usecs,
-                                      img, formatter=macro.formatter)
+                                      img, formatter=macro.formatter, rel="nofollow")
     else:
         # show "DIFF" icon else
         img = request.theme.make_icon('diffrc')
         html_link = wikiutil.link_tag(request,
                                       wikiutil.quoteWikinameURL(line.pagename) + "?action=diff",
-                                      img, formatter=macro.formatter)
+                                      img, formatter=macro.formatter, rel="nofollow")
 
     # print name of page, with a link to it
     force_split = len(page.page_name) > _MAX_PAGENAME_LENGTH
@@ -133,7 +133,7 @@
         img = request.theme.make_icon('info')
         info_html = wikiutil.link_tag(request,
                                       wikiutil.quoteWikinameURL(line.pagename) + "?action=info",
-                                      img, formatter=macro.formatter)
+                                      img, formatter=macro.formatter, rel="nofollow")
         d['info_html'] = info_html
 
     return request.theme.recentchanges_entry(d)
@@ -259,14 +259,14 @@
         url = wikiutil.quoteWikinameURL(pagename) + "?action=bookmark&time=del"
         deleteBookmark = wikiutil.link_tag(request, url, _("Delete Bookmark"),
-                                           formatter=macro.formatter)
+                                           formatter=macro.formatter, rel="nofollow")
         d['rc_curr_bookmark'] = currentBookmark + ' ' + deleteBookmark
 
         version = wikiutil.timestamp2version(tnow)
         url = wikiutil.quoteWikinameURL(pagename) + \
             "?action=bookmark&time=%d" % version
         d['rc_update_bookmark'] = wikiutil.link_tag(request, url, _("Set bookmark"),
-                                                    formatter=macro.formatter)
+                                                    formatter=macro.formatter, rel="nofollow")
 
     # set max size in days
     max_days = min(int(request.form.get('max_days', [0])[0]), _DAYS_SELECTION[-1])
@@ -314,7 +314,7 @@
                 wikiutil.quoteWikinameURL(
                     macro.formatter.page.page_name) + "?action=bookmark&time=%d" % (pages[0][0].ed_time_usecs,),
                 _("set bookmark"),
-                formatter=macro.formatter)
+                formatter=macro.formatter, rel="nofollow")
         else:
             d['bookmark_link_html'] = None
         d['date'] = request.user.getFormattedDate(wikiutil.version2timestamp(pages[0][0].ed_time_usecs))
@@ -360,7 +360,7 @@
                 wikiutil.quoteWikinameURL(
                     macro.formatter.page.page_name) + "?action=bookmark&time=%d" % (pages[0][0].ed_time_usecs,),
                 _("Set bookmark"),
-                formatter=macro.formatter)
+                formatter=macro.formatter, rel="nofollow")
         else:
             d['bookmark_link_html'] = None
         d['date'] = request.user.getFormattedDate(wikiutil.version2timestamp(pages[0][0].ed_time_usecs))
--- a/MoinMoin/macro/WantedPages.py Fri Jun 30 21:34:45 2006 +0200
+++ b/MoinMoin/macro/WantedPages.py Fri Jun 30 21:35:59 2006 +0200
@@ -75,7 +75,7 @@
             if macro.formatter.page.page_name in where:
                 where.remove(macro.formatter.page.page_name)
             querystr='highlight=%s' % wikiutil.url_quote_plus(name)
-            wherelinks = [pages[pagename].link_to(request, querystr=querystr)
+            wherelinks = [pages[pagename].link_to(request, querystr=querystr, rel='nofollow')
                           for pagename in where]
             result.append(": " + ', '.join(wherelinks))
         result.append(macro.formatter.listitem(0))
--- a/MoinMoin/multiconfig.py Fri Jun 30 21:34:45 2006 +0200
+++ b/MoinMoin/multiconfig.py Fri Jun 30 21:35:59 2006 +0200
@@ -276,6 +276,7 @@
                            # instead of just IPs
 
     xapian_search = False # disabled until xapian is finished
+    xapian_index_dir = None
 
     mail_login = None # or "user pwd" if you need to use SMTP AUTH
     mail_sendmail = None # "/usr/sbin/sendmail -t -i" to not use SMTP, but sendmail
@@ -387,7 +388,7 @@
     user_checkbox_fields = [
         ('mailto_author', lambda _: _('Publish my email (not my wiki homepage) in author info')),
         ('edit_on_doubleclick', lambda _: _('Open editor on double click')),
-        ('remember_last_visit', lambda _: _('Jump to last visited page instead of frontpage')),
+        ('remember_last_visit', lambda _: _('After login, jump to last visited page')),
         ('show_nonexist_qm', lambda _: _('Show question mark for non-existing pagelinks')),
         ('show_page_trail', lambda _: _('Show page trail')),
         ('show_toolbar', lambda _: _('Show icon toolbar')),
@@ -481,7 +482,7 @@
             name = dirname + '_dir'
             if not getattr(self, name, None):
                 setattr(self, name, os.path.join(data_dir, dirname))
-        
+
         # Try to decode certain names which allow unicode
         self._decode()
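The new xapian_index_dir option is consumed by Index._main_dir() in the Xapian.py hunk above: when set, each wiki's index lives under <xapian_index_dir>/<siteid> instead of <cache_dir>/xapian. A hedged config sketch (the path is an example only):

    # Hypothetical wikiconfig.py excerpt for the new option.
    from MoinMoin.multiconfig import DefaultConfig

    class Config(DefaultConfig):
        xapian_search = True
        xapian_index_dir = '/var/cache/moin-xapian'  # shared base dir for a
                                                     # wiki farm, keyed per
                                                     # wiki by siteid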
--- a/MoinMoin/script/migration/1050301.py Fri Jun 30 21:34:45 2006 +0200
+++ b/MoinMoin/script/migration/1050301.py Fri Jun 30 21:35:59 2006 +0200
@@ -1,13 +1,13 @@
 # -*- coding: iso-8859-1 -*-
 """
-    MoinMoin - dummy migration terminator script
+    MoinMoin - migration from base rev 1050301
 
-    This must be the last migration script.
+    Nothing to do, we just return the new data dir revision.
 
     @copyright: 2006 by Thomas Waldmann
     @license: GNU GPL, see COPYING for details.
 """
 
 def execute(script, data_dir, rev):
-    return None
+    return 1050400
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/MoinMoin/script/migration/1050400.py Fri Jun 30 21:35:59 2006 +0200
@@ -0,0 +1,13 @@
+# -*- coding: iso-8859-1 -*-
+"""
+    MoinMoin - dummy migration terminator script
+
+    This must be the last migration script.
+
+    @copyright: 2006 by Thomas Waldmann
+    @license: GNU GPL, see COPYING for details.
+"""
+
+def execute(script, data_dir, rev):
+    return None
+
--- a/MoinMoin/search.py Fri Jun 30 21:34:45 2006 +0200
+++ b/MoinMoin/search.py Fri Jun 30 21:35:59 2006 +0200
@@ -10,13 +10,17 @@
     @license: GNU GPL, see COPYING for details
 """
 
-import re, time, sys, StringIO, string
+import re, time, sys, StringIO, string, operator
+from sets import Set
 from MoinMoin import wikiutil, config
 from MoinMoin.Page import Page
 
-import Xapian
-from xapian import Query
-from Xapian import UnicodeQuery
+try:
+    import Xapian
+    from Xapian import Query, UnicodeQuery
+    use_stemming = Xapian.use_stemming
+except ImportError:
+    use_stemming = False
 
 #############################################################################
 ### query objects
@@ -75,7 +79,7 @@
         """
         return ''
 
-    def _build_re(self, pattern, use_re=False, case=False):
+    def _build_re(self, pattern, use_re=False, case=False, stemmed=False):
         """ Make a regular expression out of a text pattern """
         flags = case and re.U or (re.I | re.U)
         if use_re:
@@ -175,15 +179,15 @@
             wanted = wanted and term.xapian_wanted()
         return wanted
 
-    def xapian_term(self):
+    def xapian_term(self, request):
         # sort negated terms
         terms = []
         not_terms = []
         for term in self._subterms:
             if not term.negated:
-                terms.append(term.xapian_term())
+                terms.append(term.xapian_term(request))
             else:
-                not_terms.append(term.xapian_term())
+                not_terms.append(term.xapian_term(request))
 
         # prepare query for not negated terms
         if len(terms) == 1:
@@ -224,9 +228,9 @@
                 matches.extend(result)
         return matches
 
-    def xapian_term(self):
+    def xapian_term(self, request):
         # XXX: negated terms managed by _moinSearch?
-        return Query(Query.OP_OR, [term.xapian_term() for term in self._subterms])
+        return Query(Query.OP_OR, [term.xapian_term(request) for term in self._subterms])
 
 
 class TextSearch(BaseExpression):
@@ -271,7 +275,23 @@
         # Search in page body
         body = page.get_raw_body()
         for match in self.search_re.finditer(body):
-            matches.append(TextMatch(re_match=match))
+            if use_stemming:
+                # somewhere in regular word
+                if body[match.start()] not in config.chars_upper and \
+                        body[match.start()-1] in config.chars_lower:
+                    continue
+
+                post = 0
+                for c in body[match.end():]:
+                    if c in config.chars_lower:
+                        post += 1
+                    else:
+                        break
+
+                matches.append(TextMatch(start=match.start(),
+                        end=match.end()+post))
+            else:
+                matches.append(TextMatch(re_match=match))
 
         # Decide what to do with the results.
         if ((self.negated and matches) or
@@ -286,25 +306,36 @@
     def xapian_wanted(self):
         return not self.use_re
 
-    def xapian_term(self):
+    def xapian_term(self, request):
         if self.use_re:
             return None # xapian can't do regex search
         else:
-            analyzer = Xapian.WikiAnalyzer()
+            analyzer = Xapian.WikiAnalyzer(language=request.cfg.language_default)
             terms = self._pattern.split()
-            
+
             # all parsed wikiwords, AND'ed
             queries = []
+            stemmed = []
             for t in terms:
-                t = [i.encode(config.charset) for i in list(analyzer.tokenize(t))]
-                if len(t) < 2:
-                    queries.append(UnicodeQuery(t[0]))
+                if use_stemming:
+                    # stemmed OR not stemmed
+                    tmp = []
+                    for i in analyzer.tokenize(t, flat_stemming=False):
+                        tmp.append(UnicodeQuery(Query.OP_OR, i))
+                        stemmed.append(i[1])
+                    t = tmp
                 else:
-                    queries.append(UnicodeQuery(Query.OP_AND, t))
+                    # just not stemmed
+                    t = [UnicodeQuery(i) for i in analyzer.tokenize(t)]
+                queries.append(Query(Query.OP_AND, t))
+
+            if stemmed:
+                self._build_re(' '.join(stemmed), use_re=False,
+                        case=self.case, stemmed=True)
 
             # titlesearch OR parsed wikiwords
             return Query(Query.OP_OR,
-                    (self.titlesearch.xapian_term(),
+                    (self.titlesearch.xapian_term(request),
                      Query(Query.OP_AND, queries)))
 
@@ -322,7 +353,7 @@
         self.negated = 0
         self.use_re = use_re
         self.case = case
-        self._build_re(unicode(pattern), use_re=use_re, case=case)
+        self._build_re(self._pattern, use_re=use_re, case=case)
 
     def costs(self):
         return 100
@@ -348,7 +379,23 @@
         # Get matches in page name
         matches = []
         for match in self.search_re.finditer(page.page_name):
-            matches.append(TitleMatch(re_match=match))
+            if use_stemming:
+                # somewhere in regular word
+                if page.page_name[match.start()] not in config.chars_upper and \
+                        page.page_name[match.start()-1] in config.chars_lower:
+                    continue
+
+                post = 0
+                for c in page.page_name[match.end():]:
+                    if c in config.chars_lower:
+                        post += 1
+                    else:
+                        break
+
+                matches.append(TitleMatch(start=match.start(),
+                        end=match.end()+post))
+            else:
+                matches.append(TitleMatch(re_match=match))
 
         if ((self.negated and matches) or
             (not self.negated and not matches)):
@@ -362,23 +409,36 @@
     def xapian_wanted(self):
         return not self.use_re
 
-    def xapian_term(self):
+    def xapian_term(self, request):
        if self.use_re:
            return None # xapian doesn't support regex search
        else:
-            analyzer = Xapian.WikiAnalyzer()
+            analyzer = Xapian.WikiAnalyzer(language=request.cfg.language_default)
            terms = self._pattern.split()
-            terms = [list(analyzer.tokenize(t)) for t in terms]
+            terms = [list(analyzer.raw_tokenize(t)) for t in terms]
 
            # all parsed wikiwords, AND'ed
            queries = []
+            stemmed = []
            for t in terms:
-                t = ['%s%s' % (Xapian.Index.prefixMap['title'], i)
-                     for i in list(analyzer.tokenize(t))]
-                if len(t) < 2:
-                    queries.append(UnicodeQuery(t[0]))
+                if use_stemming:
+                    # stemmed OR not stemmed
+                    tmp = []
+                    for i in analyzer.tokenize(t, flat_stemming=False):
+                        tmp.append(UnicodeQuery(Query.OP_OR, ['%s%s' %
+                            (Xapian.Index.prefixMap['title'], j) for j in i]))
+                        stemmed.append(i[1])
+                    t = tmp
                 else:
-                    queries.append(UnicodeQuery(Query.OP_AND, t))
+                    # just not stemmed
+                    t = [UnicodeQuery('%s%s' % (Xapian.Index.prefixMap['title'], i))
+                        for i in analyzer.tokenize(t)]
+
+                queries.append(Query(Query.OP_AND, t))
+
+            if stemmed:
+                self._build_re(' '.join(stemmed), use_re=False,
+                        case=self.case, stemmed=True)
 
            return Query(Query.OP_AND, queries)
 
@@ -387,7 +447,7 @@
     """ Search the term in the pagelinks """
 
     def __init__(self, pattern, use_re=False, case=True):
-        """ Init a title search
+        """ Init a link search
 
         @param pattern: pattern to search for, ascii string or unicode
         @param use_re: treat pattern as re of plain text, bool
@@ -459,7 +519,7 @@
     def xapian_wanted(self):
         return not self.use_re
 
-    def xapian_term(self):
+    def xapian_term(self, request):
         pattern = self.pattern
         if self.use_re:
             return None # xapian doesnt support regex search
@@ -467,6 +527,56 @@
             return UnicodeQuery('%s:%s' %
                     (Xapian.Index.prefixMap['linkto'], pattern))
 
+
+class LanguageSearch(BaseExpression):
+    """ Search the pages written in a language """
+
+    def __init__(self, pattern, use_re=False, case=True):
+        """ Init a language search
+
+        @param pattern: pattern to search for, ascii string or unicode
+        @param use_re: treat pattern as re of plain text, bool
+        @param case: do case sensitive search, bool
+        """
+        # iso language code, always lowercase
+        self._pattern = pattern.lower()
+        self.negated = 0
+        self.use_re = use_re
+        self.case = case
+        self.xapian_called = False
+        self._build_re(self._pattern, use_re=use_re, case=case)
+
+    def costs(self):
+        return 5000 # cheaper than a TextSearch
+
+    def __unicode__(self):
+        neg = self.negated and '-' or ''
+        return u'%s!"%s"' % (neg, unicode(self._pattern))
+
+    def highlight_re(self):
+        return ""
+
+    def search(self, page):
+        # We just use (and trust ;)) xapian for this.. deactivated for _moinSearch
+        if not self.xapian_called:
+            return None
+        else:
+            # XXX why not return None or empty list?
+            return [Match()]
+
+    def xapian_wanted(self):
+        return not self.use_re
+
+    def xapian_term(self, request):
+        pattern = self.pattern
+        if self.use_re:
+            return None # xapian doesnt support regex search
+        else:
+            self.xapian_called = True
+            return UnicodeQuery('%s%s' %
+                    (Xapian.Index.prefixMap['lang'], pattern))
+
+
 ############################################################################
 ### Results
 ############################################################################
@@ -765,7 +875,8 @@
         title_search = self.titlesearch
         regex = self.regex
         case = self.case
-        linkto = 0
+        linkto = False
+        lang = False
 
         for m in modifiers:
             if "title".startswith(m):
@@ -776,8 +887,12 @@
                 case = True
             elif "linkto".startswith(m):
                 linkto = True
+            elif "language".startswith(m):
+                lang = True
 
-        if linkto:
+        if lang:
+            obj = LanguageSearch(text, use_re=regex, case=False)
+        elif linkto:
             obj = LinkSearch(text, use_re=regex, case=case)
         elif title_search:
             obj = TitleSearch(text, use_re=regex, case=case)
@@ -937,6 +1052,9 @@
                 'do': 'get',
                 'target': page.attachment,
             }
+        elif page.page_name.startswith('FS/'): # XXX FS hardcoded
+            fmt_context = ""
+            querydict = None
         else:
             fmt_context = self.formatContext(page, context, maxlines)
             querydict = None
@@ -1253,12 +1371,15 @@
         return moin search in those pages.
         """
         pages = None
-        index = Xapian.Index(self.request)
-        if index.exists() and self.query.xapian_wanted():
+        try:
+            index = Xapian.Index(self.request)
+        except NameError:
+            index = None
+        if index and index.exists() and self.query.xapian_wanted():
             self.request.clock.start('_xapianSearch')
             try:
                 from MoinMoin.support import xapwrap
-                query = self.query.xapian_term()
+                query = self.query.xapian_term(self.request)
                 self.request.log("xapianSearch: query = %r" %
                         query.get_description())
                 query = xapwrap.index.QObjQuery(query)
@@ -1298,7 +1419,7 @@
             page = Page(self.request, pagename)
             if attachment:
                 if pagename == fs_rootpage: # not really an attachment
-                    page = Page(self.request, "%s%s" % (fs_rootpage, attachment))
+                    page = Page(self.request, "%s/%s" % (fs_rootpage, attachment))
                     hits.append((wikiname, page, None, None))
                 else:
                     hits.append((wikiname, page, attachment, None))
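The QueryParser hunks above add "language" to the recognized search modifiers, mapping it onto the new LanguageSearch term. A standalone sketch of the prefix-matching dispatch as extended above (simplified; the real parser also tracks the title/regex/case state shown in the hunk):

    # Standalone sketch of the modifier dispatch; not the full QueryParser.
    def wants_language_search(modifiers):
        linkto = lang = False
        for m in modifiers:
            if "linkto".startswith(m):
                linkto = True
            elif "language".startswith(m):
                lang = True
        return lang

    # any unambiguous prefix selects the modifier:
    print wants_language_search(['language'])   # True
    print wants_language_search(['la'])         # True ("linkto" doesn't match "la")
    print wants_language_search(['l'])          # False (bare "l" matches "linkto" first)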
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/MoinMoin/support/difflib.py Fri Jun 30 21:35:59 2006 +0200 @@ -0,0 +1,2026 @@ +#! /usr/bin/env python +# Python 2.4.3 (maybe other versions, too) has a broken difflib, sometimes +# raising a "maximum recursion depth exceeded in cmp" exception. +# This is taken from python.org SVN repo revision 46940 with patches +# 36160 and 34415 reversed for python2.3 compatibility. + +""" +Module difflib -- helpers for computing deltas between objects. + +Function get_close_matches(word, possibilities, n=3, cutoff=0.6): + Use SequenceMatcher to return list of the best "good enough" matches. + +Function context_diff(a, b): + For two lists of strings, return a delta in context diff format. + +Function ndiff(a, b): + Return a delta: the difference between `a` and `b` (lists of strings). + +Function restore(delta, which): + Return one of the two sequences that generated an ndiff delta. + +Function unified_diff(a, b): + For two lists of strings, return a delta in unified diff format. + +Class SequenceMatcher: + A flexible class for comparing pairs of sequences of any type. + +Class Differ: + For producing human-readable deltas from sequences of lines of text. + +Class HtmlDiff: + For producing HTML side by side comparison with change highlights. +""" + +__all__ = ['get_close_matches', 'ndiff', 'restore', 'SequenceMatcher', + 'Differ','IS_CHARACTER_JUNK', 'IS_LINE_JUNK', 'context_diff', + 'unified_diff', 'HtmlDiff'] + +def _calculate_ratio(matches, length): + if length: + return 2.0 * matches / length + return 1.0 + +class SequenceMatcher: + + """ + SequenceMatcher is a flexible class for comparing pairs of sequences of + any type, so long as the sequence elements are hashable. The basic + algorithm predates, and is a little fancier than, an algorithm + published in the late 1980's by Ratcliff and Obershelp under the + hyperbolic name "gestalt pattern matching". The basic idea is to find + the longest contiguous matching subsequence that contains no "junk" + elements (R-O doesn't address junk). The same idea is then applied + recursively to the pieces of the sequences to the left and to the right + of the matching subsequence. This does not yield minimal edit + sequences, but does tend to yield matches that "look right" to people. + + SequenceMatcher tries to compute a "human-friendly diff" between two + sequences. Unlike e.g. UNIX(tm) diff, the fundamental notion is the + longest *contiguous* & junk-free matching subsequence. That's what + catches peoples' eyes. The Windows(tm) windiff has another interesting + notion, pairing up elements that appear uniquely in each sequence. + That, and the method here, appear to yield more intuitive difference + reports than does diff. This method appears to be the least vulnerable + to synching up on blocks of "junk lines", though (like blank lines in + ordinary text files, or maybe "<P>" lines in HTML files). That may be + because this is the only method of the 3 that has a *concept* of + "junk" <wink>. + + Example, comparing two strings, and considering blanks to be "junk": + + >>> s = SequenceMatcher(lambda x: x == " ", + ... "private Thread currentThread;", + ... "private volatile Thread currentThread;") + >>> + + .ratio() returns a float in [0, 1], measuring the "similarity" of the + sequences. 
As a rule of thumb, a .ratio() value over 0.6 means the + sequences are close matches: + + >>> print round(s.ratio(), 3) + 0.866 + >>> + + If you're only interested in where the sequences match, + .get_matching_blocks() is handy: + + >>> for block in s.get_matching_blocks(): + ... print "a[%d] and b[%d] match for %d elements" % block + a[0] and b[0] match for 8 elements + a[8] and b[17] match for 21 elements + a[29] and b[38] match for 0 elements + + Note that the last tuple returned by .get_matching_blocks() is always a + dummy, (len(a), len(b), 0), and this is the only case in which the last + tuple element (number of elements matched) is 0. + + If you want to know how to change the first sequence into the second, + use .get_opcodes(): + + >>> for opcode in s.get_opcodes(): + ... print "%6s a[%d:%d] b[%d:%d]" % opcode + equal a[0:8] b[0:8] + insert a[8:8] b[8:17] + equal a[8:29] b[17:38] + + See the Differ class for a fancy human-friendly file differencer, which + uses SequenceMatcher both to compare sequences of lines, and to compare + sequences of characters within similar (near-matching) lines. + + See also function get_close_matches() in this module, which shows how + simple code building on SequenceMatcher can be used to do useful work. + + Timing: Basic R-O is cubic time worst case and quadratic time expected + case. SequenceMatcher is quadratic time for the worst case and has + expected-case behavior dependent in a complicated way on how many + elements the sequences have in common; best case time is linear. + + Methods: + + __init__(isjunk=None, a='', b='') + Construct a SequenceMatcher. + + set_seqs(a, b) + Set the two sequences to be compared. + + set_seq1(a) + Set the first sequence to be compared. + + set_seq2(b) + Set the second sequence to be compared. + + find_longest_match(alo, ahi, blo, bhi) + Find longest matching block in a[alo:ahi] and b[blo:bhi]. + + get_matching_blocks() + Return list of triples describing matching subsequences. + + get_opcodes() + Return list of 5-tuples describing how to turn a into b. + + ratio() + Return a measure of the sequences' similarity (float in [0,1]). + + quick_ratio() + Return an upper bound on .ratio() relatively quickly. + + real_quick_ratio() + Return an upper bound on ratio() very quickly. + """ + + def __init__(self, isjunk=None, a='', b=''): + """Construct a SequenceMatcher. + + Optional arg isjunk is None (the default), or a one-argument + function that takes a sequence element and returns true iff the + element is junk. None is equivalent to passing "lambda x: 0", i.e. + no elements are considered to be junk. For example, pass + lambda x: x in " \\t" + if you're comparing lines as sequences of characters, and don't + want to synch up on blanks or hard tabs. + + Optional arg a is the first of two sequences to be compared. By + default, an empty string. The elements of a must be hashable. See + also .set_seqs() and .set_seq1(). + + Optional arg b is the second of two sequences to be compared. By + default, an empty string. The elements of b must be hashable. See + also .set_seqs() and .set_seq2(). + """ + + # Members: + # a + # first sequence + # b + # second sequence; differences are computed as "what do + # we need to do to 'a' to change it into 'b'?" 
+ # b2j + # for x in b, b2j[x] is a list of the indices (into b) + # at which x appears; junk elements do not appear + # fullbcount + # for x in b, fullbcount[x] == the number of times x + # appears in b; only materialized if really needed (used + # only for computing quick_ratio()) + # matching_blocks + # a list of (i, j, k) triples, where a[i:i+k] == b[j:j+k]; + # ascending & non-overlapping in i and in j; terminated by + # a dummy (len(a), len(b), 0) sentinel + # opcodes + # a list of (tag, i1, i2, j1, j2) tuples, where tag is + # one of + # 'replace' a[i1:i2] should be replaced by b[j1:j2] + # 'delete' a[i1:i2] should be deleted + # 'insert' b[j1:j2] should be inserted + # 'equal' a[i1:i2] == b[j1:j2] + # isjunk + # a user-supplied function taking a sequence element and + # returning true iff the element is "junk" -- this has + # subtle but helpful effects on the algorithm, which I'll + # get around to writing up someday <0.9 wink>. + # DON'T USE! Only __chain_b uses this. Use isbjunk. + # isbjunk + # for x in b, isbjunk(x) == isjunk(x) but much faster; + # it's really the has_key method of a hidden dict. + # DOES NOT WORK for x in a! + # isbpopular + # for x in b, isbpopular(x) is true iff b is reasonably long + # (at least 200 elements) and x accounts for more than 1% of + # its elements. DOES NOT WORK for x in a! + + self.isjunk = isjunk + self.a = self.b = None + self.set_seqs(a, b) + + def set_seqs(self, a, b): + """Set the two sequences to be compared. + + >>> s = SequenceMatcher() + >>> s.set_seqs("abcd", "bcde") + >>> s.ratio() + 0.75 + """ + + self.set_seq1(a) + self.set_seq2(b) + + def set_seq1(self, a): + """Set the first sequence to be compared. + + The second sequence to be compared is not changed. + + >>> s = SequenceMatcher(None, "abcd", "bcde") + >>> s.ratio() + 0.75 + >>> s.set_seq1("bcde") + >>> s.ratio() + 1.0 + >>> + + SequenceMatcher computes and caches detailed information about the + second sequence, so if you want to compare one sequence S against + many sequences, use .set_seq2(S) once and call .set_seq1(x) + repeatedly for each of the other sequences. + + See also set_seqs() and set_seq2(). + """ + + if a is self.a: + return + self.a = a + self.matching_blocks = self.opcodes = None + + def set_seq2(self, b): + """Set the second sequence to be compared. + + The first sequence to be compared is not changed. + + >>> s = SequenceMatcher(None, "abcd", "bcde") + >>> s.ratio() + 0.75 + >>> s.set_seq2("abcd") + >>> s.ratio() + 1.0 + >>> + + SequenceMatcher computes and caches detailed information about the + second sequence, so if you want to compare one sequence S against + many sequences, use .set_seq2(S) once and call .set_seq1(x) + repeatedly for each of the other sequences. + + See also set_seqs() and set_seq1(). + """ + + if b is self.b: + return + self.b = b + self.matching_blocks = self.opcodes = None + self.fullbcount = None + self.__chain_b() + + # For each element x in b, set b2j[x] to a list of the indices in + # b where x appears; the indices are in increasing order; note that + # the number of times x appears in b is len(b2j[x]) ... + # when self.isjunk is defined, junk elements don't show up in this + # map at all, which stops the central find_longest_match method + # from starting any matching block at a junk element ... + # also creates the fast isbjunk function ... 
+ # b2j also does not contain entries for "popular" elements, meaning + # elements that account for more than 1% of the total elements, and + # when the sequence is reasonably large (>= 200 elements); this can + # be viewed as an adaptive notion of semi-junk, and yields an enormous + # speedup when, e.g., comparing program files with hundreds of + # instances of "return NULL;" ... + # note that this is only called when b changes; so for cross-product + # kinds of matches, it's best to call set_seq2 once, then set_seq1 + # repeatedly + + def __chain_b(self): + # Because isjunk is a user-defined (not C) function, and we test + # for junk a LOT, it's important to minimize the number of calls. + # Before the tricks described here, __chain_b was by far the most + # time-consuming routine in the whole module! If anyone sees + # Jim Roskind, thank him again for profile.py -- I never would + # have guessed that. + # The first trick is to build b2j ignoring the possibility + # of junk. I.e., we don't call isjunk at all yet. Throwing + # out the junk later is much cheaper than building b2j "right" + # from the start. + b = self.b + n = len(b) + self.b2j = b2j = {} + populardict = {} + for i, elt in enumerate(b): + if elt in b2j: + indices = b2j[elt] + if n >= 200 and len(indices) * 100 > n: + populardict[elt] = 1 + del indices[:] + else: + indices.append(i) + else: + b2j[elt] = [i] + + # Purge leftover indices for popular elements. + for elt in populardict: + del b2j[elt] + + # Now b2j.keys() contains elements uniquely, and especially when + # the sequence is a string, that's usually a good deal smaller + # than len(string). The difference is the number of isjunk calls + # saved. + isjunk = self.isjunk + junkdict = {} + if isjunk: + for d in populardict, b2j: + for elt in d.keys(): + if isjunk(elt): + junkdict[elt] = 1 + del d[elt] + + # Now for x in b, isjunk(x) == x in junkdict, but the + # latter is much faster. Note too that while there may be a + # lot of junk in the sequence, the number of *unique* junk + # elements is probably small. So the memory burden of keeping + # this dict alive is likely trivial compared to the size of b2j. + self.isbjunk = junkdict.has_key + self.isbpopular = populardict.has_key + + def find_longest_match(self, alo, ahi, blo, bhi): + """Find longest matching block in a[alo:ahi] and b[blo:bhi]. + + If isjunk is not defined: + + Return (i,j,k) such that a[i:i+k] is equal to b[j:j+k], where + alo <= i <= i+k <= ahi + blo <= j <= j+k <= bhi + and for all (i',j',k') meeting those conditions, + k >= k' + i <= i' + and if i == i', j <= j' + + In other words, of all maximal matching blocks, return one that + starts earliest in a, and of all those maximal matching blocks that + start earliest in a, return the one that starts earliest in b. + + >>> s = SequenceMatcher(None, " abcd", "abcd abcd") + >>> s.find_longest_match(0, 5, 0, 9) + (0, 4, 5) + + If isjunk is defined, first the longest matching block is + determined as above, but with the additional restriction that no + junk element appears in the block. Then that block is extended as + far as possible by matching (only) junk elements on both sides. So + the resulting block never matches on junk except as identical junk + happens to be adjacent to an "interesting" match. + + Here's the same example as before, but considering blanks to be + junk. That prevents " abcd" from matching the " abcd" at the tail + end of the second sequence directly. 
Instead only the "abcd" can + match, and matches the leftmost "abcd" in the second sequence: + + >>> s = SequenceMatcher(lambda x: x==" ", " abcd", "abcd abcd") + >>> s.find_longest_match(0, 5, 0, 9) + (1, 0, 4) + + If no blocks match, return (alo, blo, 0). + + >>> s = SequenceMatcher(None, "ab", "c") + >>> s.find_longest_match(0, 2, 0, 1) + (0, 0, 0) + """ + + # CAUTION: stripping common prefix or suffix would be incorrect. + # E.g., + # ab + # acab + # Longest matching block is "ab", but if common prefix is + # stripped, it's "a" (tied with "b"). UNIX(tm) diff does so + # strip, so ends up claiming that ab is changed to acab by + # inserting "ca" in the middle. That's minimal but unintuitive: + # "it's obvious" that someone inserted "ac" at the front. + # Windiff ends up at the same place as diff, but by pairing up + # the unique 'b's and then matching the first two 'a's. + + a, b, b2j, isbjunk = self.a, self.b, self.b2j, self.isbjunk + besti, bestj, bestsize = alo, blo, 0 + # find longest junk-free match + # during an iteration of the loop, j2len[j] = length of longest + # junk-free match ending with a[i-1] and b[j] + j2len = {} + nothing = [] + for i in xrange(alo, ahi): + # look at all instances of a[i] in b; note that because + # b2j has no junk keys, the loop is skipped if a[i] is junk + j2lenget = j2len.get + newj2len = {} + for j in b2j.get(a[i], nothing): + # a[i] matches b[j] + if j < blo: + continue + if j >= bhi: + break + k = newj2len[j] = j2lenget(j-1, 0) + 1 + if k > bestsize: + besti, bestj, bestsize = i-k+1, j-k+1, k + j2len = newj2len + + # Extend the best by non-junk elements on each end. In particular, + # "popular" non-junk elements aren't in b2j, which greatly speeds + # the inner loop above, but also means "the best" match so far + # doesn't contain any junk *or* popular non-junk elements. + while besti > alo and bestj > blo and \ + not isbjunk(b[bestj-1]) and \ + a[besti-1] == b[bestj-1]: + besti, bestj, bestsize = besti-1, bestj-1, bestsize+1 + while besti+bestsize < ahi and bestj+bestsize < bhi and \ + not isbjunk(b[bestj+bestsize]) and \ + a[besti+bestsize] == b[bestj+bestsize]: + bestsize += 1 + + # Now that we have a wholly interesting match (albeit possibly + # empty!), we may as well suck up the matching junk on each + # side of it too. Can't think of a good reason not to, and it + # saves post-processing the (possibly considerable) expense of + # figuring out what to do with it. In the case of an empty + # interesting match, this is clearly the right thing to do, + # because no other kind of match is possible in the regions. + while besti > alo and bestj > blo and \ + isbjunk(b[bestj-1]) and \ + a[besti-1] == b[bestj-1]: + besti, bestj, bestsize = besti-1, bestj-1, bestsize+1 + while besti+bestsize < ahi and bestj+bestsize < bhi and \ + isbjunk(b[bestj+bestsize]) and \ + a[besti+bestsize] == b[bestj+bestsize]: + bestsize = bestsize + 1 + + return besti, bestj, bestsize + + def get_matching_blocks(self): + """Return list of triples describing matching subsequences. + + Each triple is of the form (i, j, n), and means that + a[i:i+n] == b[j:j+n]. The triples are monotonically increasing in + i and in j. New in Python 2.5, it's also guaranteed that if + (i, j, n) and (i', j', n') are adjacent triples in the list, and + the second is not the last triple in the list, then i+n != i' or + j+n != j'. IOW, adjacent triples never describe adjacent equal + blocks. + + The last triple is a dummy, (len(a), len(b), 0), and is the only + triple with n==0. 
+
+ >>> s = SequenceMatcher(None, "abxcd", "abcd")
+ >>> s.get_matching_blocks()
+ [(0, 0, 2), (3, 2, 2), (5, 4, 0)]
+ """
+
+ if self.matching_blocks is not None:
+ return self.matching_blocks
+ la, lb = len(self.a), len(self.b)
+
+ # This is most naturally expressed as a recursive algorithm, but
+ # at least one user bumped into extreme use cases that exceeded
+ # the recursion limit on their box. So, now we maintain a list
+ # (`queue`) of blocks we still need to look at, and append partial
+ # results to `matching_blocks` in a loop; the matches are sorted
+ # at the end.
+ queue = [(0, la, 0, lb)]
+ matching_blocks = []
+ while queue:
+ alo, ahi, blo, bhi = queue.pop()
+ i, j, k = x = self.find_longest_match(alo, ahi, blo, bhi)
+ # a[alo:i] vs b[blo:j] unknown
+ # a[i:i+k] same as b[j:j+k]
+ # a[i+k:ahi] vs b[j+k:bhi] unknown
+ if k: # if k is 0, there was no matching block
+ matching_blocks.append(x)
+ if alo < i and blo < j:
+ queue.append((alo, i, blo, j))
+ if i+k < ahi and j+k < bhi:
+ queue.append((i+k, ahi, j+k, bhi))
+ matching_blocks.sort()
+
+ # It's possible that we have adjacent equal blocks in the
+ # matching_blocks list now. Starting with 2.5, this code was added
+ # to collapse them.
+ i1 = j1 = k1 = 0
+ non_adjacent = []
+ for i2, j2, k2 in matching_blocks:
+ # Is this block adjacent to i1, j1, k1?
+ if i1 + k1 == i2 and j1 + k1 == j2:
+ # Yes, so collapse them -- this just increases the length of
+ # the first block by the length of the second, and the first
+ # block so lengthened remains the block to compare against.
+ k1 += k2
+ else:
+ # Not adjacent. Remember the first block (k1==0 means it's
+ # the dummy we started with), and make the second block the
+ # new block to compare against.
+ if k1:
+ non_adjacent.append((i1, j1, k1))
+ i1, j1, k1 = i2, j2, k2
+ if k1:
+ non_adjacent.append((i1, j1, k1))
+
+ non_adjacent.append( (la, lb, 0) )
+ self.matching_blocks = non_adjacent
+ return self.matching_blocks
+
+ def get_opcodes(self):
+ """Return list of 5-tuples describing how to turn a into b.
+
+ Each tuple is of the form (tag, i1, i2, j1, j2). The first tuple
+ has i1 == j1 == 0, and remaining tuples have i1 == the i2 from the
+ tuple preceding it, and likewise for j1 == the previous j2.
+
+ The tags are strings, with these meanings:
+
+ 'replace': a[i1:i2] should be replaced by b[j1:j2]
+ 'delete': a[i1:i2] should be deleted.
+ Note that j1==j2 in this case.
+ 'insert': b[j1:j2] should be inserted at a[i1:i1].
+ Note that i1==i2 in this case.
+ 'equal': a[i1:i2] == b[j1:j2]
+
+ >>> a = "qabxcd"
+ >>> b = "abycdf"
+ >>> s = SequenceMatcher(None, a, b)
+ >>> for tag, i1, i2, j1, j2 in s.get_opcodes():
+ ... print ("%7s a[%d:%d] (%s) b[%d:%d] (%s)" %
+ ... (tag, i1, i2, a[i1:i2], j1, j2, b[j1:j2]))
+ delete a[0:1] (q) b[0:0] ()
+ equal a[1:3] (ab) b[0:2] (ab)
+ replace a[3:4] (x) b[2:3] (y)
+ equal a[4:6] (cd) b[3:5] (cd)
+ insert a[6:6] () b[5:6] (f)
+ """
+
+ if self.opcodes is not None:
+ return self.opcodes
+ i = j = 0
+ self.opcodes = answer = []
+ for ai, bj, size in self.get_matching_blocks():
+ # invariant: we've pumped out correct diffs to change
+ # a[:i] into b[:j], and the next matching block is
+ # a[ai:ai+size] == b[bj:bj+size]. So we need to pump
+ # out a diff to change a[i:ai] into b[j:bj], pump out
+ # the matching block, and move (i,j) beyond the match
+ tag = ''
+ if i < ai and j < bj:
+ tag = 'replace'
+ elif i < ai:
+ tag = 'delete'
+ elif j < bj:
+ tag = 'insert'
+ if tag:
+ answer.append( (tag, i, ai, j, bj) )
+ i, j = ai+size, bj+size
+ # the list of matching blocks is terminated by a
+ # sentinel with size 0
+ if size:
+ answer.append( ('equal', ai, i, bj, j) )
+ return answer
+
+ def get_grouped_opcodes(self, n=3):
+ """ Isolate change clusters by eliminating ranges with no changes.
+
+ Return a generator of groups with up to n lines of context.
+ Each group is in the same format as returned by get_opcodes().
+
+ >>> from pprint import pprint
+ >>> a = map(str, range(1,40))
+ >>> b = a[:]
+ >>> b[8:8] = ['i'] # Make an insertion
+ >>> b[20] += 'x' # Make a replacement
+ >>> b[23:28] = [] # Make a deletion
+ >>> b[30] += 'y' # Make another replacement
+ >>> pprint(list(SequenceMatcher(None,a,b).get_grouped_opcodes()))
+ [[('equal', 5, 8, 5, 8), ('insert', 8, 8, 8, 9), ('equal', 8, 11, 9, 12)],
+ [('equal', 16, 19, 17, 20),
+ ('replace', 19, 20, 20, 21),
+ ('equal', 20, 22, 21, 23),
+ ('delete', 22, 27, 23, 23),
+ ('equal', 27, 30, 23, 26)],
+ [('equal', 31, 34, 27, 30),
+ ('replace', 34, 35, 30, 31),
+ ('equal', 35, 38, 31, 34)]]
+ """
+
+ codes = self.get_opcodes()
+ if not codes:
+ codes = [("equal", 0, 1, 0, 1)]
+ # Fixup leading and trailing groups if they show no changes.
+ if codes[0][0] == 'equal':
+ tag, i1, i2, j1, j2 = codes[0]
+ codes[0] = tag, max(i1, i2-n), i2, max(j1, j2-n), j2
+ if codes[-1][0] == 'equal':
+ tag, i1, i2, j1, j2 = codes[-1]
+ codes[-1] = tag, i1, min(i2, i1+n), j1, min(j2, j1+n)
+
+ nn = n + n
+ group = []
+ for tag, i1, i2, j1, j2 in codes:
+ # End the current group and start a new one whenever
+ # there is a large range with no changes.
+ if tag == 'equal' and i2-i1 > nn:
+ group.append((tag, i1, min(i2, i1+n), j1, min(j2, j1+n)))
+ yield group
+ group = []
+ i1, j1 = max(i1, i2-n), max(j1, j2-n)
+ group.append((tag, i1, i2, j1, j2))
+ if group and not (len(group)==1 and group[0][0] == 'equal'):
+ yield group
+
+ def ratio(self):
+ """Return a measure of the sequences' similarity (float in [0,1]).
+
+ Where T is the total number of elements in both sequences, and
+ M is the number of matches, this is 2.0*M / T.
+ Note that this is 1 if the sequences are identical, and 0 if
+ they have nothing in common.
+
+ .ratio() is expensive to compute if you haven't already computed
+ .get_matching_blocks() or .get_opcodes(), in which case you may
+ want to try .quick_ratio() or .real_quick_ratio() first to get an
+ upper bound.
+
+ >>> s = SequenceMatcher(None, "abcd", "bcde")
+ >>> s.ratio()
+ 0.75
+ >>> s.quick_ratio()
+ 0.75
+ >>> s.real_quick_ratio()
+ 1.0
+ """
+
+ matches = reduce(lambda sum, triple: sum + triple[-1],
+ self.get_matching_blocks(), 0)
+ return _calculate_ratio(matches, len(self.a) + len(self.b))
+
+ def quick_ratio(self):
+ """Return an upper bound on ratio() relatively quickly.
+
+ This isn't defined beyond that it is an upper bound on .ratio(), and
+ is faster to compute.
+ """ + + # viewing a and b as multisets, set matches to the cardinality + # of their intersection; this counts the number of matches + # without regard to order, so is clearly an upper bound + if self.fullbcount is None: + self.fullbcount = fullbcount = {} + for elt in self.b: + fullbcount[elt] = fullbcount.get(elt, 0) + 1 + fullbcount = self.fullbcount + # avail[x] is the number of times x appears in 'b' less the + # number of times we've seen it in 'a' so far ... kinda + avail = {} + availhas, matches = avail.has_key, 0 + for elt in self.a: + if availhas(elt): + numb = avail[elt] + else: + numb = fullbcount.get(elt, 0) + avail[elt] = numb - 1 + if numb > 0: + matches = matches + 1 + return _calculate_ratio(matches, len(self.a) + len(self.b)) + + def real_quick_ratio(self): + """Return an upper bound on ratio() very quickly. + + This isn't defined beyond that it is an upper bound on .ratio(), and + is faster to compute than either .ratio() or .quick_ratio(). + """ + + la, lb = len(self.a), len(self.b) + # can't have more matches than the number of elements in the + # shorter sequence + return _calculate_ratio(min(la, lb), la + lb) + +def get_close_matches(word, possibilities, n=3, cutoff=0.6): + """Use SequenceMatcher to return list of the best "good enough" matches. + + word is a sequence for which close matches are desired (typically a + string). + + possibilities is a list of sequences against which to match word + (typically a list of strings). + + Optional arg n (default 3) is the maximum number of close matches to + return. n must be > 0. + + Optional arg cutoff (default 0.6) is a float in [0, 1]. Possibilities + that don't score at least that similar to word are ignored. + + The best (no more than n) matches among the possibilities are returned + in a list, sorted by similarity score, most similar first. + + >>> get_close_matches("appel", ["ape", "apple", "peach", "puppy"]) + ['apple', 'ape'] + >>> import keyword as _keyword + >>> get_close_matches("wheel", _keyword.kwlist) + ['while'] + >>> get_close_matches("apple", _keyword.kwlist) + [] + >>> get_close_matches("accept", _keyword.kwlist) + ['except'] + """ + + if not n > 0: + raise ValueError("n must be > 0: %r" % (n,)) + if not 0.0 <= cutoff <= 1.0: + raise ValueError("cutoff must be in [0.0, 1.0]: %r" % (cutoff,)) + result = [] + s = SequenceMatcher() + s.set_seq2(word) + for x in possibilities: + s.set_seq1(x) + if s.real_quick_ratio() >= cutoff and \ + s.quick_ratio() >= cutoff and \ + s.ratio() >= cutoff: + result.append((s.ratio(), x)) + + # Sort by score. + result.sort() + # Retain only the best n. + result = result[-n:] + # Move best-scorer to head of list. + result.reverse() + # Strip scores. + return [x for score, x in result] + +def _count_leading(line, ch): + """ + Return number of `ch` characters at the start of `line`. + + Example: + + >>> _count_leading(' abc', ' ') + 3 + """ + + i, n = 0, len(line) + while i < n and line[i] == ch: + i += 1 + return i + +class Differ: + r""" + Differ is a class for comparing sequences of lines of text, and + producing human-readable differences or deltas. Differ uses + SequenceMatcher both to compare sequences of lines, and to compare + sequences of characters within similar (near-matching) lines. + + Each line of a Differ delta begins with a two-letter code: + + '- ' line unique to sequence 1 + '+ ' line unique to sequence 2 + ' ' line common to both sequences + '? ' line not present in either input sequence + + Lines beginning with '? 
' attempt to guide the eye to intraline + differences, and were not present in either input sequence. These lines + can be confusing if the sequences contain tab characters. + + Note that Differ makes no claim to produce a *minimal* diff. To the + contrary, minimal diffs are often counter-intuitive, because they synch + up anywhere possible, sometimes accidental matches 100 pages apart. + Restricting synch points to contiguous matches preserves some notion of + locality, at the occasional cost of producing a longer diff. + + Example: Comparing two texts. + + First we set up the texts, sequences of individual single-line strings + ending with newlines (such sequences can also be obtained from the + `readlines()` method of file-like objects): + + >>> text1 = ''' 1. Beautiful is better than ugly. + ... 2. Explicit is better than implicit. + ... 3. Simple is better than complex. + ... 4. Complex is better than complicated. + ... '''.splitlines(1) + >>> len(text1) + 4 + >>> text1[0][-1] + '\n' + >>> text2 = ''' 1. Beautiful is better than ugly. + ... 3. Simple is better than complex. + ... 4. Complicated is better than complex. + ... 5. Flat is better than nested. + ... '''.splitlines(1) + + Next we instantiate a Differ object: + + >>> d = Differ() + + Note that when instantiating a Differ object we may pass functions to + filter out line and character 'junk'. See Differ.__init__ for details. + + Finally, we compare the two: + + >>> result = list(d.compare(text1, text2)) + + 'result' is a list of strings, so let's pretty-print it: + + >>> from pprint import pprint as _pprint + >>> _pprint(result) + [' 1. Beautiful is better than ugly.\n', + '- 2. Explicit is better than implicit.\n', + '- 3. Simple is better than complex.\n', + '+ 3. Simple is better than complex.\n', + '? ++\n', + '- 4. Complex is better than complicated.\n', + '? ^ ---- ^\n', + '+ 4. Complicated is better than complex.\n', + '? ++++ ^ ^\n', + '+ 5. Flat is better than nested.\n'] + + As a single multi-line string it looks like this: + + >>> print ''.join(result), + 1. Beautiful is better than ugly. + - 2. Explicit is better than implicit. + - 3. Simple is better than complex. + + 3. Simple is better than complex. + ? ++ + - 4. Complex is better than complicated. + ? ^ ---- ^ + + 4. Complicated is better than complex. + ? ++++ ^ ^ + + 5. Flat is better than nested. + + Methods: + + __init__(linejunk=None, charjunk=None) + Construct a text differencer, with optional filters. + + compare(a, b) + Compare two sequences of lines; generate the resulting delta. + """ + + def __init__(self, linejunk=None, charjunk=None): + """ + Construct a text differencer, with optional filters. + + The two optional keyword parameters are for filter functions: + + - `linejunk`: A function that should accept a single string argument, + and return true iff the string is junk. The module-level function + `IS_LINE_JUNK` may be used to filter out lines without visible + characters, except for at most one splat ('#'). It is recommended + to leave linejunk None; as of Python 2.3, the underlying + SequenceMatcher class has grown an adaptive notion of "noise" lines + that's better than any static definition the author has ever been + able to craft. + + - `charjunk`: A function that should accept a string of length 1. The + module-level function `IS_CHARACTER_JUNK` may be used to filter out + whitespace characters (a blank or tab; **note**: bad idea to include + newline in this!). Use of IS_CHARACTER_JUNK is recommended. 
+ """ + + self.linejunk = linejunk + self.charjunk = charjunk + + def compare(self, a, b): + r""" + Compare two sequences of lines; generate the resulting delta. + + Each sequence must contain individual single-line strings ending with + newlines. Such sequences can be obtained from the `readlines()` method + of file-like objects. The delta generated also consists of newline- + terminated strings, ready to be printed as-is via the writeline() + method of a file-like object. + + Example: + + >>> print ''.join(Differ().compare('one\ntwo\nthree\n'.splitlines(1), + ... 'ore\ntree\nemu\n'.splitlines(1))), + - one + ? ^ + + ore + ? ^ + - two + - three + ? - + + tree + + emu + """ + + cruncher = SequenceMatcher(self.linejunk, a, b) + for tag, alo, ahi, blo, bhi in cruncher.get_opcodes(): + if tag == 'replace': + g = self._fancy_replace(a, alo, ahi, b, blo, bhi) + elif tag == 'delete': + g = self._dump('-', a, alo, ahi) + elif tag == 'insert': + g = self._dump('+', b, blo, bhi) + elif tag == 'equal': + g = self._dump(' ', a, alo, ahi) + else: + raise ValueError, 'unknown tag %r' % (tag,) + + for line in g: + yield line + + def _dump(self, tag, x, lo, hi): + """Generate comparison results for a same-tagged range.""" + for i in xrange(lo, hi): + yield '%s %s' % (tag, x[i]) + + def _plain_replace(self, a, alo, ahi, b, blo, bhi): + assert alo < ahi and blo < bhi + # dump the shorter block first -- reduces the burden on short-term + # memory if the blocks are of very different sizes + if bhi - blo < ahi - alo: + first = self._dump('+', b, blo, bhi) + second = self._dump('-', a, alo, ahi) + else: + first = self._dump('-', a, alo, ahi) + second = self._dump('+', b, blo, bhi) + + for g in first, second: + for line in g: + yield line + + def _fancy_replace(self, a, alo, ahi, b, blo, bhi): + r""" + When replacing one block of lines with another, search the blocks + for *similar* lines; the best-matching pair (if any) is used as a + synch point, and intraline difference marking is done on the + similar pair. Lots of work, but often worth it. + + Example: + + >>> d = Differ() + >>> results = d._fancy_replace(['abcDefghiJkl\n'], 0, 1, + ... ['abcdefGhijkl\n'], 0, 1) + >>> print ''.join(results), + - abcDefghiJkl + ? ^ ^ ^ + + abcdefGhijkl + ? ^ ^ ^ + """ + + # don't synch up unless the lines have a similarity score of at + # least cutoff; best_ratio tracks the best score seen so far + best_ratio, cutoff = 0.74, 0.75 + cruncher = SequenceMatcher(self.charjunk) + eqi, eqj = None, None # 1st indices of equal lines (if any) + + # search for the pair that matches best without being identical + # (identical lines must be junk lines, & we don't want to synch up + # on junk -- unless we have to) + for j in xrange(blo, bhi): + bj = b[j] + cruncher.set_seq2(bj) + for i in xrange(alo, ahi): + ai = a[i] + if ai == bj: + if eqi is None: + eqi, eqj = i, j + continue + cruncher.set_seq1(ai) + # computing similarity is expensive, so use the quick + # upper bounds first -- have seen this speed up messy + # compares by a factor of 3. 
+ # note that ratio() is only expensive to compute the first + # time it's called on a sequence pair; the expensive part + # of the computation is cached by cruncher + if cruncher.real_quick_ratio() > best_ratio and \ + cruncher.quick_ratio() > best_ratio and \ + cruncher.ratio() > best_ratio: + best_ratio, best_i, best_j = cruncher.ratio(), i, j + if best_ratio < cutoff: + # no non-identical "pretty close" pair + if eqi is None: + # no identical pair either -- treat it as a straight replace + for line in self._plain_replace(a, alo, ahi, b, blo, bhi): + yield line + return + # no close pair, but an identical pair -- synch up on that + best_i, best_j, best_ratio = eqi, eqj, 1.0 + else: + # there's a close pair, so forget the identical pair (if any) + eqi = None + + # a[best_i] very similar to b[best_j]; eqi is None iff they're not + # identical + + # pump out diffs from before the synch point + for line in self._fancy_helper(a, alo, best_i, b, blo, best_j): + yield line + + # do intraline marking on the synch pair + aelt, belt = a[best_i], b[best_j] + if eqi is None: + # pump out a '-', '?', '+', '?' quad for the synched lines + atags = btags = "" + cruncher.set_seqs(aelt, belt) + for tag, ai1, ai2, bj1, bj2 in cruncher.get_opcodes(): + la, lb = ai2 - ai1, bj2 - bj1 + if tag == 'replace': + atags += '^' * la + btags += '^' * lb + elif tag == 'delete': + atags += '-' * la + elif tag == 'insert': + btags += '+' * lb + elif tag == 'equal': + atags += ' ' * la + btags += ' ' * lb + else: + raise ValueError, 'unknown tag %r' % (tag,) + for line in self._qformat(aelt, belt, atags, btags): + yield line + else: + # the synch pair is identical + yield ' ' + aelt + + # pump out diffs from after the synch point + for line in self._fancy_helper(a, best_i+1, ahi, b, best_j+1, bhi): + yield line + + def _fancy_helper(self, a, alo, ahi, b, blo, bhi): + g = [] + if alo < ahi: + if blo < bhi: + g = self._fancy_replace(a, alo, ahi, b, blo, bhi) + else: + g = self._dump('-', a, alo, ahi) + elif blo < bhi: + g = self._dump('+', b, blo, bhi) + + for line in g: + yield line + + def _qformat(self, aline, bline, atags, btags): + r""" + Format "?" output and deal with leading tabs. + + Example: + + >>> d = Differ() + >>> results = d._qformat('\tabcDefghiJkl\n', '\t\tabcdefGhijkl\n', + ... ' ^ ^ ^ ', '+ ^ ^ ^ ') + >>> for line in results: print repr(line) + ... + '- \tabcDefghiJkl\n' + '? \t ^ ^ ^\n' + '+ \t\tabcdefGhijkl\n' + '? \t ^ ^ ^\n' + """ + + # Can hurt, but will probably help most of the time. + common = min(_count_leading(aline, "\t"), + _count_leading(bline, "\t")) + common = min(common, _count_leading(atags[:common], " ")) + atags = atags[common:].rstrip() + btags = btags[common:].rstrip() + + yield "- " + aline + if atags: + yield "? %s%s\n" % ("\t" * common, atags) + + yield "+ " + bline + if btags: + yield "? %s%s\n" % ("\t" * common, btags) + +# With respect to junk, an earlier version of ndiff simply refused to +# *start* a match with a junk element. The result was cases like this: +# before: private Thread currentThread; +# after: private volatile Thread currentThread; +# If you consider whitespace to be junk, the longest contiguous match +# not starting with junk is "e Thread currentThread". So ndiff reported +# that "e volatil" was inserted between the 't' and the 'e' in "private". +# While an accurate view, to people that's absurd. 
The current version +# looks for matching blocks that are entirely junk-free, then extends the +# longest one of those as far as possible but only with matching junk. +# So now "currentThread" is matched, then extended to suck up the +# preceding blank; then "private" is matched, and extended to suck up the +# following blank; then "Thread" is matched; and finally ndiff reports +# that "volatile " was inserted before "Thread". The only quibble +# remaining is that perhaps it was really the case that " volatile" +# was inserted after "private". I can live with that <wink>. + +import re + +def IS_LINE_JUNK(line, pat=re.compile(r"\s*#?\s*$").match): + r""" + Return 1 for ignorable line: iff `line` is blank or contains a single '#'. + + Examples: + + >>> IS_LINE_JUNK('\n') + True + >>> IS_LINE_JUNK(' # \n') + True + >>> IS_LINE_JUNK('hello\n') + False + """ + + return pat(line) is not None + +def IS_CHARACTER_JUNK(ch, ws=" \t"): + r""" + Return 1 for ignorable character: iff `ch` is a space or tab. + + Examples: + + >>> IS_CHARACTER_JUNK(' ') + True + >>> IS_CHARACTER_JUNK('\t') + True + >>> IS_CHARACTER_JUNK('\n') + False + >>> IS_CHARACTER_JUNK('x') + False + """ + + return ch in ws + + +def unified_diff(a, b, fromfile='', tofile='', fromfiledate='', + tofiledate='', n=3, lineterm='\n'): + r""" + Compare two sequences of lines; generate the delta as a unified diff. + + Unified diffs are a compact way of showing line changes and a few + lines of context. The number of context lines is set by 'n' which + defaults to three. + + By default, the diff control lines (those with ---, +++, or @@) are + created with a trailing newline. This is helpful so that inputs + created from file.readlines() result in diffs that are suitable for + file.writelines() since both the inputs and outputs have trailing + newlines. + + For inputs that do not have trailing newlines, set the lineterm + argument to "" so that the output will be uniformly newline free. + + The unidiff format normally has a header for filenames and modification + times. Any or all of these may be specified using strings for + 'fromfile', 'tofile', 'fromfiledate', and 'tofiledate'. The modification + times are normally expressed in the format returned by time.ctime(). + + Example: + + >>> for line in unified_diff('one two three four'.split(), + ... 'zero one tree four'.split(), 'Original', 'Current', + ... 'Sat Jan 26 23:30:50 1991', 'Fri Jun 06 10:20:52 2003', + ... lineterm=''): + ... print line + --- Original Sat Jan 26 23:30:50 1991 + +++ Current Fri Jun 06 10:20:52 2003 + @@ -1,4 +1,4 @@ + +zero + one + -two + -three + +tree + four + """ + + started = False + for group in SequenceMatcher(None,a,b).get_grouped_opcodes(n): + if not started: + yield '--- %s %s%s' % (fromfile, fromfiledate, lineterm) + yield '+++ %s %s%s' % (tofile, tofiledate, lineterm) + started = True + i1, i2, j1, j2 = group[0][1], group[-1][2], group[0][3], group[-1][4] + yield "@@ -%d,%d +%d,%d @@%s" % (i1+1, i2-i1, j1+1, j2-j1, lineterm) + for tag, i1, i2, j1, j2 in group: + if tag == 'equal': + for line in a[i1:i2]: + yield ' ' + line + continue + if tag == 'replace' or tag == 'delete': + for line in a[i1:i2]: + yield '-' + line + if tag == 'replace' or tag == 'insert': + for line in b[j1:j2]: + yield '+' + line + +# See http://www.unix.org/single_unix_specification/ +def context_diff(a, b, fromfile='', tofile='', + fromfiledate='', tofiledate='', n=3, lineterm='\n'): + r""" + Compare two sequences of lines; generate the delta as a context diff. 
+ + Context diffs are a compact way of showing line changes and a few + lines of context. The number of context lines is set by 'n' which + defaults to three. + + By default, the diff control lines (those with *** or ---) are + created with a trailing newline. This is helpful so that inputs + created from file.readlines() result in diffs that are suitable for + file.writelines() since both the inputs and outputs have trailing + newlines. + + For inputs that do not have trailing newlines, set the lineterm + argument to "" so that the output will be uniformly newline free. + + The context diff format normally has a header for filenames and + modification times. Any or all of these may be specified using + strings for 'fromfile', 'tofile', 'fromfiledate', and 'tofiledate'. + The modification times are normally expressed in the format returned + by time.ctime(). If not specified, the strings default to blanks. + + Example: + + >>> print ''.join(context_diff('one\ntwo\nthree\nfour\n'.splitlines(1), + ... 'zero\none\ntree\nfour\n'.splitlines(1), 'Original', 'Current', + ... 'Sat Jan 26 23:30:50 1991', 'Fri Jun 06 10:22:46 2003')), + *** Original Sat Jan 26 23:30:50 1991 + --- Current Fri Jun 06 10:22:46 2003 + *************** + *** 1,4 **** + one + ! two + ! three + four + --- 1,4 ---- + + zero + one + ! tree + four + """ + + started = False + prefixmap = {'insert':'+ ', 'delete':'- ', 'replace':'! ', 'equal':' '} + for group in SequenceMatcher(None,a,b).get_grouped_opcodes(n): + if not started: + yield '*** %s %s%s' % (fromfile, fromfiledate, lineterm) + yield '--- %s %s%s' % (tofile, tofiledate, lineterm) + started = True + + yield '***************%s' % (lineterm,) + if group[-1][2] - group[0][1] >= 2: + yield '*** %d,%d ****%s' % (group[0][1]+1, group[-1][2], lineterm) + else: + yield '*** %d ****%s' % (group[-1][2], lineterm) + visiblechanges = [e for e in group if e[0] in ('replace', 'delete')] + if visiblechanges: + for tag, i1, i2, _, _ in group: + if tag != 'insert': + for line in a[i1:i2]: + yield prefixmap[tag] + line + + if group[-1][4] - group[0][3] >= 2: + yield '--- %d,%d ----%s' % (group[0][3]+1, group[-1][4], lineterm) + else: + yield '--- %d ----%s' % (group[-1][4], lineterm) + visiblechanges = [e for e in group if e[0] in ('replace', 'insert')] + if visiblechanges: + for tag, _, _, j1, j2 in group: + if tag != 'delete': + for line in b[j1:j2]: + yield prefixmap[tag] + line + +def ndiff(a, b, linejunk=None, charjunk=IS_CHARACTER_JUNK): + r""" + Compare `a` and `b` (lists of strings); return a `Differ`-style delta. + + Optional keyword parameters `linejunk` and `charjunk` are for filter + functions (or None): + + - linejunk: A function that should accept a single string argument, and + return true iff the string is junk. The default is None, and is + recommended; as of Python 2.3, an adaptive notion of "noise" lines is + used that does a good job on its own. + + - charjunk: A function that should accept a string of length 1. The + default is module-level function IS_CHARACTER_JUNK, which filters out + whitespace characters (a blank or tab; note: bad idea to include newline + in this!). + + Tools/scripts/ndiff.py is a command-line front-end to this function. + + Example: + + >>> diff = ndiff('one\ntwo\nthree\n'.splitlines(1), + ... 'ore\ntree\nemu\n'.splitlines(1)) + >>> print ''.join(diff), + - one + ? ^ + + ore + ? ^ + - two + - three + ? 
- 
+ + tree
+ + emu
+ """
+ return Differ(linejunk, charjunk).compare(a, b)
+
+def _mdiff(fromlines, tolines, context=None, linejunk=None,
+ charjunk=IS_CHARACTER_JUNK):
+ """Returns generator yielding marked up from/to side by side differences.
+
+ Arguments:
+ fromlines -- list of text lines to be compared to tolines
+ tolines -- list of text lines to be compared to fromlines
+ context -- number of context lines to display on each side of difference,
+ if None, all from/to text lines will be generated.
+ linejunk -- passed on to ndiff (see ndiff documentation)
+ charjunk -- passed on to ndiff (see ndiff documentation)
+
+ This function returns an iterator which returns a tuple:
+ (from line tuple, to line tuple, boolean flag)
+
+ from/to line tuple -- (line num, line text)
+ line num -- integer or None (to indicate a context separation)
+ line text -- original line text with the following markers inserted:
+ '\0+' -- marks start of added text
+ '\0-' -- marks start of deleted text
+ '\0^' -- marks start of changed text
+ '\1' -- marks end of added/deleted/changed text
+
+ boolean flag -- None indicates context separation, True indicates
+ either "from" or "to" line contains a change, otherwise False.
+
+ This function/iterator was originally developed to generate side by side
+ file difference for making HTML pages (see HtmlDiff class for example
+ usage).
+
+ Note, this function utilizes the ndiff function to generate the side by
+ side difference markup. Optional ndiff arguments may be passed to this
+ function and they in turn will be passed to ndiff.
+ """
+ import re
+
+ # regular expression for finding intraline change indices
+ change_re = re.compile('(\++|\-+|\^+)')
+
+ # create the difference iterator to generate the differences
+ diff_lines_iterator = ndiff(fromlines,tolines,linejunk,charjunk)
+
+ def _make_line(lines, format_key, side, num_lines=[0,0]):
+ """Returns line of text with user's change markup and line formatting.
+
+ lines -- list of lines from the ndiff generator to produce a line of
+ text from. When producing the line of text to return, the
+ lines used are removed from this list.
+ format_key -- '+' return first line in list with "add" markup around
+ the entire line.
+ '-' return first line in list with "delete" markup around
+ the entire line.
+ '?' return first line in list with add/delete/change
+ intraline markup (indices obtained from second line)
+ None return first line in list with no markup
+ side -- index into the num_lines list (0=from,1=to)
+ num_lines -- from/to current line number. This is NOT intended to be a
+ passed parameter. It is present as a keyword argument to
+ maintain memory of the current line numbers between calls
+ of this function.
+
+ Note, this function is purposefully not defined at the module scope so
+ that data it needs from its parent function (within whose context it
+ is defined) does not need to be of module scope.
+ """
+ num_lines[side] += 1
+ # Handle case where no user markup is to be added, just return line of
+ # text with user's line format to allow for usage of the line number.
+ if format_key is None:
+ return (num_lines[side],lines.pop(0)[2:])
+ # Handle case of intraline changes
+ if format_key == '?':
+ text, markers = lines.pop(0), lines.pop(0)
+ # find intraline changes (store change type and indices in tuples)
+ sub_info = []
+ def record_sub_info(match_object,sub_info=sub_info):
+ sub_info.append([match_object.group(1)[0],match_object.span()])
+ return match_object.group(1)
+ change_re.sub(record_sub_info,markers)
+ # process each tuple inserting our special marks that won't be
+ # noticed by an xml/html escaper.
+ for key,(begin,end) in sub_info[::-1]:
+ text = text[0:begin]+'\0'+key+text[begin:end]+'\1'+text[end:]
+ text = text[2:]
+ # Handle case of add/delete entire line
+ else:
+ text = lines.pop(0)[2:]
+ # if line of text is just a newline, insert a space so there is
+ # something for the user to highlight and see.
+ if not text:
+ text = ' '
+ # insert marks that won't be noticed by an xml/html escaper.
+ text = '\0' + format_key + text + '\1'
+ # Return line of text, first allow user's line formatter to do its
+ # thing (such as adding the line number) then replace the special
+ # marks with the user's change markup.
+ return (num_lines[side],text)
+
+ def _line_iterator():
+ """Yields from/to lines of text with a change indication.
+
+ This function is an iterator. It itself pulls lines from a
+ differencing iterator, processes them and yields them. When it can
+ it yields both a "from" and a "to" line, otherwise it will yield one
+ or the other. In addition to yielding the lines of from/to text, a
+ boolean flag is yielded to indicate if the text line(s) have
+ differences in them.
+
+ Note, this function is purposefully not defined at the module scope so
+ that data it needs from its parent function (within whose context it
+ is defined) does not need to be of module scope.
+ """
+ lines = []
+ num_blanks_pending, num_blanks_to_yield = 0, 0
+ while True:
+ # Load up next 4 lines so we can look ahead, create strings which
+ # are a concatenation of the first character of each of the 4 lines
+ # so we can do some very readable comparisons.
+ while len(lines) < 4:
+ try:
+ lines.append(diff_lines_iterator.next())
+ except StopIteration:
+ lines.append('X')
+ s = ''.join([line[0] for line in lines])
+ if s.startswith('X'):
+ # When no more lines, pump out any remaining blank lines so the
+ # corresponding add/delete lines get a matching blank line so
+ # all line pairs get yielded at the next level.
+ num_blanks_to_yield = num_blanks_pending
+ elif s.startswith('-?+?'):
+ # simple intraline change
+ yield _make_line(lines,'?',0), _make_line(lines,'?',1), True
+ continue
+ elif s.startswith('--++'):
+ # in delete block, add block coming: we do NOT want to get
+ # caught up on blank lines yet, just process the delete line
+ num_blanks_pending -= 1
+ yield _make_line(lines,'-',0), None, True
+ continue
+ elif s.startswith(('--?+', '--+', '- ')):
+ # in delete block and see an intraline change or unchanged line
+ # coming: yield the delete line and then blanks
+ from_line,to_line = _make_line(lines,'-',0), None
+ num_blanks_to_yield,num_blanks_pending = num_blanks_pending-1,0
+ elif s.startswith('-+?'):
+ # intraline change
+ yield _make_line(lines,None,0), _make_line(lines,'?',1), True
+ continue
+ elif s.startswith('-?+'):
+ # intraline change
+ yield _make_line(lines,'?',0), _make_line(lines,None,1), True
+ continue
+ elif s.startswith('-'):
+ # delete FROM line
+ num_blanks_pending -= 1
+ yield _make_line(lines,'-',0), None, True
+ continue
+ elif s.startswith('+--'):
+ # in add block, delete block coming: we do NOT want to get
+ # caught up on blank lines yet, just process the add line
+ num_blanks_pending += 1
+ yield None, _make_line(lines,'+',1), True
+ continue
+ elif s.startswith(('+ ', '+-')):
+ # will be leaving an add block: yield blanks then add line
+ from_line, to_line = None, _make_line(lines,'+',1)
+ num_blanks_to_yield,num_blanks_pending = num_blanks_pending+1,0
+ elif s.startswith('+'):
+ # inside an add block, yield the add line
+ num_blanks_pending += 1
+ yield None, _make_line(lines,'+',1), True
+ continue
+ elif s.startswith(' '):
+ # unchanged text, yield it to both sides
+ yield _make_line(lines[:],None,0),_make_line(lines,None,1),False
+ continue
+ # Catch up on the blank lines so when we yield the next from/to
+ # pair, they are lined up.
+ while(num_blanks_to_yield < 0):
+ num_blanks_to_yield += 1
+ yield None,('','\n'),True
+ while(num_blanks_to_yield > 0):
+ num_blanks_to_yield -= 1
+ yield ('','\n'),None,True
+ if s.startswith('X'):
+ raise StopIteration
+ else:
+ yield from_line,to_line,True
+
+ def _line_pair_iterator():
+ """Yields from/to lines of text with a change indication.
+
+ This function is an iterator. It itself pulls lines from the line
+ iterator. Its difference from that iterator is that this function
+ always yields a pair of from/to text lines (with the change
+ indication). If necessary it will collect single from/to lines
+ until it has a matching from/to pair to yield.
+
+ Note, this function is purposefully not defined at the module scope so
+ that data it needs from its parent function (within whose context it
+ is defined) does not need to be of module scope.
+ """
+ line_iterator = _line_iterator()
+ fromlines,tolines=[],[]
+ while True:
+ # Collecting lines of text until we have a from/to pair
+ while (len(fromlines)==0 or len(tolines)==0):
+ from_line, to_line, found_diff = line_iterator.next()
+ if from_line is not None:
+ fromlines.append((from_line,found_diff))
+ if to_line is not None:
+ tolines.append((to_line,found_diff))
+ # Once we have a pair, remove them from the collection and yield it
+ from_line, from_diff = fromlines.pop(0)
+ to_line, to_diff = tolines.pop(0)
+ yield (from_line,to_line,from_diff or to_diff)
+
+ # Handle case where user does not want context differencing, just yield
+ # them up without doing anything else with them.
+ line_pair_iterator = _line_pair_iterator() + if context is None: + while True: + yield line_pair_iterator.next() + # Handle case where user wants context differencing. We must do some + # storage of lines until we know for sure that they are to be yielded. + else: + context += 1 + lines_to_write = 0 + while True: + # Store lines up until we find a difference, note use of a + # circular queue because we only need to keep around what + # we need for context. + index, contextLines = 0, [None]*(context) + found_diff = False + while(found_diff is False): + from_line, to_line, found_diff = line_pair_iterator.next() + i = index % context + contextLines[i] = (from_line, to_line, found_diff) + index += 1 + # Yield lines that we have collected so far, but first yield + # the user's separator. + if index > context: + yield None, None, None + lines_to_write = context + else: + lines_to_write = index + index = 0 + while(lines_to_write): + i = index % context + index += 1 + yield contextLines[i] + lines_to_write -= 1 + # Now yield the context lines after the change + lines_to_write = context-1 + while(lines_to_write): + from_line, to_line, found_diff = line_pair_iterator.next() + # If another change within the context, extend the context + if found_diff: + lines_to_write = context-1 + else: + lines_to_write -= 1 + yield from_line, to_line, found_diff + + +_file_template = """ +<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" + "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> + +<html> + +<head> + <meta http-equiv="Content-Type" + content="text/html; charset=ISO-8859-1" /> + <title></title> + <style type="text/css">%(styles)s + </style> +</head> + +<body> + %(table)s%(legend)s +</body> + +</html>""" + +_styles = """ + table.diff {font-family:Courier; border:medium;} + .diff_header {background-color:#e0e0e0} + td.diff_header {text-align:right} + .diff_next {background-color:#c0c0c0} + .diff_add {background-color:#aaffaa} + .diff_chg {background-color:#ffff77} + .diff_sub {background-color:#ffaaaa}""" + +_table_template = """ + <table class="diff" id="difflib_chg_%(prefix)s_top" + cellspacing="0" cellpadding="0" rules="groups" > + <colgroup></colgroup> <colgroup></colgroup> <colgroup></colgroup> + <colgroup></colgroup> <colgroup></colgroup> <colgroup></colgroup> + %(header_row)s + <tbody> +%(data_rows)s </tbody> + </table>""" + +_legend = """ + <table class="diff" summary="Legends"> + <tr> <th colspan="2"> Legends </th> </tr> + <tr> <td> <table border="" summary="Colors"> + <tr><th> Colors </th> </tr> + <tr><td class="diff_add"> Added </td></tr> + <tr><td class="diff_chg">Changed</td> </tr> + <tr><td class="diff_sub">Deleted</td> </tr> + </table></td> + <td> <table border="" summary="Links"> + <tr><th colspan="2"> Links </th> </tr> + <tr><td>(f)irst change</td> </tr> + <tr><td>(n)ext change</td> </tr> + <tr><td>(t)op</td> </tr> + </table></td> </tr> + </table>""" + +class HtmlDiff(object): + """For producing HTML side by side comparison with change highlights. + + This class can be used to create an HTML table (or a complete HTML file + containing the table) showing a side by side, line by line comparison + of text with inter-line and intra-line change highlights. The table can + be generated in either full or contextual difference mode. 
+
+ The following methods are provided for HTML generation:
+
+ make_table -- generates HTML for a single side by side table
+ make_file -- generates complete HTML file with a single side by side table
+
+ See tools/scripts/diff.py for an example usage of this class.
+ """
+
+ _file_template = _file_template
+ _styles = _styles
+ _table_template = _table_template
+ _legend = _legend
+ _default_prefix = 0
+
+ def __init__(self,tabsize=8,wrapcolumn=None,linejunk=None,
+ charjunk=IS_CHARACTER_JUNK):
+ """HtmlDiff instance initializer
+
+ Arguments:
+ tabsize -- tab stop spacing, defaults to 8.
+ wrapcolumn -- column number where lines are broken and wrapped,
+ defaults to None where lines are not wrapped.
+ linejunk,charjunk -- keyword arguments passed into ndiff() (used by
+ HtmlDiff() to generate the side by side HTML differences). See
+ ndiff() documentation for argument default values and descriptions.
+ """
+ self._tabsize = tabsize
+ self._wrapcolumn = wrapcolumn
+ self._linejunk = linejunk
+ self._charjunk = charjunk
+
+ def make_file(self,fromlines,tolines,fromdesc='',todesc='',context=False,
+ numlines=5):
+ """Returns HTML file of side by side comparison with change highlights
+
+ Arguments:
+ fromlines -- list of "from" lines
+ tolines -- list of "to" lines
+ fromdesc -- "from" file column header string
+ todesc -- "to" file column header string
+ context -- set to True for contextual differences (defaults to False
+ which shows full differences).
+ numlines -- number of context lines. When context is set True,
+ controls number of lines displayed before and after the change.
+ When context is False, controls the number of lines to place
+ the "next" link anchors before the next change (so click of
+ "next" link jumps to just before the change).
+ """
+
+ return self._file_template % dict(
+ styles = self._styles,
+ legend = self._legend,
+ table = self.make_table(fromlines,tolines,fromdesc,todesc,
+ context=context,numlines=numlines))
+
+ def _tab_newline_replace(self,fromlines,tolines):
+ """Returns from/to line lists with tabs expanded and newlines removed.
+
+ Instead of tab characters being replaced by the number of spaces
+ needed to fill in to the next tab stop, this function will fill
+ the space with tab characters. This is done so that the difference
+ algorithms can identify changes in a file when tabs are replaced by
+ spaces and vice versa. At the end of the HTML generation, the tab
+ characters will be replaced with a nonbreakable space.
+ """
+ def expand_tabs(line):
+ # hide real spaces
+ line = line.replace(' ','\0')
+ # expand tabs into spaces
+ line = line.expandtabs(self._tabsize)
+ # replace spaces from expanded tabs back into tab characters
+ # (we'll replace them with markup after we do differencing)
+ line = line.replace(' ','\t')
+ return line.replace('\0',' ').rstrip('\n')
+ fromlines = [expand_tabs(line) for line in fromlines]
+ tolines = [expand_tabs(line) for line in tolines]
+ return fromlines,tolines
+
+ def _split_line(self,data_list,line_num,text):
+ """Builds list of text lines by splitting text lines at wrap point
+
+ This function will determine if the input text line needs to be
+ wrapped (split) into separate lines. If so, the first wrap point
+ will be determined and the first line appended to the output
+ text line list. This function is used recursively to handle
+ the second part of the split line to further split it.
+ """ + # if blank line or context separator, just add it to the output list + if not line_num: + data_list.append((line_num,text)) + return + + # if line text doesn't need wrapping, just add it to the output list + size = len(text) + max = self._wrapcolumn + if (size <= max) or ((size -(text.count('\0')*3)) <= max): + data_list.append((line_num,text)) + return + + # scan text looking for the wrap point, keeping track if the wrap + # point is inside markers + i = 0 + n = 0 + mark = '' + while n < max and i < size: + if text[i] == '\0': + i += 1 + mark = text[i] + i += 1 + elif text[i] == '\1': + i += 1 + mark = '' + else: + i += 1 + n += 1 + + # wrap point is inside text, break it up into separate lines + line1 = text[:i] + line2 = text[i:] + + # if wrap point is inside markers, place end marker at end of first + # line and start marker at beginning of second line because each + # line will have its own table tag markup around it. + if mark: + line1 = line1 + '\1' + line2 = '\0' + mark + line2 + + # tack on first line onto the output list + data_list.append((line_num,line1)) + + # use this routine again to wrap the remaining text + self._split_line(data_list,'>',line2) + + def _line_wrapper(self,diffs): + """Returns iterator that splits (wraps) mdiff text lines""" + + # pull from/to data and flags from mdiff iterator + for fromdata,todata,flag in diffs: + # check for context separators and pass them through + if flag is None: + yield fromdata,todata,flag + continue + (fromline,fromtext),(toline,totext) = fromdata,todata + # for each from/to line split it at the wrap column to form + # list of text lines. + fromlist,tolist = [],[] + self._split_line(fromlist,fromline,fromtext) + self._split_line(tolist,toline,totext) + # yield from/to line in pairs inserting blank lines as + # necessary when one side has more wrapped lines + while fromlist or tolist: + if fromlist: + fromdata = fromlist.pop(0) + else: + fromdata = ('',' ') + if tolist: + todata = tolist.pop(0) + else: + todata = ('',' ') + yield fromdata,todata,flag + + def _collect_lines(self,diffs): + """Collects mdiff output into separate lists + + Before storing the mdiff from/to data into a list, it is converted + into a single line of text with HTML markup. 
+ """
+
+ fromlist,tolist,flaglist = [],[],[]
+ # pull from/to data and flags from mdiff style iterator
+ for fromdata,todata,flag in diffs:
+ try:
+ # store HTML markup of the lines into the lists
+ fromlist.append(self._format_line(0,flag,*fromdata))
+ tolist.append(self._format_line(1,flag,*todata))
+ except TypeError:
+ # exceptions occur for lines where context separators go
+ fromlist.append(None)
+ tolist.append(None)
+ flaglist.append(flag)
+ return fromlist,tolist,flaglist
+
+ def _format_line(self,side,flag,linenum,text):
+ """Returns HTML markup of "from" / "to" text lines
+
+ side -- 0 or 1 indicating "from" or "to" text
+ flag -- indicates if difference on line
+ linenum -- line number (used for line number column)
+ text -- line text to be marked up
+ """
+ try:
+ linenum = '%d' % linenum
+ id = ' id="%s%s"' % (self._prefix[side],linenum)
+ except TypeError:
+ # handle blank lines where linenum is '>' or ''
+ id = ''
+ # replace those things that would get confused with HTML symbols
+ text=text.replace("&","&amp;").replace(">","&gt;").replace("<","&lt;")
+
+ # make space non-breakable so they don't get compressed or line wrapped
+ text = text.replace(' ','&nbsp;').rstrip()
+
+ return '<td class="diff_header"%s>%s</td><td nowrap="nowrap">%s</td>' \
+ % (id,linenum,text)
+
+ def _make_prefix(self):
+ """Create unique anchor prefixes"""
+
+ # Generate a unique anchor prefix so multiple tables
+ # can exist on the same HTML page without conflicts.
+ fromprefix = "from%d_" % HtmlDiff._default_prefix
+ toprefix = "to%d_" % HtmlDiff._default_prefix
+ HtmlDiff._default_prefix += 1
+ # store prefixes so line format method has access
+ self._prefix = [fromprefix,toprefix]
+
+ def _convert_flags(self,fromlist,tolist,flaglist,context,numlines):
+ """Makes list of "next" links"""
+
+ # all anchor names will be generated using the unique "to" prefix
+ toprefix = self._prefix[1]
+
+ # process change flags, generating middle column of next anchors/links
+ next_id = ['']*len(flaglist)
+ next_href = ['']*len(flaglist)
+ num_chg, in_change = 0, False
+ last = 0
+ for i,flag in enumerate(flaglist):
+ if flag:
+ if not in_change:
+ in_change = True
+ last = i
+ # at the beginning of a change, drop an anchor a few lines
+ # (the context lines) before the change for the previous
+ # link
+ i = max([0,i-numlines])
+ next_id[i] = ' id="difflib_chg_%s_%d"' % (toprefix,num_chg)
+ # at the beginning of a change, drop a link to the next
+ # change
+ num_chg += 1
+ next_href[last] = '<a href="#difflib_chg_%s_%d">n</a>' % (
+ toprefix,num_chg)
+ else:
+ in_change = False
+ # check for cases where there is no content to avoid exceptions
+ if not flaglist:
+ flaglist = [False]
+ next_id = ['']
+ next_href = ['']
+ last = 0
+ if context:
+ fromlist = ['<td></td><td>&nbsp;No Differences Found&nbsp;</td>']
+ tolist = fromlist
+ else:
+ fromlist = tolist = ['<td></td><td>&nbsp;Empty File&nbsp;</td>']
+ # if not a change on first line, drop a link
+ if not flaglist[0]:
+ next_href[0] = '<a href="#difflib_chg_%s_0">f</a>' % toprefix
+ # redo the last link to link to the top
+ next_href[last] = '<a href="#difflib_chg_%s_top">t</a>' % (toprefix)
+
+ return fromlist,tolist,flaglist,next_href,next_id
+
+ def make_table(self,fromlines,tolines,fromdesc='',todesc='',context=False,
+ numlines=5):
+ """Returns HTML table of side by side comparison with change highlights
+
+ Arguments:
+ fromlines -- list of "from" lines
+ tolines -- list of "to" lines
+ fromdesc -- "from" file column header string
+ todesc -- "to" file column header string
+ context -- set to True for contextual differences (defaults to False
+ which shows full differences).
+ numlines -- number of context lines. When context is set True,
+ controls number of lines displayed before and after the change.
+ When context is False, controls the number of lines to place
+ the "next" link anchors before the next change (so click of
+ "next" link jumps to just before the change).
+ """
+
+ # make unique anchor prefixes so that multiple tables may exist
+ # on the same page without conflict.
+ self._make_prefix()
+
+ # change tabs to spaces before it gets more difficult after we insert
+ # markup
+ fromlines,tolines = self._tab_newline_replace(fromlines,tolines)
+
+ # create diffs iterator which generates side by side from/to data
+ if context:
+ context_lines = numlines
+ else:
+ context_lines = None
+ diffs = _mdiff(fromlines,tolines,context_lines,linejunk=self._linejunk,
+ charjunk=self._charjunk)
+
+ # set up iterator to wrap lines that exceed desired width
+ if self._wrapcolumn:
+ diffs = self._line_wrapper(diffs)
+
+ # collect up from/to lines and flags into lists (also format the lines)
+ fromlist,tolist,flaglist = self._collect_lines(diffs)
+
+ # process change flags, generating middle column of next anchors/links
+ fromlist,tolist,flaglist,next_href,next_id = self._convert_flags(
+ fromlist,tolist,flaglist,context,numlines)
+
+ import cStringIO
+ s = cStringIO.StringIO()
+ fmt = ' <tr><td class="diff_next"%s>%s</td>%s' + \
+ '<td class="diff_next">%s</td>%s</tr>\n'
+ for i in range(len(flaglist)):
+ if flaglist[i] is None:
+ # mdiff yields None on separator lines; skip the bogus ones
+ # generated for the first line
+ if i > 0:
+ s.write(' </tbody> \n <tbody>\n')
+ else:
+ s.write( fmt % (next_id[i],next_href[i],fromlist[i],
+ next_href[i],tolist[i]))
+ if fromdesc or todesc:
+ header_row = '<thead><tr>%s%s%s%s</tr></thead>' % (
+ '<th class="diff_next"><br /></th>',
+ '<th colspan="2" class="diff_header">%s</th>' % fromdesc,
+ '<th class="diff_next"><br /></th>',
+ '<th colspan="2" class="diff_header">%s</th>' % todesc)
+ else:
+ header_row = ''
+
+ table = self._table_template % dict(
+ data_rows=s.getvalue(),
+ header_row=header_row,
+ prefix=self._prefix[1])
+
+ return table.replace('\0+','<span class="diff_add">'). \
+ replace('\0-','<span class="diff_sub">'). \
+ replace('\0^','<span class="diff_chg">'). \
+ replace('\1','</span>'). \
+ replace('\t','&nbsp;')
+
+del re
+
+def restore(delta, which):
+ r"""
+ Generate one of the two sequences that generated a delta.
+
+ Given a `delta` produced by `Differ.compare()` or `ndiff()`, extract
+ lines originating from file 1 or 2 (parameter `which`), stripping off line
+ prefixes.
+
+ Examples:
+
+ >>> diff = ndiff('one\ntwo\nthree\n'.splitlines(1),
+ ... 'ore\ntree\nemu\n'.splitlines(1))
+ >>> diff = list(diff)
+ >>> print ''.join(restore(diff, 1)),
+ one
+ two
+ three
+ >>> print ''.join(restore(diff, 2)),
+ ore
+ tree
+ emu
+ """
+ try:
+ tag = {1: "- ", 2: "+ "}[int(which)]
+ except KeyError:
+ raise ValueError, ('unknown delta choice (must be 1 or 2): %r'
+ % which)
+ prefixes = (" ", tag)
+ for line in delta:
+ if line[:2] in prefixes:
+ yield line[2:]
+
+def _test():
+ import doctest, difflib
+ return doctest.testmod(difflib)
+
+if __name__ == "__main__":
+ _test()
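With the module in place, a quick end-to-end exercise helps confirm the bundled copy behaves like the stdlib version. This is a minimal, illustrative sketch only (not part of the changeset); the page names and sample strings are made up:

    from MoinMoin.support import difflib
    import sys

    old = 'one\ntwo\nthree\n'.splitlines(1)   # keep line ends, as the docstrings require
    new = 'one\ntoo\nthree\nfour\n'.splitlines(1)

    # cheap upper bounds first, exact ratio last -- the same ordering
    # get_close_matches() uses to skip hopeless candidates early
    sm = difflib.SequenceMatcher(None, ''.join(old), ''.join(new))
    print sm.real_quick_ratio(), sm.quick_ratio(), sm.ratio()

    # unified_diff() yields newline-terminated lines, so writelines() works as-is
    sys.stdout.writelines(difflib.unified_diff(old, new, 'a/page', 'b/page'))

    # the side-by-side HTML table with intraline highlights and line wrapping
    print difflib.HtmlDiff(wrapcolumn=60).make_table(old, new, 'old', 'new')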
--- a/MoinMoin/theme/__init__.py Fri Jun 30 21:34:45 2006 +0200 +++ b/MoinMoin/theme/__init__.py Fri Jun 30 21:35:59 2006 +0200 @@ -222,7 +222,7 @@ curpage += s content.append("<li>%s</li>" % Page(self.request, curpage).link_to(self.request, s)) curpage += '/' - content.append(('<li><a class="backlink" title="%(title)s" href="%(href)s">%(text)s</a></li>') % { + content.append(('<li><a class="backlink" title="%(title)s" rel="nofollow" href="%(href)s">%(text)s</a></li>') % { 'title': _('Click to do a full-text search for this title'), 'href': d['title_link'], 'text': wikiutil.escape(segments[-1]), @@ -264,15 +264,15 @@ userlinks.append(homelink) # link to userprefs action userlinks.append(d['page'].link_to(request, text=_('Preferences'), - querystr={'action': 'userprefs'}, id="userprefs")) + querystr={'action': 'userprefs'}, id='userprefs', rel='nofollow')) if request.cfg.show_login: if request.user.valid: userlinks.append(d['page'].link_to(request, text=_('Logout', formatted=False), - querystr={'action': 'logout', 'logout': 'logout'}, id="logout")) + querystr={'action': 'logout', 'logout': 'logout'}, id='logout', rel='nofollow')) else: userlinks.append(d['page'].link_to(request, text=_("Login", formatted=False), - querystr={'action': 'login'}, id="login")) + querystr={'action': 'login'}, id='login', rel='nofollow')) userlinks = [u'<li>%s</li>' % link for link in userlinks] html = u'<ul id="username">%s</ul>' % ''.join(userlinks) @@ -1124,16 +1124,16 @@ if self.showBothEditLinks() and guiworks: text = _('Edit (Text)', formatted=False) params = params + 'text' - attrs = {'name': "texteditlink"} + attrs = {'name': 'texteditlink', 'rel': 'nofollow', } else: text = _('Edit', formatted=False) if guiworks: # 'textonly' will be upgraded dynamically to 'guipossible' by JS params = params + 'textonly' - attrs = {'name': "editlink"} + attrs = {'name': 'editlink', 'rel': 'nofollow', } else: params = params + 'text' - attrs = {'name': "texteditlink"} + attrs = {'name': 'texteditlink', 'rel': 'nofollow', } return wikiutil.link_tag(self.request, params, text, **attrs) @@ -1177,7 +1177,7 @@ _ = self.request.getText return page.link_to(self.request, text=_('Info', formatted=False), - querystr='action=info') + querystr='action=info', rel='nofollow') def subscribeLink(self, page): """ Return subscribe/unsubscribe link to valid users @@ -1194,7 +1194,7 @@ else: text = _("Subscribe", formatted=False) params = wikiutil.quoteWikinameURL(page.page_name) + '?action=subscribe' - return wikiutil.link_tag(self.request, params, text) + return wikiutil.link_tag(self.request, params, text, self.request.formatter, rel='nofollow') def quicklinkLink(self, page): """ Return add/remove quicklink link @@ -1211,14 +1211,14 @@ else: text = _("Add Link", formatted=False) params = wikiutil.quoteWikinameURL(page.page_name) + '?action=quicklink' - return wikiutil.link_tag(self.request, params, text) + return wikiutil.link_tag(self.request, params, text, self.request.formatter, rel='nofollow') def attachmentsLink(self, page): """ Return link to page attachments """ _ = self.request.getText return page.link_to(self.request, text=_('Attachments', formatted=False), - querystr='action=AttachFile') + querystr='action=AttachFile', rel='nofollow') def startPage(self): """ Start page div with page language and direction @@ -1362,7 +1362,8 @@ days.append( wikiutil.link_tag(self.request, '%s?max_days=%d' % (d['q_page_name'], day), - str(day))) + str(day), + self.request.formatter, rel='nofollow')) days = ' | '.join(days) html += (_("Show %s 
days.") % (days,))
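The pattern in the hunks above (and in the classic-theme and userform hunks that follow) is always the same: an extra rel='nofollow' keyword is handed to link_to() or link_tag() and ends up as an attribute of the generated anchor. A simplified, hypothetical model of that keyword forwarding -- MoinMoin's real link_tag() additionally routes through the formatter, as the call sites show:

    # Hypothetical sketch only; it merely models how **attrs keywords such
    # as rel='nofollow' can be serialized into anchor markup.
    def link_tag(script_root, params, text, formatter=None, **attrs):
        extra = ''.join([' %s="%s"' % (k, v) for k, v in sorted(attrs.items())])
        return '<a href="%s/%s"%s>%s</a>' % (script_root, params, extra, text)

    print link_tag('/wiki', 'FrontPage?action=info', 'Info', rel='nofollow')
    # -> <a href="/wiki/FrontPage?action=info" rel="nofollow">Info</a>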
--- a/MoinMoin/theme/classic.py Fri Jun 30 21:34:45 2006 +0200 +++ b/MoinMoin/theme/classic.py Fri Jun 30 21:35:59 2006 +0200 @@ -195,12 +195,12 @@ # Use translated version if available title = _(title, formatted=False) params = '%s?action=%s' % (d['q_page_name'], action) - link = wikiutil.link_tag(request, params, title) + link = wikiutil.link_tag(request, params, title, request.formatter, rel='nofollow') html.append(link) title = _("DeleteCache", formatted=False) params = '%s?action=%s' % (d['page_name'], 'refresh') - link = wikiutil.link_tag(request, params, title) + link = wikiutil.link_tag(request, params, title, request.formatter, rel='nofollow') cache = caching.CacheEntry(request, page, page.getFormatterName(), scope='item') date = request.user.getFormattedDateTime(cache.mtime())
--- a/MoinMoin/userform.py Fri Jun 30 21:34:45 2006 +0200 +++ b/MoinMoin/userform.py Fri Jun 30 21:35:59 2006 +0200 @@ -637,7 +637,7 @@ sn = request.getScriptname() pi = request.getPathinfo() action = u"%s%s" % (sn, pi) - userprefslink = wikiutil.getSysPage(request, "UserPreferences").link_to(request) + userprefslink = wikiutil.getSysPage(request, "UserPreferences").link_to(request, rel='nofollow') hint = _("To create an account or recover a lost password, see the %(userprefslink)s page.") % { 'userprefslink': userprefslink} self._form = html.FORM(action=action) @@ -714,7 +714,8 @@ querystr= {"action":"userform", "email": account.email, "account_sendmail": "1", - "sysadm": "users",}) + "sysadm": "users",}, + rel='nofollow') )) if data:
--- a/MoinMoin/util/diff.py Fri Jun 30 21:34:45 2006 +0200 +++ b/MoinMoin/util/diff.py Fri Jun 30 21:35:59 2006 +0200 @@ -7,7 +7,7 @@ @license: GNU GPL, see COPYING for details. """ -import difflib +from MoinMoin.support import difflib from MoinMoin.wikiutil import escape def indent(line):
--- a/MoinMoin/wikiutil.py Fri Jun 30 21:34:45 2006 +0200
+++ b/MoinMoin/wikiutil.py Fri Jun 30 21:35:59 2006 +0200
@@ -6,9 +6,10 @@
     @license: GNU GPL, see COPYING for details.
 """
 
-import os, re, difflib, urllib, cgi
+import os, re, urllib, cgi
 import codecs, types
 
+from MoinMoin.support import difflib
 from MoinMoin import util, version, config
 from MoinMoin.util import pysupport, filesys
--- a/docs/CHANGES Fri Jun 30 21:34:45 2006 +0200
+++ b/docs/CHANGES Fri Jun 30 21:35:59 2006 +0200
@@ -28,6 +28,9 @@
     and improving it and after having made a backup with some other, proven
     method. USE BOTH ON YOUR OWN RISK!
 
+Version 1.5.4-current:
+    * increased maxlength of some input fields from 80 to 200
+
 Branch moin-1.6-xapian:
   New Features:
     * Added Xapian (see http://xapian.org/) based indexed search code.
@@ -143,6 +146,18 @@
     * cfg.log_reverse_dns_lookups [default: True] - you can set this to False
       if rev. dns lookups are broken in your network (leading to long delays
       on page saves). With False, edit-log will only contain IP, not hostname.
+    * ?action=sitemap emits a google sitemap (XML), listing all your wiki pages
+      and the wiki root URL.
+      Page                      Priority / Frequency / Last modification
+      --------------------------------------------------------------------
+      /                         1.0 / hourly / <now>
+      cfg.page_front_page       1.0 / hourly / page last edit
+      TitleIndex,RecentChanges  0.9 / hourly / <now>
+      content pages             0.5 / daily  / page last edit
+      system/help pages         0.1 / yearly / page last edit
+    * We use rel="nofollow" for some action links in the hope that some search
+      engines don't fetch the targets (if they do, they will just get 403 and
+      cause unnecessary traffic).
 
   Bugfixes:
     * on action "info" page, "revert" link will not be displayed for empty page
@@ -157,6 +172,10 @@
     * Fixed the output of macro and "attachment:" usages of the rst parser.
     * Removed Twisted request object reverse DNS lookup
     * cfg.editor_quickhelp was not parsed with the wiki parser when customized
+    * fixed MoinMoin:MoinMoinBugs/GuiEditorDeletesNewBulletText
+    * fixed MoinMoin:MoinMoinBugs/HtmlTextConvertTables
+    * updated ImageLink macro
+    * Added a (less broken) MoinMoin.support.difflib, details see there.
 
   Other changes:
     * we use (again) the same browser compatibility check as FCKeditor uses
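The sitemap table above maps each class of page to a fixed priority/frequency pair, so the whole policy fits in one small lookup. The sketch below is not the shipped action code, just a hedged restatement of the table; sitemap_entry and is_system_or_help_page are hypothetical names introduced for illustration:

    def sitemap_entry(page_name, cfg):
        # priority/changefreq values taken from the table in the changelog entry above
        if page_name == cfg.page_front_page:
            return 1.0, 'hourly'                # front page, same weight as the wiki root URL
        if page_name in ('TitleIndex', 'RecentChanges'):
            return 0.9, 'hourly'                # index pages, always fresh
        if is_system_or_help_page(page_name):   # hypothetical helper
            return 0.1, 'yearly'                # system/help pages rarely change
        return 0.5, 'daily'                     # ordinary content pages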
--- a/docs/CHANGES.fpletz Fri Jun 30 21:34:45 2006 +0200
+++ b/docs/CHANGES.fpletz Fri Jun 30 21:35:59 2006 +0200
@@ -2,15 +2,23 @@
 =============================
 
   Known main issues:
-    * ...
+    * Regex searching with Xapian?
 
   ToDo:
-    * Manually parse prefixes (e.g. title:) in MoinMoin.Xapian.Index
-      right before searching
+    * HACK: MoinMoin.Xapian.Index._get_languages (wait for proper
+      metadata)
     * Mockup the new search UI
+    * Write/update documentation for all the new search stuff
+    * Indexing and searching of categories (new term prefix)
+    * MoinMoin.Xapian.use_stemming -> request.cfg.xapian_use_stemming
 
   New Features:
-    * TBD
+    * Faster search thanks to Xapian
+    * Searching for languages with new prefix lang/language, i.e. lang:de
+      Note: Only available when Xapian is activated
+    * New config options:
+        xapian_search        0      enables xapian-powered search
+        xapian_index_dir     None   directory for xapian indices
 
   Bugfixes (only stuff that is buggy in moin/1.6 main branch):
     * ...
@@ -32,4 +40,43 @@
       be no issue with OrExpression as _moinSearch handles this
      correctly.
 
 2006-06-11
+    * Now handling prefixes correctly (title -> S, XLINKTO always with ':')
+
+2006-06-15
+    * Integrated basic stemming, english only for now (see issues).
+    * Introduced LanguageSearch (new prefix lang/language)
+    * Searching now works with stemmed terms but matching is limited due
+      to usage of _moinSearch
+
+2006-06-16
+    * Indexing & searching now works without a stemmer installed (small
+      bugfixes)
+
+2006-06-17
+    * Tackled some of the issues with matching stemmed words. Need some
+      advice on how to detect and match them reliably using the current
+      framework
+
+2006-06-19
+    * Introducing xapian_index_dir as a global directory for multiple
+      xapian indices i.e. for wikifarms.
+
+      Layout:
+      xapian_index_dir/
+          siteid1/
+              complete
+              index/
+              index-lock/
+              update-queue-lock/
+          siteid2/
+              complete
+              index/
+              index-lock/
+              update-queue-lock/
+          ...
+
+      Possible extension: Xapian can handle multiple databases, maybe
+      allow searching across defined wikis on a wikifarm
+    * All stemming/matching issues resolved (hopefully)
+    * Works now without xapian installed (enhance error reporting)
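Going by the entry above, switching the Xapian-powered search on in a farm config would look roughly like this. Only the two option names and their defaults come from the changelog; the FarmConfig subclassing is the usual MoinMoin farm pattern, and the index directory path is hypothetical:

    # wikiconfig.py -- sketch under the assumptions stated above
    from farmconfig import FarmConfig

    class Config(FarmConfig):
        xapian_search = 1                      # default 0: xapian-powered search off
        xapian_index_dir = '/var/moin/xapian'  # default None; one subdir per siteid (see layout above)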
--- a/setup.py Fri Jun 30 21:34:45 2006 +0200
+++ b/setup.py Fri Jun 30 21:35:59 2006 +0200
@@ -201,6 +201,7 @@
     'packages': [
         'MoinMoin',
         'MoinMoin.action',
+        'MoinMoin.auth',
         'MoinMoin.converter',
         'MoinMoin.filter',
         'MoinMoin.formatter',
@@ -209,6 +210,7 @@
         'MoinMoin.i18n.tools',
         'MoinMoin.logfile',
         'MoinMoin.macro',
+        'MoinMoin.mail',
         'MoinMoin.parser',
         'MoinMoin.request',
         'MoinMoin.script',
--- a/wiki/config/more_samples/ldap_smb_farmconfig.py Fri Jun 30 21:34:45 2006 +0200
+++ b/wiki/config/more_samples/ldap_smb_farmconfig.py Fri Jun 30 21:35:59 2006 +0200
@@ -93,6 +93,7 @@
 
     ldap_base = 'ou=SOMEUNIT,dc=example,dc=org' # base DN we use for searching
     ldap_scope = ldap.SCOPE_SUBTREE # scope of the search we do
+    ldap_filter = "(%(ldap_name_attribute)s=%(username)s)" # available: ldap_name_attribute (see below) and username
    ldap_name_attribute = 'sAMAccountName' # ldap attribute we get the user name from
     ldap_email_attribute = 'mail' # ldap attribute we get the email address from
     ldap_coding = 'utf-8' # coding used for ldap queries and result values
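The new ldap_filter is a plain Python %-dict template; as its trailing comment suggests, it is presumably expanded with the configured name attribute and the login name before the LDAP search runs. A quick illustration of that expansion with made-up values ('jdoe' is a hypothetical login name):

    # expansion of the filter template only, not the surrounding auth code
    ldap_filter = "(%(ldap_name_attribute)s=%(username)s)"
    print ldap_filter % {'ldap_name_attribute': 'sAMAccountName',
                         'username': 'jdoe'}
    # prints: (sAMAccountName=jdoe)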