changeset 3824:aef3adf3f701

MoinMoinBugs/DoNotConvertUnicodeToUTF8ForXsltParser Initial patch applied and slightly improved.
author Thomas Pfaff <yael [at] gmx dot net>
date Fri, 11 Jul 2008 10:28:38 +0200
parents 262738c1f46b
children d125f175fc2e
files MoinMoin/_tests/test_parser_unicode.py MoinMoin/parser/text_xslt.py
diffstat 2 files changed, 74 insertions(+), 4 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/MoinMoin/_tests/test_parser_unicode.py	Fri Jul 11 10:28:38 2008 +0200
@@ -0,0 +1,69 @@
+# -*- coding: utf-8 -*-
+"""
+    MoinMoin - Test if MoinMoin.parser.* do write UNICODE objects
+
+    Comment:
+    The test produces an exception in another place, if some routine encodes unicode to UTF8 too early.
+    The corresponding bug was found initially in MoinMoin.parser.text_xslt and the test was designed for
+    this problem, but the test actually examines all available parsers.
+
+    @copyright: 2007,2008 by Raphael Bossek <raphael.bossek@gmail.com>
+    @license: GNU GPL, see COPYING for details.
+"""
+
+import py
+import sys, traceback
+import MoinMoin.parser
+from MoinMoin.Page import Page
+
+class TestParserOutput(object):
+    """Parser has to generate unicode output.
+    """
+    def test_ParserOutput(self):
+        """The method aims generally at MoinMoin.parser.text_xslt
+        This parser should encode Unicode input to UTF8 as late as possible
+        """
+        request = self.request
+        assert not request.cfg.allow_xslt, u'allow_xslt should be disabled'
+        errmsg = []
+
+        # Some examples to verify with additional stuff
+        parser_raw_input = {
+            u'text_html': u'<html><body><h1>%s</h1></body></html>',
+            u'text_irssi': u"[12:01] <RaphaelBosek> %s",
+            u'text_moin_wiki': u'||<#fefefe> %s ||',
+            u'text_python': u'if True: print "%s"',
+            u'text_xslt': u'<?xml version="1.0" encoding="ISO-8859-1"?><!-- %s -->',
+        }
+
+        # Blacklist for parsers that don't work
+        # This list should be empty !
+        parser_blacklist = []
+
+        # Create a page if it doesn't exist already.
+        if not u'page' in request.formatter.__dict__ or not request.formatter.page:
+            request.formatter.page = Page(request, u'test_parser_unicode_page')
+            # this temporarily fixes an error with page-names, should be fixed at a central place some time
+            request.page = Page(request, u'test_parser_unicode_page')
+
+        # Check all parsers for UNICODE output.
+        for parsername in MoinMoin.parser.modules:
+            if parsername in parser_blacklist:
+                continue
+
+            module = __import__(u'MoinMoin.parser', globals(), {}, [parsername])
+            parsermodule = getattr(module, parsername)
+            if u'Parser' in parsermodule.__dict__:
+                # Get the parser_input or use a simple fallback if the parser is not found in parser_raw_input
+                i = parser_raw_input.get(parsername, u'%s') % u'\xC3\x84\xC3\x96\xC3\x9C\xC3\xE2\x82\xAC\x27'
+                p = parsermodule.Parser(i, request)
+
+                # This is the actual request that would produce an exception, which would usually look like the following:
+                # >  UnicodeDecodeError: 'ascii' codec can't decode byte 0xc3 in position 0: ordinal not in range(128)
+                # usually occurring in python/lib/StringIO.py:270
+                r = request.redirectedOutput(p.format, request.formatter)
+
+                # This assertion will only be triggered, if the parser does not write unicode at all
+                assert isinstance(r, unicode), u'MoinMoin.parser.%s does not write UNICODE data but %s' % (parsername, type(r), )
+
+coverage_modules = ['MoinMoin.parser']
--- a/MoinMoin/parser/text_xslt.py	Thu Jul 10 16:13:24 2008 +0200
+++ b/MoinMoin/parser/text_xslt.py	Fri Jul 11 10:28:38 2008 +0200
@@ -28,7 +28,7 @@
     Dependencies = Dependencies
 
     def __init__(self, raw, request, **kw):
-        self.raw = raw.encode(config.charset)
+        self.raw = raw
         self.request = request
         self.form = request.form
         self._ = request.getText
@@ -87,14 +87,15 @@
                 input_factory = InputSource.InputSourceFactory(resolver=wiki_resolver)
 
                 page_uri = self.base_uri + wikiutil.url_quote(formatter.page.page_name)
-                raw = self.raw.strip()
-
+                # 4Suite needs an utf-8 encoded byte string instead of an unicode object
+                raw = self.raw.strip().encode('utf-8')
                 self.processor = Processor()
                 self.append_stylesheet() # hook, for extending this parser
                 self.processor.run(
                     input_factory.fromString(raw, uri=page_uri),
                     outputStream=out_file)
-                result = out_file.getvalue()
+                # Convert utf-8 encoded byte string into unicode
+                result = out_file.getvalue().decode('utf-8')
                 result = self.parse_result(result) # hook, for extending this parser
 
             except FtException, msg: