comparison MoinMoin/parser/_creole.py @ 5440:6c0e03fcc067

update the wikicreole parser to version 1.1
author Radomir Dopieralski <moindev@sheep.art.pl>
date Sat, 16 Jan 2010 23:43:38 +0100
parents 634ed3db8df4
children
comparison
equal deleted inserted replaced
5439:282276752499 5440:6c0e03fcc067
21 @license: GNU GPL, see COPYING for details. 21 @license: GNU GPL, see COPYING for details.
22 @license: BSD, see COPYING for details. 22 @license: BSD, see COPYING for details.
23 """ 23 """
24 24
25 import re 25 import re
26 26 import sys
27 # Whether the parser should convert \n into <br>. 27
28 bloglike_lines = False 28 __version__ = '1.1'
29
29 30
30 class Rules: 31 class Rules:
31 """Hold all the rules for generating regular expressions.""" 32 """Hold all the rules for generating regular expressions."""
32 33
33 # For the inline elements: 34 # For the inline elements:
34 proto = r'http|https|ftp|nntp|news|mailto|telnet|file|irc' 35 proto = r'http|https|ftp|nntp|news|mailto|telnet|file|irc'
35 url = r'''(?P<url>
36 (^ | (?<=\s | [.,:;!?()/=]))
37 (?P<escaped_url>~)?
38 (?P<url_target> (?P<url_proto> %s ):\S+? )
39 ($ | (?=\s | [,.:;!?()] (\s | $)))
40 )''' % proto
41 link = r'''(?P<link> 36 link = r'''(?P<link>
42 \[\[ 37 \[\[
43 (?P<link_target>.+?) \s* 38 (?P<link_target>.+?) \s*
44 ([|] \s* (?P<link_text>.+?) \s*)? 39 ([|] \s* (?P<link_text>.+?) \s*)?
45 ]] 40 ]]
74 (?P<head_head>=+) \s* 69 (?P<head_head>=+) \s*
75 (?P<head_text> .*? ) \s* 70 (?P<head_text> .*? ) \s*
76 (?P<head_tail>=*) \s* 71 (?P<head_tail>=*) \s*
77 $ 72 $
78 )''' 73 )'''
79 if bloglike_lines: 74 text = r'(?P<text> .+ )'
80 text = r'(?P<text> .+ ) (?P<break> (?<!\\)$\n(?!\s*$) )?'
81 else:
82 text = r'(?P<text> .+ )'
83 list = r'''(?P<list> 75 list = r'''(?P<list>
84 ^ [ \t]* ([*][^*\#]|[\#][^\#*]).* $ 76 ^ [ \t]* ([*][^*\#]|[\#][^\#*]).* $
85 ( \n[ \t]* [*\#]+.* $ )* 77 ( \n[ \t]* [*\#]+.* $ )*
86 )''' # Matches the whole list, separate items are parsed later. The 78 )''' # Matches the whole list, separate items are parsed later. The
87 # list *must* start with a single bullet. 79 # list *must* start with a single bullet.
116 (?P<head> [=][^|]+ ) | 108 (?P<head> [=][^|]+ ) |
117 (?P<cell> ( %s | [^|])+ ) 109 (?P<cell> ( %s | [^|])+ )
118 ) \s* 110 ) \s*
119 ''' % '|'.join([link, macro, image, code]) 111 ''' % '|'.join([link, macro, image, code])
120 112
113 def __init__(self, bloglike_lines=False, url_protocols=None,
114 wiki_words=False):
115 c = re.compile
116 # For pre escaping, in creole 1.0 done with ~:
117 self.pre_escape_re = c(self.pre_escape, re.M | re.X)
118 # for link descriptions
119 self.link_re = c('|'.join([self.image, self.linebreak,
120 self.char]), re.X | re.U)
121 # for list items
122 self.item_re = c(self.item, re.X | re.U | re.M)
123 # for table cells
124 self.cell_re = c(self.cell, re.X | re.U)
125
126 # For block elements:
127 if bloglike_lines:
128 self.text = r'(?P<text> .+ ) (?P<break> (?<!\\)$\n(?!\s*$) )?'
129 self.block_re = c('|'.join([self.line, self.head, self.separator,
130 self.pre, self.list, self.table,
131 self.text]), re.X | re.U | re.M)
132
133 # For inline elements:
134 if url_protocols is not None:
135 self.proto = '|'.join(re.escape(p) for p in url_protocols)
136 self.url = r'''(?P<url>
137 (^ | (?<=\s | [.,:;!?()/=]))
138 (?P<escaped_url>~)?
139 (?P<url_target> (?P<url_proto> %s ):\S+? )
140 ($ | (?=\s | [,.:;!?()] (\s | $))))''' % self.proto
141 inline_elements = [self.link, self.url, self.macro,
142 self.code, self.image, self.strong,
143 self.emph, self.linebreak,
144 self.escape, self.char]
145 if wiki_words:
146 import unicodedata
147 up_case = u''.join(unichr(i) for i in xrange(sys.maxunicode)
148 if unicodedata.category(unichr(i))=='Lu')
149 self.wiki = ur'''(?P<wiki>[%s]\w+[%s]\w+)''' % (up_case, up_case)
150 inline_elements.insert(3, self.wiki)
151 self.inline_re = c('|'.join(inline_elements), re.X | re.U)
152
121 class Parser: 153 class Parser:
122 """ 154 """
123 Parse the raw text and create a document object 155 Parse the raw text and create a document object
124 that can be converted into output using Emitter. 156 that can be converted into output using Emitter.
157
158 A separate instance should be created for parsing a new document.
159 The first parameter is the raw text to be parsed. An optional second
160 argument is the Rules object to use. You can customize the parsing
161 rules to enable optional features or extend the parser.
125 """ 162 """
126 163
127 # For pre escaping, in creole 1.0 done with ~: 164 def __init__(self, raw, rules=None):
128 pre_escape_re = re.compile(Rules.pre_escape, re.M | re.X) 165 self.rules = rules or Rules()
129 link_re = re.compile('|'.join([Rules.image, Rules.linebreak, Rules.char]), re.X | re.U) # for link descriptions
130 item_re = re.compile(Rules.item, re.X | re.U | re.M) # for list items
131 cell_re = re.compile(Rules.cell, re.X | re.U) # for table cells
132 # For block elements:
133 block_re = re.compile('|'.join([Rules.line, Rules.head, Rules.separator,
134 Rules.pre, Rules.list, Rules.table, Rules.text]), re.X | re.U | re.M)
135 # For inline elements:
136 inline_re = re.compile('|'.join([Rules.link, Rules.url, Rules.macro,
137 Rules.code, Rules.image, Rules.strong, Rules.emph, Rules.linebreak,
138 Rules.escape, Rules.char]), re.X | re.U)
139
140 def __init__(self, raw):
141 self.raw = raw 166 self.raw = raw
142 self.root = DocNode('document', None) 167 self.root = DocNode('document', None)
143 self.cur = self.root # The most recent document node 168 self.cur = self.root # The most recent document node
144 self.text = None # The node to add inline characters to 169 self.text = None # The node to add inline characters to
145 170
182 text = (groups.get('link_text', '') or '').strip() 207 text = (groups.get('link_text', '') or '').strip()
183 parent = self.cur 208 parent = self.cur
184 self.cur = DocNode('link', self.cur) 209 self.cur = DocNode('link', self.cur)
185 self.cur.content = target 210 self.cur.content = target
186 self.text = None 211 self.text = None
187 re.sub(self.link_re, self._replace, text) 212 re.sub(self.rules.link_re, self._replace, text)
188 self.cur = parent 213 self.cur = parent
189 self.text = None 214 self.text = None
190 _link_target_repl = _link_repl 215 _link_target_repl = _link_repl
191 _link_text_repl = _link_repl 216 _link_text_repl = _link_repl
217
def _wiki_repl(self, groups):
    """Turn a WikiWord match (if enabled) into a link labelled with itself."""

    link = DocNode('link', self.cur)
    link.content = groups.get('wiki', '')
    # The link's only child is a text node repeating the target.
    DocNode('text', link, link.content)
    # Reset the node that inline characters are added to.
    self.text = None
192 226
193 def _macro_repl(self, groups): 227 def _macro_repl(self, groups):
194 """Handles macros using the placeholder syntax.""" 228 """Handles macros using the placeholder syntax."""
195 229
196 name = groups.get('macro_name', '') 230 name = groups.get('macro_name', '')
247 _item_text_repl = _item_repl 281 _item_text_repl = _item_repl
248 _item_head_repl = _item_repl 282 _item_head_repl = _item_repl
249 283
250 def _list_repl(self, groups): 284 def _list_repl(self, groups):
251 text = groups.get('list', u'') 285 text = groups.get('list', u'')
252 self.item_re.sub(self._replace, text) 286 self.rules.item_re.sub(self._replace, text)
253 287
254 def _head_repl(self, groups): 288 def _head_repl(self, groups):
255 self.cur = self._upto(self.cur, ('document', 'section', 'blockquote')) 289 self.cur = self._upto(self.cur, ('document', 'section', 'blockquote'))
256 node = DocNode('header', self.cur, groups.get('head_text', '').strip()) 290 node = DocNode('header', self.cur, groups.get('head_text', '').strip())
257 node.level = len(groups.get('head_head', ' ')) 291 node.level = len(groups.get('head_head', ' '))
283 self.cur = DocNode('table', self.cur) 317 self.cur = DocNode('table', self.cur)
284 tb = self.cur 318 tb = self.cur
285 tr = DocNode('table_row', tb) 319 tr = DocNode('table_row', tb)
286 320
287 text = '' 321 text = ''
288 for m in self.cell_re.finditer(row): 322 for m in self.rules.cell_re.finditer(row):
289 cell = m.group('cell') 323 cell = m.group('cell')
290 if cell: 324 if cell:
291 self.cur = DocNode('table_cell', tr) 325 self.cur = DocNode('table_cell', tr)
292 self.text = None 326 self.text = None
293 self.parse_inline(cell) 327 self.parse_inline(cell)
303 self.cur = self._upto(self.cur, ('document', 'section', 'blockquote')) 337 self.cur = self._upto(self.cur, ('document', 'section', 'blockquote'))
304 kind = groups.get('pre_kind', None) 338 kind = groups.get('pre_kind', None)
305 text = groups.get('pre_text', u'') 339 text = groups.get('pre_text', u'')
306 def remove_tilde(m): 340 def remove_tilde(m):
307 return m.group('indent') + m.group('rest') 341 return m.group('indent') + m.group('rest')
308 text = self.pre_escape_re.sub(remove_tilde, text) 342 text = self.rules.pre_escape_re.sub(remove_tilde, text)
309 node = DocNode('preformatted', self.cur, text) 343 node = DocNode('preformatted', self.cur, text)
310 node.sect = kind or '' 344 node.sect = kind or ''
311 self.text = None 345 self.text = None
312 _pre_text_repl = _pre_repl 346 _pre_text_repl = _pre_repl
313 _pre_head_repl = _pre_repl 347 _pre_head_repl = _pre_repl
361 return 395 return
362 396
363 def parse_inline(self, raw): 397 def parse_inline(self, raw):
364 """Recognize inline elements inside blocks.""" 398 """Recognize inline elements inside blocks."""
365 399
366 re.sub(self.inline_re, self._replace, raw) 400 re.sub(self.rules.inline_re, self._replace, raw)
367 401
368 def parse_block(self, raw): 402 def parse_block(self, raw):
369 """Recognize block elements.""" 403 """Recognize block elements."""
370 404
371 re.sub(self.block_re, self._replace, raw) 405 re.sub(self.rules.block_re, self._replace, raw)
372 406
373 def parse(self): 407 def parse(self):
374 """Parse the text given as self.raw and return DOM tree.""" 408 """Parse the text given as self.raw and return DOM tree."""
375 409
376 self.parse_block(self.raw) 410 self.parse_block(self.raw)
377 return self.root 411 return self.root
378 412
379 #################### Helper classes 413 #################### Helper classes
380 414
381 ### The document model and emitter follow 415 ### The document model
382 416
383 class DocNode: 417 class DocNode:
384 """ 418 """
385 A node in the document. 419 A node in the document.
386 """ 420 """