comparison emeraldtree/HTMLTreeBuilder.py @ 51:847897e5fab8

HTMLTreeBuilder - Make namespace aware, add testcases
author Bastian Blank <bblank@thinkmo.de>
date Sun, 21 Sep 2008 11:01:19 +0200
parents e647f30cc08e
children
comparison
equal deleted inserted replaced
50:d3767204a325 51:847897e5fab8
50 ## 50 ##
51 51
52 import htmlentitydefs 52 import htmlentitydefs
53 import re 53 import re
54 import mimetools, StringIO 54 import mimetools, StringIO
55 from HTMLParser import HTMLParser 55 from HTMLParser import HTMLParser as HTMLParserBase
56 56
57 import ElementTree 57 import ElementTree
58
59 AUTOCLOSE = "p", "li", "tr", "th", "td", "head", "body"
60 IGNOREEND = "img", "hr", "meta", "link", "br"
61
62 is_not_ascii = re.compile(eval(r'u"[\u0080-\uffff]"')).search
63 58
64 59
65 ## 60 ##
66 # ElementTree builder for HTML source code. This builder converts an 61 # ElementTree builder for HTML source code. This builder converts an
67 # HTML document or fragment to an ElementTree. 62 # HTML document or fragment to an ElementTree.
81 # document uses a non-ASCII compatible encoding, you must decode 76 # document uses a non-ASCII compatible encoding, you must decode
82 # the document before parsing. 77 # the document before parsing.
83 # 78 #
84 # @see elementtree.ElementTree 79 # @see elementtree.ElementTree
85 80
86 class HTMLTreeBuilder(HTMLParser): 81 class HTMLParser(HTMLParserBase):
87 82 AUTOCLOSE = "p", "li", "tr", "th", "td", "head", "body"
88 # FIXME: shouldn't this class be named Parser, not Builder? 83 IGNOREEND = "img", "hr", "meta", "link", "br", "input", "col"
84
85 namespace = "http://www.w3.org/1999/xhtml"
89 86
90 def __init__(self, builder=None, encoding=None): 87 def __init__(self, builder=None, encoding=None):
91 self.__stack = [] 88 self.__stack = []
92 if builder is None: 89 if builder is None:
93 builder = ElementTree.TreeBuilder() 90 builder = ElementTree.TreeBuilder()
94 self.__builder = builder 91 self.__builder = builder
95 self.encoding = encoding or "iso-8859-1" 92 self.encoding = encoding or "iso-8859-1"
96 HTMLParser.__init__(self) 93 HTMLParserBase.__init__(self)
97 94
98 ## 95 ##
99 # Flushes parser buffers, and return the root element. 96 # Flushes parser buffers, and return the root element.
100 # 97 #
101 # @return An Element instance. 98 # @return An Element instance.
102 99
103 def close(self): 100 def close(self):
104 HTMLParser.close(self) 101 HTMLParserBase.close(self)
105 return self.__builder.close() 102 return self.__builder.close()
106 103
107 ## 104 ##
108 # (Internal) Handles start tags. 105 # (Internal) Handles start tags.
109 106
110 def handle_starttag(self, tag, attrs): 107 def handle_starttag(self, tag, attrs):
111 if tag == "meta": 108 tag = ElementTree.QName(tag.lower(), self.namespace)
109 if tag.name == "meta":
112 # look for encoding directives 110 # look for encoding directives
113 http_equiv = content = None 111 http_equiv = content = None
114 for k, v in attrs: 112 for k, v in attrs:
115 if k == "http-equiv": 113 if k == "http-equiv":
116 http_equiv = v.lower() 114 http_equiv = v.lower()
122 StringIO.StringIO("%s: %s\n\n" % (http_equiv, content)) 120 StringIO.StringIO("%s: %s\n\n" % (http_equiv, content))
123 ) 121 )
124 encoding = header.getparam("charset") 122 encoding = header.getparam("charset")
125 if encoding: 123 if encoding:
126 self.encoding = encoding 124 self.encoding = encoding
127 if tag in AUTOCLOSE: 125 if tag.name in self.AUTOCLOSE:
128 if self.__stack and self.__stack[-1] == tag: 126 if self.__stack and self.__stack[-1] == tag:
129 self.handle_endtag(tag) 127 self.handle_endtag(tag)
130 self.__stack.append(tag) 128 self.__stack.append(tag)
131 attrib = {} 129 attrib = {}
132 if attrs: 130 if attrs:
133 for k, v in attrs: 131 for key, value in attrs:
134 attrib[k.lower()] = v 132 # Handle short attributes
133 if value is None:
134 value = key
135 key = ElementTree.QName(key.lower(), self.namespace)
136 attrib[key] = value
135 self.__builder.start(tag, attrib) 137 self.__builder.start(tag, attrib)
136 if tag in IGNOREEND: 138 if tag.name in self.IGNOREEND:
137 self.__stack.pop() 139 self.__stack.pop()
138 self.__builder.end(tag) 140 self.__builder.end(tag)
139 141
140 ## 142 ##
141 # (Internal) Handles end tags. 143 # (Internal) Handles end tags.
142 144
143 def handle_endtag(self, tag): 145 def handle_endtag(self, tag):
144 if tag in IGNOREEND: 146 if not isinstance(tag, ElementTree.QName):
147 tag = ElementTree.QName(tag.lower(), self.namespace)
148 if tag.name in self.IGNOREEND:
145 return 149 return
146 lasttag = self.__stack.pop() 150 lasttag = self.__stack.pop()
147 if tag != lasttag and lasttag in AUTOCLOSE: 151 if tag != lasttag and lasttag.name in self.AUTOCLOSE:
148 self.handle_endtag(lasttag) 152 self.handle_endtag(lasttag)
149 self.__builder.end(tag) 153 self.__builder.end(tag)
150 154
151 ## 155 ##
152 # (Internal) Handles character references. 156 # (Internal) Handles character references.
180 184
181 ## 185 ##
182 # (Internal) Handles character data. 186 # (Internal) Handles character data.
183 187
184 def handle_data(self, data): 188 def handle_data(self, data):
185 if isinstance(data, type('')) and is_not_ascii(data): 189 if isinstance(data, str):
186 # convert to unicode, but only if necessary 190 # convert to unicode, but only if necessary
187 data = unicode(data, self.encoding, "ignore") 191 data = unicode(data, self.encoding, "ignore")
188 self.__builder.data(data) 192 self.__builder.data(data)
189 193
190 ## 194 ##
193 197
194 def unknown_entityref(self, name): 198 def unknown_entityref(self, name):
195 pass # ignore by default; override if necessary 199 pass # ignore by default; override if necessary
196 200
197 ## 201 ##
198 # An alias for the <b>HTMLTreeBuilder</b> class. 202 # An alias for the <b>HTMLParser</b> class.
199 203
200 TreeBuilder = HTMLTreeBuilder 204 TreeBuilder = HTMLTreeBuilder = HTMLParser
201 205
202 ## 206 ##
203 # Parse an HTML document or document fragment. 207 # Parse an HTML document or document fragment.
204 # 208 #
205 # @param source A filename or file object containing HTML data. 209 # @param source A filename or file object containing HTML data.
209 # @return An ElementTree instance 213 # @return An ElementTree instance
210 214
211 def parse(source, encoding=None): 215 def parse(source, encoding=None):
212 return ElementTree.parse(source, HTMLTreeBuilder(encoding=encoding)) 216 return ElementTree.parse(source, HTMLTreeBuilder(encoding=encoding))
213 217
218 def HTML(text):
219 parser = HTMLParser()
220 parser.feed(text)
221 return parser.close()
222
214 if __name__ == "__main__": 223 if __name__ == "__main__":
215 import sys 224 import sys
216 ElementTree.dump(parse(open(sys.argv[1]))) 225 ElementTree.dump(parse(open(sys.argv[1])))