comparison elementtree/ElementTree.py @ 0:5169fce2d144 upstream

Import ElementTree (1.3a3-20070912-preview).
author Bastian Blank <bblank@thinkmo.de>
date Fri, 30 May 2008 18:02:30 +0200
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:5169fce2d144
1 #
2 # ElementTree
3 # $Id: ElementTree.py 3276 2007-09-12 06:52:30Z fredrik $
4 #
5 # light-weight XML support for Python 2.2 and later.
6 #
7 # history:
8 # 2001-10-20 fl created (from various sources)
9 # 2001-11-01 fl return root from parse method
10 # 2002-02-16 fl sort attributes in lexical order
11 # 2002-04-06 fl TreeBuilder refactoring, added PythonDoc markup
12 # 2002-05-01 fl finished TreeBuilder refactoring
13 # 2002-07-14 fl added basic namespace support to ElementTree.write
14 # 2002-07-25 fl added QName attribute support
15 # 2002-10-20 fl fixed encoding in write
16 # 2002-11-24 fl changed default encoding to ascii; fixed attribute encoding
17 # 2002-11-27 fl accept file objects or file names for parse/write
18 # 2002-12-04 fl moved XMLTreeBuilder back to this module
19 # 2003-01-11 fl fixed entity encoding glitch for us-ascii
20 # 2003-02-13 fl added XML literal factory
21 # 2003-02-21 fl added ProcessingInstruction/PI factory
22 # 2003-05-11 fl added tostring/fromstring helpers
23 # 2003-05-26 fl added ElementPath support
24 # 2003-07-05 fl added makeelement factory method
25 # 2003-07-28 fl added more well-known namespace prefixes
26 # 2003-08-15 fl fixed typo in ElementTree.findtext (Thomas Dartsch)
27 # 2003-09-04 fl fall back on emulator if ElementPath is not installed
28 # 2003-10-31 fl markup updates
29 # 2003-11-15 fl fixed nested namespace bug
30 # 2004-03-28 fl added XMLID helper
31 # 2004-06-02 fl added default support to findtext
32 # 2004-06-08 fl fixed encoding of non-ascii element/attribute names
33 # 2004-08-23 fl take advantage of post-2.1 expat features
34 # 2004-09-03 fl made Element class visible; removed factory
35 # 2005-02-01 fl added iterparse implementation
36 # 2005-03-02 fl fixed iterparse support for pre-2.2 versions
37 # 2005-11-12 fl added tostringlist/fromstringlist helpers
38 # 2006-07-05 fl merged in selected changes from the 1.3 sandbox
39 # 2006-07-05 fl removed support for 2.1 and earlier
40 # 2007-06-21 fl added deprecation/future warnings
41 # 2007-08-25 fl added doctype hook, added parser version attribute etc
42 # 2007-08-26 fl added new serializer code (better namespace handling, etc)
43 # 2007-08-27 fl warn for broken /tag searches on tree level
44 # 2007-09-02 fl added html/text methods to serializer (experimental)
45 # 2007-09-05 fl added method argument to tostring/tostringlist
46 # 2007-09-06 fl improved error handling
47 #
48 # Copyright (c) 1999-2007 by Fredrik Lundh. All rights reserved.
49 #
50 # fredrik@pythonware.com
51 # http://www.pythonware.com
52 #
53 # --------------------------------------------------------------------
54 # The ElementTree toolkit is
55 #
56 # Copyright (c) 1999-2007 by Fredrik Lundh
57 #
58 # By obtaining, using, and/or copying this software and/or its
59 # associated documentation, you agree that you have read, understood,
60 # and will comply with the following terms and conditions:
61 #
62 # Permission to use, copy, modify, and distribute this software and
63 # its associated documentation for any purpose and without fee is
64 # hereby granted, provided that the above copyright notice appears in
65 # all copies, and that both that copyright notice and this permission
66 # notice appear in supporting documentation, and that the name of
67 # Secret Labs AB or the author not be used in advertising or publicity
68 # pertaining to distribution of the software without specific, written
69 # prior permission.
70 #
71 # SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD
72 # TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-
73 # ABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR
74 # BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
75 # DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
76 # WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
77 # ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
78 # OF THIS SOFTWARE.
79 # --------------------------------------------------------------------
80
81 from __future__ import generators
82
# Names exported by "from ElementTree import *".
__all__ = [
    # public symbols
    "Comment",
    "dump",
    "Element", "ElementTree",
    "fromstring", "fromstringlist",
    "iselement", "iterparse",
    "parse", "ParseError",
    "PI", "ProcessingInstruction",
    "QName",
    # register_namespace is a documented public function of this module
    "register_namespace",
    "SubElement",
    "tostring", "tostringlist",
    "TreeBuilder",
    "VERSION",
    "XML",
    "XMLParser", "XMLTreeBuilder",
    ]
100
101 ##
102 # The <b>Element</b> type is a flexible container object, designed to
103 # store hierarchical data structures in memory. The type can be
104 # described as a cross between a list and a dictionary.
105 # <p>
106 # Each element has a number of properties associated with it:
107 # <ul>
108 # <li>a <i>tag</i>. This is a string identifying what kind of data
109 # this element represents (the element type, in other words).</li>
110 # <li>a number of <i>attributes</i>, stored in a Python dictionary.</li>
111 # <li>a <i>text</i> string.</li>
112 # <li>an optional <i>tail</i> string.</li>
113 # <li>a number of <i>child elements</i>, stored in a Python sequence</li>
114 # </ul>
115 #
116 # To create an element instance, use the {@link #Element} constructor
117 # or the {@link #SubElement} factory function.
118 # <p>
119 # The {@link #ElementTree} class can be used to wrap an element
120 # structure, and convert it from and to XML.
121 ##
122
123 import sys, re
124
class _SimpleElementPath(object):
    # Fallback used when the ElementPath module cannot be imported;
    # emulates the pre-1.2 find/findtext/findall behaviour, i.e. only
    # plain tag names (plus the ".//tag" form in findall) are understood.

    def find(self, element, tag):
        # first direct child with a matching tag, or None
        for child in element:
            if child.tag == tag:
                return child
        return None

    def findtext(self, element, tag, default=None):
        # text of the first matching direct child ("" if it has no
        # text), or the default if there is no such child
        for child in element:
            if child.tag != tag:
                continue
            return child.text or ""
        return default

    def findall(self, element, tag):
        # ".//tag" searches the whole subtree via getiterator; anything
        # else only looks at direct children
        if tag[:3] == ".//":
            return element.getiterator(tag[3:])
        return [child for child in element if child.tag == tag]
145
146 try:
147 import ElementPath
148 except ImportError:
149 # FIXME: issue warning in this case?
150 ElementPath = _SimpleElementPath()
151
# version of the ElementTree toolkit implemented by this module
VERSION = "1.3a2"


class ParseError(SyntaxError):
    """Exception type for parse failures; a SyntaxError subclass."""
156
157 # --------------------------------------------------------------------
158
159 ##
160 # Checks if an object appears to be a valid element object.
161 #
162 # @param element An element instance.
163 # @return A true value if this is an element object.
164 # @defreturn flag
165
def iselement(element):
    # FIXME: not sure about this; might be a better idea to look
    # for tag/attrib/text attributes
    return isinstance(element, Element) or hasattr(element, "tag")

##
# Element class.  This class defines the Element interface, and
# provides a reference implementation of this interface.
# <p>
# The element name, attribute names, and attribute values can be
# either 8-bit ASCII strings or Unicode strings.
#
# @param tag The element name.
# @param attrib An optional dictionary, containing element attributes.
# @param **extra Additional attributes, given as keyword arguments.
# @see Element
# @see SubElement
# @see Comment
# @see ProcessingInstruction

class Element(object):
    # <tag attrib>text<child/>...</tag>tail

    ##
    # (Attribute) Element tag.

    tag = None

    ##
    # (Attribute) Element attribute dictionary.  Where possible, use
    # {@link #Element.get},
    # {@link #Element.set},
    # {@link #Element.keys}, and
    # {@link #Element.items} to access
    # element attributes.

    attrib = None

    ##
    # (Attribute) Text before first subelement.  This is either a
    # string or the value None, if there was no text.

    text = None

    ##
    # (Attribute) Text after this element's end tag, but before the
    # next sibling element's start tag.  This is either a string or
    # the value None, if there was no text.

    tail = None # text after end tag, if any

    def __init__(self, tag, attrib={}, **extra):
        # work on a copy, so neither the caller's dictionary nor the
        # shared default argument is ever modified
        attrib = attrib.copy()
        attrib.update(extra)
        self.tag = tag
        self.attrib = attrib
        self._children = []

    def __repr__(self):
        return "<Element %s at %x>" % (repr(self.tag), id(self))

    ##
    # Creates a new element object of the same type as this element.
    #
    # @param tag Element tag.
    # @param attrib Element attributes, given as a dictionary.
    # @return A new element instance.

    def makeelement(self, tag, attrib):
        # use the instance's own class, so that subclasses produce
        # elements of their own type (as documented above)
        return self.__class__(tag, attrib)

    ##
    # Returns the number of subelements.
    #
    # @return The number of subelements.

    def __len__(self):
        return len(self._children)

    def __nonzero__(self):
        import warnings
        warnings.warn(
            "The behavior of this method will change in future versions. "
            "Use specific 'len(elem)' or 'elem is not None' test instead.",
            FutureWarning
            )
        return len(self._children) != 0 # emulate old behaviour

    ##
    # Returns the given subelement.
    #
    # @param index What subelement to return.
    # @return The given subelement.
    # @exception IndexError If the given element does not exist.

    def __getitem__(self, index):
        return self._children[index]

    ##
    # Replaces the given subelement.
    #
    # @param index What subelement to replace.
    # @param element The new element value.
    # @exception IndexError If the given element does not exist.
    # @exception AssertionError If element is not a valid object.

    def __setitem__(self, index, element):
        assert iselement(element)
        self._children[index] = element

    ##
    # Deletes the given subelement.
    #
    # @param index What subelement to delete.
    # @exception IndexError If the given element does not exist.

    def __delitem__(self, index):
        del self._children[index]

    ##
    # Returns a list containing subelements in the given range.
    #
    # @param start The first subelement to return.
    # @param stop The first subelement that shouldn't be returned.
    # @return A sequence object containing subelements.

    def __getslice__(self, start, stop):
        return self._children[start:stop]

    ##
    # Replaces a number of subelements with elements from a sequence.
    #
    # @param start The first subelement to replace.
    # @param stop The first subelement that shouldn't be replaced.
    # @param elements A sequence object with zero or more elements.
    # @exception AssertionError If a sequence member is not a valid object.

    def __setslice__(self, start, stop, elements):
        # materialize first; a one-shot iterator would otherwise be
        # used up by the validation loop and the slice would be
        # replaced with nothing
        elements = list(elements)
        for element in elements:
            assert iselement(element)
        self._children[start:stop] = elements

    ##
    # Deletes a number of subelements.
    #
    # @param start The first subelement to delete.
    # @param stop The first subelement to leave in there.

    def __delslice__(self, start, stop):
        del self._children[start:stop]

    ##
    # Adds a subelement to the end of this element.
    #
    # @param element The element to add.
    # @exception AssertionError If the element is not a valid object.

    def append(self, element):
        assert iselement(element)
        self._children.append(element)

    ##
    # Appends subelements from a sequence.
    #
    # @param elements A sequence or iterator with zero or more elements.
    # @exception AssertionError If a subelement is not a valid object.
    # @since 1.3

    def extend(self, elements):
        # materialize first; a one-shot iterator (e.g. a generator)
        # would otherwise be used up by the validation loop, and
        # nothing would be appended
        elements = list(elements)
        for element in elements:
            assert iselement(element)
        self._children.extend(elements)

    ##
    # Inserts a subelement at the given position in this element.
    #
    # @param index Where to insert the new subelement.
    # @param element The element to insert.
    # @exception AssertionError If the element is not a valid object.

    def insert(self, index, element):
        assert iselement(element)
        self._children.insert(index, element)

    ##
    # Removes a matching subelement.  Unlike the <b>find</b> methods,
    # this method compares elements based on identity, not on tag
    # value or contents.
    #
    # @param element What element to remove.
    # @exception ValueError If a matching element could not be found.
    # @exception AssertionError If the element is not a valid object.

    def remove(self, element):
        assert iselement(element)
        self._children.remove(element)

    ##
    # (Deprecated) Returns all subelements.  The elements are returned
    # in document order.
    #
    # @return A list of subelements.
    # @defreturn list of Element instances

    def getchildren(self):
        import warnings
        warnings.warn(
            "This method will be removed in future versions. "
            "Use 'list(elem)' or iteration over elem instead.",
            DeprecationWarning
            )
        return self._children

    ##
    # Finds the first matching subelement, by tag name or path.
    #
    # @param path What element to look for.
    # @return The first matching element, or None if no element was found.
    # @defreturn Element or None

    def find(self, path):
        return ElementPath.find(self, path)

    ##
    # Finds text for the first matching subelement, by tag name or path.
    #
    # @param path What element to look for.
    # @param default What to return if the element was not found.
    # @return The text content of the first matching element, or the
    #     default value if no element was found.  Note that if the
    #     element is found, but has no text content, this method
    #     returns an empty string.
    # @defreturn string

    def findtext(self, path, default=None):
        return ElementPath.findtext(self, path, default)

    ##
    # Finds all matching subelements, by tag name or path.
    #
    # @param path What element to look for.
    # @return A list or iterator containing all matching elements,
    #    in document order.
    # @defreturn list of Element instances

    def findall(self, path):
        return ElementPath.findall(self, path)

    ##
    # Resets an element.  This function removes all subelements, clears
    # all attributes, and sets the text and tail attributes to None.

    def clear(self):
        self.attrib.clear()
        self._children = []
        self.text = self.tail = None

    ##
    # Gets an element attribute.
    #
    # @param key What attribute to look for.
    # @param default What to return if the attribute was not found.
    # @return The attribute value, or the default value, if the
    #     attribute was not found.
    # @defreturn string or None

    def get(self, key, default=None):
        return self.attrib.get(key, default)

    ##
    # Sets an element attribute.
    #
    # @param key What attribute to set.
    # @param value The attribute value.

    def set(self, key, value):
        self.attrib[key] = value

    ##
    # Gets a list of attribute names.  The names are returned in an
    # arbitrary order (just like for an ordinary Python dictionary).
    #
    # @return A list of element attribute names.
    # @defreturn list of strings

    def keys(self):
        return self.attrib.keys()

    ##
    # Gets element attributes, as a sequence.  The attributes are
    # returned in an arbitrary order.
    #
    # @return A list of (name, value) tuples for all attributes.
    # @defreturn list of (string, string) tuples

    def items(self):
        return self.attrib.items()

    ##
    # Creates a tree iterator.  The iterator loops over this element
    # and all subelements, in document order, and returns all elements
    # with a matching tag.
    # <p>
    # If the tree structure is modified during iteration, new or removed
    # elements may or may not be included.  To get a stable set, use the
    # list() function on the iterator, and loop over the resulting list.
    #
    # @param tag What tags to look for (default is to return all elements).
    # @return An iterator containing all the matching elements.
    # @defreturn iterator

    def iter(self, tag=None):
        if tag == "*":
            tag = None
        if tag is None or self.tag == tag:
            yield self
        for child in self._children:
            for e in child.iter(tag):
                yield e

    # compatibility (FIXME: preserve the old list-returning behaviour too?)
    getiterator = iter

    ##
    # Creates a text iterator.  The iterator loops over this element
    # and all subelements, in document order, and returns all inner
    # text.
    #
    # @return An iterator containing all inner text.
    # @defreturn iterator

    def itertext(self):
        if self.text:
            yield self.text
        for e in self:
            for s in e.itertext():
                yield s
            if e.tail:
                yield e.tail

# compatibility
_Element = _ElementInterface = Element
510
511 ##
512 # Subelement factory. This function creates an element instance, and
513 # appends it to an existing element.
514 # <p>
515 # The element name, attribute names, and attribute values can be
516 # either 8-bit ASCII strings or Unicode strings.
517 #
518 # @param parent The parent element.
519 # @param tag The subelement name.
520 # @param attrib An optional dictionary, containing element attributes.
521 # @param **extra Additional attributes, given as keyword arguments.
522 # @return An element instance.
523 # @defreturn Element
524
def SubElement(parent, tag, attrib={}, **extra):
    # merge the keyword attributes into a private copy, so the caller's
    # dictionary (and the shared default) is left untouched
    merged = attrib.copy()
    merged.update(extra)
    # let the parent decide how the child is created, then attach it
    child = parent.makeelement(tag, merged)
    parent.append(child)
    return child
531
532 ##
533 # Comment element factory. This factory function creates a special
534 # element that will be serialized as an XML comment by the standard
535 # serializer.
536 # <p>
537 # The comment string can be either an 8-bit ASCII string or a Unicode
538 # string.
539 #
540 # @param text A string containing the comment string.
541 # @return An element instance, representing a comment.
542 # @defreturn Element
543
def Comment(text=None):
    # the factory function itself is used as the tag; the serializers
    # recognize comment elements by testing "tag is Comment"
    comment = Element(Comment)
    comment.text = text
    return comment
548
549 ##
550 # PI element factory. This factory function creates a special element
551 # that will be serialized as an XML processing instruction by the standard
552 # serializer.
553 #
554 # @param target A string containing the PI target.
555 # @param text A string containing the PI contents, if any.
556 # @return An element instance, representing a PI.
557 # @defreturn Element
558
def ProcessingInstruction(target, text=None):
    # the factory function itself is used as the tag; the element text
    # holds the target, followed by a space and the contents (if any)
    pi = Element(ProcessingInstruction)
    if text:
        pi.text = target + " " + text
    else:
        pi.text = target
    return pi

# short alias
PI = ProcessingInstruction
567
568 ##
569 # QName wrapper. This can be used to wrap a QName attribute value, in
570 # order to get proper namespace handling on output.
571 #
572 # @param text A string containing the QName value, in the form {uri}local,
573 # or, if the tag argument is given, the URI part of a QName.
574 # @param tag Optional tag. If given, the first argument is interpreted as
575 # an URI, and this argument is interpreted as a local name.
576 # @return An opaque object, representing the QName.
577
class QName(object):
    # Wraps a qualified name; the universal "{uri}local" form is kept
    # in the text attribute.

    def __init__(self, text_or_uri, tag=None):
        if tag:
            # two-argument form: namespace URI plus local name
            self.text = "{%s}%s" % (text_or_uri, tag)
        else:
            self.text = text_or_uri

    def __str__(self):
        return self.text

    def __hash__(self):
        # hash like the wrapped string
        return hash(self.text)

    def __cmp__(self, other):
        # compare by text, against other QNames as well as other objects
        if isinstance(other, QName):
            other = other.text
        return cmp(self.text, other)
591
592 # --------------------------------------------------------------------
593
594 ##
595 # ElementTree wrapper class. This class represents an entire element
596 # hierarchy, and adds some extra support for serialization to and from
597 # standard XML.
598 #
599 # @param element Optional root element.
600 # @keyparam file Optional file handle or file name. If given, the
601 # tree is initialized with the contents of this XML file.
602
class ElementTree(object):

    def __init__(self, element=None, file=None):
        assert element is None or iselement(element)
        self._root = element # first node
        if file:
            self.parse(file)

    ##
    # Gets the root element for this tree.
    #
    # @return An element instance.
    # @defreturn Element

    def getroot(self):
        return self._root

    ##
    # Replaces the root element for this tree.  This discards the
    # current contents of the tree, and replaces it with the given
    # element.  Use with care.
    #
    # @param element An element instance.

    def _setroot(self, element):
        assert iselement(element)
        self._root = element

    ##
    # Loads an external XML document into this element tree.
    #
    # @param source A file name or file object.  A file opened here
    #     from a file name is closed again before returning; a file
    #     object passed in by the caller is left open.
    # @keyparam parser An optional parser instance.  If not given, the
    #     standard {@link XMLParser} parser is used.
    # @return The document root element.
    # @defreturn Element

    def parse(self, source, parser=None):
        # only close what we opened ourselves
        close_source = not hasattr(source, "read")
        if close_source:
            source = open(source, "rb")
        try:
            if not parser:
                parser = XMLParser(target=TreeBuilder())
            # feed the parser in chunks, to keep memory usage bounded
            while 1:
                data = source.read(32768)
                if not data:
                    break
                parser.feed(data)
            self._root = parser.close()
            return self._root
        finally:
            if close_source:
                source.close()

    ##
    # Creates a tree iterator for the root element.  The iterator loops
    # over all elements in this tree, in document order.
    #
    # @param tag What tags to look for (default is to return all elements)
    # @return An iterator.
    # @defreturn iterator

    def iter(self, tag=None):
        assert self._root is not None
        return self._root.iter(tag)

    getiterator = iter

    ##
    # Finds the first toplevel element with given tag.
    # Same as getroot().find(path).
    #
    # @param path What element to look for.
    # @return The first matching element, or None if no element was found.
    # @defreturn Element or None

    def find(self, path):
        assert self._root is not None
        if path[:1] == "/":
            # absolute paths are searched relative to the root element
            path = "." + path
            import warnings
            warnings.warn(
                "This search is broken in 1.3 and earlier; if you rely "
                "on the current behaviour, change it to %r" % path,
                FutureWarning
                )
        return self._root.find(path)

    ##
    # Finds the element text for the first toplevel element with given
    # tag.  Same as getroot().findtext(path).
    #
    # @param path What toplevel element to look for.
    # @param default What to return if the element was not found.
    # @return The text content of the first matching element, or the
    #     default value if no element was found.  Note that if the
    #     element is found, but has no text content, this method
    #     returns an empty string.
    # @defreturn string

    def findtext(self, path, default=None):
        assert self._root is not None
        if path[:1] == "/":
            # absolute paths are searched relative to the root element
            path = "." + path
            import warnings
            warnings.warn(
                "This search is broken in 1.3 and earlier; if you rely "
                "on the current behaviour, change it to %r" % path,
                FutureWarning
                )
        return self._root.findtext(path, default)

    ##
    # Finds all toplevel elements with the given tag.
    # Same as getroot().findall(path).
    #
    # @param path What element to look for.
    # @return A list or iterator containing all matching elements,
    #    in document order.
    # @defreturn list of Element instances

    def findall(self, path):
        assert self._root is not None
        if path[:1] == "/":
            # absolute paths are searched relative to the root element
            path = "." + path
            import warnings
            warnings.warn(
                "This search is broken in 1.3 and earlier; if you rely "
                "on the current behaviour, change it to %r" % path,
                FutureWarning
                )
        return self._root.findall(path)

    ##
    # Writes the element tree to a file, as XML.
    #
    # @param file A file name, or a file object opened for writing.
    #     A file opened here from a file name is closed again before
    #     returning; a file object passed in by the caller is left open.
    # @keyparam encoding Optional output encoding (default is US-ASCII).
    # @keyparam xml_declaration Controls if an XML declaration should
    #     be added to the file.  Use False for never, True for always,
    #     None for only if not US-ASCII or UTF-8.  None is default.
    # @keyparam default_namespace Optional namespace URI that is mapped
    #     to the empty prefix when serializing.
    # @keyparam method Optional output method ("xml", "html" or "text";
    #     default is "xml").
    # @exception ValueError If the output method is not known.

    def write(self, file,
              # keyword arguments
              encoding="us-ascii",
              xml_declaration=None,
              default_namespace=None,
              method=None):
        assert self._root is not None
        # only close what we opened ourselves
        close_file = not hasattr(file, "write")
        if close_file:
            file = open(file, "wb")
        try:
            write = file.write
            if not method:
                method = "xml"
            if not encoding:
                encoding = "us-ascii"
            elif xml_declaration or (xml_declaration is None and
                                     encoding not in ("utf-8", "us-ascii")):
                write("<?xml version='1.0' encoding='%s'?>\n" % encoding)
            if method == "text":
                _serialize_text(write, self._root, encoding)
            else:
                qnames, namespaces = _namespaces(
                    self._root, encoding, default_namespace
                    )
                if method == "xml":
                    _serialize_xml(
                        write, self._root, encoding, qnames, namespaces
                        )
                elif method == "html":
                    _serialize_html(
                        write, self._root, encoding, qnames, namespaces
                        )
                else:
                    raise ValueError("unknown method %r" % method)
        finally:
            if close_file:
                file.close()
776
777 # --------------------------------------------------------------------
778 # serialization support
779
def _namespaces(elem, encoding, default_namespace=None):
    # identify namespaces used in this tree
    #
    # walks the whole tree below elem and returns a (qnames, namespaces)
    # tuple: qnames maps every tag, attribute name and QName value to
    # its encoded serialized name, and namespaces maps every namespace
    # URI that needs a declaration to its prefix.  raises TypeError for
    # names that cannot be serialized, and ValueError if a
    # non-qualified name is used together with default_namespace.

    # maps qnames to *encoded* prefix:local names
    qnames = {None: None}

    # maps uri:s to prefixes
    namespaces = {}
    if default_namespace:
        namespaces[default_namespace] = ""

    def encode(text):
        return text.encode(encoding)

    def add_qname(qname):
        # calculate serialized qname representation
        try:
            if qname[:1] == "{":
                uri, tag = qname[1:].split("}", 1)
                prefix = namespaces.get(uri)
                if prefix is None:
                    # first use of this uri: prefer a registered
                    # prefix, otherwise generate an "ns<N>" one
                    prefix = _namespace_map.get(uri)
                    if prefix is None:
                        prefix = "ns%d" % len(namespaces)
                    if prefix != "xml":
                        # the xml prefix never needs a declaration
                        namespaces[uri] = prefix
                if prefix:
                    qnames[qname] = encode("%s:%s" % (prefix, tag))
                else:
                    qnames[qname] = encode(tag) # default element
            else:
                if default_namespace:
                    # FIXME: can this be handled in XML 1.0?
                    raise ValueError(
                        "cannot use non-qualified names with "
                        "default_namespace option"
                        )
                qnames[qname] = encode(qname)
        except TypeError:
            _raise_serialization_error(qname)

    # populate qname and namespaces table
    try:
        iterate = elem.iter
    except AttributeError:
        iterate = elem.getiterator # cET compatibility
    for elem in iterate():
        tag = elem.tag
        if isinstance(tag, QName):
            # the "already registered" test must be nested here; as
            # part of the condition above, a QName tag seen for the
            # second time would fall through to the error branch below
            if tag.text not in qnames:
                add_qname(tag.text)
        elif isinstance(tag, basestring):
            if tag not in qnames:
                add_qname(tag)
        elif tag is not None and tag is not Comment and tag is not PI:
            _raise_serialization_error(tag)
        for key, value in elem.items():
            if isinstance(key, QName):
                key = key.text
            if key not in qnames:
                add_qname(key)
            if isinstance(value, QName) and value.text not in qnames:
                add_qname(value.text)
        text = elem.text
        if isinstance(text, QName) and text.text not in qnames:
            add_qname(text.text)
    return qnames, namespaces
846
def _serialize_xml(write, elem, encoding, qnames, namespaces):
    # writes elem and its subtree as XML, through the write callable.
    # qnames and namespaces are the tables built by _namespaces; the
    # namespace table is only passed for the outermost element, so the
    # xmlns declarations are emitted once.
    tag = elem.tag
    text = elem.text
    if tag is Comment:
        write("<!--%s-->" % _escape_cdata(text, encoding))
    elif tag is ProcessingInstruction:
        write("<?%s?>" % _escape_cdata(text, encoding))
    else:
        name = qnames[tag]
        if name is None:
            # no serialized name: only the contents are written
            if text:
                write(_escape_cdata(text, encoding))
            for child in elem:
                _serialize_xml(write, child, encoding, qnames, None)
        else:
            write("<" + name)
            attributes = elem.items()
            if attributes or namespaces:
                attributes.sort() # lexical order
                for key, value in attributes:
                    if isinstance(key, QName):
                        key = key.text
                    if isinstance(value, QName):
                        value = qnames[value.text]
                    else:
                        value = _escape_attrib(value, encoding)
                    write(" %s=\"%s\"" % (qnames[key], value))
                if namespaces:
                    declarations = namespaces.items()
                    declarations.sort(key=lambda item: item[1]) # on prefix
                    for uri, prefix in declarations:
                        if prefix:
                            prefix = ":" + prefix
                        write(" xmlns%s=\"%s\"" % (
                            prefix.encode(encoding),
                            _escape_attrib(uri, encoding)
                            ))
            if not (text or len(elem)):
                # nothing inside: use the short empty-element form
                write(" />")
            else:
                write(">")
                if text:
                    write(_escape_cdata(text, encoding))
                for child in elem:
                    _serialize_xml(write, child, encoding, qnames, None)
                write("</" + name + ">")
    if elem.tail:
        write(_escape_cdata(elem.tail, encoding))
895
# names of the HTML elements that the HTML serializer writes without
# an end tag.  (every entry needs its own comma; adjacent string
# literals are silently concatenated into a single name.)
HTML_EMPTY = ("area", "base", "basefont", "br", "col", "frame", "hr",
              "img", "input", "isindex", "link", "meta", "param")

try:
    # use a set for faster lookups, where the builtin is available
    HTML_EMPTY = set(HTML_EMPTY)
except NameError:
    pass
903
def _serialize_html(write, elem, encoding, qnames, namespaces):
    # writes elem and its subtree as HTML, through the write callable.
    # qnames and namespaces are the tables built by _namespaces; the
    # namespace table is only passed for the outermost element.
    # compared to the XML serializer: start tags are never written in
    # the short empty-element form, script/style contents are written
    # unescaped, and elements in HTML_EMPTY get no end tag.
    tag = elem.tag
    text = elem.text
    if tag is Comment:
        write("<!--%s-->" % _escape_cdata(text, encoding))
    elif tag is ProcessingInstruction:
        write("<?%s?>" % _escape_cdata(text, encoding))
    else:
        tag = qnames[tag]
        if tag is None:
            # no serialized name: only the contents are written
            if text:
                write(_escape_cdata(text, encoding))
            for e in elem:
                _serialize_html(write, e, encoding, qnames, None)
        else:
            write("<" + tag)
            items = elem.items()
            if items or namespaces:
                items.sort() # lexical order
                for k, v in items:
                    if isinstance(k, QName):
                        k = k.text
                    if isinstance(v, QName):
                        v = qnames[v.text]
                    else:
                        v = _escape_attrib_html(v, encoding)
                    # FIXME: handle boolean attributes
                    write(" %s=\"%s\"" % (qnames[k], v))
                if namespaces:
                    items = namespaces.items()
                    items.sort(key=lambda x: x[1]) # sort on prefix
                    for v, k in items:
                        if k:
                            k = ":" + k
                        write(" xmlns%s=\"%s\"" % (
                            k.encode(encoding),
                            _escape_attrib(v, encoding)
                            ))
            write(">")
            # the lower-cased name is only used for lookups; the end
            # tag is written with the same spelling as the start tag
            ltag = tag.lower()
            if text:
                if ltag == "script" or ltag == "style":
                    write(_encode(text, encoding))
                else:
                    write(_escape_cdata(text, encoding))
            for e in elem:
                _serialize_html(write, e, encoding, qnames, None)
            if ltag not in HTML_EMPTY:
                write("</" + tag + ">")
    if elem.tail:
        write(_escape_cdata(elem.tail, encoding))
955
def _serialize_text(write, elem, encoding):
    # writes all inner text of elem, followed by the tail of elem
    # itself (if any), as encoded strings
    for chunk in elem.itertext():
        write(chunk.encode(encoding))
    tail = elem.tail
    if tail:
        write(tail.encode(encoding))
961
962 ##
963 # Registers a namespace prefix. The registry is global, and any
964 # existing mapping for either the given prefix or the namespace URI
965 # will be removed.
966 #
967 # @param prefix Namespace prefix.
968 # @param uri Namespace uri. Tags and attributes in this namespace
969 # will be serialized with the given prefix, if at all possible.
970 # @raise ValueError If the prefix is reserved, or is otherwise
971 # invalid.
972
def register_namespace(prefix, uri):
    # "ns<digits>" prefixes are generated by the serializer itself
    if re.match(r"ns\d+$", prefix):
        raise ValueError("Prefix format reserved for internal use")
    # drop any existing mapping for this uri or this prefix.  iterate
    # over a snapshot, since entries are deleted during the scan
    for k, v in list(_namespace_map.items()):
        if k == uri or v == prefix:
            del _namespace_map[k]
    _namespace_map[uri] = prefix

# global registry, mapping namespace uri:s to preferred prefixes
_namespace_map = {
    # "well-known" namespace prefixes
    "http://www.w3.org/XML/1998/namespace": "xml",
    "http://www.w3.org/1999/xhtml": "html",
    "http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf",
    "http://schemas.xmlsoap.org/wsdl/": "wsdl",
    # xml schema
    "http://www.w3.org/2001/XMLSchema": "xs",
    "http://www.w3.org/2001/XMLSchema-instance": "xsi",
    # dublin core
    "http://purl.org/dc/elements/1.1/": "dc",
}
993
def _raise_serialization_error(text):
    # always raises TypeError, naming the offending value and its type
    type_name = type(text).__name__
    message = "cannot serialize %r (type %s)" % (text, type_name)
    raise TypeError(message)
998
def _encode(text, encoding):
    # encode text without any escaping; characters that the encoding
    # cannot represent are written as numeric character references.
    # values that cannot be encoded at all (wrong type, no encode
    # method) are reported as serialization errors
    try:
        encoded = text.encode(encoding, "xmlcharrefreplace")
    except (TypeError, AttributeError):
        _raise_serialization_error(text)
    else:
        return encoded
1004
def _escape_cdata(text, encoding):
    # escape character data: "&", "<" and ">" are replaced by entity
    # references, and characters that the encoding cannot represent
    # are written as numeric character references
    try:
        # "&" has to come first, so that the ampersands introduced by
        # the later replacements are not escaped again.  the membership
        # test avoids do-nothing replace() calls, which is assumed to
        # be the common case for short strings
        for char, entity in (("&", "&amp;"), ("<", "&lt;"), (">", "&gt;")):
            if char in text:
                text = text.replace(char, entity)
        return text.encode(encoding, "xmlcharrefreplace")
    except (TypeError, AttributeError):
        _raise_serialization_error(text)
1020
def _escape_attrib(text, encoding):
    # escape attribute value: markup characters, double quotes and
    # newlines are replaced by references, and characters that the
    # encoding cannot represent are written as numeric character
    # references
    try:
        # order matters: "&" must be replaced before the replacements
        # that introduce new ampersands
        replacements = (
            ("&", "&amp;"),
            ("<", "&lt;"),
            (">", "&gt;"),
            ("\"", "&quot;"),
            ("\n", "&#10;"),
            )
        for char, reference in replacements:
            if char in text:
                text = text.replace(char, reference)
        return text.encode(encoding, "xmlcharrefreplace")
    except (TypeError, AttributeError):
        _raise_serialization_error(text)
1037
def _escape_attrib_html(text, encoding):
    # escape attribute value for html output: only "&", ">" and the
    # double quote are replaced ("<" and newlines are left alone), and
    # characters that the encoding cannot represent are written as
    # numeric character references
    try:
        # "&" first, so the entities added below are not escaped twice
        for char, entity in (("&", "&amp;"), (">", "&gt;"), ("\"", "&quot;")):
            if char in text:
                text = text.replace(char, entity)
        return text.encode(encoding, "xmlcharrefreplace")
    except (TypeError, AttributeError):
        _raise_serialization_error(text)
1050
1051 # --------------------------------------------------------------------
1052
1053 ##
1054 # Generates a string representation of an XML element, including all
1055 # subelements.
1056 #
1057 # @param element An Element instance.
1058 # @return An encoded string containing the XML data.
1059 # @defreturn string
1060
def tostring(element, encoding=None, method=None):
    # serialize via ElementTree.write() into an in-memory sink: the
    # sink's write() is simply the append method of a list, and the
    # collected fragments are joined into one string at the end.
    # encoding and method are passed on to write() unchanged
    class sink:
        pass
    fragments = []
    target = sink()
    target.write = fragments.append
    ElementTree(element).write(target, encoding, method=method)
    return "".join(fragments)
1069
##
# Generates a string representation of an XML element, including all
# subelements.  The string is returned as a sequence of string fragments.
#
# @param element An Element instance.
# @param encoding Optional output encoding.  Passed on to the write
#     method of {@link #ElementTree}.
# @param method Optional output method.  Passed on to the write
#     method of {@link #ElementTree}, in the same way as
#     {@link #tostring} does.
# @return A sequence object containing the XML data.
# @defreturn sequence
# @since 1.3

def tostringlist(element, encoding=None, method=None):
    # same approach as tostring: write() appends each fragment to a
    # list, which is returned as is instead of being joined
    class dummy:
        pass
    data = []
    file = dummy()
    file.write = data.append
    ElementTree(element).write(file, encoding, method=method)
    # FIXME: merge small fragments into larger parts
    return data
1088
1089 ##
1090 # Writes an element tree or element structure to sys.stdout. This
1091 # function should be used for debugging only.
1092 # <p>
1093 # The exact output format is implementation dependent. In this
1094 # version, it's written as an ordinary XML file.
1095 #
1096 # @param elem An element tree or an individual element.
1097
def dump(elem):
    # debugging aid: accept either a tree or an individual element
    tree = elem
    if not isinstance(tree, ElementTree):
        tree = ElementTree(tree)
    tree.write(sys.stdout)
    # finish the output with a newline, unless the tail of the root
    # element already ends with one
    tail = tree.getroot().tail
    if not (tail and tail[-1] == "\n"):
        sys.stdout.write("\n")
1106
1107 # --------------------------------------------------------------------
1108 # parsing
1109
1110 ##
1111 # Parses an XML document into an element tree.
1112 #
1113 # @param source A filename or file object containing XML data.
1114 # @param parser An optional parser instance. If not given, the
1115 # standard {@link XMLParser} parser is used.
1116 # @return An ElementTree instance
1117
def parse(source, parser=None):
    # create an empty tree, let it load the document from source
    # (using the given parser, if any), and hand the tree back
    document = ElementTree()
    document.parse(source, parser)
    return document
1122
1123 ##
1124 # Parses an XML document into an element tree incrementally, and reports
1125 # what's going on to the user.
1126 #
1127 # @param source A filename or file object containing XML data.
1128 # @param events A list of events to report back. If omitted, only "end"
1129 # events are reported.
1130 # @param parser An optional parser instance. If not given, the
1131 # standard {@link XMLParser} parser is used.
1132 # @return A (event, elem) iterator.
1133
def iterparse(source, events=None, parser=None):
    # anything that lacks a read() method is taken to be a file name,
    # and is opened in binary mode
    is_file_object = hasattr(source, "read")
    if not is_file_object:
        source = open(source, "rb")
    # fall back on the standard parser if none was supplied
    parser = parser or XMLParser(target=TreeBuilder())
    return _IterParseIterator(source, events, parser)
1140
1141 class _IterParseIterator(object):
1142
1143 def __init__(self, source, events, parser):
1144 self._file = source
1145 self._events = []
1146 self._index = 0
1147 self.root = self._root = None
1148 self._parser = parser
1149 # wire up the parser for event reporting
1150 parser = self._parser._parser
1151 append = self._events.append
1152 if events is None:
1153 events = ["end"]
1154 for event in events:
1155 if event == "start":
1156 try:
1157 parser.ordered_attributes = 1
1158 parser.specified_attributes = 1
1159 def handler(tag, attrib_in, event=event, append=append,
1160 start=self._parser._start_list):
1161 append((event, start(tag, attrib_in)))
1162 parser.StartElementHandler = handler
1163 except AttributeError:
1164 def handler(tag, attrib_in, event=event, append=append,
1165 start=self._parser._start):
1166 append((event, start(tag, attrib_in)))
1167 parser.StartElementHandler = handler
1168 elif event == "end":
1169 def handler(tag, event=event, append=append,
1170 end=self._parser._end):
1171 append((event, end(tag)))
1172 parser.EndElementHandler = handler
1173 elif event == "start-ns":
1174 def handler(prefix, uri, event=event, append=append):
1175 try:
1176 uri = uri.encode("ascii")
1177 except UnicodeError:
1178 pass
1179 append((event, (prefix or "", uri)))
1180 parser.StartNamespaceDeclHandler = handler
1181 elif event == "end-ns":
1182 def handler(prefix, event=event, append=append):
1183 append((event, None))
1184 parser.EndNamespaceDeclHandler = handler
1185
1186 def next(self):
1187 while 1:
1188 try:
1189 item = self._events[self._index]
1190 except IndexError:
1191 if self._parser is None:
1192 self.root = self._root
1193 raise StopIteration
1194 # load event buffer
1195 del self._events[:]
1196 self._index = 0
1197 data = self._file.read(16384)
1198 if data:
1199 self._parser.feed(data)
1200 else:
1201 self._root = self._parser.close()
1202 self._parser = None
1203 else:
1204 self._index = self._index + 1
1205 return item
1206
1207 def __iter__(self):
1208 return self
1209
1210 ##
1211 # Parses an XML document from a string constant. This function can
1212 # be used to embed "XML literals" in Python code.
1213 #
1214 # @param text A string containing XML data.
1215 # @param parser An optional parser instance. If not given, the
1216 # standard {@link XMLParser} parser is used.
1217 # @return An Element instance.
1218 # @defreturn Element
1219
def XML(text, parser=None):
    # use the standard parser, unless the caller supplied one
    parser = parser or XMLParser(target=TreeBuilder())
    # the entire document is passed in one go; the result is whatever
    # the close method of the parser returns
    parser.feed(text)
    return parser.close()
1225
1226 ##
1227 # Parses an XML document from a string constant, and also returns
1228 # a dictionary which maps from element id:s to elements.
1229 #
1230 # @param text A string containing XML data.
1231 # @param parser An optional parser instance. If not given, the
1232 # standard {@link XMLParser} parser is used.
1233 # @return A tuple containing an Element instance and a dictionary.
1234 # @defreturn (Element, dictionary)
1235
def XMLID(text, parser=None):
    # use the standard parser, unless the caller supplied one
    parser = parser or XMLParser(target=TreeBuilder())
    parser.feed(text)
    root = parser.close()
    # map each non-empty "id" attribute value to its element.  if the
    # same id is used more than once, the element visited last wins
    by_id = {}
    for node in root.getiterator():
        key = node.get("id")
        if key:
            by_id[key] = node
    return root, by_id
1247
1248 ##
1249 # Parses an XML document from a string constant. Same as {@link #XML}.
1250 #
1251 # @def fromstring(text, parser=None)
1252 # @param text A string containing XML data.
1253 # @return An Element instance.
1254 # @defreturn Element
1255
1256 fromstring = XML
1257
1258 ##
1259 # Parses an XML document from a sequence of string fragments.
1260 #
1261 # @param sequence A list or other sequence containing XML data fragments.
1262 # @param parser An optional parser instance. If not given, the
1263 # standard {@link XMLParser} parser is used.
1264 # @return An Element instance.
1265 # @defreturn Element
1266 # @since 1.3
1267
def fromstringlist(sequence, parser=None):
    # use the standard parser, unless the caller supplied one
    if not parser:
        parser = XMLParser(target=TreeBuilder())
    # feed the fragments to the parser one by one, in sequence order
    feed = parser.feed
    for fragment in sequence:
        feed(fragment)
    return parser.close()
1274
1275 # --------------------------------------------------------------------
1276
1277 ##
1278 # Generic element structure builder. This builder converts a sequence
1279 # of {@link #TreeBuilder.start}, {@link #TreeBuilder.data}, and {@link
1280 # #TreeBuilder.end} method calls to a well-formed element structure.
1281 # <p>
1282 # You can use this class to build an element structure using a custom XML
1283 # parser, or a parser for some other XML-like format.
1284 #
1285 # @param element_factory Optional element factory. This factory
1286 # is called to create new Element instances, as necessary.
1287
class TreeBuilder(object):

    def __init__(self, element_factory=None):
        self._data = [] # data collector
        self._elem = [] # element stack
        self._last = None # last element
        self._tail = None # true if we're after an end tag
        if element_factory is None:
            element_factory = Element
        self._factory = element_factory

    ##
    # Flushes the builder buffers, and returns the toplevel document
    # element.
    #
    # @return An Element instance.
    # @defreturn Element

    def close(self):
        assert not self._elem, "missing end tags"
        # identity test; "!= None" would go through the comparison
        # methods of whatever the element factory produced
        assert self._last is not None, "missing toplevel element"
        return self._last

    def _flush(self):
        # attach the collected character data to the last element; as
        # tail if an end tag was seen last, as text otherwise.  data
        # collected before the first element is dropped.
        if self._data:
            if self._last is not None:
                text = "".join(self._data)
                if self._tail:
                    assert self._last.tail is None, "internal error (tail)"
                    self._last.tail = text
                else:
                    assert self._last.text is None, "internal error (text)"
                    self._last.text = text
            self._data = []

    ##
    # Adds text to the current element.
    #
    # @param data A string.  This should be either an 8-bit string
    #    containing ASCII text, or a Unicode string.

    def data(self, data):
        # just collect the data; the fragments are joined by _flush
        self._data.append(data)

    ##
    # Opens a new element.
    #
    # @param tag The element name.
    # @param attrs A dictionary containing element attributes.
    # @return The opened element.
    # @defreturn Element

    def start(self, tag, attrs):
        self._flush()
        self._last = elem = self._factory(tag, attrs)
        # add the new element to the innermost open element, if any
        if self._elem:
            self._elem[-1].append(elem)
        self._elem.append(elem)
        self._tail = 0
        return elem

    ##
    # Closes the current element.
    #
    # @param tag The element name.
    # @return The closed element.
    # @defreturn Element

    def end(self, tag):
        self._flush()
        self._last = self._elem.pop()
        assert self._last.tag == tag,\
               "end tag mismatch (expected %s, got %s)" % (
                   self._last.tag, tag)
        self._tail = 1
        return self._last
1364
1365 ##
1366 # Element structure builder for XML source data, based on the
1367 # <b>expat</b> parser.
1368 #
1369 # @keyparam target Target object. If omitted, the builder uses an
1370 # instance of the standard {@link #TreeBuilder} class.
1371 # @keyparam html Predefine HTML entities. This flag is not supported
1372 # by the current implementation.
1373 # @keyparam encoding Optional encoding. If given, the value overrides
1374 # the encoding specified in the XML file.
1375 # @see #ElementTree
1376 # @see #TreeBuilder
1377
class XMLParser(object):

    def __init__(self, html=0, target=None, encoding=None):
        # the html argument is accepted, but not used by this method.
        # fall back on the bare pyexpat module if the xml.parsers
        # package cannot be imported
        try:
            from xml.parsers import expat
        except ImportError:
            try:
                import pyexpat as expat
            except ImportError:
                raise ImportError(
                    "No module named expat; use SimpleXMLTreeBuilder instead"
                    )
        # "}" is used as namespace separator, so that qualified names
        # arrive as "uri}local"; _fixname adds the leading "{"
        parser = expat.ParserCreate(encoding, "}")
        if target is None:
            target = TreeBuilder()
        # underscored names are provided for compatibility only
        self.parser = self._parser = parser
        self.target = self._target = target
        self._error = expat.error
        self._names = {} # name memo cache
        # callbacks
        parser.DefaultHandlerExpand = self._default
        parser.StartElementHandler = self._start
        parser.EndElementHandler = self._end
        parser.CharacterDataHandler = self._data
        # let expat do the buffering, if supported
        try:
            self._parser.buffer_text = 1
        except AttributeError:
            pass
        # use new-style attribute handling, if supported
        try:
            self._parser.ordered_attributes = 1
            self._parser.specified_attributes = 1
            parser.StartElementHandler = self._start_list
        except AttributeError:
            pass
        self._doctype = None # list of doctype tokens, while inside one
        self.entity = {} # entity name -> replacement text
        try:
            self.version = "Expat %d.%d.%d" % expat.version_info
        except AttributeError:
            pass # unknown

    def _raiseerror(self, value):
        # re-raise an expat error as a ParseError, which carries the
        # error code and the (line, column) position
        err = ParseError(value)
        err.code = value.code
        err.position = value.lineno, value.offset
        raise err

    def _fixtext(self, text):
        # convert text string to ascii, if possible
        try:
            return text.encode("ascii")
        except UnicodeError:
            return text

    def _fixname(self, key):
        # expand qname, and convert name string to ascii, if possible
        try:
            name = self._names[key]
        except KeyError:
            name = key
            if "}" in name:
                name = "{" + name
            self._names[key] = name = self._fixtext(name)
        return name

    def _start(self, tag, attrib_in):
        # start tag handler, for attributes passed as a dictionary
        fixname = self._fixname
        fixtext = self._fixtext
        tag = fixname(tag)
        attrib = {}
        for key, value in attrib_in.items():
            attrib[fixname(key)] = fixtext(value)
        return self.target.start(tag, attrib)

    def _start_list(self, tag, attrib_in):
        # start tag handler, for attributes passed as a flat
        # [name, value, name, value, ...] list
        fixname = self._fixname
        fixtext = self._fixtext
        tag = fixname(tag)
        attrib = {}
        if attrib_in:
            for i in range(0, len(attrib_in), 2):
                attrib[fixname(attrib_in[i])] = fixtext(attrib_in[i+1])
        return self.target.start(tag, attrib)

    def _data(self, text):
        return self.target.data(self._fixtext(text))

    def _end(self, tag):
        return self.target.end(self._fixname(tag))

    def _default(self, text):
        # handler for everything that has no handler of its own.  used
        # to resolve entities via the entity dictionary, and to pick
        # up the contents of the doctype declaration
        prefix = text[:1]
        if prefix == "&":
            # deal with undefined entities
            try:
                self.target.data(self.entity[text[1:-1]])
            except KeyError:
                # use the error class picked up by the constructor
                # (the same one feed and close look for), instead of
                # importing expat again; the constructor may have
                # fallen back on pyexpat
                err = self._error(
                    "undefined entity %s: line %d, column %d" %
                    (text, self._parser.ErrorLineNumber,
                    self._parser.ErrorColumnNumber)
                    )
                err.code = 11 # XML_ERROR_UNDEFINED_ENTITY
                err.lineno = self._parser.ErrorLineNumber
                err.offset = self._parser.ErrorColumnNumber
                raise err
        elif prefix == "<" and text[:9] == "<!DOCTYPE":
            self._doctype = [] # inside a doctype declaration
        elif self._doctype is not None:
            # parse doctype contents
            if prefix == ">":
                self._doctype = None
                return
            text = text.strip()
            if not text:
                return
            self._doctype.append(text)
            n = len(self._doctype)
            if n > 2:
                kind = self._doctype[1]
                if kind == "PUBLIC" and n == 4:
                    name, kind, pubid, system = self._doctype
                elif kind == "SYSTEM" and n == 3:
                    name, kind, system = self._doctype
                    pubid = None
                else:
                    return
                # strip the quotes from the identifiers
                if pubid:
                    pubid = pubid[1:-1]
                if hasattr(self.target, "doctype"):
                    self.target.doctype(name, pubid, system[1:-1])
                self._doctype = None

    ##
    # Feeds data to the parser.
    #
    # @param data Encoded data.

    def feed(self, data):
        try:
            self._parser.Parse(data, 0)
        except self._error:
            # sys.exc_info is used to get at the exception instance,
            # since it works the same way in all Python versions
            self._raiseerror(sys.exc_info()[1])

    ##
    # Finishes feeding data to the parser.
    #
    # @return An element structure.
    # @defreturn Element

    def close(self):
        try:
            self._parser.Parse("", 1) # end of data
        except self._error:
            self._raiseerror(sys.exc_info()[1])
        tree = self.target.close()
        del self.target, self._parser # get rid of circular references
        return tree
1540
1541 # compatibility alias: XMLTreeBuilder is bound to the same class as XMLParser
1542 XMLTreeBuilder = XMLParser