diff options
Diffstat (limited to 'jython-tosca-parser/src/main/resources/Lib/site-packages/pip/_vendor/html5lib/treebuilders/etree_lxml.py')
-rw-r--r-- | jython-tosca-parser/src/main/resources/Lib/site-packages/pip/_vendor/html5lib/treebuilders/etree_lxml.py | 369 |
1 files changed, 369 insertions, 0 deletions
# diff --git
#   a/jython-tosca-parser/src/main/resources/Lib/site-packages/pip/_vendor/html5lib/treebuilders/etree_lxml.py
#   b/jython-tosca-parser/src/main/resources/Lib/site-packages/pip/_vendor/html5lib/treebuilders/etree_lxml.py
# new file mode 100644, index 0000000..35d08ef
"""Module for supporting the lxml.etree library. The idea here is to use as much
of the native library as possible, without using fragile hacks like custom element
names that break between releases. The downside of this is that we cannot represent
all possible trees; specifically the following are known to cause problems:

Text or comments as siblings of the root element
Doctypes with no name

When any of these things occur, we emit a DataLossWarning
"""

from __future__ import absolute_import, division, unicode_literals

import warnings
import re
import sys

from . import _base
from ..constants import DataLossWarning
from .. import constants
from . import etree as etree_builders
from .. import ihatexml

import lxml.etree as etree


# Module-level switch consulted by TreeBuilder.getDocument(): when true the
# whole ElementTree is returned, otherwise only its root element.
fullTree = True

# Splits a Clark-notation name "{namespace}local" into (namespace, local).
tag_regexp = re.compile(r"{([^}]*)}(.*)")

# The value lxml stores in ``.tag`` for comment nodes; used to recognise them.
comment_type = etree.Comment("asd").tag


class DocumentType(object):
    """Plain record of a doctype's name, public id and system id."""

    def __init__(self, name, publicId, systemId):
        self.name = name
        self.publicId = publicId
        self.systemId = systemId


class Document(object):
    """Document wrapper holding an lxml ElementTree and a child-node list."""

    def __init__(self):
        # Set by TreeBuilder.insertRoot() once the root element exists.
        self._elementTree = None
        self._childNodes = []

    def appendChild(self, element):
        """Add ``element``'s lxml node as the next sibling of the root."""
        self._elementTree.getroot().addnext(element._element)

    def _getChildNodes(self):
        return self._childNodes

    childNodes = property(_getChildNodes)


def testSerializer(element):
    """Return an indented, one-node-per-line text dump of ``element``.

    ``element`` may be an lxml ElementTree (anything with ``getroot``), an
    lxml element or comment, a text string, or an iterable fragment of those.
    Every output line starts with ``|`` followed by two spaces per nesting
    level, except the ``#document`` / ``#document-fragment`` header lines.
    """
    rv = []
    infosetFilter = ihatexml.InfosetFilter()

    def serializeElement(element, indent=0):
        if not hasattr(element, "tag"):
            if hasattr(element, "getroot"):
                # Full tree case
                rv.append("#document")
                if element.docinfo.internalDTD:
                    if not (element.docinfo.public_id or
                            element.docinfo.system_url):
                        dtd_str = "<!DOCTYPE %s>" % element.docinfo.root_name
                    else:
                        dtd_str = """<!DOCTYPE %s "%s" "%s">""" % (
                            element.docinfo.root_name,
                            element.docinfo.public_id,
                            element.docinfo.system_url)
                    rv.append("|%s%s" % (' ' * (indent + 2), dtd_str))
                # Rewind to the first top-level sibling (e.g. comments placed
                # before the root), then walk forward over all of them.
                next_element = element.getroot()
                while next_element.getprevious() is not None:
                    next_element = next_element.getprevious()
                while next_element is not None:
                    serializeElement(next_element, indent + 2)
                    next_element = next_element.getnext()
            elif isinstance(element, (str, bytes)):
                # Text in a fragment
                assert isinstance(element, str) or sys.version_info.major == 2
                rv.append("|%s\"%s\"" % (' ' * indent, element))
            else:
                # Fragment case
                rv.append("#document-fragment")
                for next_element in element:
                    serializeElement(next_element, indent + 2)
        elif element.tag == comment_type:
            rv.append("|%s<!-- %s -->" % (' ' * indent, element.text))
            if hasattr(element, "tail") and element.tail:
                rv.append("|%s\"%s\"" % (' ' * indent, element.tail))
        else:
            assert isinstance(element, etree._Element)
            nsmatch = etree_builders.tag_regexp.match(element.tag)
            if nsmatch is not None:
                ns = nsmatch.group(1)
                tag = nsmatch.group(2)
                prefix = constants.prefixes[ns]
                rv.append("|%s<%s %s>" % (' ' * indent, prefix,
                                          infosetFilter.fromXmlName(tag)))
            else:
                rv.append("|%s<%s>" % (' ' * indent,
                                       infosetFilter.fromXmlName(element.tag)))

            if hasattr(element, "attrib"):
                attributes = []
                for name, value in element.attrib.items():
                    nsmatch = tag_regexp.match(name)
                    if nsmatch is not None:
                        ns, name = nsmatch.groups()
                        name = infosetFilter.fromXmlName(name)
                        prefix = constants.prefixes[ns]
                        attr_string = "%s %s" % (prefix, name)
                    else:
                        attr_string = infosetFilter.fromXmlName(name)
                    attributes.append((attr_string, value))

                # Sorted so the dump does not depend on attribute order.
                for name, value in sorted(attributes):
                    rv.append('|%s%s="%s"' % (' ' * (indent + 2), name, value))

            if element.text:
                rv.append("|%s\"%s\"" % (' ' * (indent + 2), element.text))
            indent += 2
            for child in element:
                serializeElement(child, indent)
            # Tail text belongs to the parent, so it is emitted at the
            # element's own indentation level (indent was bumped above).
            if hasattr(element, "tail") and element.tail:
                rv.append("|%s\"%s\"" % (' ' * (indent - 2), element.tail))

    serializeElement(element, 0)

    return "\n".join(rv)


def tostring(element):
    """Serialize an element and its child nodes to a string"""
    rv = []

    def serializeElement(element):
        if not hasattr(element, "tag"):
            # No ``tag`` attribute: treated as an ElementTree, so emit the
            # doctype (if any) and then the root element.
            if element.docinfo.internalDTD:
                if element.docinfo.doctype:
                    dtd_str = element.docinfo.doctype
                else:
                    dtd_str = "<!DOCTYPE %s>" % element.docinfo.root_name
                rv.append(dtd_str)
            serializeElement(element.getroot())

        elif element.tag == comment_type:
            rv.append("<!--%s-->" % (element.text,))

        else:
            # This is assumed to be an ordinary element
            if not element.attrib:
                rv.append("<%s>" % (element.tag,))
            else:
                attr = " ".join(["%s=\"%s\"" % (name, value)
                                 for name, value in element.attrib.items()])
                rv.append("<%s %s>" % (element.tag, attr))
            if element.text:
                rv.append(element.text)

            for child in element:
                serializeElement(child)

            rv.append("</%s>" % (element.tag,))

        if hasattr(element, "tail") and element.tail:
            rv.append(element.tail)

    serializeElement(element)

    return "".join(rv)


class TreeBuilder(_base.TreeBuilder):
    """html5lib tree builder that produces an lxml.etree tree."""

    documentClass = Document
    doctypeClass = DocumentType
    elementClass = None
    commentClass = None
    fragmentClass = Document
    implementation = etree

    def __init__(self, namespaceHTMLElements, fullTree=False):
        """Create the builder and its per-instance element/comment classes.

        ``namespaceHTMLElements`` is stored and forwarded to the base class;
        ``fullTree`` is forwarded to ``etree_builders.getETreeModule``.
        """
        builder = etree_builders.getETreeModule(etree, fullTree=fullTree)
        infosetFilter = self.infosetFilter = ihatexml.InfosetFilter()
        self.namespaceHTMLElements = namespaceHTMLElements

        def coercedAttributeName(key):
            # Tuple keys carry the local name at index 1 and the namespace at
            # index 2; they become Clark-notation "{namespace}name".
            if isinstance(key, tuple):
                return "{%s}%s" % (key[2], infosetFilter.coerceAttribute(key[1]))
            return infosetFilter.coerceAttribute(key)

        class Attributes(dict):
            """dict that mirrors every entry into the wrapped lxml element."""

            def __init__(self, element, value=None):
                self._element = element
                dict.__init__(self, {} if value is None else value)
                for key, item in self.items():
                    self._element._element.attrib[coercedAttributeName(key)] = item

            def __setitem__(self, key, value):
                dict.__setitem__(self, key, value)
                self._element._element.attrib[coercedAttributeName(key)] = value

        class Element(builder.Element):
            """Element wrapper that coerces names and text via the filter."""

            def __init__(self, name, namespace):
                name = infosetFilter.coerceElement(name)
                builder.Element.__init__(self, name, namespace=namespace)
                self._attributes = Attributes(self)

            def _setName(self, name):
                self._name = infosetFilter.coerceElement(name)
                self._element.tag = self._getETreeTag(
                    self._name, self._namespace)

            def _getName(self):
                return infosetFilter.fromXmlName(self._name)

            name = property(_getName, _setName)

            def _getAttributes(self):
                return self._attributes

            def _setAttributes(self, attributes):
                self._attributes = Attributes(self, attributes)

            attributes = property(_getAttributes, _setAttributes)

            def insertText(self, data, insertBefore=None):
                data = infosetFilter.coerceCharacters(data)
                builder.Element.insertText(self, data, insertBefore)

            def appendChild(self, child):
                builder.Element.appendChild(self, child)

        class Comment(builder.Comment):
            """Comment wrapper that coerces its data via the filter."""

            def __init__(self, data):
                data = infosetFilter.coerceComment(data)
                builder.Comment.__init__(self, data)

            def _setData(self, data):
                data = infosetFilter.coerceComment(data)
                self._element.text = data

            def _getData(self):
                return self._element.text

            data = property(_getData, _setData)

        self.elementClass = Element
        # Use the coercing subclass defined above; assigning builder.Comment
        # here left that subclass unused and comment data uncoerced.
        self.commentClass = Comment
        # self.fragmentClass = builder.DocumentFragment
        _base.TreeBuilder.__init__(self, namespaceHTMLElements)

    def reset(self):
        """Reset builder state; comments are buffered until insertRoot()."""
        _base.TreeBuilder.reset(self)
        self.insertComment = self.insertCommentInitial
        self.initial_comments = []
        self.doctype = None

    def testSerializer(self, element):
        """Delegate to the module-level testSerializer()."""
        return testSerializer(element)

    def getDocument(self):
        """Return the ElementTree, or just its root element.

        NOTE: the choice is driven by the module-level ``fullTree`` flag, not
        by the ``fullTree`` argument passed to ``__init__``.
        """
        if fullTree:
            return self.document._elementTree
        else:
            return self.document._elementTree.getroot()

    def getFragment(self):
        """Return the first open element's content as a flat list.

        The list holds the element's leading text (if any), its child nodes,
        and its tail text (if any), in that order.
        """
        fragment = []
        element = self.openElements[0]._element
        if element.text:
            fragment.append(element.text)
        fragment.extend(element)
        if element.tail:
            fragment.append(element.tail)
        return fragment

    def insertDoctype(self, token):
        """Record the doctype from ``token`` for later use by insertRoot().

        Warns with DataLossWarning when the name is empty (no doctype is
        stored) or when the name had to be coerced.
        """
        name = token["name"]
        publicId = token["publicId"]
        systemId = token["systemId"]

        if not name:
            warnings.warn("lxml cannot represent empty doctype", DataLossWarning)
            self.doctype = None
        else:
            coercedName = self.infosetFilter.coerceElement(name)
            if coercedName != name:
                warnings.warn("lxml cannot represent non-xml doctype", DataLossWarning)

            doctype = self.doctypeClass(coercedName, publicId, systemId)
            self.doctype = doctype

    def insertCommentInitial(self, data, parent=None):
        """Buffer a comment token seen before the root element exists."""
        self.initial_comments.append(data)

    def insertCommentMain(self, data, parent=None):
        """Insert a comment once the root exists, warning on known data loss."""
        if (parent == self.document and
                self.document._elementTree.getroot()[-1].tag == comment_type):
            warnings.warn("lxml cannot represent adjacent comments beyond the root elements", DataLossWarning)
        super(TreeBuilder, self).insertComment(data, parent)

    def insertRoot(self, token):
        """Create the document root"""
        # Because of the way libxml2 works, it doesn't seem to be possible to
        # alter information like the doctype after the tree has been parsed.
        # Therefore we need to use the built-in parser to create our initial
        # tree, after which we can add elements like normal
        docStr = ""
        if self.doctype:
            assert self.doctype.name
            docStr += "<!DOCTYPE %s" % self.doctype.name
            if (self.doctype.publicId is not None or
                    self.doctype.systemId is not None):
                docStr += (' PUBLIC "%s" ' %
                           (self.infosetFilter.coercePubid(self.doctype.publicId or "")))
                if self.doctype.systemId:
                    sysid = self.doctype.systemId
                    if "'" in sysid and '"' in sysid:
                        warnings.warn("DOCTYPE system cannot contain single and double quotes", DataLossWarning)
                        sysid = sysid.replace("'", 'U00027')
                    # Quote with whichever quote character the id lacks.
                    if "'" in sysid:
                        docStr += '"%s"' % sysid
                    else:
                        docStr += "'%s'" % sysid
                else:
                    docStr += "''"
            docStr += ">"
            if self.doctype.name != token["name"]:
                warnings.warn("lxml cannot represent doctype with a different name to the root element", DataLossWarning)
        # Placeholder root; its tag is replaced with the real name below.
        docStr += "<THIS_SHOULD_NEVER_APPEAR_PUBLICLY/>"
        root = etree.fromstring(docStr)

        # Append the initial comments, coercing their text the same way the
        # builder's comment class does.
        for comment_token in self.initial_comments:
            comment_data = self.infosetFilter.coerceComment(comment_token["data"])
            root.addprevious(etree.Comment(comment_data))

        # Create the root document and add the ElementTree to it
        self.document = self.documentClass()
        self.document._elementTree = root.getroottree()

        # Give the root element the right name
        name = token["name"]
        namespace = token.get("namespace", self.defaultNamespace)
        if namespace is None:
            etree_tag = name
        else:
            etree_tag = "{%s}%s" % (namespace, name)
        root.tag = etree_tag

        # Add the root element to the internal child/open data structures
        root_element = self.elementClass(name, namespace)
        root_element._element = root
        self.document._childNodes.append(root_element)
        self.openElements.append(root_element)

        # Reset to the default insert comment function
        self.insertComment = self.insertCommentMain