diff options
Diffstat (limited to 'jython-tosca-parser/src/main/resources/Lib/site-packages/pip/_vendor/html5lib/serializer/htmlserializer.py')
-rw-r--r-- | jython-tosca-parser/src/main/resources/Lib/site-packages/pip/_vendor/html5lib/serializer/htmlserializer.py | 309 |
1 files changed, 309 insertions, 0 deletions
diff --git a/jython-tosca-parser/src/main/resources/Lib/site-packages/pip/_vendor/html5lib/serializer/htmlserializer.py b/jython-tosca-parser/src/main/resources/Lib/site-packages/pip/_vendor/html5lib/serializer/htmlserializer.py new file mode 100644 index 0000000..08b60df --- /dev/null +++ b/jython-tosca-parser/src/main/resources/Lib/site-packages/pip/_vendor/html5lib/serializer/htmlserializer.py @@ -0,0 +1,309 @@ +from __future__ import absolute_import, division, unicode_literals +from pip._vendor.six import text_type + +import gettext +_ = gettext.gettext + +try: + from functools import reduce +except ImportError: + pass + +from ..constants import voidElements, booleanAttributes, spaceCharacters +from ..constants import rcdataElements, entities, xmlEntities +from .. import utils +from xml.sax.saxutils import escape + +spaceCharacters = "".join(spaceCharacters) + +try: + from codecs import register_error, xmlcharrefreplace_errors +except ImportError: + unicode_encode_errors = "strict" +else: + unicode_encode_errors = "htmlentityreplace" + + encode_entity_map = {} + is_ucs4 = len("\U0010FFFF") == 1 + for k, v in list(entities.items()): + # skip multi-character entities + if ((is_ucs4 and len(v) > 1) or + (not is_ucs4 and len(v) > 2)): + continue + if v != "&": + if len(v) == 2: + v = utils.surrogatePairToCodepoint(v) + else: + v = ord(v) + if not v in encode_entity_map or k.islower(): + # prefer < over < and similarly for &, >, etc. + encode_entity_map[v] = k + + def htmlentityreplace_errors(exc): + if isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)): + res = [] + codepoints = [] + skip = False + for i, c in enumerate(exc.object[exc.start:exc.end]): + if skip: + skip = False + continue + index = i + exc.start + if utils.isSurrogatePair(exc.object[index:min([exc.end, index + 2])]): + codepoint = utils.surrogatePairToCodepoint(exc.object[index:index + 2]) + skip = True + else: + codepoint = ord(c) + codepoints.append(codepoint) + for cp in codepoints: + e = encode_entity_map.get(cp) + if e: + res.append("&") + res.append(e) + if not e.endswith(";"): + res.append(";") + else: + res.append("&#x%s;" % (hex(cp)[2:])) + return ("".join(res), exc.end) + else: + return xmlcharrefreplace_errors(exc) + + register_error(unicode_encode_errors, htmlentityreplace_errors) + + del register_error + + +class HTMLSerializer(object): + + # attribute quoting options + quote_attr_values = False + quote_char = '"' + use_best_quote_char = True + + # tag syntax options + omit_optional_tags = True + minimize_boolean_attributes = True + use_trailing_solidus = False + space_before_trailing_solidus = True + + # escaping options + escape_lt_in_attrs = False + escape_rcdata = False + resolve_entities = True + + # miscellaneous options + inject_meta_charset = True + strip_whitespace = False + sanitize = False + + options = ("quote_attr_values", "quote_char", "use_best_quote_char", + "minimize_boolean_attributes", "use_trailing_solidus", + "space_before_trailing_solidus", "omit_optional_tags", + "strip_whitespace", "inject_meta_charset", "escape_lt_in_attrs", + "escape_rcdata", "resolve_entities", "sanitize") + + def __init__(self, **kwargs): + """Initialize HTMLSerializer. + + Keyword options (default given first unless specified) include: + + inject_meta_charset=True|False + Whether it insert a meta element to define the character set of the + document. + quote_attr_values=True|False + Whether to quote attribute values that don't require quoting + per HTML5 parsing rules. + quote_char=u'"'|u"'" + Use given quote character for attribute quoting. Default is to + use double quote unless attribute value contains a double quote, + in which case single quotes are used instead. + escape_lt_in_attrs=False|True + Whether to escape < in attribute values. + escape_rcdata=False|True + Whether to escape characters that need to be escaped within normal + elements within rcdata elements such as style. + resolve_entities=True|False + Whether to resolve named character entities that appear in the + source tree. The XML predefined entities < > & " ' + are unaffected by this setting. + strip_whitespace=False|True + Whether to remove semantically meaningless whitespace. (This + compresses all whitespace to a single space except within pre.) + minimize_boolean_attributes=True|False + Shortens boolean attributes to give just the attribute value, + for example <input disabled="disabled"> becomes <input disabled>. + use_trailing_solidus=False|True + Includes a close-tag slash at the end of the start tag of void + elements (empty elements whose end tag is forbidden). E.g. <hr/>. + space_before_trailing_solidus=True|False + Places a space immediately before the closing slash in a tag + using a trailing solidus. E.g. <hr />. Requires use_trailing_solidus. + sanitize=False|True + Strip all unsafe or unknown constructs from output. + See `html5lib user documentation`_ + omit_optional_tags=True|False + Omit start/end tags that are optional. + + .. _html5lib user documentation: http://code.google.com/p/html5lib/wiki/UserDocumentation + """ + if 'quote_char' in kwargs: + self.use_best_quote_char = False + for attr in self.options: + setattr(self, attr, kwargs.get(attr, getattr(self, attr))) + self.errors = [] + self.strict = False + + def encode(self, string): + assert(isinstance(string, text_type)) + if self.encoding: + return string.encode(self.encoding, unicode_encode_errors) + else: + return string + + def encodeStrict(self, string): + assert(isinstance(string, text_type)) + if self.encoding: + return string.encode(self.encoding, "strict") + else: + return string + + def serialize(self, treewalker, encoding=None): + self.encoding = encoding + in_cdata = False + self.errors = [] + if encoding and self.inject_meta_charset: + from ..filters.inject_meta_charset import Filter + treewalker = Filter(treewalker, encoding) + # XXX: WhitespaceFilter should be used before OptionalTagFilter + # for maximum efficiently of this latter filter + if self.strip_whitespace: + from ..filters.whitespace import Filter + treewalker = Filter(treewalker) + if self.sanitize: + from ..filters.sanitizer import Filter + treewalker = Filter(treewalker) + if self.omit_optional_tags: + from ..filters.optionaltags import Filter + treewalker = Filter(treewalker) + for token in treewalker: + type = token["type"] + if type == "Doctype": + doctype = "<!DOCTYPE %s" % token["name"] + + if token["publicId"]: + doctype += ' PUBLIC "%s"' % token["publicId"] + elif token["systemId"]: + doctype += " SYSTEM" + if token["systemId"]: + if token["systemId"].find('"') >= 0: + if token["systemId"].find("'") >= 0: + self.serializeError(_("System identifer contains both single and double quote characters")) + quote_char = "'" + else: + quote_char = '"' + doctype += " %s%s%s" % (quote_char, token["systemId"], quote_char) + + doctype += ">" + yield self.encodeStrict(doctype) + + elif type in ("Characters", "SpaceCharacters"): + if type == "SpaceCharacters" or in_cdata: + if in_cdata and token["data"].find("</") >= 0: + self.serializeError(_("Unexpected </ in CDATA")) + yield self.encode(token["data"]) + else: + yield self.encode(escape(token["data"])) + + elif type in ("StartTag", "EmptyTag"): + name = token["name"] + yield self.encodeStrict("<%s" % name) + if name in rcdataElements and not self.escape_rcdata: + in_cdata = True + elif in_cdata: + self.serializeError(_("Unexpected child element of a CDATA element")) + for (attr_namespace, attr_name), attr_value in token["data"].items(): + # TODO: Add namespace support here + k = attr_name + v = attr_value + yield self.encodeStrict(' ') + + yield self.encodeStrict(k) + if not self.minimize_boolean_attributes or \ + (k not in booleanAttributes.get(name, tuple()) + and k not in booleanAttributes.get("", tuple())): + yield self.encodeStrict("=") + if self.quote_attr_values or not v: + quote_attr = True + else: + quote_attr = reduce(lambda x, y: x or (y in v), + spaceCharacters + ">\"'=", False) + v = v.replace("&", "&") + if self.escape_lt_in_attrs: + v = v.replace("<", "<") + if quote_attr: + quote_char = self.quote_char + if self.use_best_quote_char: + if "'" in v and '"' not in v: + quote_char = '"' + elif '"' in v and "'" not in v: + quote_char = "'" + if quote_char == "'": + v = v.replace("'", "'") + else: + v = v.replace('"', """) + yield self.encodeStrict(quote_char) + yield self.encode(v) + yield self.encodeStrict(quote_char) + else: + yield self.encode(v) + if name in voidElements and self.use_trailing_solidus: + if self.space_before_trailing_solidus: + yield self.encodeStrict(" /") + else: + yield self.encodeStrict("/") + yield self.encode(">") + + elif type == "EndTag": + name = token["name"] + if name in rcdataElements: + in_cdata = False + elif in_cdata: + self.serializeError(_("Unexpected child element of a CDATA element")) + yield self.encodeStrict("</%s>" % name) + + elif type == "Comment": + data = token["data"] + if data.find("--") >= 0: + self.serializeError(_("Comment contains --")) + yield self.encodeStrict("<!--%s-->" % token["data"]) + + elif type == "Entity": + name = token["name"] + key = name + ";" + if not key in entities: + self.serializeError(_("Entity %s not recognized" % name)) + if self.resolve_entities and key not in xmlEntities: + data = entities[key] + else: + data = "&%s;" % name + yield self.encodeStrict(data) + + else: + self.serializeError(token["data"]) + + def render(self, treewalker, encoding=None): + if encoding: + return b"".join(list(self.serialize(treewalker, encoding))) + else: + return "".join(list(self.serialize(treewalker))) + + def serializeError(self, data="XXX ERROR MESSAGE NEEDED"): + # XXX The idea is to make data mandatory. + self.errors.append(data) + if self.strict: + raise SerializeError + + +def SerializeError(Exception): + """Error in serialized tree""" + pass |