aboutsummaryrefslogtreecommitdiffstats
path: root/jython-tosca-parser/src/main/resources/Lib/site-packages/pip/_vendor/html5lib/filters
diff options
context:
space:
mode:
Diffstat (limited to 'jython-tosca-parser/src/main/resources/Lib/site-packages/pip/_vendor/html5lib/filters')
-rw-r--r--jython-tosca-parser/src/main/resources/Lib/site-packages/pip/_vendor/html5lib/filters/__init__.py0
-rw-r--r--jython-tosca-parser/src/main/resources/Lib/site-packages/pip/_vendor/html5lib/filters/_base.py12
-rw-r--r--jython-tosca-parser/src/main/resources/Lib/site-packages/pip/_vendor/html5lib/filters/alphabeticalattributes.py20
-rw-r--r--jython-tosca-parser/src/main/resources/Lib/site-packages/pip/_vendor/html5lib/filters/inject_meta_charset.py65
-rw-r--r--jython-tosca-parser/src/main/resources/Lib/site-packages/pip/_vendor/html5lib/filters/lint.py93
-rw-r--r--jython-tosca-parser/src/main/resources/Lib/site-packages/pip/_vendor/html5lib/filters/optionaltags.py205
-rw-r--r--jython-tosca-parser/src/main/resources/Lib/site-packages/pip/_vendor/html5lib/filters/sanitizer.py12
-rw-r--r--jython-tosca-parser/src/main/resources/Lib/site-packages/pip/_vendor/html5lib/filters/whitespace.py38
8 files changed, 445 insertions, 0 deletions
diff --git a/jython-tosca-parser/src/main/resources/Lib/site-packages/pip/_vendor/html5lib/filters/__init__.py b/jython-tosca-parser/src/main/resources/Lib/site-packages/pip/_vendor/html5lib/filters/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/jython-tosca-parser/src/main/resources/Lib/site-packages/pip/_vendor/html5lib/filters/__init__.py
diff --git a/jython-tosca-parser/src/main/resources/Lib/site-packages/pip/_vendor/html5lib/filters/_base.py b/jython-tosca-parser/src/main/resources/Lib/site-packages/pip/_vendor/html5lib/filters/_base.py
new file mode 100644
index 0000000..c7dbaed
--- /dev/null
+++ b/jython-tosca-parser/src/main/resources/Lib/site-packages/pip/_vendor/html5lib/filters/_base.py
@@ -0,0 +1,12 @@
+from __future__ import absolute_import, division, unicode_literals
+
+
+class Filter(object):
+ def __init__(self, source):
+ self.source = source
+
+ def __iter__(self):
+ return iter(self.source)
+
+ def __getattr__(self, name):
+ return getattr(self.source, name)
diff --git a/jython-tosca-parser/src/main/resources/Lib/site-packages/pip/_vendor/html5lib/filters/alphabeticalattributes.py b/jython-tosca-parser/src/main/resources/Lib/site-packages/pip/_vendor/html5lib/filters/alphabeticalattributes.py
new file mode 100644
index 0000000..fed6996
--- /dev/null
+++ b/jython-tosca-parser/src/main/resources/Lib/site-packages/pip/_vendor/html5lib/filters/alphabeticalattributes.py
@@ -0,0 +1,20 @@
+from __future__ import absolute_import, division, unicode_literals
+
+from . import _base
+
+try:
+ from collections import OrderedDict
+except ImportError:
+ from ordereddict import OrderedDict
+
+
+class Filter(_base.Filter):
+ def __iter__(self):
+ for token in _base.Filter.__iter__(self):
+ if token["type"] in ("StartTag", "EmptyTag"):
+ attrs = OrderedDict()
+ for name, value in sorted(token["data"].items(),
+ key=lambda x: x[0]):
+ attrs[name] = value
+ token["data"] = attrs
+ yield token
diff --git a/jython-tosca-parser/src/main/resources/Lib/site-packages/pip/_vendor/html5lib/filters/inject_meta_charset.py b/jython-tosca-parser/src/main/resources/Lib/site-packages/pip/_vendor/html5lib/filters/inject_meta_charset.py
new file mode 100644
index 0000000..ca33b70
--- /dev/null
+++ b/jython-tosca-parser/src/main/resources/Lib/site-packages/pip/_vendor/html5lib/filters/inject_meta_charset.py
@@ -0,0 +1,65 @@
+from __future__ import absolute_import, division, unicode_literals
+
+from . import _base
+
+
+class Filter(_base.Filter):
+ def __init__(self, source, encoding):
+ _base.Filter.__init__(self, source)
+ self.encoding = encoding
+
+ def __iter__(self):
+ state = "pre_head"
+ meta_found = (self.encoding is None)
+ pending = []
+
+ for token in _base.Filter.__iter__(self):
+ type = token["type"]
+ if type == "StartTag":
+ if token["name"].lower() == "head":
+ state = "in_head"
+
+ elif type == "EmptyTag":
+ if token["name"].lower() == "meta":
+ # replace charset with actual encoding
+ has_http_equiv_content_type = False
+ for (namespace, name), value in token["data"].items():
+ if namespace is not None:
+ continue
+ elif name.lower() == 'charset':
+ token["data"][(namespace, name)] = self.encoding
+ meta_found = True
+ break
+ elif name == 'http-equiv' and value.lower() == 'content-type':
+ has_http_equiv_content_type = True
+ else:
+ if has_http_equiv_content_type and (None, "content") in token["data"]:
+ token["data"][(None, "content")] = 'text/html; charset=%s' % self.encoding
+ meta_found = True
+
+ elif token["name"].lower() == "head" and not meta_found:
+ # insert meta into empty head
+ yield {"type": "StartTag", "name": "head",
+ "data": token["data"]}
+ yield {"type": "EmptyTag", "name": "meta",
+ "data": {(None, "charset"): self.encoding}}
+ yield {"type": "EndTag", "name": "head"}
+ meta_found = True
+ continue
+
+ elif type == "EndTag":
+ if token["name"].lower() == "head" and pending:
+ # insert meta into head (if necessary) and flush pending queue
+ yield pending.pop(0)
+ if not meta_found:
+ yield {"type": "EmptyTag", "name": "meta",
+ "data": {(None, "charset"): self.encoding}}
+ while pending:
+ yield pending.pop(0)
+ meta_found = True
+ state = "post_head"
+
+ if state == "in_head":
+ pending.append(token)
+ else:
+ yield token
diff --git a/jython-tosca-parser/src/main/resources/Lib/site-packages/pip/_vendor/html5lib/filters/lint.py b/jython-tosca-parser/src/main/resources/Lib/site-packages/pip/_vendor/html5lib/filters/lint.py
new file mode 100644
index 0000000..83ad639
--- /dev/null
+++ b/jython-tosca-parser/src/main/resources/Lib/site-packages/pip/_vendor/html5lib/filters/lint.py
@@ -0,0 +1,93 @@
+from __future__ import absolute_import, division, unicode_literals
+
+from gettext import gettext
+_ = gettext
+
+from . import _base
+from ..constants import cdataElements, rcdataElements, voidElements
+
+from ..constants import spaceCharacters
+spaceCharacters = "".join(spaceCharacters)
+
+
+class LintError(Exception):
+ pass
+
+
+class Filter(_base.Filter):
+ def __iter__(self):
+ open_elements = []
+ contentModelFlag = "PCDATA"
+ for token in _base.Filter.__iter__(self):
+ type = token["type"]
+ if type in ("StartTag", "EmptyTag"):
+ name = token["name"]
+ if contentModelFlag != "PCDATA":
+ raise LintError(_("StartTag not in PCDATA content model flag: %s") % name)
+ if not isinstance(name, str):
+ raise LintError(_("Tag name is not a string: %r") % name)
+ if not name:
+ raise LintError(_("Empty tag name"))
+ if type == "StartTag" and name in voidElements:
+ raise LintError(_("Void element reported as StartTag token: %s") % name)
+ elif type == "EmptyTag" and name not in voidElements:
+ raise LintError(_("Non-void element reported as EmptyTag token: %s") % token["name"])
+ if type == "StartTag":
+ open_elements.append(name)
+ for name, value in token["data"]:
+ if not isinstance(name, str):
+ raise LintError(_("Attribute name is not a string: %r") % name)
+ if not name:
+ raise LintError(_("Empty attribute name"))
+ if not isinstance(value, str):
+ raise LintError(_("Attribute value is not a string: %r") % value)
+ if name in cdataElements:
+ contentModelFlag = "CDATA"
+ elif name in rcdataElements:
+ contentModelFlag = "RCDATA"
+ elif name == "plaintext":
+ contentModelFlag = "PLAINTEXT"
+
+ elif type == "EndTag":
+ name = token["name"]
+ if not isinstance(name, str):
+ raise LintError(_("Tag name is not a string: %r") % name)
+ if not name:
+ raise LintError(_("Empty tag name"))
+ if name in voidElements:
+ raise LintError(_("Void element reported as EndTag token: %s") % name)
+ start_name = open_elements.pop()
+ if start_name != name:
+ raise LintError(_("EndTag (%s) does not match StartTag (%s)") % (name, start_name))
+ contentModelFlag = "PCDATA"
+
+ elif type == "Comment":
+ if contentModelFlag != "PCDATA":
+ raise LintError(_("Comment not in PCDATA content model flag"))
+
+ elif type in ("Characters", "SpaceCharacters"):
+ data = token["data"]
+ if not isinstance(data, str):
+ raise LintError(_("Attribute name is not a string: %r") % data)
+ if not data:
+ raise LintError(_("%s token with empty data") % type)
+ if type == "SpaceCharacters":
+ data = data.strip(spaceCharacters)
+ if data:
+ raise LintError(_("Non-space character(s) found in SpaceCharacters token: ") % data)
+
+ elif type == "Doctype":
+ name = token["name"]
+ if contentModelFlag != "PCDATA":
+ raise LintError(_("Doctype not in PCDATA content model flag: %s") % name)
+ if not isinstance(name, str):
+ raise LintError(_("Tag name is not a string: %r") % name)
+ # XXX: what to do with token["data"] ?
+
+ elif type in ("ParseError", "SerializeError"):
+ pass
+
+ else:
+ raise LintError(_("Unknown token type: %s") % type)
+
+ yield token
diff --git a/jython-tosca-parser/src/main/resources/Lib/site-packages/pip/_vendor/html5lib/filters/optionaltags.py b/jython-tosca-parser/src/main/resources/Lib/site-packages/pip/_vendor/html5lib/filters/optionaltags.py
new file mode 100644
index 0000000..fefe0b3
--- /dev/null
+++ b/jython-tosca-parser/src/main/resources/Lib/site-packages/pip/_vendor/html5lib/filters/optionaltags.py
@@ -0,0 +1,205 @@
+from __future__ import absolute_import, division, unicode_literals
+
+from . import _base
+
+
+class Filter(_base.Filter):
+ def slider(self):
+ previous1 = previous2 = None
+ for token in self.source:
+ if previous1 is not None:
+ yield previous2, previous1, token
+ previous2 = previous1
+ previous1 = token
+ yield previous2, previous1, None
+
+ def __iter__(self):
+ for previous, token, next in self.slider():
+ type = token["type"]
+ if type == "StartTag":
+ if (token["data"] or
+ not self.is_optional_start(token["name"], previous, next)):
+ yield token
+ elif type == "EndTag":
+ if not self.is_optional_end(token["name"], next):
+ yield token
+ else:
+ yield token
+
+ def is_optional_start(self, tagname, previous, next):
+ type = next and next["type"] or None
+ if tagname in 'html':
+ # An html element's start tag may be omitted if the first thing
+ # inside the html element is not a space character or a comment.
+ return type not in ("Comment", "SpaceCharacters")
+ elif tagname == 'head':
+ # A head element's start tag may be omitted if the first thing
+ # inside the head element is an element.
+ # XXX: we also omit the start tag if the head element is empty
+ if type in ("StartTag", "EmptyTag"):
+ return True
+ elif type == "EndTag":
+ return next["name"] == "head"
+ elif tagname == 'body':
+ # A body element's start tag may be omitted if the first thing
+ # inside the body element is not a space character or a comment,
+ # except if the first thing inside the body element is a script
+ # or style element and the node immediately preceding the body
+ # element is a head element whose end tag has been omitted.
+ if type in ("Comment", "SpaceCharacters"):
+ return False
+ elif type == "StartTag":
+ # XXX: we do not look at the preceding event, so we never omit
+ # the body element's start tag if it's followed by a script or
+ # a style element.
+ return next["name"] not in ('script', 'style')
+ else:
+ return True
+ elif tagname == 'colgroup':
+ # A colgroup element's start tag may be omitted if the first thing
+ # inside the colgroup element is a col element, and if the element
+ # is not immediately preceeded by another colgroup element whose
+ # end tag has been omitted.
+ if type in ("StartTag", "EmptyTag"):
+ # XXX: we do not look at the preceding event, so instead we never
+ # omit the colgroup element's end tag when it is immediately
+ # followed by another colgroup element. See is_optional_end.
+ return next["name"] == "col"
+ else:
+ return False
+ elif tagname == 'tbody':
+ # A tbody element's start tag may be omitted if the first thing
+ # inside the tbody element is a tr element, and if the element is
+ # not immediately preceeded by a tbody, thead, or tfoot element
+ # whose end tag has been omitted.
+ if type == "StartTag":
+ # omit the thead and tfoot elements' end tag when they are
+ # immediately followed by a tbody element. See is_optional_end.
+ if previous and previous['type'] == 'EndTag' and \
+ previous['name'] in ('tbody', 'thead', 'tfoot'):
+ return False
+ return next["name"] == 'tr'
+ else:
+ return False
+ return False
+
+ def is_optional_end(self, tagname, next):
+ type = next and next["type"] or None
+ if tagname in ('html', 'head', 'body'):
+ # An html element's end tag may be omitted if the html element
+ # is not immediately followed by a space character or a comment.
+ return type not in ("Comment", "SpaceCharacters")
+ elif tagname in ('li', 'optgroup', 'tr'):
+ # A li element's end tag may be omitted if the li element is
+ # immediately followed by another li element or if there is
+ # no more content in the parent element.
+ # An optgroup element's end tag may be omitted if the optgroup
+ # element is immediately followed by another optgroup element,
+ # or if there is no more content in the parent element.
+ # A tr element's end tag may be omitted if the tr element is
+ # immediately followed by another tr element, or if there is
+ # no more content in the parent element.
+ if type == "StartTag":
+ return next["name"] == tagname
+ else:
+ return type == "EndTag" or type is None
+ elif tagname in ('dt', 'dd'):
+ # A dt element's end tag may be omitted if the dt element is
+ # immediately followed by another dt element or a dd element.
+ # A dd element's end tag may be omitted if the dd element is
+ # immediately followed by another dd element or a dt element,
+ # or if there is no more content in the parent element.
+ if type == "StartTag":
+ return next["name"] in ('dt', 'dd')
+ elif tagname == 'dd':
+ return type == "EndTag" or type is None
+ else:
+ return False
+ elif tagname == 'p':
+ # A p element's end tag may be omitted if the p element is
+ # immediately followed by an address, article, aside,
+ # blockquote, datagrid, dialog, dir, div, dl, fieldset,
+ # footer, form, h1, h2, h3, h4, h5, h6, header, hr, menu,
+ # nav, ol, p, pre, section, table, or ul, element, or if
+ # there is no more content in the parent element.
+ if type in ("StartTag", "EmptyTag"):
+ return next["name"] in ('address', 'article', 'aside',
+ 'blockquote', 'datagrid', 'dialog',
+ 'dir', 'div', 'dl', 'fieldset', 'footer',
+ 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
+ 'header', 'hr', 'menu', 'nav', 'ol',
+ 'p', 'pre', 'section', 'table', 'ul')
+ else:
+ return type == "EndTag" or type is None
+ elif tagname == 'option':
+ # An option element's end tag may be omitted if the option
+ # element is immediately followed by another option element,
+ # or if it is immediately followed by an <code>optgroup</code>
+ # element, or if there is no more content in the parent
+ # element.
+ if type == "StartTag":
+ return next["name"] in ('option', 'optgroup')
+ else:
+ return type == "EndTag" or type is None
+ elif tagname in ('rt', 'rp'):
+ # An rt element's end tag may be omitted if the rt element is
+ # immediately followed by an rt or rp element, or if there is
+ # no more content in the parent element.
+ # An rp element's end tag may be omitted if the rp element is
+ # immediately followed by an rt or rp element, or if there is
+ # no more content in the parent element.
+ if type == "StartTag":
+ return next["name"] in ('rt', 'rp')
+ else:
+ return type == "EndTag" or type is None
+ elif tagname == 'colgroup':
+ # A colgroup element's end tag may be omitted if the colgroup
+ # element is not immediately followed by a space character or
+ # a comment.
+ if type in ("Comment", "SpaceCharacters"):
+ return False
+ elif type == "StartTag":
+ # XXX: we also look for an immediately following colgroup
+ # element. See is_optional_start.
+ return next["name"] != 'colgroup'
+ else:
+ return True
+ elif tagname in ('thead', 'tbody'):
+ # A thead element's end tag may be omitted if the thead element
+ # is immediately followed by a tbody or tfoot element.
+ # A tbody element's end tag may be omitted if the tbody element
+ # is immediately followed by a tbody or tfoot element, or if
+ # there is no more content in the parent element.
+ # A tfoot element's end tag may be omitted if the tfoot element
+ # is immediately followed by a tbody element, or if there is no
+ # more content in the parent element.
+ # XXX: we never omit the end tag when the following element is
+ # a tbody. See is_optional_start.
+ if type == "StartTag":
+ return next["name"] in ['tbody', 'tfoot']
+ elif tagname == 'tbody':
+ return type == "EndTag" or type is None
+ else:
+ return False
+ elif tagname == 'tfoot':
+ # A tfoot element's end tag may be omitted if the tfoot element
+ # is immediately followed by a tbody element, or if there is no
+ # more content in the parent element.
+ # XXX: we never omit the end tag when the following element is
+ # a tbody. See is_optional_start.
+ if type == "StartTag":
+ return next["name"] == 'tbody'
+ else:
+ return type == "EndTag" or type is None
+ elif tagname in ('td', 'th'):
+ # A td element's end tag may be omitted if the td element is
+ # immediately followed by a td or th element, or if there is
+ # no more content in the parent element.
+ # A th element's end tag may be omitted if the th element is
+ # immediately followed by a td or th element, or if there is
+ # no more content in the parent element.
+ if type == "StartTag":
+ return next["name"] in ('td', 'th')
+ else:
+ return type == "EndTag" or type is None
+ return False
diff --git a/jython-tosca-parser/src/main/resources/Lib/site-packages/pip/_vendor/html5lib/filters/sanitizer.py b/jython-tosca-parser/src/main/resources/Lib/site-packages/pip/_vendor/html5lib/filters/sanitizer.py
new file mode 100644
index 0000000..b206b54
--- /dev/null
+++ b/jython-tosca-parser/src/main/resources/Lib/site-packages/pip/_vendor/html5lib/filters/sanitizer.py
@@ -0,0 +1,12 @@
+from __future__ import absolute_import, division, unicode_literals
+
+from . import _base
+from ..sanitizer import HTMLSanitizerMixin
+
+
+class Filter(_base.Filter, HTMLSanitizerMixin):
+ def __iter__(self):
+ for token in _base.Filter.__iter__(self):
+ token = self.sanitize_token(token)
+ if token:
+ yield token
diff --git a/jython-tosca-parser/src/main/resources/Lib/site-packages/pip/_vendor/html5lib/filters/whitespace.py b/jython-tosca-parser/src/main/resources/Lib/site-packages/pip/_vendor/html5lib/filters/whitespace.py
new file mode 100644
index 0000000..dfc60ee
--- /dev/null
+++ b/jython-tosca-parser/src/main/resources/Lib/site-packages/pip/_vendor/html5lib/filters/whitespace.py
@@ -0,0 +1,38 @@
+from __future__ import absolute_import, division, unicode_literals
+
+import re
+
+from . import _base
+from ..constants import rcdataElements, spaceCharacters
+spaceCharacters = "".join(spaceCharacters)
+
+SPACES_REGEX = re.compile("[%s]+" % spaceCharacters)
+
+
+class Filter(_base.Filter):
+
+ spacePreserveElements = frozenset(["pre", "textarea"] + list(rcdataElements))
+
+ def __iter__(self):
+ preserve = 0
+ for token in _base.Filter.__iter__(self):
+ type = token["type"]
+ if type == "StartTag" \
+ and (preserve or token["name"] in self.spacePreserveElements):
+ preserve += 1
+
+ elif type == "EndTag" and preserve:
+ preserve -= 1
+
+ elif not preserve and type == "SpaceCharacters" and token["data"]:
+ # Test on token["data"] above to not introduce spaces where there were not
+ token["data"] = " "
+
+ elif not preserve and type == "Characters":
+ token["data"] = collapse_spaces(token["data"])
+
+ yield token
+
+
+def collapse_spaces(text):
+ return SPACES_REGEX.sub(' ', text)