aboutsummaryrefslogtreecommitdiffstats
path: root/jython-tosca-parser/src/main/resources/Lib/site-packages/pip/_vendor/html5lib/html5parser.py
diff options
context:
space:
mode:
authorPavel Aharoni <pa0916@att.com>2017-03-29 13:35:45 +0300
committerPavel Aharoni <pa0916@att.com>2017-03-29 13:35:45 +0300
commite2cc2530fc6d54ebc975c01a4ff887ce12f0a736 (patch)
tree38385867295c8a09fb0d7f8eaf5fa78179e5b13a /jython-tosca-parser/src/main/resources/Lib/site-packages/pip/_vendor/html5lib/html5parser.py
parentbccebaa9888906f8ff78172f62ec592956066d82 (diff)
[SDC-6] sdc-distribution-client 1707 rebasing
Change-Id: I322a05fd79beb6ba4fee4d32afffecf531b86e98 Signed-off-by: Pavel Aharoni <pa0916@att.com>
Diffstat (limited to 'jython-tosca-parser/src/main/resources/Lib/site-packages/pip/_vendor/html5lib/html5parser.py')
-rw-r--r--jython-tosca-parser/src/main/resources/Lib/site-packages/pip/_vendor/html5lib/html5parser.py2725
1 files changed, 2725 insertions, 0 deletions
diff --git a/jython-tosca-parser/src/main/resources/Lib/site-packages/pip/_vendor/html5lib/html5parser.py b/jython-tosca-parser/src/main/resources/Lib/site-packages/pip/_vendor/html5lib/html5parser.py
new file mode 100644
index 0000000..8a5acfe
--- /dev/null
+++ b/jython-tosca-parser/src/main/resources/Lib/site-packages/pip/_vendor/html5lib/html5parser.py
@@ -0,0 +1,2725 @@
+from __future__ import absolute_import, division, unicode_literals
+from pip._vendor.six import with_metaclass
+
+import types
+
+from . import inputstream
+from . import tokenizer
+
+from . import treebuilders
+from .treebuilders._base import Marker
+
+from . import utils
+from . import constants
+from .constants import spaceCharacters, asciiUpper2Lower
+from .constants import specialElements
+from .constants import headingElements
+from .constants import cdataElements, rcdataElements
+from .constants import tokenTypes, ReparseException, namespaces
+from .constants import htmlIntegrationPointElements, mathmlTextIntegrationPointElements
+
+
+def parse(doc, treebuilder="etree", encoding=None,
+ namespaceHTMLElements=True):
+ """Parse a string or file-like object into a tree"""
+ tb = treebuilders.getTreeBuilder(treebuilder)
+ p = HTMLParser(tb, namespaceHTMLElements=namespaceHTMLElements)
+ return p.parse(doc, encoding=encoding)
+
+
+def parseFragment(doc, container="div", treebuilder="etree", encoding=None,
+ namespaceHTMLElements=True):
+ tb = treebuilders.getTreeBuilder(treebuilder)
+ p = HTMLParser(tb, namespaceHTMLElements=namespaceHTMLElements)
+ return p.parseFragment(doc, container=container, encoding=encoding)
+
+
+def method_decorator_metaclass(function):
+ class Decorated(type):
+ def __new__(meta, classname, bases, classDict):
+ for attributeName, attribute in classDict.items():
+ if isinstance(attribute, types.FunctionType):
+ attribute = function(attribute)
+
+ classDict[attributeName] = attribute
+ return type.__new__(meta, classname, bases, classDict)
+ return Decorated
+
+
+class HTMLParser(object):
+ """HTML parser. Generates a tree structure from a stream of (possibly
+ malformed) HTML"""
+
+ def __init__(self, tree=None, tokenizer=tokenizer.HTMLTokenizer,
+ strict=False, namespaceHTMLElements=True, debug=False):
+ """
+ strict - raise an exception when a parse error is encountered
+
+ tree - a treebuilder class controlling the type of tree that will be
+ returned. Built in treebuilders can be accessed through
+ html5lib.treebuilders.getTreeBuilder(treeType)
+
+ tokenizer - a class that provides a stream of tokens to the treebuilder.
+ This may be replaced for e.g. a sanitizer which converts some tags to
+ text
+ """
+
+ # Raise an exception on the first error encountered
+ self.strict = strict
+
+ if tree is None:
+ tree = treebuilders.getTreeBuilder("etree")
+ self.tree = tree(namespaceHTMLElements)
+ self.tokenizer_class = tokenizer
+ self.errors = []
+
+ self.phases = dict([(name, cls(self, self.tree)) for name, cls in
+ getPhases(debug).items()])
+
+ def _parse(self, stream, innerHTML=False, container="div",
+ encoding=None, parseMeta=True, useChardet=True, **kwargs):
+
+ self.innerHTMLMode = innerHTML
+ self.container = container
+ self.tokenizer = self.tokenizer_class(stream, encoding=encoding,
+ parseMeta=parseMeta,
+ useChardet=useChardet,
+ parser=self, **kwargs)
+ self.reset()
+
+ while True:
+ try:
+ self.mainLoop()
+ break
+ except ReparseException:
+ self.reset()
+
+ def reset(self):
+ self.tree.reset()
+ self.firstStartTag = False
+ self.errors = []
+ self.log = [] # only used with debug mode
+ # "quirks" / "limited quirks" / "no quirks"
+ self.compatMode = "no quirks"
+
+ if self.innerHTMLMode:
+ self.innerHTML = self.container.lower()
+
+ if self.innerHTML in cdataElements:
+ self.tokenizer.state = self.tokenizer.rcdataState
+ elif self.innerHTML in rcdataElements:
+ self.tokenizer.state = self.tokenizer.rawtextState
+ elif self.innerHTML == 'plaintext':
+ self.tokenizer.state = self.tokenizer.plaintextState
+ else:
+ # state already is data state
+ # self.tokenizer.state = self.tokenizer.dataState
+ pass
+ self.phase = self.phases["beforeHtml"]
+ self.phase.insertHtmlElement()
+ self.resetInsertionMode()
+ else:
+ self.innerHTML = False
+ self.phase = self.phases["initial"]
+
+ self.lastPhase = None
+
+ self.beforeRCDataPhase = None
+
+ self.framesetOK = True
+
+ def isHTMLIntegrationPoint(self, element):
+ if (element.name == "annotation-xml" and
+ element.namespace == namespaces["mathml"]):
+ return ("encoding" in element.attributes and
+ element.attributes["encoding"].translate(
+ asciiUpper2Lower) in
+ ("text/html", "application/xhtml+xml"))
+ else:
+ return (element.namespace, element.name) in htmlIntegrationPointElements
+
+ def isMathMLTextIntegrationPoint(self, element):
+ return (element.namespace, element.name) in mathmlTextIntegrationPointElements
+
+ def mainLoop(self):
+ CharactersToken = tokenTypes["Characters"]
+ SpaceCharactersToken = tokenTypes["SpaceCharacters"]
+ StartTagToken = tokenTypes["StartTag"]
+ EndTagToken = tokenTypes["EndTag"]
+ CommentToken = tokenTypes["Comment"]
+ DoctypeToken = tokenTypes["Doctype"]
+ ParseErrorToken = tokenTypes["ParseError"]
+
+ for token in self.normalizedTokens():
+ new_token = token
+ while new_token is not None:
+ currentNode = self.tree.openElements[-1] if self.tree.openElements else None
+ currentNodeNamespace = currentNode.namespace if currentNode else None
+ currentNodeName = currentNode.name if currentNode else None
+
+ type = new_token["type"]
+
+ if type == ParseErrorToken:
+ self.parseError(new_token["data"], new_token.get("datavars", {}))
+ new_token = None
+ else:
+ if (len(self.tree.openElements) == 0 or
+ currentNodeNamespace == self.tree.defaultNamespace or
+ (self.isMathMLTextIntegrationPoint(currentNode) and
+ ((type == StartTagToken and
+ token["name"] not in frozenset(["mglyph", "malignmark"])) or
+ type in (CharactersToken, SpaceCharactersToken))) or
+ (currentNodeNamespace == namespaces["mathml"] and
+ currentNodeName == "annotation-xml" and
+ token["name"] == "svg") or
+ (self.isHTMLIntegrationPoint(currentNode) and
+ type in (StartTagToken, CharactersToken, SpaceCharactersToken))):
+ phase = self.phase
+ else:
+ phase = self.phases["inForeignContent"]
+
+ if type == CharactersToken:
+ new_token = phase.processCharacters(new_token)
+ elif type == SpaceCharactersToken:
+ new_token = phase.processSpaceCharacters(new_token)
+ elif type == StartTagToken:
+ new_token = phase.processStartTag(new_token)
+ elif type == EndTagToken:
+ new_token = phase.processEndTag(new_token)
+ elif type == CommentToken:
+ new_token = phase.processComment(new_token)
+ elif type == DoctypeToken:
+ new_token = phase.processDoctype(new_token)
+
+ if (type == StartTagToken and token["selfClosing"]
+ and not token["selfClosingAcknowledged"]):
+ self.parseError("non-void-element-with-trailing-solidus",
+ {"name": token["name"]})
+
+ # When the loop finishes it's EOF
+ reprocess = True
+ phases = []
+ while reprocess:
+ phases.append(self.phase)
+ reprocess = self.phase.processEOF()
+ if reprocess:
+ assert self.phase not in phases
+
+ def normalizedTokens(self):
+ for token in self.tokenizer:
+ yield self.normalizeToken(token)
+
+ def parse(self, stream, encoding=None, parseMeta=True, useChardet=True):
+ """Parse a HTML document into a well-formed tree
+
+ stream - a filelike object or string containing the HTML to be parsed
+
+ The optional encoding parameter must be a string that indicates
+ the encoding. If specified, that encoding will be used,
+ regardless of any BOM or later declaration (such as in a meta
+ element)
+ """
+ self._parse(stream, innerHTML=False, encoding=encoding,
+ parseMeta=parseMeta, useChardet=useChardet)
+ return self.tree.getDocument()
+
+ def parseFragment(self, stream, container="div", encoding=None,
+ parseMeta=False, useChardet=True):
+ """Parse a HTML fragment into a well-formed tree fragment
+
+ container - name of the element we're setting the innerHTML property
+ if set to None, default to 'div'
+
+ stream - a filelike object or string containing the HTML to be parsed
+
+ The optional encoding parameter must be a string that indicates
+ the encoding. If specified, that encoding will be used,
+ regardless of any BOM or later declaration (such as in a meta
+ element)
+ """
+ self._parse(stream, True, container=container, encoding=encoding)
+ return self.tree.getFragment()
+
+ def parseError(self, errorcode="XXX-undefined-error", datavars={}):
+ # XXX The idea is to make errorcode mandatory.
+ self.errors.append((self.tokenizer.stream.position(), errorcode, datavars))
+ if self.strict:
+ raise ParseError
+
+ def normalizeToken(self, token):
+ """ HTML5 specific normalizations to the token stream """
+
+ if token["type"] == tokenTypes["StartTag"]:
+ token["data"] = dict(token["data"][::-1])
+
+ return token
+
+ def adjustMathMLAttributes(self, token):
+ replacements = {"definitionurl": "definitionURL"}
+ for k, v in replacements.items():
+ if k in token["data"]:
+ token["data"][v] = token["data"][k]
+ del token["data"][k]
+
+ def adjustSVGAttributes(self, token):
+ replacements = {
+ "attributename": "attributeName",
+ "attributetype": "attributeType",
+ "basefrequency": "baseFrequency",
+ "baseprofile": "baseProfile",
+ "calcmode": "calcMode",
+ "clippathunits": "clipPathUnits",
+ "contentscripttype": "contentScriptType",
+ "contentstyletype": "contentStyleType",
+ "diffuseconstant": "diffuseConstant",
+ "edgemode": "edgeMode",
+ "externalresourcesrequired": "externalResourcesRequired",
+ "filterres": "filterRes",
+ "filterunits": "filterUnits",
+ "glyphref": "glyphRef",
+ "gradienttransform": "gradientTransform",
+ "gradientunits": "gradientUnits",
+ "kernelmatrix": "kernelMatrix",
+ "kernelunitlength": "kernelUnitLength",
+ "keypoints": "keyPoints",
+ "keysplines": "keySplines",
+ "keytimes": "keyTimes",
+ "lengthadjust": "lengthAdjust",
+ "limitingconeangle": "limitingConeAngle",
+ "markerheight": "markerHeight",
+ "markerunits": "markerUnits",
+ "markerwidth": "markerWidth",
+ "maskcontentunits": "maskContentUnits",
+ "maskunits": "maskUnits",
+ "numoctaves": "numOctaves",
+ "pathlength": "pathLength",
+ "patterncontentunits": "patternContentUnits",
+ "patterntransform": "patternTransform",
+ "patternunits": "patternUnits",
+ "pointsatx": "pointsAtX",
+ "pointsaty": "pointsAtY",
+ "pointsatz": "pointsAtZ",
+ "preservealpha": "preserveAlpha",
+ "preserveaspectratio": "preserveAspectRatio",
+ "primitiveunits": "primitiveUnits",
+ "refx": "refX",
+ "refy": "refY",
+ "repeatcount": "repeatCount",
+ "repeatdur": "repeatDur",
+ "requiredextensions": "requiredExtensions",
+ "requiredfeatures": "requiredFeatures",
+ "specularconstant": "specularConstant",
+ "specularexponent": "specularExponent",
+ "spreadmethod": "spreadMethod",
+ "startoffset": "startOffset",
+ "stddeviation": "stdDeviation",
+ "stitchtiles": "stitchTiles",
+ "surfacescale": "surfaceScale",
+ "systemlanguage": "systemLanguage",
+ "tablevalues": "tableValues",
+ "targetx": "targetX",
+ "targety": "targetY",
+ "textlength": "textLength",
+ "viewbox": "viewBox",
+ "viewtarget": "viewTarget",
+ "xchannelselector": "xChannelSelector",
+ "ychannelselector": "yChannelSelector",
+ "zoomandpan": "zoomAndPan"
+ }
+ for originalName in list(token["data"].keys()):
+ if originalName in replacements:
+ svgName = replacements[originalName]
+ token["data"][svgName] = token["data"][originalName]
+ del token["data"][originalName]
+
+ def adjustForeignAttributes(self, token):
+ replacements = {
+ "xlink:actuate": ("xlink", "actuate", namespaces["xlink"]),
+ "xlink:arcrole": ("xlink", "arcrole", namespaces["xlink"]),
+ "xlink:href": ("xlink", "href", namespaces["xlink"]),
+ "xlink:role": ("xlink", "role", namespaces["xlink"]),
+ "xlink:show": ("xlink", "show", namespaces["xlink"]),
+ "xlink:title": ("xlink", "title", namespaces["xlink"]),
+ "xlink:type": ("xlink", "type", namespaces["xlink"]),
+ "xml:base": ("xml", "base", namespaces["xml"]),
+ "xml:lang": ("xml", "lang", namespaces["xml"]),
+ "xml:space": ("xml", "space", namespaces["xml"]),
+ "xmlns": (None, "xmlns", namespaces["xmlns"]),
+ "xmlns:xlink": ("xmlns", "xlink", namespaces["xmlns"])
+ }
+
+ for originalName in token["data"].keys():
+ if originalName in replacements:
+ foreignName = replacements[originalName]
+ token["data"][foreignName] = token["data"][originalName]
+ del token["data"][originalName]
+
+ def reparseTokenNormal(self, token):
+ self.parser.phase()
+
+ def resetInsertionMode(self):
+ # The name of this method is mostly historical. (It's also used in the
+ # specification.)
+ last = False
+ newModes = {
+ "select": "inSelect",
+ "td": "inCell",
+ "th": "inCell",
+ "tr": "inRow",
+ "tbody": "inTableBody",
+ "thead": "inTableBody",
+ "tfoot": "inTableBody",
+ "caption": "inCaption",
+ "colgroup": "inColumnGroup",
+ "table": "inTable",
+ "head": "inBody",
+ "body": "inBody",
+ "frameset": "inFrameset",
+ "html": "beforeHead"
+ }
+ for node in self.tree.openElements[::-1]:
+ nodeName = node.name
+ new_phase = None
+ if node == self.tree.openElements[0]:
+ assert self.innerHTML
+ last = True
+ nodeName = self.innerHTML
+ # Check for conditions that should only happen in the innerHTML
+ # case
+ if nodeName in ("select", "colgroup", "head", "html"):
+ assert self.innerHTML
+
+ if not last and node.namespace != self.tree.defaultNamespace:
+ continue
+
+ if nodeName in newModes:
+ new_phase = self.phases[newModes[nodeName]]
+ break
+ elif last:
+ new_phase = self.phases["inBody"]
+ break
+
+ self.phase = new_phase
+
+ def parseRCDataRawtext(self, token, contentType):
+ """Generic RCDATA/RAWTEXT Parsing algorithm
+ contentType - RCDATA or RAWTEXT
+ """
+ assert contentType in ("RAWTEXT", "RCDATA")
+
+ self.tree.insertElement(token)
+
+ if contentType == "RAWTEXT":
+ self.tokenizer.state = self.tokenizer.rawtextState
+ else:
+ self.tokenizer.state = self.tokenizer.rcdataState
+
+ self.originalPhase = self.phase
+
+ self.phase = self.phases["text"]
+
+
+def getPhases(debug):
+ def log(function):
+ """Logger that records which phase processes each token"""
+ type_names = dict((value, key) for key, value in
+ constants.tokenTypes.items())
+
+ def wrapped(self, *args, **kwargs):
+ if function.__name__.startswith("process") and len(args) > 0:
+ token = args[0]
+ try:
+ info = {"type": type_names[token['type']]}
+ except:
+ raise
+ if token['type'] in constants.tagTokenTypes:
+ info["name"] = token['name']
+
+ self.parser.log.append((self.parser.tokenizer.state.__name__,
+ self.parser.phase.__class__.__name__,
+ self.__class__.__name__,
+ function.__name__,
+ info))
+ return function(self, *args, **kwargs)
+ else:
+ return function(self, *args, **kwargs)
+ return wrapped
+
+ def getMetaclass(use_metaclass, metaclass_func):
+ if use_metaclass:
+ return method_decorator_metaclass(metaclass_func)
+ else:
+ return type
+
+ class Phase(with_metaclass(getMetaclass(debug, log))):
+ """Base class for helper object that implements each phase of processing
+ """
+
+ def __init__(self, parser, tree):
+ self.parser = parser
+ self.tree = tree
+
+ def processEOF(self):
+ raise NotImplementedError
+
+ def processComment(self, token):
+ # For most phases the following is correct. Where it's not it will be
+ # overridden.
+ self.tree.insertComment(token, self.tree.openElements[-1])
+
+ def processDoctype(self, token):
+ self.parser.parseError("unexpected-doctype")
+
+ def processCharacters(self, token):
+ self.tree.insertText(token["data"])
+
+ def processSpaceCharacters(self, token):
+ self.tree.insertText(token["data"])
+
+ def processStartTag(self, token):
+ return self.startTagHandler[token["name"]](token)
+
+ def startTagHtml(self, token):
+ if not self.parser.firstStartTag and token["name"] == "html":
+ self.parser.parseError("non-html-root")
+ # XXX Need a check here to see if the first start tag token emitted is
+ # this token... If it's not, invoke self.parser.parseError().
+ for attr, value in token["data"].items():
+ if attr not in self.tree.openElements[0].attributes:
+ self.tree.openElements[0].attributes[attr] = value
+ self.parser.firstStartTag = False
+
+ def processEndTag(self, token):
+ return self.endTagHandler[token["name"]](token)
+
+ class InitialPhase(Phase):
+ def processSpaceCharacters(self, token):
+ pass
+
+ def processComment(self, token):
+ self.tree.insertComment(token, self.tree.document)
+
+ def processDoctype(self, token):
+ name = token["name"]
+ publicId = token["publicId"]
+ systemId = token["systemId"]
+ correct = token["correct"]
+
+ if (name != "html" or publicId is not None or
+ systemId is not None and systemId != "about:legacy-compat"):
+ self.parser.parseError("unknown-doctype")
+
+ if publicId is None:
+ publicId = ""
+
+ self.tree.insertDoctype(token)
+
+ if publicId != "":
+ publicId = publicId.translate(asciiUpper2Lower)
+
+ if (not correct or token["name"] != "html"
+ or publicId.startswith(
+ ("+//silmaril//dtd html pro v0r11 19970101//",
+ "-//advasoft ltd//dtd html 3.0 aswedit + extensions//",
+ "-//as//dtd html 3.0 aswedit + extensions//",
+ "-//ietf//dtd html 2.0 level 1//",
+ "-//ietf//dtd html 2.0 level 2//",
+ "-//ietf//dtd html 2.0 strict level 1//",
+ "-//ietf//dtd html 2.0 strict level 2//",
+ "-//ietf//dtd html 2.0 strict//",
+ "-//ietf//dtd html 2.0//",
+ "-//ietf//dtd html 2.1e//",
+ "-//ietf//dtd html 3.0//",
+ "-//ietf//dtd html 3.2 final//",
+ "-//ietf//dtd html 3.2//",
+ "-//ietf//dtd html 3//",
+ "-//ietf//dtd html level 0//",
+ "-//ietf//dtd html level 1//",
+ "-//ietf//dtd html level 2//",
+ "-//ietf//dtd html level 3//",
+ "-//ietf//dtd html strict level 0//",
+ "-//ietf//dtd html strict level 1//",
+ "-//ietf//dtd html strict level 2//",
+ "-//ietf//dtd html strict level 3//",
+ "-//ietf//dtd html strict//",
+ "-//ietf//dtd html//",
+ "-//metrius//dtd metrius presentational//",
+ "-//microsoft//dtd internet explorer 2.0 html strict//",
+ "-//microsoft//dtd internet explorer 2.0 html//",
+ "-//microsoft//dtd internet explorer 2.0 tables//",
+ "-//microsoft//dtd internet explorer 3.0 html strict//",
+ "-//microsoft//dtd internet explorer 3.0 html//",
+ "-//microsoft//dtd internet explorer 3.0 tables//",
+ "-//netscape comm. corp.//dtd html//",
+ "-//netscape comm. corp.//dtd strict html//",
+ "-//o'reilly and associates//dtd html 2.0//",
+ "-//o'reilly and associates//dtd html extended 1.0//",
+ "-//o'reilly and associates//dtd html extended relaxed 1.0//",
+ "-//softquad software//dtd hotmetal pro 6.0::19990601::extensions to html 4.0//",
+ "-//softquad//dtd hotmetal pro 4.0::19971010::extensions to html 4.0//",
+ "-//spyglass//dtd html 2.0 extended//",
+ "-//sq//dtd html 2.0 hotmetal + extensions//",
+ "-//sun microsystems corp.//dtd hotjava html//",
+ "-//sun microsystems corp.//dtd hotjava strict html//",
+ "-//w3c//dtd html 3 1995-03-24//",
+ "-//w3c//dtd html 3.2 draft//",
+ "-//w3c//dtd html 3.2 final//",
+ "-//w3c//dtd html 3.2//",
+ "-//w3c//dtd html 3.2s draft//",
+ "-//w3c//dtd html 4.0 frameset//",
+ "-//w3c//dtd html 4.0 transitional//",
+ "-//w3c//dtd html experimental 19960712//",
+ "-//w3c//dtd html experimental 970421//",
+ "-//w3c//dtd w3 html//",
+ "-//w3o//dtd w3 html 3.0//",
+ "-//webtechs//dtd mozilla html 2.0//",
+ "-//webtechs//dtd mozilla html//"))
+ or publicId in
+ ("-//w3o//dtd w3 html strict 3.0//en//",
+ "-/w3c/dtd html 4.0 transitional/en",
+ "html")
+ or publicId.startswith(
+ ("-//w3c//dtd html 4.01 frameset//",
+ "-//w3c//dtd html 4.01 transitional//")) and
+ systemId is None
+ or systemId and systemId.lower() == "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"):
+ self.parser.compatMode = "quirks"
+ elif (publicId.startswith(
+ ("-//w3c//dtd xhtml 1.0 frameset//",
+ "-//w3c//dtd xhtml 1.0 transitional//"))
+ or publicId.startswith(
+ ("-//w3c//dtd html 4.01 frameset//",
+ "-//w3c//dtd html 4.01 transitional//")) and
+ systemId is not None):
+ self.parser.compatMode = "limited quirks"
+
+ self.parser.phase = self.parser.phases["beforeHtml"]
+
+ def anythingElse(self):
+ self.parser.compatMode = "quirks"
+ self.parser.phase = self.parser.phases["beforeHtml"]
+
+ def processCharacters(self, token):
+ self.parser.parseError("expected-doctype-but-got-chars")
+ self.anythingElse()
+ return token
+
+ def processStartTag(self, token):
+ self.parser.parseError("expected-doctype-but-got-start-tag",
+ {"name": token["name"]})
+ self.anythingElse()
+ return token
+
+ def processEndTag(self, token):
+ self.parser.parseError("expected-doctype-but-got-end-tag",
+ {"name": token["name"]})
+ self.anythingElse()
+ return token
+
+ def processEOF(self):
+ self.parser.parseError("expected-doctype-but-got-eof")
+ self.anythingElse()
+ return True
+
+ class BeforeHtmlPhase(Phase):
+ # helper methods
+ def insertHtmlElement(self):
+ self.tree.insertRoot(impliedTagToken("html", "StartTag"))
+ self.parser.phase = self.parser.phases["beforeHead"]
+
+ # other
+ def processEOF(self):
+ self.insertHtmlElement()
+ return True
+
+ def processComment(self, token):
+ self.tree.insertComment(token, self.tree.document)
+
+ def processSpaceCharacters(self, token):
+ pass
+
+ def processCharacters(self, token):
+ self.insertHtmlElement()
+ return token
+
+ def processStartTag(self, token):
+ if token["name"] == "html":
+ self.parser.firstStartTag = True
+ self.insertHtmlElement()
+ return token
+
+ def processEndTag(self, token):
+ if token["name"] not in ("head", "body", "html", "br"):
+ self.parser.parseError("unexpected-end-tag-before-html",
+ {"name": token["name"]})
+ else:
+ self.insertHtmlElement()
+ return token
+
+ class BeforeHeadPhase(Phase):
+ def __init__(self, parser, tree):
+ Phase.__init__(self, parser, tree)
+
+ self.startTagHandler = utils.MethodDispatcher([
+ ("html", self.startTagHtml),
+ ("head", self.startTagHead)
+ ])
+ self.startTagHandler.default = self.startTagOther
+
+ self.endTagHandler = utils.MethodDispatcher([
+ (("head", "body", "html", "br"), self.endTagImplyHead)
+ ])
+ self.endTagHandler.default = self.endTagOther
+
+ def processEOF(self):
+ self.startTagHead(impliedTagToken("head", "StartTag"))
+ return True
+
+ def processSpaceCharacters(self, token):
+ pass
+
+ def processCharacters(self, token):
+ self.startTagHead(impliedTagToken("head", "StartTag"))
+ return token
+
+ def startTagHtml(self, token):
+ return self.parser.phases["inBody"].processStartTag(token)
+
+ def startTagHead(self, token):
+ self.tree.insertElement(token)
+ self.tree.headPointer = self.tree.openElements[-1]
+ self.parser.phase = self.parser.phases["inHead"]
+
+ def startTagOther(self, token):
+ self.startTagHead(impliedTagToken("head", "StartTag"))
+ return token
+
+ def endTagImplyHead(self, token):
+ self.startTagHead(impliedTagToken("head", "StartTag"))
+ return token
+
+ def endTagOther(self, token):
+ self.parser.parseError("end-tag-after-implied-root",
+ {"name": token["name"]})
+
+ class InHeadPhase(Phase):
+ def __init__(self, parser, tree):
+ Phase.__init__(self, parser, tree)
+
+ self.startTagHandler = utils.MethodDispatcher([
+ ("html", self.startTagHtml),
+ ("title", self.startTagTitle),
+ (("noscript", "noframes", "style"), self.startTagNoScriptNoFramesStyle),
+ ("script", self.startTagScript),
+ (("base", "basefont", "bgsound", "command", "link"),
+ self.startTagBaseLinkCommand),
+ ("meta", self.startTagMeta),
+ ("head", self.startTagHead)
+ ])
+ self.startTagHandler.default = self.startTagOther
+
+ self. endTagHandler = utils.MethodDispatcher([
+ ("head", self.endTagHead),
+ (("br", "html", "body"), self.endTagHtmlBodyBr)
+ ])
+ self.endTagHandler.default = self.endTagOther
+
+ # the real thing
+ def processEOF(self):
+ self.anythingElse()
+ return True
+
+ def processCharacters(self, token):
+ self.anythingElse()
+ return token
+
+ def startTagHtml(self, token):
+ return self.parser.phases["inBody"].processStartTag(token)
+
+ def startTagHead(self, token):
+ self.parser.parseError("two-heads-are-not-better-than-one")
+
+ def startTagBaseLinkCommand(self, token):
+ self.tree.insertElement(token)
+ self.tree.openElements.pop()
+ token["selfClosingAcknowledged"] = True
+
+ def startTagMeta(self, token):
+ self.tree.insertElement(token)
+ self.tree.openElements.pop()
+ token["selfClosingAcknowledged"] = True
+
+ attributes = token["data"]
+ if self.parser.tokenizer.stream.charEncoding[1] == "tentative":
+ if "charset" in attributes:
+ self.parser.tokenizer.stream.changeEncoding(attributes["charset"])
+ elif ("content" in attributes and
+ "http-equiv" in attributes and
+ attributes["http-equiv"].lower() == "content-type"):
+ # Encoding it as UTF-8 here is a hack, as really we should pass
+ # the abstract Unicode string, and just use the
+ # ContentAttrParser on that, but using UTF-8 allows all chars
+ # to be encoded and as a ASCII-superset works.
+ data = inputstream.EncodingBytes(attributes["content"].encode("utf-8"))
+ parser = inputstream.ContentAttrParser(data)
+ codec = parser.parse()
+ self.parser.tokenizer.stream.changeEncoding(codec)
+
+ def startTagTitle(self, token):
+ self.parser.parseRCDataRawtext(token, "RCDATA")
+
+ def startTagNoScriptNoFramesStyle(self, token):
+ # Need to decide whether to implement the scripting-disabled case
+ self.parser.parseRCDataRawtext(token, "RAWTEXT")
+
+ def startTagScript(self, token):
+ self.tree.insertElement(token)
+ self.parser.tokenizer.state = self.parser.tokenizer.scriptDataState
+ self.parser.originalPhase = self.parser.phase
+ self.parser.phase = self.parser.phases["text"]
+
+ def startTagOther(self, token):
+ self.anythingElse()
+ return token
+
+ def endTagHead(self, token):
+ node = self.parser.tree.openElements.pop()
+ assert node.name == "head", "Expected head got %s" % node.name
+ self.parser.phase = self.parser.phases["afterHead"]
+
+ def endTagHtmlBodyBr(self, token):
+ self.anythingElse()
+ return token
+
+ def endTagOther(self, token):
+ self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
+
+ def anythingElse(self):
+ self.endTagHead(impliedTagToken("head"))
+
+ # XXX If we implement a parser for which scripting is disabled we need to
+ # implement this phase.
+ #
+ # class InHeadNoScriptPhase(Phase):
+ class AfterHeadPhase(Phase):
+ def __init__(self, parser, tree):
+ Phase.__init__(self, parser, tree)
+
+ self.startTagHandler = utils.MethodDispatcher([
+ ("html", self.startTagHtml),
+ ("body", self.startTagBody),
+ ("frameset", self.startTagFrameset),
+ (("base", "basefont", "bgsound", "link", "meta", "noframes", "script",
+ "style", "title"),
+ self.startTagFromHead),
+ ("head", self.startTagHead)
+ ])
+ self.startTagHandler.default = self.startTagOther
+ self.endTagHandler = utils.MethodDispatcher([(("body", "html", "br"),
+ self.endTagHtmlBodyBr)])
+ self.endTagHandler.default = self.endTagOther
+
+ def processEOF(self):
+ self.anythingElse()
+ return True
+
+ def processCharacters(self, token):
+ self.anythingElse()
+ return token
+
+ def startTagHtml(self, token):
+ return self.parser.phases["inBody"].processStartTag(token)
+
+ def startTagBody(self, token):
+ self.parser.framesetOK = False
+ self.tree.insertElement(token)
+ self.parser.phase = self.parser.phases["inBody"]
+
+ def startTagFrameset(self, token):
+ self.tree.insertElement(token)
+ self.parser.phase = self.parser.phases["inFrameset"]
+
+ def startTagFromHead(self, token):
+ self.parser.parseError("unexpected-start-tag-out-of-my-head",
+ {"name": token["name"]})
+ self.tree.openElements.append(self.tree.headPointer)
+ self.parser.phases["inHead"].processStartTag(token)
+ for node in self.tree.openElements[::-1]:
+ if node.name == "head":
+ self.tree.openElements.remove(node)
+ break
+
+ def startTagHead(self, token):
+ self.parser.parseError("unexpected-start-tag", {"name": token["name"]})
+
+ def startTagOther(self, token):
+ self.anythingElse()
+ return token
+
+ def endTagHtmlBodyBr(self, token):
+ self.anythingElse()
+ return token
+
+ def endTagOther(self, token):
+ self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
+
+ def anythingElse(self):
+ self.tree.insertElement(impliedTagToken("body", "StartTag"))
+ self.parser.phase = self.parser.phases["inBody"]
+ self.parser.framesetOK = True
+
+ class InBodyPhase(Phase):
+ # http://www.whatwg.org/specs/web-apps/current-work/#parsing-main-inbody
+ # the really-really-really-very crazy mode
+ def __init__(self, parser, tree):
+ Phase.__init__(self, parser, tree)
+
+ # Keep a ref to this for special handling of whitespace in <pre>
+ self.processSpaceCharactersNonPre = self.processSpaceCharacters
+
+ self.startTagHandler = utils.MethodDispatcher([
+ ("html", self.startTagHtml),
+ (("base", "basefont", "bgsound", "command", "link", "meta",
+ "noframes", "script", "style", "title"),
+ self.startTagProcessInHead),
+ ("body", self.startTagBody),
+ ("frameset", self.startTagFrameset),
+ (("address", "article", "aside", "blockquote", "center", "details",
+ "details", "dir", "div", "dl", "fieldset", "figcaption", "figure",
+ "footer", "header", "hgroup", "main", "menu", "nav", "ol", "p",
+ "section", "summary", "ul"),
+ self.startTagCloseP),
+ (headingElements, self.startTagHeading),
+ (("pre", "listing"), self.startTagPreListing),
+ ("form", self.startTagForm),
+ (("li", "dd", "dt"), self.startTagListItem),
+ ("plaintext", self.startTagPlaintext),
+ ("a", self.startTagA),
+ (("b", "big", "code", "em", "font", "i", "s", "small", "strike",
+ "strong", "tt", "u"), self.startTagFormatting),
+ ("nobr", self.startTagNobr),
+ ("button", self.startTagButton),
+ (("applet", "marquee", "object"), self.startTagAppletMarqueeObject),
+ ("xmp", self.startTagXmp),
+ ("table", self.startTagTable),
+ (("area", "br", "embed", "img", "keygen", "wbr"),
+ self.startTagVoidFormatting),
+ (("param", "source", "track"), self.startTagParamSource),
+ ("input", self.startTagInput),
+ ("hr", self.startTagHr),
+ ("image", self.startTagImage),
+ ("isindex", self.startTagIsIndex),
+ ("textarea", self.startTagTextarea),
+ ("iframe", self.startTagIFrame),
+ (("noembed", "noframes", "noscript"), self.startTagRawtext),
+ ("select", self.startTagSelect),
+ (("rp", "rt"), self.startTagRpRt),
+ (("option", "optgroup"), self.startTagOpt),
+ (("math"), self.startTagMath),
+ (("svg"), self.startTagSvg),
+ (("caption", "col", "colgroup", "frame", "head",
+ "tbody", "td", "tfoot", "th", "thead",
+ "tr"), self.startTagMisplaced)
+ ])
+ self.startTagHandler.default = self.startTagOther
+
+ self.endTagHandler = utils.MethodDispatcher([
+ ("body", self.endTagBody),
+ ("html", self.endTagHtml),
+ (("address", "article", "aside", "blockquote", "button", "center",
+ "details", "dialog", "dir", "div", "dl", "fieldset", "figcaption", "figure",
+ "footer", "header", "hgroup", "listing", "main", "menu", "nav", "ol", "pre",
+ "section", "summary", "ul"), self.endTagBlock),
+ ("form", self.endTagForm),
+ ("p", self.endTagP),
+ (("dd", "dt", "li"), self.endTagListItem),
+ (headingElements, self.endTagHeading),
+ (("a", "b", "big", "code", "em", "font", "i", "nobr", "s", "small",
+ "strike", "strong", "tt", "u"), self.endTagFormatting),
+ (("applet", "marquee", "object"), self.endTagAppletMarqueeObject),
+ ("br", self.endTagBr),
+ ])
+ self.endTagHandler.default = self.endTagOther
+
+ def isMatchingFormattingElement(self, node1, node2):
+ if node1.name != node2.name or node1.namespace != node2.namespace:
+ return False
+ elif len(node1.attributes) != len(node2.attributes):
+ return False
+ else:
+ attributes1 = sorted(node1.attributes.items())
+ attributes2 = sorted(node2.attributes.items())
+ for attr1, attr2 in zip(attributes1, attributes2):
+ if attr1 != attr2:
+ return False
+ return True
+
+ # helper
+ def addFormattingElement(self, token):
+ self.tree.insertElement(token)
+ element = self.tree.openElements[-1]
+
+ matchingElements = []
+ for node in self.tree.activeFormattingElements[::-1]:
+ if node is Marker:
+ break
+ elif self.isMatchingFormattingElement(node, element):
+ matchingElements.append(node)
+
+ assert len(matchingElements) <= 3
+ if len(matchingElements) == 3:
+ self.tree.activeFormattingElements.remove(matchingElements[-1])
+ self.tree.activeFormattingElements.append(element)
+
+ # the real deal
+ def processEOF(self):
+ allowed_elements = frozenset(("dd", "dt", "li", "p", "tbody", "td",
+ "tfoot", "th", "thead", "tr", "body",
+ "html"))
+ for node in self.tree.openElements[::-1]:
+ if node.name not in allowed_elements:
+ self.parser.parseError("expected-closing-tag-but-got-eof")
+ break
+ # Stop parsing
+
+ def processSpaceCharactersDropNewline(self, token):
+ # Sometimes (start of <pre>, <listing>, and <textarea> blocks) we
+ # want to drop leading newlines
+ data = token["data"]
+ self.processSpaceCharacters = self.processSpaceCharactersNonPre
+ if (data.startswith("\n") and
+ self.tree.openElements[-1].name in ("pre", "listing", "textarea")
+ and not self.tree.openElements[-1].hasContent()):
+ data = data[1:]
+ if data:
+ self.tree.reconstructActiveFormattingElements()
+ self.tree.insertText(data)
+
+ def processCharacters(self, token):
+ if token["data"] == "\u0000":
+ # The tokenizer should always emit null on its own
+ return
+ self.tree.reconstructActiveFormattingElements()
+ self.tree.insertText(token["data"])
+ # This must be bad for performance
+ if (self.parser.framesetOK and
+ any([char not in spaceCharacters
+ for char in token["data"]])):
+ self.parser.framesetOK = False
+
+ def processSpaceCharacters(self, token):
+ self.tree.reconstructActiveFormattingElements()
+ self.tree.insertText(token["data"])
+
+ def startTagProcessInHead(self, token):
+ return self.parser.phases["inHead"].processStartTag(token)
+
+ def startTagBody(self, token):
+ self.parser.parseError("unexpected-start-tag", {"name": "body"})
+ if (len(self.tree.openElements) == 1
+ or self.tree.openElements[1].name != "body"):
+ assert self.parser.innerHTML
+ else:
+ self.parser.framesetOK = False
+ for attr, value in token["data"].items():
+ if attr not in self.tree.openElements[1].attributes:
+ self.tree.openElements[1].attributes[attr] = value
+
+ def startTagFrameset(self, token):
+ self.parser.parseError("unexpected-start-tag", {"name": "frameset"})
+ if (len(self.tree.openElements) == 1 or self.tree.openElements[1].name != "body"):
+ assert self.parser.innerHTML
+ elif not self.parser.framesetOK:
+ pass
+ else:
+ if self.tree.openElements[1].parent:
+ self.tree.openElements[1].parent.removeChild(self.tree.openElements[1])
+ while self.tree.openElements[-1].name != "html":
+ self.tree.openElements.pop()
+ self.tree.insertElement(token)
+ self.parser.phase = self.parser.phases["inFrameset"]
+
+ def startTagCloseP(self, token):
+ if self.tree.elementInScope("p", variant="button"):
+ self.endTagP(impliedTagToken("p"))
+ self.tree.insertElement(token)
+
+ def startTagPreListing(self, token):
+ if self.tree.elementInScope("p", variant="button"):
+ self.endTagP(impliedTagToken("p"))
+ self.tree.insertElement(token)
+ self.parser.framesetOK = False
+ self.processSpaceCharacters = self.processSpaceCharactersDropNewline
+
+ def startTagForm(self, token):
+ if self.tree.formPointer:
+ self.parser.parseError("unexpected-start-tag", {"name": "form"})
+ else:
+ if self.tree.elementInScope("p", variant="button"):
+ self.endTagP(impliedTagToken("p"))
+ self.tree.insertElement(token)
+ self.tree.formPointer = self.tree.openElements[-1]
+
+ def startTagListItem(self, token):
+ self.parser.framesetOK = False
+
+ stopNamesMap = {"li": ["li"],
+ "dt": ["dt", "dd"],
+ "dd": ["dt", "dd"]}
+ stopNames = stopNamesMap[token["name"]]
+ for node in reversed(self.tree.openElements):
+ if node.name in stopNames:
+ self.parser.phase.processEndTag(
+ impliedTagToken(node.name, "EndTag"))
+ break
+ if (node.nameTuple in specialElements and
+ node.name not in ("address", "div", "p")):
+ break
+
+ if self.tree.elementInScope("p", variant="button"):
+ self.parser.phase.processEndTag(
+ impliedTagToken("p", "EndTag"))
+
+ self.tree.insertElement(token)
+
+ def startTagPlaintext(self, token):
+ if self.tree.elementInScope("p", variant="button"):
+ self.endTagP(impliedTagToken("p"))
+ self.tree.insertElement(token)
+ self.parser.tokenizer.state = self.parser.tokenizer.plaintextState
+
+ def startTagHeading(self, token):
+ if self.tree.elementInScope("p", variant="button"):
+ self.endTagP(impliedTagToken("p"))
+ if self.tree.openElements[-1].name in headingElements:
+ self.parser.parseError("unexpected-start-tag", {"name": token["name"]})
+ self.tree.openElements.pop()
+ self.tree.insertElement(token)
+
+ def startTagA(self, token):
+ afeAElement = self.tree.elementInActiveFormattingElements("a")
+ if afeAElement:
+ self.parser.parseError("unexpected-start-tag-implies-end-tag",
+ {"startName": "a", "endName": "a"})
+ self.endTagFormatting(impliedTagToken("a"))
+ if afeAElement in self.tree.openElements:
+ self.tree.openElements.remove(afeAElement)
+ if afeAElement in self.tree.activeFormattingElements:
+ self.tree.activeFormattingElements.remove(afeAElement)
+ self.tree.reconstructActiveFormattingElements()
+ self.addFormattingElement(token)
+
+ def startTagFormatting(self, token):
+ self.tree.reconstructActiveFormattingElements()
+ self.addFormattingElement(token)
+
+ def startTagNobr(self, token):
+ self.tree.reconstructActiveFormattingElements()
+ if self.tree.elementInScope("nobr"):
+ self.parser.parseError("unexpected-start-tag-implies-end-tag",
+ {"startName": "nobr", "endName": "nobr"})
+ self.processEndTag(impliedTagToken("nobr"))
+ # XXX Need tests that trigger the following
+ self.tree.reconstructActiveFormattingElements()
+ self.addFormattingElement(token)
+
+ def startTagButton(self, token):
+ if self.tree.elementInScope("button"):
+ self.parser.parseError("unexpected-start-tag-implies-end-tag",
+ {"startName": "button", "endName": "button"})
+ self.processEndTag(impliedTagToken("button"))
+ return token
+ else:
+ self.tree.reconstructActiveFormattingElements()
+ self.tree.insertElement(token)
+ self.parser.framesetOK = False
+
+ def startTagAppletMarqueeObject(self, token):
+ self.tree.reconstructActiveFormattingElements()
+ self.tree.insertElement(token)
+ self.tree.activeFormattingElements.append(Marker)
+ self.parser.framesetOK = False
+
+ def startTagXmp(self, token):
+ if self.tree.elementInScope("p", variant="button"):
+ self.endTagP(impliedTagToken("p"))
+ self.tree.reconstructActiveFormattingElements()
+ self.parser.framesetOK = False
+ self.parser.parseRCDataRawtext(token, "RAWTEXT")
+
+ def startTagTable(self, token):
+ if self.parser.compatMode != "quirks":
+ if self.tree.elementInScope("p", variant="button"):
+ self.processEndTag(impliedTagToken("p"))
+ self.tree.insertElement(token)
+ self.parser.framesetOK = False
+ self.parser.phase = self.parser.phases["inTable"]
+
+ def startTagVoidFormatting(self, token):
+ self.tree.reconstructActiveFormattingElements()
+ self.tree.insertElement(token)
+ self.tree.openElements.pop()
+ token["selfClosingAcknowledged"] = True
+ self.parser.framesetOK = False
+
+ def startTagInput(self, token):
+ framesetOK = self.parser.framesetOK
+ self.startTagVoidFormatting(token)
+ if ("type" in token["data"] and
+ token["data"]["type"].translate(asciiUpper2Lower) == "hidden"):
+ # input type=hidden doesn't change framesetOK
+ self.parser.framesetOK = framesetOK
+
+ def startTagParamSource(self, token):
+ self.tree.insertElement(token)
+ self.tree.openElements.pop()
+ token["selfClosingAcknowledged"] = True
+
+ def startTagHr(self, token):
+ if self.tree.elementInScope("p", variant="button"):
+ self.endTagP(impliedTagToken("p"))
+ self.tree.insertElement(token)
+ self.tree.openElements.pop()
+ token["selfClosingAcknowledged"] = True
+ self.parser.framesetOK = False
+
+ def startTagImage(self, token):
+ # No really...
+ self.parser.parseError("unexpected-start-tag-treated-as",
+ {"originalName": "image", "newName": "img"})
+ self.processStartTag(impliedTagToken("img", "StartTag",
+ attributes=token["data"],
+ selfClosing=token["selfClosing"]))
+
+ def startTagIsIndex(self, token):
+ self.parser.parseError("deprecated-tag", {"name": "isindex"})
+ if self.tree.formPointer:
+ return
+ form_attrs = {}
+ if "action" in token["data"]:
+ form_attrs["action"] = token["data"]["action"]
+ self.processStartTag(impliedTagToken("form", "StartTag",
+ attributes=form_attrs))
+ self.processStartTag(impliedTagToken("hr", "StartTag"))
+ self.processStartTag(impliedTagToken("label", "StartTag"))
+ # XXX Localization ...
+ if "prompt" in token["data"]:
+ prompt = token["data"]["prompt"]
+ else:
+ prompt = "This is a searchable index. Enter search keywords: "
+ self.processCharacters(
+ {"type": tokenTypes["Characters"], "data": prompt})
+ attributes = token["data"].copy()
+ if "action" in attributes:
+ del attributes["action"]
+ if "prompt" in attributes:
+ del attributes["prompt"]
+ attributes["name"] = "isindex"
+ self.processStartTag(impliedTagToken("input", "StartTag",
+ attributes=attributes,
+ selfClosing=
+ token["selfClosing"]))
+ self.processEndTag(impliedTagToken("label"))
+ self.processStartTag(impliedTagToken("hr", "StartTag"))
+ self.processEndTag(impliedTagToken("form"))
+
+ def startTagTextarea(self, token):
+ self.tree.insertElement(token)
+ self.parser.tokenizer.state = self.parser.tokenizer.rcdataState
+ self.processSpaceCharacters = self.processSpaceCharactersDropNewline
+ self.parser.framesetOK = False
+
+ def startTagIFrame(self, token):
+ self.parser.framesetOK = False
+ self.startTagRawtext(token)
+
+ def startTagRawtext(self, token):
+ """iframe, noembed noframes, noscript(if scripting enabled)"""
+ self.parser.parseRCDataRawtext(token, "RAWTEXT")
+
+ def startTagOpt(self, token):
+ if self.tree.openElements[-1].name == "option":
+ self.parser.phase.processEndTag(impliedTagToken("option"))
+ self.tree.reconstructActiveFormattingElements()
+ self.parser.tree.insertElement(token)
+
+ def startTagSelect(self, token):
+ self.tree.reconstructActiveFormattingElements()
+ self.tree.insertElement(token)
+ self.parser.framesetOK = False
+ if self.parser.phase in (self.parser.phases["inTable"],
+ self.parser.phases["inCaption"],
+ self.parser.phases["inColumnGroup"],
+ self.parser.phases["inTableBody"],
+ self.parser.phases["inRow"],
+ self.parser.phases["inCell"]):
+ self.parser.phase = self.parser.phases["inSelectInTable"]
+ else:
+ self.parser.phase = self.parser.phases["inSelect"]
+
+ def startTagRpRt(self, token):
+ if self.tree.elementInScope("ruby"):
+ self.tree.generateImpliedEndTags()
+ if self.tree.openElements[-1].name != "ruby":
+ self.parser.parseError()
+ self.tree.insertElement(token)
+
+ def startTagMath(self, token):
+ self.tree.reconstructActiveFormattingElements()
+ self.parser.adjustMathMLAttributes(token)
+ self.parser.adjustForeignAttributes(token)
+ token["namespace"] = namespaces["mathml"]
+ self.tree.insertElement(token)
+ # Need to get the parse error right for the case where the token
+ # has a namespace not equal to the xmlns attribute
+ if token["selfClosing"]:
+ self.tree.openElements.pop()
+ token["selfClosingAcknowledged"] = True
+
+ def startTagSvg(self, token):
+ self.tree.reconstructActiveFormattingElements()
+ self.parser.adjustSVGAttributes(token)
+ self.parser.adjustForeignAttributes(token)
+ token["namespace"] = namespaces["svg"]
+ self.tree.insertElement(token)
+ # Need to get the parse error right for the case where the token
+ # has a namespace not equal to the xmlns attribute
+ if token["selfClosing"]:
+ self.tree.openElements.pop()
+ token["selfClosingAcknowledged"] = True
+
+ def startTagMisplaced(self, token):
+ """ Elements that should be children of other elements that have a
+ different insertion mode; here they are ignored
+ "caption", "col", "colgroup", "frame", "frameset", "head",
+ "option", "optgroup", "tbody", "td", "tfoot", "th", "thead",
+ "tr", "noscript"
+ """
+ self.parser.parseError("unexpected-start-tag-ignored", {"name": token["name"]})
+
+ def startTagOther(self, token):
+ self.tree.reconstructActiveFormattingElements()
+ self.tree.insertElement(token)
+
+ def endTagP(self, token):
+ if not self.tree.elementInScope("p", variant="button"):
+ self.startTagCloseP(impliedTagToken("p", "StartTag"))
+ self.parser.parseError("unexpected-end-tag", {"name": "p"})
+ self.endTagP(impliedTagToken("p", "EndTag"))
+ else:
+ self.tree.generateImpliedEndTags("p")
+ if self.tree.openElements[-1].name != "p":
+ self.parser.parseError("unexpected-end-tag", {"name": "p"})
+ node = self.tree.openElements.pop()
+ while node.name != "p":
+ node = self.tree.openElements.pop()
+
+ def endTagBody(self, token):
+ if not self.tree.elementInScope("body"):
+ self.parser.parseError()
+ return
+ elif self.tree.openElements[-1].name != "body":
+ for node in self.tree.openElements[2:]:
+ if node.name not in frozenset(("dd", "dt", "li", "optgroup",
+ "option", "p", "rp", "rt",
+ "tbody", "td", "tfoot",
+ "th", "thead", "tr", "body",
+ "html")):
+ # Not sure this is the correct name for the parse error
+ self.parser.parseError(
+ "expected-one-end-tag-but-got-another",
+ {"expectedName": "body", "gotName": node.name})
+ break
+ self.parser.phase = self.parser.phases["afterBody"]
+
+ def endTagHtml(self, token):
+ # We repeat the test for the body end tag token being ignored here
+ if self.tree.elementInScope("body"):
+ self.endTagBody(impliedTagToken("body"))
+ return token
+
+ def endTagBlock(self, token):
+ # Put us back in the right whitespace handling mode
+ if token["name"] == "pre":
+ self.processSpaceCharacters = self.processSpaceCharactersNonPre
+ inScope = self.tree.elementInScope(token["name"])
+ if inScope:
+ self.tree.generateImpliedEndTags()
+ if self.tree.openElements[-1].name != token["name"]:
+ self.parser.parseError("end-tag-too-early", {"name": token["name"]})
+ if inScope:
+ node = self.tree.openElements.pop()
+ while node.name != token["name"]:
+ node = self.tree.openElements.pop()
+
+ def endTagForm(self, token):
+ node = self.tree.formPointer
+ self.tree.formPointer = None
+ if node is None or not self.tree.elementInScope(node):
+ self.parser.parseError("unexpected-end-tag",
+ {"name": "form"})
+ else:
+ self.tree.generateImpliedEndTags()
+ if self.tree.openElements[-1] != node:
+ self.parser.parseError("end-tag-too-early-ignored",
+ {"name": "form"})
+ self.tree.openElements.remove(node)
+
+ def endTagListItem(self, token):
+ if token["name"] == "li":
+ variant = "list"
+ else:
+ variant = None
+ if not self.tree.elementInScope(token["name"], variant=variant):
+ self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
+ else:
+ self.tree.generateImpliedEndTags(exclude=token["name"])
+ if self.tree.openElements[-1].name != token["name"]:
+ self.parser.parseError(
+ "end-tag-too-early",
+ {"name": token["name"]})
+ node = self.tree.openElements.pop()
+ while node.name != token["name"]:
+ node = self.tree.openElements.pop()
+
+ def endTagHeading(self, token):
+ for item in headingElements:
+ if self.tree.elementInScope(item):
+ self.tree.generateImpliedEndTags()
+ break
+ if self.tree.openElements[-1].name != token["name"]:
+ self.parser.parseError("end-tag-too-early", {"name": token["name"]})
+
+ for item in headingElements:
+ if self.tree.elementInScope(item):
+ item = self.tree.openElements.pop()
+ while item.name not in headingElements:
+ item = self.tree.openElements.pop()
+ break
+
+ def endTagFormatting(self, token):
+ """The much-feared adoption agency algorithm"""
+ # http://svn.whatwg.org/webapps/complete.html#adoptionAgency revision 7867
+ # XXX Better parseError messages appreciated.
+
+ # Step 1
+ outerLoopCounter = 0
+
+ # Step 2
+ while outerLoopCounter < 8:
+
+ # Step 3
+ outerLoopCounter += 1
+
+ # Step 4:
+
+ # Let the formatting element be the last element in
+ # the list of active formatting elements that:
+ # - is between the end of the list and the last scope
+ # marker in the list, if any, or the start of the list
+ # otherwise, and
+ # - has the same tag name as the token.
+ formattingElement = self.tree.elementInActiveFormattingElements(
+ token["name"])
+ if (not formattingElement or
+ (formattingElement in self.tree.openElements and
+ not self.tree.elementInScope(formattingElement.name))):
+ # If there is no such node, then abort these steps
+ # and instead act as described in the "any other
+ # end tag" entry below.
+ self.endTagOther(token)
+ return
+
+ # Otherwise, if there is such a node, but that node is
+ # not in the stack of open elements, then this is a
+ # parse error; remove the element from the list, and
+ # abort these steps.
+ elif formattingElement not in self.tree.openElements:
+ self.parser.parseError("adoption-agency-1.2", {"name": token["name"]})
+ self.tree.activeFormattingElements.remove(formattingElement)
+ return
+
+ # Otherwise, if there is such a node, and that node is
+ # also in the stack of open elements, but the element
+ # is not in scope, then this is a parse error; ignore
+ # the token, and abort these steps.
+ elif not self.tree.elementInScope(formattingElement.name):
+ self.parser.parseError("adoption-agency-4.4", {"name": token["name"]})
+ return
+
+ # Otherwise, there is a formatting element and that
+ # element is in the stack and is in scope. If the
+ # element is not the current node, this is a parse
+ # error. In any case, proceed with the algorithm as
+ # written in the following steps.
+ else:
+ if formattingElement != self.tree.openElements[-1]:
+ self.parser.parseError("adoption-agency-1.3", {"name": token["name"]})
+
+ # Step 5:
+
+ # Let the furthest block be the topmost node in the
+ # stack of open elements that is lower in the stack
+ # than the formatting element, and is an element in
+ # the special category. There might not be one.
+ afeIndex = self.tree.openElements.index(formattingElement)
+ furthestBlock = None
+ for element in self.tree.openElements[afeIndex:]:
+ if element.nameTuple in specialElements:
+ furthestBlock = element
+ break
+
+ # Step 6:
+
+ # If there is no furthest block, then the UA must
+ # first pop all the nodes from the bottom of the stack
+ # of open elements, from the current node up to and
+ # including the formatting element, then remove the
+ # formatting element from the list of active
+ # formatting elements, and finally abort these steps.
+ if furthestBlock is None:
+ element = self.tree.openElements.pop()
+ while element != formattingElement:
+ element = self.tree.openElements.pop()
+ self.tree.activeFormattingElements.remove(element)
+ return
+
+ # Step 7
+ commonAncestor = self.tree.openElements[afeIndex - 1]
+
+ # Step 8:
+ # The bookmark is supposed to help us identify where to reinsert
+ # nodes in step 15. We have to ensure that we reinsert nodes after
+ # the node before the active formatting element. Note the bookmark
+ # can move in step 9.7
+ bookmark = self.tree.activeFormattingElements.index(formattingElement)
+
+ # Step 9
+ lastNode = node = furthestBlock
+ innerLoopCounter = 0
+
+ index = self.tree.openElements.index(node)
+ while innerLoopCounter < 3:
+ innerLoopCounter += 1
+ # Node is element before node in open elements
+ index -= 1
+ node = self.tree.openElements[index]
+ if node not in self.tree.activeFormattingElements:
+ self.tree.openElements.remove(node)
+ continue
+ # Step 9.6
+ if node == formattingElement:
+ break
+ # Step 9.7
+ if lastNode == furthestBlock:
+ bookmark = self.tree.activeFormattingElements.index(node) + 1
+ # Step 9.8
+ clone = node.cloneNode()
+ # Replace node with clone
+ self.tree.activeFormattingElements[
+ self.tree.activeFormattingElements.index(node)] = clone
+ self.tree.openElements[
+ self.tree.openElements.index(node)] = clone
+ node = clone
+ # Step 9.9
+ # Remove lastNode from its parents, if any
+ if lastNode.parent:
+ lastNode.parent.removeChild(lastNode)
+ node.appendChild(lastNode)
+ # Step 9.10
+ lastNode = node
+
+ # Step 10
+ # Foster parent lastNode if commonAncestor is a
+ # table, tbody, tfoot, thead, or tr we need to foster
+ # parent the lastNode
+ if lastNode.parent:
+ lastNode.parent.removeChild(lastNode)
+
+ if commonAncestor.name in frozenset(("table", "tbody", "tfoot", "thead", "tr")):
+ parent, insertBefore = self.tree.getTableMisnestedNodePosition()
+ parent.insertBefore(lastNode, insertBefore)
+ else:
+ commonAncestor.appendChild(lastNode)
+
+ # Step 11
+ clone = formattingElement.cloneNode()
+
+ # Step 12
+ furthestBlock.reparentChildren(clone)
+
+ # Step 13
+ furthestBlock.appendChild(clone)
+
+ # Step 14
+ self.tree.activeFormattingElements.remove(formattingElement)
+ self.tree.activeFormattingElements.insert(bookmark, clone)
+
+ # Step 15
+ self.tree.openElements.remove(formattingElement)
+ self.tree.openElements.insert(
+ self.tree.openElements.index(furthestBlock) + 1, clone)
+
+ def endTagAppletMarqueeObject(self, token):
+ if self.tree.elementInScope(token["name"]):
+ self.tree.generateImpliedEndTags()
+ if self.tree.openElements[-1].name != token["name"]:
+ self.parser.parseError("end-tag-too-early", {"name": token["name"]})
+
+ if self.tree.elementInScope(token["name"]):
+ element = self.tree.openElements.pop()
+ while element.name != token["name"]:
+ element = self.tree.openElements.pop()
+ self.tree.clearActiveFormattingElements()
+
+ def endTagBr(self, token):
+ self.parser.parseError("unexpected-end-tag-treated-as",
+ {"originalName": "br", "newName": "br element"})
+ self.tree.reconstructActiveFormattingElements()
+ self.tree.insertElement(impliedTagToken("br", "StartTag"))
+ self.tree.openElements.pop()
+
+ def endTagOther(self, token):
+ for node in self.tree.openElements[::-1]:
+ if node.name == token["name"]:
+ self.tree.generateImpliedEndTags(exclude=token["name"])
+ if self.tree.openElements[-1].name != token["name"]:
+ self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
+ while self.tree.openElements.pop() != node:
+ pass
+ break
+ else:
+ if node.nameTuple in specialElements:
+ self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
+ break
+
+ class TextPhase(Phase):
+ def __init__(self, parser, tree):
+ Phase.__init__(self, parser, tree)
+ self.startTagHandler = utils.MethodDispatcher([])
+ self.startTagHandler.default = self.startTagOther
+ self.endTagHandler = utils.MethodDispatcher([
+ ("script", self.endTagScript)])
+ self.endTagHandler.default = self.endTagOther
+
+ def processCharacters(self, token):
+ self.tree.insertText(token["data"])
+
+ def processEOF(self):
+ self.parser.parseError("expected-named-closing-tag-but-got-eof",
+ {"name": self.tree.openElements[-1].name})
+ self.tree.openElements.pop()
+ self.parser.phase = self.parser.originalPhase
+ return True
+
+ def startTagOther(self, token):
+ assert False, "Tried to process start tag %s in RCDATA/RAWTEXT mode" % token['name']
+
+ def endTagScript(self, token):
+ node = self.tree.openElements.pop()
+ assert node.name == "script"
+ self.parser.phase = self.parser.originalPhase
+ # The rest of this method is all stuff that only happens if
+ # document.write works
+
+ def endTagOther(self, token):
+ self.tree.openElements.pop()
+ self.parser.phase = self.parser.originalPhase
+
+ class InTablePhase(Phase):
+ # http://www.whatwg.org/specs/web-apps/current-work/#in-table
+ def __init__(self, parser, tree):
+ Phase.__init__(self, parser, tree)
+ self.startTagHandler = utils.MethodDispatcher([
+ ("html", self.startTagHtml),
+ ("caption", self.startTagCaption),
+ ("colgroup", self.startTagColgroup),
+ ("col", self.startTagCol),
+ (("tbody", "tfoot", "thead"), self.startTagRowGroup),
+ (("td", "th", "tr"), self.startTagImplyTbody),
+ ("table", self.startTagTable),
+ (("style", "script"), self.startTagStyleScript),
+ ("input", self.startTagInput),
+ ("form", self.startTagForm)
+ ])
+ self.startTagHandler.default = self.startTagOther
+
+ self.endTagHandler = utils.MethodDispatcher([
+ ("table", self.endTagTable),
+ (("body", "caption", "col", "colgroup", "html", "tbody", "td",
+ "tfoot", "th", "thead", "tr"), self.endTagIgnore)
+ ])
+ self.endTagHandler.default = self.endTagOther
+
+ # helper methods
+ def clearStackToTableContext(self):
+ # "clear the stack back to a table context"
+ while self.tree.openElements[-1].name not in ("table", "html"):
+ # self.parser.parseError("unexpected-implied-end-tag-in-table",
+ # {"name": self.tree.openElements[-1].name})
+ self.tree.openElements.pop()
+ # When the current node is <html> it's an innerHTML case
+
+ # processing methods
+ def processEOF(self):
+ if self.tree.openElements[-1].name != "html":
+ self.parser.parseError("eof-in-table")
+ else:
+ assert self.parser.innerHTML
+ # Stop parsing
+
+ def processSpaceCharacters(self, token):
+ originalPhase = self.parser.phase
+ self.parser.phase = self.parser.phases["inTableText"]
+ self.parser.phase.originalPhase = originalPhase
+ self.parser.phase.processSpaceCharacters(token)
+
+ def processCharacters(self, token):
+ originalPhase = self.parser.phase
+ self.parser.phase = self.parser.phases["inTableText"]
+ self.parser.phase.originalPhase = originalPhase
+ self.parser.phase.processCharacters(token)
+
+ def insertText(self, token):
+ # If we get here there must be at least one non-whitespace character
+ # Do the table magic!
+ self.tree.insertFromTable = True
+ self.parser.phases["inBody"].processCharacters(token)
+ self.tree.insertFromTable = False
+
+ def startTagCaption(self, token):
+ self.clearStackToTableContext()
+ self.tree.activeFormattingElements.append(Marker)
+ self.tree.insertElement(token)
+ self.parser.phase = self.parser.phases["inCaption"]
+
+ def startTagColgroup(self, token):
+ self.clearStackToTableContext()
+ self.tree.insertElement(token)
+ self.parser.phase = self.parser.phases["inColumnGroup"]
+
+ def startTagCol(self, token):
+ self.startTagColgroup(impliedTagToken("colgroup", "StartTag"))
+ return token
+
+ def startTagRowGroup(self, token):
+ self.clearStackToTableContext()
+ self.tree.insertElement(token)
+ self.parser.phase = self.parser.phases["inTableBody"]
+
+ def startTagImplyTbody(self, token):
+ self.startTagRowGroup(impliedTagToken("tbody", "StartTag"))
+ return token
+
+ def startTagTable(self, token):
+ self.parser.parseError("unexpected-start-tag-implies-end-tag",
+ {"startName": "table", "endName": "table"})
+ self.parser.phase.processEndTag(impliedTagToken("table"))
+ if not self.parser.innerHTML:
+ return token
+
+ def startTagStyleScript(self, token):
+ return self.parser.phases["inHead"].processStartTag(token)
+
+ def startTagInput(self, token):
+ if ("type" in token["data"] and
+ token["data"]["type"].translate(asciiUpper2Lower) == "hidden"):
+ self.parser.parseError("unexpected-hidden-input-in-table")
+ self.tree.insertElement(token)
+ # XXX associate with form
+ self.tree.openElements.pop()
+ else:
+ self.startTagOther(token)
+
+ def startTagForm(self, token):
+ self.parser.parseError("unexpected-form-in-table")
+ if self.tree.formPointer is None:
+ self.tree.insertElement(token)
+ self.tree.formPointer = self.tree.openElements[-1]
+ self.tree.openElements.pop()
+
+ def startTagOther(self, token):
+ self.parser.parseError("unexpected-start-tag-implies-table-voodoo", {"name": token["name"]})
+ # Do the table magic!
+ self.tree.insertFromTable = True
+ self.parser.phases["inBody"].processStartTag(token)
+ self.tree.insertFromTable = False
+
+ def endTagTable(self, token):
+ if self.tree.elementInScope("table", variant="table"):
+ self.tree.generateImpliedEndTags()
+ if self.tree.openElements[-1].name != "table":
+ self.parser.parseError("end-tag-too-early-named",
+ {"gotName": "table",
+ "expectedName": self.tree.openElements[-1].name})
+ while self.tree.openElements[-1].name != "table":
+ self.tree.openElements.pop()
+ self.tree.openElements.pop()
+ self.parser.resetInsertionMode()
+ else:
+ # innerHTML case
+ assert self.parser.innerHTML
+ self.parser.parseError()
+
+ def endTagIgnore(self, token):
+ self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
+
+ def endTagOther(self, token):
+ self.parser.parseError("unexpected-end-tag-implies-table-voodoo", {"name": token["name"]})
+ # Do the table magic!
+ self.tree.insertFromTable = True
+ self.parser.phases["inBody"].processEndTag(token)
+ self.tree.insertFromTable = False
+
+ class InTableTextPhase(Phase):
+ def __init__(self, parser, tree):
+ Phase.__init__(self, parser, tree)
+ self.originalPhase = None
+ self.characterTokens = []
+
+ def flushCharacters(self):
+ data = "".join([item["data"] for item in self.characterTokens])
+ if any([item not in spaceCharacters for item in data]):
+ token = {"type": tokenTypes["Characters"], "data": data}
+ self.parser.phases["inTable"].insertText(token)
+ elif data:
+ self.tree.insertText(data)
+ self.characterTokens = []
+
+ def processComment(self, token):
+ self.flushCharacters()
+ self.parser.phase = self.originalPhase
+ return token
+
+ def processEOF(self):
+ self.flushCharacters()
+ self.parser.phase = self.originalPhase
+ return True
+
+ def processCharacters(self, token):
+ if token["data"] == "\u0000":
+ return
+ self.characterTokens.append(token)
+
+ def processSpaceCharacters(self, token):
+ # pretty sure we should never reach here
+ self.characterTokens.append(token)
+ # assert False
+
+ def processStartTag(self, token):
+ self.flushCharacters()
+ self.parser.phase = self.originalPhase
+ return token
+
+ def processEndTag(self, token):
+ self.flushCharacters()
+ self.parser.phase = self.originalPhase
+ return token
+
+ class InCaptionPhase(Phase):
+ # http://www.whatwg.org/specs/web-apps/current-work/#in-caption
+ def __init__(self, parser, tree):
+ Phase.__init__(self, parser, tree)
+
+ self.startTagHandler = utils.MethodDispatcher([
+ ("html", self.startTagHtml),
+ (("caption", "col", "colgroup", "tbody", "td", "tfoot", "th",
+ "thead", "tr"), self.startTagTableElement)
+ ])
+ self.startTagHandler.default = self.startTagOther
+
+ self.endTagHandler = utils.MethodDispatcher([
+ ("caption", self.endTagCaption),
+ ("table", self.endTagTable),
+ (("body", "col", "colgroup", "html", "tbody", "td", "tfoot", "th",
+ "thead", "tr"), self.endTagIgnore)
+ ])
+ self.endTagHandler.default = self.endTagOther
+
+ def ignoreEndTagCaption(self):
+ return not self.tree.elementInScope("caption", variant="table")
+
+ def processEOF(self):
+ self.parser.phases["inBody"].processEOF()
+
+ def processCharacters(self, token):
+ return self.parser.phases["inBody"].processCharacters(token)
+
+ def startTagTableElement(self, token):
+ self.parser.parseError()
+ # XXX Have to duplicate logic here to find out if the tag is ignored
+ ignoreEndTag = self.ignoreEndTagCaption()
+ self.parser.phase.processEndTag(impliedTagToken("caption"))
+ if not ignoreEndTag:
+ return token
+
+ def startTagOther(self, token):
+ return self.parser.phases["inBody"].processStartTag(token)
+
+ def endTagCaption(self, token):
+ if not self.ignoreEndTagCaption():
+ # AT this code is quite similar to endTagTable in "InTable"
+ self.tree.generateImpliedEndTags()
+ if self.tree.openElements[-1].name != "caption":
+ self.parser.parseError("expected-one-end-tag-but-got-another",
+ {"gotName": "caption",
+ "expectedName": self.tree.openElements[-1].name})
+ while self.tree.openElements[-1].name != "caption":
+ self.tree.openElements.pop()
+ self.tree.openElements.pop()
+ self.tree.clearActiveFormattingElements()
+ self.parser.phase = self.parser.phases["inTable"]
+ else:
+ # innerHTML case
+ assert self.parser.innerHTML
+ self.parser.parseError()
+
+ def endTagTable(self, token):
+ self.parser.parseError()
+ ignoreEndTag = self.ignoreEndTagCaption()
+ self.parser.phase.processEndTag(impliedTagToken("caption"))
+ if not ignoreEndTag:
+ return token
+
+ def endTagIgnore(self, token):
+ self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
+
+ def endTagOther(self, token):
+ return self.parser.phases["inBody"].processEndTag(token)
+
+ class InColumnGroupPhase(Phase):
+ # http://www.whatwg.org/specs/web-apps/current-work/#in-column
+
+ def __init__(self, parser, tree):
+ Phase.__init__(self, parser, tree)
+
+ self.startTagHandler = utils.MethodDispatcher([
+ ("html", self.startTagHtml),
+ ("col", self.startTagCol)
+ ])
+ self.startTagHandler.default = self.startTagOther
+
+ self.endTagHandler = utils.MethodDispatcher([
+ ("colgroup", self.endTagColgroup),
+ ("col", self.endTagCol)
+ ])
+ self.endTagHandler.default = self.endTagOther
+
+ def ignoreEndTagColgroup(self):
+ return self.tree.openElements[-1].name == "html"
+
+ def processEOF(self):
+ if self.tree.openElements[-1].name == "html":
+ assert self.parser.innerHTML
+ return
+ else:
+ ignoreEndTag = self.ignoreEndTagColgroup()
+ self.endTagColgroup(impliedTagToken("colgroup"))
+ if not ignoreEndTag:
+ return True
+
+ def processCharacters(self, token):
+ ignoreEndTag = self.ignoreEndTagColgroup()
+ self.endTagColgroup(impliedTagToken("colgroup"))
+ if not ignoreEndTag:
+ return token
+
+ def startTagCol(self, token):
+ self.tree.insertElement(token)
+ self.tree.openElements.pop()
+
+ def startTagOther(self, token):
+ ignoreEndTag = self.ignoreEndTagColgroup()
+ self.endTagColgroup(impliedTagToken("colgroup"))
+ if not ignoreEndTag:
+ return token
+
+ def endTagColgroup(self, token):
+ if self.ignoreEndTagColgroup():
+ # innerHTML case
+ assert self.parser.innerHTML
+ self.parser.parseError()
+ else:
+ self.tree.openElements.pop()
+ self.parser.phase = self.parser.phases["inTable"]
+
+ def endTagCol(self, token):
+ self.parser.parseError("no-end-tag", {"name": "col"})
+
+ def endTagOther(self, token):
+ ignoreEndTag = self.ignoreEndTagColgroup()
+ self.endTagColgroup(impliedTagToken("colgroup"))
+ if not ignoreEndTag:
+ return token
+
+ class InTableBodyPhase(Phase):
+ # http://www.whatwg.org/specs/web-apps/current-work/#in-table0
+ def __init__(self, parser, tree):
+ Phase.__init__(self, parser, tree)
+ self.startTagHandler = utils.MethodDispatcher([
+ ("html", self.startTagHtml),
+ ("tr", self.startTagTr),
+ (("td", "th"), self.startTagTableCell),
+ (("caption", "col", "colgroup", "tbody", "tfoot", "thead"),
+ self.startTagTableOther)
+ ])
+ self.startTagHandler.default = self.startTagOther
+
+ self.endTagHandler = utils.MethodDispatcher([
+ (("tbody", "tfoot", "thead"), self.endTagTableRowGroup),
+ ("table", self.endTagTable),
+ (("body", "caption", "col", "colgroup", "html", "td", "th",
+ "tr"), self.endTagIgnore)
+ ])
+ self.endTagHandler.default = self.endTagOther
+
+ # helper methods
+ def clearStackToTableBodyContext(self):
+ while self.tree.openElements[-1].name not in ("tbody", "tfoot",
+ "thead", "html"):
+ # self.parser.parseError("unexpected-implied-end-tag-in-table",
+ # {"name": self.tree.openElements[-1].name})
+ self.tree.openElements.pop()
+ if self.tree.openElements[-1].name == "html":
+ assert self.parser.innerHTML
+
+ # the rest
+ def processEOF(self):
+ self.parser.phases["inTable"].processEOF()
+
+ def processSpaceCharacters(self, token):
+ return self.parser.phases["inTable"].processSpaceCharacters(token)
+
+ def processCharacters(self, token):
+ return self.parser.phases["inTable"].processCharacters(token)
+
+ def startTagTr(self, token):
+ self.clearStackToTableBodyContext()
+ self.tree.insertElement(token)
+ self.parser.phase = self.parser.phases["inRow"]
+
+ def startTagTableCell(self, token):
+ self.parser.parseError("unexpected-cell-in-table-body",
+ {"name": token["name"]})
+ self.startTagTr(impliedTagToken("tr", "StartTag"))
+ return token
+
+ def startTagTableOther(self, token):
+ # XXX AT Any ideas on how to share this with endTagTable?
+ if (self.tree.elementInScope("tbody", variant="table") or
+ self.tree.elementInScope("thead", variant="table") or
+ self.tree.elementInScope("tfoot", variant="table")):
+ self.clearStackToTableBodyContext()
+ self.endTagTableRowGroup(
+ impliedTagToken(self.tree.openElements[-1].name))
+ return token
+ else:
+ # innerHTML case
+ assert self.parser.innerHTML
+ self.parser.parseError()
+
+ def startTagOther(self, token):
+ return self.parser.phases["inTable"].processStartTag(token)
+
+ def endTagTableRowGroup(self, token):
+ if self.tree.elementInScope(token["name"], variant="table"):
+ self.clearStackToTableBodyContext()
+ self.tree.openElements.pop()
+ self.parser.phase = self.parser.phases["inTable"]
+ else:
+ self.parser.parseError("unexpected-end-tag-in-table-body",
+ {"name": token["name"]})
+
+ def endTagTable(self, token):
+ if (self.tree.elementInScope("tbody", variant="table") or
+ self.tree.elementInScope("thead", variant="table") or
+ self.tree.elementInScope("tfoot", variant="table")):
+ self.clearStackToTableBodyContext()
+ self.endTagTableRowGroup(
+ impliedTagToken(self.tree.openElements[-1].name))
+ return token
+ else:
+ # innerHTML case
+ assert self.parser.innerHTML
+ self.parser.parseError()
+
+ def endTagIgnore(self, token):
+ self.parser.parseError("unexpected-end-tag-in-table-body",
+ {"name": token["name"]})
+
+ def endTagOther(self, token):
+ return self.parser.phases["inTable"].processEndTag(token)
+
+ class InRowPhase(Phase):
+ # http://www.whatwg.org/specs/web-apps/current-work/#in-row
+ def __init__(self, parser, tree):
+ Phase.__init__(self, parser, tree)
+ self.startTagHandler = utils.MethodDispatcher([
+ ("html", self.startTagHtml),
+ (("td", "th"), self.startTagTableCell),
+ (("caption", "col", "colgroup", "tbody", "tfoot", "thead",
+ "tr"), self.startTagTableOther)
+ ])
+ self.startTagHandler.default = self.startTagOther
+
+ self.endTagHandler = utils.MethodDispatcher([
+ ("tr", self.endTagTr),
+ ("table", self.endTagTable),
+ (("tbody", "tfoot", "thead"), self.endTagTableRowGroup),
+ (("body", "caption", "col", "colgroup", "html", "td", "th"),
+ self.endTagIgnore)
+ ])
+ self.endTagHandler.default = self.endTagOther
+
+ # helper methods (XXX unify this with other table helper methods)
+ def clearStackToTableRowContext(self):
+ while self.tree.openElements[-1].name not in ("tr", "html"):
+ self.parser.parseError("unexpected-implied-end-tag-in-table-row",
+ {"name": self.tree.openElements[-1].name})
+ self.tree.openElements.pop()
+
+ def ignoreEndTagTr(self):
+ return not self.tree.elementInScope("tr", variant="table")
+
+ # the rest
+ def processEOF(self):
+ self.parser.phases["inTable"].processEOF()
+
+ def processSpaceCharacters(self, token):
+ return self.parser.phases["inTable"].processSpaceCharacters(token)
+
+ def processCharacters(self, token):
+ return self.parser.phases["inTable"].processCharacters(token)
+
+ def startTagTableCell(self, token):
+ self.clearStackToTableRowContext()
+ self.tree.insertElement(token)
+ self.parser.phase = self.parser.phases["inCell"]
+ self.tree.activeFormattingElements.append(Marker)
+
+ def startTagTableOther(self, token):
+ ignoreEndTag = self.ignoreEndTagTr()
+ self.endTagTr(impliedTagToken("tr"))
+ # XXX how are we sure it's always ignored in the innerHTML case?
+ if not ignoreEndTag:
+ return token
+
+ def startTagOther(self, token):
+ return self.parser.phases["inTable"].processStartTag(token)
+
+ def endTagTr(self, token):
+ if not self.ignoreEndTagTr():
+ self.clearStackToTableRowContext()
+ self.tree.openElements.pop()
+ self.parser.phase = self.parser.phases["inTableBody"]
+ else:
+ # innerHTML case
+ assert self.parser.innerHTML
+ self.parser.parseError()
+
+ def endTagTable(self, token):
+ ignoreEndTag = self.ignoreEndTagTr()
+ self.endTagTr(impliedTagToken("tr"))
+ # Reprocess the current tag if the tr end tag was not ignored
+ # XXX how are we sure it's always ignored in the innerHTML case?
+ if not ignoreEndTag:
+ return token
+
+ def endTagTableRowGroup(self, token):
+ if self.tree.elementInScope(token["name"], variant="table"):
+ self.endTagTr(impliedTagToken("tr"))
+ return token
+ else:
+ self.parser.parseError()
+
+ def endTagIgnore(self, token):
+ self.parser.parseError("unexpected-end-tag-in-table-row",
+ {"name": token["name"]})
+
+ def endTagOther(self, token):
+ return self.parser.phases["inTable"].processEndTag(token)
+
+ class InCellPhase(Phase):
+ # http://www.whatwg.org/specs/web-apps/current-work/#in-cell
+ def __init__(self, parser, tree):
+ Phase.__init__(self, parser, tree)
+ self.startTagHandler = utils.MethodDispatcher([
+ ("html", self.startTagHtml),
+ (("caption", "col", "colgroup", "tbody", "td", "tfoot", "th",
+ "thead", "tr"), self.startTagTableOther)
+ ])
+ self.startTagHandler.default = self.startTagOther
+
+ self.endTagHandler = utils.MethodDispatcher([
+ (("td", "th"), self.endTagTableCell),
+ (("body", "caption", "col", "colgroup", "html"), self.endTagIgnore),
+ (("table", "tbody", "tfoot", "thead", "tr"), self.endTagImply)
+ ])
+ self.endTagHandler.default = self.endTagOther
+
+ # helper
+ def closeCell(self):
+ if self.tree.elementInScope("td", variant="table"):
+ self.endTagTableCell(impliedTagToken("td"))
+ elif self.tree.elementInScope("th", variant="table"):
+ self.endTagTableCell(impliedTagToken("th"))
+
+ # the rest
+ def processEOF(self):
+ self.parser.phases["inBody"].processEOF()
+
+ def processCharacters(self, token):
+ return self.parser.phases["inBody"].processCharacters(token)
+
+ def startTagTableOther(self, token):
+ if (self.tree.elementInScope("td", variant="table") or
+ self.tree.elementInScope("th", variant="table")):
+ self.closeCell()
+ return token
+ else:
+ # innerHTML case
+ assert self.parser.innerHTML
+ self.parser.parseError()
+
+ def startTagOther(self, token):
+ return self.parser.phases["inBody"].processStartTag(token)
+
+ def endTagTableCell(self, token):
+ if self.tree.elementInScope(token["name"], variant="table"):
+ self.tree.generateImpliedEndTags(token["name"])
+ if self.tree.openElements[-1].name != token["name"]:
+ self.parser.parseError("unexpected-cell-end-tag",
+ {"name": token["name"]})
+ while True:
+ node = self.tree.openElements.pop()
+ if node.name == token["name"]:
+ break
+ else:
+ self.tree.openElements.pop()
+ self.tree.clearActiveFormattingElements()
+ self.parser.phase = self.parser.phases["inRow"]
+ else:
+ self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
+
+ def endTagIgnore(self, token):
+ self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
+
+ def endTagImply(self, token):
+ if self.tree.elementInScope(token["name"], variant="table"):
+ self.closeCell()
+ return token
+ else:
+ # sometimes innerHTML case
+ self.parser.parseError()
+
+ def endTagOther(self, token):
+ return self.parser.phases["inBody"].processEndTag(token)
+
+ class InSelectPhase(Phase):
+ def __init__(self, parser, tree):
+ Phase.__init__(self, parser, tree)
+
+ self.startTagHandler = utils.MethodDispatcher([
+ ("html", self.startTagHtml),
+ ("option", self.startTagOption),
+ ("optgroup", self.startTagOptgroup),
+ ("select", self.startTagSelect),
+ (("input", "keygen", "textarea"), self.startTagInput),
+ ("script", self.startTagScript)
+ ])
+ self.startTagHandler.default = self.startTagOther
+
+ self.endTagHandler = utils.MethodDispatcher([
+ ("option", self.endTagOption),
+ ("optgroup", self.endTagOptgroup),
+ ("select", self.endTagSelect)
+ ])
+ self.endTagHandler.default = self.endTagOther
+
+ # http://www.whatwg.org/specs/web-apps/current-work/#in-select
+ def processEOF(self):
+ if self.tree.openElements[-1].name != "html":
+ self.parser.parseError("eof-in-select")
+ else:
+ assert self.parser.innerHTML
+
+ def processCharacters(self, token):
+ if token["data"] == "\u0000":
+ return
+ self.tree.insertText(token["data"])
+
+ def startTagOption(self, token):
+ # We need to imply </option> if <option> is the current node.
+ if self.tree.openElements[-1].name == "option":
+ self.tree.openElements.pop()
+ self.tree.insertElement(token)
+
+ def startTagOptgroup(self, token):
+ if self.tree.openElements[-1].name == "option":
+ self.tree.openElements.pop()
+ if self.tree.openElements[-1].name == "optgroup":
+ self.tree.openElements.pop()
+ self.tree.insertElement(token)
+
+ def startTagSelect(self, token):
+ self.parser.parseError("unexpected-select-in-select")
+ self.endTagSelect(impliedTagToken("select"))
+
+ def startTagInput(self, token):
+ self.parser.parseError("unexpected-input-in-select")
+ if self.tree.elementInScope("select", variant="select"):
+ self.endTagSelect(impliedTagToken("select"))
+ return token
+ else:
+ assert self.parser.innerHTML
+
+ def startTagScript(self, token):
+ return self.parser.phases["inHead"].processStartTag(token)
+
+ def startTagOther(self, token):
+ self.parser.parseError("unexpected-start-tag-in-select",
+ {"name": token["name"]})
+
+ def endTagOption(self, token):
+ if self.tree.openElements[-1].name == "option":
+ self.tree.openElements.pop()
+ else:
+ self.parser.parseError("unexpected-end-tag-in-select",
+ {"name": "option"})
+
+ def endTagOptgroup(self, token):
+ # </optgroup> implicitly closes <option>
+ if (self.tree.openElements[-1].name == "option" and
+ self.tree.openElements[-2].name == "optgroup"):
+ self.tree.openElements.pop()
+ # It also closes </optgroup>
+ if self.tree.openElements[-1].name == "optgroup":
+ self.tree.openElements.pop()
+ # But nothing else
+ else:
+ self.parser.parseError("unexpected-end-tag-in-select",
+ {"name": "optgroup"})
+
+ def endTagSelect(self, token):
+ if self.tree.elementInScope("select", variant="select"):
+ node = self.tree.openElements.pop()
+ while node.name != "select":
+ node = self.tree.openElements.pop()
+ self.parser.resetInsertionMode()
+ else:
+ # innerHTML case
+ assert self.parser.innerHTML
+ self.parser.parseError()
+
+ def endTagOther(self, token):
+ self.parser.parseError("unexpected-end-tag-in-select",
+ {"name": token["name"]})
+
+ class InSelectInTablePhase(Phase):
+ def __init__(self, parser, tree):
+ Phase.__init__(self, parser, tree)
+
+ self.startTagHandler = utils.MethodDispatcher([
+ (("caption", "table", "tbody", "tfoot", "thead", "tr", "td", "th"),
+ self.startTagTable)
+ ])
+ self.startTagHandler.default = self.startTagOther
+
+ self.endTagHandler = utils.MethodDispatcher([
+ (("caption", "table", "tbody", "tfoot", "thead", "tr", "td", "th"),
+ self.endTagTable)
+ ])
+ self.endTagHandler.default = self.endTagOther
+
+ def processEOF(self):
+ self.parser.phases["inSelect"].processEOF()
+
+ def processCharacters(self, token):
+ return self.parser.phases["inSelect"].processCharacters(token)
+
+ def startTagTable(self, token):
+ self.parser.parseError("unexpected-table-element-start-tag-in-select-in-table", {"name": token["name"]})
+ self.endTagOther(impliedTagToken("select"))
+ return token
+
+ def startTagOther(self, token):
+ return self.parser.phases["inSelect"].processStartTag(token)
+
+ def endTagTable(self, token):
+ self.parser.parseError("unexpected-table-element-end-tag-in-select-in-table", {"name": token["name"]})
+ if self.tree.elementInScope(token["name"], variant="table"):
+ self.endTagOther(impliedTagToken("select"))
+ return token
+
+ def endTagOther(self, token):
+ return self.parser.phases["inSelect"].processEndTag(token)
+
+ class InForeignContentPhase(Phase):
+ breakoutElements = frozenset(["b", "big", "blockquote", "body", "br",
+ "center", "code", "dd", "div", "dl", "dt",
+ "em", "embed", "h1", "h2", "h3",
+ "h4", "h5", "h6", "head", "hr", "i", "img",
+ "li", "listing", "menu", "meta", "nobr",
+ "ol", "p", "pre", "ruby", "s", "small",
+ "span", "strong", "strike", "sub", "sup",
+ "table", "tt", "u", "ul", "var"])
+
+ def __init__(self, parser, tree):
+ Phase.__init__(self, parser, tree)
+
+ def adjustSVGTagNames(self, token):
+ replacements = {"altglyph": "altGlyph",
+ "altglyphdef": "altGlyphDef",
+ "altglyphitem": "altGlyphItem",
+ "animatecolor": "animateColor",
+ "animatemotion": "animateMotion",
+ "animatetransform": "animateTransform",
+ "clippath": "clipPath",
+ "feblend": "feBlend",
+ "fecolormatrix": "feColorMatrix",
+ "fecomponenttransfer": "feComponentTransfer",
+ "fecomposite": "feComposite",
+ "feconvolvematrix": "feConvolveMatrix",
+ "fediffuselighting": "feDiffuseLighting",
+ "fedisplacementmap": "feDisplacementMap",
+ "fedistantlight": "feDistantLight",
+ "feflood": "feFlood",
+ "fefunca": "feFuncA",
+ "fefuncb": "feFuncB",
+ "fefuncg": "feFuncG",
+ "fefuncr": "feFuncR",
+ "fegaussianblur": "feGaussianBlur",
+ "feimage": "feImage",
+ "femerge": "feMerge",
+ "femergenode": "feMergeNode",
+ "femorphology": "feMorphology",
+ "feoffset": "feOffset",
+ "fepointlight": "fePointLight",
+ "fespecularlighting": "feSpecularLighting",
+ "fespotlight": "feSpotLight",
+ "fetile": "feTile",
+ "feturbulence": "feTurbulence",
+ "foreignobject": "foreignObject",
+ "glyphref": "glyphRef",
+ "lineargradient": "linearGradient",
+ "radialgradient": "radialGradient",
+ "textpath": "textPath"}
+
+ if token["name"] in replacements:
+ token["name"] = replacements[token["name"]]
+
+ def processCharacters(self, token):
+ if token["data"] == "\u0000":
+ token["data"] = "\uFFFD"
+ elif (self.parser.framesetOK and
+ any(char not in spaceCharacters for char in token["data"])):
+ self.parser.framesetOK = False
+ Phase.processCharacters(self, token)
+
+ def processStartTag(self, token):
+ currentNode = self.tree.openElements[-1]
+ if (token["name"] in self.breakoutElements or
+ (token["name"] == "font" and
+ set(token["data"].keys()) & set(["color", "face", "size"]))):
+ self.parser.parseError("unexpected-html-element-in-foreign-content",
+ {"name": token["name"]})
+ while (self.tree.openElements[-1].namespace !=
+ self.tree.defaultNamespace and
+ not self.parser.isHTMLIntegrationPoint(self.tree.openElements[-1]) and
+ not self.parser.isMathMLTextIntegrationPoint(self.tree.openElements[-1])):
+ self.tree.openElements.pop()
+ return token
+
+ else:
+ if currentNode.namespace == namespaces["mathml"]:
+ self.parser.adjustMathMLAttributes(token)
+ elif currentNode.namespace == namespaces["svg"]:
+ self.adjustSVGTagNames(token)
+ self.parser.adjustSVGAttributes(token)
+ self.parser.adjustForeignAttributes(token)
+ token["namespace"] = currentNode.namespace
+ self.tree.insertElement(token)
+ if token["selfClosing"]:
+ self.tree.openElements.pop()
+ token["selfClosingAcknowledged"] = True
+
+ def processEndTag(self, token):
+ nodeIndex = len(self.tree.openElements) - 1
+ node = self.tree.openElements[-1]
+ if node.name != token["name"]:
+ self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
+
+ while True:
+ if node.name.translate(asciiUpper2Lower) == token["name"]:
+ # XXX this isn't in the spec but it seems necessary
+ if self.parser.phase == self.parser.phases["inTableText"]:
+ self.parser.phase.flushCharacters()
+ self.parser.phase = self.parser.phase.originalPhase
+ while self.tree.openElements.pop() != node:
+ assert self.tree.openElements
+ new_token = None
+ break
+ nodeIndex -= 1
+
+ node = self.tree.openElements[nodeIndex]
+ if node.namespace != self.tree.defaultNamespace:
+ continue
+ else:
+ new_token = self.parser.phase.processEndTag(token)
+ break
+ return new_token
+
+ class AfterBodyPhase(Phase):
+ def __init__(self, parser, tree):
+ Phase.__init__(self, parser, tree)
+
+ self.startTagHandler = utils.MethodDispatcher([
+ ("html", self.startTagHtml)
+ ])
+ self.startTagHandler.default = self.startTagOther
+
+ self.endTagHandler = utils.MethodDispatcher([("html", self.endTagHtml)])
+ self.endTagHandler.default = self.endTagOther
+
+ def processEOF(self):
+ # Stop parsing
+ pass
+
+ def processComment(self, token):
+ # This is needed because data is to be appended to the <html> element
+ # here and not to whatever is currently open.
+ self.tree.insertComment(token, self.tree.openElements[0])
+
+ def processCharacters(self, token):
+ self.parser.parseError("unexpected-char-after-body")
+ self.parser.phase = self.parser.phases["inBody"]
+ return token
+
+ def startTagHtml(self, token):
+ return self.parser.phases["inBody"].processStartTag(token)
+
+ def startTagOther(self, token):
+ self.parser.parseError("unexpected-start-tag-after-body",
+ {"name": token["name"]})
+ self.parser.phase = self.parser.phases["inBody"]
+ return token
+
+ def endTagHtml(self, name):
+ if self.parser.innerHTML:
+ self.parser.parseError("unexpected-end-tag-after-body-innerhtml")
+ else:
+ self.parser.phase = self.parser.phases["afterAfterBody"]
+
+ def endTagOther(self, token):
+ self.parser.parseError("unexpected-end-tag-after-body",
+ {"name": token["name"]})
+ self.parser.phase = self.parser.phases["inBody"]
+ return token
+
+ class InFramesetPhase(Phase):
+ # http://www.whatwg.org/specs/web-apps/current-work/#in-frameset
+ def __init__(self, parser, tree):
+ Phase.__init__(self, parser, tree)
+
+ self.startTagHandler = utils.MethodDispatcher([
+ ("html", self.startTagHtml),
+ ("frameset", self.startTagFrameset),
+ ("frame", self.startTagFrame),
+ ("noframes", self.startTagNoframes)
+ ])
+ self.startTagHandler.default = self.startTagOther
+
+ self.endTagHandler = utils.MethodDispatcher([
+ ("frameset", self.endTagFrameset)
+ ])
+ self.endTagHandler.default = self.endTagOther
+
+ def processEOF(self):
+ if self.tree.openElements[-1].name != "html":
+ self.parser.parseError("eof-in-frameset")
+ else:
+ assert self.parser.innerHTML
+
+ def processCharacters(self, token):
+ self.parser.parseError("unexpected-char-in-frameset")
+
+ def startTagFrameset(self, token):
+ self.tree.insertElement(token)
+
+ def startTagFrame(self, token):
+ self.tree.insertElement(token)
+ self.tree.openElements.pop()
+
+ def startTagNoframes(self, token):
+ return self.parser.phases["inBody"].processStartTag(token)
+
+ def startTagOther(self, token):
+ self.parser.parseError("unexpected-start-tag-in-frameset",
+ {"name": token["name"]})
+
+ def endTagFrameset(self, token):
+ if self.tree.openElements[-1].name == "html":
+ # innerHTML case
+ self.parser.parseError("unexpected-frameset-in-frameset-innerhtml")
+ else:
+ self.tree.openElements.pop()
+ if (not self.parser.innerHTML and
+ self.tree.openElements[-1].name != "frameset"):
+ # If we're not in innerHTML mode and the the current node is not a
+ # "frameset" element (anymore) then switch.
+ self.parser.phase = self.parser.phases["afterFrameset"]
+
+ def endTagOther(self, token):
+ self.parser.parseError("unexpected-end-tag-in-frameset",
+ {"name": token["name"]})
+
+ class AfterFramesetPhase(Phase):
+ # http://www.whatwg.org/specs/web-apps/current-work/#after3
+ def __init__(self, parser, tree):
+ Phase.__init__(self, parser, tree)
+
+ self.startTagHandler = utils.MethodDispatcher([
+ ("html", self.startTagHtml),
+ ("noframes", self.startTagNoframes)
+ ])
+ self.startTagHandler.default = self.startTagOther
+
+ self.endTagHandler = utils.MethodDispatcher([
+ ("html", self.endTagHtml)
+ ])
+ self.endTagHandler.default = self.endTagOther
+
+ def processEOF(self):
+ # Stop parsing
+ pass
+
+ def processCharacters(self, token):
+ self.parser.parseError("unexpected-char-after-frameset")
+
+ def startTagNoframes(self, token):
+ return self.parser.phases["inHead"].processStartTag(token)
+
+ def startTagOther(self, token):
+ self.parser.parseError("unexpected-start-tag-after-frameset",
+ {"name": token["name"]})
+
+ def endTagHtml(self, token):
+ self.parser.phase = self.parser.phases["afterAfterFrameset"]
+
+ def endTagOther(self, token):
+ self.parser.parseError("unexpected-end-tag-after-frameset",
+ {"name": token["name"]})
+
+ class AfterAfterBodyPhase(Phase):
+ def __init__(self, parser, tree):
+ Phase.__init__(self, parser, tree)
+
+ self.startTagHandler = utils.MethodDispatcher([
+ ("html", self.startTagHtml)
+ ])
+ self.startTagHandler.default = self.startTagOther
+
+ def processEOF(self):
+ pass
+
+ def processComment(self, token):
+ self.tree.insertComment(token, self.tree.document)
+
+ def processSpaceCharacters(self, token):
+ return self.parser.phases["inBody"].processSpaceCharacters(token)
+
+ def processCharacters(self, token):
+ self.parser.parseError("expected-eof-but-got-char")
+ self.parser.phase = self.parser.phases["inBody"]
+ return token
+
+ def startTagHtml(self, token):
+ return self.parser.phases["inBody"].processStartTag(token)
+
+ def startTagOther(self, token):
+ self.parser.parseError("expected-eof-but-got-start-tag",
+ {"name": token["name"]})
+ self.parser.phase = self.parser.phases["inBody"]
+ return token
+
+ def processEndTag(self, token):
+ self.parser.parseError("expected-eof-but-got-end-tag",
+ {"name": token["name"]})
+ self.parser.phase = self.parser.phases["inBody"]
+ return token
+
+ class AfterAfterFramesetPhase(Phase):
+ def __init__(self, parser, tree):
+ Phase.__init__(self, parser, tree)
+
+ self.startTagHandler = utils.MethodDispatcher([
+ ("html", self.startTagHtml),
+ ("noframes", self.startTagNoFrames)
+ ])
+ self.startTagHandler.default = self.startTagOther
+
+ def processEOF(self):
+ pass
+
+ def processComment(self, token):
+ self.tree.insertComment(token, self.tree.document)
+
+ def processSpaceCharacters(self, token):
+ return self.parser.phases["inBody"].processSpaceCharacters(token)
+
+ def processCharacters(self, token):
+ self.parser.parseError("expected-eof-but-got-char")
+
+ def startTagHtml(self, token):
+ return self.parser.phases["inBody"].processStartTag(token)
+
+ def startTagNoFrames(self, token):
+ return self.parser.phases["inHead"].processStartTag(token)
+
+ def startTagOther(self, token):
+ self.parser.parseError("expected-eof-but-got-start-tag",
+ {"name": token["name"]})
+
+ def processEndTag(self, token):
+ self.parser.parseError("expected-eof-but-got-end-tag",
+ {"name": token["name"]})
+
+ return {
+ "initial": InitialPhase,
+ "beforeHtml": BeforeHtmlPhase,
+ "beforeHead": BeforeHeadPhase,
+ "inHead": InHeadPhase,
+ # XXX "inHeadNoscript": InHeadNoScriptPhase,
+ "afterHead": AfterHeadPhase,
+ "inBody": InBodyPhase,
+ "text": TextPhase,
+ "inTable": InTablePhase,
+ "inTableText": InTableTextPhase,
+ "inCaption": InCaptionPhase,
+ "inColumnGroup": InColumnGroupPhase,
+ "inTableBody": InTableBodyPhase,
+ "inRow": InRowPhase,
+ "inCell": InCellPhase,
+ "inSelect": InSelectPhase,
+ "inSelectInTable": InSelectInTablePhase,
+ "inForeignContent": InForeignContentPhase,
+ "afterBody": AfterBodyPhase,
+ "inFrameset": InFramesetPhase,
+ "afterFrameset": AfterFramesetPhase,
+ "afterAfterBody": AfterAfterBodyPhase,
+ "afterAfterFrameset": AfterAfterFramesetPhase,
+ # XXX after after frameset
+ }
+
+
+def impliedTagToken(name, type="EndTag", attributes=None,
+ selfClosing=False):
+ if attributes is None:
+ attributes = {}
+ return {"type": tokenTypes[type], "name": name, "data": attributes,
+ "selfClosing": selfClosing}
+
+
+class ParseError(Exception):
+ """Error in parsed document"""
+ pass