aboutsummaryrefslogtreecommitdiffstats
path: root/jython-tosca-parser/src/main/resources/Lib/site-packages/pip/_vendor/html5lib/tokenizer.py
diff options
context:
space:
mode:
Diffstat (limited to 'jython-tosca-parser/src/main/resources/Lib/site-packages/pip/_vendor/html5lib/tokenizer.py')
-rw-r--r--jython-tosca-parser/src/main/resources/Lib/site-packages/pip/_vendor/html5lib/tokenizer.py1731
1 files changed, 1731 insertions, 0 deletions
diff --git a/jython-tosca-parser/src/main/resources/Lib/site-packages/pip/_vendor/html5lib/tokenizer.py b/jython-tosca-parser/src/main/resources/Lib/site-packages/pip/_vendor/html5lib/tokenizer.py
new file mode 100644
index 0000000..7977457
--- /dev/null
+++ b/jython-tosca-parser/src/main/resources/Lib/site-packages/pip/_vendor/html5lib/tokenizer.py
@@ -0,0 +1,1731 @@
+from __future__ import absolute_import, division, unicode_literals
+
+try:
+ chr = unichr # flake8: noqa
+except NameError:
+ pass
+
+from collections import deque
+
+from .constants import spaceCharacters
+from .constants import entities
+from .constants import asciiLetters, asciiUpper2Lower
+from .constants import digits, hexDigits, EOF
+from .constants import tokenTypes, tagTokenTypes
+from .constants import replacementCharacters
+
+from .inputstream import HTMLInputStream
+
+from .trie import Trie
+
+entitiesTrie = Trie(entities)
+
+
+class HTMLTokenizer(object):
+ """ This class takes care of tokenizing HTML.
+
+ * self.currentToken
+ Holds the token that is currently being processed.
+
+ * self.state
+ Holds a reference to the method to be invoked... XXX
+
+ * self.stream
+ Points to HTMLInputStream object.
+ """
+
+ def __init__(self, stream, encoding=None, parseMeta=True, useChardet=True,
+ lowercaseElementName=True, lowercaseAttrName=True, parser=None):
+
+ self.stream = HTMLInputStream(stream, encoding, parseMeta, useChardet)
+ self.parser = parser
+
+ # Perform case conversions?
+ self.lowercaseElementName = lowercaseElementName
+ self.lowercaseAttrName = lowercaseAttrName
+
+ # Setup the initial tokenizer state
+ self.escapeFlag = False
+ self.lastFourChars = []
+ self.state = self.dataState
+ self.escape = False
+
+ # The current token being created
+ self.currentToken = None
+ super(HTMLTokenizer, self).__init__()
+
+ def __iter__(self):
+ """ This is where the magic happens.
+
+ We do our usually processing through the states and when we have a token
+ to return we yield the token which pauses processing until the next token
+ is requested.
+ """
+ self.tokenQueue = deque([])
+ # Start processing. When EOF is reached self.state will return False
+ # instead of True and the loop will terminate.
+ while self.state():
+ while self.stream.errors:
+ yield {"type": tokenTypes["ParseError"], "data": self.stream.errors.pop(0)}
+ while self.tokenQueue:
+ yield self.tokenQueue.popleft()
+
+ def consumeNumberEntity(self, isHex):
+ """This function returns either U+FFFD or the character based on the
+ decimal or hexadecimal representation. It also discards ";" if present.
+ If not present self.tokenQueue.append({"type": tokenTypes["ParseError"]}) is invoked.
+ """
+
+ allowed = digits
+ radix = 10
+ if isHex:
+ allowed = hexDigits
+ radix = 16
+
+ charStack = []
+
+ # Consume all the characters that are in range while making sure we
+ # don't hit an EOF.
+ c = self.stream.char()
+ while c in allowed and c is not EOF:
+ charStack.append(c)
+ c = self.stream.char()
+
+ # Convert the set of characters consumed to an int.
+ charAsInt = int("".join(charStack), radix)
+
+ # Certain characters get replaced with others
+ if charAsInt in replacementCharacters:
+ char = replacementCharacters[charAsInt]
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+ "illegal-codepoint-for-numeric-entity",
+ "datavars": {"charAsInt": charAsInt}})
+ elif ((0xD800 <= charAsInt <= 0xDFFF) or
+ (charAsInt > 0x10FFFF)):
+ char = "\uFFFD"
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+ "illegal-codepoint-for-numeric-entity",
+ "datavars": {"charAsInt": charAsInt}})
+ else:
+ # Should speed up this check somehow (e.g. move the set to a constant)
+ if ((0x0001 <= charAsInt <= 0x0008) or
+ (0x000E <= charAsInt <= 0x001F) or
+ (0x007F <= charAsInt <= 0x009F) or
+ (0xFDD0 <= charAsInt <= 0xFDEF) or
+ charAsInt in frozenset([0x000B, 0xFFFE, 0xFFFF, 0x1FFFE,
+ 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE,
+ 0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE,
+ 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
+ 0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE,
+ 0x9FFFF, 0xAFFFE, 0xAFFFF, 0xBFFFE,
+ 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE,
+ 0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE,
+ 0xFFFFF, 0x10FFFE, 0x10FFFF])):
+ self.tokenQueue.append({"type": tokenTypes["ParseError"],
+ "data":
+ "illegal-codepoint-for-numeric-entity",
+ "datavars": {"charAsInt": charAsInt}})
+ try:
+ # Try/except needed as UCS-2 Python builds' unichar only works
+ # within the BMP.
+ char = chr(charAsInt)
+ except ValueError:
+ v = charAsInt - 0x10000
+ char = chr(0xD800 | (v >> 10)) + chr(0xDC00 | (v & 0x3FF))
+
+ # Discard the ; if present. Otherwise, put it back on the queue and
+ # invoke parseError on parser.
+ if c != ";":
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+ "numeric-entity-without-semicolon"})
+ self.stream.unget(c)
+
+ return char
+
+ def consumeEntity(self, allowedChar=None, fromAttribute=False):
+ # Initialise to the default output for when no entity is matched
+ output = "&"
+
+ charStack = [self.stream.char()]
+ if (charStack[0] in spaceCharacters or charStack[0] in (EOF, "<", "&")
+ or (allowedChar is not None and allowedChar == charStack[0])):
+ self.stream.unget(charStack[0])
+
+ elif charStack[0] == "#":
+ # Read the next character to see if it's hex or decimal
+ hex = False
+ charStack.append(self.stream.char())
+ if charStack[-1] in ("x", "X"):
+ hex = True
+ charStack.append(self.stream.char())
+
+ # charStack[-1] should be the first digit
+ if (hex and charStack[-1] in hexDigits) \
+ or (not hex and charStack[-1] in digits):
+ # At least one digit found, so consume the whole number
+ self.stream.unget(charStack[-1])
+ output = self.consumeNumberEntity(hex)
+ else:
+ # No digits found
+ self.tokenQueue.append({"type": tokenTypes["ParseError"],
+ "data": "expected-numeric-entity"})
+ self.stream.unget(charStack.pop())
+ output = "&" + "".join(charStack)
+
+ else:
+ # At this point in the process might have named entity. Entities
+ # are stored in the global variable "entities".
+ #
+ # Consume characters and compare to these to a substring of the
+ # entity names in the list until the substring no longer matches.
+ while (charStack[-1] is not EOF):
+ if not entitiesTrie.has_keys_with_prefix("".join(charStack)):
+ break
+ charStack.append(self.stream.char())
+
+ # At this point we have a string that starts with some characters
+ # that may match an entity
+ # Try to find the longest entity the string will match to take care
+ # of &noti for instance.
+ try:
+ entityName = entitiesTrie.longest_prefix("".join(charStack[:-1]))
+ entityLength = len(entityName)
+ except KeyError:
+ entityName = None
+
+ if entityName is not None:
+ if entityName[-1] != ";":
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+ "named-entity-without-semicolon"})
+ if (entityName[-1] != ";" and fromAttribute and
+ (charStack[entityLength] in asciiLetters or
+ charStack[entityLength] in digits or
+ charStack[entityLength] == "=")):
+ self.stream.unget(charStack.pop())
+ output = "&" + "".join(charStack)
+ else:
+ output = entities[entityName]
+ self.stream.unget(charStack.pop())
+ output += "".join(charStack[entityLength:])
+ else:
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+ "expected-named-entity"})
+ self.stream.unget(charStack.pop())
+ output = "&" + "".join(charStack)
+
+ if fromAttribute:
+ self.currentToken["data"][-1][1] += output
+ else:
+ if output in spaceCharacters:
+ tokenType = "SpaceCharacters"
+ else:
+ tokenType = "Characters"
+ self.tokenQueue.append({"type": tokenTypes[tokenType], "data": output})
+
+ def processEntityInAttribute(self, allowedChar):
+ """This method replaces the need for "entityInAttributeValueState".
+ """
+ self.consumeEntity(allowedChar=allowedChar, fromAttribute=True)
+
+ def emitCurrentToken(self):
+ """This method is a generic handler for emitting the tags. It also sets
+ the state to "data" because that's what's needed after a token has been
+ emitted.
+ """
+ token = self.currentToken
+ # Add token to the queue to be yielded
+ if (token["type"] in tagTokenTypes):
+ if self.lowercaseElementName:
+ token["name"] = token["name"].translate(asciiUpper2Lower)
+ if token["type"] == tokenTypes["EndTag"]:
+ if token["data"]:
+ self.tokenQueue.append({"type": tokenTypes["ParseError"],
+ "data": "attributes-in-end-tag"})
+ if token["selfClosing"]:
+ self.tokenQueue.append({"type": tokenTypes["ParseError"],
+ "data": "self-closing-flag-on-end-tag"})
+ self.tokenQueue.append(token)
+ self.state = self.dataState
+
+ # Below are the various tokenizer states worked out.
+ def dataState(self):
+ data = self.stream.char()
+ if data == "&":
+ self.state = self.entityDataState
+ elif data == "<":
+ self.state = self.tagOpenState
+ elif data == "\u0000":
+ self.tokenQueue.append({"type": tokenTypes["ParseError"],
+ "data": "invalid-codepoint"})
+ self.tokenQueue.append({"type": tokenTypes["Characters"],
+ "data": "\u0000"})
+ elif data is EOF:
+ # Tokenization ends.
+ return False
+ elif data in spaceCharacters:
+ # Directly after emitting a token you switch back to the "data
+ # state". At that point spaceCharacters are important so they are
+ # emitted separately.
+ self.tokenQueue.append({"type": tokenTypes["SpaceCharacters"], "data":
+ data + self.stream.charsUntil(spaceCharacters, True)})
+ # No need to update lastFourChars here, since the first space will
+ # have already been appended to lastFourChars and will have broken
+ # any <!-- or --> sequences
+ else:
+ chars = self.stream.charsUntil(("&", "<", "\u0000"))
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
+ data + chars})
+ return True
+
+ def entityDataState(self):
+ self.consumeEntity()
+ self.state = self.dataState
+ return True
+
+ def rcdataState(self):
+ data = self.stream.char()
+ if data == "&":
+ self.state = self.characterReferenceInRcdata
+ elif data == "<":
+ self.state = self.rcdataLessThanSignState
+ elif data == EOF:
+ # Tokenization ends.
+ return False
+ elif data == "\u0000":
+ self.tokenQueue.append({"type": tokenTypes["ParseError"],
+ "data": "invalid-codepoint"})
+ self.tokenQueue.append({"type": tokenTypes["Characters"],
+ "data": "\uFFFD"})
+ elif data in spaceCharacters:
+ # Directly after emitting a token you switch back to the "data
+ # state". At that point spaceCharacters are important so they are
+ # emitted separately.
+ self.tokenQueue.append({"type": tokenTypes["SpaceCharacters"], "data":
+ data + self.stream.charsUntil(spaceCharacters, True)})
+ # No need to update lastFourChars here, since the first space will
+ # have already been appended to lastFourChars and will have broken
+ # any <!-- or --> sequences
+ else:
+ chars = self.stream.charsUntil(("&", "<", "\u0000"))
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
+ data + chars})
+ return True
+
+ def characterReferenceInRcdata(self):
+ self.consumeEntity()
+ self.state = self.rcdataState
+ return True
+
+ def rawtextState(self):
+ data = self.stream.char()
+ if data == "<":
+ self.state = self.rawtextLessThanSignState
+ elif data == "\u0000":
+ self.tokenQueue.append({"type": tokenTypes["ParseError"],
+ "data": "invalid-codepoint"})
+ self.tokenQueue.append({"type": tokenTypes["Characters"],
+ "data": "\uFFFD"})
+ elif data == EOF:
+ # Tokenization ends.
+ return False
+ else:
+ chars = self.stream.charsUntil(("<", "\u0000"))
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
+ data + chars})
+ return True
+
+ def scriptDataState(self):
+ data = self.stream.char()
+ if data == "<":
+ self.state = self.scriptDataLessThanSignState
+ elif data == "\u0000":
+ self.tokenQueue.append({"type": tokenTypes["ParseError"],
+ "data": "invalid-codepoint"})
+ self.tokenQueue.append({"type": tokenTypes["Characters"],
+ "data": "\uFFFD"})
+ elif data == EOF:
+ # Tokenization ends.
+ return False
+ else:
+ chars = self.stream.charsUntil(("<", "\u0000"))
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
+ data + chars})
+ return True
+
+ def plaintextState(self):
+ data = self.stream.char()
+ if data == EOF:
+ # Tokenization ends.
+ return False
+ elif data == "\u0000":
+ self.tokenQueue.append({"type": tokenTypes["ParseError"],
+ "data": "invalid-codepoint"})
+ self.tokenQueue.append({"type": tokenTypes["Characters"],
+ "data": "\uFFFD"})
+ else:
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
+ data + self.stream.charsUntil("\u0000")})
+ return True
+
+ def tagOpenState(self):
+ data = self.stream.char()
+ if data == "!":
+ self.state = self.markupDeclarationOpenState
+ elif data == "/":
+ self.state = self.closeTagOpenState
+ elif data in asciiLetters:
+ self.currentToken = {"type": tokenTypes["StartTag"],
+ "name": data, "data": [],
+ "selfClosing": False,
+ "selfClosingAcknowledged": False}
+ self.state = self.tagNameState
+ elif data == ">":
+ # XXX In theory it could be something besides a tag name. But
+ # do we really care?
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+ "expected-tag-name-but-got-right-bracket"})
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<>"})
+ self.state = self.dataState
+ elif data == "?":
+ # XXX In theory it could be something besides a tag name. But
+ # do we really care?
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+ "expected-tag-name-but-got-question-mark"})
+ self.stream.unget(data)
+ self.state = self.bogusCommentState
+ else:
+ # XXX
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+ "expected-tag-name"})
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
+ self.stream.unget(data)
+ self.state = self.dataState
+ return True
+
+ def closeTagOpenState(self):
+ data = self.stream.char()
+ if data in asciiLetters:
+ self.currentToken = {"type": tokenTypes["EndTag"], "name": data,
+ "data": [], "selfClosing": False}
+ self.state = self.tagNameState
+ elif data == ">":
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+ "expected-closing-tag-but-got-right-bracket"})
+ self.state = self.dataState
+ elif data is EOF:
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+ "expected-closing-tag-but-got-eof"})
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"})
+ self.state = self.dataState
+ else:
+ # XXX data can be _'_...
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+ "expected-closing-tag-but-got-char",
+ "datavars": {"data": data}})
+ self.stream.unget(data)
+ self.state = self.bogusCommentState
+ return True
+
+ def tagNameState(self):
+ data = self.stream.char()
+ if data in spaceCharacters:
+ self.state = self.beforeAttributeNameState
+ elif data == ">":
+ self.emitCurrentToken()
+ elif data is EOF:
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+ "eof-in-tag-name"})
+ self.state = self.dataState
+ elif data == "/":
+ self.state = self.selfClosingStartTagState
+ elif data == "\u0000":
+ self.tokenQueue.append({"type": tokenTypes["ParseError"],
+ "data": "invalid-codepoint"})
+ self.currentToken["name"] += "\uFFFD"
+ else:
+ self.currentToken["name"] += data
+ # (Don't use charsUntil here, because tag names are
+ # very short and it's faster to not do anything fancy)
+ return True
+
+ def rcdataLessThanSignState(self):
+ data = self.stream.char()
+ if data == "/":
+ self.temporaryBuffer = ""
+ self.state = self.rcdataEndTagOpenState
+ else:
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
+ self.stream.unget(data)
+ self.state = self.rcdataState
+ return True
+
+ def rcdataEndTagOpenState(self):
+ data = self.stream.char()
+ if data in asciiLetters:
+ self.temporaryBuffer += data
+ self.state = self.rcdataEndTagNameState
+ else:
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"})
+ self.stream.unget(data)
+ self.state = self.rcdataState
+ return True
+
+ def rcdataEndTagNameState(self):
+ appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower()
+ data = self.stream.char()
+ if data in spaceCharacters and appropriate:
+ self.currentToken = {"type": tokenTypes["EndTag"],
+ "name": self.temporaryBuffer,
+ "data": [], "selfClosing": False}
+ self.state = self.beforeAttributeNameState
+ elif data == "/" and appropriate:
+ self.currentToken = {"type": tokenTypes["EndTag"],
+ "name": self.temporaryBuffer,
+ "data": [], "selfClosing": False}
+ self.state = self.selfClosingStartTagState
+ elif data == ">" and appropriate:
+ self.currentToken = {"type": tokenTypes["EndTag"],
+ "name": self.temporaryBuffer,
+ "data": [], "selfClosing": False}
+ self.emitCurrentToken()
+ self.state = self.dataState
+ elif data in asciiLetters:
+ self.temporaryBuffer += data
+ else:
+ self.tokenQueue.append({"type": tokenTypes["Characters"],
+ "data": "</" + self.temporaryBuffer})
+ self.stream.unget(data)
+ self.state = self.rcdataState
+ return True
+
+ def rawtextLessThanSignState(self):
+ data = self.stream.char()
+ if data == "/":
+ self.temporaryBuffer = ""
+ self.state = self.rawtextEndTagOpenState
+ else:
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
+ self.stream.unget(data)
+ self.state = self.rawtextState
+ return True
+
+ def rawtextEndTagOpenState(self):
+ data = self.stream.char()
+ if data in asciiLetters:
+ self.temporaryBuffer += data
+ self.state = self.rawtextEndTagNameState
+ else:
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"})
+ self.stream.unget(data)
+ self.state = self.rawtextState
+ return True
+
+ def rawtextEndTagNameState(self):
+ appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower()
+ data = self.stream.char()
+ if data in spaceCharacters and appropriate:
+ self.currentToken = {"type": tokenTypes["EndTag"],
+ "name": self.temporaryBuffer,
+ "data": [], "selfClosing": False}
+ self.state = self.beforeAttributeNameState
+ elif data == "/" and appropriate:
+ self.currentToken = {"type": tokenTypes["EndTag"],
+ "name": self.temporaryBuffer,
+ "data": [], "selfClosing": False}
+ self.state = self.selfClosingStartTagState
+ elif data == ">" and appropriate:
+ self.currentToken = {"type": tokenTypes["EndTag"],
+ "name": self.temporaryBuffer,
+ "data": [], "selfClosing": False}
+ self.emitCurrentToken()
+ self.state = self.dataState
+ elif data in asciiLetters:
+ self.temporaryBuffer += data
+ else:
+ self.tokenQueue.append({"type": tokenTypes["Characters"],
+ "data": "</" + self.temporaryBuffer})
+ self.stream.unget(data)
+ self.state = self.rawtextState
+ return True
+
+ def scriptDataLessThanSignState(self):
+ data = self.stream.char()
+ if data == "/":
+ self.temporaryBuffer = ""
+ self.state = self.scriptDataEndTagOpenState
+ elif data == "!":
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<!"})
+ self.state = self.scriptDataEscapeStartState
+ else:
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
+ self.stream.unget(data)
+ self.state = self.scriptDataState
+ return True
+
+ def scriptDataEndTagOpenState(self):
+ data = self.stream.char()
+ if data in asciiLetters:
+ self.temporaryBuffer += data
+ self.state = self.scriptDataEndTagNameState
+ else:
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"})
+ self.stream.unget(data)
+ self.state = self.scriptDataState
+ return True
+
+ def scriptDataEndTagNameState(self):
+ appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower()
+ data = self.stream.char()
+ if data in spaceCharacters and appropriate:
+ self.currentToken = {"type": tokenTypes["EndTag"],
+ "name": self.temporaryBuffer,
+ "data": [], "selfClosing": False}
+ self.state = self.beforeAttributeNameState
+ elif data == "/" and appropriate:
+ self.currentToken = {"type": tokenTypes["EndTag"],
+ "name": self.temporaryBuffer,
+ "data": [], "selfClosing": False}
+ self.state = self.selfClosingStartTagState
+ elif data == ">" and appropriate:
+ self.currentToken = {"type": tokenTypes["EndTag"],
+ "name": self.temporaryBuffer,
+ "data": [], "selfClosing": False}
+ self.emitCurrentToken()
+ self.state = self.dataState
+ elif data in asciiLetters:
+ self.temporaryBuffer += data
+ else:
+ self.tokenQueue.append({"type": tokenTypes["Characters"],
+ "data": "</" + self.temporaryBuffer})
+ self.stream.unget(data)
+ self.state = self.scriptDataState
+ return True
+
+ def scriptDataEscapeStartState(self):
+ data = self.stream.char()
+ if data == "-":
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
+ self.state = self.scriptDataEscapeStartDashState
+ else:
+ self.stream.unget(data)
+ self.state = self.scriptDataState
+ return True
+
+ def scriptDataEscapeStartDashState(self):
+ data = self.stream.char()
+ if data == "-":
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
+ self.state = self.scriptDataEscapedDashDashState
+ else:
+ self.stream.unget(data)
+ self.state = self.scriptDataState
+ return True
+
+ def scriptDataEscapedState(self):
+ data = self.stream.char()
+ if data == "-":
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
+ self.state = self.scriptDataEscapedDashState
+ elif data == "<":
+ self.state = self.scriptDataEscapedLessThanSignState
+ elif data == "\u0000":
+ self.tokenQueue.append({"type": tokenTypes["ParseError"],
+ "data": "invalid-codepoint"})
+ self.tokenQueue.append({"type": tokenTypes["Characters"],
+ "data": "\uFFFD"})
+ elif data == EOF:
+ self.state = self.dataState
+ else:
+ chars = self.stream.charsUntil(("<", "-", "\u0000"))
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
+ data + chars})
+ return True
+
+ def scriptDataEscapedDashState(self):
+ data = self.stream.char()
+ if data == "-":
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
+ self.state = self.scriptDataEscapedDashDashState
+ elif data == "<":
+ self.state = self.scriptDataEscapedLessThanSignState
+ elif data == "\u0000":
+ self.tokenQueue.append({"type": tokenTypes["ParseError"],
+ "data": "invalid-codepoint"})
+ self.tokenQueue.append({"type": tokenTypes["Characters"],
+ "data": "\uFFFD"})
+ self.state = self.scriptDataEscapedState
+ elif data == EOF:
+ self.state = self.dataState
+ else:
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
+ self.state = self.scriptDataEscapedState
+ return True
+
+ def scriptDataEscapedDashDashState(self):
+ data = self.stream.char()
+ if data == "-":
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
+ elif data == "<":
+ self.state = self.scriptDataEscapedLessThanSignState
+ elif data == ">":
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data": ">"})
+ self.state = self.scriptDataState
+ elif data == "\u0000":
+ self.tokenQueue.append({"type": tokenTypes["ParseError"],
+ "data": "invalid-codepoint"})
+ self.tokenQueue.append({"type": tokenTypes["Characters"],
+ "data": "\uFFFD"})
+ self.state = self.scriptDataEscapedState
+ elif data == EOF:
+ self.state = self.dataState
+ else:
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
+ self.state = self.scriptDataEscapedState
+ return True
+
+ def scriptDataEscapedLessThanSignState(self):
+ data = self.stream.char()
+ if data == "/":
+ self.temporaryBuffer = ""
+ self.state = self.scriptDataEscapedEndTagOpenState
+ elif data in asciiLetters:
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<" + data})
+ self.temporaryBuffer = data
+ self.state = self.scriptDataDoubleEscapeStartState
+ else:
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
+ self.stream.unget(data)
+ self.state = self.scriptDataEscapedState
+ return True
+
+ def scriptDataEscapedEndTagOpenState(self):
+ data = self.stream.char()
+ if data in asciiLetters:
+ self.temporaryBuffer = data
+ self.state = self.scriptDataEscapedEndTagNameState
+ else:
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"})
+ self.stream.unget(data)
+ self.state = self.scriptDataEscapedState
+ return True
+
+ def scriptDataEscapedEndTagNameState(self):
+ appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower()
+ data = self.stream.char()
+ if data in spaceCharacters and appropriate:
+ self.currentToken = {"type": tokenTypes["EndTag"],
+ "name": self.temporaryBuffer,
+ "data": [], "selfClosing": False}
+ self.state = self.beforeAttributeNameState
+ elif data == "/" and appropriate:
+ self.currentToken = {"type": tokenTypes["EndTag"],
+ "name": self.temporaryBuffer,
+ "data": [], "selfClosing": False}
+ self.state = self.selfClosingStartTagState
+ elif data == ">" and appropriate:
+ self.currentToken = {"type": tokenTypes["EndTag"],
+ "name": self.temporaryBuffer,
+ "data": [], "selfClosing": False}
+ self.emitCurrentToken()
+ self.state = self.dataState
+ elif data in asciiLetters:
+ self.temporaryBuffer += data
+ else:
+ self.tokenQueue.append({"type": tokenTypes["Characters"],
+ "data": "</" + self.temporaryBuffer})
+ self.stream.unget(data)
+ self.state = self.scriptDataEscapedState
+ return True
+
+ def scriptDataDoubleEscapeStartState(self):
+ data = self.stream.char()
+ if data in (spaceCharacters | frozenset(("/", ">"))):
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
+ if self.temporaryBuffer.lower() == "script":
+ self.state = self.scriptDataDoubleEscapedState
+ else:
+ self.state = self.scriptDataEscapedState
+ elif data in asciiLetters:
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
+ self.temporaryBuffer += data
+ else:
+ self.stream.unget(data)
+ self.state = self.scriptDataEscapedState
+ return True
+
+ def scriptDataDoubleEscapedState(self):
+ data = self.stream.char()
+ if data == "-":
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
+ self.state = self.scriptDataDoubleEscapedDashState
+ elif data == "<":
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
+ self.state = self.scriptDataDoubleEscapedLessThanSignState
+ elif data == "\u0000":
+ self.tokenQueue.append({"type": tokenTypes["ParseError"],
+ "data": "invalid-codepoint"})
+ self.tokenQueue.append({"type": tokenTypes["Characters"],
+ "data": "\uFFFD"})
+ elif data == EOF:
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+ "eof-in-script-in-script"})
+ self.state = self.dataState
+ else:
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
+ return True
+
+ def scriptDataDoubleEscapedDashState(self):
+ data = self.stream.char()
+ if data == "-":
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
+ self.state = self.scriptDataDoubleEscapedDashDashState
+ elif data == "<":
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
+ self.state = self.scriptDataDoubleEscapedLessThanSignState
+ elif data == "\u0000":
+ self.tokenQueue.append({"type": tokenTypes["ParseError"],
+ "data": "invalid-codepoint"})
+ self.tokenQueue.append({"type": tokenTypes["Characters"],
+ "data": "\uFFFD"})
+ self.state = self.scriptDataDoubleEscapedState
+ elif data == EOF:
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+ "eof-in-script-in-script"})
+ self.state = self.dataState
+ else:
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
+ self.state = self.scriptDataDoubleEscapedState
+ return True
+
+ def scriptDataDoubleEscapedDashDashState(self):
+ data = self.stream.char()
+ if data == "-":
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
+ elif data == "<":
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
+ self.state = self.scriptDataDoubleEscapedLessThanSignState
+ elif data == ">":
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data": ">"})
+ self.state = self.scriptDataState
+ elif data == "\u0000":
+ self.tokenQueue.append({"type": tokenTypes["ParseError"],
+ "data": "invalid-codepoint"})
+ self.tokenQueue.append({"type": tokenTypes["Characters"],
+ "data": "\uFFFD"})
+ self.state = self.scriptDataDoubleEscapedState
+ elif data == EOF:
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+ "eof-in-script-in-script"})
+ self.state = self.dataState
+ else:
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
+ self.state = self.scriptDataDoubleEscapedState
+ return True
+
+ def scriptDataDoubleEscapedLessThanSignState(self):
+ data = self.stream.char()
+ if data == "/":
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "/"})
+ self.temporaryBuffer = ""
+ self.state = self.scriptDataDoubleEscapeEndState
+ else:
+ self.stream.unget(data)
+ self.state = self.scriptDataDoubleEscapedState
+ return True
+
+ def scriptDataDoubleEscapeEndState(self):
+ data = self.stream.char()
+ if data in (spaceCharacters | frozenset(("/", ">"))):
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
+ if self.temporaryBuffer.lower() == "script":
+ self.state = self.scriptDataEscapedState
+ else:
+ self.state = self.scriptDataDoubleEscapedState
+ elif data in asciiLetters:
+ self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
+ self.temporaryBuffer += data
+ else:
+ self.stream.unget(data)
+ self.state = self.scriptDataDoubleEscapedState
+ return True
+
+ def beforeAttributeNameState(self):
+ data = self.stream.char()
+ if data in spaceCharacters:
+ self.stream.charsUntil(spaceCharacters, True)
+ elif data in asciiLetters:
+ self.currentToken["data"].append([data, ""])
+ self.state = self.attributeNameState
+ elif data == ">":
+ self.emitCurrentToken()
+ elif data == "/":
+ self.state = self.selfClosingStartTagState
+ elif data in ("'", '"', "=", "<"):
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+ "invalid-character-in-attribute-name"})
+ self.currentToken["data"].append([data, ""])
+ self.state = self.attributeNameState
+ elif data == "\u0000":
+ self.tokenQueue.append({"type": tokenTypes["ParseError"],
+ "data": "invalid-codepoint"})
+ self.currentToken["data"].append(["\uFFFD", ""])
+ self.state = self.attributeNameState
+ elif data is EOF:
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+ "expected-attribute-name-but-got-eof"})
+ self.state = self.dataState
+ else:
+ self.currentToken["data"].append([data, ""])
+ self.state = self.attributeNameState
+ return True
+
+ def attributeNameState(self):
+ data = self.stream.char()
+ leavingThisState = True
+ emitToken = False
+ if data == "=":
+ self.state = self.beforeAttributeValueState
+ elif data in asciiLetters:
+ self.currentToken["data"][-1][0] += data +\
+ self.stream.charsUntil(asciiLetters, True)
+ leavingThisState = False
+ elif data == ">":
+ # XXX If we emit here the attributes are converted to a dict
+ # without being checked and when the code below runs we error
+ # because data is a dict not a list
+ emitToken = True
+ elif data in spaceCharacters:
+ self.state = self.afterAttributeNameState
+ elif data == "/":
+ self.state = self.selfClosingStartTagState
+ elif data == "\u0000":
+ self.tokenQueue.append({"type": tokenTypes["ParseError"],
+ "data": "invalid-codepoint"})
+ self.currentToken["data"][-1][0] += "\uFFFD"
+ leavingThisState = False
+ elif data in ("'", '"', "<"):
+ self.tokenQueue.append({"type": tokenTypes["ParseError"],
+ "data":
+ "invalid-character-in-attribute-name"})
+ self.currentToken["data"][-1][0] += data
+ leavingThisState = False
+ elif data is EOF:
+ self.tokenQueue.append({"type": tokenTypes["ParseError"],
+ "data": "eof-in-attribute-name"})
+ self.state = self.dataState
+ else:
+ self.currentToken["data"][-1][0] += data
+ leavingThisState = False
+
+ if leavingThisState:
+ # Attributes are not dropped at this stage. That happens when the
+ # start tag token is emitted so values can still be safely appended
+ # to attributes, but we do want to report the parse error in time.
+ if self.lowercaseAttrName:
+ self.currentToken["data"][-1][0] = (
+ self.currentToken["data"][-1][0].translate(asciiUpper2Lower))
+ for name, value in self.currentToken["data"][:-1]:
+ if self.currentToken["data"][-1][0] == name:
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+ "duplicate-attribute"})
+ break
+ # XXX Fix for above XXX
+ if emitToken:
+ self.emitCurrentToken()
+ return True
+
+ def afterAttributeNameState(self):
+ data = self.stream.char()
+ if data in spaceCharacters:
+ self.stream.charsUntil(spaceCharacters, True)
+ elif data == "=":
+ self.state = self.beforeAttributeValueState
+ elif data == ">":
+ self.emitCurrentToken()
+ elif data in asciiLetters:
+ self.currentToken["data"].append([data, ""])
+ self.state = self.attributeNameState
+ elif data == "/":
+ self.state = self.selfClosingStartTagState
+ elif data == "\u0000":
+ self.tokenQueue.append({"type": tokenTypes["ParseError"],
+ "data": "invalid-codepoint"})
+ self.currentToken["data"].append(["\uFFFD", ""])
+ self.state = self.attributeNameState
+ elif data in ("'", '"', "<"):
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+ "invalid-character-after-attribute-name"})
+ self.currentToken["data"].append([data, ""])
+ self.state = self.attributeNameState
+ elif data is EOF:
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+ "expected-end-of-tag-but-got-eof"})
+ self.state = self.dataState
+ else:
+ self.currentToken["data"].append([data, ""])
+ self.state = self.attributeNameState
+ return True
+
+ def beforeAttributeValueState(self):
+ data = self.stream.char()
+ if data in spaceCharacters:
+ self.stream.charsUntil(spaceCharacters, True)
+ elif data == "\"":
+ self.state = self.attributeValueDoubleQuotedState
+ elif data == "&":
+ self.state = self.attributeValueUnQuotedState
+ self.stream.unget(data)
+ elif data == "'":
+ self.state = self.attributeValueSingleQuotedState
+ elif data == ">":
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+ "expected-attribute-value-but-got-right-bracket"})
+ self.emitCurrentToken()
+ elif data == "\u0000":
+ self.tokenQueue.append({"type": tokenTypes["ParseError"],
+ "data": "invalid-codepoint"})
+ self.currentToken["data"][-1][1] += "\uFFFD"
+ self.state = self.attributeValueUnQuotedState
+ elif data in ("=", "<", "`"):
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+ "equals-in-unquoted-attribute-value"})
+ self.currentToken["data"][-1][1] += data
+ self.state = self.attributeValueUnQuotedState
+ elif data is EOF:
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+ "expected-attribute-value-but-got-eof"})
+ self.state = self.dataState
+ else:
+ self.currentToken["data"][-1][1] += data
+ self.state = self.attributeValueUnQuotedState
+ return True
+
+ def attributeValueDoubleQuotedState(self):
+ data = self.stream.char()
+ if data == "\"":
+ self.state = self.afterAttributeValueState
+ elif data == "&":
+ self.processEntityInAttribute('"')
+ elif data == "\u0000":
+ self.tokenQueue.append({"type": tokenTypes["ParseError"],
+ "data": "invalid-codepoint"})
+ self.currentToken["data"][-1][1] += "\uFFFD"
+ elif data is EOF:
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+ "eof-in-attribute-value-double-quote"})
+ self.state = self.dataState
+ else:
+ self.currentToken["data"][-1][1] += data +\
+ self.stream.charsUntil(("\"", "&", "\u0000"))
+ return True
+
+ def attributeValueSingleQuotedState(self):
+ data = self.stream.char()
+ if data == "'":
+ self.state = self.afterAttributeValueState
+ elif data == "&":
+ self.processEntityInAttribute("'")
+ elif data == "\u0000":
+ self.tokenQueue.append({"type": tokenTypes["ParseError"],
+ "data": "invalid-codepoint"})
+ self.currentToken["data"][-1][1] += "\uFFFD"
+ elif data is EOF:
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+ "eof-in-attribute-value-single-quote"})
+ self.state = self.dataState
+ else:
+ self.currentToken["data"][-1][1] += data +\
+ self.stream.charsUntil(("'", "&", "\u0000"))
+ return True
+
+ def attributeValueUnQuotedState(self):
+ data = self.stream.char()
+ if data in spaceCharacters:
+ self.state = self.beforeAttributeNameState
+ elif data == "&":
+ self.processEntityInAttribute(">")
+ elif data == ">":
+ self.emitCurrentToken()
+ elif data in ('"', "'", "=", "<", "`"):
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+ "unexpected-character-in-unquoted-attribute-value"})
+ self.currentToken["data"][-1][1] += data
+ elif data == "\u0000":
+ self.tokenQueue.append({"type": tokenTypes["ParseError"],
+ "data": "invalid-codepoint"})
+ self.currentToken["data"][-1][1] += "\uFFFD"
+ elif data is EOF:
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+ "eof-in-attribute-value-no-quotes"})
+ self.state = self.dataState
+ else:
+ self.currentToken["data"][-1][1] += data + self.stream.charsUntil(
+ frozenset(("&", ">", '"', "'", "=", "<", "`", "\u0000")) | spaceCharacters)
+ return True
+
+ def afterAttributeValueState(self):
+ data = self.stream.char()
+ if data in spaceCharacters:
+ self.state = self.beforeAttributeNameState
+ elif data == ">":
+ self.emitCurrentToken()
+ elif data == "/":
+ self.state = self.selfClosingStartTagState
+ elif data is EOF:
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+ "unexpected-EOF-after-attribute-value"})
+ self.stream.unget(data)
+ self.state = self.dataState
+ else:
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+ "unexpected-character-after-attribute-value"})
+ self.stream.unget(data)
+ self.state = self.beforeAttributeNameState
+ return True
+
+ def selfClosingStartTagState(self):
+ data = self.stream.char()
+ if data == ">":
+ self.currentToken["selfClosing"] = True
+ self.emitCurrentToken()
+ elif data is EOF:
+ self.tokenQueue.append({"type": tokenTypes["ParseError"],
+ "data":
+ "unexpected-EOF-after-solidus-in-tag"})
+ self.stream.unget(data)
+ self.state = self.dataState
+ else:
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+ "unexpected-character-after-solidus-in-tag"})
+ self.stream.unget(data)
+ self.state = self.beforeAttributeNameState
+ return True
+
+ def bogusCommentState(self):
+ # Make a new comment token and give it as value all the characters
+ # until the first > or EOF (charsUntil checks for EOF automatically)
+ # and emit it.
+ data = self.stream.charsUntil(">")
+ data = data.replace("\u0000", "\uFFFD")
+ self.tokenQueue.append(
+ {"type": tokenTypes["Comment"], "data": data})
+
+ # Eat the character directly after the bogus comment which is either a
+ # ">" or an EOF.
+ self.stream.char()
+ self.state = self.dataState
+ return True
+
+ def markupDeclarationOpenState(self):
+ charStack = [self.stream.char()]
+ if charStack[-1] == "-":
+ charStack.append(self.stream.char())
+ if charStack[-1] == "-":
+ self.currentToken = {"type": tokenTypes["Comment"], "data": ""}
+ self.state = self.commentStartState
+ return True
+ elif charStack[-1] in ('d', 'D'):
+ matched = True
+ for expected in (('o', 'O'), ('c', 'C'), ('t', 'T'),
+ ('y', 'Y'), ('p', 'P'), ('e', 'E')):
+ charStack.append(self.stream.char())
+ if charStack[-1] not in expected:
+ matched = False
+ break
+ if matched:
+ self.currentToken = {"type": tokenTypes["Doctype"],
+ "name": "",
+ "publicId": None, "systemId": None,
+ "correct": True}
+ self.state = self.doctypeState
+ return True
+ elif (charStack[-1] == "[" and
+ self.parser is not None and
+ self.parser.tree.openElements and
+ self.parser.tree.openElements[-1].namespace != self.parser.tree.defaultNamespace):
+ matched = True
+ for expected in ["C", "D", "A", "T", "A", "["]:
+ charStack.append(self.stream.char())
+ if charStack[-1] != expected:
+ matched = False
+ break
+ if matched:
+ self.state = self.cdataSectionState
+ return True
+
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+ "expected-dashes-or-doctype"})
+
+ while charStack:
+ self.stream.unget(charStack.pop())
+ self.state = self.bogusCommentState
+ return True
+
+ def commentStartState(self):
+ data = self.stream.char()
+ if data == "-":
+ self.state = self.commentStartDashState
+ elif data == "\u0000":
+ self.tokenQueue.append({"type": tokenTypes["ParseError"],
+ "data": "invalid-codepoint"})
+ self.currentToken["data"] += "\uFFFD"
+ elif data == ">":
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+ "incorrect-comment"})
+ self.tokenQueue.append(self.currentToken)
+ self.state = self.dataState
+ elif data is EOF:
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+ "eof-in-comment"})
+ self.tokenQueue.append(self.currentToken)
+ self.state = self.dataState
+ else:
+ self.currentToken["data"] += data
+ self.state = self.commentState
+ return True
+
+ def commentStartDashState(self):
+ data = self.stream.char()
+ if data == "-":
+ self.state = self.commentEndState
+ elif data == "\u0000":
+ self.tokenQueue.append({"type": tokenTypes["ParseError"],
+ "data": "invalid-codepoint"})
+ self.currentToken["data"] += "-\uFFFD"
+ elif data == ">":
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+ "incorrect-comment"})
+ self.tokenQueue.append(self.currentToken)
+ self.state = self.dataState
+ elif data is EOF:
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+ "eof-in-comment"})
+ self.tokenQueue.append(self.currentToken)
+ self.state = self.dataState
+ else:
+ self.currentToken["data"] += "-" + data
+ self.state = self.commentState
+ return True
+
+ def commentState(self):
+ data = self.stream.char()
+ if data == "-":
+ self.state = self.commentEndDashState
+ elif data == "\u0000":
+ self.tokenQueue.append({"type": tokenTypes["ParseError"],
+ "data": "invalid-codepoint"})
+ self.currentToken["data"] += "\uFFFD"
+ elif data is EOF:
+ self.tokenQueue.append({"type": tokenTypes["ParseError"],
+ "data": "eof-in-comment"})
+ self.tokenQueue.append(self.currentToken)
+ self.state = self.dataState
+ else:
+ self.currentToken["data"] += data + \
+ self.stream.charsUntil(("-", "\u0000"))
+ return True
+
+ def commentEndDashState(self):
+ data = self.stream.char()
+ if data == "-":
+ self.state = self.commentEndState
+ elif data == "\u0000":
+ self.tokenQueue.append({"type": tokenTypes["ParseError"],
+ "data": "invalid-codepoint"})
+ self.currentToken["data"] += "-\uFFFD"
+ self.state = self.commentState
+ elif data is EOF:
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+ "eof-in-comment-end-dash"})
+ self.tokenQueue.append(self.currentToken)
+ self.state = self.dataState
+ else:
+ self.currentToken["data"] += "-" + data
+ self.state = self.commentState
+ return True
+
+ def commentEndState(self):
+ data = self.stream.char()
+ if data == ">":
+ self.tokenQueue.append(self.currentToken)
+ self.state = self.dataState
+ elif data == "\u0000":
+ self.tokenQueue.append({"type": tokenTypes["ParseError"],
+ "data": "invalid-codepoint"})
+ self.currentToken["data"] += "--\uFFFD"
+ self.state = self.commentState
+ elif data == "!":
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+ "unexpected-bang-after-double-dash-in-comment"})
+ self.state = self.commentEndBangState
+ elif data == "-":
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+ "unexpected-dash-after-double-dash-in-comment"})
+ self.currentToken["data"] += data
+ elif data is EOF:
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+ "eof-in-comment-double-dash"})
+ self.tokenQueue.append(self.currentToken)
+ self.state = self.dataState
+ else:
+ # XXX
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+ "unexpected-char-in-comment"})
+ self.currentToken["data"] += "--" + data
+ self.state = self.commentState
+ return True
+
+ def commentEndBangState(self):
+ data = self.stream.char()
+ if data == ">":
+ self.tokenQueue.append(self.currentToken)
+ self.state = self.dataState
+ elif data == "-":
+ self.currentToken["data"] += "--!"
+ self.state = self.commentEndDashState
+ elif data == "\u0000":
+ self.tokenQueue.append({"type": tokenTypes["ParseError"],
+ "data": "invalid-codepoint"})
+ self.currentToken["data"] += "--!\uFFFD"
+ self.state = self.commentState
+ elif data is EOF:
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+ "eof-in-comment-end-bang-state"})
+ self.tokenQueue.append(self.currentToken)
+ self.state = self.dataState
+ else:
+ self.currentToken["data"] += "--!" + data
+ self.state = self.commentState
+ return True
+
+ def doctypeState(self):
+ data = self.stream.char()
+ if data in spaceCharacters:
+ self.state = self.beforeDoctypeNameState
+ elif data is EOF:
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+ "expected-doctype-name-but-got-eof"})
+ self.currentToken["correct"] = False
+ self.tokenQueue.append(self.currentToken)
+ self.state = self.dataState
+ else:
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+ "need-space-after-doctype"})
+ self.stream.unget(data)
+ self.state = self.beforeDoctypeNameState
+ return True
+
+ def beforeDoctypeNameState(self):
+ data = self.stream.char()
+ if data in spaceCharacters:
+ pass
+ elif data == ">":
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+ "expected-doctype-name-but-got-right-bracket"})
+ self.currentToken["correct"] = False
+ self.tokenQueue.append(self.currentToken)
+ self.state = self.dataState
+ elif data == "\u0000":
+ self.tokenQueue.append({"type": tokenTypes["ParseError"],
+ "data": "invalid-codepoint"})
+ self.currentToken["name"] = "\uFFFD"
+ self.state = self.doctypeNameState
+ elif data is EOF:
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+ "expected-doctype-name-but-got-eof"})
+ self.currentToken["correct"] = False
+ self.tokenQueue.append(self.currentToken)
+ self.state = self.dataState
+ else:
+ self.currentToken["name"] = data
+ self.state = self.doctypeNameState
+ return True
+
+ def doctypeNameState(self):
+ data = self.stream.char()
+ if data in spaceCharacters:
+ self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower)
+ self.state = self.afterDoctypeNameState
+ elif data == ">":
+ self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower)
+ self.tokenQueue.append(self.currentToken)
+ self.state = self.dataState
+ elif data == "\u0000":
+ self.tokenQueue.append({"type": tokenTypes["ParseError"],
+ "data": "invalid-codepoint"})
+ self.currentToken["name"] += "\uFFFD"
+ self.state = self.doctypeNameState
+ elif data is EOF:
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+ "eof-in-doctype-name"})
+ self.currentToken["correct"] = False
+ self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower)
+ self.tokenQueue.append(self.currentToken)
+ self.state = self.dataState
+ else:
+ self.currentToken["name"] += data
+ return True
+
+ def afterDoctypeNameState(self):
+ data = self.stream.char()
+ if data in spaceCharacters:
+ pass
+ elif data == ">":
+ self.tokenQueue.append(self.currentToken)
+ self.state = self.dataState
+ elif data is EOF:
+ self.currentToken["correct"] = False
+ self.stream.unget(data)
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+ "eof-in-doctype"})
+ self.tokenQueue.append(self.currentToken)
+ self.state = self.dataState
+ else:
+ if data in ("p", "P"):
+ matched = True
+ for expected in (("u", "U"), ("b", "B"), ("l", "L"),
+ ("i", "I"), ("c", "C")):
+ data = self.stream.char()
+ if data not in expected:
+ matched = False
+ break
+ if matched:
+ self.state = self.afterDoctypePublicKeywordState
+ return True
+ elif data in ("s", "S"):
+ matched = True
+ for expected in (("y", "Y"), ("s", "S"), ("t", "T"),
+ ("e", "E"), ("m", "M")):
+ data = self.stream.char()
+ if data not in expected:
+ matched = False
+ break
+ if matched:
+ self.state = self.afterDoctypeSystemKeywordState
+ return True
+
+ # All the characters read before the current 'data' will be
+ # [a-zA-Z], so they're garbage in the bogus doctype and can be
+ # discarded; only the latest character might be '>' or EOF
+ # and needs to be ungetted
+ self.stream.unget(data)
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+ "expected-space-or-right-bracket-in-doctype", "datavars":
+ {"data": data}})
+ self.currentToken["correct"] = False
+ self.state = self.bogusDoctypeState
+
+ return True
+
+ def afterDoctypePublicKeywordState(self):
+ data = self.stream.char()
+ if data in spaceCharacters:
+ self.state = self.beforeDoctypePublicIdentifierState
+ elif data in ("'", '"'):
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+ "unexpected-char-in-doctype"})
+ self.stream.unget(data)
+ self.state = self.beforeDoctypePublicIdentifierState
+ elif data is EOF:
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+ "eof-in-doctype"})
+ self.currentToken["correct"] = False
+ self.tokenQueue.append(self.currentToken)
+ self.state = self.dataState
+ else:
+ self.stream.unget(data)
+ self.state = self.beforeDoctypePublicIdentifierState
+ return True
+
+ def beforeDoctypePublicIdentifierState(self):
+ data = self.stream.char()
+ if data in spaceCharacters:
+ pass
+ elif data == "\"":
+ self.currentToken["publicId"] = ""
+ self.state = self.doctypePublicIdentifierDoubleQuotedState
+ elif data == "'":
+ self.currentToken["publicId"] = ""
+ self.state = self.doctypePublicIdentifierSingleQuotedState
+ elif data == ">":
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+ "unexpected-end-of-doctype"})
+ self.currentToken["correct"] = False
+ self.tokenQueue.append(self.currentToken)
+ self.state = self.dataState
+ elif data is EOF:
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+ "eof-in-doctype"})
+ self.currentToken["correct"] = False
+ self.tokenQueue.append(self.currentToken)
+ self.state = self.dataState
+ else:
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+ "unexpected-char-in-doctype"})
+ self.currentToken["correct"] = False
+ self.state = self.bogusDoctypeState
+ return True
+
+ def doctypePublicIdentifierDoubleQuotedState(self):
+ data = self.stream.char()
+ if data == "\"":
+ self.state = self.afterDoctypePublicIdentifierState
+ elif data == "\u0000":
+ self.tokenQueue.append({"type": tokenTypes["ParseError"],
+ "data": "invalid-codepoint"})
+ self.currentToken["publicId"] += "\uFFFD"
+ elif data == ">":
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+ "unexpected-end-of-doctype"})
+ self.currentToken["correct"] = False
+ self.tokenQueue.append(self.currentToken)
+ self.state = self.dataState
+ elif data is EOF:
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+ "eof-in-doctype"})
+ self.currentToken["correct"] = False
+ self.tokenQueue.append(self.currentToken)
+ self.state = self.dataState
+ else:
+ self.currentToken["publicId"] += data
+ return True
+
+ def doctypePublicIdentifierSingleQuotedState(self):
+ data = self.stream.char()
+ if data == "'":
+ self.state = self.afterDoctypePublicIdentifierState
+ elif data == "\u0000":
+ self.tokenQueue.append({"type": tokenTypes["ParseError"],
+ "data": "invalid-codepoint"})
+ self.currentToken["publicId"] += "\uFFFD"
+ elif data == ">":
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+ "unexpected-end-of-doctype"})
+ self.currentToken["correct"] = False
+ self.tokenQueue.append(self.currentToken)
+ self.state = self.dataState
+ elif data is EOF:
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+ "eof-in-doctype"})
+ self.currentToken["correct"] = False
+ self.tokenQueue.append(self.currentToken)
+ self.state = self.dataState
+ else:
+ self.currentToken["publicId"] += data
+ return True
+
+ def afterDoctypePublicIdentifierState(self):
+ data = self.stream.char()
+ if data in spaceCharacters:
+ self.state = self.betweenDoctypePublicAndSystemIdentifiersState
+ elif data == ">":
+ self.tokenQueue.append(self.currentToken)
+ self.state = self.dataState
+ elif data == '"':
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+ "unexpected-char-in-doctype"})
+ self.currentToken["systemId"] = ""
+ self.state = self.doctypeSystemIdentifierDoubleQuotedState
+ elif data == "'":
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+ "unexpected-char-in-doctype"})
+ self.currentToken["systemId"] = ""
+ self.state = self.doctypeSystemIdentifierSingleQuotedState
+ elif data is EOF:
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+ "eof-in-doctype"})
+ self.currentToken["correct"] = False
+ self.tokenQueue.append(self.currentToken)
+ self.state = self.dataState
+ else:
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+ "unexpected-char-in-doctype"})
+ self.currentToken["correct"] = False
+ self.state = self.bogusDoctypeState
+ return True
+
+ def betweenDoctypePublicAndSystemIdentifiersState(self):
+ data = self.stream.char()
+ if data in spaceCharacters:
+ pass
+ elif data == ">":
+ self.tokenQueue.append(self.currentToken)
+ self.state = self.dataState
+ elif data == '"':
+ self.currentToken["systemId"] = ""
+ self.state = self.doctypeSystemIdentifierDoubleQuotedState
+ elif data == "'":
+ self.currentToken["systemId"] = ""
+ self.state = self.doctypeSystemIdentifierSingleQuotedState
+ elif data == EOF:
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+ "eof-in-doctype"})
+ self.currentToken["correct"] = False
+ self.tokenQueue.append(self.currentToken)
+ self.state = self.dataState
+ else:
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+ "unexpected-char-in-doctype"})
+ self.currentToken["correct"] = False
+ self.state = self.bogusDoctypeState
+ return True
+
+ def afterDoctypeSystemKeywordState(self):
+ data = self.stream.char()
+ if data in spaceCharacters:
+ self.state = self.beforeDoctypeSystemIdentifierState
+ elif data in ("'", '"'):
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+ "unexpected-char-in-doctype"})
+ self.stream.unget(data)
+ self.state = self.beforeDoctypeSystemIdentifierState
+ elif data is EOF:
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+ "eof-in-doctype"})
+ self.currentToken["correct"] = False
+ self.tokenQueue.append(self.currentToken)
+ self.state = self.dataState
+ else:
+ self.stream.unget(data)
+ self.state = self.beforeDoctypeSystemIdentifierState
+ return True
+
+ def beforeDoctypeSystemIdentifierState(self):
+ data = self.stream.char()
+ if data in spaceCharacters:
+ pass
+ elif data == "\"":
+ self.currentToken["systemId"] = ""
+ self.state = self.doctypeSystemIdentifierDoubleQuotedState
+ elif data == "'":
+ self.currentToken["systemId"] = ""
+ self.state = self.doctypeSystemIdentifierSingleQuotedState
+ elif data == ">":
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+ "unexpected-char-in-doctype"})
+ self.currentToken["correct"] = False
+ self.tokenQueue.append(self.currentToken)
+ self.state = self.dataState
+ elif data is EOF:
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+ "eof-in-doctype"})
+ self.currentToken["correct"] = False
+ self.tokenQueue.append(self.currentToken)
+ self.state = self.dataState
+ else:
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+ "unexpected-char-in-doctype"})
+ self.currentToken["correct"] = False
+ self.state = self.bogusDoctypeState
+ return True
+
+ def doctypeSystemIdentifierDoubleQuotedState(self):
+ data = self.stream.char()
+ if data == "\"":
+ self.state = self.afterDoctypeSystemIdentifierState
+ elif data == "\u0000":
+ self.tokenQueue.append({"type": tokenTypes["ParseError"],
+ "data": "invalid-codepoint"})
+ self.currentToken["systemId"] += "\uFFFD"
+ elif data == ">":
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+ "unexpected-end-of-doctype"})
+ self.currentToken["correct"] = False
+ self.tokenQueue.append(self.currentToken)
+ self.state = self.dataState
+ elif data is EOF:
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+ "eof-in-doctype"})
+ self.currentToken["correct"] = False
+ self.tokenQueue.append(self.currentToken)
+ self.state = self.dataState
+ else:
+ self.currentToken["systemId"] += data
+ return True
+
+ def doctypeSystemIdentifierSingleQuotedState(self):
+ data = self.stream.char()
+ if data == "'":
+ self.state = self.afterDoctypeSystemIdentifierState
+ elif data == "\u0000":
+ self.tokenQueue.append({"type": tokenTypes["ParseError"],
+ "data": "invalid-codepoint"})
+ self.currentToken["systemId"] += "\uFFFD"
+ elif data == ">":
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+ "unexpected-end-of-doctype"})
+ self.currentToken["correct"] = False
+ self.tokenQueue.append(self.currentToken)
+ self.state = self.dataState
+ elif data is EOF:
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+ "eof-in-doctype"})
+ self.currentToken["correct"] = False
+ self.tokenQueue.append(self.currentToken)
+ self.state = self.dataState
+ else:
+ self.currentToken["systemId"] += data
+ return True
+
+ def afterDoctypeSystemIdentifierState(self):
+ data = self.stream.char()
+ if data in spaceCharacters:
+ pass
+ elif data == ">":
+ self.tokenQueue.append(self.currentToken)
+ self.state = self.dataState
+ elif data is EOF:
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+ "eof-in-doctype"})
+ self.currentToken["correct"] = False
+ self.tokenQueue.append(self.currentToken)
+ self.state = self.dataState
+ else:
+ self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+ "unexpected-char-in-doctype"})
+ self.state = self.bogusDoctypeState
+ return True
+
+ def bogusDoctypeState(self):
+ data = self.stream.char()
+ if data == ">":
+ self.tokenQueue.append(self.currentToken)
+ self.state = self.dataState
+ elif data is EOF:
+ # XXX EMIT
+ self.stream.unget(data)
+ self.tokenQueue.append(self.currentToken)
+ self.state = self.dataState
+ else:
+ pass
+ return True
+
+ def cdataSectionState(self):
+ data = []
+ while True:
+ data.append(self.stream.charsUntil("]"))
+ data.append(self.stream.charsUntil(">"))
+ char = self.stream.char()
+ if char == EOF:
+ break
+ else:
+ assert char == ">"
+ if data[-1][-2:] == "]]":
+ data[-1] = data[-1][:-2]
+ break
+ else:
+ data.append(char)
+
+ data = "".join(data)
+ # Deal with null here rather than in the parser
+ nullCount = data.count("\u0000")
+ if nullCount > 0:
+ for i in range(nullCount):
+ self.tokenQueue.append({"type": tokenTypes["ParseError"],
+ "data": "invalid-codepoint"})
+ data = data.replace("\u0000", "\uFFFD")
+ if data:
+ self.tokenQueue.append({"type": tokenTypes["Characters"],
+ "data": data})
+ self.state = self.dataState
+ return True