From 04387deeab820e75c6d37d2ddd5b514cb7bcfd9e Mon Sep 17 00:00:00 2001
From: Pavel Aharoni <pa0916@att.com>
Date: Sun, 11 Jun 2017 14:33:57 +0300
Subject: [SDC-32] separate Tosca Parser from DC

Change-Id: I7e7f31ff2bd92fec22031f75b7051d129a21d01b
Signed-off-by: Pavel Aharoni <pa0916@att.com>
---
 .../pip/_vendor/html5lib/inputstream.py            | 905 ---------------------
 1 file changed, 905 deletions(-)
 delete mode 100644 jython-tosca-parser/src/main/resources/Lib/site-packages/pip/_vendor/html5lib/inputstream.py

(limited to 'jython-tosca-parser/src/main/resources/Lib/site-packages/pip/_vendor/html5lib/inputstream.py')

diff --git a/jython-tosca-parser/src/main/resources/Lib/site-packages/pip/_vendor/html5lib/inputstream.py b/jython-tosca-parser/src/main/resources/Lib/site-packages/pip/_vendor/html5lib/inputstream.py
deleted file mode 100644
index dc39ad0..0000000
--- a/jython-tosca-parser/src/main/resources/Lib/site-packages/pip/_vendor/html5lib/inputstream.py
+++ /dev/null
@@ -1,905 +0,0 @@
-from __future__ import absolute_import, division, unicode_literals
-from pip._vendor.six import text_type
-
-import codecs
-import platform
-import re
-
-from .constants import EOF, spaceCharacters, asciiLetters, asciiUppercase
-from .constants import encodings, ReparseException
-from . import utils
-
-from io import StringIO
-
-try:
-    from io import BytesIO
-except ImportError:
-    BytesIO = StringIO
-
-try:
-    from io import BufferedIOBase
-except ImportError:
-    class BufferedIOBase(object):
-        pass
-
-# Non-unicode versions of constants for use in the pre-parser
-spaceCharactersBytes = frozenset([item.encode("ascii") for item in spaceCharacters])
-asciiLettersBytes = frozenset([item.encode("ascii") for item in asciiLetters])
-asciiUppercaseBytes = frozenset([item.encode("ascii") for item in asciiUppercase])
-spacesAngleBrackets = spaceCharactersBytes | frozenset([b">", b"<"])
-
-invalid_unicode_template = "[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF%s]"
-
-if platform.python_implementation() == "Jython":
-    # Jython does not allow the use of solitary surrogate escapes
-    # (\uD800-\uDFFF) in literals or other usage. This is because it
-    # uses UTF-16, which is based on the use of such surrogates.
-    invalid_unicode_re = re.compile(invalid_unicode_template % "")
-else:
-    # Instead use one extra step of indirection and create surrogates with
-    # unichr
-    invalid_unicode_re = re.compile(invalid_unicode_template % (
-        "%s-%s" % (unichr(0xD800), unichr(0xDFFF)),))
-
-non_bmp_invalid_codepoints = set([0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE,
-                                  0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF,
-                                  0x6FFFE, 0x6FFFF, 0x7FFFE, 0x7FFFF, 0x8FFFE,
-                                  0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
-                                  0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE,
-                                  0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE, 0xFFFFF,
-                                  0x10FFFE, 0x10FFFF])
-
-ascii_punctuation_re = re.compile("[\u0009-\u000D\u0020-\u002F\u003A-\u0040\u005B-\u0060\u007B-\u007E]")
-
-# Cache for charsUntil()
-charsUntilRegEx = {}
-
-
-class BufferedStream:
-    """Buffering for streams that do not have buffering of their own
-
-    The buffer is implemented as a list of chunks on the assumption that
-    joining many strings will be slow since it is O(n**2)
-    """
-
-    def __init__(self, stream):
-        self.stream = stream
-        self.buffer = []
-        self.position = [-1, 0]  # chunk number, offset
-
-    def tell(self):
-        pos = 0
-        for chunk in self.buffer[:self.position[0]]:
-            pos += len(chunk)
-        pos += self.position[1]
-        return pos
-
-    def seek(self, pos):
-        assert pos < self._bufferedBytes()
-        offset = pos
-        i = 0
-        while len(self.buffer[i]) < offset:
-            offset -= pos
-            i += 1
-        self.position = [i, offset]
-
-    def read(self, bytes):
-        if not self.buffer:
-            return self._readStream(bytes)
-        elif (self.position[0] == len(self.buffer) and
-              self.position[1] == len(self.buffer[-1])):
-            return self._readStream(bytes)
-        else:
-            return self._readFromBuffer(bytes)
-
-    def _bufferedBytes(self):
-        return sum([len(item) for item in self.buffer])
-
-    def _readStream(self, bytes):
-        data = self.stream.read(bytes)
-        self.buffer.append(data)
-        self.position[0] += 1
-        self.position[1] = len(data)
-        return data
-
-    def _readFromBuffer(self, bytes):
-        remainingBytes = bytes
-        rv = []
-        bufferIndex = self.position[0]
-        bufferOffset = self.position[1]
-        while bufferIndex < len(self.buffer) and remainingBytes != 0:
-            assert remainingBytes > 0
-            bufferedData = self.buffer[bufferIndex]
-
-            if remainingBytes <= len(bufferedData) - bufferOffset:
-                bytesToRead = remainingBytes
-                self.position = [bufferIndex, bufferOffset + bytesToRead]
-            else:
-                bytesToRead = len(bufferedData) - bufferOffset
-                self.position = [bufferIndex, len(bufferedData)]
-                bufferIndex += 1
-            rv.append(bufferedData[bufferOffset:bufferOffset + bytesToRead])
-            remainingBytes -= bytesToRead
-
-            bufferOffset = 0
-
-        if remainingBytes:
-            rv.append(self._readStream(remainingBytes))
-
-        return "".join(rv)
-
-
-def HTMLInputStream(source, encoding=None, parseMeta=True, chardet=True):
-    if hasattr(source, "read"):
-        isUnicode = isinstance(source.read(0), text_type)
-    else:
-        isUnicode = isinstance(source, text_type)
-
-    if isUnicode:
-        if encoding is not None:
-            raise TypeError("Cannot explicitly set an encoding with a unicode string")
-
-        return HTMLUnicodeInputStream(source)
-    else:
-        return HTMLBinaryInputStream(source, encoding, parseMeta, chardet)
-
-
-class HTMLUnicodeInputStream:
-    """Provides a unicode stream of characters to the HTMLTokenizer.
-
-    This class takes care of character encoding and removing or replacing
-    incorrect byte-sequences and also provides column and line tracking.
-
-    """
-
-    _defaultChunkSize = 10240
-
-    def __init__(self, source):
-        """Initialises the HTMLInputStream.
-
-        HTMLInputStream(source, [encoding]) -> Normalized stream from source
-        for use by html5lib.
-
-        source can be either a file-object, local filename or a string.
-
-        The optional encoding parameter must be a string that indicates
-        the encoding.  If specified, that encoding will be used,
-        regardless of any BOM or later declaration (such as in a meta
-        element)
-
-        parseMeta - Look for a <meta> element containing encoding information
-
-        """
-
-        # Craziness
-        if platform.python_implementation() == "Jython":
-            # By its nature Jython's UTF-16 support does not allow
-            # surrogate errors, so no need to do this check.
-            self.reportCharacterErrors = None
-            self.replaceCharactersRegexp = None
-        elif len("\U0010FFFF") == 1:
-            self.reportCharacterErrors = self.characterErrorsUCS4
-            self.replaceCharactersRegexp = re.compile("{}".format(
-                "[{}-{}]".format(unichr(0xD800), unichr(0xDFFF))))
-        else:
-            self.reportCharacterErrors = self.characterErrorsUCS2
-            self.replaceCharactersRegexp = re.compile("{}".format(
-                "([{}-{}](?![{}-{})|(?<![{}-{}])[{}-{}])".format(
-                    unichr(0xD800), unichr(0xDBFF), unichr(0xDC00), unichr(0xDFFF),
-                    unichr(0xD800), unichr(0xDBFF), unichr(0xDC00), unichr(0xDFFF))))
-
-        # List of where new lines occur
-        self.newLines = [0]
-
-        self.charEncoding = ("utf-8", "certain")
-        self.dataStream = self.openStream(source)
-
-        self.reset()
-
-    def reset(self):
-        self.chunk = ""
-        self.chunkSize = 0
-        self.chunkOffset = 0
-        self.errors = []
-
-        # number of (complete) lines in previous chunks
-        self.prevNumLines = 0
-        # number of columns in the last line of the previous chunk
-        self.prevNumCols = 0
-
-        # Deal with CR LF and surrogates split over chunk boundaries
-        self._bufferedCharacter = None
-
-    def openStream(self, source):
-        """Produces a file object from source.
-
-        source can be either a file object, local filename or a string.
-
-        """
-        # Already a file object
-        if hasattr(source, 'read'):
-            stream = source
-        else:
-            stream = StringIO(source)
-
-        return stream
-
-    def _position(self, offset):
-        chunk = self.chunk
-        nLines = chunk.count('\n', 0, offset)
-        positionLine = self.prevNumLines + nLines
-        lastLinePos = chunk.rfind('\n', 0, offset)
-        if lastLinePos == -1:
-            positionColumn = self.prevNumCols + offset
-        else:
-            positionColumn = offset - (lastLinePos + 1)
-        return (positionLine, positionColumn)
-
-    def position(self):
-        """Returns (line, col) of the current position in the stream."""
-        line, col = self._position(self.chunkOffset)
-        return (line + 1, col)
-
-    def char(self):
-        """ Read one character from the stream or queue if available. Return
-            EOF when EOF is reached.
-        """
-        # Read a new chunk from the input stream if necessary
-        if self.chunkOffset >= self.chunkSize:
-            if not self.readChunk():
-                return EOF
-
-        chunkOffset = self.chunkOffset
-        char = self.chunk[chunkOffset]
-        self.chunkOffset = chunkOffset + 1
-
-        return char
-
-    def readChunk(self, chunkSize=None):
-        if chunkSize is None:
-            chunkSize = self._defaultChunkSize
-
-        self.prevNumLines, self.prevNumCols = self._position(self.chunkSize)
-
-        self.chunk = ""
-        self.chunkSize = 0
-        self.chunkOffset = 0
-
-        data = self.dataStream.read(chunkSize)
-
-        # Deal with CR LF and surrogates broken across chunks
-        if self._bufferedCharacter:
-            data = self._bufferedCharacter + data
-            self._bufferedCharacter = None
-        elif not data:
-            # We have no more data, bye-bye stream
-            return False
-
-        if len(data) > 1:
-            lastv = ord(data[-1])
-            if lastv == 0x0D or 0xD800 <= lastv <= 0xDBFF:
-                self._bufferedCharacter = data[-1]
-                data = data[:-1]
-
-        if platform.python_implementation() != "Jython":
-            # data is already Unicode, so Jython already has dealt
-            # with any surrogate character errors, no need to go here
-            self.reportCharacterErrors(data)
-
-            # Replace invalid characters
-            # Note U+0000 is dealt with in the tokenizer
-            data = self.replaceCharactersRegexp.sub("\ufffd", data)
-
-        data = data.replace("\r\n", "\n")
-        data = data.replace("\r", "\n")
-
-        self.chunk = data
-        self.chunkSize = len(data)
-
-        return True
-
-    def characterErrorsUCS4(self, data):
-        for i in range(len(invalid_unicode_re.findall(data))):
-            self.errors.append("invalid-codepoint")
-
-    def characterErrorsUCS2(self, data):
-        # Someone picked the wrong compile option
-        # You lose
-        skip = False
-        for match in invalid_unicode_re.finditer(data):
-            if skip:
-                continue
-            codepoint = ord(match.group())
-            pos = match.start()
-            # Pretty sure there should be endianness issues here
-            if utils.isSurrogatePair(data[pos:pos + 2]):
-                # We have a surrogate pair!
-                char_val = utils.surrogatePairToCodepoint(data[pos:pos + 2])
-                if char_val in non_bmp_invalid_codepoints:
-                    self.errors.append("invalid-codepoint")
-                skip = True
-            elif (codepoint >= 0xD800 and codepoint <= 0xDFFF and
-                  pos == len(data) - 1):
-                self.errors.append("invalid-codepoint")
-            else:
-                skip = False
-                self.errors.append("invalid-codepoint")
-
-    def charsUntil(self, characters, opposite=False):
-        """ Returns a string of characters from the stream up to but not
-        including any character in 'characters' or EOF. 'characters' must be
-        a container that supports the 'in' method and iteration over its
-        characters.
-        """
-
-        # Use a cache of regexps to find the required characters
-        try:
-            chars = charsUntilRegEx[(characters, opposite)]
-        except KeyError:
-            if __debug__:
-                for c in characters:
-                    assert(ord(c) < 128)
-            regex = "".join(["\\x%02x" % ord(c) for c in characters])
-            if not opposite:
-                regex = "^%s" % regex
-            chars = charsUntilRegEx[(characters, opposite)] = re.compile("[%s]+" % regex)
-
-        rv = []
-
-        while True:
-            # Find the longest matching prefix
-            m = chars.match(self.chunk, self.chunkOffset)
-            if m is None:
-                # If nothing matched, and it wasn't because we ran out of chunk,
-                # then stop
-                if self.chunkOffset != self.chunkSize:
-                    break
-            else:
-                end = m.end()
-                # If not the whole chunk matched, return everything
-                # up to the part that didn't match
-                if end != self.chunkSize:
-                    rv.append(self.chunk[self.chunkOffset:end])
-                    self.chunkOffset = end
-                    break
-            # If the whole remainder of the chunk matched,
-            # use it all and read the next chunk
-            rv.append(self.chunk[self.chunkOffset:])
-            if not self.readChunk():
-                # Reached EOF
-                break
-
-        r = "".join(rv)
-        return r
-
-    def unget(self, char):
-        # Only one character is allowed to be ungotten at once - it must
-        # be consumed again before any further call to unget
-        if char is not None:
-            if self.chunkOffset == 0:
-                # unget is called quite rarely, so it's a good idea to do
-                # more work here if it saves a bit of work in the frequently
-                # called char and charsUntil.
-                # So, just prepend the ungotten character onto the current
-                # chunk:
-                self.chunk = char + self.chunk
-                self.chunkSize += 1
-            else:
-                self.chunkOffset -= 1
-                assert self.chunk[self.chunkOffset] == char
-
-
-class HTMLBinaryInputStream(HTMLUnicodeInputStream):
-    """Provides a unicode stream of characters to the HTMLTokenizer.
-
-    This class takes care of character encoding and removing or replacing
-    incorrect byte-sequences and also provides column and line tracking.
-
-    """
-
-    def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
-        """Initialises the HTMLInputStream.
-
-        HTMLInputStream(source, [encoding]) -> Normalized stream from source
-        for use by html5lib.
-
-        source can be either a file-object, local filename or a string.
-
-        The optional encoding parameter must be a string that indicates
-        the encoding.  If specified, that encoding will be used,
-        regardless of any BOM or later declaration (such as in a meta
-        element)
-
-        parseMeta - Look for a <meta> element containing encoding information
-
-        """
-        # Raw Stream - for unicode objects this will encode to utf-8 and set
-        #              self.charEncoding as appropriate
-        self.rawStream = self.openStream(source)
-
-        HTMLUnicodeInputStream.__init__(self, self.rawStream)
-
-        self.charEncoding = (codecName(encoding), "certain")
-
-        # Encoding Information
-        # Number of bytes to use when looking for a meta element with
-        # encoding information
-        self.numBytesMeta = 512
-        # Number of bytes to use when using detecting encoding using chardet
-        self.numBytesChardet = 100
-        # Encoding to use if no other information can be found
-        self.defaultEncoding = "windows-1252"
-
-        # Detect encoding iff no explicit "transport level" encoding is supplied
-        if (self.charEncoding[0] is None):
-            self.charEncoding = self.detectEncoding(parseMeta, chardet)
-
-        # Call superclass
-        self.reset()
-
-    def reset(self):
-        self.dataStream = codecs.getreader(self.charEncoding[0])(self.rawStream,
-                                                                 'replace')
-        HTMLUnicodeInputStream.reset(self)
-
-    def openStream(self, source):
-        """Produces a file object from source.
-
-        source can be either a file object, local filename or a string.
-
-        """
-        # Already a file object
-        if hasattr(source, 'read'):
-            stream = source
-        else:
-            stream = BytesIO(source)
-
-        try:
-            stream.seek(stream.tell())
-        except:
-            stream = BufferedStream(stream)
-
-        return stream
-
-    def detectEncoding(self, parseMeta=True, chardet=True):
-        # First look for a BOM
-        # This will also read past the BOM if present
-        encoding = self.detectBOM()
-        confidence = "certain"
-        # If there is no BOM need to look for meta elements with encoding
-        # information
-        if encoding is None and parseMeta:
-            encoding = self.detectEncodingMeta()
-            confidence = "tentative"
-        # Guess with chardet, if avaliable
-        if encoding is None and chardet:
-            confidence = "tentative"
-            try:
-                try:
-                    from charade.universaldetector import UniversalDetector
-                except ImportError:
-                    from chardet.universaldetector import UniversalDetector
-                buffers = []
-                detector = UniversalDetector()
-                while not detector.done:
-                    buffer = self.rawStream.read(self.numBytesChardet)
-                    assert isinstance(buffer, bytes)
-                    if not buffer:
-                        break
-                    buffers.append(buffer)
-                    detector.feed(buffer)
-                detector.close()
-                encoding = detector.result['encoding']
-                self.rawStream.seek(0)
-            except ImportError:
-                pass
-        # If all else fails use the default encoding
-        if encoding is None:
-            confidence = "tentative"
-            encoding = self.defaultEncoding
-
-        # Substitute for equivalent encodings:
-        encodingSub = {"iso-8859-1": "windows-1252"}
-
-        if encoding.lower() in encodingSub:
-            encoding = encodingSub[encoding.lower()]
-
-        return encoding, confidence
-
-    def changeEncoding(self, newEncoding):
-        assert self.charEncoding[1] != "certain"
-        newEncoding = codecName(newEncoding)
-        if newEncoding in ("utf-16", "utf-16-be", "utf-16-le"):
-            newEncoding = "utf-8"
-        if newEncoding is None:
-            return
-        elif newEncoding == self.charEncoding[0]:
-            self.charEncoding = (self.charEncoding[0], "certain")
-        else:
-            self.rawStream.seek(0)
-            self.reset()
-            self.charEncoding = (newEncoding, "certain")
-            raise ReparseException("Encoding changed from %s to %s" % (self.charEncoding[0], newEncoding))
-
-    def detectBOM(self):
-        """Attempts to detect at BOM at the start of the stream. If
-        an encoding can be determined from the BOM return the name of the
-        encoding otherwise return None"""
-        bomDict = {
-            codecs.BOM_UTF8: 'utf-8',
-            codecs.BOM_UTF16_LE: 'utf-16-le', codecs.BOM_UTF16_BE: 'utf-16-be',
-            codecs.BOM_UTF32_LE: 'utf-32-le', codecs.BOM_UTF32_BE: 'utf-32-be'
-        }
-
-        # Go to beginning of file and read in 4 bytes
-        string = self.rawStream.read(4)
-        assert isinstance(string, bytes)
-
-        # Try detecting the BOM using bytes from the string
-        encoding = bomDict.get(string[:3])         # UTF-8
-        seek = 3
-        if not encoding:
-            # Need to detect UTF-32 before UTF-16
-            encoding = bomDict.get(string)         # UTF-32
-            seek = 4
-            if not encoding:
-                encoding = bomDict.get(string[:2])  # UTF-16
-                seek = 2
-
-        # Set the read position past the BOM if one was found, otherwise
-        # set it to the start of the stream
-        self.rawStream.seek(encoding and seek or 0)
-
-        return encoding
-
-    def detectEncodingMeta(self):
-        """Report the encoding declared by the meta element
-        """
-        buffer = self.rawStream.read(self.numBytesMeta)
-        assert isinstance(buffer, bytes)
-        parser = EncodingParser(buffer)
-        self.rawStream.seek(0)
-        encoding = parser.getEncoding()
-
-        if encoding in ("utf-16", "utf-16-be", "utf-16-le"):
-            encoding = "utf-8"
-
-        return encoding
-
-
-class EncodingBytes(bytes):
-    """String-like object with an associated position and various extra methods
-    If the position is ever greater than the string length then an exception is
-    raised"""
-    def __new__(self, value):
-        assert isinstance(value, bytes)
-        return bytes.__new__(self, value.lower())
-
-    def __init__(self, value):
-        self._position = -1
-
-    def __iter__(self):
-        return self
-
-    def __next__(self):
-        p = self._position = self._position + 1
-        if p >= len(self):
-            raise StopIteration
-        elif p < 0:
-            raise TypeError
-        return self[p:p + 1]
-
-    def next(self):
-        # Py2 compat
-        return self.__next__()
-
-    def previous(self):
-        p = self._position
-        if p >= len(self):
-            raise StopIteration
-        elif p < 0:
-            raise TypeError
-        self._position = p = p - 1
-        return self[p:p + 1]
-
-    def setPosition(self, position):
-        if self._position >= len(self):
-            raise StopIteration
-        self._position = position
-
-    def getPosition(self):
-        if self._position >= len(self):
-            raise StopIteration
-        if self._position >= 0:
-            return self._position
-        else:
-            return None
-
-    position = property(getPosition, setPosition)
-
-    def getCurrentByte(self):
-        return self[self.position:self.position + 1]
-
-    currentByte = property(getCurrentByte)
-
-    def skip(self, chars=spaceCharactersBytes):
-        """Skip past a list of characters"""
-        p = self.position               # use property for the error-checking
-        while p < len(self):
-            c = self[p:p + 1]
-            if c not in chars:
-                self._position = p
-                return c
-            p += 1
-        self._position = p
-        return None
-
-    def skipUntil(self, chars):
-        p = self.position
-        while p < len(self):
-            c = self[p:p + 1]
-            if c in chars:
-                self._position = p
-                return c
-            p += 1
-        self._position = p
-        return None
-
-    def matchBytes(self, bytes):
-        """Look for a sequence of bytes at the start of a string. If the bytes
-        are found return True and advance the position to the byte after the
-        match. Otherwise return False and leave the position alone"""
-        p = self.position
-        data = self[p:p + len(bytes)]
-        rv = data.startswith(bytes)
-        if rv:
-            self.position += len(bytes)
-        return rv
-
-    def jumpTo(self, bytes):
-        """Look for the next sequence of bytes matching a given sequence. If
-        a match is found advance the position to the last byte of the match"""
-        newPosition = self[self.position:].find(bytes)
-        if newPosition > -1:
-            # XXX: This is ugly, but I can't see a nicer way to fix this.
-            if self._position == -1:
-                self._position = 0
-            self._position += (newPosition + len(bytes) - 1)
-            return True
-        else:
-            raise StopIteration
-
-
-class EncodingParser(object):
-    """Mini parser for detecting character encoding from meta elements"""
-
-    def __init__(self, data):
-        """string - the data to work on for encoding detection"""
-        self.data = EncodingBytes(data)
-        self.encoding = None
-
-    def getEncoding(self):
-        methodDispatch = (
-            (b"<!--", self.handleComment),
-            (b"<meta", self.handleMeta),
-            (b"</", self.handlePossibleEndTag),
-            (b"<!", self.handleOther),
-            (b"<?", self.handleOther),
-            (b"<", self.handlePossibleStartTag))
-        for byte in self.data:
-            keepParsing = True
-            for key, method in methodDispatch:
-                if self.data.matchBytes(key):
-                    try:
-                        keepParsing = method()
-                        break
-                    except StopIteration:
-                        keepParsing = False
-                        break
-            if not keepParsing:
-                break
-
-        return self.encoding
-
-    def handleComment(self):
-        """Skip over comments"""
-        return self.data.jumpTo(b"-->")
-
-    def handleMeta(self):
-        if self.data.currentByte not in spaceCharactersBytes:
-            # if we have <meta not followed by a space so just keep going
-            return True
-        # We have a valid meta element we want to search for attributes
-        hasPragma = False
-        pendingEncoding = None
-        while True:
-            # Try to find the next attribute after the current position
-            attr = self.getAttribute()
-            if attr is None:
-                return True
-            else:
-                if attr[0] == b"http-equiv":
-                    hasPragma = attr[1] == b"content-type"
-                    if hasPragma and pendingEncoding is not None:
-                        self.encoding = pendingEncoding
-                        return False
-                elif attr[0] == b"charset":
-                    tentativeEncoding = attr[1]
-                    codec = codecName(tentativeEncoding)
-                    if codec is not None:
-                        self.encoding = codec
-                        return False
-                elif attr[0] == b"content":
-                    contentParser = ContentAttrParser(EncodingBytes(attr[1]))
-                    tentativeEncoding = contentParser.parse()
-                    if tentativeEncoding is not None:
-                        codec = codecName(tentativeEncoding)
-                        if codec is not None:
-                            if hasPragma:
-                                self.encoding = codec
-                                return False
-                            else:
-                                pendingEncoding = codec
-
-    def handlePossibleStartTag(self):
-        return self.handlePossibleTag(False)
-
-    def handlePossibleEndTag(self):
-        next(self.data)
-        return self.handlePossibleTag(True)
-
-    def handlePossibleTag(self, endTag):
-        data = self.data
-        if data.currentByte not in asciiLettersBytes:
-            # If the next byte is not an ascii letter either ignore this
-            # fragment (possible start tag case) or treat it according to
-            # handleOther
-            if endTag:
-                data.previous()
-                self.handleOther()
-            return True
-
-        c = data.skipUntil(spacesAngleBrackets)
-        if c == b"<":
-            # return to the first step in the overall "two step" algorithm
-            # reprocessing the < byte
-            data.previous()
-        else:
-            # Read all attributes
-            attr = self.getAttribute()
-            while attr is not None:
-                attr = self.getAttribute()
-        return True
-
-    def handleOther(self):
-        return self.data.jumpTo(b">")
-
-    def getAttribute(self):
-        """Return a name,value pair for the next attribute in the stream,
-        if one is found, or None"""
-        data = self.data
-        # Step 1 (skip chars)
-        c = data.skip(spaceCharactersBytes | frozenset([b"/"]))
-        assert c is None or len(c) == 1
-        # Step 2
-        if c in (b">", None):
-            return None
-        # Step 3
-        attrName = []
-        attrValue = []
-        # Step 4 attribute name
-        while True:
-            if c == b"=" and attrName:
-                break
-            elif c in spaceCharactersBytes:
-                # Step 6!
-                c = data.skip()
-                break
-            elif c in (b"/", b">"):
-                return b"".join(attrName), b""
-            elif c in asciiUppercaseBytes:
-                attrName.append(c.lower())
-            elif c is None:
-                return None
-            else:
-                attrName.append(c)
-            # Step 5
-            c = next(data)
-        # Step 7
-        if c != b"=":
-            data.previous()
-            return b"".join(attrName), b""
-        # Step 8
-        next(data)
-        # Step 9
-        c = data.skip()
-        # Step 10
-        if c in (b"'", b'"'):
-            # 10.1
-            quoteChar = c
-            while True:
-                # 10.2
-                c = next(data)
-                # 10.3
-                if c == quoteChar:
-                    next(data)
-                    return b"".join(attrName), b"".join(attrValue)
-                # 10.4
-                elif c in asciiUppercaseBytes:
-                    attrValue.append(c.lower())
-                # 10.5
-                else:
-                    attrValue.append(c)
-        elif c == b">":
-            return b"".join(attrName), b""
-        elif c in asciiUppercaseBytes:
-            attrValue.append(c.lower())
-        elif c is None:
-            return None
-        else:
-            attrValue.append(c)
-        # Step 11
-        while True:
-            c = next(data)
-            if c in spacesAngleBrackets:
-                return b"".join(attrName), b"".join(attrValue)
-            elif c in asciiUppercaseBytes:
-                attrValue.append(c.lower())
-            elif c is None:
-                return None
-            else:
-                attrValue.append(c)
-
-
-class ContentAttrParser(object):
-    def __init__(self, data):
-        assert isinstance(data, bytes)
-        self.data = data
-
-    def parse(self):
-        try:
-            # Check if the attr name is charset
-            # otherwise return
-            self.data.jumpTo(b"charset")
-            self.data.position += 1
-            self.data.skip()
-            if not self.data.currentByte == b"=":
-                # If there is no = sign keep looking for attrs
-                return None
-            self.data.position += 1
-            self.data.skip()
-            # Look for an encoding between matching quote marks
-            if self.data.currentByte in (b'"', b"'"):
-                quoteMark = self.data.currentByte
-                self.data.position += 1
-                oldPosition = self.data.position
-                if self.data.jumpTo(quoteMark):
-                    return self.data[oldPosition:self.data.position]
-                else:
-                    return None
-            else:
-                # Unquoted value
-                oldPosition = self.data.position
-                try:
-                    self.data.skipUntil(spaceCharactersBytes)
-                    return self.data[oldPosition:self.data.position]
-                except StopIteration:
-                    # Return the whole remaining value
-                    return self.data[oldPosition:]
-        except StopIteration:
-            return None
-
-
-def codecName(encoding):
-    """Return the python codec name corresponding to an encoding or None if the
-    string doesn't correspond to a valid encoding."""
-    if isinstance(encoding, bytes):
-        try:
-            encoding = encoding.decode("ascii")
-        except UnicodeDecodeError:
-            return None
-    if encoding:
-        canonicalName = ascii_punctuation_re.sub("", encoding).lower()
-        return encodings.get(canonicalName, None)
-    else:
-        return None
-- 
cgit 1.2.3-korg