// wrapper for non-node envs ;(function (sax) { sax.parser = function (strict, opt) { return new SAXParser(strict, opt) } sax.SAXParser = SAXParser sax.SAXStream = SAXStream sax.createStream = createStream // When we pass the MAX_BUFFER_LENGTH position, start checking for buffer overruns. // When we check, schedule the next check for MAX_BUFFER_LENGTH - (max(buffer lengths)), // since that's the earliest that a buffer overrun could occur. This way, checks are // as rare as required, but as often as necessary to ensure never crossing this bound. // Furthermore, buffers are only tested at most once per write(), so passing a very // large string into write() might have undesirable effects, but this is manageable by // the caller, so it is assumed to be safe. Thus, a call to write() may, in the extreme // edge case, result in creating at most one complete copy of the string passed in. // Set to Infinity to have unlimited buffers. sax.MAX_BUFFER_LENGTH = 64 * 1024 var buffers = [ "comment", "sgmlDecl", "textNode", "tagName", "doctype", "procInstName", "procInstBody", "entity", "attribName", "attribValue", "cdata", "script" ] sax.EVENTS = // for discoverability. [ "text" , "processinginstruction" , "sgmldeclaration" , "doctype" , "comment" , "attribute" , "opentag" , "closetag" , "opencdata" , "cdata" , "closecdata" , "error" , "end" , "ready" , "script" , "opennamespace" , "closenamespace" ] function SAXParser (strict, opt) { if (!(this instanceof SAXParser)) return new SAXParser(strict, opt) var parser = this clearBuffers(parser) parser.q = parser.c = "" parser.bufferCheckPosition = sax.MAX_BUFFER_LENGTH parser.opt = opt || {} parser.opt.lowercase = parser.opt.lowercase || parser.opt.lowercasetags parser.looseCase = parser.opt.lowercase ? "toLowerCase" : "toUpperCase" parser.tags = [] parser.closed = parser.closedRoot = parser.sawRoot = false parser.tag = parser.error = null parser.strict = !!strict parser.noscript = !!(strict || parser.opt.noscript) parser.state = S.BEGIN parser.strictEntities = parser.opt.strictEntities parser.ENTITIES = parser.strictEntities ? Object.create(sax.XML_ENTITIES) : Object.create(sax.ENTITIES) parser.attribList = [] // namespaces form a prototype chain. // it always points at the current tag, // which protos to its parent tag. if (parser.opt.xmlns) parser.ns = Object.create(rootNS) // mostly just for error reporting parser.trackPosition = parser.opt.position !== false if (parser.trackPosition) { parser.position = parser.line = parser.column = 0 } emit(parser, "onready") } if (!Object.create) Object.create = function (o) { function f () { this.__proto__ = o } f.prototype = o return new f } if (!Object.getPrototypeOf) Object.getPrototypeOf = function (o) { return o.__proto__ } if (!Object.keys) Object.keys = function (o) { var a = [] for (var i in o) if (o.hasOwnProperty(i)) a.push(i) return a } function checkBufferLength (parser) { var maxAllowed = Math.max(sax.MAX_BUFFER_LENGTH, 10) , maxActual = 0 for (var i = 0, l = buffers.length; i < l; i ++) { var len = parser[buffers[i]].length if (len > maxAllowed) { // Text/cdata nodes can get big, and since they're buffered, // we can get here under normal conditions. // Avoid issues by emitting the text node now, // so at least it won't get any bigger. switch (buffers[i]) { case "textNode": closeText(parser) break case "cdata": emitNode(parser, "oncdata", parser.cdata) parser.cdata = "" break case "script": emitNode(parser, "onscript", parser.script) parser.script = "" break default: error(parser, "Max buffer length exceeded: "+buffers[i]) } } maxActual = Math.max(maxActual, len) } // schedule the next check for the earliest possible buffer overrun. parser.bufferCheckPosition = (sax.MAX_BUFFER_LENGTH - maxActual) + parser.position } function clearBuffers (parser) { for (var i = 0, l = buffers.length; i < l; i ++) { parser[buffers[i]] = "" } } function flushBuffers (parser) { closeText(parser) if (parser.cdata !== "") { emitNode(parser, "oncdata", parser.cdata) parser.cdata = "" } if (parser.script !== "") { emitNode(parser, "onscript", parser.script) parser.script = "" } } SAXParser.prototype = { end: function () { end(this) } , write: write , resume: function () { this.error = null; return this } , close: function () { return this.write(null) } , flush: function () { flushBuffers(this) } } try { var Stream = require("stream").Stream } catch (ex) { var Stream = function () {} } var streamWraps = sax.EVENTS.filter(function (ev) { return ev !== "error" && ev !== "end" }) function createStream (strict, opt) { return new SAXStream(strict, opt) } function SAXStream (strict, opt) { if (!(this instanceof SAXStream)) return new SAXStream(strict, opt) Stream.apply(this) this._parser = new SAXParser(strict, opt) this.writable = true this.readable = true var me = this this._parser.onend = function () { me.emit("end") } this._parser.onerror = function (er) { me.emit("error", er) // if didn't throw, then means error was handled. // go ahead and clear error, so we can write again. me._parser.error = null } this._decoder = null; streamWraps.forEach(function (ev) { Object.defineProperty(me, "on" + ev, { get: function () { return me._parser["on" + ev] }, set: function (h) { if (!h) { me.removeAllListeners(ev) return me._parser["on"+ev] = h } me.on(ev, h) }, enumerable: true, configurable: false }) }) } SAXStream.prototype = Object.create(Stream.prototype, { constructor: { value: SAXStream } }) SAXStream.prototype.write = function (data) { if (typeof Buffer === 'function' && typeof Buffer.isBuffer === 'function' && Buffer.isBuffer(data)) { if (!this._decoder) { var SD = require('string_decoder').StringDecoder this._decoder = new SD('utf8') } data = this._decoder.write(data); } this._parser.write(data.toString()) this.emit("data", data) return true } SAXStream.prototype.end = function (chunk) { if (chunk && chunk.length) this.write(chunk) this._parser.end() return true } SAXStream.prototype.on = function (ev, handler) { var me = this if (!me._parser["on"+ev] && streamWraps.indexOf(ev) !== -1) { me._parser["on"+ev] = function () { var args = arguments.length === 1 ? [arguments[0]] : Array.apply(null, arguments) args.splice(0, 0, ev) me.emit.apply(me, args) } } return Stream.prototype.on.call(me, ev, handler) } // character classes and tokens var whitespace = "\r\n\t " // this really needs to be replaced with character classes. // XML allows all manner of ridiculous numbers and digits. , number = "0124356789" , letter = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ" // (Letter | "_" | ":") , quote = "'\"" , entity = number+letter+"#" , attribEnd = whitespace + ">" , CDATA = "[CDATA[" , DOCTYPE = "DOCTYPE" , XML_NAMESPACE = "http://www.w3.org/XML/1998/namespace" , XMLNS_NAMESPACE = "http://www.w3.org/2000/xmlns/" , rootNS = { xml: XML_NAMESPACE, xmlns: XMLNS_NAMESPACE } // turn all the string character sets into character class objects. whitespace = charClass(whitespace) number = charClass(number) letter = charClass(letter) // http://www.w3.org/TR/REC-xml/#NT-NameStartChar // This implementation works on strings, a single character at a time // as such, it cannot ever support astral-plane characters (10000-EFFFF) // without a significant breaking change to either this parser, or the // JavaScript language. Implementation of an emoji-capable xml parser // is left as an exercise for the reader. var nameStart = /[:_A-Za-z\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD]/ var nameBody = /[:_A-Za-z\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD\u00B7\u0300-\u036F\u203F-\u2040\.\d-]/ quote = charClass(quote) entity = charClass(entity) attribEnd = charClass(attribEnd) function charClass (str) { return str.split("").reduce(function (s, c) { s[c] = true return s }, {}) } function isRegExp (c) { return Object.prototype.toString.call(c) === '[object RegExp]' } function is (charclass, c) { return isRegExp(charclass) ? !!c.match(charclass) : charclass[c] } function not (charclass, c) { return !is(charclass, c) } var S = 0 sax.STATE = { BEGIN : S++ // leading byte order mark or whitespace , BEGIN_WHITESPACE : S++ // leading whitespace , TEXT : S++ // general stuff , TEXT_ENTITY : S++ // & and such. , OPEN_WAKA : S++ // < , SGML_DECL : S++ // , SCRIPT : S++ //