diff options
Diffstat (limited to 'bin/wiki/ImportarDesdeURL/node_modules/htmlparser2/lib/Tokenizer.js')
-rw-r--r-- | bin/wiki/ImportarDesdeURL/node_modules/htmlparser2/lib/Tokenizer.js | 970 |
1 files changed, 970 insertions, 0 deletions
// Source: bin/wiki/ImportarDesdeURL/node_modules/htmlparser2/lib/Tokenizer.js
// (new file, 970 lines in the original diff)
//
// Streaming HTML/XML tokenizer implemented as a character-driven state
// machine. Input is appended to an internal buffer via write()/end(); each
// character is dispatched to the handler for the current state, and tokens
// are reported through the callback object passed to the constructor.

module.exports = Tokenizer;

var decodeCodePoint = require("entities/lib/decode_codepoint.js");
var entityMap = require("entities/maps/entities.json");
var legacyMap = require("entities/maps/legacy.json");
var xmlMap = require("entities/maps/xml.json");

//state identifiers: consecutive integers, one per tokenizer state
var i = 0;

var TEXT = i++;
var BEFORE_TAG_NAME = i++; //after <
var IN_TAG_NAME = i++;
var IN_SELF_CLOSING_TAG = i++;
var BEFORE_CLOSING_TAG_NAME = i++;
var IN_CLOSING_TAG_NAME = i++;
var AFTER_CLOSING_TAG_NAME = i++;

//attributes
var BEFORE_ATTRIBUTE_NAME = i++;
var IN_ATTRIBUTE_NAME = i++;
var AFTER_ATTRIBUTE_NAME = i++;
var BEFORE_ATTRIBUTE_VALUE = i++;
var IN_ATTRIBUTE_VALUE_DQ = i++; // "
var IN_ATTRIBUTE_VALUE_SQ = i++; // '
var IN_ATTRIBUTE_VALUE_NQ = i++;

//declarations
var BEFORE_DECLARATION = i++; // !
var IN_DECLARATION = i++;

//processing instructions
var IN_PROCESSING_INSTRUCTION = i++; // ?

//comments
var BEFORE_COMMENT = i++;
var IN_COMMENT = i++;
var AFTER_COMMENT_1 = i++;
var AFTER_COMMENT_2 = i++;

//cdata
var BEFORE_CDATA_1 = i++; // [
var BEFORE_CDATA_2 = i++; // C
var BEFORE_CDATA_3 = i++; // D
var BEFORE_CDATA_4 = i++; // A
var BEFORE_CDATA_5 = i++; // T
var BEFORE_CDATA_6 = i++; // A
var IN_CDATA = i++; // [
var AFTER_CDATA_1 = i++; // ]
var AFTER_CDATA_2 = i++; // ]

//special tags
var BEFORE_SPECIAL = i++; //S
var BEFORE_SPECIAL_END = i++; //S

var BEFORE_SCRIPT_1 = i++; //C
var BEFORE_SCRIPT_2 = i++; //R
var BEFORE_SCRIPT_3 = i++; //I
var BEFORE_SCRIPT_4 = i++; //P
var BEFORE_SCRIPT_5 = i++; //T
var AFTER_SCRIPT_1 = i++; //C
var AFTER_SCRIPT_2 = i++; //R
var AFTER_SCRIPT_3 = i++; //I
var AFTER_SCRIPT_4 = i++; //P
var AFTER_SCRIPT_5 = i++; //T

var BEFORE_STYLE_1 = i++; //T
var BEFORE_STYLE_2 = i++; //Y
var BEFORE_STYLE_3 = i++; //L
var BEFORE_STYLE_4 = i++; //E
var AFTER_STYLE_1 = i++; //T
var AFTER_STYLE_2 = i++; //Y
var AFTER_STYLE_3 = i++; //L
var AFTER_STYLE_4 = i++; //E

var BEFORE_ENTITY = i++; //&
var BEFORE_NUMERIC_ENTITY = i++; //#
var IN_NAMED_ENTITY = i++;
var IN_NUMERIC_ENTITY = i++;
var IN_HEX_ENTITY = i++; //X

//identifiers for the "special" element currently open (see _special)
var j = 0;

var SPECIAL_NONE = j++;
var SPECIAL_SCRIPT = j++;
var SPECIAL_STYLE = j++;

//returns true when `c` is a space, line feed, tab, form feed or carriage return
function whitespace(c) {
    return c === " " || c === "\n" || c === "\t" || c === "\f" || c === "\r";
}

//Builds a state handler that moves to SUCCESS when the current character
//equals `upper` (or its lower-case form) and to FAILURE otherwise.
//On failure the index is decremented so that, after _parse() increments it
//again, the same character is re-processed in the FAILURE state.
function ifElseState(upper, SUCCESS, FAILURE) {
    var lower = upper.toLowerCase();

    if (upper === lower) {
        //no distinct lower-case form (e.g. "#"): a single comparison suffices
        return function(c) {
            if (c === lower) {
                this._state = SUCCESS;
            } else {
                this._state = FAILURE;
                this._index--;
            }
        };
    } else {
        return function(c) {
            if (c === lower || c === upper) {
                this._state = SUCCESS;
            } else {
                this._state = FAILURE;
                this._index--;
            }
        };
    }
}

//Like ifElseState, but a mismatch always falls back to IN_TAG_NAME: the
//characters consumed so far are then treated as part of an ordinary tag name.
function consumeSpecialNameChar(upper, NEXT_STATE) {
    var lower = upper.toLowerCase();

    return function(c) {
        if (c === lower || c === upper) {
            this._state = NEXT_STATE;
        } else {
            this._state = IN_TAG_NAME;
            this._index--; //consume the token again
        }
    };
}

/**
 * Creates a tokenizer.
 *
 * @param {Object} [options] `xmlMode` and `decodeEntities` are read and
 *     coerced to booleans; a missing options object disables both.
 * @param {Object} cbs Callback object. The handlers invoked by this file are
 *     ontext, onopentagname, onopentagend, onselfclosingtag, onclosetag,
 *     onattribname, onattribdata, onattribend, ondeclaration,
 *     onprocessinginstruction, oncomment, oncdata, onerror and onend.
 */
function Tokenizer(options, cbs) {
    this._state = TEXT;
    this._buffer = "";
    this._sectionStart = 0; //start of the token being collected; -1 = none
    this._index = 0; //position of the current character in _buffer
    this._bufferOffset = 0; //chars removed from _buffer
    this._baseState = TEXT; //state to return to once an entity was handled
    this._special = SPECIAL_NONE;
    this._cbs = cbs;
    this._running = true;
    this._ended = false;
    this._xmlMode = !!(options && options.xmlMode);
    this._decodeEntities = !!(options && options.decodeEntities);
}

//TEXT: plain character data until "<" (or "&" when entities are decoded)
Tokenizer.prototype._stateText = function(c) {
    if (c === "<") {
        if (this._index > this._sectionStart) {
            this._cbs.ontext(this._getSection());
        }
        this._state = BEFORE_TAG_NAME;
        this._sectionStart = this._index;
    } else if (
        this._decodeEntities &&
        this._special === SPECIAL_NONE &&
        c === "&"
    ) {
        //entities are not decoded while inside a script/style element
        if (this._index > this._sectionStart) {
            this._cbs.ontext(this._getSection());
        }
        this._baseState = TEXT;
        this._state = BEFORE_ENTITY;
        this._sectionStart = this._index;
    }
};

//BEFORE_TAG_NAME: the character right after "<"
Tokenizer.prototype._stateBeforeTagName = function(c) {
    if (c === "/") {
        this._state = BEFORE_CLOSING_TAG_NAME;
    } else if (c === "<") {
        //"<<": the first "<" is text, the second one may start a tag
        this._cbs.ontext(this._getSection());
        this._sectionStart = this._index;
    } else if (c === ">" || this._special !== SPECIAL_NONE || whitespace(c)) {
        //not a tag; _sectionStart is left untouched so the "<" stays in the text
        this._state = TEXT;
    } else if (c === "!") {
        this._state = BEFORE_DECLARATION;
        this._sectionStart = this._index + 1;
    } else if (c === "?") {
        this._state = IN_PROCESSING_INSTRUCTION;
        this._sectionStart = this._index + 1;
    } else {
        //a leading "s" might start <script> or <style> (HTML mode only)
        this._state =
            !this._xmlMode && (c === "s" || c === "S")
                ? BEFORE_SPECIAL
                : IN_TAG_NAME;
        this._sectionStart = this._index;
    }
};

Tokenizer.prototype._stateInTagName = function(c) {
    if (c === "/" || c === ">" || whitespace(c)) {
        this._emitToken("onopentagname");
        this._state = BEFORE_ATTRIBUTE_NAME;
        this._index--; //re-process the terminator in the new state
    }
};

//NOTE: the "Closeing" spelling is part of the existing method names and is
//kept so that any code referring to them keeps working.
Tokenizer.prototype._stateBeforeCloseingTagName = function(c) {
    if (whitespace(c)) {
        //skip whitespace between "</" and the name
    } else if (c === ">") {
        this._state = TEXT;
    } else if (this._special !== SPECIAL_NONE) {
        //inside script/style only the matching end tag is recognised
        if (c === "s" || c === "S") {
            this._state = BEFORE_SPECIAL_END;
        } else {
            this._state = TEXT;
            this._index--;
        }
    } else {
        this._state = IN_CLOSING_TAG_NAME;
        this._sectionStart = this._index;
    }
};

Tokenizer.prototype._stateInCloseingTagName = function(c) {
    if (c === ">" || whitespace(c)) {
        this._emitToken("onclosetag");
        this._state = AFTER_CLOSING_TAG_NAME;
        this._index--;
    }
};

Tokenizer.prototype._stateAfterCloseingTagName = function(c) {
    //skip everything until ">"
    if (c === ">") {
        this._state = TEXT;
        this._sectionStart = this._index + 1;
    }
};

Tokenizer.prototype._stateBeforeAttributeName = function(c) {
    if (c === ">") {
        this._cbs.onopentagend();
        this._state = TEXT;
        this._sectionStart = this._index + 1;
    } else if (c === "/") {
        this._state = IN_SELF_CLOSING_TAG;
    } else if (!whitespace(c)) {
        this._state = IN_ATTRIBUTE_NAME;
        this._sectionStart = this._index;
    }
};

Tokenizer.prototype._stateInSelfClosingTag = function(c) {
    if (c === ">") {
        this._cbs.onselfclosingtag();
        this._state = TEXT;
        this._sectionStart = this._index + 1;
    } else if (!whitespace(c)) {
        //the "/" was not followed by ">": continue with attribute parsing
        this._state = BEFORE_ATTRIBUTE_NAME;
        this._index--;
    }
};

Tokenizer.prototype._stateInAttributeName = function(c) {
    if (c === "=" || c === "/" || c === ">" || whitespace(c)) {
        this._cbs.onattribname(this._getSection());
        this._sectionStart = -1;
        this._state = AFTER_ATTRIBUTE_NAME;
        this._index--;
    }
};

Tokenizer.prototype._stateAfterAttributeName = function(c) {
    if (c === "=") {
        this._state = BEFORE_ATTRIBUTE_VALUE;
    } else if (c === "/" || c === ">") {
        //attribute without a value, tag is ending
        this._cbs.onattribend();
        this._state = BEFORE_ATTRIBUTE_NAME;
        this._index--;
    } else if (!whitespace(c)) {
        //attribute without a value, directly followed by the next attribute
        this._cbs.onattribend();
        this._state = IN_ATTRIBUTE_NAME;
        this._sectionStart = this._index;
    }
};

Tokenizer.prototype._stateBeforeAttributeValue = function(c) {
    if (c === '"') {
        this._state = IN_ATTRIBUTE_VALUE_DQ;
        this._sectionStart = this._index + 1;
    } else if (c === "'") {
        this._state = IN_ATTRIBUTE_VALUE_SQ;
        this._sectionStart = this._index + 1;
    } else if (!whitespace(c)) {
        this._state = IN_ATTRIBUTE_VALUE_NQ;
        this._sectionStart = this._index;
        this._index--; //reconsume token
    }
};

Tokenizer.prototype._stateInAttributeValueDoubleQuotes = function(c) {
    if (c === '"') {
        this._emitToken("onattribdata");
        this._cbs.onattribend();
        this._state = BEFORE_ATTRIBUTE_NAME;
    } else if (this._decodeEntities && c === "&") {
        //flush the value so far, then decode the entity and come back here
        this._emitToken("onattribdata");
        this._baseState = this._state;
        this._state = BEFORE_ENTITY;
        this._sectionStart = this._index;
    }
};

Tokenizer.prototype._stateInAttributeValueSingleQuotes = function(c) {
    if (c === "'") {
        this._emitToken("onattribdata");
        this._cbs.onattribend();
        this._state = BEFORE_ATTRIBUTE_NAME;
    } else if (this._decodeEntities && c === "&") {
        this._emitToken("onattribdata");
        this._baseState = this._state;
        this._state = BEFORE_ENTITY;
        this._sectionStart = this._index;
    }
};

Tokenizer.prototype._stateInAttributeValueNoQuotes = function(c) {
    if (whitespace(c) || c === ">") {
        this._emitToken("onattribdata");
        this._cbs.onattribend();
        this._state = BEFORE_ATTRIBUTE_NAME;
        this._index--;
    } else if (this._decodeEntities && c === "&") {
        this._emitToken("onattribdata");
        this._baseState = this._state;
        this._state = BEFORE_ENTITY;
        this._sectionStart = this._index;
    }
};

//BEFORE_DECLARATION: the character right after "<!"
Tokenizer.prototype._stateBeforeDeclaration = function(c) {
    this._state =
        c === "["
            ? BEFORE_CDATA_1
            : c === "-"
                ? BEFORE_COMMENT
                : IN_DECLARATION;
};

Tokenizer.prototype._stateInDeclaration = function(c) {
    if (c === ">") {
        this._cbs.ondeclaration(this._getSection());
        this._state = TEXT;
        this._sectionStart = this._index + 1;
    }
};

Tokenizer.prototype._stateInProcessingInstruction = function(c) {
    if (c === ">") {
        this._cbs.onprocessinginstruction(this._getSection());
        this._state = TEXT;
        this._sectionStart = this._index + 1;
    }
};

//BEFORE_COMMENT: "<!-" was seen; a second "-" starts a comment
Tokenizer.prototype._stateBeforeComment = function(c) {
    if (c === "-") {
        this._state = IN_COMMENT;
        this._sectionStart = this._index + 1;
    } else {
        this._state = IN_DECLARATION;
    }
};

Tokenizer.prototype._stateInComment = function(c) {
    if (c === "-") this._state = AFTER_COMMENT_1;
};

Tokenizer.prototype._stateAfterComment1 = function(c) {
    if (c === "-") {
        this._state = AFTER_COMMENT_2;
    } else {
        this._state = IN_COMMENT;
    }
};

Tokenizer.prototype._stateAfterComment2 = function(c) {
    if (c === ">") {
        //remove 2 trailing chars
        this._cbs.oncomment(
            this._buffer.substring(this._sectionStart, this._index - 2)
        );
        this._state = TEXT;
        this._sectionStart = this._index + 1;
    } else if (c !== "-") {
        this._state = IN_COMMENT;
    }
    // else: stay in AFTER_COMMENT_2 (`--->`)
};

//"<![" was seen: match "CDATA" one character per state; any mismatch turns
//the construct into an ordinary declaration
Tokenizer.prototype._stateBeforeCdata1 = ifElseState(
    "C",
    BEFORE_CDATA_2,
    IN_DECLARATION
);
Tokenizer.prototype._stateBeforeCdata2 = ifElseState(
    "D",
    BEFORE_CDATA_3,
    IN_DECLARATION
);
Tokenizer.prototype._stateBeforeCdata3 = ifElseState(
    "A",
    BEFORE_CDATA_4,
    IN_DECLARATION
);
Tokenizer.prototype._stateBeforeCdata4 = ifElseState(
    "T",
    BEFORE_CDATA_5,
    IN_DECLARATION
);
Tokenizer.prototype._stateBeforeCdata5 = ifElseState(
    "A",
    BEFORE_CDATA_6,
    IN_DECLARATION
);

Tokenizer.prototype._stateBeforeCdata6 = function(c) {
    if (c === "[") {
        this._state = IN_CDATA;
        this._sectionStart = this._index + 1;
    } else {
        this._state = IN_DECLARATION;
        this._index--;
    }
};

Tokenizer.prototype._stateInCdata = function(c) {
    if (c === "]") this._state = AFTER_CDATA_1;
};

Tokenizer.prototype._stateAfterCdata1 = function(c) {
    if (c === "]") this._state = AFTER_CDATA_2;
    else this._state = IN_CDATA;
};

Tokenizer.prototype._stateAfterCdata2 = function(c) {
    if (c === ">") {
        //remove 2 trailing chars
        this._cbs.oncdata(
            this._buffer.substring(this._sectionStart, this._index - 2)
        );
        this._state = TEXT;
        this._sectionStart = this._index + 1;
    } else if (c !== "]") {
        this._state = IN_CDATA;
    }
    //else: stay in AFTER_CDATA_2 (`]]]>`)
};

//BEFORE_SPECIAL: "<s" was seen; "c" may continue "script", "t" may continue "style"
Tokenizer.prototype._stateBeforeSpecial = function(c) {
    if (c === "c" || c === "C") {
        this._state = BEFORE_SCRIPT_1;
    } else if (c === "t" || c === "T") {
        this._state = BEFORE_STYLE_1;
    } else {
        this._state = IN_TAG_NAME;
        this._index--; //consume the token again
    }
};

//BEFORE_SPECIAL_END: "</s" was seen while inside a script/style element
Tokenizer.prototype._stateBeforeSpecialEnd = function(c) {
    if (this._special === SPECIAL_SCRIPT && (c === "c" || c === "C")) {
        this._state = AFTER_SCRIPT_1;
    } else if (this._special === SPECIAL_STYLE && (c === "t" || c === "T")) {
        this._state = AFTER_STYLE_1;
    } else this._state = TEXT;
};

Tokenizer.prototype._stateBeforeScript1 = consumeSpecialNameChar(
    "R",
    BEFORE_SCRIPT_2
);
Tokenizer.prototype._stateBeforeScript2 = consumeSpecialNameChar(
    "I",
    BEFORE_SCRIPT_3
);
Tokenizer.prototype._stateBeforeScript3 = consumeSpecialNameChar(
    "P",
    BEFORE_SCRIPT_4
);
Tokenizer.prototype._stateBeforeScript4 = consumeSpecialNameChar(
    "T",
    BEFORE_SCRIPT_5
);

Tokenizer.prototype._stateBeforeScript5 = function(c) {
    //only a complete "script" name (followed by a terminator) is special
    if (c === "/" || c === ">" || whitespace(c)) {
        this._special = SPECIAL_SCRIPT;
    }
    this._state = IN_TAG_NAME;
    this._index--; //consume the token again
};

Tokenizer.prototype._stateAfterScript1 = ifElseState("R", AFTER_SCRIPT_2, TEXT);
Tokenizer.prototype._stateAfterScript2 = ifElseState("I", AFTER_SCRIPT_3, TEXT);
Tokenizer.prototype._stateAfterScript3 = ifElseState("P", AFTER_SCRIPT_4, TEXT);
Tokenizer.prototype._stateAfterScript4 = ifElseState("T", AFTER_SCRIPT_5, TEXT);

Tokenizer.prototype._stateAfterScript5 = function(c) {
    if (c === ">" || whitespace(c)) {
        this._special = SPECIAL_NONE;
        this._state = IN_CLOSING_TAG_NAME;
        //the six characters of "script" precede the current one
        this._sectionStart = this._index - 6;
        this._index--; //reconsume the token
    } else this._state = TEXT;
};

Tokenizer.prototype._stateBeforeStyle1 = consumeSpecialNameChar(
    "Y",
    BEFORE_STYLE_2
);
Tokenizer.prototype._stateBeforeStyle2 = consumeSpecialNameChar(
    "L",
    BEFORE_STYLE_3
);
Tokenizer.prototype._stateBeforeStyle3 = consumeSpecialNameChar(
    "E",
    BEFORE_STYLE_4
);

Tokenizer.prototype._stateBeforeStyle4 = function(c) {
    if (c === "/" || c === ">" || whitespace(c)) {
        this._special = SPECIAL_STYLE;
    }
    this._state = IN_TAG_NAME;
    this._index--; //consume the token again
};

Tokenizer.prototype._stateAfterStyle1 = ifElseState("Y", AFTER_STYLE_2, TEXT);
Tokenizer.prototype._stateAfterStyle2 = ifElseState("L", AFTER_STYLE_3, TEXT);
Tokenizer.prototype._stateAfterStyle3 = ifElseState("E", AFTER_STYLE_4, TEXT);

Tokenizer.prototype._stateAfterStyle4 = function(c) {
    if (c === ">" || whitespace(c)) {
        this._special = SPECIAL_NONE;
        this._state = IN_CLOSING_TAG_NAME;
        //the five characters of "style" precede the current one
        this._sectionStart = this._index - 5;
        this._index--; //reconsume the token
    } else this._state = TEXT;
};

Tokenizer.prototype._stateBeforeEntity = ifElseState(
    "#",
    BEFORE_NUMERIC_ENTITY,
    IN_NAMED_ENTITY
);
Tokenizer.prototype._stateBeforeNumericEntity = ifElseState(
    "X",
    IN_HEX_ENTITY,
    IN_NUMERIC_ENTITY
);

//for entities terminated with a semicolon
Tokenizer.prototype._parseNamedEntityStrict = function() {
    //offset = 1
    if (this._sectionStart + 1 < this._index) {
        var entity = this._buffer.substring(
                this._sectionStart + 1,
                this._index
            ),
            map = this._xmlMode ? xmlMap : entityMap;

        if (map.hasOwnProperty(entity)) {
            this._emitPartial(map[entity]);
            this._sectionStart = this._index + 1;
        }
    }
};

//parses legacy entities (without trailing semicolon)
Tokenizer.prototype._parseLegacyEntity = function() {
    var start = this._sectionStart + 1,
        limit = this._index - start;

    if (limit > 6) limit = 6; //the max length of legacy entities is 6

    //try the longest candidate first and shorten it until one matches
    while (limit >= 2) {
        //the min length of legacy entities is 2
        var entity = this._buffer.substr(start, limit);

        if (legacyMap.hasOwnProperty(entity)) {
            this._emitPartial(legacyMap[entity]);
            this._sectionStart += limit + 1;
            return;
        } else {
            limit--;
        }
    }
};

Tokenizer.prototype._stateInNamedEntity = function(c) {
    if (c === ";") {
        this._parseNamedEntityStrict();
        //the strict lookup advances _sectionStart on success; if it did not,
        //fall back to the legacy table (HTML mode only)
        if (this._sectionStart + 1 < this._index && !this._xmlMode) {
            this._parseLegacyEntity();
        }
        this._state = this._baseState;
    } else if (
        (c < "a" || c > "z") &&
        (c < "A" || c > "Z") &&
        (c < "0" || c > "9")
    ) {
        //a non-alphanumeric character ends the entity name
        if (this._xmlMode) {
            //XML mode: entities without a trailing semicolon are not decoded
        } else if (this._sectionStart + 1 === this._index) {
            //"&" immediately followed by a non-name character: nothing to decode
        } else if (this._baseState !== TEXT) {
            //inside an attribute value: do not decode when followed by "="
            if (c !== "=") {
                this._parseNamedEntityStrict();
            }
        } else {
            this._parseLegacyEntity();
        }

        this._state = this._baseState;
        this._index--;
    }
};

//Decodes the digits between `_sectionStart + offset` and the current index
//using the given radix. `offset` skips the "&#" (2) or "&#x" (3) prefix.
Tokenizer.prototype._decodeNumericEntity = function(offset, base) {
    var sectionStart = this._sectionStart + offset;

    if (sectionStart !== this._index) {
        //parse entity
        var entity = this._buffer.substring(sectionStart, this._index);
        var parsed = parseInt(entity, base);

        this._emitPartial(decodeCodePoint(parsed));
        this._sectionStart = this._index;
    } else {
        //no digits were present
        this._sectionStart--;
    }

    this._state = this._baseState;
};

Tokenizer.prototype._stateInNumericEntity = function(c) {
    if (c === ";") {
        this._decodeNumericEntity(2, 10);
        this._sectionStart++; //skip the ";"
    } else if (c < "0" || c > "9") {
        if (!this._xmlMode) {
            this._decodeNumericEntity(2, 10);
        } else {
            this._state = this._baseState;
        }
        this._index--;
    }
};

Tokenizer.prototype._stateInHexEntity = function(c) {
    if (c === ";") {
        this._decodeNumericEntity(3, 16);
        this._sectionStart++; //skip the ";"
    } else if (
        (c < "a" || c > "f") &&
        (c < "A" || c > "F") &&
        (c < "0" || c > "9")
    ) {
        if (!this._xmlMode) {
            this._decodeNumericEntity(3, 16);
        } else {
            this._state = this._baseState;
        }
        this._index--;
    }
};

//Drops the part of the buffer that is no longer needed after a _parse() run
//and keeps _bufferOffset in sync so getAbsoluteIndex() stays correct.
Tokenizer.prototype._cleanup = function() {
    if (this._sectionStart < 0) {
        //no section is being collected: the whole buffer can go
        this._buffer = "";
        this._bufferOffset += this._index;
        this._index = 0;
    } else if (this._running) {
        if (this._state === TEXT) {
            //flush pending text so it does not have to be kept around
            if (this._sectionStart !== this._index) {
                this._cbs.ontext(this._buffer.substr(this._sectionStart));
            }
            this._buffer = "";
            this._bufferOffset += this._index;
            this._index = 0;
        } else if (this._sectionStart === this._index) {
            //the section just started
            this._buffer = "";
            this._bufferOffset += this._index;
            this._index = 0;
        } else {
            //remove everything unnecessary
            this._buffer = this._buffer.substr(this._sectionStart);
            this._index -= this._sectionStart;
            this._bufferOffset += this._sectionStart;
        }

        this._sectionStart = 0;
    }
};

/**
 * Appends `chunk` to the buffer and tokenizes as far as possible.
 *
 * Once end() has been called, further writes are reported through
 * `onerror` and the chunk is discarded instead of being parsed.
 *
 * @param {string} chunk Text to append.
 */
//TODO make events conditional
Tokenizer.prototype.write = function(chunk) {
    if (this._ended) {
        this._cbs.onerror(Error(".write() after done!"));
        return;
    }

    this._buffer += chunk;
    this._parse();
};

//Main loop: dispatches every buffered character to the handler of the
//current state until the buffer is exhausted or the tokenizer is paused.
//Handlers that want a character re-processed decrement _index, which
//cancels out the increment at the end of each iteration.
Tokenizer.prototype._parse = function() {
    while (this._index < this._buffer.length && this._running) {
        var c = this._buffer.charAt(this._index);
        if (this._state === TEXT) {
            this._stateText(c);
        } else if (this._state === BEFORE_TAG_NAME) {
            this._stateBeforeTagName(c);
        } else if (this._state === IN_TAG_NAME) {
            this._stateInTagName(c);
        } else if (this._state === BEFORE_CLOSING_TAG_NAME) {
            this._stateBeforeCloseingTagName(c);
        } else if (this._state === IN_CLOSING_TAG_NAME) {
            this._stateInCloseingTagName(c);
        } else if (this._state === AFTER_CLOSING_TAG_NAME) {
            this._stateAfterCloseingTagName(c);
        } else if (this._state === IN_SELF_CLOSING_TAG) {
            this._stateInSelfClosingTag(c);
        } else if (this._state === BEFORE_ATTRIBUTE_NAME) {
            /*
             *	attributes
             */
            this._stateBeforeAttributeName(c);
        } else if (this._state === IN_ATTRIBUTE_NAME) {
            this._stateInAttributeName(c);
        } else if (this._state === AFTER_ATTRIBUTE_NAME) {
            this._stateAfterAttributeName(c);
        } else if (this._state === BEFORE_ATTRIBUTE_VALUE) {
            this._stateBeforeAttributeValue(c);
        } else if (this._state === IN_ATTRIBUTE_VALUE_DQ) {
            this._stateInAttributeValueDoubleQuotes(c);
        } else if (this._state === IN_ATTRIBUTE_VALUE_SQ) {
            this._stateInAttributeValueSingleQuotes(c);
        } else if (this._state === IN_ATTRIBUTE_VALUE_NQ) {
            this._stateInAttributeValueNoQuotes(c);
        } else if (this._state === BEFORE_DECLARATION) {
            /*
             *	declarations
             */
            this._stateBeforeDeclaration(c);
        } else if (this._state === IN_DECLARATION) {
            this._stateInDeclaration(c);
        } else if (this._state === IN_PROCESSING_INSTRUCTION) {
            /*
             *	processing instructions
             */
            this._stateInProcessingInstruction(c);
        } else if (this._state === BEFORE_COMMENT) {
            /*
             *	comments
             */
            this._stateBeforeComment(c);
        } else if (this._state === IN_COMMENT) {
            this._stateInComment(c);
        } else if (this._state === AFTER_COMMENT_1) {
            this._stateAfterComment1(c);
        } else if (this._state === AFTER_COMMENT_2) {
            this._stateAfterComment2(c);
        } else if (this._state === BEFORE_CDATA_1) {
            /*
             *	cdata
             */
            this._stateBeforeCdata1(c);
        } else if (this._state === BEFORE_CDATA_2) {
            this._stateBeforeCdata2(c);
        } else if (this._state === BEFORE_CDATA_3) {
            this._stateBeforeCdata3(c);
        } else if (this._state === BEFORE_CDATA_4) {
            this._stateBeforeCdata4(c);
        } else if (this._state === BEFORE_CDATA_5) {
            this._stateBeforeCdata5(c);
        } else if (this._state === BEFORE_CDATA_6) {
            this._stateBeforeCdata6(c);
        } else if (this._state === IN_CDATA) {
            this._stateInCdata(c);
        } else if (this._state === AFTER_CDATA_1) {
            this._stateAfterCdata1(c);
        } else if (this._state === AFTER_CDATA_2) {
            this._stateAfterCdata2(c);
        } else if (this._state === BEFORE_SPECIAL) {
            /*
             *	special tags
             */
            this._stateBeforeSpecial(c);
        } else if (this._state === BEFORE_SPECIAL_END) {
            this._stateBeforeSpecialEnd(c);
        } else if (this._state === BEFORE_SCRIPT_1) {
            /*
             *	script
             */
            this._stateBeforeScript1(c);
        } else if (this._state === BEFORE_SCRIPT_2) {
            this._stateBeforeScript2(c);
        } else if (this._state === BEFORE_SCRIPT_3) {
            this._stateBeforeScript3(c);
        } else if (this._state === BEFORE_SCRIPT_4) {
            this._stateBeforeScript4(c);
        } else if (this._state === BEFORE_SCRIPT_5) {
            this._stateBeforeScript5(c);
        } else if (this._state === AFTER_SCRIPT_1) {
            this._stateAfterScript1(c);
        } else if (this._state === AFTER_SCRIPT_2) {
            this._stateAfterScript2(c);
        } else if (this._state === AFTER_SCRIPT_3) {
            this._stateAfterScript3(c);
        } else if (this._state === AFTER_SCRIPT_4) {
            this._stateAfterScript4(c);
        } else if (this._state === AFTER_SCRIPT_5) {
            this._stateAfterScript5(c);
        } else if (this._state === BEFORE_STYLE_1) {
            /*
             *	style
             */
            this._stateBeforeStyle1(c);
        } else if (this._state === BEFORE_STYLE_2) {
            this._stateBeforeStyle2(c);
        } else if (this._state === BEFORE_STYLE_3) {
            this._stateBeforeStyle3(c);
        } else if (this._state === BEFORE_STYLE_4) {
            this._stateBeforeStyle4(c);
        } else if (this._state === AFTER_STYLE_1) {
            this._stateAfterStyle1(c);
        } else if (this._state === AFTER_STYLE_2) {
            this._stateAfterStyle2(c);
        } else if (this._state === AFTER_STYLE_3) {
            this._stateAfterStyle3(c);
        } else if (this._state === AFTER_STYLE_4) {
            this._stateAfterStyle4(c);
        } else if (this._state === BEFORE_ENTITY) {
            /*
             *	entities
             */
            this._stateBeforeEntity(c);
        } else if (this._state === BEFORE_NUMERIC_ENTITY) {
            this._stateBeforeNumericEntity(c);
        } else if (this._state === IN_NAMED_ENTITY) {
            this._stateInNamedEntity(c);
        } else if (this._state === IN_NUMERIC_ENTITY) {
            this._stateInNumericEntity(c);
        } else if (this._state === IN_HEX_ENTITY) {
            this._stateInHexEntity(c);
        } else {
            this._cbs.onerror(Error("unknown _state"), this._state);
        }

        this._index++;
    }

    this._cleanup();
};

/** Stops _parse() from consuming further characters until resume() is called. */
Tokenizer.prototype.pause = function() {
    this._running = false;
};

/**
 * Continues tokenizing after pause(): parses whatever is still buffered and,
 * if end() was called in the meantime, finishes the stream.
 */
Tokenizer.prototype.resume = function() {
    this._running = true;

    if (this._index < this._buffer.length) {
        this._parse();
    }
    if (this._ended) {
        this._finish();
    }
};

/**
 * Marks the end of the input, optionally writing a final chunk first.
 *
 * Calling end() a second time is reported through `onerror` and otherwise
 * ignored, so the chunk is not parsed and `onend` is not emitted again.
 * While paused, finishing is deferred until resume().
 *
 * @param {string} [chunk] Final text to tokenize; skipped when falsy.
 */
Tokenizer.prototype.end = function(chunk) {
    if (this._ended) {
        this._cbs.onerror(Error(".end() after done!"));
        return;
    }
    if (chunk) this.write(chunk);

    this._ended = true;

    if (this._running) this._finish();
};

Tokenizer.prototype._finish = function() {
    //if there is remaining data, emit it in a reasonable way
    if (this._sectionStart < this._index) {
        this._handleTrailingData();
    }

    this._cbs.onend();
};

//Emits whatever is left in the buffer when the input ends in the middle of
//a token, choosing the callback according to the state the tokenizer is in.
Tokenizer.prototype._handleTrailingData = function() {
    var data = this._buffer.substr(this._sectionStart);

    if (
        this._state === IN_CDATA ||
        this._state === AFTER_CDATA_1 ||
        this._state === AFTER_CDATA_2
    ) {
        this._cbs.oncdata(data);
    } else if (
        this._state === IN_COMMENT ||
        this._state === AFTER_COMMENT_1 ||
        this._state === AFTER_COMMENT_2
    ) {
        this._cbs.oncomment(data);
    } else if (this._state === IN_NAMED_ENTITY && !this._xmlMode) {
        this._parseLegacyEntity();
        //anything left after the entity is handled in the base state
        if (this._sectionStart < this._index) {
            this._state = this._baseState;
            this._handleTrailingData();
        }
    } else if (this._state === IN_NUMERIC_ENTITY && !this._xmlMode) {
        this._decodeNumericEntity(2, 10);
        if (this._sectionStart < this._index) {
            this._state = this._baseState;
            this._handleTrailingData();
        }
    } else if (this._state === IN_HEX_ENTITY && !this._xmlMode) {
        this._decodeNumericEntity(3, 16);
        if (this._sectionStart < this._index) {
            this._state = this._baseState;
            this._handleTrailingData();
        }
    } else if (
        this._state !== IN_TAG_NAME &&
        this._state !== BEFORE_ATTRIBUTE_NAME &&
        this._state !== BEFORE_ATTRIBUTE_VALUE &&
        this._state !== AFTER_ATTRIBUTE_NAME &&
        this._state !== IN_ATTRIBUTE_NAME &&
        this._state !== IN_ATTRIBUTE_VALUE_SQ &&
        this._state !== IN_ATTRIBUTE_VALUE_DQ &&
        this._state !== IN_ATTRIBUTE_VALUE_NQ &&
        this._state !== IN_CLOSING_TAG_NAME
    ) {
        this._cbs.ontext(data);
    }
    //else, ignore remaining data
    //TODO add a way to remove current tag
};

/** Re-initialises the tokenizer, keeping its options and callback object. */
Tokenizer.prototype.reset = function() {
    Tokenizer.call(
        this,
        { xmlMode: this._xmlMode, decodeEntities: this._decodeEntities },
        this._cbs
    );
};

/** Returns the current position counted from the start of all written input. */
Tokenizer.prototype.getAbsoluteIndex = function() {
    return this._bufferOffset + this._index;
};

//returns the buffered text from the section start up to (excluding) the current index
Tokenizer.prototype._getSection = function() {
    return this._buffer.substring(this._sectionStart, this._index);
};

//passes the current section to the named callback and marks the section as consumed
Tokenizer.prototype._emitToken = function(name) {
    this._cbs[name](this._getSection());
    this._sectionStart = -1;
};

//emits a decoded entity as attribute data or text, depending on where it occurred
Tokenizer.prototype._emitPartial = function(value) {
    if (this._baseState !== TEXT) {
        this._cbs.onattribdata(value); //TODO implement the new event
    } else {
        this._cbs.ontext(value);
    }
};