Permalink
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
codeql-action/node_modules/parse5/lib/tokenizer/preprocessor.js
Go to file
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.

159 lines (123 sloc)
4.38 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
'use strict';

const unicode = require('../common/unicode');
const ERR = require('../common/error-codes');

//Aliases
const $ = unicode.CODE_POINTS;

//Const
//NOTE: default value of Preprocessor#bufferWaterline (1 << 16 === 65536 UTF-16 code units).
//Already-consumed input is only trimmed from the buffer once the read position exceeds it.
const DEFAULT_BUFFER_WATERLINE = 1 << 16;
//Preprocessor | |
//NOTE: HTML input preprocessing | |
//(see: http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#preprocessing-the-input-stream) | |
class Preprocessor { | |
constructor() { | |
this.html = null; | |
this.pos = -1; | |
this.lastGapPos = -1; | |
this.lastCharPos = -1; | |
this.gapStack = []; | |
this.skipNextNewLine = false; | |
this.lastChunkWritten = false; | |
this.endOfChunkHit = false; | |
this.bufferWaterline = DEFAULT_BUFFER_WATERLINE; | |
} | |
_err() { | |
// NOTE: err reporting is noop by default. Enabled by mixin. | |
} | |
_addGap() { | |
this.gapStack.push(this.lastGapPos); | |
this.lastGapPos = this.pos; | |
} | |
_processSurrogate(cp) { | |
//NOTE: try to peek a surrogate pair | |
if (this.pos !== this.lastCharPos) { | |
const nextCp = this.html.charCodeAt(this.pos + 1); | |
if (unicode.isSurrogatePair(nextCp)) { | |
//NOTE: we have a surrogate pair. Peek pair character and recalculate code point. | |
this.pos++; | |
//NOTE: add gap that should be avoided during retreat | |
this._addGap(); | |
return unicode.getSurrogatePairCodePoint(cp, nextCp); | |
} | |
} | |
//NOTE: we are at the end of a chunk, therefore we can't infer surrogate pair yet. | |
else if (!this.lastChunkWritten) { | |
this.endOfChunkHit = true; | |
return $.EOF; | |
} | |
//NOTE: isolated surrogate | |
this._err(ERR.surrogateInInputStream); | |
return cp; | |
} | |
dropParsedChunk() { | |
if (this.pos > this.bufferWaterline) { | |
this.lastCharPos -= this.pos; | |
this.html = this.html.substring(this.pos); | |
this.pos = 0; | |
this.lastGapPos = -1; | |
this.gapStack = []; | |
} | |
} | |
write(chunk, isLastChunk) { | |
if (this.html) { | |
this.html += chunk; | |
} else { | |
this.html = chunk; | |
} | |
this.lastCharPos = this.html.length - 1; | |
this.endOfChunkHit = false; | |
this.lastChunkWritten = isLastChunk; | |
} | |
insertHtmlAtCurrentPos(chunk) { | |
this.html = this.html.substring(0, this.pos + 1) + chunk + this.html.substring(this.pos + 1, this.html.length); | |
this.lastCharPos = this.html.length - 1; | |
this.endOfChunkHit = false; | |
} | |
advance() { | |
this.pos++; | |
if (this.pos > this.lastCharPos) { | |
this.endOfChunkHit = !this.lastChunkWritten; | |
return $.EOF; | |
} | |
let cp = this.html.charCodeAt(this.pos); | |
//NOTE: any U+000A LINE FEED (LF) characters that immediately follow a U+000D CARRIAGE RETURN (CR) character | |
//must be ignored. | |
if (this.skipNextNewLine && cp === $.LINE_FEED) { | |
this.skipNextNewLine = false; | |
this._addGap(); | |
return this.advance(); | |
} | |
//NOTE: all U+000D CARRIAGE RETURN (CR) characters must be converted to U+000A LINE FEED (LF) characters | |
if (cp === $.CARRIAGE_RETURN) { | |
this.skipNextNewLine = true; | |
return $.LINE_FEED; | |
} | |
this.skipNextNewLine = false; | |
if (unicode.isSurrogate(cp)) { | |
cp = this._processSurrogate(cp); | |
} | |
//OPTIMIZATION: first check if code point is in the common allowed | |
//range (ASCII alphanumeric, whitespaces, big chunk of BMP) | |
//before going into detailed performance cost validation. | |
const isCommonValidRange = | |
(cp > 0x1f && cp < 0x7f) || cp === $.LINE_FEED || cp === $.CARRIAGE_RETURN || (cp > 0x9f && cp < 0xfdd0); | |
if (!isCommonValidRange) { | |
this._checkForProblematicCharacters(cp); | |
} | |
return cp; | |
} | |
_checkForProblematicCharacters(cp) { | |
if (unicode.isControlCodePoint(cp)) { | |
this._err(ERR.controlCharacterInInputStream); | |
} else if (unicode.isUndefinedCodePoint(cp)) { | |
this._err(ERR.noncharacterInInputStream); | |
} | |
} | |
retreat() { | |
if (this.pos === this.lastGapPos) { | |
this.lastGapPos = this.gapStack.pop(); | |
this.pos--; | |
} | |
this.pos--; | |
} | |
} | |
module.exports = Preprocessor; |