import {
    CODE_POINTS as $,
    getSurrogatePairCodePoint,
    isControlCodePoint,
    isSurrogate,
    isSurrogatePair,
    isUndefinedCodePoint,
} from '../common/unicode.js';
import { ERR } from '../common/error-codes.js';

//Const
const DEFAULT_BUFFER_WATERLINE = 1 << 16;

//Preprocessor
//NOTE: HTML input preprocessing
//(see: http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#preprocessing-the-input-stream)
export class Preprocessor {
    /**
     * @param {object} handler - Object whose optional `onParseError` callback
     *   receives the error records built by `getError`.
     */
    constructor(handler) {
        this.handler = handler;
        this.html = '';
        this.pos = -1;
        // NOTE: Initial `lastGapPos` is -2, to ensure `col` on initialisation is 0
        this.lastGapPos = -2;
        this.gapStack = [];
        this.skipNextNewLine = false;
        this.lastChunkWritten = false;
        this.endOfChunkHit = false;
        this.bufferWaterline = DEFAULT_BUFFER_WATERLINE;
        this.isEol = false;
        this.lineStartPos = 0;
        this.droppedBufferSize = 0;
        this.line = 1;
        //NOTE: avoid reporting errors twice on advance/retreat
        this.lastErrOffset = -1;
    }

    /** The column on the current line. If we just saw a gap (eg. a surrogate pair), return the index before. */
    get col() {
        return this.pos - this.lineStartPos + Number(this.lastGapPos !== this.pos);
    }

    /** Current buffer position plus the number of characters already discarded by `dropParsedChunk`. */
    get offset() {
        return this.droppedBufferSize + this.pos;
    }

    /**
     * Builds an error record located at the current position. Start and end
     * coordinates are identical (zero-width location).
     *
     * @param {*} code - Error code stored verbatim in the record.
     * @returns {object} Record with `code`, `startLine`/`endLine`,
     *   `startCol`/`endCol` and `startOffset`/`endOffset`.
     */
    getError(code) {
        const { line, col, offset } = this;
        return {
            code,
            startLine: line,
            endLine: line,
            startCol: col,
            endCol: col,
            startOffset: offset,
            endOffset: offset,
        };
    }

    /**
     * Reports `code` through `handler.onParseError`, unless no handler is set
     * or an error was already reported at the current offset.
     *
     * @param {*} code - Error code to report.
     */
    _err(code) {
        if (this.handler.onParseError && this.lastErrOffset !== this.offset) {
            this.lastErrOffset = this.offset;
            this.handler.onParseError(this.getError(code));
        }
    }

    /** Marks the current position as a gap, saving the previous gap position on `gapStack` so `retreat` can restore it. */
    _addGap() {
        this.gapStack.push(this.lastGapPos);
        this.lastGapPos = this.pos;
    }

    /**
     * Handles a surrogate code unit found at the current position.
     *
     * @param {number} cp - The surrogate code unit at `this.pos`.
     * @returns {number} The combined code point when the next code unit
     *   completes a pair, `$.EOF` when the surrogate is the last buffered
     *   code unit and more chunks may follow, otherwise `cp` itself (after
     *   reporting `ERR.surrogateInInputStream`).
     */
    _processSurrogate(cp) {
        //NOTE: try to peek a surrogate pair
        if (this.pos !== this.html.length - 1) {
            const nextCp = this.html.charCodeAt(this.pos + 1);
            if (isSurrogatePair(nextCp)) {
                //NOTE: we have a surrogate pair. Peek pair character and recalculate code point.
                this.pos++;
                //NOTE: add a gap that should be avoided during retreat
                this._addGap();
                return getSurrogatePairCodePoint(cp, nextCp);
            }
        } else if (!this.lastChunkWritten) {
            //NOTE: we are at the end of a chunk, therefore we can't infer the surrogate pair yet.
            this.endOfChunkHit = true;
            return $.EOF;
        }
        //NOTE: isolated surrogate
        this._err(ERR.surrogateInInputStream);
        return cp;
    }

    /** @returns {boolean} `true` when the current position is past `bufferWaterline`. */
    willDropParsedChunk() {
        return this.pos > this.bufferWaterline;
    }

    /**
     * When past the waterline, discards the buffer content before the current
     * position. The character at `pos` is kept and becomes index 0;
     * `lineStartPos` and `droppedBufferSize` are shifted accordingly and all
     * gap bookkeeping is reset.
     */
    dropParsedChunk() {
        if (this.willDropParsedChunk()) {
            this.html = this.html.substring(this.pos);
            this.lineStartPos -= this.pos;
            this.droppedBufferSize += this.pos;
            this.pos = 0;
            this.lastGapPos = -2;
            this.gapStack.length = 0;
        }
    }

    /**
     * Appends a chunk to the buffer and clears `endOfChunkHit`.
     *
     * @param {string} chunk - Text to append.
     * @param {boolean} isLastChunk - Stored as `lastChunkWritten`; while falsy,
     *   running out of buffer sets `endOfChunkHit`.
     */
    write(chunk, isLastChunk) {
        if (this.html.length > 0) {
            this.html += chunk;
        } else {
            this.html = chunk;
        }
        this.endOfChunkHit = false;
        this.lastChunkWritten = isLastChunk;
    }

    /**
     * Inserts `chunk` into the buffer immediately after the current position
     * and clears `endOfChunkHit`.
     *
     * @param {string} chunk - Text to insert.
     */
    insertHtmlAtCurrentPos(chunk) {
        this.html = this.html.substring(0, this.pos + 1) + chunk + this.html.substring(this.pos + 1);
        this.endOfChunkHit = false;
    }

    /**
     * Tests whether the buffer, starting at the current position, begins with
     * `pattern`. Does not move the position.
     *
     * @param {string} pattern - Text to look for. In case-insensitive mode each
     *   buffer code unit is OR-ed with 0x20 before comparison while `pattern`
     *   is used unchanged, so a pattern character without bit 0x20 set (e.g.
     *   an uppercase ASCII letter) can never match.
     * @param {boolean} caseSensitive - Compare exactly when truthy.
     * @returns {boolean} `false` when there is no match or when fewer than
     *   `pattern.length` code units remain (in which case `endOfChunkHit` is
     *   set unless the last chunk was written).
     */
    startsWith(pattern, caseSensitive) {
        // Check if our buffer has enough characters
        if (this.pos + pattern.length > this.html.length) {
            this.endOfChunkHit = !this.lastChunkWritten;
            return false;
        }
        if (caseSensitive) {
            return this.html.startsWith(pattern, this.pos);
        }
        for (let i = 0; i < pattern.length; i++) {
            const cp = this.html.charCodeAt(this.pos + i) | 0x20;
            if (cp !== pattern.charCodeAt(i)) {
                return false;
            }
        }
        return true;
    }

    /**
     * Reads the code unit `offset` positions away from the current one without
     * moving.
     *
     * @param {number} offset - Distance from the current position.
     * @returns {number} The code unit (CR is reported as LF), or `$.EOF` when
     *   the position lies beyond the buffer (`endOfChunkHit` is then set
     *   unless the last chunk was written).
     */
    peek(offset) {
        const pos = this.pos + offset;
        if (pos >= this.html.length) {
            this.endOfChunkHit = !this.lastChunkWritten;
            return $.EOF;
        }
        const code = this.html.charCodeAt(pos);
        return code === $.CARRIAGE_RETURN ? $.LINE_FEED : code;
    }

    /**
     * Moves to the next position and returns the code point found there,
     * updating line tracking as it goes.
     *
     * @returns {number} The next code point: `$.EOF` when the buffer is
     *   exhausted, `$.LINE_FEED` for a CR (an LF directly following a CR is
     *   skipped), a combined code point for a surrogate pair, otherwise the
     *   code unit itself. Control and noncharacter code points are reported
     *   through `_err`.
     */
    advance() {
        this.pos++;
        //NOTE: LF should be in the last column of the line
        if (this.isEol) {
            this.isEol = false;
            this.line++;
            this.lineStartPos = this.pos;
        }
        if (this.pos >= this.html.length) {
            this.endOfChunkHit = !this.lastChunkWritten;
            return $.EOF;
        }
        let cp = this.html.charCodeAt(this.pos);
        //NOTE: all U+000D CARRIAGE RETURN (CR) characters must be converted to U+000A LINE FEED (LF) characters
        if (cp === $.CARRIAGE_RETURN) {
            this.isEol = true;
            this.skipNextNewLine = true;
            return $.LINE_FEED;
        }
        //NOTE: any U+000A LINE FEED (LF) characters that immediately follow a U+000D CARRIAGE RETURN (CR) character
        //must be ignored.
        if (cp === $.LINE_FEED) {
            this.isEol = true;
            if (this.skipNextNewLine) {
                // `line` will be bumped again in the recursive call.
                this.line--;
                this.skipNextNewLine = false;
                this._addGap();
                return this.advance();
            }
        }
        this.skipNextNewLine = false;
        if (isSurrogate(cp)) {
            cp = this._processSurrogate(cp);
        }
        //OPTIMIZATION: first check if code point is in the common allowed
        //range (ASCII alphanumeric, whitespaces, big chunk of BMP)
        //before going into detailed performance cost validation.
        //NOTE: any falsy `onParseError` (not just `null`) short-circuits the check,
        //matching `_err`, which cannot report anything without a handler.
        const isCommonValidRange =
            !this.handler.onParseError ||
            (cp > 0x1f && cp < 0x7f) ||
            cp === $.LINE_FEED ||
            cp === $.CARRIAGE_RETURN ||
            (cp > 0x9f && cp < 0xfdd0);
        if (!isCommonValidRange) {
            this._checkForProblematicCharacters(cp);
        }
        return cp;
    }

    /**
     * Reports `ERR.controlCharacterInInputStream` or
     * `ERR.noncharacterInInputStream` when `cp` is classified as such by the
     * imported unicode helpers.
     *
     * @param {number} cp - Code point to classify.
     */
    _checkForProblematicCharacters(cp) {
        if (isControlCodePoint(cp)) {
            this._err(ERR.controlCharacterInInputStream);
        } else if (isUndefinedCodePoint(cp)) {
            this._err(ERR.noncharacterInInputStream);
        }
    }

    /**
     * Moves the position back by `count`, stepping one extra position for each
     * recorded gap that is crossed, and clears the pending end-of-line flag.
     *
     * @param {number} count - Number of positions to step back.
     */
    retreat(count) {
        this.pos -= count;
        while (this.pos < this.lastGapPos) {
            this.lastGapPos = this.gapStack.pop();
            this.pos--;
        }
        this.isEol = false;
    }
}
//# sourceMappingURL=preprocessor.js.map