- /**
- * @typedef {import('micromark-util-types').Chunk} Chunk
- * @typedef {import('micromark-util-types').Code} Code
- * @typedef {import('micromark-util-types').Construct} Construct
- * @typedef {import('micromark-util-types').ConstructRecord} ConstructRecord
- * @typedef {import('micromark-util-types').Effects} Effects
- * @typedef {import('micromark-util-types').InitialConstruct} InitialConstruct
- * @typedef {import('micromark-util-types').ParseContext} ParseContext
- * @typedef {import('micromark-util-types').Point} Point
- * @typedef {import('micromark-util-types').State} State
- * @typedef {import('micromark-util-types').Token} Token
- * @typedef {import('micromark-util-types').TokenType} TokenType
- * @typedef {import('micromark-util-types').TokenizeContext} TokenizeContext
- */
- /**
- * @callback Restore
- * @returns {undefined}
- *
- * @typedef Info
- * @property {Restore} restore
- * @property {number} from
- *
- * @callback ReturnHandle
- * Handle a successful run.
- * @param {Construct} construct
- * @param {Info} info
- * @returns {undefined}
- */
- import createDebug from 'debug'
- import {markdownLineEnding} from 'micromark-util-character'
- import {push, splice} from 'micromark-util-chunked'
- import {resolveAll} from 'micromark-util-resolve-all'
- import {codes, values} from 'micromark-util-symbol'
- import {ok as assert} from 'devlop'
- const debug = createDebug('micromark')
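- // A rough usage sketch (an assumption about the caller, not something this
- // module exports or enforces): the parser creates one tokenizer per content
- // type and feeds it chunks, ending with `codes.eof`, before reading events.
- //
- //   const tokenizer = createTokenizer(parser, initialize)
- //   const events = tokenizer.write([...someChunks, codes.eof])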
- /**
- * Create a tokenizer.
- * Tokenizers deal with one type of data (e.g., containers, flow, text).
- * The parser is the object dealing with it all.
- * `initialize` works like other constructs, except that only its `tokenize`
- * function is used, and that function doesn’t receive an `ok` or `nok`.
- * `from` can be given to set the point before the first character, although
- * when further lines are indented, their start column must be set with
- * `defineSkip`.
- *
- * @param {ParseContext} parser
- * @param {InitialConstruct} initialize
- * @param {Omit<Point, '_bufferIndex' | '_index'> | undefined} [from]
- * @returns {TokenizeContext}
- */
- export function createTokenizer(parser, initialize, from) {
- /** @type {Point} */
- let point = Object.assign(
- from ? Object.assign({}, from) : {line: 1, column: 1, offset: 0},
- {_index: 0, _bufferIndex: -1}
- )
- /** @type {Record<string, number>} */
- const columnStart = {}
- /** @type {Array<Construct>} */
- const resolveAllConstructs = []
- /** @type {Array<Chunk>} */
- let chunks = []
- /** @type {Array<Token>} */
- let stack = []
- /** @type {boolean | undefined} */
- let consumed = true
- /**
- * Tools used for tokenizing.
- *
- * @type {Effects}
- */
- const effects = {
- consume,
- enter,
- exit,
- attempt: constructFactory(onsuccessfulconstruct),
- check: constructFactory(onsuccessfulcheck),
- interrupt: constructFactory(onsuccessfulcheck, {interrupt: true})
- }
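- // For reference, a hypothetical construct’s `tokenize` would use these tools
- // roughly like this (the construct and its token type are made up; real
- // constructs live in `micromark-core-commonmark`):
- //
- //   function tokenizeThing(effects, ok, nok) {
- //     return start
- //     /** @type {State} */
- //     function start(code) {
- //       if (code !== codes.exclamationMark) return nok(code)
- //       effects.enter('thing')
- //       effects.consume(code)
- //       effects.exit('thing')
- //       return ok
- //     }
- //   }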
- /**
- * State and tools for resolving and serializing.
- *
- * @type {TokenizeContext}
- */
- const context = {
- previous: codes.eof,
- code: codes.eof,
- containerState: {},
- events: [],
- parser,
- sliceStream,
- sliceSerialize,
- now,
- defineSkip,
- write
- }
- /**
- * The state function.
- *
- * @type {State | undefined}
- */
- let state = initialize.tokenize.call(context, effects)
- /**
- * Track which character we expect to be consumed, to catch bugs.
- *
- * @type {Code}
- */
- let expectedCode
- if (initialize.resolveAll) {
- resolveAllConstructs.push(initialize)
- }
- return context
- /** @type {TokenizeContext['write']} */
- function write(slice) {
- chunks = push(chunks, slice)
- main()
- // Exit if we’re not done, resolve might change stuff.
- if (chunks[chunks.length - 1] !== codes.eof) {
- return []
- }
- addResult(initialize, 0)
- // Otherwise, resolve, and exit.
- context.events = resolveAll(resolveAllConstructs, context.events, context)
- return context.events
- }
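- // Example of the contract above (assumed caller behaviour): writing
- // `['hi', codes.lineFeed]` returns `[]` because the document isn’t done yet;
- // a later `write([codes.eof])` finishes it and returns the resolved events.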
- //
- // Tools.
- //
- /** @type {TokenizeContext['sliceSerialize']} */
- function sliceSerialize(token, expandTabs) {
- return serializeChunks(sliceStream(token), expandTabs)
- }
- /** @type {TokenizeContext['sliceStream']} */
- function sliceStream(token) {
- return sliceChunks(chunks, token)
- }
- /** @type {TokenizeContext['now']} */
- function now() {
- // This is a hot path, so we clone manually instead of `Object.assign({}, point)`
- const {line, column, offset, _index, _bufferIndex} = point
- return {line, column, offset, _index, _bufferIndex}
- }
- /** @type {TokenizeContext['defineSkip']} */
- function defineSkip(value) {
- columnStart[value.line] = value.column
- accountForPotentialSkip()
- debug('position: define skip: `%j`', point)
- }
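- // Sketch of how the document content type is expected to use this
- // (hypothetical numbers): after `context.defineSkip({line: 2, column: 3})`,
- // consuming the line ending that leads into line 2 moves `point` straight to
- // column 3, so the columns of an already handled container prefix (say a
- // `> ` block quote marker) aren’t counted again.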
- //
- // State management.
- //
- /**
- * Main loop (note that `_index` and `_bufferIndex` in `point` are modified by
- * `consume`).
- * Here is where we walk through the chunks, each of which is either a string
- * of several characters or a numerical character code.
- * The reason to do this in a loop, instead of letting states call each other
- * directly, is so the call stack can drain.
- *
- * @returns {undefined}
- */
- function main() {
- /** @type {number} */
- let chunkIndex
- while (point._index < chunks.length) {
- const chunk = chunks[point._index]
- // If we’re in a buffer chunk, loop through it.
- if (typeof chunk === 'string') {
- chunkIndex = point._index
- if (point._bufferIndex < 0) {
- point._bufferIndex = 0
- }
- while (
- point._index === chunkIndex &&
- point._bufferIndex < chunk.length
- ) {
- go(chunk.charCodeAt(point._bufferIndex))
- }
- } else {
- go(chunk)
- }
- }
- }
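- // For reference, the chunk list for `a\tb` looks roughly like
- // `['a', codes.horizontalTab, codes.virtualSpace, …, 'b', codes.eof]`; the
- // exact number of virtual spaces depends on the tab stop chosen by the
- // preprocessor.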
- /**
- * Deal with one code.
- *
- * @param {Code} code
- * @returns {undefined}
- */
- function go(code) {
- assert(consumed === true, 'expected character to be consumed')
- consumed = undefined
- debug('main: passing `%s` to %s', code, state && state.name)
- expectedCode = code
- assert(typeof state === 'function', 'expected state')
- state = state(code)
- }
- /** @type {Effects['consume']} */
- function consume(code) {
- assert(code === expectedCode, 'expected given code to equal expected code')
- debug('consume: `%s`', code)
- assert(
- consumed === undefined,
- 'expected code to not have been consumed: this might be because `return x(code)` instead of `return x` was used'
- )
- assert(
- code === null
- ? context.events.length === 0 ||
- context.events[context.events.length - 1][0] === 'exit'
- : context.events[context.events.length - 1][0] === 'enter',
- 'expected last token to be open'
- )
- if (markdownLineEnding(code)) {
- point.line++
- point.column = 1
- point.offset += code === codes.carriageReturnLineFeed ? 2 : 1
- accountForPotentialSkip()
- debug('position: after eol: `%j`', point)
- } else if (code !== codes.virtualSpace) {
- point.column++
- point.offset++
- }
- // Not in a string chunk.
- if (point._bufferIndex < 0) {
- point._index++
- } else {
- point._bufferIndex++
- // At end of string chunk.
- // @ts-expect-error Points w/ non-negative `_bufferIndex` reference
- // strings.
- if (point._bufferIndex === chunks[point._index].length) {
- point._bufferIndex = -1
- point._index++
- }
- }
- // Expose the previous character.
- context.previous = code
- // Mark as consumed.
- consumed = true
- }
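- // Worked example: consuming `codes.carriageReturnLineFeed` bumps `line`,
- // resets `column` to 1, and adds 2 to `offset` (one code, two characters),
- // while consuming a `codes.virtualSpace` (tab padding) advances only the
- // chunk indices, not `column` or `offset`.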
- /** @type {Effects['enter']} */
- function enter(type, fields) {
- /** @type {Token} */
- // @ts-expect-error Patch instead of assign required fields to help GC.
- const token = fields || {}
- token.type = type
- token.start = now()
- assert(typeof type === 'string', 'expected string type')
- assert(type.length > 0, 'expected non-empty string')
- debug('enter: `%s`', type)
- context.events.push(['enter', token, context])
- stack.push(token)
- return token
- }
- /** @type {Effects['exit']} */
- function exit(type) {
- assert(typeof type === 'string', 'expected string type')
- assert(type.length > 0, 'expected non-empty string')
- const token = stack.pop()
- assert(token, 'cannot close w/o open tokens')
- token.end = now()
- assert(type === token.type, 'expected exit token to match current token')
- assert(
- !(
- token.start._index === token.end._index &&
- token.start._bufferIndex === token.end._bufferIndex
- ),
- 'expected non-empty token (`' + type + '`)'
- )
- debug('exit: `%s`', token.type)
- context.events.push(['exit', token, context])
- return token
- }
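- // Each event pushed above is an `['enter' | 'exit', token, context]` tuple;
- // enters and exits must balance like parentheses, which the asserts in
- // `consume` and `addResult` partially check.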
- /**
- * Use results.
- *
- * @type {ReturnHandle}
- */
- function onsuccessfulconstruct(construct, info) {
- addResult(construct, info.from)
- }
- /**
- * Discard results.
- *
- * @type {ReturnHandle}
- */
- function onsuccessfulcheck(_, info) {
- info.restore()
- }
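- // In other words: `attempt` keeps the events of a successful construct and
- // continues after them, while `check` and `interrupt` always rewind; their
- // result only decides which state to continue in.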
- /**
- * Factory to attempt/check/interrupt.
- *
- * @param {ReturnHandle} onreturn
- * @param {{interrupt?: boolean | undefined} | undefined} [fields]
- */
- function constructFactory(onreturn, fields) {
- return hook
- /**
- * Handle either an object mapping codes to constructs, a list of
- * constructs, or a single construct.
- *
- * @param {Array<Construct> | Construct | ConstructRecord} constructs
- * @param {State} returnState
- * @param {State | undefined} [bogusState]
- * @returns {State}
- */
- function hook(constructs, returnState, bogusState) {
- /** @type {Array<Construct>} */
- let listOfConstructs
- /** @type {number} */
- let constructIndex
- /** @type {Construct} */
- let currentConstruct
- /** @type {Info} */
- let info
- return Array.isArray(constructs)
- ? /* c8 ignore next 1 */
- handleListOfConstructs(constructs)
- : 'tokenize' in constructs
- ? // @ts-expect-error Looks like a construct.
- handleListOfConstructs([constructs])
- : handleMapOfConstructs(constructs)
- /**
- * Handle a map of codes to constructs.
- *
- * @param {ConstructRecord} map
- * @returns {State}
- */
- function handleMapOfConstructs(map) {
- return start
- /** @type {State} */
- function start(code) {
- const def = code !== null && map[code]
- const all = code !== null && map.null
- const list = [
- // To do: add more extension tests.
- /* c8 ignore next 2 */
- ...(Array.isArray(def) ? def : def ? [def] : []),
- ...(Array.isArray(all) ? all : all ? [all] : [])
- ]
- return handleListOfConstructs(list)(code)
- }
- }
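- // A `ConstructRecord` maps character codes to the construct(s) that may start
- // with that code; the `null` key lists constructs tried for every non-eof
- // code. A hypothetical record:
- //
- //   {[codes.greaterThan]: blockQuote, null: [someTextConstruct]}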
- /**
- * Handle a list of constructs.
- *
- * @param {Array<Construct>} list
- * @returns {State}
- */
- function handleListOfConstructs(list) {
- listOfConstructs = list
- constructIndex = 0
- if (list.length === 0) {
- assert(bogusState, 'expected `bogusState` to be given')
- return bogusState
- }
- return handleConstruct(list[constructIndex])
- }
- /**
- * Handle a single construct.
- *
- * @param {Construct} construct
- * @returns {State}
- */
- function handleConstruct(construct) {
- return start
- /** @type {State} */
- function start(code) {
- // To do: storing is probably not needed when there is no bogus state?
- // That currently doesn’t work because `inspect` in the document content
- // type does a check w/o a bogus state, which doesn’t make sense, but
- // skipping the store does seem to help performance.
- info = store()
- currentConstruct = construct
- if (!construct.partial) {
- context.currentConstruct = construct
- }
- // Always populated by defaults.
- assert(
- context.parser.constructs.disable.null,
- 'expected `disable.null` to be populated'
- )
- if (
- construct.name &&
- context.parser.constructs.disable.null.includes(construct.name)
- ) {
- return nok(code)
- }
- return construct.tokenize.call(
- // If we do have fields, create an object w/ `context` as its
- // prototype.
- // This allows a “live binding”, which is needed for `interrupt`.
- fields ? Object.assign(Object.create(context), fields) : context,
- effects,
- ok,
- nok
- )(code)
- }
- }
- /** @type {State} */
- function ok(code) {
- assert(code === expectedCode, 'expected code')
- consumed = true
- onreturn(currentConstruct, info)
- return returnState
- }
- /** @type {State} */
- function nok(code) {
- assert(code === expectedCode, 'expected code')
- consumed = true
- info.restore()
- if (++constructIndex < listOfConstructs.length) {
- return handleConstruct(listOfConstructs[constructIndex])
- }
- return bogusState
- }
- }
- }
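- // Hypothetical use from inside a construct’s state (the names are made up):
- //
- //   return effects.attempt(
- //     someConstruct,
- //     afterConstruct, // `ok`: events are kept, continue after them.
- //     otherwise // `nok`: everything is rewound, try something else.
- //   )(code)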
- /**
- * @param {Construct} construct
- * @param {number} from
- * @returns {undefined}
- */
- function addResult(construct, from) {
- if (construct.resolveAll && !resolveAllConstructs.includes(construct)) {
- resolveAllConstructs.push(construct)
- }
- if (construct.resolve) {
- splice(
- context.events,
- from,
- context.events.length - from,
- construct.resolve(context.events.slice(from), context)
- )
- }
- if (construct.resolveTo) {
- context.events = construct.resolveTo(context.events, context)
- }
- assert(
- construct.partial ||
- context.events.length === 0 ||
- context.events[context.events.length - 1][0] === 'exit',
- 'expected last token to end'
- )
- }
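- // Note on the three resolver hooks used above: `resolve` rewrites only the
- // events of this construct, `resolveTo` receives (and may rewrite) everything
- // up to this point, and `resolveAll` runs once when the document is done (see
- // `write`).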
- /**
- * Store state.
- *
- * @returns {Info}
- */
- function store() {
- const startPoint = now()
- const startPrevious = context.previous
- const startCurrentConstruct = context.currentConstruct
- const startEventsIndex = context.events.length
- const startStack = Array.from(stack)
- return {restore, from: startEventsIndex}
- /**
- * Restore state.
- *
- * @returns {undefined}
- */
- function restore() {
- point = startPoint
- context.previous = startPrevious
- context.currentConstruct = startCurrentConstruct
- context.events.length = startEventsIndex
- stack = startStack
- accountForPotentialSkip()
- debug('position: restore: `%j`', point)
- }
- }
- /**
- * Move the current point a bit forward in the line when it’s on a column
- * skip.
- *
- * @returns {undefined}
- */
- function accountForPotentialSkip() {
- if (point.line in columnStart && point.column < 2) {
- point.column = columnStart[point.line]
- point.offset += columnStart[point.line] - 1
- }
- }
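- // Worked example (hypothetical numbers): with `columnStart[point.line] === 5`
- // and `point.column === 1`, the column jumps to 5 and `offset` grows by 4,
- // accounting for prefix characters that were already handled at the document
- // level.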
- }
- /**
- * Get the chunks from a slice of chunks in the range of a token.
- *
- * @param {Array<Chunk>} chunks
- * @param {Pick<Token, 'end' | 'start'>} token
- * @returns {Array<Chunk>}
- */
- function sliceChunks(chunks, token) {
- const startIndex = token.start._index
- const startBufferIndex = token.start._bufferIndex
- const endIndex = token.end._index
- const endBufferIndex = token.end._bufferIndex
- /** @type {Array<Chunk>} */
- let view
- if (startIndex === endIndex) {
- assert(endBufferIndex > -1, 'expected non-negative end buffer index')
- assert(startBufferIndex > -1, 'expected non-negative start buffer index')
- // @ts-expect-error `_bufferIndex` is used on string chunks.
- view = [chunks[startIndex].slice(startBufferIndex, endBufferIndex)]
- } else {
- view = chunks.slice(startIndex, endIndex)
- if (startBufferIndex > -1) {
- const head = view[0]
- if (typeof head === 'string') {
- view[0] = head.slice(startBufferIndex)
- } else {
- assert(startBufferIndex === 0, 'expected `startBufferIndex` to be `0`')
- view.shift()
- }
- }
- if (endBufferIndex > 0) {
- // @ts-expect-error `_bufferIndex` is used on string chunks.
- view.push(chunks[endIndex].slice(0, endBufferIndex))
- }
- }
- return view
- }
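- // Example: with chunks `['abc', codes.lineFeed, 'de']`, a token from
- // `{_index: 0, _bufferIndex: 1}` to `{_index: 2, _bufferIndex: 1}` slices to
- // `['bc', codes.lineFeed, 'd']`.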
- /**
- * Get the string value of a slice of chunks.
- *
- * @param {Array<Chunk>} chunks
- * @param {boolean | undefined} [expandTabs=false]
- * @returns {string}
- */
- function serializeChunks(chunks, expandTabs) {
- let index = -1
- /** @type {Array<string>} */
- const result = []
- /** @type {boolean | undefined} */
- let atTab
- while (++index < chunks.length) {
- const chunk = chunks[index]
- /** @type {string} */
- let value
- if (typeof chunk === 'string') {
- value = chunk
- } else
- switch (chunk) {
- case codes.carriageReturn: {
- value = values.cr
- break
- }
- case codes.lineFeed: {
- value = values.lf
- break
- }
- case codes.carriageReturnLineFeed: {
- value = values.cr + values.lf
- break
- }
- case codes.horizontalTab: {
- value = expandTabs ? values.space : values.ht
- break
- }
- case codes.virtualSpace: {
- if (!expandTabs && atTab) continue
- value = values.space
- break
- }
- default: {
- assert(typeof chunk === 'number', 'expected number')
- // Currently only the replacement character reaches this branch.
- value = String.fromCharCode(chunk)
- }
- }
- atTab = chunk === codes.horizontalTab
- result.push(value)
- }
- return result.join('')
- }
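- // Example: `serializeChunks(['a', codes.horizontalTab, codes.virtualSpace, 'b'])`
- // yields `'a\tb'` (virtual spaces after a tab are dropped), while passing
- // `true` for `expandTabs` yields `'a  b'` (tab and virtual space both become
- // spaces).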