// create-tokenizer.js
  1. /**
  2. * @typedef {import('micromark-util-types').Chunk} Chunk
  3. * @typedef {import('micromark-util-types').Code} Code
  4. * @typedef {import('micromark-util-types').Construct} Construct
  5. * @typedef {import('micromark-util-types').ConstructRecord} ConstructRecord
  6. * @typedef {import('micromark-util-types').Effects} Effects
  7. * @typedef {import('micromark-util-types').InitialConstruct} InitialConstruct
  8. * @typedef {import('micromark-util-types').ParseContext} ParseContext
  9. * @typedef {import('micromark-util-types').Point} Point
  10. * @typedef {import('micromark-util-types').State} State
  11. * @typedef {import('micromark-util-types').Token} Token
  12. * @typedef {import('micromark-util-types').TokenType} TokenType
  13. * @typedef {import('micromark-util-types').TokenizeContext} TokenizeContext
  14. */
  15. /**
  16. * @callback Restore
  17. * @returns {undefined}
  18. *
  19. * @typedef Info
  20. * @property {Restore} restore
  21. * @property {number} from
  22. *
  23. * @callback ReturnHandle
  24. * Handle a successful run.
  25. * @param {Construct} construct
  26. * @param {Info} info
  27. * @returns {undefined}
  28. */
  29. import createDebug from 'debug'
  30. import {markdownLineEnding} from 'micromark-util-character'
  31. import {push, splice} from 'micromark-util-chunked'
  32. import {resolveAll} from 'micromark-util-resolve-all'
  33. import {codes, values} from 'micromark-util-symbol'
  34. import {ok as assert} from 'devlop'
  35. const debug = createDebug('micromark')
  36. /**
  37. * Create a tokenizer.
  38. * Tokenizers deal with one type of data (e.g., containers, flow, text).
  39. * The parser is the object dealing with it all.
  40. * `initialize` works like other constructs, except that only its `tokenize`
  41. * function is used, in which case it doesn’t receive an `ok` or `nok`.
  42. * `from` can be given to set the point before the first character, although
  43. * when further lines are indented, they must be set with `defineSkip`.
  44. *
  45. * @param {ParseContext} parser
  46. * @param {InitialConstruct} initialize
  47. * @param {Omit<Point, '_bufferIndex' | '_index'> | undefined} [from]
  48. * @returns {TokenizeContext}
  49. */
  50. export function createTokenizer(parser, initialize, from) {
  51. /** @type {Point} */
  52. let point = Object.assign(
  53. from ? Object.assign({}, from) : {line: 1, column: 1, offset: 0},
  54. {_index: 0, _bufferIndex: -1}
  55. )
  56. /** @type {Record<string, number>} */
  57. const columnStart = {}
  58. /** @type {Array<Construct>} */
  59. const resolveAllConstructs = []
  60. /** @type {Array<Chunk>} */
  61. let chunks = []
  62. /** @type {Array<Token>} */
  63. let stack = []
  64. /** @type {boolean | undefined} */
  65. let consumed = true
  66. /**
  67. * Tools used for tokenizing.
  68. *
  69. * @type {Effects}
  70. */
  71. const effects = {
  72. consume,
  73. enter,
  74. exit,
  75. attempt: constructFactory(onsuccessfulconstruct),
  76. check: constructFactory(onsuccessfulcheck),
  77. interrupt: constructFactory(onsuccessfulcheck, {interrupt: true})
  78. }
  79. /**
  80. * State and tools for resolving and serializing.
  81. *
  82. * @type {TokenizeContext}
  83. */
  84. const context = {
  85. previous: codes.eof,
  86. code: codes.eof,
  87. containerState: {},
  88. events: [],
  89. parser,
  90. sliceStream,
  91. sliceSerialize,
  92. now,
  93. defineSkip,
  94. write
  95. }
  96. /**
  97. * The state function.
  98. *
  99. * @type {State | undefined}
  100. */
  101. let state = initialize.tokenize.call(context, effects)
  102. /**
  103. * Track which character we expect to be consumed, to catch bugs.
  104. *
  105. * @type {Code}
  106. */
  107. let expectedCode
  108. if (initialize.resolveAll) {
  109. resolveAllConstructs.push(initialize)
  110. }
  111. return context
  112. /** @type {TokenizeContext['write']} */
  113. function write(slice) {
  114. chunks = push(chunks, slice)
  115. main()
  116. // Exit if we’re not done, resolve might change stuff.
  117. if (chunks[chunks.length - 1] !== codes.eof) {
  118. return []
  119. }
  120. addResult(initialize, 0)
  121. // Otherwise, resolve, and exit.
  122. context.events = resolveAll(resolveAllConstructs, context.events, context)
  123. return context.events
  124. }
  125. //
  126. // Tools.
  127. //
  128. /** @type {TokenizeContext['sliceSerialize']} */
  129. function sliceSerialize(token, expandTabs) {
  130. return serializeChunks(sliceStream(token), expandTabs)
  131. }
  132. /** @type {TokenizeContext['sliceStream']} */
  133. function sliceStream(token) {
  134. return sliceChunks(chunks, token)
  135. }
  136. /** @type {TokenizeContext['now']} */
  137. function now() {
  138. // This is a hot path, so we clone manually instead of `Object.assign({}, point)`
  139. const {line, column, offset, _index, _bufferIndex} = point
  140. return {line, column, offset, _index, _bufferIndex}
  141. }
  142. /** @type {TokenizeContext['defineSkip']} */
  143. function defineSkip(value) {
  144. columnStart[value.line] = value.column
  145. accountForPotentialSkip()
  146. debug('position: define skip: `%j`', point)
  147. }
  148. //
  149. // State management.
  150. //
  151. /**
  152. * Main loop (note that `_index` and `_bufferIndex` in `point` are modified by
  153. * `consume`).
  154. * Here is where we walk through the chunks, which either include strings of
  155. * several characters, or numerical character codes.
  156. * The reason to do this in a loop instead of a call is so the stack can
  157. * drain.
  158. *
  159. * @returns {undefined}
  160. */
  161. function main() {
  162. /** @type {number} */
  163. let chunkIndex
  164. while (point._index < chunks.length) {
  165. const chunk = chunks[point._index]
  166. // If we’re in a buffer chunk, loop through it.
  167. if (typeof chunk === 'string') {
  168. chunkIndex = point._index
  169. if (point._bufferIndex < 0) {
  170. point._bufferIndex = 0
  171. }
  172. while (
  173. point._index === chunkIndex &&
  174. point._bufferIndex < chunk.length
  175. ) {
  176. go(chunk.charCodeAt(point._bufferIndex))
  177. }
  178. } else {
  179. go(chunk)
  180. }
  181. }
  182. }
  183. /**
  184. * Deal with one code.
  185. *
  186. * @param {Code} code
  187. * @returns {undefined}
  188. */
  189. function go(code) {
  190. assert(consumed === true, 'expected character to be consumed')
  191. consumed = undefined
  192. debug('main: passing `%s` to %s', code, state && state.name)
  193. expectedCode = code
  194. assert(typeof state === 'function', 'expected state')
  195. state = state(code)
  196. }
  197. /** @type {Effects['consume']} */
  198. function consume(code) {
  199. assert(code === expectedCode, 'expected given code to equal expected code')
  200. debug('consume: `%s`', code)
  201. assert(
  202. consumed === undefined,
  203. 'expected code to not have been consumed: this might be because `return x(code)` instead of `return x` was used'
  204. )
  205. assert(
  206. code === null
  207. ? context.events.length === 0 ||
  208. context.events[context.events.length - 1][0] === 'exit'
  209. : context.events[context.events.length - 1][0] === 'enter',
  210. 'expected last token to be open'
  211. )
  212. if (markdownLineEnding(code)) {
  213. point.line++
  214. point.column = 1
  215. point.offset += code === codes.carriageReturnLineFeed ? 2 : 1
  216. accountForPotentialSkip()
  217. debug('position: after eol: `%j`', point)
  218. } else if (code !== codes.virtualSpace) {
  219. point.column++
  220. point.offset++
  221. }
  222. // Not in a string chunk.
  223. if (point._bufferIndex < 0) {
  224. point._index++
  225. } else {
  226. point._bufferIndex++
  227. // At end of string chunk.
  228. // @ts-expect-error Points w/ non-negative `_bufferIndex` reference
  229. // strings.
  230. if (point._bufferIndex === chunks[point._index].length) {
  231. point._bufferIndex = -1
  232. point._index++
  233. }
  234. }
  235. // Expose the previous character.
  236. context.previous = code
  237. // Mark as consumed.
  238. consumed = true
  239. }
  240. /** @type {Effects['enter']} */
  241. function enter(type, fields) {
  242. /** @type {Token} */
  243. // @ts-expect-error Patch instead of assign required fields to help GC.
  244. const token = fields || {}
  245. token.type = type
  246. token.start = now()
  247. assert(typeof type === 'string', 'expected string type')
  248. assert(type.length > 0, 'expected non-empty string')
  249. debug('enter: `%s`', type)
  250. context.events.push(['enter', token, context])
  251. stack.push(token)
  252. return token
  253. }
  254. /** @type {Effects['exit']} */
  255. function exit(type) {
  256. assert(typeof type === 'string', 'expected string type')
  257. assert(type.length > 0, 'expected non-empty string')
  258. const token = stack.pop()
  259. assert(token, 'cannot close w/o open tokens')
  260. token.end = now()
  261. assert(type === token.type, 'expected exit token to match current token')
  262. assert(
  263. !(
  264. token.start._index === token.end._index &&
  265. token.start._bufferIndex === token.end._bufferIndex
  266. ),
  267. 'expected non-empty token (`' + type + '`)'
  268. )
  269. debug('exit: `%s`', token.type)
  270. context.events.push(['exit', token, context])
  271. return token
  272. }
  273. /**
  274. * Use results.
  275. *
  276. * @type {ReturnHandle}
  277. */
  278. function onsuccessfulconstruct(construct, info) {
  279. addResult(construct, info.from)
  280. }
  281. /**
  282. * Discard results.
  283. *
  284. * @type {ReturnHandle}
  285. */
  286. function onsuccessfulcheck(_, info) {
  287. info.restore()
  288. }
  289. /**
  290. * Factory to attempt/check/interrupt.
  291. *
  292. * @param {ReturnHandle} onreturn
  293. * @param {{interrupt?: boolean | undefined} | undefined} [fields]
  294. */
  295. function constructFactory(onreturn, fields) {
  296. return hook
  297. /**
  298. * Handle either an object mapping codes to constructs, a list of
  299. * constructs, or a single construct.
  300. *
  301. * @param {Array<Construct> | Construct | ConstructRecord} constructs
  302. * @param {State} returnState
  303. * @param {State | undefined} [bogusState]
  304. * @returns {State}
  305. */
  306. function hook(constructs, returnState, bogusState) {
  307. /** @type {Array<Construct>} */
  308. let listOfConstructs
  309. /** @type {number} */
  310. let constructIndex
  311. /** @type {Construct} */
  312. let currentConstruct
  313. /** @type {Info} */
  314. let info
  315. return Array.isArray(constructs)
  316. ? /* c8 ignore next 1 */
  317. handleListOfConstructs(constructs)
  318. : 'tokenize' in constructs
  319. ? // @ts-expect-error Looks like a construct.
  320. handleListOfConstructs([constructs])
  321. : handleMapOfConstructs(constructs)
  322. /**
  323. * Handle a list of construct.
  324. *
  325. * @param {ConstructRecord} map
  326. * @returns {State}
  327. */
  328. function handleMapOfConstructs(map) {
  329. return start
  330. /** @type {State} */
  331. function start(code) {
  332. const def = code !== null && map[code]
  333. const all = code !== null && map.null
  334. const list = [
  335. // To do: add more extension tests.
  336. /* c8 ignore next 2 */
  337. ...(Array.isArray(def) ? def : def ? [def] : []),
  338. ...(Array.isArray(all) ? all : all ? [all] : [])
  339. ]
  340. return handleListOfConstructs(list)(code)
  341. }
  342. }
  343. /**
  344. * Handle a list of construct.
  345. *
  346. * @param {Array<Construct>} list
  347. * @returns {State}
  348. */
  349. function handleListOfConstructs(list) {
  350. listOfConstructs = list
  351. constructIndex = 0
  352. if (list.length === 0) {
  353. assert(bogusState, 'expected `bogusState` to be given')
  354. return bogusState
  355. }
  356. return handleConstruct(list[constructIndex])
  357. }
  358. /**
  359. * Handle a single construct.
  360. *
  361. * @param {Construct} construct
  362. * @returns {State}
  363. */
  364. function handleConstruct(construct) {
  365. return start
  366. /** @type {State} */
  367. function start(code) {
  368. // To do: not needed to store if there is no bogus state, probably?
  369. // Currently doesn’t work because `inspect` in document does a check
  370. // w/o a bogus, which doesn’t make sense. But it does seem to help perf
  371. // by not storing.
  372. info = store()
  373. currentConstruct = construct
  374. if (!construct.partial) {
  375. context.currentConstruct = construct
  376. }
  377. // Always populated by defaults.
  378. assert(
  379. context.parser.constructs.disable.null,
  380. 'expected `disable.null` to be populated'
  381. )
  382. if (
  383. construct.name &&
  384. context.parser.constructs.disable.null.includes(construct.name)
  385. ) {
  386. return nok(code)
  387. }
  388. return construct.tokenize.call(
  389. // If we do have fields, create an object w/ `context` as its
  390. // prototype.
  391. // This allows a “live binding”, which is needed for `interrupt`.
  392. fields ? Object.assign(Object.create(context), fields) : context,
  393. effects,
  394. ok,
  395. nok
  396. )(code)
  397. }
  398. }
  399. /** @type {State} */
  400. function ok(code) {
  401. assert(code === expectedCode, 'expected code')
  402. consumed = true
  403. onreturn(currentConstruct, info)
  404. return returnState
  405. }
  406. /** @type {State} */
  407. function nok(code) {
  408. assert(code === expectedCode, 'expected code')
  409. consumed = true
  410. info.restore()
  411. if (++constructIndex < listOfConstructs.length) {
  412. return handleConstruct(listOfConstructs[constructIndex])
  413. }
  414. return bogusState
  415. }
  416. }
  417. }
  418. /**
  419. * @param {Construct} construct
  420. * @param {number} from
  421. * @returns {undefined}
  422. */
  423. function addResult(construct, from) {
  424. if (construct.resolveAll && !resolveAllConstructs.includes(construct)) {
  425. resolveAllConstructs.push(construct)
  426. }
  427. if (construct.resolve) {
  428. splice(
  429. context.events,
  430. from,
  431. context.events.length - from,
  432. construct.resolve(context.events.slice(from), context)
  433. )
  434. }
  435. if (construct.resolveTo) {
  436. context.events = construct.resolveTo(context.events, context)
  437. }
  438. assert(
  439. construct.partial ||
  440. context.events.length === 0 ||
  441. context.events[context.events.length - 1][0] === 'exit',
  442. 'expected last token to end'
  443. )
  444. }
  445. /**
  446. * Store state.
  447. *
  448. * @returns {Info}
  449. */
  450. function store() {
  451. const startPoint = now()
  452. const startPrevious = context.previous
  453. const startCurrentConstruct = context.currentConstruct
  454. const startEventsIndex = context.events.length
  455. const startStack = Array.from(stack)
  456. return {restore, from: startEventsIndex}
  457. /**
  458. * Restore state.
  459. *
  460. * @returns {undefined}
  461. */
  462. function restore() {
  463. point = startPoint
  464. context.previous = startPrevious
  465. context.currentConstruct = startCurrentConstruct
  466. context.events.length = startEventsIndex
  467. stack = startStack
  468. accountForPotentialSkip()
  469. debug('position: restore: `%j`', point)
  470. }
  471. }
  472. /**
  473. * Move the current point a bit forward in the line when it’s on a column
  474. * skip.
  475. *
  476. * @returns {undefined}
  477. */
  478. function accountForPotentialSkip() {
  479. if (point.line in columnStart && point.column < 2) {
  480. point.column = columnStart[point.line]
  481. point.offset += columnStart[point.line] - 1
  482. }
  483. }
  484. }
  485. /**
  486. * Get the chunks from a slice of chunks in the range of a token.
  487. *
  488. * @param {Array<Chunk>} chunks
  489. * @param {Pick<Token, 'end' | 'start'>} token
  490. * @returns {Array<Chunk>}
  491. */
  492. function sliceChunks(chunks, token) {
  493. const startIndex = token.start._index
  494. const startBufferIndex = token.start._bufferIndex
  495. const endIndex = token.end._index
  496. const endBufferIndex = token.end._bufferIndex
  497. /** @type {Array<Chunk>} */
  498. let view
  499. if (startIndex === endIndex) {
  500. assert(endBufferIndex > -1, 'expected non-negative end buffer index')
  501. assert(startBufferIndex > -1, 'expected non-negative start buffer index')
  502. // @ts-expect-error `_bufferIndex` is used on string chunks.
  503. view = [chunks[startIndex].slice(startBufferIndex, endBufferIndex)]
  504. } else {
  505. view = chunks.slice(startIndex, endIndex)
  506. if (startBufferIndex > -1) {
  507. const head = view[0]
  508. if (typeof head === 'string') {
  509. view[0] = head.slice(startBufferIndex)
  510. } else {
  511. assert(startBufferIndex === 0, 'expected `startBufferIndex` to be `0`')
  512. view.shift()
  513. }
  514. }
  515. if (endBufferIndex > 0) {
  516. // @ts-expect-error `_bufferIndex` is used on string chunks.
  517. view.push(chunks[endIndex].slice(0, endBufferIndex))
  518. }
  519. }
  520. return view
  521. }
  522. /**
  523. * Get the string value of a slice of chunks.
  524. *
  525. * @param {Array<Chunk>} chunks
  526. * @param {boolean | undefined} [expandTabs=false]
  527. * @returns {string}
  528. */
  529. function serializeChunks(chunks, expandTabs) {
  530. let index = -1
  531. /** @type {Array<string>} */
  532. const result = []
  533. /** @type {boolean | undefined} */
  534. let atTab
  535. while (++index < chunks.length) {
  536. const chunk = chunks[index]
  537. /** @type {string} */
  538. let value
  539. if (typeof chunk === 'string') {
  540. value = chunk
  541. } else
  542. switch (chunk) {
  543. case codes.carriageReturn: {
  544. value = values.cr
  545. break
  546. }
  547. case codes.lineFeed: {
  548. value = values.lf
  549. break
  550. }
  551. case codes.carriageReturnLineFeed: {
  552. value = values.cr + values.lf
  553. break
  554. }
  555. case codes.horizontalTab: {
  556. value = expandTabs ? values.space : values.ht
  557. break
  558. }
  559. case codes.virtualSpace: {
  560. if (!expandTabs && atTab) continue
  561. value = values.space
  562. break
  563. }
  564. default: {
  565. assert(typeof chunk === 'number', 'expected number')
  566. // Currently only replacement character.
  567. value = String.fromCharCode(chunk)
  568. }
  569. }
  570. atTab = chunk === codes.horizontalTab
  571. result.push(value)
  572. }
  573. return result.join('')
  574. }