// create-tokenizer.js
/**
 * @typedef {import('micromark-util-types').Chunk} Chunk
 * @typedef {import('micromark-util-types').Code} Code
 * @typedef {import('micromark-util-types').Construct} Construct
 * @typedef {import('micromark-util-types').ConstructRecord} ConstructRecord
 * @typedef {import('micromark-util-types').Effects} Effects
 * @typedef {import('micromark-util-types').InitialConstruct} InitialConstruct
 * @typedef {import('micromark-util-types').ParseContext} ParseContext
 * @typedef {import('micromark-util-types').Point} Point
 * @typedef {import('micromark-util-types').State} State
 * @typedef {import('micromark-util-types').Token} Token
 * @typedef {import('micromark-util-types').TokenType} TokenType
 * @typedef {import('micromark-util-types').TokenizeContext} TokenizeContext
 */
/**
 * @callback Restore
 * @returns {undefined}
 *
 * @typedef Info
 * @property {Restore} restore
 * @property {number} from
 *
 * @callback ReturnHandle
 *   Handle a successful run.
 * @param {Construct} construct
 * @param {Info} info
 * @returns {undefined}
 */
import {markdownLineEnding} from 'micromark-util-character'
import {push, splice} from 'micromark-util-chunked'
import {resolveAll} from 'micromark-util-resolve-all'
  32. /**
  33. * Create a tokenizer.
  34. * Tokenizers deal with one type of data (e.g., containers, flow, text).
  35. * The parser is the object dealing with it all.
  36. * `initialize` works like other constructs, except that only its `tokenize`
  37. * function is used, in which case it doesn’t receive an `ok` or `nok`.
  38. * `from` can be given to set the point before the first character, although
  39. * when further lines are indented, they must be set with `defineSkip`.
  40. *
  41. * @param {ParseContext} parser
  42. * @param {InitialConstruct} initialize
  43. * @param {Omit<Point, '_bufferIndex' | '_index'> | undefined} [from]
  44. * @returns {TokenizeContext}
  45. */
  46. export function createTokenizer(parser, initialize, from) {
  47. /** @type {Point} */
  48. let point = Object.assign(
  49. from
  50. ? Object.assign({}, from)
  51. : {
  52. line: 1,
  53. column: 1,
  54. offset: 0
  55. },
  56. {
  57. _index: 0,
  58. _bufferIndex: -1
  59. }
  60. )
  61. /** @type {Record<string, number>} */
  62. const columnStart = {}
  63. /** @type {Array<Construct>} */
  64. const resolveAllConstructs = []
  65. /** @type {Array<Chunk>} */
  66. let chunks = []
  67. /** @type {Array<Token>} */
  68. let stack = []
  69. /** @type {boolean | undefined} */
  70. let consumed = true
  71. /**
  72. * Tools used for tokenizing.
  73. *
  74. * @type {Effects}
  75. */
  76. const effects = {
  77. consume,
  78. enter,
  79. exit,
  80. attempt: constructFactory(onsuccessfulconstruct),
  81. check: constructFactory(onsuccessfulcheck),
  82. interrupt: constructFactory(onsuccessfulcheck, {
  83. interrupt: true
  84. })
  85. }
  86. /**
  87. * State and tools for resolving and serializing.
  88. *
  89. * @type {TokenizeContext}
  90. */
  91. const context = {
  92. previous: null,
  93. code: null,
  94. containerState: {},
  95. events: [],
  96. parser,
  97. sliceStream,
  98. sliceSerialize,
  99. now,
  100. defineSkip,
  101. write
  102. }
  103. /**
  104. * The state function.
  105. *
  106. * @type {State | undefined}
  107. */
  108. let state = initialize.tokenize.call(context, effects)
  109. /**
  110. * Track which character we expect to be consumed, to catch bugs.
  111. *
  112. * @type {Code}
  113. */
  114. let expectedCode
  115. if (initialize.resolveAll) {
  116. resolveAllConstructs.push(initialize)
  117. }
  118. return context
  119. /** @type {TokenizeContext['write']} */
  120. function write(slice) {
  121. chunks = push(chunks, slice)
  122. main()
  123. // Exit if we’re not done, resolve might change stuff.
  124. if (chunks[chunks.length - 1] !== null) {
  125. return []
  126. }
  127. addResult(initialize, 0)
  128. // Otherwise, resolve, and exit.
  129. context.events = resolveAll(resolveAllConstructs, context.events, context)
  130. return context.events
  131. }
  132. //
  133. // Tools.
  134. //
  135. /** @type {TokenizeContext['sliceSerialize']} */
  136. function sliceSerialize(token, expandTabs) {
  137. return serializeChunks(sliceStream(token), expandTabs)
  138. }
  139. /** @type {TokenizeContext['sliceStream']} */
  140. function sliceStream(token) {
  141. return sliceChunks(chunks, token)
  142. }
  143. /** @type {TokenizeContext['now']} */
  144. function now() {
  145. // This is a hot path, so we clone manually instead of `Object.assign({}, point)`
  146. const {line, column, offset, _index, _bufferIndex} = point
  147. return {
  148. line,
  149. column,
  150. offset,
  151. _index,
  152. _bufferIndex
  153. }
  154. }
  155. /** @type {TokenizeContext['defineSkip']} */
  156. function defineSkip(value) {
  157. columnStart[value.line] = value.column
  158. accountForPotentialSkip()
  159. }
  160. //
  161. // State management.
  162. //
  163. /**
  164. * Main loop (note that `_index` and `_bufferIndex` in `point` are modified by
  165. * `consume`).
  166. * Here is where we walk through the chunks, which either include strings of
  167. * several characters, or numerical character codes.
  168. * The reason to do this in a loop instead of a call is so the stack can
  169. * drain.
  170. *
  171. * @returns {undefined}
  172. */
  173. function main() {
  174. /** @type {number} */
  175. let chunkIndex
  176. while (point._index < chunks.length) {
  177. const chunk = chunks[point._index]
  178. // If we’re in a buffer chunk, loop through it.
  179. if (typeof chunk === 'string') {
  180. chunkIndex = point._index
  181. if (point._bufferIndex < 0) {
  182. point._bufferIndex = 0
  183. }
  184. while (
  185. point._index === chunkIndex &&
  186. point._bufferIndex < chunk.length
  187. ) {
  188. go(chunk.charCodeAt(point._bufferIndex))
  189. }
  190. } else {
  191. go(chunk)
  192. }
  193. }
  194. }
  195. /**
  196. * Deal with one code.
  197. *
  198. * @param {Code} code
  199. * @returns {undefined}
  200. */
  201. function go(code) {
  202. consumed = undefined
  203. expectedCode = code
  204. state = state(code)
  205. }
  206. /** @type {Effects['consume']} */
  207. function consume(code) {
  208. if (markdownLineEnding(code)) {
  209. point.line++
  210. point.column = 1
  211. point.offset += code === -3 ? 2 : 1
  212. accountForPotentialSkip()
  213. } else if (code !== -1) {
  214. point.column++
  215. point.offset++
  216. }
  217. // Not in a string chunk.
  218. if (point._bufferIndex < 0) {
  219. point._index++
  220. } else {
  221. point._bufferIndex++
  222. // At end of string chunk.
  223. // @ts-expect-error Points w/ non-negative `_bufferIndex` reference
  224. // strings.
  225. if (point._bufferIndex === chunks[point._index].length) {
  226. point._bufferIndex = -1
  227. point._index++
  228. }
  229. }
  230. // Expose the previous character.
  231. context.previous = code
  232. // Mark as consumed.
  233. consumed = true
  234. }
  235. /** @type {Effects['enter']} */
  236. function enter(type, fields) {
  237. /** @type {Token} */
  238. // @ts-expect-error Patch instead of assign required fields to help GC.
  239. const token = fields || {}
  240. token.type = type
  241. token.start = now()
  242. context.events.push(['enter', token, context])
  243. stack.push(token)
  244. return token
  245. }
  246. /** @type {Effects['exit']} */
  247. function exit(type) {
  248. const token = stack.pop()
  249. token.end = now()
  250. context.events.push(['exit', token, context])
  251. return token
  252. }
  253. /**
  254. * Use results.
  255. *
  256. * @type {ReturnHandle}
  257. */
  258. function onsuccessfulconstruct(construct, info) {
  259. addResult(construct, info.from)
  260. }
  261. /**
  262. * Discard results.
  263. *
  264. * @type {ReturnHandle}
  265. */
  266. function onsuccessfulcheck(_, info) {
  267. info.restore()
  268. }
  269. /**
  270. * Factory to attempt/check/interrupt.
  271. *
  272. * @param {ReturnHandle} onreturn
  273. * @param {{interrupt?: boolean | undefined} | undefined} [fields]
  274. */
  275. function constructFactory(onreturn, fields) {
  276. return hook
  277. /**
  278. * Handle either an object mapping codes to constructs, a list of
  279. * constructs, or a single construct.
  280. *
  281. * @param {Array<Construct> | Construct | ConstructRecord} constructs
  282. * @param {State} returnState
  283. * @param {State | undefined} [bogusState]
  284. * @returns {State}
  285. */
  286. function hook(constructs, returnState, bogusState) {
  287. /** @type {Array<Construct>} */
  288. let listOfConstructs
  289. /** @type {number} */
  290. let constructIndex
  291. /** @type {Construct} */
  292. let currentConstruct
  293. /** @type {Info} */
  294. let info
  295. return Array.isArray(constructs) /* c8 ignore next 1 */
  296. ? handleListOfConstructs(constructs)
  297. : 'tokenize' in constructs
  298. ? // @ts-expect-error Looks like a construct.
  299. handleListOfConstructs([constructs])
  300. : handleMapOfConstructs(constructs)
  301. /**
  302. * Handle a list of construct.
  303. *
  304. * @param {ConstructRecord} map
  305. * @returns {State}
  306. */
  307. function handleMapOfConstructs(map) {
  308. return start
  309. /** @type {State} */
  310. function start(code) {
  311. const def = code !== null && map[code]
  312. const all = code !== null && map.null
  313. const list = [
  314. // To do: add more extension tests.
  315. /* c8 ignore next 2 */
  316. ...(Array.isArray(def) ? def : def ? [def] : []),
  317. ...(Array.isArray(all) ? all : all ? [all] : [])
  318. ]
  319. return handleListOfConstructs(list)(code)
  320. }
  321. }
  322. /**
  323. * Handle a list of construct.
  324. *
  325. * @param {Array<Construct>} list
  326. * @returns {State}
  327. */
  328. function handleListOfConstructs(list) {
  329. listOfConstructs = list
  330. constructIndex = 0
  331. if (list.length === 0) {
  332. return bogusState
  333. }
  334. return handleConstruct(list[constructIndex])
  335. }
  336. /**
  337. * Handle a single construct.
  338. *
  339. * @param {Construct} construct
  340. * @returns {State}
  341. */
  342. function handleConstruct(construct) {
  343. return start
  344. /** @type {State} */
  345. function start(code) {
  346. // To do: not needed to store if there is no bogus state, probably?
  347. // Currently doesn’t work because `inspect` in document does a check
  348. // w/o a bogus, which doesn’t make sense. But it does seem to help perf
  349. // by not storing.
  350. info = store()
  351. currentConstruct = construct
  352. if (!construct.partial) {
  353. context.currentConstruct = construct
  354. }
  355. // Always populated by defaults.
  356. if (
  357. construct.name &&
  358. context.parser.constructs.disable.null.includes(construct.name)
  359. ) {
  360. return nok(code)
  361. }
  362. return construct.tokenize.call(
  363. // If we do have fields, create an object w/ `context` as its
  364. // prototype.
  365. // This allows a “live binding”, which is needed for `interrupt`.
  366. fields ? Object.assign(Object.create(context), fields) : context,
  367. effects,
  368. ok,
  369. nok
  370. )(code)
  371. }
  372. }
  373. /** @type {State} */
  374. function ok(code) {
  375. consumed = true
  376. onreturn(currentConstruct, info)
  377. return returnState
  378. }
  379. /** @type {State} */
  380. function nok(code) {
  381. consumed = true
  382. info.restore()
  383. if (++constructIndex < listOfConstructs.length) {
  384. return handleConstruct(listOfConstructs[constructIndex])
  385. }
  386. return bogusState
  387. }
  388. }
  389. }
  390. /**
  391. * @param {Construct} construct
  392. * @param {number} from
  393. * @returns {undefined}
  394. */
  395. function addResult(construct, from) {
  396. if (construct.resolveAll && !resolveAllConstructs.includes(construct)) {
  397. resolveAllConstructs.push(construct)
  398. }
  399. if (construct.resolve) {
  400. splice(
  401. context.events,
  402. from,
  403. context.events.length - from,
  404. construct.resolve(context.events.slice(from), context)
  405. )
  406. }
  407. if (construct.resolveTo) {
  408. context.events = construct.resolveTo(context.events, context)
  409. }
  410. }
  411. /**
  412. * Store state.
  413. *
  414. * @returns {Info}
  415. */
  416. function store() {
  417. const startPoint = now()
  418. const startPrevious = context.previous
  419. const startCurrentConstruct = context.currentConstruct
  420. const startEventsIndex = context.events.length
  421. const startStack = Array.from(stack)
  422. return {
  423. restore,
  424. from: startEventsIndex
  425. }
  426. /**
  427. * Restore state.
  428. *
  429. * @returns {undefined}
  430. */
  431. function restore() {
  432. point = startPoint
  433. context.previous = startPrevious
  434. context.currentConstruct = startCurrentConstruct
  435. context.events.length = startEventsIndex
  436. stack = startStack
  437. accountForPotentialSkip()
  438. }
  439. }
  440. /**
  441. * Move the current point a bit forward in the line when it’s on a column
  442. * skip.
  443. *
  444. * @returns {undefined}
  445. */
  446. function accountForPotentialSkip() {
  447. if (point.line in columnStart && point.column < 2) {
  448. point.column = columnStart[point.line]
  449. point.offset += columnStart[point.line] - 1
  450. }
  451. }
  452. }
  453. /**
  454. * Get the chunks from a slice of chunks in the range of a token.
  455. *
  456. * @param {Array<Chunk>} chunks
  457. * @param {Pick<Token, 'end' | 'start'>} token
  458. * @returns {Array<Chunk>}
  459. */
  460. function sliceChunks(chunks, token) {
  461. const startIndex = token.start._index
  462. const startBufferIndex = token.start._bufferIndex
  463. const endIndex = token.end._index
  464. const endBufferIndex = token.end._bufferIndex
  465. /** @type {Array<Chunk>} */
  466. let view
  467. if (startIndex === endIndex) {
  468. // @ts-expect-error `_bufferIndex` is used on string chunks.
  469. view = [chunks[startIndex].slice(startBufferIndex, endBufferIndex)]
  470. } else {
  471. view = chunks.slice(startIndex, endIndex)
  472. if (startBufferIndex > -1) {
  473. const head = view[0]
  474. if (typeof head === 'string') {
  475. view[0] = head.slice(startBufferIndex)
  476. } else {
  477. view.shift()
  478. }
  479. }
  480. if (endBufferIndex > 0) {
  481. // @ts-expect-error `_bufferIndex` is used on string chunks.
  482. view.push(chunks[endIndex].slice(0, endBufferIndex))
  483. }
  484. }
  485. return view
  486. }
  487. /**
  488. * Get the string value of a slice of chunks.
  489. *
  490. * @param {Array<Chunk>} chunks
  491. * @param {boolean | undefined} [expandTabs=false]
  492. * @returns {string}
  493. */
  494. function serializeChunks(chunks, expandTabs) {
  495. let index = -1
  496. /** @type {Array<string>} */
  497. const result = []
  498. /** @type {boolean | undefined} */
  499. let atTab
  500. while (++index < chunks.length) {
  501. const chunk = chunks[index]
  502. /** @type {string} */
  503. let value
  504. if (typeof chunk === 'string') {
  505. value = chunk
  506. } else
  507. switch (chunk) {
  508. case -5: {
  509. value = '\r'
  510. break
  511. }
  512. case -4: {
  513. value = '\n'
  514. break
  515. }
  516. case -3: {
  517. value = '\r' + '\n'
  518. break
  519. }
  520. case -2: {
  521. value = expandTabs ? ' ' : '\t'
  522. break
  523. }
  524. case -1: {
  525. if (!expandTabs && atTab) continue
  526. value = ' '
  527. break
  528. }
  529. default: {
  530. // Currently only replacement character.
  531. value = String.fromCharCode(chunk)
  532. }
  533. }
  534. atTab = chunk === -2
  535. result.push(value)
  536. }
  537. return result.join('')
  538. }