character-reference.js 3.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163
  1. /**
  2. * @typedef {import('micromark-util-types').Code} Code
  3. * @typedef {import('micromark-util-types').Construct} Construct
  4. * @typedef {import('micromark-util-types').State} State
  5. * @typedef {import('micromark-util-types').TokenizeContext} TokenizeContext
  6. * @typedef {import('micromark-util-types').Tokenizer} Tokenizer
  7. */
  8. import {decodeNamedCharacterReference} from 'decode-named-character-reference'
  9. import {
  10. asciiAlphanumeric,
  11. asciiDigit,
  12. asciiHexDigit
  13. } from 'micromark-util-character'
  14. import {codes, constants, types} from 'micromark-util-symbol'
  15. import {ok as assert} from 'devlop'
  16. /** @type {Construct} */
  17. export const characterReference = {
  18. name: 'characterReference',
  19. tokenize: tokenizeCharacterReference
  20. }
  21. /**
  22. * @this {TokenizeContext}
  23. * @type {Tokenizer}
  24. */
  25. function tokenizeCharacterReference(effects, ok, nok) {
  26. const self = this
  27. let size = 0
  28. /** @type {number} */
  29. let max
  30. /** @type {(code: Code) => boolean} */
  31. let test
  32. return start
  33. /**
  34. * Start of character reference.
  35. *
  36. * ```markdown
  37. * > | a&b
  38. * ^
  39. * > | a{b
  40. * ^
  41. * > | a	b
  42. * ^
  43. * ```
  44. *
  45. * @type {State}
  46. */
  47. function start(code) {
  48. assert(code === codes.ampersand, 'expected `&`')
  49. effects.enter(types.characterReference)
  50. effects.enter(types.characterReferenceMarker)
  51. effects.consume(code)
  52. effects.exit(types.characterReferenceMarker)
  53. return open
  54. }
  55. /**
  56. * After `&`, at `#` for numeric references or alphanumeric for named
  57. * references.
  58. *
  59. * ```markdown
  60. * > | a&b
  61. * ^
  62. * > | a{b
  63. * ^
  64. * > | a	b
  65. * ^
  66. * ```
  67. *
  68. * @type {State}
  69. */
  70. function open(code) {
  71. if (code === codes.numberSign) {
  72. effects.enter(types.characterReferenceMarkerNumeric)
  73. effects.consume(code)
  74. effects.exit(types.characterReferenceMarkerNumeric)
  75. return numeric
  76. }
  77. effects.enter(types.characterReferenceValue)
  78. max = constants.characterReferenceNamedSizeMax
  79. test = asciiAlphanumeric
  80. return value(code)
  81. }
  82. /**
  83. * After `#`, at `x` for hexadecimals or digit for decimals.
  84. *
  85. * ```markdown
  86. * > | a{b
  87. * ^
  88. * > | a	b
  89. * ^
  90. * ```
  91. *
  92. * @type {State}
  93. */
  94. function numeric(code) {
  95. if (code === codes.uppercaseX || code === codes.lowercaseX) {
  96. effects.enter(types.characterReferenceMarkerHexadecimal)
  97. effects.consume(code)
  98. effects.exit(types.characterReferenceMarkerHexadecimal)
  99. effects.enter(types.characterReferenceValue)
  100. max = constants.characterReferenceHexadecimalSizeMax
  101. test = asciiHexDigit
  102. return value
  103. }
  104. effects.enter(types.characterReferenceValue)
  105. max = constants.characterReferenceDecimalSizeMax
  106. test = asciiDigit
  107. return value(code)
  108. }
  109. /**
  110. * After markers (`&#x`, `&#`, or `&`), in value, before `;`.
  111. *
  112. * The character reference kind defines what and how many characters are
  113. * allowed.
  114. *
  115. * ```markdown
  116. * > | a&b
  117. * ^^^
  118. * > | a{b
  119. * ^^^
  120. * > | a	b
  121. * ^
  122. * ```
  123. *
  124. * @type {State}
  125. */
  126. function value(code) {
  127. if (code === codes.semicolon && size) {
  128. const token = effects.exit(types.characterReferenceValue)
  129. if (
  130. test === asciiAlphanumeric &&
  131. !decodeNamedCharacterReference(self.sliceSerialize(token))
  132. ) {
  133. return nok(code)
  134. }
  135. // To do: `markdown-rs` uses a different name:
  136. // `CharacterReferenceMarkerSemi`.
  137. effects.enter(types.characterReferenceMarker)
  138. effects.consume(code)
  139. effects.exit(types.characterReferenceMarker)
  140. effects.exit(types.characterReference)
  141. return ok
  142. }
  143. if (test(code) && size++ < max) {
  144. effects.consume(code)
  145. return value
  146. }
  147. return nok(code)
  148. }
  149. }