// index.js
import {asciiAlphanumeric} from 'micromark-util-character'
import {encode} from 'micromark-util-encode'
import {codes, values} from 'micromark-util-symbol'
  4. /**
  5. * Make a value safe for injection as a URL.
  6. *
  7. * This encodes unsafe characters with percent-encoding and skips already
  8. * encoded sequences (see `normalizeUri`).
  9. * Further unsafe characters are encoded as character references (see
  10. * `micromark-util-encode`).
  11. *
  12. * A regex of allowed protocols can be given, in which case the URL is
  13. * sanitized.
  14. * For example, `/^(https?|ircs?|mailto|xmpp)$/i` can be used for `a[href]`, or
  15. * `/^https?$/i` for `img[src]` (this is what `github.com` allows).
  16. * If the URL includes an unknown protocol (one not matched by `protocol`, such
  17. * as a dangerous example, `javascript:`), the value is ignored.
  18. *
  19. * @param {string | null | undefined} url
  20. * URI to sanitize.
  21. * @param {RegExp | null | undefined} [protocol]
  22. * Allowed protocols.
  23. * @returns {string}
  24. * Sanitized URI.
  25. */
  26. export function sanitizeUri(url, protocol) {
  27. const value = encode(normalizeUri(url || ''))
  28. if (!protocol) {
  29. return value
  30. }
  31. const colon = value.indexOf(':')
  32. const questionMark = value.indexOf('?')
  33. const numberSign = value.indexOf('#')
  34. const slash = value.indexOf('/')
  35. if (
  36. // If there is no protocol, it’s relative.
  37. colon < 0 ||
  38. // If the first colon is after a `?`, `#`, or `/`, it’s not a protocol.
  39. (slash > -1 && colon > slash) ||
  40. (questionMark > -1 && colon > questionMark) ||
  41. (numberSign > -1 && colon > numberSign) ||
  42. // It is a protocol, it should be allowed.
  43. protocol.test(value.slice(0, colon))
  44. ) {
  45. return value
  46. }
  47. return ''
  48. }
  49. /**
  50. * Normalize a URL.
  51. *
  52. * Encode unsafe characters with percent-encoding, skipping already encoded
  53. * sequences.
  54. *
  55. * @param {string} value
  56. * URI to normalize.
  57. * @returns {string}
  58. * Normalized URI.
  59. */
  60. export function normalizeUri(value) {
  61. /** @type {Array<string>} */
  62. const result = []
  63. let index = -1
  64. let start = 0
  65. let skip = 0
  66. while (++index < value.length) {
  67. const code = value.charCodeAt(index)
  68. /** @type {string} */
  69. let replace = ''
  70. // A correct percent encoded value.
  71. if (
  72. code === codes.percentSign &&
  73. asciiAlphanumeric(value.charCodeAt(index + 1)) &&
  74. asciiAlphanumeric(value.charCodeAt(index + 2))
  75. ) {
  76. skip = 2
  77. }
  78. // ASCII.
  79. else if (code < 128) {
  80. if (!/[!#$&-;=?-Z_a-z~]/.test(String.fromCharCode(code))) {
  81. replace = String.fromCharCode(code)
  82. }
  83. }
  84. // Astral.
  85. else if (code > 55_295 && code < 57_344) {
  86. const next = value.charCodeAt(index + 1)
  87. // A correct surrogate pair.
  88. if (code < 56_320 && next > 56_319 && next < 57_344) {
  89. replace = String.fromCharCode(code, next)
  90. skip = 1
  91. }
  92. // Lone surrogate.
  93. else {
  94. replace = values.replacementCharacter
  95. }
  96. }
  97. // Unicode.
  98. else {
  99. replace = String.fromCharCode(code)
  100. }
  101. if (replace) {
  102. result.push(value.slice(start, index), encodeURIComponent(replace))
  103. start = index + skip + 1
  104. replace = ''
  105. }
  106. if (skip) {
  107. index += skip
  108. skip = 0
  109. }
  110. }
  111. return result.join('') + value.slice(start)
  112. }