index.js 3.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114
  1. import {asciiAlphanumeric} from 'micromark-util-character'
  2. import {encode} from 'micromark-util-encode'
  3. /**
  4. * Make a value safe for injection as a URL.
  5. *
  6. * This encodes unsafe characters with percent-encoding and skips already
  7. * encoded sequences (see `normalizeUri`).
  8. * Further unsafe characters are encoded as character references (see
  9. * `micromark-util-encode`).
  10. *
  11. * A regex of allowed protocols can be given, in which case the URL is
  12. * sanitized.
  13. * For example, `/^(https?|ircs?|mailto|xmpp)$/i` can be used for `a[href]`, or
  14. * `/^https?$/i` for `img[src]` (this is what `github.com` allows).
  15. * If the URL includes an unknown protocol (one not matched by `protocol`, such
  16. * as a dangerous example, `javascript:`), the value is ignored.
  17. *
  18. * @param {string | null | undefined} url
  19. * URI to sanitize.
  20. * @param {RegExp | null | undefined} [protocol]
  21. * Allowed protocols.
  22. * @returns {string}
  23. * Sanitized URI.
  24. */
  25. export function sanitizeUri(url, protocol) {
  26. const value = encode(normalizeUri(url || ''))
  27. if (!protocol) {
  28. return value
  29. }
  30. const colon = value.indexOf(':')
  31. const questionMark = value.indexOf('?')
  32. const numberSign = value.indexOf('#')
  33. const slash = value.indexOf('/')
  34. if (
  35. // If there is no protocol, it’s relative.
  36. colon < 0 ||
  37. // If the first colon is after a `?`, `#`, or `/`, it’s not a protocol.
  38. (slash > -1 && colon > slash) ||
  39. (questionMark > -1 && colon > questionMark) ||
  40. (numberSign > -1 && colon > numberSign) ||
  41. // It is a protocol, it should be allowed.
  42. protocol.test(value.slice(0, colon))
  43. ) {
  44. return value
  45. }
  46. return ''
  47. }
  48. /**
  49. * Normalize a URL.
  50. *
  51. * Encode unsafe characters with percent-encoding, skipping already encoded
  52. * sequences.
  53. *
  54. * @param {string} value
  55. * URI to normalize.
  56. * @returns {string}
  57. * Normalized URI.
  58. */
  59. export function normalizeUri(value) {
  60. /** @type {Array<string>} */
  61. const result = []
  62. let index = -1
  63. let start = 0
  64. let skip = 0
  65. while (++index < value.length) {
  66. const code = value.charCodeAt(index)
  67. /** @type {string} */
  68. let replace = ''
  69. // A correct percent encoded value.
  70. if (
  71. code === 37 &&
  72. asciiAlphanumeric(value.charCodeAt(index + 1)) &&
  73. asciiAlphanumeric(value.charCodeAt(index + 2))
  74. ) {
  75. skip = 2
  76. }
  77. // ASCII.
  78. else if (code < 128) {
  79. if (!/[!#$&-;=?-Z_a-z~]/.test(String.fromCharCode(code))) {
  80. replace = String.fromCharCode(code)
  81. }
  82. }
  83. // Astral.
  84. else if (code > 55_295 && code < 57_344) {
  85. const next = value.charCodeAt(index + 1)
  86. // A correct surrogate pair.
  87. if (code < 56_320 && next > 56_319 && next < 57_344) {
  88. replace = String.fromCharCode(code, next)
  89. skip = 1
  90. }
  91. // Lone surrogate.
  92. else {
  93. replace = '\uFFFD'
  94. }
  95. }
  96. // Unicode.
  97. else {
  98. replace = String.fromCharCode(code)
  99. }
  100. if (replace) {
  101. result.push(value.slice(start, index), encodeURIComponent(replace))
  102. start = index + skip + 1
  103. replace = ''
  104. }
  105. if (skip) {
  106. index += skip
  107. skip = 0
  108. }
  109. }
  110. return result.join('') + value.slice(start)
  111. }