rewrite-pattern.js 27 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904
  1. 'use strict';
  2. const generate = require('@babel/regjsgen').generate;
  3. const parse = require('regjsparser').parse;
  4. const regenerate = require('regenerate');
  5. const unicodeMatchProperty = require('unicode-match-property-ecmascript');
  6. const unicodeMatchPropertyValue = require('unicode-match-property-value-ecmascript');
  7. const iuMappings = require('./data/iu-mappings.js');
  8. const ESCAPE_SETS = require('./data/character-class-escape-sets.js');
  /**
   * Maps every element of `array` through `callback` and flattens array
   * results by exactly one level; non-array results are appended as-is.
   *
   * @param {Array} array - Items to map.
   * @param {Function} callback - Invoked with each item as its only argument.
   * @returns {Array} A new array holding the flattened results.
   */
  function flatMap(array, callback) {
    return array.reduce((flattened, item) => {
      const mapped = callback(item);
      if (Array.isArray(mapped)) {
        flattened.push(...mapped);
      } else {
        flattened.push(mapped);
      }
      return flattened;
    }, []);
  }
  // Matches every character that has a special meaning inside a pattern, so
  // that literal strings can be backslash-escaped before being spliced in.
  const SPECIAL_CHARS = /([\\^$.*+?()[\]{}|])/g;

  // Regenerate set holding every Unicode code point; used as the universe
  // when a character class has to be negated.
  const UNICODE_SET = regenerate().addRange(0x000000, 0x10FFFF);

  // Regenerate set holding only the astral (non-BMP) code points.
  const ASTRAL_SET = regenerate().addRange(0x010000, 0x10FFFF);

  // `LineTerminator`s (https://mths.be/es6#sec-line-terminators).
  const NEWLINE_SET = regenerate().add([
    0x000A, // Line Feed <LF>
    0x000D, // Carriage Return <CR>
    0x2028, // Line Separator <LS>
    0x2029, // Paragraph Separator <PS>
  ]);

  // Regenerate set holding every code point that `/./u` is supposed to match
  // (https://mths.be/es6#sec-atom): all of Unicode minus the line terminators.
  const DOT_SET_UNICODE = UNICODE_SET.clone().remove(NEWLINE_SET);
  /**
   * Looks up the precomputed set for a character class escape (the value of a
   * `characterClassEscape` node) in `ESCAPE_SETS`.
   *
   * @param {string} character - Key of the escape inside the chosen table.
   * @param {boolean} unicode - Use the Unicode tables instead of the regular one.
   * @param {boolean} ignoreCase - With `unicode`, use the ignore-case table.
   * @returns {*} Whatever the selected table stores for `character`.
   */
  const getCharacterClassEscapeSet = (character, unicode, ignoreCase) => {
    let tableName = 'REGULAR';
    if (unicode) {
      tableName = ignoreCase ? 'UNICODE_IGNORE_CASE' : 'UNICODE';
    }
    return ESCAPE_SETS[tableName].get(character);
  };
  /**
   * Picks the set that `.` matches in Unicode mode.
   *
   * @param {*} dotAll - Truthy when `.` must also match line terminators.
   * @returns {*} `UNICODE_SET` when `dotAll` is truthy, else `DOT_SET_UNICODE`.
   */
  const getUnicodeDotSet = (dotAll) => {
    if (dotAll) {
      return UNICODE_SET;
    }
    return DOT_SET_UNICODE;
  };
  /**
   * Loads the `regenerate-unicode-properties` module for a property/value
   * pair, or for a binary property when `value` is falsy.
   *
   * @param {string} property - Property name (or binary property name).
   * @param {string} [value] - Property value; omit for binary properties.
   * @returns {*} The exports of the loaded data module.
   * @throws {Error} When loading the module fails for any reason.
   */
  const getUnicodePropertyValueSet = (property, value) => {
    let modulePath;
    if (value) {
      modulePath = `${ property }/${ value }`;
    } else {
      modulePath = `Binary_Property/${ property }`;
    }
    try {
      return require(`regenerate-unicode-properties/${ modulePath }.js`);
    } catch (exception) {
      // Any load failure is reported as an unrecognized property/value pair.
      const message = `Failed to recognize value \`${ value }\` for property \`${ property }\`.`;
      throw new Error(message);
    }
  };
  /**
   * Resolves a `\p{…}` body that has no `=`: it is tried, in order, as a
   * `General_Category` value, as a property of strings, and finally as a
   * binary property of single code points.
   *
   * @param {string} value - The lone property name or value.
   * @returns {*} The data returned by `getUnicodePropertyValueSet`.
   * @throws Whatever the final binary-property lookup throws when all three
   *   interpretations fail.
   */
  const handleLoneUnicodePropertyNameOrValue = (value) => {
    const attempts = [
      // Note: `unicodeMatchPropertyValue` throws on invalid values.
      () => {
        const category = unicodeMatchPropertyValue('General_Category', value);
        return getUnicodePropertyValueSet('General_Category', category);
      },
      // It's not a `General_Category` value, so check if it's a property
      // of strings.
      () => getUnicodePropertyValueSet('Property_of_Strings', value),
    ];
    for (const attempt of attempts) {
      try {
        return attempt();
      } catch (exception) {
        // Deliberately ignored: fall through to the next interpretation.
      }
    }
    // Lastly, check if it's a binary property of single code points.
    // Note: `unicodeMatchProperty` throws on invalid properties.
    return getUnicodePropertyValueSet(unicodeMatchProperty(value));
  };
  /**
   * Resolves the body of a Unicode property escape (`Name` or `Name=Value`)
   * to the characters and strings it matches.
   *
   * @param {string} value - Text between the braces of `\p{…}` / `\P{…}`.
   * @param {boolean} isNegative - True for the negated form.
   * @returns {{characters: *, strings: Set<string>}} A fresh character set
   *   plus the pattern-escaped strings (empty when there are none).
   * @throws {Error} When asked to negate a property that has strings.
   */
  const getUnicodePropertyEscapeSet = (value, isNegative) => {
    const parts = value.split('=');
    let resolved;
    if (parts.length !== 1) {
      // The pattern consists of two parts, i.e. `Property=Value`.
      const canonicalProperty = unicodeMatchProperty(parts[0]);
      const canonicalValue = unicodeMatchPropertyValue(canonicalProperty, parts[1]);
      resolved = getUnicodePropertyValueSet(canonicalProperty, canonicalValue);
    } else {
      resolved = handleLoneUnicodePropertyNameOrValue(parts[0]);
    }

    if (!isNegative) {
      const characters = resolved.characters.clone();
      // We need to escape strings like *️⃣ to make sure that they can be
      // safely used in unions.
      const strings = resolved.strings
        ? new Set(resolved.strings.map((str) => str.replace(SPECIAL_CHARS, '\\$1')))
        : new Set();
      return { characters, strings };
    }

    if (resolved.strings) {
      throw new Error('Cannot negate Unicode property of strings');
    }
    return {
      characters: UNICODE_SET.clone().remove(resolved.characters),
      strings: new Set(),
    };
  };
  /**
   * Wraps the result of `getUnicodePropertyEscapeSet` in a character-class
   * data record as produced by `getCharacterClassEmptyData`.
   *
   * @param {string} property - Text between the braces of the escape.
   * @param {boolean} isNegative - True for the negated form.
   * @returns {object} Data record whose `singleChars` is the resolved set;
   *   `longStrings`/`maybeIncludesStrings` are set only when strings exist.
   */
  const getUnicodePropertyEscapeCharacterClassData = (property, isNegative) => {
    const { characters, strings } = getUnicodePropertyEscapeSet(property, isNegative);
    const data = getCharacterClassEmptyData();
    data.singleChars = characters;
    const hasStrings = strings.size > 0;
    if (hasStrings) {
      data.longStrings = strings;
      data.maybeIncludesStrings = true;
    }
    return data;
  };
  /**
   * Reports whether ASCII case folding is needed, i.e. whether the inline
   * `i` entry of `config.modifiersData` is currently truthy.
   *
   * @returns {boolean}
   */
  function configNeedCaseFoldAscii() {
    return Boolean(config.modifiersData.i);
  }
  /**
   * Reports whether Unicode case folding is needed. It never is when the
   * inline `i` modifier is explicitly `false` or when the unicode flag is not
   * being transformed; otherwise it follows the inline `i` modifier or the
   * `ignoreCase` flag.
   *
   * @returns {boolean}
   */
  function configNeedCaseFoldUnicode() {
    // config.modifiersData.i : undefined | false
    const inlineIgnoreCase = config.modifiersData.i;
    if (inlineIgnoreCase === false || !config.transform.unicodeFlag) {
      return false;
    }
    return Boolean(inlineIgnoreCase || config.flags.ignoreCase);
  }
  // Given a range of code points, add the case-folded counterparts of every
  // code point in that range to this set. The body runs at least once, so
  // `min` itself is always processed. Returns the set for chaining.
  regenerate.prototype.iuAddRange = function(min, max) {
    let codePoint = min;
    do {
      const foldedCodePoints = caseFold(
        codePoint,
        configNeedCaseFoldAscii(),
        configNeedCaseFoldUnicode()
      );
      if (foldedCodePoints) {
        this.add(foldedCodePoints);
      }
    } while (++codePoint <= max);
    return this;
  };
  // Given a range of code points, remove the case-folded counterparts of
  // every code point in that range from this set. The body runs at least
  // once, so `min` itself is always processed. Returns the set for chaining.
  regenerate.prototype.iuRemoveRange = function(min, max) {
    let codePoint = min;
    do {
      const foldedCodePoints = caseFold(
        codePoint,
        configNeedCaseFoldAscii(),
        configNeedCaseFoldUnicode()
      );
      if (foldedCodePoints) {
        this.remove(foldedCodePoints);
      }
    } while (++codePoint <= max);
    return this;
  };
  /**
   * Re-parses `pattern` and copies the resulting node's properties onto
   * `item`, mutating it in place. Parsed nodes other than character classes,
   * groups and single values are first wrapped in a non-capturing group.
   *
   * @param {object} item - AST node to overwrite.
   * @param {string} pattern - Replacement pattern source.
   * @returns {undefined}
   */
  const update = (item, pattern) => {
    const parserFlags = config.useUnicodeFlag ? 'u' : '';
    const parsed = parse(pattern, parserFlags, {
      lookbehind: true,
      namedGroups: true,
      unicodePropertyEscape: true,
      unicodeSet: true,
      modifiers: true,
    });
    // These node types can stand in for the original node without wrapping.
    const selfContainedTypes = ['characterClass', 'group', 'value'];
    const replacement = selfContainedTypes.includes(parsed.type)
      ? parsed
      : wrap(parsed, pattern);
    Object.assign(item, replacement);
  };
  /**
   * Builds a non-capturing group node around `tree`.
   *
   * @param {object} tree - Node that becomes the group's only child.
   * @param {string} pattern - Source text of `tree`, used for the group's `raw`.
   * @returns {{type: string, behavior: string, body: object[], raw: string}}
   */
  const wrap = (tree, pattern) => {
    const group = {
      type: 'group',
      behavior: 'ignore',
      body: [tree],
      raw: `(?:${ pattern })`,
    };
    return group;
  };
  /**
   * Computes the code points that `codePoint` case-folds to.
   *
   * @param {number} codePoint - Code point to fold.
   * @param {boolean} includeAscii - Add the opposite-case ASCII letter when
   *   `codePoint` is an ASCII letter (A–Z or a–z).
   * @param {boolean} includeUnicode - Add the entries stored for `codePoint`
   *   in `iuMappings`.
   * @returns {number[]|false} A fresh array of folded code points, or `false`
   *   when there are none.
   */
  const caseFold = (codePoint, includeAscii, includeUnicode) => {
    const mapped = includeUnicode ? iuMappings.get(codePoint) : undefined;
    // `iuMappings` stores either a single number or an array. Always copy
    // into a new array so the `push` calls below can never mutate the shared
    // mapping data (which would make repeated calls accumulate entries).
    const folded = mapped ? [].concat(mapped) : [];
    if (includeAscii) {
      if (codePoint >= 0x41 && codePoint <= 0x5A) {
        // Uppercase ASCII letter: add its lowercase counterpart.
        folded.push(codePoint + 0x20);
      } else if (codePoint >= 0x61 && codePoint <= 0x7A) {
        // Lowercase ASCII letter: add its uppercase counterpart.
        folded.push(codePoint - 0x20);
      }
    }
    return folded.length === 0 ? false : folded;
  };
  /**
   * Builds the set of callbacks used to fold class members into a
   * character-class data record for one set operation.
   *
   * Every returned handler object has the same five methods, each taking the
   * data record being built as first argument:
   *   - `single(data, cp)`         one code point (or array of code points)
   *   - `regSet(data, set)`        a whole regenerate set
   *   - `range(data, start, end)`  an inclusive code point range
   *   - `iuRange(data, start, end)` the case-folded counterparts of a range
   *   - `nested(data, nestedData)` another data record (nested class, etc.)
   *
   * The `intersection` and `subtraction` handlers consult `data.first` to
   * treat the first operand as the starting value instead of combining it.
   *
   * @param {string} action - 'union', 'union-negative', 'intersection' or
   *   'subtraction'.
   * @returns {object} The handler object described above.
   * @throws {Error} For any other `action`.
   */
  const buildHandler = (action) => {
    switch (action) {
      case 'union':
        return {
          single: (data, cp) => {
            data.singleChars.add(cp);
          },
          regSet: (data, set2) => {
            data.singleChars.add(set2);
          },
          range: (data, start, end) => {
            data.singleChars.addRange(start, end);
          },
          iuRange: (data, start, end) => {
            data.singleChars.iuAddRange(start, end);
          },
          nested: (data, nestedData) => {
            data.singleChars.add(nestedData.singleChars);
            for (const str of nestedData.longStrings) data.longStrings.add(str);
            if (nestedData.maybeIncludesStrings) data.maybeIncludesStrings = true;
          }
        };
      case 'union-negative': {
        // Adds the complement of `set2` to what has been collected so far.
        const regSet = (data, set2) => {
          data.singleChars = UNICODE_SET.clone().remove(set2).add(data.singleChars);
        };
        return {
          single: (data, cp) => {
            const unicode = UNICODE_SET.clone();
            data.singleChars = data.singleChars.contains(cp) ? unicode : unicode.remove(cp);
          },
          regSet: regSet,
          range: (data, start, end) => {
            data.singleChars = UNICODE_SET.clone().removeRange(start, end).add(data.singleChars);
          },
          iuRange: (data, start, end) => {
            data.singleChars = UNICODE_SET.clone().iuRemoveRange(start, end).add(data.singleChars);
          },
          nested: (data, nestedData) => {
            regSet(data, nestedData.singleChars);
            // Negating a record that may contain strings is not expected here.
            if (nestedData.maybeIncludesStrings) throw new Error('ASSERTION ERROR');
          }
        };
      }
      case 'intersection': {
        const regSet = (data, set2) => {
          if (data.first) data.singleChars = set2;
          else data.singleChars.intersection(set2);
        };
        return {
          // Operands that are single characters/sets/ranges carry no strings,
          // so the handlers below always drop the collected strings.
          single: (data, cp) => {
            data.singleChars = data.first || data.singleChars.contains(cp) ? regenerate(cp) : regenerate();
            data.longStrings.clear();
            data.maybeIncludesStrings = false;
          },
          regSet: (data, set) => {
            regSet(data, set);
            data.longStrings.clear();
            data.maybeIncludesStrings = false;
          },
          range: (data, start, end) => {
            if (data.first) data.singleChars.addRange(start, end);
            else data.singleChars.intersection(regenerate().addRange(start, end));
            data.longStrings.clear();
            data.maybeIncludesStrings = false;
          },
          iuRange: (data, start, end) => {
            if (data.first) data.singleChars.iuAddRange(start, end);
            else data.singleChars.intersection(regenerate().iuAddRange(start, end));
            data.longStrings.clear();
            data.maybeIncludesStrings = false;
          },
          nested: (data, nestedData) => {
            regSet(data, nestedData.singleChars);
            if (data.first) {
              data.longStrings = nestedData.longStrings;
              data.maybeIncludesStrings = nestedData.maybeIncludesStrings;
            } else {
              // Keep only the strings present in both operands.
              for (const str of data.longStrings) {
                if (!nestedData.longStrings.has(str)) data.longStrings.delete(str);
              }
              if (!nestedData.maybeIncludesStrings) data.maybeIncludesStrings = false;
            }
          }
        };
      }
      case 'subtraction': {
        const regSet = (data, set2) => {
          if (data.first) data.singleChars.add(set2);
          else data.singleChars.remove(set2);
        };
        return {
          single: (data, cp) => {
            if (data.first) data.singleChars.add(cp);
            else data.singleChars.remove(cp);
          },
          regSet: regSet,
          range: (data, start, end) => {
            if (data.first) data.singleChars.addRange(start, end);
            else data.singleChars.removeRange(start, end);
          },
          iuRange: (data, start, end) => {
            if (data.first) data.singleChars.iuAddRange(start, end);
            else data.singleChars.iuRemoveRange(start, end);
          },
          nested: (data, nestedData) => {
            regSet(data, nestedData.singleChars);
            if (data.first) {
              data.longStrings = nestedData.longStrings;
              data.maybeIncludesStrings = nestedData.maybeIncludesStrings;
            } else {
              // Drop the strings that also occur in the subtracted operand.
              for (const str of data.longStrings) {
                if (nestedData.longStrings.has(str)) data.longStrings.delete(str);
              }
            }
          }
        };
      }
      // The `default` clause is only here as a safeguard; it should never be
      // reached. Code coverage tools should ignore it.
      /* istanbul ignore next */
      default:
        // Report the offending `action`; no character class item is in scope here.
        throw new Error(`Unknown set action: ${ action }`);
    }
  };
  /**
   * Creates a fresh, empty character-class data record.
   *
   * @returns {object} Record with an empty regenerate set in `singleChars`,
   *   an empty `longStrings` Set, `first` set to true, and `transformed`
   *   initialised from `config.transform.unicodeFlag`.
   */
  const getCharacterClassEmptyData = () => {
    return {
      transformed: config.transform.unicodeFlag,
      singleChars: regenerate(),
      longStrings: new Set(),
      hasEmptyString: false,
      first: true,
      maybeIncludesStrings: false,
    };
  };
  /**
   * Returns `codePoint` together with its case-folded counterparts when the
   * current configuration asks for ASCII or Unicode case folding.
   *
   * @param {number} codePoint - Code point to look at.
   * @returns {Array} `[codePoint]`, or `[codePoint, folded]` where `folded`
   *   is the (truthy) result of `caseFold`.
   */
  const maybeFold = (codePoint) => {
    const foldAscii = configNeedCaseFoldAscii();
    const foldUnicode = configNeedCaseFoldUnicode();
    // `caseFold` is only consulted when at least one kind of folding is on.
    const folded = (foldAscii || foldUnicode) && caseFold(codePoint, foldAscii, foldUnicode);
    return folded ? [codePoint, folded] : [codePoint];
  };
  /**
   * Converts a `classStrings` node into a character-class data record.
   * One-character strings go into `singleChars` (with their folded
   * counterparts); every other string is turned into pattern source and
   * stored in `longStrings`.
   *
   * @param {object} classStrings - Node whose `strings` each hold a
   *   `characters` array of nodes with a `codePoint`.
   * @param {object} regenerateOptions - Options passed to `toString` on the
   *   per-character sets built when case folding is active.
   * @returns {object} The populated data record.
   */
  const computeClassStrings = (classStrings, regenerateOptions) => {
    const data = getCharacterClassEmptyData();
    const foldAscii = configNeedCaseFoldAscii();
    const foldUnicode = configNeedCaseFoldUnicode();
    const foldingActive = foldUnicode || foldAscii;

    for (const { characters } of classStrings.strings) {
      if (characters.length === 1) {
        for (const cp of maybeFold(characters[0].codePoint)) {
          data.singleChars.add(cp);
        }
        continue;
      }

      let source;
      if (foldingActive) {
        // Emit one small set per character so each position matches all of
        // its case variants.
        source = characters.reduce((soFar, ch) => {
          const variants = regenerate(ch.codePoint);
          const folded = maybeFold(ch.codePoint);
          if (folded) variants.add(folded);
          return soFar + variants.toString(regenerateOptions);
        }, '');
      } else {
        source = characters.map((ch) => generate(ch)).join('');
      }
      data.longStrings.add(source);
      data.maybeIncludesStrings = true;
    }
    return data;
  };
  /**
   * Evaluates a `characterClass` node (recursively for nested classes) into
   * a character-class data record.
   *
   * The node's `kind` selects a pair of handlers from `buildHandler`: one
   * for ordinary members and one used for nested classes that are negated.
   * `data.transformed` is raised whenever a member needs rewriting.
   *
   * @param {object} characterClassItem - Node with `kind`, `body`, `negative`.
   * @param {object} regenerateOptions - Forwarded to nested computations.
   * @returns {object} The populated data record.
   * @throws {Error} On an unknown `kind` or an unknown member type.
   * @throws {SyntaxError} When a negated class may contain strings.
   */
  const computeCharacterClass = (characterClassItem, regenerateOptions) => {
    const data = getCharacterClassEmptyData();

    let handlePositive;
    let handleNegative;
    switch (characterClassItem.kind) {
      case 'union':
        handlePositive = buildHandler('union');
        handleNegative = buildHandler('union-negative');
        break;
      case 'intersection':
        handlePositive = buildHandler('intersection');
        handleNegative = buildHandler('subtraction');
        if (config.transform.unicodeSetsFlag) data.transformed = true;
        break;
      case 'subtraction':
        handlePositive = buildHandler('subtraction');
        handleNegative = buildHandler('intersection');
        if (config.transform.unicodeSetsFlag) data.transformed = true;
        break;
      // The `default` clause is only here as a safeguard; it should never be
      // reached. Code coverage tools should ignore it.
      /* istanbul ignore next */
      default:
        throw new Error(`Unknown character class kind: ${ characterClassItem.kind }`);
    }

    const foldAscii = configNeedCaseFoldAscii();
    const foldUnicode = configNeedCaseFoldUnicode();

    for (const member of characterClassItem.body) {
      switch (member.type) {
        case 'value': {
          for (const cp of maybeFold(member.codePoint)) {
            handlePositive.single(data, cp);
          }
          break;
        }
        case 'characterClassRange': {
          const rangeStart = member.min.codePoint;
          const rangeEnd = member.max.codePoint;
          handlePositive.range(data, rangeStart, rangeEnd);
          if (foldAscii || foldUnicode) {
            handlePositive.iuRange(data, rangeStart, rangeEnd);
            data.transformed = true;
          }
          break;
        }
        case 'characterClassEscape': {
          const escapeSet = getCharacterClassEscapeSet(
            member.value,
            config.flags.unicode,
            config.flags.ignoreCase
          );
          handlePositive.regSet(data, escapeSet);
          break;
        }
        case 'unicodePropertyEscape': {
          const propertyData = getUnicodePropertyEscapeCharacterClassData(member.value, member.negative);
          handlePositive.nested(data, propertyData);
          data.transformed =
            data.transformed ||
            config.transform.unicodePropertyEscapes ||
            (config.transform.unicodeSetsFlag && propertyData.maybeIncludesStrings);
          break;
        }
        case 'characterClass': {
          // A negated nested class is applied through the complementary handler.
          const nestedHandler = member.negative ? handleNegative : handlePositive;
          const nestedData = computeCharacterClass(member, regenerateOptions);
          nestedHandler.nested(data, nestedData);
          data.transformed = true;
          break;
        }
        case 'classStrings': {
          handlePositive.nested(data, computeClassStrings(member, regenerateOptions));
          data.transformed = true;
          break;
        }
        // The `default` clause is only here as a safeguard; it should never be
        // reached. Code coverage tools should ignore it.
        /* istanbul ignore next */
        default:
          throw new Error(`Unknown term type: ${ member.type }`);
      }
      // From the second member on, handlers combine instead of initialise.
      data.first = false;
    }

    if (characterClassItem.negative && data.maybeIncludesStrings) {
      throw new SyntaxError('Cannot negate set containing strings');
    }
    return data;
  };
  /**
   * Rewrites a character class node in place when its computed data says a
   * transformation is needed, and returns the node.
   *
   * @param {object} characterClassItem - Node to rewrite; `negative` is read.
   * @param {object} regenerateOptions - Options for `toString` on the sets.
   * @param {object} [computed] - Precomputed data record; defaults to
   *   `computeCharacterClass(characterClassItem, regenerateOptions)`.
   * @returns {object} `characterClassItem`.
   */
  const processCharacterClass = (
    characterClassItem,
    regenerateOptions,
    computed = computeCharacterClass(characterClassItem, regenerateOptions)
  ) => {
    const isNegated = characterClassItem.negative;
    const { singleChars, transformed, longStrings } = computed;
    if (!transformed) {
      return characterClassItem;
    }

    const setStr = singleChars.toString(regenerateOptions);

    if (!isNegated) {
      // Longest strings first; the empty string (if any) must stay last, so
      // the character set is inserted right before it.
      const includesEmpty = longStrings.has('');
      const alternatives = [...longStrings].sort((left, right) => right.length - left.length);
      if (setStr !== '[]' || longStrings.size === 0) {
        const insertAt = alternatives.length - (includesEmpty ? 1 : 0);
        alternatives.splice(insertAt, 0, setStr);
      }
      update(characterClassItem, alternatives.join('|'));
      return characterClassItem;
    }

    if (config.useUnicodeFlag) {
      const inner = setStr[0] === '[' ? setStr.slice(1, -1) : setStr;
      update(characterClassItem, `[^${inner}]`);
      return characterClassItem;
    }

    if (!config.flags.unicode) {
      update(characterClassItem, `(?!${setStr})[\\s\\S]`);
      return characterClassItem;
    }

    if (!config.flags.ignoreCase) {
      // Generate negative set directly when case folding is not involved.
      update(
        characterClassItem,
        UNICODE_SET.clone().remove(singleChars).toString(regenerateOptions)
      );
      return characterClassItem;
    }

    const astralChars = singleChars.clone().intersection(ASTRAL_SET);
    // Assumption: singleChars do not contain lone surrogates.
    // Regex like /[^\ud800]/u is not supported
    const surrogateOrBmpStr = singleChars
      .clone()
      .remove(astralChars)
      .addRange(0xd800, 0xdfff)
      .toString({ bmpOnly: true });
    // Don't generate negative lookahead for astral characters
    // because the case folding is not working anyway as we break
    // code points into surrogate pairs.
    const astralNegativeStr = ASTRAL_SET
      .clone()
      .remove(astralChars)
      .toString(regenerateOptions);
    // The transform here does not support lone surrogates.
    update(
      characterClassItem,
      `(?!${surrogateOrBmpStr})[\\s\\S]|${astralNegativeStr}`
    );
    return characterClassItem;
  };
  /**
   * Fails when named references were recorded that never found a matching
   * group.
   *
   * @param {object} groups - Bookkeeping object; the own enumerable keys of
   *   `groups.unmatchedReferences` are the offending names.
   * @throws {Error} Listing the unknown group names, if any.
   */
  const assertNoUnmatchedReferences = (groups) => {
    const unknownNames = Object.keys(groups.unmatchedReferences);
    if (unknownNames.length === 0) {
      return;
    }
    throw new Error(`Unknown group names: ${unknownNames}`);
  };
  /**
   * Processes a group that carries inline modifiers: the group is turned
   * into a plain non-capturing group and its body is processed while the
   * modifiers are applied to `config.modifiersData`. The previous modifier
   * state is put back afterwards.
   *
   * @param {object} item - Group node with `modifierFlags` and `body`.
   * @param {object} regenerateOptions - Forwarded to `processTerm`.
   * @param {object} groups - Forwarded to `processTerm`.
   * @returns {object} The mutated `item`.
   */
  const processModifiers = (item, regenerateOptions, groups) => {
    const { enabling, disabling } = item.modifierFlags;

    delete item.modifierFlags;
    item.behavior = 'ignore';

    // Snapshot taken before the flags of this group are applied.
    const previousModifiers = Object.assign({}, config.modifiersData);

    for (const flag of enabling.split('')) {
      config.modifiersData[flag] = true;
    }
    for (const flag of disabling.split('')) {
      config.modifiersData[flag] = false;
    }

    item.body = item.body.map((term) => processTerm(term, regenerateOptions, groups));

    config.modifiersData = previousModifiers;
    return item;
  };
  /**
   * Recursively rewrites one AST node according to the active configuration
   * and returns the node that should take its place (usually the same,
   * mutated, node).
   *
   * @param {object} item - Node to process; dispatch is on `item.type`.
   * @param {object} regenerateOptions - Options for `toString` on sets.
   * @param {object} groups - Group bookkeeping (`lastIndex`, `names`,
   *   `namesConflicts`, `unmatchedReferences`, optional `onNamedGroup`);
   *   mutated while walking.
   * @returns {object} The processed node.
   * @throws {Error} On duplicate group names in the same context, on
   *   properties of strings without the `v` flag, or on unknown node types.
   */
  const processTerm = (item, regenerateOptions, groups) => {
    // Recursion step shared by every node type that has children.
    const processChild = (term) => processTerm(term, regenerateOptions, groups);

    switch (item.type) {
      case 'dot': {
        if (config.transform.unicodeFlag) {
          const dotSet = getUnicodeDotSet(config.flags.dotAll || config.modifiersData.s);
          update(item, dotSet.toString(regenerateOptions));
        } else if (config.transform.dotAllFlag || config.modifiersData.s) {
          // TODO: consider changing this at the regenerate level.
          update(item, '[\\s\\S]');
        }
        break;
      }
      case 'characterClass': {
        item = processCharacterClass(item, regenerateOptions);
        break;
      }
      case 'unicodePropertyEscape': {
        const classData = getUnicodePropertyEscapeCharacterClassData(item.value, item.negative);
        if (!classData.maybeIncludesStrings) {
          if (config.transform.unicodePropertyEscapes) {
            update(item, classData.singleChars.toString(regenerateOptions));
          }
          break;
        }
        if (!config.flags.unicodeSets) {
          throw new Error(
            'Properties of strings are only supported when using the unicodeSets (v) flag.'
          );
        }
        if (config.transform.unicodeSetsFlag) {
          classData.transformed = true;
          item = processCharacterClass(item, regenerateOptions, classData);
        }
        break;
      }
      case 'characterClassEscape': {
        if (config.transform.unicodeFlag) {
          const escapeSet = getCharacterClassEscapeSet(
            item.value,
            /* config.transform.unicodeFlag implies config.flags.unicode */ true,
            config.flags.ignoreCase
          );
          update(item, escapeSet.toString(regenerateOptions));
        }
        break;
      }
      case 'group': {
        if (item.behavior === 'normal') {
          groups.lastIndex++;
        }
        if (item.name) {
          const groupName = item.name.value;

          if (groups.namesConflicts[groupName]) {
            throw new Error(
              `Group '${ groupName }' has already been defined in this context.`
            );
          }
          groups.namesConflicts[groupName] = true;

          if (config.transform.namedGroups) {
            delete item.name;
          }

          const groupIndex = groups.lastIndex;
          if (!groups.names[groupName]) {
            groups.names[groupName] = [];
          }
          groups.names[groupName].push(groupIndex);

          if (groups.onNamedGroup) {
            groups.onNamedGroup.call(null, groupName, groupIndex);
          }

          if (groups.unmatchedReferences[groupName]) {
            delete groups.unmatchedReferences[groupName];
          }
        }
        if (item.modifierFlags && config.transform.modifiers) {
          return processModifiers(item, regenerateOptions, groups);
        }
      }
      /* falls through */
      case 'quantifier': {
        item.body = item.body.map(processChild);
        break;
      }
      case 'disjunction': {
        // Each alternative gets its own name-conflict scope layered on top
        // of the enclosing one.
        const enclosingConflicts = groups.namesConflicts;
        item.body = item.body.map((term) => {
          groups.namesConflicts = Object.create(enclosingConflicts);
          return processTerm(term, regenerateOptions, groups);
        });
        break;
      }
      case 'alternative': {
        item.body = flatMap(item.body, (term) => {
          const processed = processTerm(term, regenerateOptions, groups);
          // Alternatives cannot contain alternatives; flatten them.
          if (processed.type === 'alternative') {
            return processed.body;
          }
          return processed;
        });
        break;
      }
      case 'value': {
        const variants = regenerate(item.codePoint).add(maybeFold(item.codePoint));
        update(item, variants.toString(regenerateOptions));
        break;
      }
      case 'reference': {
        if (item.name) {
          const referencedName = item.name.value;
          const groupIndexes = groups.names[referencedName];
          if (!groupIndexes) {
            groups.unmatchedReferences[referencedName] = true;
          }

          if (config.transform.namedGroups) {
            if (!groupIndexes) {
              // This named reference comes before the group where it's
              // defined, so it's always an empty match.
              return {
                'type': 'group',
                'behavior': 'ignore',
                'body': [],
                'raw': '(?:)',
              };
            }
            const numbered = groupIndexes.map((groupIndex) => ({
              'type': 'reference',
              'matchIndex': groupIndex,
              'raw': '\\' + groupIndex,
            }));
            if (numbered.length === 1) {
              return numbered[0];
            }
            return {
              'type': 'alternative',
              'body': numbered,
              'raw': numbered.map((term) => term.raw).join(''),
            };
          }
        }
        break;
      }
      case 'anchor': {
        if (config.modifiersData.m) {
          if (item.kind === 'start') {
            update(item, `(?:^|(?<=${NEWLINE_SET.toString()}))`);
          } else if (item.kind === 'end') {
            update(item, `(?:$|(?=${NEWLINE_SET.toString()}))`);
          }
        }
        break;
      }
      case 'empty':
        // Nothing to do here.
        break;
      // The `default` clause is only here as a safeguard; it should never be
      // reached. Code coverage tools should ignore it.
      /* istanbul ignore next */
      default:
        throw new Error(`Unknown term type: ${ item.type }`);
    }
    return item;
  };
  // Module-level state shared by all helpers in this file.
  const config = {
    // Which flags are present (boolean each).
    flags: {
      ignoreCase: false,
      unicode: false,
      unicodeSets: false,
      dotAll: false,
      multiline: false,
    },
    // Which features have to be transformed away.
    transform: {
      dotAllFlag: false,
      unicodeFlag: false,
      unicodeSetsFlag: false,
      unicodePropertyEscapes: false,
      namedGroups: false,
      modifiers: false,
    },
    // State of the `i`, `s` and `m` modifiers.
    modifiersData: {
      i: undefined,
      s: undefined,
      m: undefined,
    },
    // Truthy when the `u` or `v` flag is set and the unicode flag is not
    // being transformed.
    get useUnicodeFlag() {
      const hasUnicodeSemantics = this.flags.unicode || this.flags.unicodeSets;
      return hasUnicodeSemantics && !this.transform.unicodeFlag;
    },
  };
  /**
   * Validates the user-supplied options object, one own enumerable key at a
   * time. A falsy `options` is accepted without further checks.
   *
   * @param {object} [options] - Options as passed to `rewritePattern`.
   * @throws {Error} On an unknown key or a value outside the allowed set.
   */
  const validateOptions = (options) => {
    if (!options) {
      return;
    }

    for (const key of Object.keys(options)) {
      const value = options[key];
      // `null`, `undefined` and `false` all mean "not enabled".
      const isDisabled = value == null || value === false;

      switch (key) {
        case 'dotAllFlag':
        case 'unicodeFlag':
        case 'unicodePropertyEscapes':
        case 'namedGroups': {
          const accepted = isDisabled || value === 'transform';
          if (!accepted) {
            throw new Error(`.${key} must be false (default) or 'transform'.`);
          }
          break;
        }
        case 'modifiers':
        case 'unicodeSetsFlag': {
          const accepted = isDisabled || value === 'parse' || value === 'transform';
          if (!accepted) {
            throw new Error(`.${key} must be false (default), 'parse' or 'transform'.`);
          }
          break;
        }
        case 'onNamedGroup':
        case 'onNewFlags': {
          const accepted = value == null || typeof value === 'function';
          if (!accepted) {
            throw new Error(`.${key} must be a function.`);
          }
          break;
        }
        default:
          throw new Error(`.${key} is not a valid regexpu-core option.`);
      }
    }
  };
  // Reports whether `flags` contains `flag`; a falsy `flags` never does.
  const hasFlag = (flags, flag) => {
    if (!flags) {
      return false;
    }
    return flags.includes(flag);
  };
  // Reports whether option `name` is set to exactly 'transform'; a falsy
  // `options` object means nothing is transformed.
  const transform = (options, name) => {
    if (!options) {
      return false;
    }
    return options[name] === 'transform';
  };
  /**
   * Rewrites a regular expression pattern so that the features selected for
   * transformation in `options` are no longer needed.
   *
   * The call first resets the module-level `config` from `flags`/`options`,
   * parses the pattern, walks the tree with `processTerm`, verifies that
   * every named reference found its group, optionally reports the resulting
   * flags through `options.onNewFlags`, and finally regenerates the source.
   *
   * @param {string} pattern - Pattern source (without delimiters).
   * @param {string} [flags] - Flags of the regular expression.
   * @param {object} [options] - See `validateOptions` for the accepted keys.
   * @returns {string} The rewritten pattern source.
   * @throws {Error} On invalid options, or whatever parsing/processing throws.
   */
  const rewritePattern = (pattern, flags, options) => {
    validateOptions(options);

    config.flags.unicode = hasFlag(flags, 'u');
    config.flags.unicodeSets = hasFlag(flags, 'v');
    config.flags.ignoreCase = hasFlag(flags, 'i');
    config.flags.dotAll = hasFlag(flags, 's');
    config.flags.multiline = hasFlag(flags, 'm');

    config.transform.dotAllFlag = config.flags.dotAll && transform(options, 'dotAllFlag');
    config.transform.unicodeFlag = (config.flags.unicode || config.flags.unicodeSets) && transform(options, 'unicodeFlag');
    config.transform.unicodeSetsFlag = config.flags.unicodeSets && transform(options, 'unicodeSetsFlag');
    // unicodeFlag: 'transform' implies unicodePropertyEscapes: 'transform'
    config.transform.unicodePropertyEscapes = config.flags.unicode && (
      transform(options, 'unicodeFlag') || transform(options, 'unicodePropertyEscapes')
    );
    config.transform.namedGroups = transform(options, 'namedGroups');
    config.transform.modifiers = transform(options, 'modifiers');

    config.modifiersData.i = undefined;
    config.modifiersData.s = undefined;
    config.modifiersData.m = undefined;

    const regjsparserFeatures = {
      'unicodeSet': Boolean(options && options.unicodeSetsFlag),
      'modifiers': Boolean(options && options.modifiers),
      // Enable every stable RegExp feature by default
      'unicodePropertyEscape': true,
      'namedGroups': true,
      'lookbehind': true,
    };

    const regenerateOptions = {
      'hasUnicodeFlag': config.useUnicodeFlag,
      'bmpOnly': !config.flags.unicode
    };

    const groups = {
      'onNamedGroup': options && options.onNamedGroup,
      'lastIndex': 0,
      'names': Object.create(null), // { [name]: Array<index> }
      'namesConflicts': Object.create(null), // { [name]: true }
      'unmatchedReferences': Object.create(null) // { [name]: true }
    };

    const tree = parse(pattern, flags, regjsparserFeatures);

    if (config.transform.modifiers) {
      if (/\(\?[a-z]*-[a-z]+:/.test(pattern)) {
        // the pattern _likely_ contain inline disabled modifiers
        // we need to traverse to make sure that they are actually modifiers and to collect them
        const allDisabledModifiers = Object.create(null);
        const itemStack = [tree];
        let node;
        // Iterative walk over every object/array reachable from the tree.
        while (node = itemStack.pop(), node != undefined) {
          if (Array.isArray(node)) {
            Array.prototype.push.apply(itemStack, node);
          } else if (typeof node == 'object' && node != null) {
            for (const key of Object.keys(node)) {
              const value = node[key];
              if (key == 'modifierFlags') {
                if (value.disabling.length > 0) {
                  value.disabling.split('').forEach((flag) => {
                    allDisabledModifiers[flag] = true;
                  });
                }
              } else if (typeof value == 'object' && value != null) {
                itemStack.push(value);
              }
            }
          }
        }
        // Every modifier that is disabled somewhere is marked as active
        // before processing; such flags are also left out of the new flags
        // reported below.
        for (const flag of Object.keys(allDisabledModifiers)) {
          config.modifiersData[flag] = true;
        }
      }
    }

    // Note: `processTerm` mutates `tree` and `groups`.
    processTerm(tree, regenerateOptions, groups);

    assertNoUnmatchedReferences(groups);

    const onNewFlags = options && options.onNewFlags;
    if (onNewFlags) {
      // `flags` is optional everywhere else (see `hasFlag`), so treat a
      // missing value as "no flags" instead of throwing on `.split`.
      let newFlags = (flags || '').split('').filter((flag) => !config.modifiersData[flag]).join('');
      if (config.transform.unicodeSetsFlag) {
        newFlags = newFlags.replace('v', 'u');
      }
      if (config.transform.unicodeFlag) {
        newFlags = newFlags.replace('u', '');
      }
      // `config.transform.dotAllFlag` is a boolean computed above, not the
      // raw option string, so test it for truthiness.
      if (config.transform.dotAllFlag) {
        newFlags = newFlags.replace('s', '');
      }
      onNewFlags(newFlags);
    }

    return generate(tree);
  };
  827. module.exports = rewritePattern;