isValidCandidate.js 3.0 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586
  1. // Copy-pasted from `PhoneNumberMatcher.js`.
  2. import { PLUS_CHARS } from '../constants.js'
  3. import { limit } from './util.js'
  4. import {
  5. isLatinLetter,
  6. isInvalidPunctuationSymbol
  7. } from './utf-8.js'
  // Bracket characters — ASCII round/square plus their full-width forms —
  // written so they can be dropped straight into a regex character class
  // (the square brackets are backslash-escaped for that reason).
  const OPENING_PARENS = '(\\[\uFF08\uFF3B'
  const CLOSING_PARENS = ')\\]\uFF09\uFF3D'

  // Character class matching any single character that is not one of the brackets above.
  const NON_PARENS = '[^' + OPENING_PARENS + CLOSING_PARENS + ']'

  // Character class of the punctuation that may open a phone number:
  // an opening bracket or a plus sign.
  export const LEAD_CLASS = '[' + OPENING_PARENS + PLUS_CHARS + ']'

  // Start-anchored form of `LEAD_CLASS`: tests whether a string begins with such punctuation.
  const LEAD_CLASS_LEADING = new RegExp(`^${LEAD_CLASS}`)

  // Upper bound on the number of bracket pairs inside a phone number.
  // NOTE(review): `limit` is imported from './util.js' and is not visible here;
  // presumably it yields a `{0,3}`-style regex quantifier — confirm.
  const BRACKET_PAIR_LIMIT = limit(0, 3)
  /**
   * Whole-string pattern verifying that the brackets in a candidate are balanced
   * and non-empty. A candidate with no brackets at all also matches.
   *
   * A leading opening bracket is allowed to stay unclosed, and a closing bracket
   * may appear first (its opening partner could have been cut off before the
   * candidate). Every later opening bracket must be closed, with at least one
   * non-bracket character inside. Counting the optional leading group, at most
   * four bracket sets are accepted.
   */
  const MATCHING_BRACKETS_ENTIRE = new RegExp([
    '^',
    // Optional lone opening bracket at the very start.
    `(?:[${OPENING_PARENS}])?`,
    // Optional run of non-bracket characters terminated by a closing bracket.
    `(?:${NON_PARENS}+[${CLOSING_PARENS}])?`,
    // At least one non-bracket character must follow.
    `${NON_PARENS}+`,
    // Properly closed, non-empty bracket pairs, repeated up to the pair limit.
    `(?:[${OPENING_PARENS}]${NON_PARENS}+[${CLOSING_PARENS}])${BRACKET_PAIR_LIMIT}`,
    // Any trailing non-bracket characters.
    `${NON_PARENS}*`,
    '$'
  ].join(''))
  /**
   * Detects text shaped like a page range followed by a parenthesised year,
   * as found in bibliographic citations. Example:
   * <pre>Computing Complete Answers to Queries in the Presence of Limited Access Patterns.
   * Chen Li. VLDB J. 12(3): 211-227 (2003).</pre>
   *
   * Here "211-227 (2003)" must not be mistaken for a telephone number.
   *
   * Shape: 1-5 digits, one or more hyphens, 1-5 digits, up to four whitespace
   * characters, an opening round bracket, then 1-4 digits.
   */
  const PUB_PAGES = new RegExp(String.raw`\d{1,5}-+\d{1,5}\s{0,4}\(\d{1,4}`)
  /**
   * Decides whether a substring located in `text` is formatted plausibly enough
   * to be treated as a phone number candidate.
   *
   * A candidate is rejected when its brackets do not match up, when it looks
   * like a publication page range, or — for any leniency other than
   * `'POSSIBLE'` — when the character right before or right after it is a
   * Latin letter or an invalid punctuation symbol (e.g. `abc8005001234` or
   * `8005001234def`).
   *
   * @param {string} candidate - The substring being considered.
   * @param {number} offset - Index in `text` at which `candidate` starts.
   * @param {string} text - The full text the candidate was found in.
   * @param {string} leniency - Leniency level; `'POSSIBLE'` skips the
   *   surrounding-character checks.
   * @returns {boolean} `true` when the candidate passes every check, `false` otherwise.
   */
  export default function isValidCandidate(candidate, offset, text, leniency)
  {
    // Reject candidates whose formatting shows they are not phone numbers.
    // An explicit `false` is returned (rather than a bare `return`, which
    // yields `undefined`) so that every rejection path produces the same value.
    if (!MATCHING_BRACKETS_ENTIRE.test(candidate) || PUB_PAGES.test(candidate)) {
      return false
    }

    // With `'POSSIBLE'` leniency the neighbouring characters are not inspected.
    if (leniency === 'POSSIBLE') {
      return true
    }

    // If the candidate is not at the start of the text and does not itself
    // begin with phone-number punctuation, inspect the preceding character.
    if (offset > 0 && !LEAD_CLASS_LEADING.test(candidate)) {
      const previousChar = text[offset - 1]
      // A Latin letter or invalid punctuation symbol right before the
      // candidate disqualifies it.
      if (isInvalidPunctuationSymbol(previousChar) || isLatinLetter(previousChar)) {
        return false
      }
    }

    // Index of the first character after the candidate; only inspect it
    // when the candidate does not run to the end of the text.
    const nextCharIndex = offset + candidate.length
    if (nextCharIndex < text.length) {
      const nextChar = text[nextCharIndex]
      if (isInvalidPunctuationSymbol(nextChar) || isLatinLetter(nextChar)) {
        return false
      }
    }

    return true
  }