PhoneNumberMatcher.js 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396
  1. /**
  2. * A port of Google's `PhoneNumberMatcher.java`.
  3. * https://github.com/googlei18n/libphonenumber/blob/master/java/libphonenumber/src/com/google/i18n/phonenumbers/PhoneNumberMatcher.java
  4. * Date: 08.03.2018.
  5. */
  6. import PhoneNumber from './PhoneNumber.js'
  7. import {
  8. MAX_LENGTH_FOR_NSN,
  9. MAX_LENGTH_COUNTRY_CODE,
  10. VALID_PUNCTUATION
  11. } from './constants.js'
  12. import createExtensionPattern from './helpers/extension/createExtensionPattern.js'
  13. import RegExpCache from './findNumbers/RegExpCache.js'
  14. import {
  15. limit,
  16. trimAfterFirstMatch
  17. } from './findNumbers/util.js'
  18. import {
  19. _pL,
  20. _pN,
  21. pZ,
  22. PZ,
  23. pNd
  24. } from './findNumbers/utf-8.js'
  25. import Leniency from './findNumbers/Leniency.js'
  26. import parsePreCandidate from './findNumbers/parsePreCandidate.js'
  27. import isValidPreCandidate from './findNumbers/isValidPreCandidate.js'
  28. import isValidCandidate, { LEAD_CLASS } from './findNumbers/isValidCandidate.js'
  29. import { isSupportedCountry } from './metadata.js'
  30. import parsePhoneNumber from './parsePhoneNumber.js'
  // When `true`, `find()` reports a match whose country is the non-geographic "001" region
  // by its `countryCallingCode` instead of by `country`. Currently switched off.
  const USE_NON_GEOGRAPHIC_COUNTRY_CODE = false

  // Extension pattern produced by `createExtensionPattern('matching')`.
  // It is appended to `PATTERN` as an optional trailing group.
  const EXTN_PATTERNS_FOR_MATCHING = createExtensionPattern('matching')

  /**
   * Patterns used to extract phone numbers from a larger phone-number-like pattern. These are
   * ordered according to specificity. For example, white-space is last since that is frequently
   * used in numbers, not just to separate two numbers. We have separate patterns since we don't
   * want to break up the phone-number-like text on more than one different kind of symbol at one
   * time, although symbols of the same type (e.g. space) can be safely grouped together.
   *
   * Note that if there is a match, we will always check any text found up to the first match as
   * well.
   *
   * Each entry is a regular expression *source string* (it is compiled with `new RegExp()` by
   * `extractInnerMatch()`), so it must not be wrapped in `/.../` delimiters. Capturing group 1
   * is the text that follows the separator.
   */
  const INNER_MATCHES = [
    // Breaks on the slash - e.g. "651-234-2345/332-445-1234".
    // (Previously this ended with a stray "/" left over from a regex literal, which required
    // a second slash after the captured text, so the example above could never match.)
    '\\/+(.*)',

    // Note that the bracket here is inside the capturing group, since we consider it part of the
    // phone number. Will match a pattern like "(650) 223 3345 (754) 223 3321".
    '(\\([^(]*)',

    // Breaks on a hyphen - e.g. "12345 - 332-445-1234 is my number."
    // We require a space on either side of the hyphen for it to be considered a separator.
    `(?:${pZ}-|-${pZ})${pZ}*(.+)`,

    // Various types of wide hyphens. Note we have decided not to enforce a space here, since it's
    // possible that it's supposed to be used to break two numbers without spaces, and we haven't
    // seen many instances of it used within a number.
    `[\u2012-\u2015\uFF0D]${pZ}*(.+)`,

    // Breaks on a full stop - e.g. "12345. 332-445-1234 is my number."
    `\\.+${pZ}*([^.]+)`,

    // Breaks on space - e.g. "3324451234 8002341234"
    `${pZ}+(${PZ}+)`
  ]
  // Repetition limit for the leading (plus sign / punctuation) group.
  const leadLimit = limit(0, 2)

  // Repetition limit for consecutive punctuation characters.
  const punctuationLimit = limit(0, 4)

  // The maximum number of digits allowed in a digit-separated block. Since all digits may sit
  // in a single block, this is large enough for a whole national number plus the
  // international country code.
  const digitBlockLimit = MAX_LENGTH_FOR_NSN + MAX_LENGTH_COUNTRY_CODE

  // Repetition limit for blocks separated by punctuation.
  // Reuses `digitBlockLimit` because some formats put a space between every digit.
  const blockLimit = limit(0, digitBlockLimit)

  // A punctuation sequence (white space allowed).
  const punctuation = `[${VALID_PUNCTUATION}]${punctuationLimit}`

  // A block of digits with no punctuation inside.
  const digitSequence = `${pNd}${limit(1, digitBlockLimit)}`

  /**
   * Source of the regular expression used by `find()` to spot phone-number-like text.
   * It allows optional punctuation and is similar to VALID_PHONE_NUMBER, except that:
   *
   * - Every repetition is bounded, which puts an upper bound on the text a match can span:
   *   - leading punctuation / plus signs are limited;
   *   - consecutive occurrences of punctuation are limited;
   *   - the number of digits is limited.
   * - No whitespace is allowed at the start or end.
   * - No alpha digits (vanity numbers such as 1-800-SIX-FLAGS) are currently supported.
   */
  const PATTERN = [
    // Optional lead: a lead character followed by punctuation.
    `(?:${LEAD_CLASS}${punctuation})${leadLimit}`,
    // The first block of digits.
    digitSequence,
    // Further blocks of digits, each preceded by punctuation.
    `(?:${punctuation}${digitSequence})${blockLimit}`,
    // Optional extension.
    `(?:${EXTN_PATTERNS_FOR_MATCHING})?`
  ].join('')

  // Matches the trailing characters that should be stripped from a candidate: everything that
  // is not covered by `_pN` / `_pL` (presumably the Unicode "number" and "letter" character
  // classes - confirm in `findNumbers/utf-8.js`). The hash character is kept as well, because
  // it may signify that the previous block was an extension.
  //
  // The Java original is `[[\P{N}&&\P{L}]&&[^#]]+$`, where `&&` is Java's character-class
  // intersection ("not a number AND not a letter AND not '#'"). A single negated class
  // expresses the same thing here.
  const UNWANTED_END_CHAR_PATTERN = new RegExp('[^' + _pN + _pL + '#]+$')

  // Captures the first run of non-digit characters.
  const NON_DIGITS_PATTERN = new RegExp('(\\D+)')

  // `Number.MAX_SAFE_INTEGER` with a fallback (2^53 - 1) for runtimes that do not define it.
  const MAX_SAFE_INTEGER = Number.MAX_SAFE_INTEGER || 9007199254740991
  /**
   * A stateful iterator that finds and extracts phone numbers from a piece of text.
   * A port of the `PhoneNumberMatcher` class of Google's `libphonenumber`.
   *
   * Typical use: call `hasNext()` / `next()` in a loop, or call `find()` directly until it
   * returns `undefined`.
   *
   * Vanity numbers (phone numbers using alphabetic digits such as `1-800-SIX-FLAGS`) are
   * not found.
   *
   * An instance keeps mutable search state (`maxTries`, the `lastIndex` of its global
   * `PATTERN` regular expression, `state`, `lastMatch`), so it must not be shared between
   * independent searches.
   */
  export default class PhoneNumberMatcher {
    /**
     * @param {string} [text] - The text to search. Defaults to an empty string.
     * @param {object} [options]
     * @param {boolean} [options.v2] - When truthy, matches are returned as `{ startsAt, endsAt, number }`; otherwise the legacy `{ startsAt, endsAt, phone, country | countryCallingCode, ext? }` shape is used.
     * @param {string} [options.defaultCountry] - Passed on to the parser. Dropped when `isSupportedCountry()` does not accept it.
     * @param {string} [options.defaultCallingCode] - Passed on to the parser.
     * @param {'POSSIBLE'|'VALID'} [options.leniency] - The leniency to use when evaluating candidate phone numbers. Defaults to `'POSSIBLE'` when `options.extended` is truthy and to `'VALID'` otherwise. See `source/findNumbers/Leniency.js` for more details.
     * @param {boolean} [options.extended] - Only used for choosing the default `leniency`.
     * @param {number} [options.maxTries] - The maximum number of invalid numbers to try before giving up on the text. This is to cover degenerate cases where the text has a lot of false positives in it. Any falsy value (including `0`) means "no limit".
     * @param {object} metadata - Metadata passed on to `isSupportedCountry()`, the parser and the leniency function.
     * @throws {TypeError} When `leniency` is not supported or `maxTries` is negative.
     */
    constructor(text = '', options = {}, metadata) {
      // Normalize the options into a fresh object; the caller's object is left untouched.
      options = {
        v2: options.v2,
        defaultCallingCode: options.defaultCallingCode,
        defaultCountry: options.defaultCountry && isSupportedCountry(options.defaultCountry, metadata) ? options.defaultCountry : undefined,
        leniency: options.leniency || (options.extended ? 'POSSIBLE' : 'VALID'),
        maxTries: options.maxTries || MAX_SAFE_INTEGER
      }

      // Validate `leniency`.
      if (!options.leniency) {
        throw new TypeError('`leniency` is required')
      }
      if (options.leniency !== 'POSSIBLE' && options.leniency !== 'VALID') {
        throw new TypeError(`Invalid \`leniency\`: "${options.leniency}". Supported values: "POSSIBLE", "VALID".`)
      }

      // Validate `maxTries`.
      if (options.maxTries < 0) {
        throw new TypeError('`maxTries` must be `>= 0`')
      }

      this.text = text
      this.options = options
      this.metadata = metadata

      // The function implementing the requested degree of phone number validation.
      this.leniency = Leniency[options.leniency]
      if (!this.leniency) {
        throw new TypeError(`Unknown leniency: "${options.leniency}"`)
      }

      /** The number of invalid candidates that may still be tried before giving up. */
      this.maxTries = options.maxTries

      // A per-instance global regular expression: its `lastIndex` is the search cursor.
      this.PATTERN = new RegExp(PATTERN, 'ig')

      /** The iteration tristate: 'NOT_READY', 'READY' or 'DONE'. */
      this.state = 'NOT_READY'

      /** Kept from the Java original; the search position is actually tracked by `PATTERN.lastIndex`. */
      this.searchIndex = 0

      // A cache for frequently used country-specific regular expressions. Set to 32 to cover ~2-3
      // countries being used for the same doc with ~10 patterns for each country. Some pages will have
      // a lot more countries in use, but typically fewer numbers for each so expanding the cache for
      // that use-case won't have a lot of benefit.
      this.regExpCache = new RegExpCache(32)
    }

    /**
     * Attempts to find the next phone number in the text, continuing from where the previous
     * call stopped. Every rejected candidate uses up one of the `maxTries`.
     *
     * NOTE(review): the search resumes from the `lastIndex` of `PATTERN`, i.e. from the end of
     * the whole candidate. When an inner match is returned, the rest of that candidate is
     * therefore not re-scanned - confirm whether that is intended.
     *
     * @return {object|undefined} The match found, or `undefined` if none can be found.
     */
    find() {
      let matches
      while ((this.maxTries > 0) && (matches = this.PATTERN.exec(this.text)) !== null) {
        const offset = matches.index
        const candidate = parsePreCandidate(matches[0])

        if (isValidPreCandidate(candidate, offset, this.text)) {
          const match =
            // Try to come up with a valid match given the entire candidate.
            this.parseAndVerify(candidate, offset, this.text)
            // If that failed, try to find an "inner match" -
            // there might be a phone number within this candidate.
            || this.extractInnerMatch(candidate, offset, this.text)

          if (match) {
            if (this.options.v2) {
              return {
                startsAt: match.startsAt,
                endsAt: match.endsAt,
                number: match.phoneNumber
              }
            }

            // Legacy result shape.
            const { phoneNumber } = match
            const result = {
              startsAt: match.startsAt,
              endsAt: match.endsAt,
              phone: phoneNumber.nationalNumber
            }

            if (phoneNumber.country) {
              // The country must be read from `phoneNumber`: there is no `country` variable in
              // this scope, so a bare reference would throw once the flag is turned on.
              /* istanbul ignore if */
              if (USE_NON_GEOGRAPHIC_COUNTRY_CODE && phoneNumber.country === '001') {
                result.countryCallingCode = phoneNumber.countryCallingCode
              } else {
                result.country = phoneNumber.country
              }
            } else {
              result.countryCallingCode = phoneNumber.countryCallingCode
            }

            if (phoneNumber.ext) {
              result.ext = phoneNumber.ext
            }

            return result
          }
        }

        // The candidate was rejected.
        this.maxTries--
      }
    }

    /**
     * Attempts to extract a match from `substring`
     * if the substring itself does not qualify as a match.
     *
     * Each of the `INNER_MATCHES` patterns is tried in order. For the first hit of a pattern,
     * the text preceding the separator is tried too. Every rejected candidate uses up one of
     * the `maxTries`.
     *
     * @param {string} substring - The candidate that did not qualify as a whole.
     * @param {number} offset - The offset of `substring` within `text`.
     * @param {string} text - The whole text being searched.
     * @return {object|undefined} The first verified match, or `undefined`.
     */
    extractInnerMatch(substring, offset, text) {
      for (const innerMatchPattern of INNER_MATCHES) {
        let isFirstMatch = true
        let candidateMatch
        const innerMatchRegExp = new RegExp(innerMatchPattern, 'g')

        while (this.maxTries > 0 && (candidateMatch = innerMatchRegExp.exec(substring)) !== null) {
          if (isFirstMatch) {
            // We should handle any group before this one too.
            const leadingCandidate = trimAfterFirstMatch(
              UNWANTED_END_CHAR_PATTERN,
              substring.slice(0, candidateMatch.index)
            )
            const leadingMatch = this.parseAndVerify(leadingCandidate, offset, text)
            if (leadingMatch) {
              return leadingMatch
            }
            this.maxTries--
            isFirstMatch = false
          }

          const candidate = trimAfterFirstMatch(UNWANTED_END_CHAR_PATTERN, candidateMatch[1])

          // Java code does `groupMatcher.start(1)` here, but the start index of a capturing
          // group is not available from `exec()` as used here, so the position of `candidate`
          // is approximated by searching for it from the start of the whole match.
          // https://stackoverflow.com/questions/15934353/get-index-of-each-capture-in-a-javascript-regex
          const candidateIndexGuess = substring.indexOf(candidate, candidateMatch.index)

          const match = this.parseAndVerify(candidate, offset + candidateIndexGuess, text)
          if (match) {
            return match
          }
          this.maxTries--
        }
      }
    }

    /**
     * Parses a phone number from the `candidate` and verifies that it is possible and that it
     * matches the requested `leniency`.
     *
     * @param {string} candidate - The candidate match.
     * @param {number} offset - The offset of `candidate` within `text`.
     * @param {string} text - The whole text being searched.
     * @return {object|undefined} `{ startsAt, endsAt, phoneNumber }` when parsing and verification succeed, otherwise `undefined`.
     */
    parseAndVerify(candidate, offset, text) {
      if (!isValidCandidate(candidate, offset, text, this.options.leniency)) {
        return
      }

      const phoneNumber = parsePhoneNumber(
        candidate,
        {
          extended: true,
          defaultCountry: this.options.defaultCountry,
          defaultCallingCode: this.options.defaultCallingCode
        },
        this.metadata
      )

      if (!phoneNumber) {
        return
      }

      if (!phoneNumber.isPossible()) {
        return
      }

      const satisfiesLeniency = this.leniency(phoneNumber, {
        candidate,
        defaultCountry: this.options.defaultCountry,
        metadata: this.metadata,
        regExpCache: this.regExpCache
      })

      if (satisfiesLeniency) {
        return {
          startsAt: offset,
          endsAt: offset + candidate.length,
          phoneNumber
        }
      }
    }

    /**
     * Tells whether there is another match, searching for it if that has not been done yet.
     * The match found is kept in `lastMatch` until `next()` hands it out.
     *
     * @return {boolean}
     */
    hasNext() {
      if (this.state === 'NOT_READY') {
        this.lastMatch = this.find()
        this.state = this.lastMatch ? 'READY' : 'DONE'
      }
      return this.state === 'READY'
    }

    /**
     * Returns the next match.
     *
     * @return {object} The match, in the shape produced by `find()`.
     * @throws {Error} When there are no more matches.
     */
    next() {
      // Check the state and find the next match as a side-effect if necessary.
      if (!this.hasNext()) {
        throw new Error('No next element')
      }
      // Don't retain that memory any longer than necessary.
      const result = this.lastMatch
      this.lastMatch = null
      this.state = 'NOT_READY'
      return result
    }
  }