scrapetools.phone_scraper

  1import re
  2
  3import phonenumbers
  4from bs4 import BeautifulSoup
  5
  6
  7def get_num_consecutive_numbers(text: str, reverse: bool = False) -> int:
  8    """Finds the number of consecutive numeric characters in a string."""
  9    # limit search to 10 characters
 10    text[:10]
 11    if reverse:
 12        text = text[::-1]
 13    for i, ch in enumerate(text):
 14        if not ch.isnumeric():
 15            return i
 16    return len(text)
 17
 18
 19def find_by_separator(text: str, separator: str) -> list[str]:
 20    """Attempts to detect phone numbers according to these
 21    patterns by scanning for separators (typically '-.')
 22    and how many consecutive numbers follow or precede them:
 23
 24    (xxx)xxx{separator}xxxx
 25
 26    (xxx) xxx{separator}xxxx
 27
 28    (xxx){separator}xxx{separator}xxxx
 29
 30    xxx{separator}xxx{separator}xxxx"""
 31    count = text.count(separator)
 32    numbers = []
 33    if count > 0:
 34        last_stopdex = 0
 35        for _ in range(count):
 36            number = ""
 37            sepdex = text.find(separator, last_stopdex)
 38            if sepdex != -1:
 39                next_sepdex = text.find(separator, sepdex + 1)
 40                # consecutive numbers preceding sepdex
 41                start_offset = get_num_consecutive_numbers(
 42                    text[last_stopdex:sepdex], reverse=True
 43                )
 44                # consecutive numbers between sepdex and next_sepdex
 45                first_stop_offset = get_num_consecutive_numbers(
 46                    text[sepdex + 1 : next_sepdex + 1]
 47                )
 48                # consecutive numbers after next_sepdex
 49                second_stop_offset = get_num_consecutive_numbers(
 50                    text[next_sepdex + 1 :]
 51                )
 52
 53                if (
 54                    start_offset == 3
 55                    and first_stop_offset == 3
 56                    and second_stop_offset == 4
 57                ):
 58                    # xxx{separator}xxx{separator}xxxx
 59                    number = text[
 60                        sepdex - start_offset : next_sepdex + second_stop_offset + 1
 61                    ]
 62                elif (
 63                    start_offset == 0
 64                    and first_stop_offset == 3
 65                    and second_stop_offset == 4
 66                    and text[sepdex - 1] == ")"
 67                    and text[sepdex - 5] == "("
 68                ):
 69                    # (xxx){separator}xxx{separator}xxxx
 70                    number = text[
 71                        sepdex - 5 : sepdex + first_stop_offset + second_stop_offset + 2
 72                    ]
 73                elif start_offset == 3 and text[sepdex - 4] in [")", " "]:
 74                    # (xxx)xxx{separator}xxxx or (xxx) xxx{separator}xxxx
 75                    number = text[sepdex - 8 : sepdex + 5]
 76                last_stopdex = sepdex + 5
 77                for ch in [separator, "(", ")", " "]:
 78                    number = number.replace(ch, "")
 79                if len(number) == 10 and all(ch.isnumeric() for ch in number):
 80                    numbers.append(number)
 81    return numbers
 82
 83
 84def find_by_href(text: str) -> list[str]:
 85    """Scrapes phone numbers by href attribute."""
 86    indicator = 'href="'
 87    count = text.count(indicator)
 88    prefixes = ["tel:", "callto:"]
 89    index = 0
 90    numbers = []
 91    for _ in range(count):
 92        index = text.find(indicator, index + 1)
 93        number = text[index + len(indicator) : text.find('"', index + len(indicator))]
 94        if any(prefix in number for prefix in prefixes):
 95            number = "".join(
 96                [num for num in number[number.find(":") + 1 :] if num.isnumeric()]
 97            )
 98            if len(number) == 10:
 99                numbers.append(number)
100    return numbers
101
102
def scrape_phone_numbers_noregex(text: str) -> list[str]:
    """Scrape U.S. phone numbers from ``text`` without regular expressions.

    Every ``"+1"`` is removed from ``text``. Candidates are then gathered
    with ``find_by_separator`` (once for ``"-"`` and once for ``"."``) and
    with ``find_by_href``.

    Returns:
        A sorted list of the distinct candidates that ``phonenumbers``
        reports as valid when parsed with a ``"+1"`` prefix.
    """
    stripped = text.replace("+1", "")
    candidates = [
        *find_by_separator(stripped, "-"),
        *find_by_separator(stripped, "."),
        *find_by_href(stripped),
    ]
    unique_valid = {
        candidate
        for candidate in candidates
        if phonenumbers.is_valid_number(phonenumbers.parse("+1" + candidate))
    }
    return sorted(unique_valid)
117
118
def scrape_phone_numbers(text: str) -> list[str]:
    """Scrape phone numbers from ``text`` with a regular expression.

    Every ``"+1"`` in ``text`` is replaced by a space before matching.
    Each regex match is reduced to its digits.

    Returns:
        A sorted list of the distinct digit strings that ``phonenumbers``
        reports as valid when parsed with a ``"+1"`` prefix.
    """
    cleaned = text.replace("+1", " ")
    matcher = re.compile(
        r"\b\(?[2-9]{1}[0-9]{2}\)?[ .-]{1}[2-9]{1}[0-9]{2}[ .-]{1}[0-9]{4}\b"
    )
    valid = set()
    for match in matcher.finditer(cleaned):
        digits = re.sub(r"[^0-9]", "", match.group())
        if phonenumbers.is_valid_number(phonenumbers.parse("+1" + digits)):
            valid.add(digits)
    return sorted(valid)
def get_num_consecutive_numbers(text: str, reverse: bool = False) -> int:
 8def get_num_consecutive_numbers(text: str, reverse: bool = False) -> int:
 9    """Finds the number of consecutive numeric characters in a string."""
10    # limit search to 10 characters
11    text[:10]
12    if reverse:
13        text = text[::-1]
14    for i, ch in enumerate(text):
15        if not ch.isnumeric():
16            return i
17    return len(text)

Finds the number of consecutive numeric characters in a string.

def find_by_separator(text: str, separator: str) -> list[str]:
def _count_digits_forward(text: str, start: int, limit: int = 10) -> int:
    """Count consecutive numeric characters in ``text`` beginning at ``start``.

    At most ``limit`` characters are examined.
    """
    stop = min(len(text), start + limit)
    index = start
    while index < stop and text[index].isnumeric():
        index += 1
    return index - start


def _count_digits_backward(text: str, stop: int, limit: int = 10) -> int:
    """Count consecutive numeric characters in ``text`` ending just before ``stop``.

    At most ``limit`` characters are examined.
    """
    floor = max(0, stop - limit)
    index = stop
    while index > floor and text[index - 1].isnumeric():
        index -= 1
    return stop - index


def find_by_separator(text: str, separator: str) -> list[str]:
    """Detect 10-digit phone numbers in ``text`` that use ``separator``.

    Each occurrence of ``separator`` (typically ``"-"`` or ``"."``) is
    examined together with the runs of digits around it. These layouts
    are recognised:

    (xxx)xxx{separator}xxxx

    (xxx) xxx{separator}xxxx

    (xxx){separator}xxx{separator}xxxx

    xxx{separator}xxx{separator}xxxx

    Args:
        text: The text to scan.
        separator: The separator to look for. The index arithmetic
            assumes a single character; an empty separator yields no
            results.

    Returns:
        The matched numbers, in order of appearance, each reduced to its
        10 digits. Duplicates are kept.
    """
    numbers: list[str] = []
    if not separator:
        return numbers

    search_from = 0
    while True:
        sepdex = text.find(separator, search_from)
        if sepdex == -1:
            break
        next_sepdex = text.find(separator, sepdex + 1)

        # Digits immediately before this separator, measured on the full
        # text so that a run is never cut short by the scan position.
        start_offset = _count_digits_backward(text, sepdex)
        if next_sepdex == -1:
            # Without a second separator only the single-separator
            # layouts can apply.
            first_stop_offset = second_stop_offset = 0
        else:
            # Digits right after this separator and right after the next one.
            first_stop_offset = _count_digits_forward(text, sepdex + 1)
            second_stop_offset = _count_digits_forward(text, next_sepdex + 1)

        candidate = ""
        candidate_end = sepdex + 1
        if start_offset == 3 and first_stop_offset == 3 and second_stop_offset == 4:
            # xxx{separator}xxx{separator}xxxx
            candidate_end = next_sepdex + second_stop_offset + 1
            candidate = text[sepdex - start_offset : candidate_end]
        elif (
            start_offset == 0
            and first_stop_offset == 3
            and second_stop_offset == 4
            and sepdex >= 5  # keep the look-behind indexes non-negative
            and text[sepdex - 1] == ")"
            and text[sepdex - 5] == "("
        ):
            # (xxx){separator}xxx{separator}xxxx
            candidate_end = sepdex + first_stop_offset + second_stop_offset + 2
            candidate = text[sepdex - 5 : candidate_end]
        elif start_offset == 3 and sepdex >= 4 and text[sepdex - 4] in (")", " "):
            # (xxx)xxx{separator}xxxx or (xxx) xxx{separator}xxxx
            candidate_end = sepdex + 5
            # Clamp so a number at the very start of the text is not
            # lost to negative-index wraparound.
            candidate = text[max(0, sepdex - 8) : candidate_end]

        for ch in (separator, "(", ")", " "):
            candidate = candidate.replace(ch, "")
        if len(candidate) == 10 and all(ch.isnumeric() for ch in candidate):
            numbers.append(candidate)
            # Resume after the matched span so its own separators are
            # not examined again.
            search_from = candidate_end
        else:
            # Move just past this separator so that no later separator
            # is skipped.
            search_from = sepdex + 1
    return numbers

Attempts to detect phone numbers according to these patterns by scanning for separators (typically '-.') and how many consecutive numbers follow or precede them:

(xxx)xxx{separator}xxxx

(xxx) xxx{separator}xxxx

(xxx){separator}xxx{separator}xxxx

xxx{separator}xxx{separator}xxxx

def find_by_href(text: str) -> list[str]:
 85def find_by_href(text: str) -> list[str]:
 86    """Scrapes phone numbers by href attribute."""
 87    indicator = 'href="'
 88    count = text.count(indicator)
 89    prefixes = ["tel:", "callto:"]
 90    index = 0
 91    numbers = []
 92    for _ in range(count):
 93        index = text.find(indicator, index + 1)
 94        number = text[index + len(indicator) : text.find('"', index + len(indicator))]
 95        if any(prefix in number for prefix in prefixes):
 96            number = "".join(
 97                [num for num in number[number.find(":") + 1 :] if num.isnumeric()]
 98            )
 99            if len(number) == 10:
100                numbers.append(number)
101    return numbers

Scrapes phone numbers by href attribute.

def scrape_phone_numbers_noregex(text: str) -> list[str]:
def scrape_phone_numbers_noregex(text: str) -> list[str]:
    """Scrape U.S. phone numbers from ``text`` without regular expressions.

    Every ``"+1"`` is removed from ``text``. Candidates are then gathered
    with ``find_by_separator`` (once for ``"-"`` and once for ``"."``) and
    with ``find_by_href``.

    Returns:
        A sorted list of the distinct candidates that ``phonenumbers``
        reports as valid when parsed with a ``"+1"`` prefix.
    """
    stripped = text.replace("+1", "")
    candidates = [
        *find_by_separator(stripped, "-"),
        *find_by_separator(stripped, "."),
        *find_by_href(stripped),
    ]
    unique_valid = {
        candidate
        for candidate in candidates
        if phonenumbers.is_valid_number(phonenumbers.parse("+1" + candidate))
    }
    return sorted(unique_valid)

Scrape for U.S. phone numbers.

def scrape_phone_numbers(text: str) -> list[str]:
def scrape_phone_numbers(text: str) -> list[str]:
    """Scrape phone numbers from ``text`` with a regular expression.

    Every ``"+1"`` in ``text`` is replaced by a space before matching.
    Each regex match is reduced to its digits.

    Returns:
        A sorted list of the distinct digit strings that ``phonenumbers``
        reports as valid when parsed with a ``"+1"`` prefix.
    """
    cleaned = text.replace("+1", " ")
    matcher = re.compile(
        r"\b\(?[2-9]{1}[0-9]{2}\)?[ .-]{1}[2-9]{1}[0-9]{2}[ .-]{1}[0-9]{4}\b"
    )
    valid = set()
    for match in matcher.finditer(cleaned):
        digits = re.sub(r"[^0-9]", "", match.group())
        if phonenumbers.is_valid_number(phonenumbers.parse("+1" + digits)):
            valid.add(digits)
    return sorted(valid)

Scrape phone numbers from text using regex.