scrapetools.email_scraper

  1import re
  2from string import printable
  3from urllib.parse import unquote
  4
  5
  6def validate(email: str) -> bool:
  7    """Checks string to see if it's likely an email address.
  8
  9    Returns True or False.
 10
 11    Some emails violating some of these rules
 12    may technically be valid, but are practically
 13    never seen in use out in the wild."""
 14    if email.count("@") != 1 or email.count(".") == 0:
 15        return False
 16    atdex = email.find("@")
 17    last_dot = email.rfind(".")
 18    local, domain = email.split("@")
 19    # RULES:
 20    #'@' comes before the last '.'
 21    # local part is 64 characters or less
 22    # domain part doesn't contain any '_'
 23    # at least 1 character in local is alphabetical
 24    # 1st character is not '@' or '.'
 25    # last character is not '@' or '.'
 26    # character after '@' is not '.'
 27    # doesn't start with 'www.'
 28    # local is two or more characters
 29    # domain is more than 3 characters
 30    # domain doesn't consist of only numbers
 31    # local doesn't consist of only numbers
 32    # no consecutive '.' in email
 33    # email doesn't contain a listed file ext
 34    if all(
 35        [
 36            atdex < last_dot,
 37            len(local) <= 64,
 38            domain.count("_") == 0,
 39            any(ch.isalpha() for ch in local),
 40            email[0] not in ["@", "."],
 41            email[-1] not in ["@", "."],
 42            email[email.find("@") + 1] != ".",
 43            not email.startswith("www."),
 44            len(local) >= 2,
 45            len(domain) > 3,
 46            not all(ch.isnumeric() for ch in domain.replace(".", "")),
 47            not all(ch.isnumeric() for ch in local.replace(".", "")),
 48            all(email[i - 1] != "." for i, ch in enumerate(email) if ch == "."),
 49            all(
 50                ext not in domain
 51                for ext in [
 52                    ".png",
 53                    ".jpg",
 54                    ".js",
 55                    ".html",
 56                    ".svg",
 57                    ".jpeg",
 58                    ".mp4",
 59                    ".mpeg",
 60                    ".css",
 61                    ".pdf",
 62                    ".wav",
 63                    ".docx",
 64                    ".txt",
 65                    ".rtf",
 66                    ".gif",
 67                    ".webp",
 68                    ".x.x",
 69                ]
 70            ),
 71        ]
 72    ):
 73        return True
 74    else:
 75        return False
 76
 77
 78def find_last_valid_character_offset(text: str) -> int:
 79    """Iterates through a string to find the index of the last valid character,
 80    assuming that string either starts or ends with '@'.
 81
 82    If the string doesn't start or end with '@', an Exception is raised.
 83
 84    Returns the number of valid characters between '@' and first invalid character.
 85    e.g. '@abcde%' will return 5 and '#123@' will return 3.
 86
 87    If no invalid characters are found, the function will return
 88    'len(text)-1'."""
 89
 90    """ Technically some of these characters are valid in an email string,
 91    but the ratio of how often they're used to how often they produce
 92    false positives makes them worth disregarding. """
 93    invalid_characters = " <>[]{},\"':;\\/#$%^&*()=+`?|\n\t\r"
 94    if text[-1] == "@" and text[0] != "@":
 95        # reverse the string
 96        text = text[::-1]
 97    elif text[0] != "@":
 98        raise ValueError(
 99            'First or last character of text arg needs to be "@"\n',
100            f"Argument {text} is invalid.",
101        )
102    i = 1
103    while i < len(text):
104        if text[i] in invalid_characters or text[i] not in printable:
105            return i - 1
106        else:
107            i += 1
108    return len(text) - 1
109
110
def strip_unicode(emails: list[str]) -> list[str]:
    """Remove unicode escape residue from the front of email addresses.

    Only a leading 'u003e' or 'u00a0' is removed. A marker that appears
    elsewhere in the address leaves the address untouched, rather than
    chopping off unrelated leading characters.

    :param emails: The addresses to clean.

    Returns a new list with the cleaned addresses in the same order."""
    stripped_emails = []
    for email in emails:
        for prefix in ("u003e", "u00a0"):
            email = email.removeprefix(prefix)
        stripped_emails.append(email)
    return stripped_emails
121
122
def scrape_emails_noregex(text: str) -> list[str]:
    """Pull likely email addresses out of text without using regex.

    Each '@' in the text is examined in turn: the valid characters on
    either side of it form a candidate, which is kept if it passes
    validate().

    :param text: The text to scrape.

    Returns a sorted list of unique, lower-cased addresses."""
    if "%" in text:
        # decode percent encoding
        text = unquote(text)
    for whitespace in ("\n", "\t", "\r"):
        text = text.replace(whitespace, " ")
    found = []
    total_ats = text.count("@")
    if total_ats == 0:
        return found
    cursor = 0
    for _ in range(total_ats):
        at_pos = text.find("@", cursor)
        following_at = text.find("@", at_pos + 1)
        try:
            # Look at the text from the previous stop up to the next '@'.
            if following_at == -1:
                window = text[cursor:]
            else:
                window = text[cursor:following_at]
            window_at = window.find("@")
            before = find_last_valid_character_offset(window[: window_at + 1])
            after = find_last_valid_character_offset(window[window_at:])
            candidate = window[window_at - before : window_at + after + 1]
            # Trim trailing characters until the candidate ends in a letter.
            while candidate[-1].isnumeric() or not candidate[-1].isalpha():
                candidate = candidate[:-1]
            if validate(candidate):
                found.append(candidate.lower())
            # The extra '+ 1' ensures the cursor advances even when the
            # part after '@' is empty.
            cursor = at_pos + len(candidate.split("@")[1]) + 1
        except Exception:
            # Any failure on this candidate: step past this '@' and go on.
            cursor = at_pos + 1
    return sorted(set(strip_unicode(found)))
159
160
def filter_out_files(
    emails: list[str], additional_extensions: list[str] = None
) -> list[str]:
    """Filter out emails with file extensions
    instead of domains.

    :param emails: The addresses to filter.

    :param additional_extensions: Extra file extensions to filter out.
    A leading or trailing '.' is ignored and matching is case-insensitive.

    Returns the addresses whose text after the last '.' is not a
    listed file extension, in their original order."""
    extensions = [
        "png",
        "jpg",
        "js",
        "html",
        "svg",
        "jpeg",
        "mp4",
        "mpeg",
        "css",
        "pdf",
        "wav",
        "docx",
        "txt",
        "rtf",
        "gif",
        "webp",
        "x",
    ]
    if additional_extensions:
        # Lower-case because the pattern is applied to the lower-cased email.
        extensions.extend(
            extension.strip(".").lower() for extension in additional_extensions
        )
    # Escape each extension so regex metacharacters (e.g. "c++") are matched
    # literally; skip empty strings, which would otherwise match any ending.
    ignore = "|".join(
        re.escape(extension) + "$" for extension in extensions if extension
    )
    not_a_file = re.compile(r".*[.](?!" + ignore + r")[^.]*$")
    common_tlds = ("com", "org", "net", "us", "io", "edu", "gov", "biz")
    # Lazy evaluation means we can skip the regex overhead for common domains
    return [
        email
        for email in emails
        if email[email.rfind(".") + 1 :] in common_tlds
        or not_a_file.search(email.lower())
    ]
199
200
def replace_unicodehex(text: str) -> str:
    """Return text with each unicode hex string (u003e etc.) replaced by a space.

    A match is the literal 'u00' followed by two ASCII letters or digits."""
    unicodehex = re.compile(r"u00[a-zA-Z0-9]{2}")
    return unicodehex.sub(" ", text)
204
205
def scrape_emails(text: str, extra_extensions: list[str] = None) -> list[str]:
    """Extract emails from text using regex.

    :param text: The text to scrape.

    :param extra_extensions: Extra file extensions to filter out.

    Returns a sorted list of unique, lower-cased addresses."""
    # Remove chunks with no "@" in them to reduce processing
    text = unquote(" ".join(chunk.lower() for chunk in text.split() if "@" in chunk))

    # Replace any unicode hex strings with spaces
    text = replace_unicodehex(text)

    # Validation:
    # Starts with an alphanumeric character.
    # Local part consists of 1-63 alphanumeric + '._-' characters.
    # Contains a single '@' character not at the beginning or end of a string.
    # Domain consists of one or more alphanumeric + '_-' characters
    # followed by a '.' and one or more alphanumeric + '._-' characters
    # and ending in an alphabetical character.
    pattern = (
        r"[a-zA-Z0-9]{1}[a-zA-Z0-9._-]{1,63}@[a-zA-Z0-9_-]+\.[a-zA-Z0-9._-]+[a-zA-Z]{1}"
    )

    # Lower-case before de-duplicating: percent-decoding can reintroduce
    # upper-case letters, and case variants must collapse to one entry.
    # Anything with only numbers in the local part is thrown out.
    emails = {
        email.lower()
        for email in re.findall(pattern, text)
        if not email.split("@")[0].isnumeric()
    }
    # Remove anything that looks like a file (including the caller's extra
    # extensions) and sort the final results
    return sorted(filter_out_files(list(emails), extra_extensions))
def validate(email: str) -> bool:
 7def validate(email: str) -> bool:
 8    """Checks string to see if it's likely an email address.
 9
10    Returns True or False.
11
12    Some emails violating some of these rules
13    may technically be valid, but are practically
14    never seen in use out in the wild."""
15    if email.count("@") != 1 or email.count(".") == 0:
16        return False
17    atdex = email.find("@")
18    last_dot = email.rfind(".")
19    local, domain = email.split("@")
20    # RULES:
21    #'@' comes before the last '.'
22    # local part is 64 characters or less
23    # domain part doesn't contain any '_'
24    # at least 1 character in local is alphabetical
25    # 1st character is not '@' or '.'
26    # last character is not '@' or '.'
27    # character after '@' is not '.'
28    # doesn't start with 'www.'
29    # local is two or more characters
30    # domain is more than 3 characters
31    # domain doesn't consist of only numbers
32    # local doesn't consist of only numbers
33    # no consecutive '.' in email
34    # email doesn't contain a listed file ext
35    if all(
36        [
37            atdex < last_dot,
38            len(local) <= 64,
39            domain.count("_") == 0,
40            any(ch.isalpha() for ch in local),
41            email[0] not in ["@", "."],
42            email[-1] not in ["@", "."],
43            email[email.find("@") + 1] != ".",
44            not email.startswith("www."),
45            len(local) >= 2,
46            len(domain) > 3,
47            not all(ch.isnumeric() for ch in domain.replace(".", "")),
48            not all(ch.isnumeric() for ch in local.replace(".", "")),
49            all(email[i - 1] != "." for i, ch in enumerate(email) if ch == "."),
50            all(
51                ext not in domain
52                for ext in [
53                    ".png",
54                    ".jpg",
55                    ".js",
56                    ".html",
57                    ".svg",
58                    ".jpeg",
59                    ".mp4",
60                    ".mpeg",
61                    ".css",
62                    ".pdf",
63                    ".wav",
64                    ".docx",
65                    ".txt",
66                    ".rtf",
67                    ".gif",
68                    ".webp",
69                    ".x.x",
70                ]
71            ),
72        ]
73    ):
74        return True
75    else:
76        return False

Checks string to see if it's likely an email address.

Returns True or False.

Some emails violating some of these rules may technically be valid, but are practically never seen in use out in the wild.

def find_last_valid_character_offset(text: str) -> int:
 79def find_last_valid_character_offset(text: str) -> int:
 80    """Iterates through a string to find the index of the last valid character,
 81    assuming that string either starts or ends with '@'.
 82
 83    If the string doesn't start or end with '@', an Exception is raised.
 84
 85    Returns the number of valid characters between '@' and first invalid character.
 86    e.g. '@abcde%' will return 5 and '#123@' will return 3.
 87
 88    If no invalid characters are found, the function will return
 89    'len(text)-1'."""
 90
 91    """ Technically some of these characters are valid in an email string,
 92    but the ratio of how often they're used to how often they produce
 93    false positives makes them worth disregarding. """
 94    invalid_characters = " <>[]{},\"':;\\/#$%^&*()=+`?|\n\t\r"
 95    if text[-1] == "@" and text[0] != "@":
 96        # reverse the string
 97        text = text[::-1]
 98    elif text[0] != "@":
 99        raise ValueError(
100            'First or last character of text arg needs to be "@"\n',
101            f"Argument {text} is invalid.",
102        )
103    i = 1
104    while i < len(text):
105        if text[i] in invalid_characters or text[i] not in printable:
106            return i - 1
107        else:
108            i += 1
109    return len(text) - 1

Iterates through a string to find the index of the last valid character, assuming that string either starts or ends with '@'.

If the string doesn't start or end with '@', a ValueError is raised.

Returns the number of valid characters between '@' and the first invalid character, e.g. '@abcde%' will return 5 and '#123@' will return 3.

If no invalid characters are found, the function will return 'len(text)-1'.

def strip_unicode(emails: list[str]) -> list[str]:
def strip_unicode(emails: list[str]) -> list[str]:
    """Remove unicode escape residue from the front of email addresses.

    Only a leading 'u003e' or 'u00a0' is removed. A marker that appears
    elsewhere in the address leaves the address untouched, rather than
    chopping off unrelated leading characters.

    :param emails: The addresses to clean.

    Returns a new list with the cleaned addresses in the same order."""
    stripped_emails = []
    for email in emails:
        for prefix in ("u003e", "u00a0"):
            email = email.removeprefix(prefix)
        stripped_emails.append(email)
    return stripped_emails

Removes unicode text that often gets picked up at the front of email addresses and returns the list.

def scrape_emails_noregex(text: str) -> list[str]:
def scrape_emails_noregex(text: str) -> list[str]:
    """Pull likely email addresses out of text without using regex.

    Each '@' in the text is examined in turn: the valid characters on
    either side of it form a candidate, which is kept if it passes
    validate().

    :param text: The text to scrape.

    Returns a sorted list of unique, lower-cased addresses."""
    if "%" in text:
        # decode percent encoding
        text = unquote(text)
    for whitespace in ("\n", "\t", "\r"):
        text = text.replace(whitespace, " ")
    found = []
    total_ats = text.count("@")
    if total_ats == 0:
        return found
    cursor = 0
    for _ in range(total_ats):
        at_pos = text.find("@", cursor)
        following_at = text.find("@", at_pos + 1)
        try:
            # Look at the text from the previous stop up to the next '@'.
            if following_at == -1:
                window = text[cursor:]
            else:
                window = text[cursor:following_at]
            window_at = window.find("@")
            before = find_last_valid_character_offset(window[: window_at + 1])
            after = find_last_valid_character_offset(window[window_at:])
            candidate = window[window_at - before : window_at + after + 1]
            # Trim trailing characters until the candidate ends in a letter.
            while candidate[-1].isnumeric() or not candidate[-1].isalpha():
                candidate = candidate[:-1]
            if validate(candidate):
                found.append(candidate.lower())
            # The extra '+ 1' ensures the cursor advances even when the
            # part after '@' is empty.
            cursor = at_pos + len(candidate.split("@")[1]) + 1
        except Exception:
            # Any failure on this candidate: step past this '@' and go on.
            cursor = at_pos + 1
    return sorted(set(strip_unicode(found)))

Extracts potential emails from given text and returns as a list of strings.

def filter_out_files(emails: list[str], additional_extensions: list[str] = None) -> list[str]:
def filter_out_files(
    emails: list[str], additional_extensions: list[str] = None
) -> list[str]:
    """Filter out emails with file extensions
    instead of domains.

    :param emails: The addresses to filter.

    :param additional_extensions: Extra file extensions to filter out.
    A leading or trailing '.' is ignored and matching is case-insensitive.

    Returns the addresses whose text after the last '.' is not a
    listed file extension, in their original order."""
    extensions = [
        "png",
        "jpg",
        "js",
        "html",
        "svg",
        "jpeg",
        "mp4",
        "mpeg",
        "css",
        "pdf",
        "wav",
        "docx",
        "txt",
        "rtf",
        "gif",
        "webp",
        "x",
    ]
    if additional_extensions:
        # Lower-case because the pattern is applied to the lower-cased email.
        extensions.extend(
            extension.strip(".").lower() for extension in additional_extensions
        )
    # Escape each extension so regex metacharacters (e.g. "c++") are matched
    # literally; skip empty strings, which would otherwise match any ending.
    ignore = "|".join(
        re.escape(extension) + "$" for extension in extensions if extension
    )
    not_a_file = re.compile(r".*[.](?!" + ignore + r")[^.]*$")
    common_tlds = ("com", "org", "net", "us", "io", "edu", "gov", "biz")
    # Lazy evaluation means we can skip the regex overhead for common domains
    return [
        email
        for email in emails
        if email[email.rfind(".") + 1 :] in common_tlds
        or not_a_file.search(email.lower())
    ]

Filter out emails with file extensions instead of domains.

Parameters
  • additional_extensions: Extra file extensions to filter out.
def replace_unicodehex(text: str) -> str:
def replace_unicodehex(text: str) -> str:
    """Return text with each unicode hex string (u003e etc.) replaced by a space.

    A match is the literal 'u00' followed by two ASCII letters or digits."""
    unicodehex = re.compile(r"u00[a-zA-Z0-9]{2}")
    return unicodehex.sub(" ", text)

Replace unicode hex strings (u003e etc.) with a space.

def scrape_emails(text: str, extra_extensions: list[str] = None) -> list[str]:
def scrape_emails(text: str, extra_extensions: list[str] = None) -> list[str]:
    """Extract emails from text using regex.

    :param text: The text to scrape.

    :param extra_extensions: Extra file extensions to filter out.

    Returns a sorted list of unique, lower-cased addresses."""
    # Remove chunks with no "@" in them to reduce processing
    text = unquote(" ".join(chunk.lower() for chunk in text.split() if "@" in chunk))

    # Replace any unicode hex strings with spaces
    text = replace_unicodehex(text)

    # Validation:
    # Starts with an alphanumeric character.
    # Local part consists of 1-63 alphanumeric + '._-' characters.
    # Contains a single '@' character not at the beginning or end of a string.
    # Domain consists of one or more alphanumeric + '_-' characters
    # followed by a '.' and one or more alphanumeric + '._-' characters
    # and ending in an alphabetical character.
    pattern = (
        r"[a-zA-Z0-9]{1}[a-zA-Z0-9._-]{1,63}@[a-zA-Z0-9_-]+\.[a-zA-Z0-9._-]+[a-zA-Z]{1}"
    )

    # Lower-case before de-duplicating: percent-decoding can reintroduce
    # upper-case letters, and case variants must collapse to one entry.
    # Anything with only numbers in the local part is thrown out.
    emails = {
        email.lower()
        for email in re.findall(pattern, text)
        if not email.split("@")[0].isnumeric()
    }
    # Remove anything that looks like a file (including the caller's extra
    # extensions) and sort the final results
    return sorted(filter_out_files(list(emails), extra_extensions))

Extract emails from text using regex.

Parameters
  • text: The text to scrape.

  • extra_extensions: Extra file extensions to filter out.