pixelripper.pixelripper
import argparse
import shutil
import time
from pathlib import Path
from urllib.parse import urlparse

import requests
from printbuddies import ProgBar
from seleniumuser import User
from whosyouragent import get_agent

from scrapetools import LinkScraper

root = Path(__file__).parent
class PixelRipper:
Scrape and download media links.
    def __init__(self):
        self.scraper: LinkScraper
        self.source_url: str
        self.savedir: str | Path
        self.video_exts: list[str] = (
            (root / "video_extensions.txt").read_text().splitlines()
        )
        self.audio_exts: list[str] = (
            (root / "audio_extensions.txt").read_text().splitlines()
        )
        self.image_urls: list[str]
        self.video_urls: list[str]
        self.audio_urls: list[str]
    def get(self, url: str, extra_headers: dict[str, str] = {}) -> requests.Response:
        """Construct and make a request for a given url.
        Returns a requests.Response object.

        :param extra_headers: By default, only a
        random user-agent string is used in
        the request header, but additional
        key-value pairs can be added via this param."""
        headers = {"User-Agent": get_agent()}
        headers |= extra_headers
        response = requests.get(url, headers=headers)
        if response.status_code != 200:
            raise RuntimeError(
                f"getting {url} failed with response code {response.status_code}."
            )
        return response
Construct and make a request for a given url. Returns a requests.Response object.
Parameters
- extra_headers: By default, only a random user-agent string is used in the request header, but additional key-value pairs can be added via this param.
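A minimal usage sketch; the URL and Referer value below are placeholders rather than anything from the library:

ripper = PixelRipper()
# Merge an extra Referer header with the randomly chosen user-agent string.
response = ripper.get(
    "https://example.com/gallery",
    extra_headers={"Referer": "https://example.com"},
)
print(response.status_code)  # 200; a non-200 response makes get() raise RuntimeError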
    def rip(self, url: str, extra_headers: dict[str, str] = {}):
        """Scrape page and store urls in a LinkScraper object.

        :param url: The url to scrape for media content.

        :param extra_headers: Any additional HTTP headers to submit
        with the request."""
        response = self.get(url, extra_headers=extra_headers)
        self.scraper = LinkScraper(response.text, response.url)
        self.scraper.scrape_page()
        self.image_urls = [
            url
            for url in self.scraper.get_links("img")
            if not url.strip().strip("/").endswith(".com")
            and "apple-touch-icon" not in url.lower()
            and "favicon" not in url.lower()
        ]
        self.video_urls = self.filter_by_extensions(
            self.video_exts, self.image_urls
        ) + [
            url
            for url in self.scraper.get_links("all", excluded_links=self.image_urls)
            if "video" in urlparse(url).path
        ]
        self.audio_urls = self.filter_by_extensions(
            self.audio_exts, self.image_urls + self.video_urls
        ) + [
            url
            for url in self.scraper.get_links("all", excluded_links=self.image_urls)
            if "audio" in urlparse(url).path
        ]
Scrape page and store urls in a LinkScraper object.
Parameters
- url: The url to scrape for media content.
- extra_headers: Any additional HTTP headers to submit with the request.
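A short sketch of rip() and the lists it populates; the URL is a placeholder:

ripper = PixelRipper()
ripper.rip("https://example.com/gallery")
# rip() fills these three lists from the scraped page.
print(f"{len(ripper.image_urls)} image links")
print(f"{len(ripper.video_urls)} video links")
print(f"{len(ripper.audio_urls)} audio links")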
    def filter_by_extensions(self, extensions: list[str], ignores: list[str] = []):
        """Return file urls from self.scraper
        according to a list of extensions.

        :param extensions: List of file extensions.
        Return urls that have an extension matching one in this list.

        :param ignores: List of urls. Filter out any urls
        in this list regardless of whether they have
        an extension matching one in the extensions param."""
        return [
            url.lower()
            for url in self.scraper.get_links("all", excluded_links=ignores)
            if any(ext == Path(url).suffix.lower() for ext in extensions)
        ]
Return file urls from self.scraper according to a list of extensions.
Parameters
- extensions: List of file extensions. Return urls that have an extension matching one in this list.
- ignores: List of urls. Filter out any urls in this list regardless of whether they have an extension matching one in the extensions param.
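filter_by_extensions() can also be called directly after rip() to pull other file types out of the scraped links; the extension list below is an arbitrary example:

ripper = PixelRipper()
ripper.rip("https://example.com/downloads")
# Collect document links that the image/video/audio lists don't cover.
doc_urls = ripper.filter_by_extensions(
    [".pdf", ".epub"], ignores=ripper.image_urls
)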
    def download_files(
        self,
        urls: list[str],
        dst: Path | str,
        extra_headers: dict | None = None,
        missing_ext_sub: str = "",
    ) -> list[tuple[str, int | None]]:
        """Download a list of files.

        :param urls: A list of urls to download

        :param dst: The destination path to save the files to.

        :param extra_headers: Any additional headers
        to be added to the request.

        :param missing_ext_sub: A file extension to use for
        the saved file if the url doesn't have one.

        :return list[tuple[str, int | None]]: A list of files that failed to download.
        Each element of the list is a tuple where the first element
        is the file url and the second element is the status code that
        was returned from requests. If the download failed without a status code,
        this element will be None."""
        failures = []
        bar = ProgBar(len(urls))
        bar.counter = 1
        dst = Path(dst)
        dst.mkdir(parents=True, exist_ok=True)
        for url in urls:
            bar.display(prefix=f"Downloading file {bar.counter}/{bar.total}")
            headers = {"User-Agent": get_agent()}
            if extra_headers:
                headers |= extra_headers
            try:
                response = requests.get(url, headers=headers)
            except Exception:
                failures.append((url, None))
                continue
            if response.status_code != 200:
                failures.append((url, response.status_code))
                continue
            filename = Path(urlparse(url).path).name
            if Path(filename).suffix == "":
                filename += missing_ext_sub
            filepath = dst / filename
            if filepath.exists():
                filepath = filepath.with_stem(filepath.stem + str(time.time()))
            filepath.write_bytes(response.content)
        return failures
Download a list of files.
Parameters
- urls: A list of urls to download.
- dst: The destination path to save the files to.
- extra_headers: Any additional headers to be added to the request.
- missing_ext_sub: A file extension to use for the saved file if the url doesn't have one.
Returns
A list of files that failed to download. Each element of the list is a tuple where the first element is the file url and the second element is the status code that was returned from requests. If the download failed without a status code, this element will be None.
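A sketch of downloading one scraped list by hand and reporting failures; the URL and destination folder are placeholders:

ripper = PixelRipper()
ripper.rip("https://example.com/gallery")
failed = ripper.download_files(
    ripper.image_urls, "rips/images", missing_ext_sub=".jpg"
)
for url, status in failed:
    # status is None when the request itself raised an exception.
    print(f"failed: {url} ({status})")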
    def download_all(
        self,
        dst: Path | str,
        extra_headers: dict | None = None,
        missing_ext_subs: tuple[str, str, str] = (".jpg", ".mp4", ".mp3"),
    ) -> dict[str, list[tuple[str, int | None]]]:
        """Download all scraped files.

        :param dst: The destination folder to save to.
        Separate subfolders for images, videos, and audio
        will be created.

        :param extra_headers: Any additional headers
        to be added to the request.

        :param missing_ext_subs: A three-tuple of file extensions to use for
        the saved file if the url doesn't have one.
        The expected order is (image_ext, video_ext, audio_ext).

        :return dict[str, list[tuple[str, int | None]]]: Returns files
        that failed to download. The keys are "images", "videos", and "audio".
        The values are a list of tuples where the first element
        is the file url and the second element is the status code that
        was returned from requests. If the download failed without a status code,
        this element will be None.
        """
        link_types = ["images", "videos", "audio"]
        dst = Path(dst)
        subdirs = [dst / link_type for link_type in link_types]
        failures = {}
        for urls, subdir, ext_sub, link_type in zip(
            (self.image_urls, self.video_urls, self.audio_urls),
            subdirs,
            missing_ext_subs,
            link_types,
        ):
            fails = self.download_files(urls, subdir, extra_headers, ext_sub)
            if len(fails) > 0:
                failures[link_type] = fails
        # Remove any subdirs that ended up empty
        for subdir in subdirs:
            try:
                subdir.rmdir()
            except Exception:
                ...
        return failures
Download all scraped files.
Parameters
- dst: The destination folder to save to. Separate subfolders for images, videos, and audio will be created.
- extra_headers: Any additional headers to be added to the request.
- missing_ext_subs: A three-tuple of file extensions to use for the saved file if the url doesn't have one. The expected order is (image_ext, video_ext, audio_ext).
Returns
Files that failed to download. The keys are "images", "videos", and "audio". The values are lists of tuples where the first element is the file url and the second element is the status code that was returned from requests. If the download failed without a status code, this element will be None.
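The one-call equivalent using download_all(); the URL and destination folder are again placeholders:

ripper = PixelRipper()
ripper.rip("https://example.com/gallery")
failures = ripper.download_all("rips")
# e.g. {"videos": [("https://example.com/clip", 403)]}
for link_type, fails in failures.items():
    print(link_type, len(fails), "failed")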
class PixelRipperSelenium(PixelRipper):
    def __init__(self, headless: bool = True, browser: str = "firefox"):
        super().__init__()
        self.user = User(headless=headless, browser_type=browser)
Scrape and download media links.
    def get(self, url: str, *args, **kwargs) -> requests.Response:
        """Get webpage using selenium.

        :param url: The url to scrape for media content.

        :return: A pseudo requests.Response
        object that only has "text" and "url"
        members."""
        try:
            if not self.user.browser_open:
                self.user.open_browser()
            self.user.get(url)
            time.sleep(1)
            old_scroll_height = self.user.script("return document.body.scrollHeight;")
            # Try to scroll to the bottom of a continuously loading page
            while True:
                self.user.scroll(fraction=1)
                time.sleep(1)
                if (
                    self.user.script("return document.body.scrollHeight;")
                    == old_scroll_height
                ):
                    break
                old_scroll_height = self.user.script(
                    "return document.body.scrollHeight;"
                )
                time.sleep(1)

            class Response:
                def __init__(self, text: str, url: str):
                    self.text = text
                    self.url = url

            return Response(self.user.browser.page_source, url)
        except Exception as e:
            print(e)
        finally:
            self.user.close_browser()
Get webpage using selenium.
Parameters
- url: The url to scrape for media content.
Returns
A pseudo requests.Response object that only has "text" and "url" members.
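A usage sketch of the selenium engine, assuming the matching webdriver (e.g. geckodriver for firefox) is installed for your browser version; the URL is a placeholder:

# Visible browser window instead of headless, so the scrolling can be watched.
ripper = PixelRipperSelenium(headless=False, browser="firefox")
ripper.rip("https://example.com/infinite-scroll-feed")
print(len(ripper.image_urls), "image links found")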
def get_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser()

    parser.add_argument("url", type=str, help=""" The url to scrape for media. """)

    parser.add_argument(
        "-s",
        "--selenium",
        action="store_true",
        help=""" Use selenium to get page content
        instead of requests. """,
    )

    parser.add_argument(
        "-nh",
        "--no_headless",
        action="store_true",
        help="Don't use headless mode when using -s/--selenium. ",
    )

    parser.add_argument(
        "-b",
        "--browser",
        default="firefox",
        type=str,
        help=""" The browser to use when using -s/--selenium.
        Can be 'firefox' or 'chrome'. You must have the appropriate webdriver
        installed for your machine and browser version in order to use the selenium engine.""",
    )

    parser.add_argument(
        "-o",
        "--output_path",
        type=str,
        default=None,
        help=""" Output directory to save results to.
        If not specified, a folder with the name of the
        webpage will be created in the current working directory.""",
    )

    parser.add_argument(
        "-eh",
        "--extra_headers",
        nargs="*",
        type=str,
        default=[],
        help=""" Extra headers to use when requesting files as key, value pairs.
        Keys and values should be colon separated and pairs should be space separated.
        e.g. -eh Referer:website.com/page Host:website.com""",
    )

    args = parser.parse_args()

    if not args.output_path:
        # removeprefix() drops a leading "www." without stripping other characters.
        args.output_path = Path.cwd() / urlparse(args.url).netloc.removeprefix("www.")
    else:
        args.output_path = Path(args.output_path).resolve()
    args.browser = args.browser.lower()
    if args.extra_headers:
        args.extra_headers = {
            pair[: pair.find(":")]: pair[pair.find(":") + 1 :]
            for pair in args.extra_headers
        }
    return args
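For reference, the -eh/--extra_headers parsing above turns each colon-separated pair into a dictionary entry:

pairs = ["Referer:website.com/page", "Host:website.com"]
# Split each pair on its first colon, as get_args() does.
headers = {pair[: pair.find(":")]: pair[pair.find(":") + 1 :] for pair in pairs}
print(headers)  # {'Referer': 'website.com/page', 'Host': 'website.com'}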
def main(args: argparse.Namespace = None):
    if not args:
        args = get_args()
    ripper = (
        PixelRipperSelenium(not args.no_headless, args.browser)
        if args.selenium
        else PixelRipper()
    )
    ripper.rip(args.url)
    failures = ripper.download_all(args.output_path, extra_headers=args.extra_headers)
    if len(failures) > 0:
        print("Failed to download the following:")
        for key in failures:
            if len(failures[key]) > 0:
                print(f"{key}:")
                print(*failures[key], sep="\n")


if __name__ == "__main__":
    main(get_args())
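main() can also be driven programmatically with a prebuilt Namespace that mirrors the fields get_args() produces; the URL and output folder below are placeholders:

import argparse

from pixelripper.pixelripper import main

args = argparse.Namespace(
    url="https://example.com/gallery",
    selenium=False,
    no_headless=False,
    browser="firefox",
    output_path="example_rip",
    extra_headers={"Referer": "https://example.com"},
)
main(args)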