pixelripper.pixelripper

import argparse
import shutil
import time
from pathlib import Path
from urllib.parse import urlparse

import requests

from printbuddies import ProgBar
from scrapetools import LinkScraper
from seleniumuser import User
from whosyouragent import get_agent

root = Path(__file__).parent


class PixelRipper:
    """Scrape and download media links."""

    def __init__(self):
        self.scraper: LinkScraper
        self.source_url: str
        self.savedir: str | Path
        self.video_exts: list[str] = (
            (root / "video_extensions.txt").read_text().splitlines()
        )
        self.audio_exts: list[str] = (
            (root / "audio_extensions.txt").read_text().splitlines()
        )
        self.image_urls: list[str]
        self.video_urls: list[str]
        self.audio_urls: list[str]

    def get(self, url: str, extra_headers: dict[str, str] = {}) -> requests.Response:
        """Construct and make a request for a given url.
        Returns a requests.Response object.

        :param extra_headers: By default, only a
        random user-agent string is used in
        the request header, but additional
        key-value pairs can be added via this param."""
        headers = {"User-Agent": get_agent()}
        headers |= extra_headers
        response = requests.get(url, headers=headers)
        if response.status_code != 200:
            raise RuntimeError(
                f"getting {url} failed with response code {response.status_code}."
            )
        return response

    def rip(self, url: str, extra_headers: dict[str, str] = {}):
        """Scrape page and store urls in a LinkScraper object.

        :param url: The url to scrape for media content.

        :param extra_headers: Any additional HTTP headers to submit
        with the request."""
        response = self.get(url, extra_headers=extra_headers)
        self.scraper = LinkScraper(response.text, response.url)
        self.scraper.scrape_page()
        self.image_urls = [
            url
            for url in self.scraper.get_links("img")
            if not url.strip().strip("/").endswith(".com")
            and "apple-touch-icon" not in url.lower()
            and "favicon" not in url.lower()
        ]
        self.video_urls = self.filter_by_extensions(
            self.video_exts, self.image_urls
        ) + [
            url
            for url in self.scraper.get_links("all", excluded_links=self.image_urls)
            if "video" in urlparse(url).path
        ]
        self.audio_urls = self.filter_by_extensions(
            self.audio_exts, self.image_urls + self.video_urls
        ) + [
            url
            for url in self.scraper.get_links("all", excluded_links=self.image_urls)
            if "audio" in urlparse(url).path
        ]

    def filter_by_extensions(self, extensions: list[str], ignores: list[str] = []):
        """Return file urls from self.scraper
        according to a list of extensions.

        :param extensions: List of file extensions.
        Return urls that have an extension matching one in this list.

        :param ignores: List of urls. Filter out any urls
        in this list regardless of whether they have
        an extension matching one in the extensions param."""
        return [
            url.lower()
            for url in self.scraper.get_links("all", excluded_links=ignores)
            if any(ext == Path(url).suffix.lower() for ext in extensions)
        ]

    def download_files(
        self,
        urls: list[str],
        dst: Path | str,
        extra_headers: dict = None,
        missing_ext_sub: str = "",
    ) -> list[tuple[str, int | None]]:
        """Download a list of files.

        :param urls: A list of urls to download

        :param dst: The destination path to save the files to.

        :param extra_headers: Any additional headers
        to be added to the request.

        :param missing_ext_sub: A file extension to use for
        the saved file if the url doesn't have one.

        :return list[tuple[str, int|None]]: A list of files that failed to download.
        Each element of the list is a tuple where the first element
        is the file url and the second element is the status code that
        was returned from requests. If the download failed without a status code,
        this element will be None."""
        failures = []
        bar = ProgBar(len(urls))
        bar.counter = 1
        dst = Path(dst)
        dst.mkdir(parents=True, exist_ok=True)
        for url in urls:
            bar.display(prefix=f"Downloading file {bar.counter}/{bar.total}")
            headers = {"User-Agent": get_agent()}
            if extra_headers:
                headers |= extra_headers
            try:
                response = requests.get(url, headers=headers)
            except Exception:
                failures.append((url, None))
                continue
            if response.status_code != 200:
                failures.append((url, response.status_code))
                continue
            filename = Path(urlparse(url).path).name
            if Path(filename).suffix == "":
                filename += missing_ext_sub
            filepath = dst / filename
            if filepath.exists():
                filepath = filepath.with_stem(filepath.stem + str(time.time()))
            filepath.write_bytes(response.content)
        return failures

    def download_all(
        self,
        dst: Path | str,
        extra_headers: dict = None,
        missing_ext_subs: tuple[str, str, str] = (".jpg", ".mp4", ".mp3"),
    ) -> dict[str, list[tuple[str, int | None]]]:
        """Download all scraped files.

        :param dst: The destination folder to save to.
        Separate subfolders for images, videos, and audio
        will be created.

        :param extra_headers: Any additional headers
        to be added to the request.

        :param missing_ext_subs: A three-tuple of file extensions to use for
        the saved file if the url doesn't have one.
        The expected order is (image_ext, video_ext, audio_ext)

        :return dict[str, list[tuple[str, int | None]]]: Returns files
        that failed to download. The keys are "images", "videos", and "audio".
        The values are a list of tuples where the first element
        is the file url and the second element is the status code that
        was returned from requests. If the download failed without a status code,
        this element will be None.
        """
        link_types = ["images", "videos", "audio"]
        dst = Path(dst)
        subdirs = [dst / link_type for link_type in link_types]
        failures = {}
        for urls, subdir, ext_sub, link_type in zip(
            (self.image_urls, self.video_urls, self.audio_urls),
            subdirs,
            missing_ext_subs,
            link_types,
        ):
            fails = self.download_files(urls, subdir, extra_headers, ext_sub)
            if len(fails) > 0:
                failures[link_type] = fails
        # Remove any subdirs that are still empty (rmdir only succeeds on empty directories)
        for subdir in subdirs:
            try:
                subdir.rmdir()
            except Exception:
                ...
        return failures


class PixelRipperSelenium(PixelRipper):
    def __init__(self, headless: bool = True, browser: str = "firefox"):
        super().__init__()
        self.user = User(headless=headless, browser_type=browser)

    def get(self, url: str, *args, **kwargs) -> requests.Response:
        """Get webpage using selenium.

        :param url: The url to scrape for media content.

        :return: A pseudo requests.Response
        object that only has "text" and "url"
        members."""
        try:
            if not self.user.browser_open:
                self.user.open_browser()
            self.user.get(url)
            time.sleep(1)
            old_scroll_height = self.user.script("return document.body.scrollHeight;")
            # Try to scroll to the bottom of a continuously loading page
            while True:
                self.user.scroll(fraction=1)
                time.sleep(1)
                if (
                    self.user.script("return document.body.scrollHeight;")
                    == old_scroll_height
                ):
                    break
                old_scroll_height = self.user.script(
                    "return document.body.scrollHeight;"
                )
            time.sleep(1)

            class Response:
                def __init__(self, text: str, url: str):
                    self.text = text
                    self.url = url

            return Response(self.user.browser.page_source, url)
        except Exception as e:
            print(e)
        finally:
            self.user.close_browser()


def get_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser()

    parser.add_argument("url", type=str, help=""" The url to scrape for media. """)

    parser.add_argument(
        "-s",
        "--selenium",
        action="store_true",
        help=""" Use selenium to get page content
        instead of requests. """,
    )

    parser.add_argument(
        "-nh",
        "--no_headless",
        action="store_true",
        help="Don't use headless mode when using -s/--selenium. ",
    )

    parser.add_argument(
        "-b",
        "--browser",
        default="firefox",
        type=str,
        help=""" The browser to use when using -s/--selenium.
        Can be 'firefox' or 'chrome'. You must have the appropriate webdriver
        installed for your machine and browser version in order to use the selenium engine.""",
    )

    parser.add_argument(
        "-o",
        "--output_path",
        type=str,
        default=None,
        help=""" Output directory to save results to.
        If not specified, a folder with the name of the
        webpage will be created in the current working directory.""",
    )

    parser.add_argument(
        "-eh",
        "--extra_headers",
        nargs="*",
        type=str,
        default=[],
        help=""" Extra headers to use when requesting files as key, value pairs.
        Keys and values should be colon separated and pairs should be space separated.
        e.g. -eh Referer:website.com/page Host:website.com""",
    )

    args = parser.parse_args()

    if not args.output_path:
        args.output_path = Path.cwd() / urlparse(args.url).netloc.removeprefix("www.")
    else:
        args.output_path = Path(args.output_path).resolve()
    args.browser = args.browser.lower()
    if args.extra_headers:
        args.extra_headers = {
            pair[: pair.find(":")]: pair[pair.find(":") + 1 :]
            for pair in args.extra_headers
        }
    return args


def main(args: argparse.Namespace = None):
    if not args:
        args = get_args()
    ripper = (
        PixelRipperSelenium(not args.no_headless, args.browser)
        if args.selenium
        else PixelRipper()
    )
    ripper.rip(args.url)
    failures = ripper.download_all(args.output_path, extra_headers=args.extra_headers)
    if len(failures) > 0:
        print("Failed to download the following:")
        for key in failures:
            if len(failures[key]) > 0:
                print(f"{key}:")
                print(*failures[key], sep="\n")


if __name__ == "__main__":
    main(get_args())
class PixelRipper:

Scrape and download media links.
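A minimal usage sketch (the url and destination folder below are placeholders): scrape a page, then download everything that was found.

from pixelripper.pixelripper import PixelRipper

ripper = PixelRipper()
# Scrape the page and sort the links into image/video/audio url lists
ripper.rip("https://example.com/gallery")
print(f"{len(ripper.image_urls)} image links found")
# Download everything into ./example_media/{images,videos,audio}
failures = ripper.download_all("example_media")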

PixelRipper()
def get(self, url: str, extra_headers: dict[str, str] = {}) -> requests.models.Response:

Construct and make a request for a given url. Returns a requests.Response object.

Parameters
  • extra_headers: By default, only a random user-agent string is used in the request header, but additional key-value pairs can be added via this param.
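For example (header values are illustrative), a Referer header can be sent alongside the generated user-agent:

ripper = PixelRipper()
# extra_headers is merged into the randomly generated User-Agent header;
# a RuntimeError is raised if the response code is not 200
response = ripper.get("https://example.com/page", extra_headers={"Referer": "https://example.com"})
print(response.status_code)
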
def rip(self, url: str, extra_headers: dict[str, str] = {}):

Scrape page and store urls in a LinkScraper object.

Parameters
  • url: The url to scrape for media content.

  • extra_headers: Any additional HTTP headers to submit with the request.
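After rip() returns, the sorted link lists are available on the instance; a short sketch (the url is a placeholder):

ripper = PixelRipper()
ripper.rip("https://example.com/media")
# rip() populates image_urls, video_urls, and audio_urls
for label, urls in (
    ("images", ripper.image_urls),
    ("videos", ripper.video_urls),
    ("audio", ripper.audio_urls),
):
    print(f"{label}: {len(urls)} links")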

def filter_by_extensions(self, extensions: list[str], ignores: list[str] = []):

Return file urls from self.scraper according to a list of extensions.

Parameters
  • extensions: List of file extensions. Return urls that have an extension matching one in this list.

  • ignores: List of urls. Filter out any urls in this list regardless of whether they have an extension matching one in the extensions param.
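This is the helper rip() uses to sort scraped links, so it assumes rip() has already populated self.scraper; a sketch with an illustrative extension list:

# Keep any scraped link whose file extension is .mp4 or .webm,
# skipping links that were already classified as images
video_urls = ripper.filter_by_extensions([".mp4", ".webm"], ignores=ripper.image_urls)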

def download_files(self, urls: list[str], dst: pathlib.Path | str, extra_headers: dict = None, missing_ext_sub: str = '') -> list[tuple[str, int | None]]:

Download a list of files.

Parameters
  • urls: A list of urls to download

  • dst: The destination path to save the files to.

  • extra_headers: Any additional headers to be added to the request.

  • missing_ext_sub: A file extension to use for the saved file if the url doesn't have one.

Returns

A list of files that failed to download. Each element of the list is a tuple where the first element is the file url and the second element is the status code that was returned from requests. If the download failed without a status code, this element will be None.
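A sketch of downloading one of the scraped lists (after rip()) and reporting failures; the folder and header values are placeholders:

failed = ripper.download_files(
    ripper.image_urls,
    "downloads/images",
    extra_headers={"Referer": "https://example.com"},  # illustrative extra header
    missing_ext_sub=".jpg",  # appended when a url has no file extension
)
for url, status in failed:
    # status is the HTTP status code, or None if the request itself raised an exception
    print(f"failed: {url} ({status})")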

def download_all(self, dst: pathlib.Path | str, extra_headers: dict = None, missing_ext_subs: tuple[str, str, str] = ('.jpg', '.mp4', '.mp3')) -> dict[str, list[tuple[str, int | None]]]:

Download all scraped files.

Parameters
  • dst: The destination folder to save to. Separate subfolders for images, videos, and audio will be created.

  • extra_headers: Any additional headers to be added to the request.

  • missing_ext_subs: A three-tuple of file extensions to use for the saved file if the url doesn't have one. The expected order is (image_ext, video_ext, audio_ext)

Returns

Returns files that failed to download. The keys are "images", "videos", and "audio". The values are a list of tuples where the first element is the file url and the second element is the status code that was returned from requests. If the download failed without a status code, this element will be None.
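The returned mapping only has keys for media types that had failures; a sketch of inspecting it after rip() (the destination folder is a placeholder):

failures = ripper.download_all("ripped_media", missing_ext_subs=(".png", ".mp4", ".wav"))
for media_type, fails in failures.items():
    print(f"{media_type}: {len(fails)} downloads failed")
    for url, status in fails:
        print(f"  {url} -> {status}")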

class PixelRipperSelenium(PixelRipper):

Scrape and download media links.

PixelRipperSelenium(headless: bool = True, browser: str = 'firefox')
def get(self, url: str, *args, **kwargs) -> requests.models.Response:

Get webpage using selenium.

Parameters
  • url: The url to scrape for media content.

Returns

A pseudo requests.Response object that only has "text" and "url" members.
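For pages that load their media with JavaScript or infinite scroll, the selenium-backed ripper can be dropped in wherever PixelRipper is used; a sketch assuming a matching webdriver is installed (url and folder are placeholders):

ripper = PixelRipperSelenium(headless=True, browser="firefox")
# rip() calls this selenium-backed get(), which keeps scrolling until the page stops growing
ripper.rip("https://example.com/infinite-scroll")
ripper.download_all("example_media")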

def get_args() -> argparse.Namespace:
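Assuming the module is invoked with python -m (the url, folder, and header values below are illustrative), a command line run using the selenium engine might look like:

python -m pixelripper.pixelripper "https://example.com/gallery" -s -b chrome -o ripped -eh Referer:example.com/gallery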
def main(args: argparse.Namespace = None):