diff --git a/docs/guides/avoid_getting_blocked.mdx b/docs/guides/avoid_getting_blocked.mdx new file mode 100644 index 0000000000..253018ce15 --- /dev/null +++ b/docs/guides/avoid_getting_blocked.mdx @@ -0,0 +1,59 @@ +--- +id: avoid-getting-blocked +title: Avoid getting blocked +description: How to avoid getting blocked when scraping using crawlee-python. +--- +import CodeBlock from '@theme/CodeBlock'; +import ApiLink from '@site/src/components/ApiLink'; + +import PlaywrightSource from '!!raw-loader!./code/avoid_blocking_playwright.py'; +import PlaywrightFingerprintsOffSource from '!!raw-loader!./code/avoid_blocking_playwright_fingerprints_off.py'; + +A scraper might get blocked for numerous reasons. Let's narrow it down to the two main ones. The first is a bad or blocked IP address. You can learn about this topic in the [proxy management guide](./proxy-management). The second reason is [browser fingerprints](https://pixelprivacy.com/resources/browser-fingerprinting/) (or signatures), which we will explore more in this guide. Check the [Apify Academy anti-scraping course](https://docs.apify.com/academy/anti-scraping) to gain a deeper theoretical understanding of blocking and learn a few tips and tricks. + +A browser fingerprint is a collection of browser attributes and significant features that can show if our browser is a bot or a real user. Moreover, most browsers have these unique features that allow the website to track the browser even with different IP addresses. This is the main reason why scrapers should change browser fingerprints while doing browser-based scraping. In return, it should significantly reduce the chance of getting blocked. + +## Using browser fingerprints + +Changing browser fingerprints can be a tedious job. Luckily, Crawlee provides this feature with zero configuration necessary—the usage of fingerprints is enabled by default and available in PlaywrightCrawler. 
So whenever we build a scraper using this crawler, the fingerprints are going to be generated for the default browser and the operating system out of the box. + +## Customizing browser fingerprints + +In certain cases, we want to narrow down the fingerprints used—e.g., specify a certain operating system, locale, or browser. This is also possible with Crawlee; the crawler can have the generation algorithm customized to reflect a particular browser version and more. Let's take a look at the example below: + + +<CodeBlock className="language-python"> + {PlaywrightSource} +</CodeBlock> + + +## Disabling browser fingerprints + +On the contrary, sometimes we want to entirely disable the usage of browser fingerprints. This is easy to do with Crawlee too. All we have to do is set the `use_fingerprints` option to `False` when creating the `BrowserPool`: + + +<CodeBlock className="language-python"> + {PlaywrightFingerprintsOffSource} +</CodeBlock> + + + +## Additional Tips to Avoid Getting Blocked + +### 1. Rotate Proxies +Using a pool of proxies and rotating them can help avoid IP bans. Refer to the [proxy management guide](./proxy-management) for more details. + +### 2. Randomize Request Patterns +Avoid making requests at regular intervals. Introduce random delays between requests to mimic human behavior. + +### 3. Respect Robots.txt +Always check and respect the `robots.txt` file of the website you are scraping to avoid legal issues and reduce the chance of getting blocked. + +**Related links** + +- [Fingerprint Suite Docs](https://github.com/apify/fingerprint-suite) +- [Apify Academy anti-scraping course](https://docs.apify.com/academy/anti-scraping) +- [Proxy Management Guide](./proxy-management) +- [Robots.txt Guide](https://www.robotstxt.org/) + +By following these guidelines and utilizing the features provided by Crawlee, you can significantly reduce the chances of your scraper getting blocked. 
\ No newline at end of file diff --git a/docs/guides/code/avoid_blocking_playwright.py b/docs/guides/code/avoid_blocking_playwright.py new file mode 100644 index 0000000000..fabf65ff2b --- /dev/null +++ b/docs/guides/code/avoid_blocking_playwright.py @@ -0,0 +1,32 @@ +from crawlee.browsers import BrowserPool, PlaywrightBrowserPlugin +from crawlee.playwright_crawler import PlaywrightCrawler + +# Create the PlaywrightBrowserPlugin with customized options +plugin = PlaywrightBrowserPlugin( + browser_type='chromium', # Use 'chromium', 'firefox', or 'webkit' + browser_options={ + 'args': [ + '--no-sandbox', + '--disable-setuid-sandbox', + ], + }, + fingerprint_generator_options={ + 'browsers': [ + { + 'name': 'chromium', # Or 'firefox', or 'webkit' + 'min_version': 96, + }, + ], + 'devices': ['desktop'], # Specify device types directly + 'operating_systems': ['windows'], # Specify OS types directly + }, + use_fingerprints=True, # Enable fingerprinting +) + +# Create the browser pool with the customized plugin +browser_pool = BrowserPool(plugins=[plugin]) + +# Instantiate the PlaywrightCrawler with the customized browser pool +crawler = PlaywrightCrawler( + browser_pool=browser_pool, +) diff --git a/docs/guides/code/avoid_blocking_playwright_fingerprints_off.py b/docs/guides/code/avoid_blocking_playwright_fingerprints_off.py new file mode 100644 index 0000000000..5a8c05f580 --- /dev/null +++ b/docs/guides/code/avoid_blocking_playwright_fingerprints_off.py @@ -0,0 +1,14 @@ +from crawlee.browsers import BrowserPool +from crawlee.playwright_crawler import PlaywrightCrawler + +# Create a browser pool with use_fingerprints set to False +browser_pool = BrowserPool.with_default_plugin( + headless=True, + use_fingerprints=False, +) + +# Instantiate the PlaywrightCrawler with the customized browser pool +crawler = PlaywrightCrawler( + browser_pool=browser_pool, + # Additional parameters if needed +) diff --git 
a/src/crawlee/browsers/_playwright_browser_plugin.py b/src/crawlee/browsers/_playwright_browser_plugin.py index 43b4933b3d..48872fb337 100644 --- a/src/crawlee/browsers/_playwright_browser_plugin.py +++ b/src/crawlee/browsers/_playwright_browser_plugin.py @@ -35,6 +35,8 @@ def __init__( browser_options: Mapping[str, Any] | None = None, page_options: Mapping[str, Any] | None = None, max_open_pages_per_browser: int = 20, + fingerprint_generator_options: Mapping[str, Any] | None = None, + use_fingerprints: bool = False, ) -> None: """Create a new instance. @@ -44,11 +46,15 @@ def __init__( page_options: Options to configure a new page instance. max_open_pages_per_browser: The maximum number of pages that can be opened in a single browser instance. Once reached, a new browser instance will be launched to handle the excess. + fingerprint_generator_options: Options for generating browser fingerprints. + use_fingerprints: Whether to use browser fingerprints. """ self._browser_type = browser_type self._browser_options = browser_options or {} self._page_options = page_options or {} self._max_open_pages_per_browser = max_open_pages_per_browser + self._fingerprint_generator_options = fingerprint_generator_options or {} + self._use_fingerprints = use_fingerprints self._playwright_context_manager = async_playwright() self._playwright: Playwright | None = None @@ -73,6 +79,14 @@ def page_options(self) -> Mapping[str, Any]: def max_open_pages_per_browser(self) -> int: return self._max_open_pages_per_browser + @property + def fingerprint_generator_options(self) -> Mapping[str, Any]: + return self._fingerprint_generator_options + + @property + def use_fingerprints(self) -> bool: + return self._use_fingerprints + @override async def __aenter__(self) -> PlaywrightBrowserPlugin: logger.debug('Initializing Playwright browser plugin.')