docs: Add a new guide on how to avoid getting blocked #576

Open · wants to merge 12 commits into base: master
28 changes: 28 additions & 0 deletions docs/guides/avoid_blocking_playwright.py
@@ -0,0 +1,28 @@
from crawlee.browsers import BrowserPool
from crawlee.playwright_crawler import PlaywrightCrawler

# Create the browser pool with customized fingerprint options
browser_pool = BrowserPool.with_default_plugin(
    headless=True,
    browser_type='chromium',  # Use 'chromium', 'firefox', or 'webkit'
    kwargs={
        'use_fingerprints': True,
        'fingerprint_options': {
            'fingerprint_generator_options': {
                'browsers': [
                    {
                        'name': 'chromium',  # Or 'firefox', or 'webkit'
                        'min_version': 96,
                    },
                ],
                'devices': ['desktop'],  # Specify device types directly
                'operating_systems': ['windows'],  # Specify OS types directly
            },
        },
    },
)
Collaborator:
As I wrote below, additional kwargs can be provided directly. But in this case, I'm not sure whether this is correct. Have you tried to execute it?


# Instantiate the PlaywrightCrawler with the customized browser pool
crawler = PlaywrightCrawler(
    browser_pool=browser_pool,
)
16 changes: 16 additions & 0 deletions docs/guides/avoid_blocking_playwright_fingerprints_off.py
@@ -0,0 +1,16 @@
from crawlee.browsers import BrowserPool
from crawlee.playwright_crawler import PlaywrightCrawler

# Create a browser pool with use_fingerprints set to False
browser_pool = BrowserPool.with_default_plugin(
    headless=True,
    kwargs={
        'use_fingerprints': False,
    },
)
Collaborator:
Additional kwargs can be provided directly.

Suggested change:

-   browser_pool = BrowserPool.with_default_plugin(
-       headless=True,
-       kwargs={
-           'use_fingerprints': False,
-       },
-   )
+   browser_pool = BrowserPool.with_default_plugin(
+       headless=True,
+       use_fingerprints=False,
+   )

Author: For this, it isn't working directly, so I added the extra code in the conversation as well. There is also a new issue now; I have attached screenshots for it.

Collaborator: Yes, this is not going to work, since `use_fingerprints` is a parameter of the plugin, not of the `BrowserPool`.


# Instantiate the PlaywrightCrawler with the customized browser pool
crawler = PlaywrightCrawler(
    browser_pool=browser_pool,
    # Additional parameters if needed
)
67 changes: 67 additions & 0 deletions docs/guides/avoid_getting_blocked.mdx
@@ -0,0 +1,67 @@
---
id: avoid-getting-blocked
title: Avoid getting blocked
description: How to avoid getting blocked when scraping using crawlee-python.
---

import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
import CodeBlock from '@theme/CodeBlock';

import PlaywrightSource from '!!raw-loader!./avoid_blocking_playwright.py';
import PlaywrightFingerprintsOffSource from '!!raw-loader!./avoid_blocking_playwright_fingerprints_off.py';

A scraper might get blocked for numerous reasons. Let's narrow it down to the two main ones. The first is a bad or blocked IP address. You can learn about this topic in the [proxy management guide](./proxy-management). The second reason is [browser fingerprints](https://pixelprivacy.com/resources/browser-fingerprinting/) (or signatures), which we will explore more in this guide. Check the [Apify Academy anti-scraping course](https://docs.apify.com/academy/anti-scraping) to gain a deeper theoretical understanding of blocking and learn a few tips and tricks.

A browser fingerprint is a collection of browser attributes and significant features that can reveal whether our browser is a bot or a real user. Moreover, these features are unique enough that a website can track the browser even across different IP addresses. This is the main reason why scrapers should change browser fingerprints while doing browser-based scraping: doing so significantly reduces the chance of getting blocked.
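To make this concrete, here is an illustrative sketch (not Crawlee code) of the kind of attributes a fingerprinting script collects and why they identify a browser regardless of its IP address; the values below are made up:

```python
import hashlib
import json

# An illustrative subset of the signals a fingerprinting script collects;
# real fingerprints include many more (canvas, fonts, audio, WebGL, etc.).
fingerprint = {
    'user_agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) ...',
    'screen': {'width': 1920, 'height': 1080, 'color_depth': 24},
    'timezone': 'Europe/Prague',
    'languages': ['en-US', 'en'],
}

# Hashing the attributes yields a stable identifier: the same browser
# produces the same hash no matter which IP address it connects from.
fingerprint_id = hashlib.sha256(
    json.dumps(fingerprint, sort_keys=True).encode()
).hexdigest()
print(fingerprint_id[:16])
```

This is why rotating IP addresses alone is often not enough: as long as the fingerprint stays the same, the identifier derived from it stays the same.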

## Using browser fingerprints

Changing browser fingerprints can be a tedious job. Luckily, Crawlee provides this feature with zero configuration necessary: the usage of fingerprints is enabled by default and available in `PlaywrightCrawler`. So whenever we build a scraper using this crawler, fingerprints are generated for the default browser and operating system out of the box.

## Customizing browser fingerprints

In certain cases, we want to narrow down the fingerprints used—e.g., specify a certain operating system, locale, or browser. This is also possible with Crawlee; the crawler can have the generation algorithm customized to reflect a particular browser version and more. Let's take a look at the examples below:

<Tabs groupId="avoid_blocking">
<TabItem value="playwright" label="PlaywrightCrawler" default>
<CodeBlock language="python">
{PlaywrightSource}
</CodeBlock>
</TabItem>

</Tabs>

## Disabling browser fingerprints

On the contrary, sometimes we want to entirely disable the usage of browser fingerprints. This is easy to do with Crawlee too. All we have to do is set the `use_fingerprints` option to `False` when creating the `BrowserPool`:

<Tabs groupId="avoid_blocking">
<TabItem value="playwright" label="PlaywrightCrawler" default>
<CodeBlock language="python">
{PlaywrightFingerprintsOffSource}
</CodeBlock>
</TabItem>

</Tabs>


## Additional Tips to Avoid Getting Blocked

### 1. Rotate Proxies
Using a pool of proxies and rotating them can help avoid IP bans. Refer to the [proxy management guide](./proxy-management) for more details.
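Crawlee's own proxy support is covered in the linked guide; as a language-level illustration only (the proxy URLs below are placeholders), a simple round-robin rotation can be sketched like this:

```python
from itertools import cycle

# Placeholder proxy URLs; substitute your real proxy pool.
proxy_urls = [
    'http://proxy-1.example.com:8000',
    'http://proxy-2.example.com:8000',
    'http://proxy-3.example.com:8000',
]

proxy_pool = cycle(proxy_urls)

def next_proxy() -> str:
    """Return the next proxy in round-robin order, wrapping around at the end."""
    return next(proxy_pool)

print([next_proxy() for _ in range(4)])  # the 4th request reuses proxy-1
```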

### 2. Randomize Request Patterns
Avoid making requests at regular intervals. Introduce random delays between requests to mimic human behavior.
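As a minimal sketch (the fetch step below is a placeholder for whatever request logic your crawler uses), random jitter between requests might look like this:

```python
import asyncio
import random

async def crawl_with_jitter(urls, min_delay=1.0, max_delay=5.0):
    """Visit URLs with a random pause between requests instead of a fixed cadence."""
    results = []
    for url in urls:
        results.append(f'fetched {url}')  # placeholder for the real request
        # Sleep for a random interval so the request pattern is irregular.
        await asyncio.sleep(random.uniform(min_delay, max_delay))
    return results

print(asyncio.run(crawl_with_jitter(
    ['https://example.com/a', 'https://example.com/b'], 0.01, 0.05,
)))
```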

### 3. Respect Robots.txt
Always check and respect the `robots.txt` file of the website you are scraping to avoid legal issues and reduce the chance of getting blocked.
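Python's standard library can parse `robots.txt` rules; here is a small sketch using `urllib.robotparser` (the rules and URLs are made up for illustration):

```python
from urllib.robotparser import RobotFileParser

def is_allowed(robots_txt: str, user_agent: str, url: str) -> bool:
    """Check a URL against robots.txt rules already fetched as text."""
    parser = RobotFileParser()
    parser.parse(robots_txt.splitlines())
    return parser.can_fetch(user_agent, url)

rules = """
User-agent: *
Disallow: /private/
"""

print(is_allowed(rules, 'MyCrawler', 'https://example.com/public/page'))   # True
print(is_allowed(rules, 'MyCrawler', 'https://example.com/private/page'))  # False
```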

**Related links**

- [Fingerprint Suite Docs](https://github.com/apify/fingerprint-suite)
- [Apify Academy anti-scraping course](https://docs.apify.com/academy/anti-scraping)
- [Proxy Management Guide](./proxy-management)
- [Robots.txt Guide](https://www.robotstxt.org/)

By following these guidelines and utilizing the features provided by Crawlee, you can significantly reduce the chances of your scraper getting blocked.