diff --git a/pyproject.toml b/pyproject.toml
index 2a3d21fb42..b974b7b1d3 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -35,10 +35,12 @@ keywords = [
 ]
 dependencies = [
     "async-timeout>=5.0.1",
+    "browserforge>=1.2.4",
     "cachetools>=5.5.0",
     "colorama>=0.4.0",
     "impit>=0.8.0",
     "more-itertools>=10.2.0",
+    "playwright>=1.58.0",
     "protego>=0.5.0",
     "psutil>=6.0.0",
     "pydantic-settings>=2.12.0",
@@ -55,15 +57,15 @@ adaptive-crawler = [
     "jaro-winkler>=2.0.3",
     "playwright>=1.27.0",
     "scikit-learn>=1.6.0",
-    "apify_fingerprint_datapoints>=0.0.3",
+    "apify_fingerprint_datapoints>=0.11.0",
     "browserforge>=1.2.4"
 ]
 beautifulsoup = ["beautifulsoup4[lxml]>=4.12.0", "html5lib>=1.0"]
 cli = ["cookiecutter>=2.6.0", "inquirer>=3.3.0", "rich>=13.9.0", "typer>=0.12.0"]
 curl-impersonate = ["curl-cffi>=0.9.0"]
-httpx = ["httpx[brotli,http2,zstd]>=0.27.0", "apify_fingerprint_datapoints>=0.0.2", "browserforge>=1.2.3"]
+httpx = ["httpx[brotli,http2,zstd]>=0.27.0", "apify_fingerprint_datapoints>=0.11.0", "browserforge>=1.2.3"]
 parsel = ["parsel>=1.10.0"]
-playwright = ["playwright>=1.27.0", "apify_fingerprint_datapoints>=0.0.2", "browserforge>=1.2.3"]
+playwright = ["playwright>=1.27.0", "apify_fingerprint_datapoints>=0.11.0", "browserforge>=1.2.3"]
 otel = [
     "opentelemetry-api>=1.34.1",
     "opentelemetry-distro[otlp]>=0.54",
diff --git a/src/crawlee/browsers/_playwright_browser_controller.py b/src/crawlee/browsers/_playwright_browser_controller.py
index f386a7a903..e1682a637f 100644
--- a/src/crawlee/browsers/_playwright_browser_controller.py
+++ b/src/crawlee/browsers/_playwright_browser_controller.py
@@ -2,12 +2,14 @@
 
 from __future__ import annotations
 
+import inspect
 from asyncio import Lock
 from datetime import datetime, timedelta, timezone
 from typing import TYPE_CHECKING, Any, cast
 
 from browserforge.injectors.playwright import AsyncNewContext
 from playwright.async_api import Browser, BrowserContext, Page, ProxySettings
+from playwright.async_api import BrowserType as PlaywrightBrowserType
 from typing_extensions import override
 
 from crawlee._utils.docs import docs_group
@@ -27,6 +29,18 @@
 
 logger = getLogger(__name__)
 
+# Cache Playwright signatures to avoid overhead in critical path
+_launch_persistent_context_params = set(inspect.signature(PlaywrightBrowserType.launch_persistent_context).parameters)
+_new_context_params = set(inspect.signature(Browser.new_context).parameters)
+
+# The methods are inspected unbound, so `self` is listed as a parameter; it is never a user-facing option.
+_launch_persistent_context_params.discard('self')
+_new_context_params.discard('self')
+
+# Options accepted by only one of the two context-creation methods.
+_persistent_unique_context_options = _launch_persistent_context_params - _new_context_params
+_incognito_unique_context_options = _new_context_params - _launch_persistent_context_params
+
 
 @docs_group('Browser management')
 class PlaywrightBrowserController(BrowserController):
@@ -222,11 +236,35 @@ async def _create_browser_context(
         `self._fingerprint_generator` is available.
         """
         browser_new_context_options = dict(browser_new_context_options) if browser_new_context_options else {}
+
+        # Resolve the option sets for the active mode once instead of re-deciding for every option.
+        if self._use_incognito_pages:
+            # Incognito mode (new_context)
+            supported_options = _new_context_params
+            other_mode_only_options = _persistent_unique_context_options
+            other_mode_name = 'persistent context mode (use_incognito_pages=False)'
+        else:
+            # Persistent mode (launch_persistent_context)
+            supported_options = _launch_persistent_context_params
+            other_mode_only_options = _incognito_unique_context_options
+            other_mode_name = 'incognito context mode (use_incognito_pages=True)'
+
+        filtered_options: dict[str, Any] = {}
+        for key, value in browser_new_context_options.items():
+            if key in supported_options:
+                filtered_options[key] = value
+            elif key in other_mode_only_options:
+                # Valid Playwright option, just not for this mode: drop it but tell the user.
+                logger.warning(f'Option "{key}" is only supported in {other_mode_name} and will be ignored.')
+            else:
+                # Unknown in both modes, most likely a typo: fail loudly instead of silently ignoring it.
+                raise TypeError(f'"{key}" is not a valid Playwright context option.')
+
         if proxy_info:
-            if browser_new_context_options.get('proxy'):
+            if filtered_options.get('proxy'):
                 logger.warning("browser_new_context_options['proxy'] overridden by explicit `proxy_info` argument.")
 
-            browser_new_context_options['proxy'] = ProxySettings(
+            filtered_options['proxy'] = ProxySettings(
                 server=f'{proxy_info.scheme}://{proxy_info.hostname}:{proxy_info.port}',
                 username=proxy_info.username,
                 password=proxy_info.password,
@@ -236,7 +274,7 @@ async def _create_browser_context(
             return await AsyncNewContext(
                 browser=self._browser,
                 fingerprint=self._fingerprint_generator.generate(),
-                **browser_new_context_options,
+                **filtered_options,
             )
 
         if self._header_generator:
@@ -256,7 +294,5 @@ async def _create_browser_context(
         else:
             extra_http_headers = None
 
-        browser_new_context_options['extra_http_headers'] = browser_new_context_options.get(
-            'extra_http_headers', extra_http_headers
-        )
-        return await self._browser.new_context(**browser_new_context_options)
+        filtered_options['extra_http_headers'] = filtered_options.get('extra_http_headers', extra_http_headers)
+        return await self._browser.new_context(**filtered_options)
diff --git a/tests/unit/browsers/test_playwright_controller_validation.py b/tests/unit/browsers/test_playwright_controller_validation.py
new file mode 100644
index 0000000000..034f0b9f77
--- /dev/null
+++ b/tests/unit/browsers/test_playwright_controller_validation.py
@@ -0,0 +1,70 @@
+from __future__ import annotations
+
+import logging
+from typing import TYPE_CHECKING
+
+import pytest
+from playwright.async_api import Browser, Playwright, async_playwright
+
+from crawlee.browsers import PlaywrightBrowserController
+
+if TYPE_CHECKING:
+    from collections.abc import AsyncGenerator
+
+
+@pytest.fixture
+async def playwright() -> AsyncGenerator[Playwright, None]:
+    async with async_playwright() as playwright:
+        yield playwright
+
+
+@pytest.fixture
+async def browser(playwright: Playwright) -> AsyncGenerator[Browser, None]:
+    browser = await playwright.chromium.launch()
+    yield browser
+    await browser.close()
+
+
+async def test_controller_validation_typo(browser: Browser) -> None:
+    controller = PlaywrightBrowserController(browser)
+    with pytest.raises(TypeError, match=r'"headles" is not a valid Playwright context option.'):
+        await controller.new_page(browser_new_context_options={'headles': True})
+    await controller.close()
+
+
+async def test_controller_validation_rejects_self(browser: Browser) -> None:
+    controller = PlaywrightBrowserController(browser)
+    # `self` is listed in the inspected signatures but must not be accepted as an option
+    with pytest.raises(TypeError, match=r'"self" is not a valid Playwright context option'):
+        await controller.new_page(browser_new_context_options={'self': None})
+    await controller.close()
+
+
+async def test_controller_validation_cross_mode_persistent(browser: Browser, caplog: pytest.LogCaptureFixture) -> None:
+    # Default is persistent mode (use_incognito_pages=False)
+    controller = PlaywrightBrowserController(browser, use_incognito_pages=False)
+    # storage_state is incognito-only
+    with caplog.at_level(logging.WARNING):
+        page = await controller.new_page(browser_new_context_options={'storage_state': {'cookies': [], 'origins': []}})
+    assert 'Option "storage_state" is only supported in incognito context mode' in caplog.text
+    await page.close()
+    await controller.close()
+
+
+async def test_controller_validation_cross_mode_incognito(browser: Browser, caplog: pytest.LogCaptureFixture) -> None:
+    controller = PlaywrightBrowserController(browser, use_incognito_pages=True)
+    # env is persistent-only
+    with caplog.at_level(logging.WARNING):
+        page = await controller.new_page(browser_new_context_options={'env': {}})
+    assert 'Option "env" is only supported in persistent context mode' in caplog.text
+    await page.close()
+    await controller.close()
+
+
+async def test_controller_validation_valid_common(browser: Browser) -> None:
+    controller = PlaywrightBrowserController(browser)
+    # viewport is common
+    page = await controller.new_page(browser_new_context_options={'viewport': {'width': 800, 'height': 600}})
+    assert page.viewport_size == {'width': 800, 'height': 600}
+    await page.close()
+    await controller.close()
diff --git a/uv.lock b/uv.lock
index b57b9aabf0..5a9fd5d4ab 100644
--- a/uv.lock
+++ b/uv.lock
@@ -788,10 +788,12 @@ version = "1.5.0"
 source = { editable = "." }
 dependencies = [
     { name = "async-timeout" },
+    { name = "browserforge" },
     { name = "cachetools" },
     { name = "colorama" },
     { name = "impit" },
     { name = "more-itertools" },
+    { name = "playwright" },
     { name = "protego" },
     { name = "psutil" },
     { name = "pydantic" },
@@ -924,12 +926,13 @@ dev = [
 requires-dist = [
     { name = "aiomysql", marker = "extra == 'sql-mysql'", specifier = ">=0.3.2" },
     { name = "aiosqlite", marker = "extra == 'sql-sqlite'", specifier = ">=0.21.0" },
-    { name = "apify-fingerprint-datapoints", marker = "extra == 'adaptive-crawler'", specifier = ">=0.0.3" },
-    { name = "apify-fingerprint-datapoints", marker = "extra == 'httpx'", specifier = ">=0.0.2" },
-    { name = "apify-fingerprint-datapoints", marker = "extra == 'playwright'", specifier = ">=0.0.2" },
+    { name = "apify-fingerprint-datapoints", marker = "extra == 'adaptive-crawler'", specifier = ">=0.11.0" },
+    { name = "apify-fingerprint-datapoints", marker = "extra == 'httpx'", specifier = ">=0.11.0" },
+    { name = "apify-fingerprint-datapoints", marker = "extra == 'playwright'", specifier = ">=0.11.0" },
     { name = "async-timeout", specifier = ">=5.0.1" },
     { name = "asyncpg", marker = "extra == 'sql-postgres'", specifier = ">=0.24.0" },
     { name = "beautifulsoup4", extras = ["lxml"], marker = "extra == 'beautifulsoup'", specifier = ">=4.12.0" },
+    { name = "browserforge", specifier = ">=1.2.4" },
     { name = "browserforge", marker = "extra == 'adaptive-crawler'", specifier = ">=1.2.4" },
     { name = "browserforge", marker = "extra == 'httpx'", specifier = ">=1.2.3" },
     { name = "browserforge", marker = "extra == 'playwright'", specifier = ">=1.2.3" },
@@ -952,6 +955,7 @@ requires-dist = [
     { name = "opentelemetry-sdk", marker = "extra == 'otel'", specifier = ">=1.34.1" },
     { name = "opentelemetry-semantic-conventions", marker = "extra == 'otel'", specifier = ">=0.54" },
     { name = "parsel", marker = "extra == 'parsel'", specifier = ">=1.10.0" },
+    { name = "playwright", specifier = ">=1.58.0" },
     { name = "playwright", marker = "extra == 'adaptive-crawler'", specifier = ">=1.27.0" },
     { name = "playwright", marker = "extra == 'playwright'", specifier = ">=1.27.0" },
     { name = "protego", specifier = ">=0.5.0" },