-
Notifications
You must be signed in to change notification settings - Fork 698
fix(playwright): filter unsupported context options in persistent browser #1796
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
3ce9bb3
694e1ce
57da910
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -35,10 +35,12 @@ keywords = [ | |
| ] | ||
| dependencies = [ | ||
| "async-timeout>=5.0.1", | ||
| "browserforge>=1.2.4", | ||
| "cachetools>=5.5.0", | ||
| "colorama>=0.4.0", | ||
| "impit>=0.8.0", | ||
| "more-itertools>=10.2.0", | ||
| "playwright>=1.58.0", | ||
| "protego>=0.5.0", | ||
| "psutil>=6.0.0", | ||
| "pydantic-settings>=2.12.0", | ||
|
|
@@ -55,15 +57,15 @@ adaptive-crawler = [ | |
| "jaro-winkler>=2.0.3", | ||
| "playwright>=1.27.0", | ||
| "scikit-learn>=1.6.0", | ||
| "apify_fingerprint_datapoints>=0.0.3", | ||
| "apify_fingerprint_datapoints>=0.11.0", | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why? |
||
| "browserforge>=1.2.4" | ||
| ] | ||
| beautifulsoup = ["beautifulsoup4[lxml]>=4.12.0", "html5lib>=1.0"] | ||
| cli = ["cookiecutter>=2.6.0", "inquirer>=3.3.0", "rich>=13.9.0", "typer>=0.12.0"] | ||
| curl-impersonate = ["curl-cffi>=0.9.0"] | ||
| httpx = ["httpx[brotli,http2,zstd]>=0.27.0", "apify_fingerprint_datapoints>=0.0.2", "browserforge>=1.2.3"] | ||
| httpx = ["httpx[brotli,http2,zstd]>=0.27.0", "apify_fingerprint_datapoints>=0.11.0", "browserforge>=1.2.3"] | ||
| parsel = ["parsel>=1.10.0"] | ||
| playwright = ["playwright>=1.27.0", "apify_fingerprint_datapoints>=0.0.2", "browserforge>=1.2.3"] | ||
| playwright = ["playwright>=1.27.0", "apify_fingerprint_datapoints>=0.11.0", "browserforge>=1.2.3"] | ||
| otel = [ | ||
| "opentelemetry-api>=1.34.1", | ||
| "opentelemetry-distro[otlp]>=0.54", | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -2,12 +2,14 @@ | |
|
|
||
| from __future__ import annotations | ||
|
|
||
| import inspect | ||
| from asyncio import Lock | ||
| from datetime import datetime, timedelta, timezone | ||
| from typing import TYPE_CHECKING, Any, cast | ||
|
|
||
| from browserforge.injectors.playwright import AsyncNewContext | ||
| from playwright.async_api import Browser, BrowserContext, Page, ProxySettings | ||
| from playwright.async_api import BrowserType as PlaywrightBrowserType | ||
| from typing_extensions import override | ||
|
|
||
| from crawlee._utils.docs import docs_group | ||
|
|
@@ -27,6 +29,14 @@ | |
|
|
||
| logger = getLogger(__name__) | ||
|
|
||
| # Cache Playwright signatures to avoid overhead in critical path | ||
| _launch_persistent_context_params = set(inspect.signature(PlaywrightBrowserType.launch_persistent_context).parameters) | ||
| _new_context_params = set(inspect.signature(Browser.new_context).parameters) | ||
|
Comment on lines
+33
to
+34
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is it necessary to run these at the import time of the module? |
||
|
|
||
| _common_context_options = _launch_persistent_context_params & _new_context_params | ||
| _persistent_unique_context_options = _launch_persistent_context_params - _new_context_params | ||
| _incognito_unique_context_options = _new_context_params - _launch_persistent_context_params | ||
|
|
||
|
|
||
| @docs_group('Browser management') | ||
| class PlaywrightBrowserController(BrowserController): | ||
|
|
@@ -222,11 +232,36 @@ async def _create_browser_context( | |
| `self._fingerprint_generator` is available. | ||
| """ | ||
| browser_new_context_options = dict(browser_new_context_options) if browser_new_context_options else {} | ||
|
|
||
| filtered_options = {} | ||
| for key, value in browser_new_context_options.items(): | ||
| if self._use_incognito_pages: | ||
| # Incognito mode (new_context) | ||
| if key in _common_context_options or key in _incognito_unique_context_options: | ||
| filtered_options[key] = value | ||
| elif key in _persistent_unique_context_options: | ||
| logger.warning( | ||
| f'Option "{key}" is only supported in persistent context mode ' | ||
| '(use_incognito_pages=False) and will be ignored.' | ||
| ) | ||
| else: | ||
| raise TypeError(f'"{key}" is not a valid Playwright context option.') | ||
| elif key in _common_context_options or key in _persistent_unique_context_options: | ||
| # Persistent mode (launch_persistent_context) | ||
| filtered_options[key] = value | ||
| elif key in _incognito_unique_context_options: | ||
| logger.warning( | ||
| f'Option "{key}" is only supported in incognito context mode ' | ||
| '(use_incognito_pages=True) and will be ignored.' | ||
| ) | ||
| else: | ||
| raise TypeError(f'"{key}" is not a valid Playwright context option.') | ||
|
|
||
| if proxy_info: | ||
| if browser_new_context_options.get('proxy'): | ||
| if filtered_options.get('proxy'): | ||
| logger.warning("browser_new_context_options['proxy'] overridden by explicit `proxy_info` argument.") | ||
|
|
||
| browser_new_context_options['proxy'] = ProxySettings( | ||
| filtered_options['proxy'] = ProxySettings( | ||
| server=f'{proxy_info.scheme}://{proxy_info.hostname}:{proxy_info.port}', | ||
| username=proxy_info.username, | ||
| password=proxy_info.password, | ||
|
|
@@ -236,7 +271,7 @@ async def _create_browser_context( | |
| return await AsyncNewContext( | ||
| browser=self._browser, | ||
| fingerprint=self._fingerprint_generator.generate(), | ||
| **browser_new_context_options, | ||
| **filtered_options, | ||
| ) | ||
|
|
||
| if self._header_generator: | ||
|
|
@@ -256,7 +291,5 @@ async def _create_browser_context( | |
| else: | ||
| extra_http_headers = None | ||
|
|
||
| browser_new_context_options['extra_http_headers'] = browser_new_context_options.get( | ||
| 'extra_http_headers', extra_http_headers | ||
| ) | ||
| return await self._browser.new_context(**browser_new_context_options) | ||
| filtered_options['extra_http_headers'] = filtered_options.get('extra_http_headers', extra_http_headers) | ||
| return await self._browser.new_context(**filtered_options) | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,62 @@ | ||
| from __future__ import annotations | ||
|
|
||
| import logging | ||
| from typing import TYPE_CHECKING | ||
|
|
||
| import pytest | ||
| from playwright.async_api import Browser, Playwright, async_playwright | ||
|
|
||
| from crawlee.browsers import PlaywrightBrowserController | ||
|
|
||
| if TYPE_CHECKING: | ||
| from collections.abc import AsyncGenerator | ||
|
|
||
|
|
||
| @pytest.fixture | ||
| async def playwright() -> AsyncGenerator[Playwright, None]: | ||
| async with async_playwright() as playwright: | ||
| yield playwright | ||
|
|
||
|
|
||
| @pytest.fixture | ||
| async def browser(playwright: Playwright) -> AsyncGenerator[Browser, None]: | ||
| browser = await playwright.chromium.launch() | ||
| yield browser | ||
| await browser.close() | ||
|
|
||
|
|
||
| async def test_controller_validation_typo(browser: Browser) -> None: | ||
| controller = PlaywrightBrowserController(browser) | ||
| with pytest.raises(TypeError, match=r'"headles" is not a valid Playwright context option.'): | ||
| await controller.new_page(browser_new_context_options={'headles': True}) | ||
| await controller.close() | ||
|
|
||
|
|
||
| async def test_controller_validation_cross_mode_persistent(browser: Browser, caplog: pytest.LogCaptureFixture) -> None: | ||
| # Default is persistent mode (use_incognito_pages=False) | ||
| controller = PlaywrightBrowserController(browser, use_incognito_pages=False) | ||
| # storage_state is incognito-only | ||
| with caplog.at_level(logging.WARNING): | ||
| page = await controller.new_page(browser_new_context_options={'storage_state': {'cookies': [], 'origins': []}}) | ||
| assert 'Option "storage_state" is only supported in incognito context mode' in caplog.text | ||
| await page.close() | ||
| await controller.close() | ||
|
|
||
|
|
||
| async def test_controller_validation_cross_mode_incognito(browser: Browser, caplog: pytest.LogCaptureFixture) -> None: | ||
| controller = PlaywrightBrowserController(browser, use_incognito_pages=True) | ||
| # env is persistent-only | ||
| with caplog.at_level(logging.WARNING): | ||
| page = await controller.new_page(browser_new_context_options={'env': {}}) | ||
| assert 'Option "env" is only supported in persistent context mode' in caplog.text | ||
| await page.close() | ||
| await controller.close() | ||
|
|
||
|
|
||
| async def test_controller_validation_valid_common(browser: Browser) -> None: | ||
| controller = PlaywrightBrowserController(browser) | ||
| # viewport is common | ||
| page = await controller.new_page(browser_new_context_options={'viewport': {'width': 800, 'height': 600}}) | ||
| assert page.viewport_size == {'width': 800, 'height': 600} | ||
| await page.close() | ||
| await controller.close() |
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
browserforgeandplaywrightshould not be part of core dependencies