Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 5 additions & 3 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -35,10 +35,12 @@ keywords = [
]
dependencies = [
"async-timeout>=5.0.1",
"browserforge>=1.2.4",
"cachetools>=5.5.0",
"colorama>=0.4.0",
"impit>=0.8.0",
"more-itertools>=10.2.0",
"playwright>=1.58.0",
Comment on lines +38 to +43
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

browserforge and playwright should not be part of core dependencies

"protego>=0.5.0",
"psutil>=6.0.0",
"pydantic-settings>=2.12.0",
Expand All @@ -55,15 +57,15 @@ adaptive-crawler = [
"jaro-winkler>=2.0.3",
"playwright>=1.27.0",
"scikit-learn>=1.6.0",
"apify_fingerprint_datapoints>=0.0.3",
"apify_fingerprint_datapoints>=0.11.0",
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why?

"browserforge>=1.2.4"
]
beautifulsoup = ["beautifulsoup4[lxml]>=4.12.0", "html5lib>=1.0"]
cli = ["cookiecutter>=2.6.0", "inquirer>=3.3.0", "rich>=13.9.0", "typer>=0.12.0"]
curl-impersonate = ["curl-cffi>=0.9.0"]
httpx = ["httpx[brotli,http2,zstd]>=0.27.0", "apify_fingerprint_datapoints>=0.0.2", "browserforge>=1.2.3"]
httpx = ["httpx[brotli,http2,zstd]>=0.27.0", "apify_fingerprint_datapoints>=0.11.0", "browserforge>=1.2.3"]
parsel = ["parsel>=1.10.0"]
playwright = ["playwright>=1.27.0", "apify_fingerprint_datapoints>=0.0.2", "browserforge>=1.2.3"]
playwright = ["playwright>=1.27.0", "apify_fingerprint_datapoints>=0.11.0", "browserforge>=1.2.3"]
otel = [
"opentelemetry-api>=1.34.1",
"opentelemetry-distro[otlp]>=0.54",
Expand Down
47 changes: 40 additions & 7 deletions src/crawlee/browsers/_playwright_browser_controller.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,14 @@

from __future__ import annotations

import inspect
from asyncio import Lock
from datetime import datetime, timedelta, timezone
from typing import TYPE_CHECKING, Any, cast

from browserforge.injectors.playwright import AsyncNewContext
from playwright.async_api import Browser, BrowserContext, Page, ProxySettings
from playwright.async_api import BrowserType as PlaywrightBrowserType
from typing_extensions import override

from crawlee._utils.docs import docs_group
Expand All @@ -27,6 +29,14 @@

logger = getLogger(__name__)

# Cache Playwright signatures to avoid overhead in critical path
_launch_persistent_context_params = set(inspect.signature(PlaywrightBrowserType.launch_persistent_context).parameters)
_new_context_params = set(inspect.signature(Browser.new_context).parameters)
Comment on lines +33 to +34
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it necessary to run these at the import time of the module?


_common_context_options = _launch_persistent_context_params & _new_context_params
_persistent_unique_context_options = _launch_persistent_context_params - _new_context_params
_incognito_unique_context_options = _new_context_params - _launch_persistent_context_params


@docs_group('Browser management')
class PlaywrightBrowserController(BrowserController):
Expand Down Expand Up @@ -222,11 +232,36 @@ async def _create_browser_context(
`self._fingerprint_generator` is available.
"""
browser_new_context_options = dict(browser_new_context_options) if browser_new_context_options else {}

filtered_options = {}
for key, value in browser_new_context_options.items():
if self._use_incognito_pages:
# Incognito mode (new_context)
if key in _common_context_options or key in _incognito_unique_context_options:
filtered_options[key] = value
elif key in _persistent_unique_context_options:
logger.warning(
f'Option "{key}" is only supported in persistent context mode '
'(use_incognito_pages=False) and will be ignored.'
)
else:
raise TypeError(f'"{key}" is not a valid Playwright context option.')
elif key in _common_context_options or key in _persistent_unique_context_options:
# Persistent mode (launch_persistent_context)
filtered_options[key] = value
elif key in _incognito_unique_context_options:
logger.warning(
f'Option "{key}" is only supported in incognito context mode '
'(use_incognito_pages=True) and will be ignored.'
)
else:
raise TypeError(f'"{key}" is not a valid Playwright context option.')

if proxy_info:
if browser_new_context_options.get('proxy'):
if filtered_options.get('proxy'):
logger.warning("browser_new_context_options['proxy'] overridden by explicit `proxy_info` argument.")

browser_new_context_options['proxy'] = ProxySettings(
filtered_options['proxy'] = ProxySettings(
server=f'{proxy_info.scheme}://{proxy_info.hostname}:{proxy_info.port}',
username=proxy_info.username,
password=proxy_info.password,
Expand All @@ -236,7 +271,7 @@ async def _create_browser_context(
return await AsyncNewContext(
browser=self._browser,
fingerprint=self._fingerprint_generator.generate(),
**browser_new_context_options,
**filtered_options,
)

if self._header_generator:
Expand All @@ -256,7 +291,5 @@ async def _create_browser_context(
else:
extra_http_headers = None

browser_new_context_options['extra_http_headers'] = browser_new_context_options.get(
'extra_http_headers', extra_http_headers
)
return await self._browser.new_context(**browser_new_context_options)
filtered_options['extra_http_headers'] = filtered_options.get('extra_http_headers', extra_http_headers)
return await self._browser.new_context(**filtered_options)
62 changes: 62 additions & 0 deletions tests/unit/browsers/test_playwright_controller_validation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
from __future__ import annotations

import logging
from typing import TYPE_CHECKING

import pytest
from playwright.async_api import Browser, Playwright, async_playwright

from crawlee.browsers import PlaywrightBrowserController

if TYPE_CHECKING:
from collections.abc import AsyncGenerator


@pytest.fixture
async def playwright() -> AsyncGenerator[Playwright, None]:
async with async_playwright() as playwright:
yield playwright


@pytest.fixture
async def browser(playwright: Playwright) -> AsyncGenerator[Browser, None]:
browser = await playwright.chromium.launch()
yield browser
await browser.close()


async def test_controller_validation_typo(browser: Browser) -> None:
controller = PlaywrightBrowserController(browser)
with pytest.raises(TypeError, match=r'"headles" is not a valid Playwright context option.'):
await controller.new_page(browser_new_context_options={'headles': True})
await controller.close()


async def test_controller_validation_cross_mode_persistent(browser: Browser, caplog: pytest.LogCaptureFixture) -> None:
# Default is persistent mode (use_incognito_pages=False)
controller = PlaywrightBrowserController(browser, use_incognito_pages=False)
# storage_state is incognito-only
with caplog.at_level(logging.WARNING):
page = await controller.new_page(browser_new_context_options={'storage_state': {'cookies': [], 'origins': []}})
assert 'Option "storage_state" is only supported in incognito context mode' in caplog.text
await page.close()
await controller.close()


async def test_controller_validation_cross_mode_incognito(browser: Browser, caplog: pytest.LogCaptureFixture) -> None:
controller = PlaywrightBrowserController(browser, use_incognito_pages=True)
# env is persistent-only
with caplog.at_level(logging.WARNING):
page = await controller.new_page(browser_new_context_options={'env': {}})
assert 'Option "env" is only supported in persistent context mode' in caplog.text
await page.close()
await controller.close()


async def test_controller_validation_valid_common(browser: Browser) -> None:
controller = PlaywrightBrowserController(browser)
# viewport is common
page = await controller.new_page(browser_new_context_options={'viewport': {'width': 800, 'height': 600}})
assert page.viewport_size == {'width': 800, 'height': 600}
await page.close()
await controller.close()
10 changes: 7 additions & 3 deletions uv.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.