diff --git a/libs/community/langchain_community/document_loaders/url_playwright.py b/libs/community/langchain_community/document_loaders/url_playwright.py index 8071d3717f726..106f15cee6f61 100644 --- a/libs/community/langchain_community/document_loaders/url_playwright.py +++ b/libs/community/langchain_community/document_loaders/url_playwright.py @@ -2,7 +2,7 @@ """ import logging from abc import ABC, abstractmethod -from typing import TYPE_CHECKING, List, Optional +from typing import TYPE_CHECKING, Dict, List, Optional from langchain_core.documents import Document @@ -111,6 +111,22 @@ class PlaywrightURLLoader(BaseLoader): urls (List[str]): List of URLs to load. continue_on_failure (bool): If True, continue loading other URLs on failure. headless (bool): If True, the browser will run in headless mode. + proxy (Optional[Dict[str, str]]): If set, the browser will access URLs + through the specified proxy. + + Example: + .. code-block:: python + + from langchain_community.document_loaders import PlaywrightURLLoader + + urls = ["https://api.ipify.org/?format=json",] + proxy={ + "server": "https://xx.xx.xx:15818", # https://host:port + "username": "username", + "password": "password" + } + loader = PlaywrightURLLoader(urls, proxy=proxy) + data = loader.load() """ def __init__( @@ -120,6 +136,7 @@ def __init__( headless: bool = True, remove_selectors: Optional[List[str]] = None, evaluator: Optional[PlaywrightEvaluator] = None, + proxy: Optional[Dict[str, str]] = None, ): """Load a list of URLs using Playwright.""" try: @@ -133,6 +150,7 @@ def __init__( self.urls = urls self.continue_on_failure = continue_on_failure self.headless = headless + self.proxy = proxy if remove_selectors and evaluator: raise ValueError( @@ -153,7 +171,7 @@ def load(self) -> List[Document]: docs: List[Document] = list() with sync_playwright() as p: - browser = p.chromium.launch(headless=self.headless) + browser = p.chromium.launch(headless=self.headless, proxy=self.proxy) for url in self.urls: try: page 
= browser.new_page() @@ -186,7 +204,7 @@ async def aload(self) -> List[Document]: docs: List[Document] = list() async with async_playwright() as p: - browser = await p.chromium.launch(headless=self.headless) + browser = await p.chromium.launch(headless=self.headless, proxy=self.proxy) for url in self.urls: try: page = await browser.new_page()