Building a Playwright Browser Agent

This tutorial walks through using the LLM tools provided by the Playwright tool spec, which allow LLMs to easily navigate and scrape content from the Internet.

Instantiation

python

%pip install llama-index-tools-playwright llama-index

python

# Set up an async Playwright browser.
# Only async Playwright tools are offered at the moment, to enable more LlamaIndex use cases.

# Install Playwright
!playwright install

# This import is required only in Jupyter notebooks, since they run their own event loop
import nest_asyncio

nest_asyncio.apply()

# Import the tool spec
from llama_index.tools.playwright.base import PlaywrightToolSpec

# Create a headless async browser, then build the tool spec from it
browser = await PlaywrightToolSpec.create_async_playwright_browser(headless=True)
playwright_tool = PlaywrightToolSpec.from_async_browser(browser)

Testing the playwright tools

Listing all tools

python

# Expand the tool spec into its individual tools and print each tool's name.
playwright_tool_list = playwright_tool.to_tool_list()
for tool in playwright_tool_list:
    tool_name = tool.metadata.name
    print(tool_name)

Navigating to the Playwright documentation website

python

await playwright_tool.navigate_to("https://playwright.dev/python/docs/intro")

### Print the current page URL
print(await playwright_tool.get_current_page())

Extract all hyperlinks

python

print(await playwright_tool.extract_hyperlinks())

Extract all text

python

print(await playwright_tool.extract_text())

Get element

Get element attributes for navigating to the next page. You can retrieve the selector from Google Chrome DevTools.

python

element = await playwright_tool.get_elements(
    selector="#__docusaurus_skipToContent_fallback > div > div > main > div > div > div.col.docItemCol_VOVn > div > nav > a",
    attributes=["innerText"],
)
print(element)

Click

Click on the search bar

python

await playwright_tool.click(
    selector="#__docusaurus > nav > div.navbar__inner > div.navbar__items.navbar__items--right > div.navbarSearchContainer_Bca1 > button"
)

Fill

Fill in the search bar with "Mouse click"

python

await playwright_tool.fill(selector="#docsearch-input", value="Mouse click")

Click on the first result; we should be redirected to the Mouse click page.

python

await playwright_tool.click(selector="#docsearch-hits0-item-0")
print(await playwright_tool.get_current_page())

Using the Playwright tool with an agent

To get started, you will need an OpenAI API key.

python

# Set your OpenAI API key (only needed when using OpenAI as the LLM).
# Replace the placeholder value below with your own key.
import os

os.environ.update({"OPENAI_API_KEY": "YOUR_OPENAI_API_KEY"})

python

# FunctionAgent is imported from the agent workflow module, the same module
# this tutorial imports AgentWorkflow from.
from llama_index.core.agent.workflow import FunctionAgent
from llama_index.llms.openai import OpenAI

# Expand the tool spec into the list of tools the agent may call.
playwright_tool_list = playwright_tool.to_tool_list()

# Build an agent that uses gpt-4o and the Playwright tools.
agent = FunctionAgent(
    tools=playwright_tool_list,
    llm=OpenAI(model="gpt-4o"),
)

python

print(
    await agent.run(
        "Navigate to https://blog.samaltman.com/productivity, extract the text on this page and return a summary of the article."
    )
)

Using the Playwright tool with an agent workflow

python

from llama_index.llms.openai import OpenAI
from llama_index.core.agent.workflow import AgentWorkflow

from llama_index.core.agent.workflow import (
    AgentInput,
    AgentOutput,
    ToolCall,
    ToolCallResult,
    AgentStream,
)

python

llm = OpenAI(model="gpt-4o")

workflow = AgentWorkflow.from_tools_or_functions(
    playwright_tool_list,
    llm=llm,
    system_prompt="You are a helpful assistant that can do browser automation and data extraction",
)

handler = workflow.run(
    user_msg="Navigate to https://blog.samaltman.com/productivity, extract the text on this page and return a summary of the article."
)

async for event in handler.stream_events():
    if isinstance(event, AgentStream):
        print(event.delta, end="", flush=True)
        # print(event.response)  # the current full response
        # print(event.raw)  # the raw llm api response
        # print(event.current_agent_name)  # the current agent name
    # elif isinstance(event, AgentInput):
    # print(event.input)  # the current input messages
    # print(event.current_agent_name)  # the current agent name
    # elif isinstance(event, AgentOutput):
    # print(event.response)  # the current full response
    # print(event.tool_calls)  # the selected tool calls, if any
    # print(event.raw)  # the raw llm api response
    elif isinstance(event, ToolCallResult):
        print(event.tool_name)  # the tool name
        print(event.tool_kwargs)  # the tool kwargs
        print(event.tool_output)  # the tool output
    # elif isinstance(event, ToolCall):
    # print(event.tool_name)  # the tool name
    # print(event.tool_kwargs)  # the tool kwargs