examples/ScrapegraphAI_cookbook.ipynb
%%capture
# Install ScrapeGraphAI and the browser tooling it needs to fetch pages.
!pip install scrapegraphai
!apt install chromium-chromedriver
!pip install nest_asyncio
!pip install playwright
!playwright install
# Patch asyncio so event loops can nest — needed because Jupyter/Colab
# already runs an event loop of its own.
import nest_asyncio
nest_asyncio.apply()
# Set your OpenAI API key here (replace the placeholder string before running).
OPENAI_API_KEY = "YOUR API KEY"
For more examples, visit the examples folder of the repository.
SmartScraperGraph is a class representing one of the default scraping pipelines. It uses a direct graph implementation where each node has its own function, from retrieving the HTML of a website to extracting relevant information based on your query and generating a coherent answer.
from scrapegraphai.graphs import SmartScraperGraph
Define the configuration for the graph
# Graph configuration: which LLM drives the pipeline and how chatty the run is.
_llm_settings = {
    "api_key": OPENAI_API_KEY,
    "model": "openai/gpt-4o-mini",
    "temperature": 0,  # deterministic extraction
}
graph_config = {"llm": _llm_settings, "verbose": True}
Create the SmartScraperGraph instance and run it
# NOTE(review): this instance is created but never executed — it is
# immediately superseded by the second SmartScraperGraph built below,
# which is the one whose .run() result gets printed.
smart_scraper_graph = SmartScraperGraph(
    prompt="List me all the projects with their descriptions.",
    # also accepts a string with the already downloaded HTML code
    source="https://perinim.github.io/projects/",
    config=graph_config,
)
# Second configuration: same mini model (default temperature this time),
# with the browser run headless and verbose logging enabled.
graph_config = {
    "llm": {"api_key": OPENAI_API_KEY, "model": "openai/gpt-4o-mini"},
    "verbose": True,
    "headless": True,
}
# ************************************************
# Create the SmartScraperGraph instance and run it
# ************************************************
smart_scraper_graph = SmartScraperGraph(
    prompt="List me all the projects with their description",
    source="https://perinim.github.io/projects/",  # page to scrape
    config=graph_config,
)
# Execute the pipeline; the result is JSON-serializable (it is passed to
# json.dumps in the next cell).
result = smart_scraper_graph.run()
import json

# Pretty-print the scraped result as indented JSON.
# (The original split the string on "\n" and printed it line by line,
# which is equivalent to printing the string once.)
output = json.dumps(result, indent=2)
print(output)
This graph transforms the user prompt into an internet search query, fetches the relevant URLs, and starts the scraping process. It is similar to the SmartScraperGraph but with the addition of the SearchInternetNode node.
from scrapegraphai.graphs import SearchGraph
# Configuration for the search-driven graph: the same mini model,
# temperature pinned to 0 for reproducible answers.
_search_llm = {
    "api_key": OPENAI_API_KEY,
    "model": "openai/gpt-4o-mini",
    "temperature": 0,
}
graph_config = {"llm": _search_llm}
# Create the SearchGraph instance
search_graph = SearchGraph(
    prompt="List me all the European countries. Look in wikipedia.", config=graph_config
)
# Run the pipeline: the prompt is turned into a web search, the relevant
# URLs are fetched, and the answer is extracted from them.
result = search_graph.run()
Prettify the result and display the JSON
import json

# Pretty-print the search result as indented JSON.
# (Splitting on "\n" and printing line by line, as the original did,
# is equivalent to a single print of the dumped string.)
output = json.dumps(result, indent=2)
print(output)
SpeechGraph is a class representing one of the default scraping pipelines; it generates the answer together with an audio file. It is similar to the SmartScraperGraph but with the addition of the TextToSpeechNode node.
from scrapegraphai.graphs import SpeechGraph
# Configuration for the speech pipeline: one LLM to summarise the page and
# a separate text-to-speech model that renders the answer to audio.
tts_settings = {"api_key": OPENAI_API_KEY, "model": "tts-1", "voice": "alloy"}
graph_config = {
    "llm": {
        "api_key": OPENAI_API_KEY,
        "model": "gpt-3.5-turbo",
    },
    "tts_model": tts_settings,
    "output_path": "website_summary.mp3",  # where the audio file is written
}
# Create the SpeechGraph instance
speech_graph = SpeechGraph(
    prompt="Create a summary of the website",
    source="https://perinim.github.io/projects/",
    config=graph_config,
)
# Run the pipeline; besides returning the result it writes the audio file
# to the "output_path" configured above.
result = speech_graph.run()
# Extract the textual answer, falling back to a message if it is missing.
answer = result.get("answer", "No answer found")
Prettify the result and display the JSON
import json

# Pretty-print the answer as indented JSON.
# (The original's split-on-"\n" loop printed exactly the same output.)
output = json.dumps(answer, indent=2)
print(output)
from IPython.display import Audio

# Play back the generated MP3 inside the notebook.
wn = Audio("website_summary.mp3", autoplay=True)
display(wn)
It is possible to build your own scraping pipeline by using the default nodes and placing them as you wish, without using pre-defined graphs.
You can create custom graphs based on your necessities, using standard nodes provided by the library.
The list of existing nodes can be retrieved through the nodes_metadata JSON construct.
# List the names of all nodes shipped with the library.
from scrapegraphai.helpers import nodes_metadata

nodes_metadata.keys()
# Look up the metadata describing a single node.
nodes_metadata["ImageToTextNode"]
To create a custom graph, we must define the models, instantiate the nodes, and connect them into a graph:
from langchain_openai import OpenAIEmbeddings
from scrapegraphai.models import OpenAI
from scrapegraphai.graphs import BaseGraph
from scrapegraphai.nodes import FetchNode, ParseNode, RAGNode, GenerateAnswerNode
# Define the configuration for the graph
graph_config = {
    "llm": {
        "api_key": OPENAI_API_KEY,
        "model": "openai/gpt-4o",
        "temperature": 0,   # deterministic generation
        "streaming": True,  # enable streaming responses
    },
}

# Instantiate the chat model and the embedder used by the RAG node below.
llm_model = OpenAI(graph_config["llm"])
embedder = OpenAIEmbeddings(api_key=llm_model.openai_api_key)
# Node 1: download the page (or read a local directory) and expose the
# document plus any link/image URLs found in it.
fetch_node = FetchNode(
    node_config={"verbose": True, "headless": True},
    input="url | local_dir",
    output=["doc", "link_urls", "img_urls"],
)
# Node 2: split the fetched document into chunks the model can handle.
parse_node = ParseNode(
    node_config={"chunk_size": 4096, "verbose": True},
    input="doc",
    output=["parsed_doc"],
)
# Node 3: retrieve the chunks most relevant to the user prompt, using the
# LLM and the OpenAI embedder instantiated above.
rag_node = RAGNode(
    input="user_prompt & (parsed_doc | doc)",
    output=["relevant_chunks"],
    node_config={
        "llm_model": llm_model,
        "embedder_model": embedder,
        "verbose": True,
    },
)
# Node 4: produce the final answer from the prompt and whichever document
# representation is available (relevant chunks, parsed doc, or raw doc).
generate_answer_node = GenerateAnswerNode(
    input="user_prompt & (relevant_chunks | parsed_doc | doc)",
    output=["answer"],
    node_config={
        "llm_model": llm_model,
        "verbose": True,
    },
)
# create the graph by defining the nodes and their connections
graph = BaseGraph(
    nodes=[
        fetch_node,
        parse_node,
        rag_node,
        generate_answer_node,
    ],
    # Linear pipeline: fetch -> parse -> RAG -> answer generation.
    edges=[
        (fetch_node, parse_node),
        (parse_node, rag_node),
        (rag_node, generate_answer_node),
    ],
    entry_point=fetch_node,  # execution starts at the fetch node
)
# execute the graph, seeding the initial state with the prompt and the URL
result, execution_info = graph.execute(
    {
        "user_prompt": "List me the projects with their description",
        "url": "https://perinim.github.io/projects/",
    }
)

# get the answer from the result
# NOTE: this rebinds `result` from the full state dict to just the answer.
result = result.get("answer", "No answer found.")
Prettify the result and display the JSON
import json

# Pretty-print the answer as indented JSON.
# (The original split the string on "\n" and printed it line by line,
# which produces exactly the same output as one print call.)
output = json.dumps(result, indent=2)
print(output)