examples/ScrapegraphAI_cookbook.ipynb
%%capture
# Install ScrapeGraphAI and the browser tooling it needs to fetch pages.
!pip install scrapegraphai
!apt install chromium-chromedriver
!pip install nest_asyncio
!pip install playwright
!playwright install
# Patch asyncio so event loops can nest — needed because Jupyter/Colab
# already runs an event loop of its own.
import nest_asyncio
nest_asyncio.apply()
# Set your OpenAI API key here (replace the placeholder string before running).
OPENAI_API_KEY = "YOUR API KEY"
For more examples, visit the examples folder of the repository.
SmartScraperGraph is a class representing one of the default scraping pipelines. It uses a direct graph implementation where each node has its own function, from retrieving the HTML of a website to extracting relevant information based on your query and generating a coherent answer.
from scrapegraphai.graphs import SmartScraperGraph
Define the configuration for the graph
# Graph configuration: which LLM drives the pipeline and how chatty the run is.
_llm_settings = {
    "api_key": OPENAI_API_KEY,
    "model": "openai/gpt-4o-mini",
    "temperature": 0,  # deterministic extraction
}
graph_config = {"llm": _llm_settings, "verbose": True}
Create the SmartScraperGraph instance and run it
# NOTE(review): this instance is created but never executed — it is
# immediately superseded by the second SmartScraperGraph built below,
# which is the one whose .run() result gets printed.
smart_scraper_graph = SmartScraperGraph(
    prompt="List me all the projects with their descriptions.",
    # also accepts a string with the already downloaded HTML code
    source="https://perinim.github.io/projects/",
    config=graph_config,
)
# Second configuration: same mini model (default temperature this time),
# with the browser run headless and verbose logging enabled.
graph_config = {
    "llm": {"api_key": OPENAI_API_KEY, "model": "openai/gpt-4o-mini"},
    "verbose": True,
    "headless": True,
}
# ************************************************
# Create the SmartScraperGraph instance and run it
# ************************************************
smart_scraper_graph = SmartScraperGraph(
    prompt="List me all the projects with their description",
    source="https://perinim.github.io/projects/",  # page to scrape
    config=graph_config,
)
# Execute the pipeline; the result is JSON-serializable (it is passed to
# json.dumps in the next cell).
result = smart_scraper_graph.run()
import json

# Pretty-print the scraped result as indented JSON.
# (The original split the string on "\n" and printed it line by line,
# which is equivalent to printing the string once.)
output = json.dumps(result, indent=2)
print(output)
This graph transforms the user prompt into an internet search query, fetches the relevant URLs, and starts the scraping process. It is similar to the SmartScraperGraph but with the addition of the SearchInternetNode node.
from scrapegraphai.graphs import SearchGraph
# Configuration for the search-driven graph: the same mini model,
# temperature pinned to 0 for reproducible answers.
_search_llm = {
    "api_key": OPENAI_API_KEY,
    "model": "openai/gpt-4o-mini",
    "temperature": 0,
}
graph_config = {"llm": _search_llm}
# Create the SearchGraph instance
search_graph = SearchGraph(
    prompt="List me all the European countries. Look in wikipedia.", config=graph_config
)
# Run the pipeline: the prompt is turned into a web search, the relevant
# URLs are fetched, and the answer is extracted from them.
result = search_graph.run()
Prettify the result and display the JSON
import json

# Pretty-print the search result as indented JSON.
# (Splitting on "\n" and printing line by line, as the original did,
# is equivalent to a single print of the dumped string.)
output = json.dumps(result, indent=2)
print(output)
SpeechGraph is a class representing one of the default scraping pipelines; it generates the answer together with an audio file. It is similar to the SmartScraperGraph but with the addition of the TextToSpeechNode node.
from scrapegraphai.graphs import SpeechGraph
# Configuration for the speech pipeline: one LLM to summarise the page and
# a separate text-to-speech model that renders the answer to audio.
tts_settings = {"api_key": OPENAI_API_KEY, "model": "tts-1", "voice": "alloy"}
graph_config = {
    "llm": {
        "api_key": OPENAI_API_KEY,
        "model": "gpt-3.5-turbo",
    },
    "tts_model": tts_settings,
    "output_path": "website_summary.mp3",  # where the audio file is written
}
# Create the SpeechGraph instance
speech_graph = SpeechGraph(
    prompt="Create a summary of the website",
    source="https://perinim.github.io/projects/",
    config=graph_config,
)
# Run the pipeline; besides returning the result it writes the audio file
# to the "output_path" configured above.
result = speech_graph.run()
# Extract the textual answer, falling back to a message if it is missing.
answer = result.get("answer", "No answer found")
Prettify the result and display the JSON
import json

# Pretty-print the answer as indented JSON.
# (The original's split-on-"\n" loop printed exactly the same output.)
output = json.dumps(answer, indent=2)
print(output)
from IPython.display import Audio

# Play back the generated MP3 inside the notebook.
wn = Audio("website_summary.mp3", autoplay=True)
display(wn)
It is possible to build your own scraping pipeline by using the default nodes and placing them as you wish, without using pre-defined graphs.
You can create custom graphs based on your necessities, using standard nodes provided by the library.
The list of existing nodes can be retrieved through the nodes_metadata JSON construct.
# List the names of all nodes shipped with the library.
from scrapegraphai.helpers import nodes_metadata

nodes_metadata.keys()
# Look up the metadata describing a single node.
nodes_metadata["ImageToTextNode"]
To create a custom graph, we must define the models, instantiate the nodes, and connect them into a graph:
from langchain_openai import OpenAIEmbeddings
from scrapegraphai.models import OpenAI
from scrapegraphai.graphs import BaseGraph
from scrapegraphai.nodes import FetchNode, ParseNode, RAGNode, GenerateAnswerNode
# Define the configuration for the graph
graph_config = {
    "llm": {
        "api_key": OPENAI_API_KEY,
        "model": "openai/gpt-4o",
        "temperature": 0,   # deterministic generation
        "streaming": True,  # enable streaming responses
    },
}

# Instantiate the chat model and the embedder used by the RAG node below.
llm_model = OpenAI(graph_config["llm"])
embedder = OpenAIEmbeddings(api_key=llm_model.openai_api_key)
# Node 1: download the page (or read a local directory) and expose the
# document plus any link/image URLs found in it.
fetch_node = FetchNode(
    node_config={"verbose": True, "headless": True},
    input="url | local_dir",
    output=["doc", "link_urls", "img_urls"],
)
# Node 2: split the fetched document into chunks the model can handle.
parse_node = ParseNode(
    node_config={"chunk_size": 4096, "verbose": True},
    input="doc",
    output=["parsed_doc"],
)
# Node 3: retrieve the chunks most relevant to the user prompt, using the
# LLM and the OpenAI embedder instantiated above.
rag_node = RAGNode(
    input="user_prompt & (parsed_doc | doc)",
    output=["relevant_chunks"],
    node_config={
        "llm_model": llm_model,
        "embedder_model": embedder,
        "verbose": True,
    },
)
# Node 4: produce the final answer from the prompt and whichever document
# representation is available (relevant chunks, parsed doc, or raw doc).
generate_answer_node = GenerateAnswerNode(
    input="user_prompt & (relevant_chunks | parsed_doc | doc)",
    output=["answer"],
    node_config={
        "llm_model": llm_model,
        "verbose": True,
    },
)
# create the graph by defining the nodes and their connections
graph = BaseGraph(
    nodes=[
        fetch_node,
        parse_node,
        rag_node,
        generate_answer_node,
    ],
    # Linear pipeline: fetch -> parse -> RAG -> answer generation.
    edges=[
        (fetch_node, parse_node),
        (parse_node, rag_node),
        (rag_node, generate_answer_node),
    ],
    entry_point=fetch_node,  # execution starts at the fetch node
)
# execute the graph, seeding the initial state with the prompt and the URL
result, execution_info = graph.execute(
    {
        "user_prompt": "List me the projects with their description",
        "url": "https://perinim.github.io/projects/",
    }
)

# get the answer from the result
# NOTE: this rebinds `result` from the full state dict to just the answer.
result = result.get("answer", "No answer found.")
Prettify the result and display the JSON
import json

# Pretty-print the answer as indented JSON.
# (The original split the string on "\n" and printed it line by line,
# which produces exactly the same output as one print call.)
output = json.dumps(result, indent=2)
print(output)