docs/releases_review/v0.7.5_video_walkthrough.ipynb
Welcome to Crawl4AI v0.7.5! This notebook demonstrates all the new features introduced in this release.
First, let's make sure we have the latest version installed:
# Install or upgrade to v0.7.5
# !pip install -U crawl4ai==0.7.5 --quiet
# Import required modules
import asyncio
import nest_asyncio
nest_asyncio.apply() # For Jupyter compatibility
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
from crawl4ai import FilterChain, URLPatternFilter, BFSDeepCrawlStrategy
from crawl4ai import hooks_to_string
print("ā
Crawl4AI v0.7.5 ready!")
v0.7.5 introduces a completely new Docker Hooks System that lets you inject custom Python functions at 8 key points in the crawling pipeline. This gives you full control over browser setup, navigation, and HTML retrieval.
The Docker Hooks System offers three approaches; the one showcased in this notebook is the hooks_to_string() utility, which converts Python functions to strings. All three approaches are NEW in v0.7.5!
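Since hooks are shipped to the Docker server as strings, the most direct approach is to author one as a string yourself; hooks_to_string(), shown later, generates the same format from a real function. A minimal sketch (the hook name and signature mirror the function-based examples later in this notebook):
# Sketch: a hook authored directly in its string form - the same format hooks_to_string() produces
raw_hook_source = '''
async def before_goto(page, context, url, **kwargs):
    # Tag every outgoing navigation before it happens
    await page.set_extra_http_headers({"X-Demo": "raw-string-hook"})
    return page
'''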
When crawling HTTPS sites, internal links sometimes get downgraded to HTTP, breaking authentication and causing security warnings.
The new preserve_https_for_internal_links=True parameter maintains HTTPS protocol for all internal links.
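At its core this is a single flag on CrawlerRunConfig. The full demo below exercises it with deep crawling; a minimal sketch of just the flag:
# Minimal sketch: one flag keeps internal links on the https:// scheme
minimal_config = CrawlerRunConfig(
    preserve_https_for_internal_links=True,  # 🆕 NEW in v0.7.5
)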
async def demo_https_preservation():
    """
    Demonstrate HTTPS preservation with deep crawling
    """
    print("🔒 Testing HTTPS Preservation\n")
    print("=" * 60)

    # Set up a URL filter for quotes.toscrape.com
    url_filter = URLPatternFilter(
        patterns=[r"^(https://)?quotes\.toscrape\.com(/.*)?$"]
    )

    # Configure the crawler with HTTPS preservation
    config = CrawlerRunConfig(
        exclude_external_links=True,
        preserve_https_for_internal_links=True,  # 🆕 NEW in v0.7.5
        cache_mode=CacheMode.BYPASS,
        deep_crawl_strategy=BFSDeepCrawlStrategy(
            max_depth=2,
            max_pages=5,
            filter_chain=FilterChain([url_filter])
        )
    )

    async with AsyncWebCrawler() as crawler:
        # With deep_crawl_strategy, arun() returns a list of CrawlResult objects
        results = await crawler.arun(
            url="https://quotes.toscrape.com",
            config=config
        )

    # Analyze the first result
    if results:
        first_result = results[0]
        internal_links = [link['href'] for link in first_result.links['internal']]

        # Check HTTPS preservation
        https_links = [link for link in internal_links if link.startswith('https://')]
        http_links = [link for link in internal_links if link.startswith('http://')]

        print(f"\n📊 Results:")
        print(f"   Pages crawled: {len(results)}")
        print(f"   Total internal links (from first page): {len(internal_links)}")
        print(f"   HTTPS links: {len(https_links)} ✅")
        print(f"   HTTP links: {len(http_links)} {'⚠️' if http_links else ''}")

        if internal_links:
            print(f"   HTTPS preservation rate: {len(https_links)/len(internal_links)*100:.1f}%")

        print(f"\n🔗 Sample HTTPS-preserved links:")
        for link in https_links[:5]:
            print(f"   → {link}")
    else:
        print("\n⚠️ No results returned")

    print("\n" + "=" * 60)
    print("✅ HTTPS Preservation Demo Complete!\n")

# Run the demo
await demo_https_preservation()
v0.7.5 also enhances LLM integration:
- temperature parameter for creativity control
- base_url for custom API endpoints
from crawl4ai import LLMExtractionStrategy, LLMConfig
from pydantic import BaseModel, Field
import os
# Define extraction schema
class Article(BaseModel):
    title: str = Field(description="Article title")
    summary: str = Field(description="Brief summary of the article")
    main_topics: list[str] = Field(description="List of main topics covered")

async def demo_enhanced_llm():
    """
    Demonstrate enhanced LLM integration with custom temperature
    """
    print("🤖 Testing Enhanced LLM Integration\n")
    print("=" * 60)

    # Check for API key
    api_key = os.getenv('OPENAI_API_KEY')
    if not api_key:
        print("⚠️ Note: Set OPENAI_API_KEY environment variable to test LLM extraction")
        print("For this demo, we'll show the configuration only.\n")
        print("📝 Example LLM Configuration with new v0.7.5 features:")
        print("""
    llm_strategy = LLMExtractionStrategy(
        llm_config=LLMConfig(
            provider="openai/gpt-4o-mini",
            api_token="your-api-key",
            temperature=0.7,             # 🆕 NEW: Control creativity (0.0-2.0)
            base_url="custom-endpoint"   # 🆕 NEW: Custom API endpoint
        ),
        schema=Article.schema(),
        extraction_type="schema",
        instruction="Extract article information"
    )
        """)
        return

    # Create LLM extraction strategy with custom temperature
    llm_strategy = LLMExtractionStrategy(
        llm_config=LLMConfig(
            provider="openai/gpt-4o-mini",
            api_token=api_key,
            temperature=0.3,  # Lower temperature for more focused extraction
        ),
        schema=Article.schema(),
        extraction_type="schema",
        instruction="Extract the article title, a brief summary, and main topics discussed."
    )

    config = CrawlerRunConfig(
        extraction_strategy=llm_strategy,
        cache_mode=CacheMode.BYPASS
    )

    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            url="https://en.wikipedia.org/wiki/Artificial_intelligence",
            config=config
        )

    if result.success:
        print("\n✅ LLM Extraction Successful!")
        print(f"\n📝 Extracted Content:")
        print(result.extracted_content)
    else:
        print(f"\n❌ Extraction failed: {result.error_message}")

    print("\n" + "=" * 60)
    print("✅ Enhanced LLM Demo Complete!\n")

# Run the demo
await demo_enhanced_llm()
First, let's create some hook functions that we can reuse:
# Define reusable hooks as Python functions
async def block_images_hook(page, context, **kwargs):
    """
    Performance optimization: Block images to speed up crawling
    """
    print("[Hook] Blocking images for faster loading...")
    await context.route(
        "**/*.{png,jpg,jpeg,gif,webp,svg,ico}",
        lambda route: route.abort()
    )
    return page

async def set_viewport_hook(page, context, **kwargs):
    """
    Set consistent viewport size for rendering
    """
    print("[Hook] Setting viewport to 1920x1080...")
    await page.set_viewport_size({"width": 1920, "height": 1080})
    return page

async def add_custom_headers_hook(page, context, url, **kwargs):
    """
    Add custom headers before navigation
    """
    print(f"[Hook] Adding custom headers for {url}...")
    await page.set_extra_http_headers({
        'X-Crawl4AI-Version': '0.7.5',
        'X-Custom-Header': 'docker-hooks-demo',
        'Accept-Language': 'en-US,en;q=0.9'
    })
    return page

async def scroll_page_hook(page, context, **kwargs):
    """
    Scroll page to load lazy-loaded content
    """
    print("[Hook] Scrolling page to load lazy content...")
    await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
    await page.wait_for_timeout(1000)
    await page.evaluate("window.scrollTo(0, 0)")
    await page.wait_for_timeout(500)
    return page

async def log_page_metrics_hook(page, context, **kwargs):
    """
    Log page metrics before extracting HTML
    """
    metrics = await page.evaluate('''
        () => ({
            images: document.images.length,
            links: document.links.length,
            scripts: document.scripts.length,
            title: document.title
        })
    ''')
    print(f"[Hook] Page Metrics - Title: {metrics['title']}")
    print(f"   Images: {metrics['images']}, Links: {metrics['links']}, Scripts: {metrics['scripts']}")
    return page

print("✅ Reusable hook library created!")
print("\n📋 Available hooks:")
print("  • block_images_hook - Speed optimization")
print("  • set_viewport_hook - Consistent rendering")
print("  • add_custom_headers_hook - Custom headers")
print("  • scroll_page_hook - Lazy content loading")
print("  • log_page_metrics_hook - Page analytics")
The new hooks_to_string() utility converts Python function objects to strings that can be sent to the Docker API:
# Convert functions to strings using the NEW utility
hooks_as_strings = hooks_to_string({
    "on_page_context_created": block_images_hook,
    "before_goto": add_custom_headers_hook,
    "before_retrieve_html": scroll_page_hook,
})

print("✅ Converted 3 hook functions to string format")
print("\n📝 Example of converted hook (first 200 chars):")
print(hooks_as_strings["on_page_context_created"][:200] + "...")

print("\n💡 Benefits of hooks_to_string():")
print("  ✓ Write hooks as Python functions (IDE support, type checking)")
print("  ✓ Automatically converts to string format for Docker API")
print("  ✓ Reusable across projects")
print("  ✓ Easy to test and debug")
The Docker Hooks System provides 8 strategic points where you can inject custom behavior: on_browser_created, on_page_context_created, on_user_agent_updated, on_execution_started, before_goto, after_goto, before_retrieve_html, and before_return_html.
Note: For a complete demonstration of all three Docker Hooks approaches, see the separate file: v0.7.5_docker_hooks_demo.py.
This standalone Python script provides comprehensive, runnable examples of the entire Docker Hooks System.
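If you are calling a Crawl4AI Docker server over HTTP, the converted hooks travel inside the request payload. The sketch below is hypothetical: the /crawl endpoint, default port 11235, and the "hooks" field name are assumptions here, so check v0.7.5_docker_hooks_demo.py for the actual request shape:
import requests  # assumes the requests package is available

payload = {
    "urls": ["https://example.com"],
    "hooks": hooks_to_string({"before_goto": add_custom_headers_hook}),  # assumed field name
}
# Assumed endpoint and port - verify against the demo script before running:
# response = requests.post("http://localhost:11235/crawl", json=payload)
# print(response.json())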
# OLD WAY (deprecated)
# browser_config = BrowserConfig(proxy="http://proxy:8080")

# NEW WAY (v0.7.5)
browser_config_with_proxy = BrowserConfig(
    proxy_config={
        "server": "http://proxy.example.com:8080",
        "username": "optional-username",  # Optional
        "password": "optional-password"   # Optional
    }
)

print("✅ New proxy configuration format demonstrated")
print("\n💡 Benefits:")
print("  • More explicit and clear")
print("  • Better authentication support")
print("  • Consistent with industry standards")
Let's create a real-world example that uses multiple v0.7.5 features together:
async def complete_demo():
    """
    Comprehensive demo combining multiple v0.7.5 features
    """
    print("🎯 Complete v0.7.5 Feature Demo\n")
    print("=" * 60)

    # Use function-based hooks (NEW Docker Hooks System)
    print("\n1️⃣ Using Docker Hooks System (NEW!)")
    hooks = {
        "on_page_context_created": set_viewport_hook,
        "before_goto": add_custom_headers_hook,
        "before_retrieve_html": log_page_metrics_hook
    }

    # Convert to strings using the NEW utility
    hooks_strings = hooks_to_string(hooks)
    print(f"  ✓ Converted {len(hooks_strings)} hooks to string format")
    print("  ✓ Ready to send to Docker API")

    # Use HTTPS preservation
    print("\n2️⃣ Enabling HTTPS Preservation")
    url_filter = URLPatternFilter(
        patterns=[r"^(https://)?example\.com(/.*)?$"]
    )
    config = CrawlerRunConfig(
        exclude_external_links=True,
        preserve_https_for_internal_links=True,  # v0.7.5 feature
        cache_mode=CacheMode.BYPASS,
        deep_crawl_strategy=BFSDeepCrawlStrategy(
            max_depth=1,
            max_pages=3,
            filter_chain=FilterChain([url_filter])
        )
    )
    print("  ✓ HTTPS preservation enabled")

    # Use the new proxy config format
    print("\n3️⃣ Using New Proxy Configuration Format")
    browser_config = BrowserConfig(
        headless=True,
        # proxy_config={  # Uncomment if you have a proxy
        #     "server": "http://proxy:8080"
        # }
    )
    print("  ✓ New proxy config format ready")

    # Run the crawl
    print("\n4️⃣ Executing Crawl with All Features")
    async with AsyncWebCrawler(config=browser_config) as crawler:
        # With deep_crawl_strategy, arun() returns a list
        results = await crawler.arun(
            url="https://example.com",
            config=config
        )

    if results:
        result = results[0]  # First result
        print("  ✓ Crawl successful!")
        print(f"\n📊 Results:")
        print(f"  • Pages crawled: {len(results)}")
        print(f"  • Title: {result.metadata.get('title', 'N/A')}")
        print(f"  • Content length: {len(result.markdown.raw_markdown)} characters")
        print(f"  • Links found: {len(result.links['internal']) + len(result.links['external'])}")
    else:
        print("  ⚠️ No results returned")

    print("\n" + "=" * 60)
    print("✅ Complete Feature Demo Finished!\n")

# Run complete demo
await complete_demo()
✅ HTTPS Preservation - Maintain secure protocols throughout crawling
✅ Enhanced LLM Integration - Custom temperature and provider configuration
✅ Docker Hooks System (NEW!) - Complete pipeline customization with 3 approaches
✅ hooks_to_string() Utility (NEW!) - Convert functions for Docker API
✅ Bug Fixes - New proxy config format and multiple improvements
The Docker Hooks System is completely NEW in v0.7.5, offering complete pipeline customization with three approaches. See v0.7.5_docker_hooks_demo.py for complete Docker Hooks examples. Happy Crawling with v0.7.5! 🚀