apps/test-suite/index-benchmark/run.ipynb
from firecrawl import FirecrawlApp, ScrapeOptions
import os
from dotenv import load_dotenv
from datetime import datetime
import statistics
import requests
from time import sleep
load_dotenv()
app = FirecrawlApp(api_key=os.getenv("FIRECRAWL_API_KEY"))
scrape_urls = [
    'https://news.ycombinator.com',           # Hacker News (simple, fast-loading)
    'https://httpbin.org',                    # HTTP testing service (very reliable)
    'https://example.com',                    # Standard test domain (minimal content)
    'https://github.com/microsoft/vscode',    # GitHub repo page (structured content)
    'https://stackoverflow.com/questions',    # Stack Overflow questions page
    'https://www.wikipedia.org',              # Wikipedia main page (rich content)
    'https://jsonplaceholder.typicode.com',   # Fake API for testing
    'https://httpstat.us/200',                # HTTP status testing (minimal response)
    'https://docs.python.org/3/',             # Python documentation (structured docs)
]
crawl_urls = [  # trailing numbers: page counts (presumably from earlier crawl runs)
    "https://www.pcbgogo.com",              # 7825
    "https://github.com/Uniswap/v4-core",   # 7353
    "https://www.arcep.fr/actualites",      # 9764
    "https://www.synapticure.com",          # 7746
    "https://www.elecrow.com",              # 8025
    "https://www.idfcfirstbank.com",        # 9912
    "https://www.todaytix.com",             # 7532
    "https://www.wheel-size.com",           # 7102
    "https://drymerge.com",                 # 8422
    "https://telegramindex.org",            # 5335
]
Hypothesis: Indexed scrapes are faster
scrape_times_no_cache = []
scrape_times_cached = []
for i, url in enumerate(scrape_urls):
    print(f"Testing {i+1}/{len(scrape_urls)}: {url}")

    # No cache: maxAge=1 (ms) is too short for any index entry to qualify
    try:
        start = datetime.now()
        doc = app.scrape_url(url, maxAge=1)
        no_cache_time = (datetime.now() - start).total_seconds()
        scrape_times_no_cache.append(no_cache_time)
        print(f"  No cache: {no_cache_time:.2f}s ({doc.metadata['scrapeId']})")
    except Exception as e:
        print(f"  No cache: FAILED - {e}")
        scrape_times_no_cache.append(None)

    print("  Waiting for cache to propagate...")
    sleep(17)

    # Cached: maxAge=100000 (ms) accepts index entries up to 100s old
    try:
        start = datetime.now()
        doc = app.scrape_url(url, maxAge=100000)
        cached_time = (datetime.now() - start).total_seconds()
        scrape_times_cached.append(cached_time)
        print(f"  Cached: {cached_time:.2f}s ({doc.metadata['scrapeId']})")
    except Exception as e:
        print(f"  Cached: FAILED - {e}")
        scrape_times_cached.append(None)
# Calculate averages
valid_no_cache = [t for t in scrape_times_no_cache if t is not None]
valid_cached = [t for t in scrape_times_cached if t is not None]
if valid_no_cache and valid_cached:
    avg_no_cache = statistics.mean(valid_no_cache)
    avg_cached = statistics.mean(valid_cached)
    speedup = avg_no_cache / avg_cached if avg_cached > 0 else 0

    print("SCRAPE RESULTS:")
    print(f"Average no cache: {avg_no_cache:.2f}s")
    print(f"Average cached: {avg_cached:.2f}s")
    print(f"Speedup: {speedup:.1f}x faster with cache")
    print(f"Time saved: {avg_no_cache - avg_cached:.2f}s per request")
--- For now, the index is only used to improve map.
Hypothesis: Indexed crawls are faster
crawl_times_no_cache = []
crawl_times_cached = []  # reserved: no cached-crawl comparison yet (see note above)

for i, url in enumerate(crawl_urls):
    try:
        print(f"Crawling {i+1}/{len(crawl_urls)}: {url}")
        start = datetime.now()
        result = app.crawl_url(url)
        crawl_time = (datetime.now() - start).total_seconds()
        crawl_times_no_cache.append(crawl_time)
        print(f"  Crawled in {crawl_time:.2f}s")
    except Exception as e:
        print(f"{url} - Crawl FAILED - {e}")
Hypothesis: Indexed map should return more URLs after a crawl
def map_request(url, ignore_index):
    """
    Make a /v1/map request and return the links.
    """
    payload = {"url": url, "useIndex": not ignore_index, "limit": 30000}
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {os.getenv('FIRECRAWL_API_KEY')}",
    }
    response = requests.post("https://api.firecrawl.dev/v1/map", headers=headers, json=payload)

    if response.status_code == 200:
        data = response.json()
        return data.get("links", [])
    else:
        print(response.json())
        return []
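One failure mode worth guarding against in the sweep below: a transient 429/5xx makes map_request return an empty list, which gets recorded as a misleading 0-URL result. A small retry wrapper, as a sketch (retry count and backoff are arbitrary):
def map_request_with_retry(url, ignore_index, retries=3, backoff=5):
    """Retry map_request on empty results (note: a genuinely empty site would also retry)."""
    for attempt in range(retries):
        links = map_request(url, ignore_index)
        if links:
            return links
        print(f"  attempt {attempt + 1} returned no links, retrying in {backoff}s...")
        sleep(backoff)
    return []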
map_times_no_cache = []
map_times_cached = []
map_url_counts_no_cache = []
map_url_counts_cached = []
for i, url in enumerate(crawl_urls):
    print(f"Testing {i+1}/{len(crawl_urls)}: {url}")

    # Without the index (ignore_index=True)
    start = datetime.now()
    links_no_index = map_request(url, True)
    time_no_index = (datetime.now() - start).total_seconds()
    map_times_no_cache.append(time_no_index)
    map_url_counts_no_cache.append(len(links_no_index))
    print(f"  No index: {time_no_index:.2f}s, {len(links_no_index)} URLs")

    # With the index (ignore_index=False)
    start = datetime.now()
    links_indexed = map_request(url, False)
    time_indexed = (datetime.now() - start).total_seconds()
    map_times_cached.append(time_indexed)
    map_url_counts_cached.append(len(links_indexed))
    print(f"  With index: {time_indexed:.2f}s, {len(links_indexed)} URLs")
# Calculate averages
avg_time_no_cache = statistics.mean(map_times_no_cache)
avg_time_cached = statistics.mean(map_times_cached)
avg_urls_no_cache = statistics.mean(map_url_counts_no_cache)
avg_urls_cached = statistics.mean(map_url_counts_cached)
time_speedup = avg_time_no_cache / avg_time_cached if avg_time_cached > 0 else 0
url_difference = avg_urls_cached - avg_urls_no_cache
url_percentage = (avg_urls_cached / avg_urls_no_cache * 100) if avg_urls_no_cache > 0 else 0
print("MAP RESULTS:")
print(f"Average time (no cache): {avg_time_no_cache:.2f}s")
print(f"Average time (cached): {avg_time_cached:.2f}s")
print(f"Time speedup: {time_speedup:.2f}x faster with cache")
print(f"Average URLs found (no cache): {avg_urls_no_cache:.1f}")
print(f"Average URLs found (cached): {avg_urls_cached:.1f}")
print(f"URL difference: {url_difference:+.1f} URLs with cache")
print(f"URL percentage: {url_percentage:.1f}% of no-cache results")
if url_difference > 0:
    print("✅ Cache finds MORE URLs")
elif url_difference < 0:
    print("⚠️ Cache finds FEWER URLs")
else:
    print("➡️ Cache finds SAME number of URLs")