docs/examples/evaluation/Deepeval.ipynb
<a href="https://colab.research.google.com/github/run-llama/llama_index/blob/main/docs/examples/evaluation/Deepeval.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>
This code tutorial shows how you can easily trace and evaluate your LlamaIndex Agents. You can read more about the DeepEval framework here: https://docs.confident-ai.com/docs/getting-started
LlamaIndex's integration with DeepEval allows you to trace your LlamaIndex agents and evaluate them using DeepEval's default metrics. Read more about the integration here: https://deepeval.com/integrations/frameworks/llamaindex
Feel free to check out our repository here on GitHub: https://github.com/confident-ai/deepeval
Install the following packages:
!pip install -q llama-index
!pip install -U -q deepeval
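The agent in this tutorial calls OpenAI, so an OPENAI_API_KEY must be available before you run any cells. A minimal way to set it inside a notebook (the getpass prompt is just one option, not part of the original tutorial):
import os
from getpass import getpass

# Prompt for the key so it never gets saved in the notebook itself
os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")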
This step is optional and only needed if you want a server-hosted dashboard! (Psst, I think you should!)
!deepeval login
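If you prefer a non-interactive login, you can also authenticate in code, as the standalone scripts later in this tutorial do, by reading an API key from the environment:
import os
import deepeval

# Assumes CONFIDENT_API_KEY is set in your environment (or a .env file)
deepeval.login(os.getenv("CONFIDENT_API_KEY"))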
DeepEval lets you evaluate LlamaIndex applications end-to-end in under a minute.
Create a FunctionAgent with the list of metrics you wish to use, then call your LlamaIndex application's run method as usual.
import asyncio

from llama_index.llms.openai import OpenAI
import llama_index.core.instrumentation as instrument
from deepeval.integrations.llama_index import (
    instrument_llama_index,
    FunctionAgent,
)
from deepeval.metrics import AnswerRelevancyMetric

instrument_llama_index(instrument.get_dispatcher())


def multiply(a: float, b: float) -> float:
    """Useful for multiplying two numbers."""
    return a * b
answer_relevancy_metric = AnswerRelevancyMetric()

agent = FunctionAgent(
    tools=[multiply],
    llm=OpenAI(model="gpt-4o-mini"),
    system_prompt="You are a helpful assistant that can perform calculations.",
    metrics=[answer_relevancy_metric],
)


async def llm_app(input: str):
    return await agent.run(input)
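# Note: asyncio.run() assumes no event loop is already running. Inside Jupyter,
# prefer a top-level `await llm_app("What is 2 * 3?")`, or see the note on
# notebooks further below.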
asyncio.run(llm_app("What is 2 * 3?"))
Evaluations are supported for the LlamaIndex FunctionAgent, ReActAgent, and CodeActAgent. Only metrics whose required LLM test case parameters are input and output are eligible for evaluation.
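The same pattern should carry over to the other agent types; here is a minimal sketch for a ReActAgent, assuming the integration exports a ReActAgent wrapper alongside FunctionAgent (check the integration docs for the exact import):
from deepeval.integrations.llama_index import ReActAgent  # assumed export

react_agent = ReActAgent(
    tools=[multiply],
    llm=OpenAI(model="gpt-4o-mini"),
    system_prompt="You are a helpful assistant that can perform calculations.",
    metrics=[answer_relevancy_metric],
)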
To evaluate your application across multiple inputs, create an EvaluationDataset of Goldens and run your LlamaIndex application once per golden inside the dataset's evals_iterator.
from deepeval.dataset import EvaluationDataset, Golden

dataset = EvaluationDataset(
    goldens=[Golden(input="What is 3 * 12?"), Golden(input="What is 4 * 13?")]
)

for golden in dataset.evals_iterator():
    task = asyncio.create_task(llm_app(golden.input))
    dataset.evaluate(task)
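Here evals_iterator yields each golden while managing the surrounding event loop, and dataset.evaluate(task) registers the scheduled task so that, once the agent run completes, the metrics attached to the agent are scored against the traced input and output. (This follows the integration's documented usage; the exact internals may differ.) The cell below runs the same pattern on a second set of goldens.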
from deepeval.dataset import EvaluationDataset, Golden
import asyncio

dataset = EvaluationDataset(
    goldens=[Golden(input="What's 7 * 8?"), Golden(input="What's 7 * 6?")]
)

for golden in dataset.evals_iterator():
    task = asyncio.create_task(llm_app(golden.input))
    dataset.evaluate(task)
Jupyter notebooks already maintain their own event loop, which may lead to unexpected behavior, hangs, or runtime errors when running DeepEval examples directly in a notebook cell.
Recommendation: To avoid such issues, run your DeepEval examples in a standalone Python script (.py file) instead of within a Jupyter notebook.
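If you do need to stay in a notebook, one common workaround (an environment-level fix, not part of this integration) is to patch the running loop with nest_asyncio before executing the examples:
import nest_asyncio

# Allows asyncio.run() and new tasks to execute inside Jupyter's already-running loop
nest_asyncio.apply()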
Here are some example scripts.
# Synchronous (End-to-End Evals)
import os
import asyncio

import deepeval
from llama_index.llms.openai import OpenAI
import llama_index.core.instrumentation as instrument
from deepeval.integrations.llama_index import instrument_llama_index, FunctionAgent
from deepeval.metrics import AnswerRelevancyMetric
from deepeval.dataset import EvaluationDataset, Golden
from dotenv import load_dotenv

load_dotenv()
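# Assumes a .env file defining OPENAI_API_KEY (for the LLM) and
# CONFIDENT_API_KEY (for Confident AI)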
deepeval.login(os.getenv("CONFIDENT_API_KEY"))
instrument_llama_index(instrument.get_dispatcher())


def multiply(a: float, b: float) -> float:
    """Useful for multiplying two numbers."""
    return a * b


answer_relevancy_metric = AnswerRelevancyMetric()

agent = FunctionAgent(
    tools=[multiply],
    llm=OpenAI(model="gpt-4o-mini"),
    system_prompt="You are a helpful assistant that can perform calculations.",
    metrics=[answer_relevancy_metric],
)


async def llm_app(input: str):
    return await agent.run(input)


dataset = EvaluationDataset(
    goldens=[Golden(input="What is 3 * 12?"), Golden(input="What is 4 * 13?")]
)

for golden in dataset.evals_iterator():
    task = asyncio.create_task(llm_app(golden.input))
    dataset.evaluate(task)
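Save this as, say, sync_evals.py (any filename works) and run it with python sync_evals.py; if you are logged in, each golden's trace and metric scores should show up on your Confident AI dashboard.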
# Asynchronous (End-to-End Evals)
import os
import asyncio

import deepeval
from llama_index.llms.openai import OpenAI
import llama_index.core.instrumentation as instrument
from deepeval.integrations.llama_index import instrument_llama_index, FunctionAgent
from deepeval.metrics import AnswerRelevancyMetric
from deepeval.dataset import EvaluationDataset, Golden
from dotenv import load_dotenv

load_dotenv()

# Don't forget to set up tracing
deepeval.login(os.getenv("CONFIDENT_API_KEY"))
instrument_llama_index(instrument.get_dispatcher())
def multiply(a: float, b: float) -> float:
    """Useful for multiplying two numbers."""
    return a * b


answer_relevancy_metric = AnswerRelevancyMetric()

agent = FunctionAgent(
    tools=[multiply],
    llm=OpenAI(model="gpt-4o-mini"),
    system_prompt="You are a helpful assistant that can perform calculations.",
    metrics=[answer_relevancy_metric],
)

goldens = [Golden(input="What's 7 * 8?"), Golden(input="What's 7 * 6?")]


async def llm_app(golden: Golden):
    await agent.run(golden.input)


def main():
    dataset = EvaluationDataset(goldens=goldens)
    for golden in dataset.evals_iterator():
        task = asyncio.create_task(llm_app(golden))
        dataset.evaluate(task)


if __name__ == "__main__":
    main()
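The final script evaluates against a metric collection defined on Confident AI: instead of constructing metric objects locally, the agent is given the name of a server-side collection via metric_collection. The collection name test_collection_1 below is a placeholder; use one you have created on your own dashboard. The header comment follows the style of the scripts above.
# Metric Collection (Online Evals)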
import os
import asyncio

import deepeval
from llama_index.llms.openai import OpenAI
import llama_index.core.instrumentation as instrument
from deepeval.integrations.llama_index import instrument_llama_index, FunctionAgent
from deepeval.dataset import Golden  # was missing; Golden is used below
from dotenv import load_dotenv

load_dotenv()

# Don't forget to set up tracing
deepeval.login(os.getenv("CONFIDENT_API_KEY"))
instrument_llama_index(instrument.get_dispatcher())
def multiply(a: float, b: float) -> float:
    """Useful for multiplying two numbers."""
    return a * b


agent = FunctionAgent(
    tools=[multiply],
    llm=OpenAI(model="gpt-4o-mini"),
    system_prompt="You are a helpful assistant that can perform calculations.",
    metric_collection="test_collection_1",
)


async def llm_app(golden: Golden):
    await agent.run(golden.input)


asyncio.run(llm_app(Golden(input="What is 3 * 12?")))
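The trade-off, as we understand the integration: passing metrics scores everything locally with the metric objects you construct, while metric_collection defers the metric definitions to Confident AI, letting you change which metrics run without touching or redeploying your code.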