docs/examples/multi_modal/multi_modal_pydantic.ipynb
<a href="https://colab.research.google.com/github/run-llama/llama_index/blob/main/docs/examples/multi_modal/multi_modal_pydantic.ipynb" target="_parent"></a>
In this notebook, we show you how to generate structured data with new OpenAI GPT4V API via LlamaIndex. The user just needs to specify a Pydantic object.
We also compared several Large Vision models for this task:
%pip install llama-index-multi-modal-llms-openai
%pip install llama-index-multi-modal-llms-replicate
import os
OPENAI_API_KEY = "sk-<your-openai-api-token>"
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY
REPLICATE_API_TOKEN = "" # Your Relicate API token here
os.environ["REPLICATE_API_TOKEN"] = REPLICATE_API_TOKEN
from pathlib import Path
input_image_path = Path("restaurant_images")
if not input_image_path.exists():
Path.mkdir(input_image_path)
!wget "https://docs.google.com/uc?export=download&id=1GlqcNJhGGbwLKjJK1QJ_nyswCTQ2K2Fq" -O ./restaurant_images/fried_chicken.png
from pydantic import BaseModel
class Restaurant(BaseModel):
"""Data model for an restaurant."""
restaurant: str
food: str
discount: str
price: str
rating: str
review: str
from llama_index.multi_modal_llms.openai import OpenAIMultiModal
from llama_index.core import SimpleDirectoryReader
# put your local directory here
image_documents = SimpleDirectoryReader("./restaurant_images").load_data()
openai_mm_llm = OpenAIMultiModal(
model="gpt-4o", api_key=OPENAI_API_KEY, max_new_tokens=1000
)
from PIL import Image
import matplotlib.pyplot as plt
imageUrl = "./restaurant_images/fried_chicken.png"
image = Image.open(imageUrl).convert("RGB")
plt.figure(figsize=(16, 5))
plt.imshow(image)
from llama_index.core.program import MultiModalLLMCompletionProgram
from llama_index.core.output_parsers import PydanticOutputParser
prompt_template_str = """\
can you summarize what is in the image\
and return the answer with json format \
"""
openai_program = MultiModalLLMCompletionProgram.from_defaults(
output_parser=PydanticOutputParser(Restaurant),
image_documents=image_documents,
prompt_template_str=prompt_template_str,
multi_modal_llm=openai_mm_llm,
verbose=True,
)
response = openai_program()
for res in response:
print(res)
from llama_index.multi_modal_llms.replicate import ReplicateMultiModal
from llama_index.multi_modal_llms.replicate.base import (
REPLICATE_MULTI_MODAL_LLM_MODELS,
)
prompt_template_str = """\
can you summarize what is in the image\
and return the answer with json format \
"""
def pydantic_replicate(
model_name, output_class, image_documents, prompt_template_str
):
mm_llm = ReplicateMultiModal(
model=REPLICATE_MULTI_MODAL_LLM_MODELS[model_name],
temperature=0.1,
max_new_tokens=1000,
)
llm_program = MultiModalLLMCompletionProgram.from_defaults(
output_parser=PydanticOutputParser(output_class),
image_documents=image_documents,
prompt_template_str=prompt_template_str,
multi_modal_llm=mm_llm,
verbose=True,
)
response = llm_program()
print(f"Model: {model_name}")
for res in response:
print(res)
pydantic_replicate("fuyu-8b", Restaurant, image_documents, prompt_template_str)
pydantic_replicate(
"llava-13b", Restaurant, image_documents, prompt_template_str
)
pydantic_replicate(
"minigpt-4", Restaurant, image_documents, prompt_template_str
)
pydantic_replicate("cogvlm", Restaurant, image_documents, prompt_template_str)
Observation:
input_image_path = Path("amazon_images")
if not input_image_path.exists():
Path.mkdir(input_image_path)
!wget "https://docs.google.com/uc?export=download&id=1p1Y1qAoM68eC4sAvvHaiJyPhdUZS0Gqb" -O ./amazon_images/amazon.png
from pydantic import BaseModel
class Product(BaseModel):
"""Data model for a Amazon Product."""
title: str
category: str
discount: str
price: str
rating: str
review: str
description: str
inventory: str
imageUrl = "./amazon_images/amazon.png"
image = Image.open(imageUrl).convert("RGB")
plt.figure(figsize=(16, 5))
plt.imshow(image)
amazon_image_documents = SimpleDirectoryReader("./amazon_images").load_data()
prompt_template_str = """\
can you summarize what is in the image\
and return the answer with json format \
"""
openai_program_amazon = MultiModalLLMCompletionProgram.from_defaults(
output_parser=PydanticOutputParser(Product),
image_documents=amazon_image_documents,
prompt_template_str=prompt_template_str,
multi_modal_llm=openai_mm_llm,
verbose=True,
)
response = openai_program_amazon()
for res in response:
print(res)
pydantic_replicate(
"fuyu-8b", Product, amazon_image_documents, prompt_template_str
)
pydantic_replicate(
"minigpt-4", Product, amazon_image_documents, prompt_template_str
)
pydantic_replicate(
"cogvlm", Product, amazon_image_documents, prompt_template_str
)
pydantic_replicate(
"llava-13b", Product, amazon_image_documents, prompt_template_str
)
Observation:
input_image_path = Path("instagram_images")
if not input_image_path.exists():
Path.mkdir(input_image_path)
!wget "https://docs.google.com/uc?export=download&id=12ZpBBFkYu-jzz1iz356U5kMikn4uN9ww" -O ./instagram_images/jordan.png
from pydantic import BaseModel
class InsAds(BaseModel):
"""Data model for a Ins Ads."""
account: str
brand: str
product: str
category: str
discount: str
price: str
comments: str
review: str
description: str
from PIL import Image
import matplotlib.pyplot as plt
imageUrl = "./instagram_images/jordan.png"
image = Image.open(imageUrl).convert("RGB")
plt.figure(figsize=(16, 5))
plt.imshow(image)
ins_image_documents = SimpleDirectoryReader("./instagram_images").load_data()
prompt_template_str = """\
can you summarize what is in the image\
and return the answer with json format \
"""
openai_program_ins = MultiModalLLMCompletionProgram.from_defaults(
output_parser=PydanticOutputParser(InsAds),
image_documents=ins_image_documents,
prompt_template_str=prompt_template_str,
multi_modal_llm=openai_mm_llm,
verbose=True,
)
response = openai_program_ins()
for res in response:
print(res)
pydantic_replicate("fuyu-8b", InsAds, ins_image_documents, prompt_template_str)
pydantic_replicate(
"llava-13b", InsAds, ins_image_documents, prompt_template_str
)
pydantic_replicate("cogvlm", InsAds, ins_image_documents, prompt_template_str)
pydantic_replicate(
"minigpt-4", InsAds, ins_image_documents, prompt_template_str
)
Observation: