<a href="https://colab.research.google.com/github/run-llama/llama_index/blob/main/docs/examples/multi_modal/dashscope_multi_modal.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Multi-Modal LLM using DashScope qwen-vl model for image reasoning
In this notebook, we show how to use the DashScope qwen-vl Multi-Modal LLM class/abstraction for image understanding and reasoning. Async is not currently supported.
We also show the functions we currently support for the DashScope Multi-Modal LLM:

- `complete` (sync): for a single prompt and a list of images
- `chat` (sync): for multiple chat messages
- `stream_complete` (sync): for streaming output of `complete`
- `stream_chat` (sync): for streaming output of `chat`

!pip install -U llama-index-multi-modal-llms-dashscope
# Set API key
%env DASHSCOPE_API_KEY=YOUR_DASHSCOPE_API_KEY
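If you prefer not to use the `%env` magic, the same key can be set from plain Python. A minimal sketch using only the standard library (`YOUR_DASHSCOPE_API_KEY` is a placeholder, not a real key):

```python
import os

# Alternative to the %env magic above: set the key programmatically.
os.environ["DASHSCOPE_API_KEY"] = "YOUR_DASHSCOPE_API_KEY"
```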
### DashScopeMultiModal and Load Images from URLs

from llama_index.multi_modal_llms.dashscope import (
    DashScopeMultiModal,
    DashScopeMultiModalModels,
)
from llama_index.core.multi_modal_llms.generic_utils import load_image_urls
image_urls = [
    "https://dashscope.oss-cn-beijing.aliyuncs.com/images/dog_and_girl.jpeg",
]
image_documents = load_image_urls(image_urls)

dashscope_multi_modal_llm = DashScopeMultiModal(
    model_name=DashScopeMultiModalModels.QWEN_VL_MAX,
)

complete_response = dashscope_multi_modal_llm.complete(
    prompt="What's in the image?",
    image_documents=image_documents,
)
print(complete_response)
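The return value is a `CompletionResponse`; printing it shows the generated text, which is also available as a plain string on the `.text` attribute (assuming the standard llama_index response interface):

```python
# The generated answer as a plain string (standard CompletionResponse field).
answer: str = complete_response.text
print(answer)
```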
### Complete a prompt with multiple images
multi_image_urls = [
    "https://dashscope.oss-cn-beijing.aliyuncs.com/images/dog_and_girl.jpeg",
    "https://dashscope.oss-cn-beijing.aliyuncs.com/images/panda.jpeg",
]
multi_image_documents = load_image_urls(multi_image_urls)
complete_response = dashscope_multi_modal_llm.complete(
    prompt="What animals are in the pictures?",
    image_documents=multi_image_documents,
)
print(complete_response)
### Stream complete

stream_complete_response = dashscope_multi_modal_llm.stream_complete(
    prompt="What's in the image?",
    image_documents=image_documents,
)
for r in stream_complete_response:
    print(r.delta, end="")
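If you want the whole answer as a single string rather than printing chunks as they arrive, you can accumulate the deltas. A minimal sketch, assuming each streamed chunk carries its increment on `.delta` (the generator above is already consumed, so we issue the request again):

```python
# Accumulate streamed chunks into one string; guard against None deltas.
chunks = dashscope_multi_modal_llm.stream_complete(
    prompt="What's in the image?",
    image_documents=image_documents,
)
full_text = "".join(r.delta or "" for r in chunks)
print(full_text)
```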
### Chat

from llama_index.core.base.llms.types import MessageRole
from llama_index.multi_modal_llms.dashscope.utils import (
    create_dashscope_multi_modal_chat_message,
)

chat_message_user_1 = create_dashscope_multi_modal_chat_message(
    "What's in the image?", MessageRole.USER, image_documents
)
chat_response = dashscope_multi_modal_llm.chat([chat_message_user_1])
print(chat_response.message.content[0]["text"])
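Note that the DashScope chat message content is a list of blocks rather than a plain string, which is why the text is read via `content[0]["text"]`. A hypothetical helper (not part of the library) that concatenates all text blocks:

```python
# Concatenate every "text" block in a DashScope-style content list.
# Sketch only: assumes content is a list of dicts as shown above.
def extract_text(message) -> str:
    return "".join(block["text"] for block in message.content if "text" in block)

print(extract_text(chat_response.message))
```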
chat_message_assistant_1 = create_dashscope_multi_modal_chat_message(
    chat_response.message.content[0]["text"], MessageRole.ASSISTANT, None
)
chat_message_user_2 = create_dashscope_multi_modal_chat_message(
    "What are they doing?", MessageRole.USER, None
)
chat_response = dashscope_multi_modal_llm.chat(
    [chat_message_user_1, chat_message_assistant_1, chat_message_user_2]
)
print(chat_response.message.content[0]["text"])
### Stream chat

stream_chat_response = dashscope_multi_modal_llm.stream_chat(
    [chat_message_user_1, chat_message_assistant_1, chat_message_user_2]
)
for r in stream_chat_response:
    print(r.delta, end="")
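The same pattern generalizes to any number of turns: keep a history list, append each user question and assistant answer, and resend the whole list on every call. A sketch built on the calls above (the `ask` helper is illustrative, not part of the library):

```python
# Reuse the conversation so far, then extend it turn by turn.
history = [chat_message_user_1, chat_message_assistant_1]

def ask(question: str) -> str:
    history.append(
        create_dashscope_multi_modal_chat_message(question, MessageRole.USER, None)
    )
    response = dashscope_multi_modal_llm.chat(history)
    answer = response.message.content[0]["text"]
    history.append(
        create_dashscope_multi_modal_chat_message(answer, MessageRole.ASSISTANT, None)
    )
    return answer

print(ask("What are they doing?"))
```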
### Use images from local files

DashScope also accepts local files via `file://` URLs:

- Linux & macOS file scheme: `file:///home/images/test.png`
- Windows file scheme: `file://D:/images/abc.png`
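To build such URLs from plain paths portably, `pathlib` can help. Note this is a sketch you may need to adapt: `Path.as_uri()` produces `file:///D:/...` (three slashes) on Windows, whereas the DashScope example above uses `file://D:/...`:

```python
from pathlib import Path

# Turn a local path into a file:// URL (adjust for the exact
# scheme DashScope expects on Windows).
def to_file_url(path: str) -> str:
    return Path(path).resolve().as_uri()

print(to_file_url("/home/images/test.png"))  # file:///home/images/test.png
```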
from llama_index.multi_modal_llms.dashscope.utils import load_local_images

local_images = [
    "file://THE_FILE_PATH1",
    "file://THE_FILE_PATH2",
]

image_documents = load_local_images(local_images)
chat_message_local = create_dashscope_multi_modal_chat_message(
    "What animals are in the pictures?", MessageRole.USER, image_documents
)
chat_response = dashscope_multi_modal_llm.chat([chat_message_local])
print(chat_response.message.content[0]["text"])