LiteLLM x IBM watsonx.ai

Pre-Requisites

python
!pip install litellm

Set watsonx.ai Credentials

See the watsonx.ai documentation for more information about authenticating to watsonx.ai.

python
import os
import litellm
from litellm.llms.watsonx import IBMWatsonXAI
litellm.set_verbose = False

os.environ["WATSONX_URL"] = "" # Your watsonx.ai base URL
os.environ["WATSONX_APIKEY"] = "" # Your IBM cloud API key or watsonx.ai token
os.environ["WATSONX_PROJECT_ID"] = "" # ID of your watsonx.ai project
# these can also be passed as arguments to the function

# Generating an IAM token is optional, but it is recommended to generate one
# once and reuse it for all requests during the session.
# If a token is not passed to the function, one is generated automatically for each request.
iam_token = IBMWatsonXAI().generate_iam_token(api_key=os.environ["WATSONX_APIKEY"]) 
# you can also set os.environ["WATSONX_TOKEN"] = iam_token
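
The same credentials can also be passed as arguments on each call instead of through environment variables. A minimal sketch, assuming the url, api_key, and project_id keyword arguments accepted by the watsonx provider; the placeholder values are yours to fill in:

python
from litellm import completion

# sketch: credentials passed per call rather than via environment variables
# (the values below are placeholders, not real credentials)
response = completion(
        model="watsonx/ibm/granite-13b-chat-v2",
        messages=[{"content": "Hello, how are you?", "role": "user"}],
        url="https://us-south.ml.cloud.ibm.com",  # your watsonx.ai base URL
        api_key="<your_ibm_cloud_api_key>",
        project_id="<your_project_id>",
)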

Completion Requests

See the following link for a list of supported text generation models available with watsonx.ai:

https://dataplatform.cloud.ibm.com/docs/content/wsj/analyze-data/fm-models.html?context=wx&locale=en&audience=wdp

python
from litellm import completion

# see litellm.llms.watsonx.IBMWatsonXAIConfig for a list of available parameters to pass to the completion functions
response = completion(
        model="watsonx/ibm/granite-13b-chat-v2",
        messages=[{ "content": "Hello, how are you?","role": "user"}],
        token=iam_token
)
print("Granite v2 response:")
print(response)


response = completion(
        model="watsonx/meta-llama/llama-3-8b-instruct",
        messages=[{ "content": "Hello, how are you?","role": "user"}],
        token=iam_token
)
print("LLaMa 3 8b response:")
print(response)
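
Responses come back in the OpenAI response format, so the generated text and token usage can be read off the standard fields. Continuing from the cell above:

python
# litellm responses follow the OpenAI format
print(response.choices[0].message.content)  # the model's reply text
print(response.usage)                       # prompt/completion token counts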

Streaming Requests

python
from litellm import completion

response = completion(
        model="watsonx/ibm/granite-13b-chat-v2",
        messages=[{ "content": "Hello, how are you?","role": "user"}],
        stream=True,
        max_tokens=50, # maps to watsonx.ai max_new_tokens
)
print("Granite v2 streaming response:")
for chunk in response:
    print(chunk['choices'][0]['delta']['content'] or '', end='')

response = completion(
        model="watsonx/meta-llama/llama-3-8b-instruct",
        messages=[{ "content": "Hello, how are you?","role": "user"}],
        stream=True,
        max_tokens=50, # maps to watsonx.ai max_new_tokens
)
print("\nLLaMa 3 8b streaming response:")
for chunk in response:
    print(chunk['choices'][0]['delta']['content'] or '', end='')
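
To keep the full completion as a single string, the streamed deltas can be accumulated as they arrive. A small sketch:

python
from litellm import completion

# accumulate the streamed deltas into one string
full_text = ""
response = completion(
        model="watsonx/ibm/granite-13b-chat-v2",
        messages=[{"content": "Hello, how are you?", "role": "user"}],
        stream=True,
        max_tokens=50,
)
for chunk in response:
    full_text += chunk['choices'][0]['delta']['content'] or ''
print(full_text)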

Async Requests

python
from litellm import acompletion
import asyncio

granite_task = acompletion(
        model="watsonx/ibm/granite-13b-chat-v2",
        messages=[{ "content": "Hello, how are you?","role": "user"}],
        max_tokens=20, # maps to watsonx.ai max_new_tokens
        token=iam_token
)
llama_3_task = acompletion(
        model="watsonx/meta-llama/llama-3-8b-instruct",
        messages=[{ "content": "Hello, how are you?","role": "user"}],
        max_tokens=20, # maps to watsonx.ai max_new_tokens
        token=iam_token
)

granite_response, llama_3_response = await asyncio.gather(granite_task, llama_3_task)

print("Granite v2 response:")
print(granite_response)

print("LLaMa 3 8b response:")
print(llama_3_response)
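
Top-level await works in a notebook. In a plain Python script, the same concurrent calls would be wrapped in a coroutine and driven with asyncio.run. A sketch of the script form:

python
import asyncio
from litellm import acompletion

async def main():
    # the same two concurrent requests, wrapped for use outside a notebook
    granite_task = acompletion(
        model="watsonx/ibm/granite-13b-chat-v2",
        messages=[{"content": "Hello, how are you?", "role": "user"}],
        max_tokens=20,
    )
    llama_3_task = acompletion(
        model="watsonx/meta-llama/llama-3-8b-instruct",
        messages=[{"content": "Hello, how are you?", "role": "user"}],
        max_tokens=20,
    )
    return await asyncio.gather(granite_task, llama_3_task)

granite_response, llama_3_response = asyncio.run(main())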

Request Deployed Models

Models that have been deployed to a deployment space (e.g. tuned models) can be called using the "deployment/<deployment_id>" format, where <deployment_id> is the ID of the deployed model in your deployment space. The ID of your deployment space must also be set in the environment variable WATSONX_DEPLOYMENT_SPACE_ID, or passed to the function as space_id=<deployment_space_id>.

python
from litellm import acompletion

os.environ["WATSONX_DEPLOYMENT_SPACE_ID"] = "<deployment_space_id>" # ID of the watsonx.ai deployment space where the model is deployed
await acompletion(
        model="watsonx/deployment/<deployment_id>",
        messages=[{ "content": "Hello, how are you?","role": "user"}],
        token=iam_token
)
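
Equivalently, the deployment space ID can be passed per call with the space_id argument mentioned above, instead of through the environment variable. The placeholders are left as-is for you to fill in:

python
from litellm import acompletion

# same call, with the deployment space ID passed as an argument
await acompletion(
        model="watsonx/deployment/<deployment_id>",
        messages=[{"content": "Hello, how are you?", "role": "user"}],
        space_id="<deployment_space_id>",
        token=iam_token,
)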

Embeddings

See the following link for a list of supported embedding models available with watsonx.ai:

https://dataplatform.cloud.ibm.com/docs/content/wsj/analyze-data/fm-models-embed.html?context=wx

python
from litellm import embedding, aembedding

response = embedding(
        model="watsonx/ibm/slate-30m-english-rtrvr",
        input=["Hello, how are you?"],
        token=iam_token
)
print("Slate 30m embeddings response:")
print(response)

response = await aembedding(
        model="watsonx/ibm/slate-125m-english-rtrvr",
        input=["Hello, how are you?"],
        token=iam_token
)
print("Slate 125m embeddings response:")
print(response)
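
Embedding responses also follow the OpenAI format, with one entry per input string under data. Continuing from the cell above:

python
# each entry in response.data holds one embedding vector
vector = response.data[0]['embedding']
print(f"embedding dimensions: {len(vector)}")
print(vector[:5])  # first few components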