
LiteLLM uses the async_generator package for Ollama async streaming, so make sure it is installed.

cookbook/liteLLM_Ollama.ipynb

python
!pip install litellm # version 0.1.724 or higher 

Call Ollama - Llama2 with Streaming

python
from litellm import completion

response = completion(
    model="ollama/llama2", 
    messages=[{ "content": "respond in 20 words. who are you?","role": "user"}], 
    api_base="http://localhost:11434",
    stream=True
)
print(response)  # with stream=True this is a stream wrapper, not the final text
# iterate over the stream and print each partial delta
for chunk in response:
    print(chunk['choices'][0]['delta'])
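
If you want the assembled reply rather than the raw deltas, you can accumulate the streamed pieces as they arrive. A minimal sketch, assuming each chunk follows the dict-like shape used above and that the delta's 'content' field may be empty or missing on some chunks:

python
from litellm import completion

response = completion(
    model="ollama/llama2",
    messages=[{"content": "respond in 20 words. who are you?", "role": "user"}],
    api_base="http://localhost:11434",
    stream=True
)

full_text = ""
for chunk in response:
    delta = chunk['choices'][0]['delta']
    # 'content' can be missing or None on some chunks (assumption), so guard before appending
    piece = delta.get('content')
    if piece:
        full_text += piece

print(full_text)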

Call Ollama - Llama2 with Acompletion + Streaming

python
# litellm uses async_generator for ollama async streaming, ensure it's installed
!pip install async_generator
python
import litellm

async def async_ollama():
    # acompletion with stream=True resolves to an async generator of chunks
    response = await litellm.acompletion(
        model="ollama/llama2",
        messages=[{"content": "what's the weather", "role": "user"}],
        api_base="http://localhost:11434",
        stream=True
    )
    return response

# top-level await works inside a notebook cell
result = await async_ollama()
print(result)  # the stream wrapper, not the completion text

try:
    async for chunk in result:
        print(chunk)
except TypeError:
    # the last chunk from Ollama is None, which raises a TypeError during async streaming
    pass
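
Top-level await only works inside a notebook. If you run the same code from a plain Python script, drive the event loop with asyncio.run instead. A minimal sketch under that assumption:

python
import asyncio
import litellm

async def main():
    response = await litellm.acompletion(
        model="ollama/llama2",
        messages=[{"content": "what's the weather", "role": "user"}],
        api_base="http://localhost:11434",
        stream=True
    )
    try:
        async for chunk in response:
            print(chunk)
    except TypeError:
        # same caveat as above: the final chunk from Ollama can be None
        pass

# asyncio.run starts the event loop when no notebook loop is running
asyncio.run(main())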

Completion Call

python
from litellm import completion

response = completion(
    model="ollama/llama2", 
    messages=[{ "content": "respond in 20 words. who are you?","role": "user"}], 
    api_base="http://localhost:11434"
)
print(response)
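
The non-streaming call returns an OpenAI-style response, so the generated text lives on the first choice's message. A short sketch, assuming the same dict-style access used in the snippets above:

python
# read the reply text from the first choice (assumes dict-style access as above)
reply = response['choices'][0]['message']['content']
print(reply)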