playground/FastChat_API_GoogleColab.ipynb
%cd /content/
# clone FastChat
!git clone https://github.com/lm-sys/FastChat.git
# install dependencies
%cd FastChat
!python3 -m pip install -e ".[model_worker,webui]" --quiet
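Loading Vicuna-7B in 8-bit needs a GPU runtime, so it is worth confirming one is attached (Runtime > Change runtime type) before going further:
# Check that Colab has attached a CUDA-capable GPU
!nvidia-smi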
See openai_api.md in the FastChat docs for the full description of the OpenAI-compatible API.
Because Colab gives us limited resources and running processes in the background is not stable, we run each API component (controller, model worker, API server) in its own thread and let them communicate through explicit addresses:
import subprocess
import threading
%cd /content/
# Using 127.0.0.1 because localhost does not work properly in Colab
def run_controller():
    # The controller keeps track of the registered model workers
    subprocess.run(["python3", "-m", "fastchat.serve.controller", "--host", "127.0.0.1"])
def run_model_worker():
    # The worker loads Vicuna-7B in 8-bit and registers itself with the controller
    subprocess.run(["python3", "-m", "fastchat.serve.model_worker", "--host", "127.0.0.1",
                    "--controller-address", "http://127.0.0.1:21001",
                    "--model-path", "lmsys/vicuna-7b-v1.5", "--load-8bit"])
def run_api_server():
    # The API server exposes the OpenAI-compatible endpoints on port 8000
    subprocess.run(["python3", "-m", "fastchat.serve.openai_api_server", "--host", "127.0.0.1",
                    "--controller-address", "http://127.0.0.1:21001", "--port", "8000"])
# Start the controller thread
# progress is logged to `controller.log` in the Colab local storage
controller_thread = threading.Thread(target=run_controller)
controller_thread.start()
# Start the model worker thread
# progress is logged to `model_worker_<id>.log`; it is important to wait
# until the checkpoint shards are fully downloaded before querying the API
model_worker_thread = threading.Thread(target=run_model_worker)
model_worker_thread.start()
# Start the API server thread
api_server_thread = threading.Thread(target=run_api_server)
api_server_thread.start()
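The worker can take several minutes to pull the checkpoint shards, so it helps to block until the API actually answers. A minimal sketch that simply polls the OpenAI-compatible /v1/models endpoint until a model shows up:
import time
import requests
# Poll until the API server is up and the worker has registered a model
while True:
    try:
        r = requests.get("http://127.0.0.1:8000/v1/models", timeout=5)
        if r.status_code == 200 and r.json().get("data"):
            print("API ready, models:", [m["id"] for m in r.json()["data"]])
            break
    except requests.exceptions.RequestException:
        pass  # server not up yet
    time.sleep(10)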
The OpenAI-compatible API is now reachable at http://127.0.0.1:8000/v1/ from inside the Colab runtime.
We can now run the curl examples from the FastChat docs against it.
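Listing the registered models is a quick first check that the worker is connected:
!curl http://127.0.0.1:8000/v1/models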
Try a chat completion:
!curl http://127.0.0.1:8000/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{ \
"model": "vicuna-7b-v1.5", \
"messages": [{"role": "user", "content": "Hello, can you tell me a joke for me?"}], \
"temperature": 0.5 \
}'
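The same request can also be sent through the openai-python client; a sketch assuming the legacy 0.x interface (e.g. openai==0.28), which is what the FastChat docs target:
import openai
# Point the client at the local FastChat server instead of api.openai.com
openai.api_key = "EMPTY"  # FastChat does not validate the key
openai.api_base = "http://127.0.0.1:8000/v1"
completion = openai.ChatCompletion.create(
    model="vicuna-7b-v1.5",
    messages=[{"role": "user", "content": "Hello! Can you tell me a joke?"}],
    temperature=0.5,
)
print(completion.choices[0].message.content)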
Try embeddings:
!curl http://127.0.0.1:8000/v1/embeddings \
-H "Content-Type: application/json" \
-d '{ \
"model": "vicuna-7b-v1.5", \
"input": "Hello, can you tell me a joke for me?" \
}'
Try a text completion:
!curl http://127.0.0.1:8000/v1/completions \
-H "Content-Type: application/json" \
-d '{ \
"model": "vicuna-7b-v1.5", \
"prompt": "Once upon a time", \
"max_tokens": 20, \
"temperature": 0.5 \
}'
Finally, call the embeddings endpoint from Python to analyze how similar the prompts are!
import json
import numpy as np
import requests
from scipy.spatial.distance import cosine

def get_embedding_from_api(word, model='vicuna-7b-v1.5'):
    url = 'http://127.0.0.1:8000/v1/embeddings'
    headers = {'Content-Type': 'application/json'}
    data = json.dumps({
        'model': model,
        'input': word
    })
    response = requests.post(url, headers=headers, data=data)
    if response.status_code == 200:
        embedding = np.array(response.json()['data'][0]['embedding'])
        return embedding
    else:
        print(f"Error: {response.status_code} - {response.text}")
        return None

def cosine_similarity(vec1, vec2):
    # scipy's `cosine` is a distance, so similarity = 1 - distance
    return 1 - cosine(vec1, vec2)

def print_cosine_similarity(embeddings, texts):
    for i in range(len(texts)):
        for j in range(i + 1, len(texts)):
            sim = cosine_similarity(embeddings[texts[i]], embeddings[texts[j]])
            print(f"Cosine similarity between '{texts[i]}' and '{texts[j]}': {sim:.2f}")

texts = [
    'The quick brown fox',
    'The quick brown dog',
    'The fast brown fox',
    'A completely different sentence'
]

embeddings = {}
for text in texts:
    embeddings[text] = get_embedding_from_api(text)

print_cosine_similarity(embeddings, texts)
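If the server is wired up correctly, you would expect the three near-duplicate sentences to score high against each other, with 'A completely different sentence' noticeably lower.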