playground/FastChat_API_GoogleColab.ipynb
%cd /content/
# clone FastChat
!git clone https://github.com/lm-sys/FastChat.git
# install dependencies
%cd FastChat
!python3 -m pip install -e ".[model_worker,webui]" --quiet
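Loading Vicuna-7B in 8-bit needs a GPU runtime, so it is worth confirming one is attached (Runtime > Change runtime type) before going further:
# Check that Colab has attached a CUDA-capable GPU
!nvidia-smi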
See openai_api.md in the FastChat docs for the full description of the OpenAI-compatible API.
Because Colab gives us limited resources and running processes in the background is not stable, we run each API component (controller, model worker, API server) in its own thread and let them communicate through explicit addresses:
import subprocess
import threading
%cd /content/
# Using 127.0.0.1 because localhost does not work properly in Colab
def run_controller():
    # The controller keeps track of the registered model workers
    subprocess.run(["python3", "-m", "fastchat.serve.controller", "--host", "127.0.0.1"])
def run_model_worker():
    # The worker loads Vicuna-7B in 8-bit and registers itself with the controller
    subprocess.run(["python3", "-m", "fastchat.serve.model_worker", "--host", "127.0.0.1",
                    "--controller-address", "http://127.0.0.1:21001",
                    "--model-path", "lmsys/vicuna-7b-v1.5", "--load-8bit"])
def run_api_server():
    # The API server exposes the OpenAI-compatible endpoints on port 8000
    subprocess.run(["python3", "-m", "fastchat.serve.openai_api_server", "--host", "127.0.0.1",
                    "--controller-address", "http://127.0.0.1:21001", "--port", "8000"])
# Start the controller thread
# progress is logged to `controller.log` in the Colab local storage
controller_thread = threading.Thread(target=run_controller)
controller_thread.start()
# Start the model worker thread
# progress is logged to `model_worker_<id>.log`; it is important to wait
# until the checkpoint shards are fully downloaded before querying the API
model_worker_thread = threading.Thread(target=run_model_worker)
model_worker_thread.start()
# Start the API server thread
api_server_thread = threading.Thread(target=run_api_server)
api_server_thread.start()
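The worker can take several minutes to pull the checkpoint shards, so it helps to block until the API actually answers. A minimal sketch that simply polls the OpenAI-compatible /v1/models endpoint until a model shows up:
import time
import requests
# Poll until the API server is up and the worker has registered a model
while True:
    try:
        r = requests.get("http://127.0.0.1:8000/v1/models", timeout=5)
        if r.status_code == 200 and r.json().get("data"):
            print("API ready, models:", [m["id"] for m in r.json()["data"]])
            break
    except requests.exceptions.RequestException:
        pass  # server not up yet
    time.sleep(10)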
The OpenAI-compatible API is now reachable at http://127.0.0.1:8000/v1/ from inside the Colab runtime.
We can now run the curl examples from the FastChat docs against it.
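Listing the registered models is a quick first check that the worker is connected:
!curl http://127.0.0.1:8000/v1/models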
Try a chat completion:
!curl http://127.0.0.1:8000/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{ \
"model": "vicuna-7b-v1.5", \
"messages": [{"role": "user", "content": "Hello, can you tell me a joke for me?"}], \
"temperature": 0.5 \
}'
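The same request can also be sent through the openai-python client; a sketch assuming the legacy 0.x interface (e.g. openai==0.28), which is what the FastChat docs target:
import openai
# Point the client at the local FastChat server instead of api.openai.com
openai.api_key = "EMPTY"  # FastChat does not validate the key
openai.api_base = "http://127.0.0.1:8000/v1"
completion = openai.ChatCompletion.create(
    model="vicuna-7b-v1.5",
    messages=[{"role": "user", "content": "Hello! Can you tell me a joke?"}],
    temperature=0.5,
)
print(completion.choices[0].message.content)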
Try embeddings:
!curl http://127.0.0.1:8000/v1/embeddings \
-H "Content-Type: application/json" \
-d '{ \
"model": "vicuna-7b-v1.5", \
"input": "Hello, can you tell me a joke for me?" \
}'
Try a text completion:
!curl http://127.0.0.1:8000/v1/completions \
-H "Content-Type: application/json" \
-d '{ \
"model": "vicuna-7b-v1.5", \
"prompt": "Once upon a time", \
"max_tokens": 20, \
"temperature": 0.5 \
}'
Finally, call the embeddings endpoint from Python to analyze how similar the prompts are!
import json
import numpy as np
import requests
from scipy.spatial.distance import cosine

def get_embedding_from_api(word, model='vicuna-7b-v1.5'):
    url = 'http://127.0.0.1:8000/v1/embeddings'
    headers = {'Content-Type': 'application/json'}
    data = json.dumps({
        'model': model,
        'input': word
    })
    response = requests.post(url, headers=headers, data=data)
    if response.status_code == 200:
        embedding = np.array(response.json()['data'][0]['embedding'])
        return embedding
    else:
        print(f"Error: {response.status_code} - {response.text}")
        return None

def cosine_similarity(vec1, vec2):
    # scipy's `cosine` is a distance, so similarity = 1 - distance
    return 1 - cosine(vec1, vec2)

def print_cosine_similarity(embeddings, texts):
    for i in range(len(texts)):
        for j in range(i + 1, len(texts)):
            sim = cosine_similarity(embeddings[texts[i]], embeddings[texts[j]])
            print(f"Cosine similarity between '{texts[i]}' and '{texts[j]}': {sim:.2f}")

texts = [
    'The quick brown fox',
    'The quick brown dog',
    'The fast brown fox',
    'A completely different sentence'
]

embeddings = {}
for text in texts:
    embeddings[text] = get_embedding_from_api(text)

print_cosine_similarity(embeddings, texts)
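If the server is wired up correctly, you would expect the three near-duplicate sentences to score high against each other, with 'A completely different sentence' noticeably lower.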