examples/JARVIS.ipynb
# Welcome to JARVIS.
# The core of JARVIS is powered by Open Interpreter:
# Stream the model's response chunk-by-chunk instead of waiting for the full reply.
for chunk in interpreter.chat("What's 34/24?", stream=True, display=False):
    print(chunk)
# (This cell is for demonstration purposes. Do not run it until you've setup JARVIS below.)
# (Restart the session — Runtime > Restart Session ❗️ — after this!)
!pip install open-interpreter
!pip install git+https://github.com/openai/whisper.git -q
!pip install gradio==3.50 -q
!pip install elevenlabs -q
# API keys — replace the placeholders with your own keys before running.
eleven_labs_api_key = "<your_api_key>" # https://elevenlabs.io/speech-synthesis
openai_api_key = "<your_api_key>" # https://platform.openai.com/account/api-keys

import gradio as gr
import time
from interpreter import interpreter

# Hand the OpenAI key to Open Interpreter and let it execute code without asking
# for confirmation on each block (auto_run).
interpreter.llm.api_key = openai_api_key
interpreter.auto_run = True

import whisper

# Load the Whisper "base" model once; reused by every transcribe() call below.
model = whisper.load_model("base")
def transcribe(audio):
    """Transcribe an audio file (path) to text with the preloaded Whisper model."""
    # Load the recording and pad/trim it to Whisper's fixed 30-second window.
    waveform = whisper.pad_or_trim(whisper.load_audio(audio))
    # Log-Mel spectrogram, moved to the same device as the model.
    spectrogram = whisper.log_mel_spectrogram(waveform).to(model.device)
    # Detect the spoken language (result intentionally unused, as in the original).
    model.detect_language(spectrogram)
    # Decode with default options and return the recognized text.
    decoded = whisper.decode(model, spectrogram, whisper.DecodingOptions())
    return decoded.text
from elevenlabs import generate, play, set_api_key

# Authenticate the ElevenLabs client used by speak().
set_api_key(eleven_labs_api_key)

import io
from pydub import AudioSegment
def get_audio_length(audio_bytes):
    """Return the duration, in seconds, of an MP3 given as a byte string."""
    # Wrap the raw bytes so PyDub can read them like a file object.
    buffer = io.BytesIO(audio_bytes)
    segment = AudioSegment.from_mp3(buffer)
    # len() on an AudioSegment is its length in milliseconds; convert to seconds.
    return len(segment) / 1000.0
def speak(text):
    """Synthesize `text` with ElevenLabs, play it, and block for the clip's duration.

    Fix: removed the dead local `speaking = True`, which was never read anywhere.
    """
    audio = generate(
        text=text,
        voice="Daniel"
    )
    play(audio, notebook=True)
    # play() presumably returns before playback completes in a notebook — TODO confirm;
    # sleeping for the clip length serializes consecutive utterances.
    audio_length = get_audio_length(audio)
    time.sleep(audio_length)
# @title JARVIS
# @markdown ### **Setup Instructions**
# @markdown 1. Run this cell, then scroll down to use the interface (don't click the link, and **give the interface 60 seconds to load**).
# @markdown 2. Press the `Record from Microphone` button.
# @markdown 3. Allow access to your microphone, then speak your command.
# @markdown 4. Stop the recording, then press `Submit`.
# @markdown
# @markdown
# @markdown JARVIS will respond verbally + carry out your command.
# Buffer of streamed text not yet spoken aloud; shared with bot() below via `global`.
last_sentence = ""

with gr.Blocks() as demo:
    chatbot = gr.Chatbot()
    # NOTE(review): gr.inputs is gradio's legacy namespace; kept as-is for gradio==3.50.
    audio_input = gr.inputs.Audio(source="microphone", type="filepath")
    btn = gr.Button("Submit")

    def transcribe(audio):
        # Duplicates (and shadows) the module-level transcribe() defined earlier.
        audio = whisper.load_audio(audio)
        audio = whisper.pad_or_trim(audio)
        mel = whisper.log_mel_spectrogram(audio).to(model.device)
        _, probs = model.detect_language(mel)
        options = whisper.DecodingOptions()
        result = whisper.decode(model, mel, options)
        return result.text

    def add_user_message(audio, history):
        # Transcribe the recording and append it as a new [user, assistant] pair.
        user_message = transcribe(audio)
        return history + [[user_message, None]]

    def bot(history):
        # Generator: streams Open Interpreter's reply into the last chat entry,
        # speaking each completed sentence as it arrives.
        global last_sentence
        user_message = history[-1][0]
        history[-1][1] = ""
        active_block_type = ""
        language = ""
        for chunk in interpreter.chat(user_message, stream=True, display=False):
            # I built this before we build the flags, like "start": True and "end": True.
            # See the streaming example above. You can use those "start" and "end" flags to
            # start the code blocks, message blocks, etc. Here we track it manually and ignore the flags.
            # You should use the flags though! I was just lazy. We should rebuild this soon.
            # Message
            if chunk["type"] == "message" and "content" in chunk:
                if active_block_type != "message":
                    active_block_type = "message"
                history[-1][1] += chunk["content"]
                last_sentence += chunk["content"]
                # Speak once the buffer contains a sentence-ending character.
                if any([punct in last_sentence for punct in ".?!\n"]):
                    yield history
                    speak(last_sentence)
                    last_sentence = ""
                else:
                    yield history
            # Code
            if chunk["type"] == "code" and "content" in chunk:
                if active_block_type != "code":
                    active_block_type = "code"
                    # Open a fenced code block only once per code segment.
                    history[-1][1] += f"\n```{chunk['format']}"
                history[-1][1] += chunk["content"]
                yield history
            # Output
            if chunk["type"] == "confirmation":
                # Execution is about to begin: close the code fence, open a text fence.
                history[-1][1] += "\n```\n\n```text\n"
                yield history
            if chunk["type"] == "console":
                if chunk.get("format") == "output":
                    if chunk["content"] == "KeyboardInterrupt":
                        break
                    history[-1][1] += chunk["content"] + "\n"
                    yield history
                if chunk.get("format") == "active_line" and chunk["content"] == None:
                    # Active line will be none when we finish execution.
                    # You could also detect this with "type": "console", "end": True.
                    history[-1][1] = history[-1][1].strip()
                    history[-1][1] += "\n```\n"
                    yield history
        # Speak whatever partial sentence remains after the stream ends.
        if last_sentence:
            speak(last_sentence)

    btn.click(add_user_message, [audio_input, chatbot], [chatbot]).then(
        bot, chatbot, chatbot
    )

demo.queue()
demo.launch(debug=True)
# @title Text-only JARVIS
# @markdown Run this cell for a ChatGPT-like interface.
with gr.Blocks() as demo:
    chatbot = gr.Chatbot()
    msg = gr.Textbox()

    def user(user_message, history):
        # Append the typed message as a new [user, assistant] pair and clear the textbox.
        return "", history + [[user_message, None]]

    def bot(history):
        """Stream Open Interpreter's reply into the chat, speaking each finished sentence."""
        # BUG FIX: `last_sentence` is assigned inside this function, so without this
        # declaration Python treats it as a local and `+=` raised UnboundLocalError
        # on the very first message chunk. The voice-mode bot() above already did this.
        global last_sentence
        user_message = history[-1][0]
        history[-1][1] = ""
        active_block_type = ""
        for chunk in interpreter.chat(user_message, stream=True, display=False):
            # Message chunks: accumulate text; speak once a sentence boundary appears.
            if chunk["type"] == "message" and "content" in chunk:
                if active_block_type != "message":
                    active_block_type = "message"
                history[-1][1] += chunk["content"]
                last_sentence += chunk["content"]
                if any(punct in last_sentence for punct in ".?!\n"):
                    yield history
                    speak(last_sentence)
                    last_sentence = ""
                else:
                    yield history
            # Code chunks: open a fenced code block on the first chunk, then append.
            if chunk["type"] == "code" and "content" in chunk:
                if active_block_type != "code":
                    active_block_type = "code"
                    history[-1][1] += f"\n```{chunk['format']}"
                history[-1][1] += chunk["content"]
                yield history
            # Execution is about to start: close the code fence, open a text fence.
            if chunk["type"] == "confirmation":
                history[-1][1] += "\n```\n\n```text\n"
                yield history
            # Console output from the executed code.
            if chunk["type"] == "console":
                if chunk.get("format") == "output":
                    if chunk["content"] == "KeyboardInterrupt":
                        break
                    history[-1][1] += chunk["content"] + "\n"
                    yield history
                if chunk.get("format") == "active_line" and chunk["content"] is None:
                    # Active line is None when execution finishes; close the output fence.
                    history[-1][1] = history[-1][1].strip()
                    history[-1][1] += "\n```\n"
                    yield history
        # Consistency fix: speak any trailing partial sentence, matching the
        # voice-mode bot() defined earlier in the file.
        if last_sentence:
            speak(last_sentence)

    msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then(
        bot, chatbot, chatbot
    )

demo.queue()
demo.launch(debug=True)