Back to Open Interpreter

Welcome to JARVIS.

examples/JARVIS.ipynb

0.4.27.9 KB
Original Source
python
# Welcome to JARVIS.

# The core of JARVIS is powered by Open Interpreter:

# Stream the reply chunk-by-chunk instead of waiting for the full answer.
for chunk in interpreter.chat("What's 34/24?", stream=True, display=False):
  print(chunk)

# (This cell is for demonstration purposes. Do not run it until you've set up JARVIS below.)

Install (You must run ❗️Runtime > Restart Session❗️ after this)

python
# Open Interpreter: the LLM agent that generates and runs code for JARVIS.
!pip install open-interpreter
python
# Whisper: local speech-to-text used to transcribe microphone input.
!pip install git+https://github.com/openai/whisper.git -q
# Gradio pinned to 3.50: this notebook uses the gradio 3.x API (gr.inputs, etc.).
!pip install gradio==3.50 -q
# ElevenLabs: text-to-speech for JARVIS's spoken replies.
!pip install elevenlabs -q

Set your API Keys

python
# Paste your own keys below before running the rest of the notebook.
eleven_labs_api_key = "<your_api_key>" # https://elevenlabs.io/speech-synthesis
openai_api_key = "<your_api_key>" # https://platform.openai.com/account/api-keys

Setup

Misc Imports

python
import gradio as gr
import time

Open Interpreter

python
from interpreter import interpreter

# Authenticate the LLM backend, and let generated code run without asking for
# confirmation — required for a hands-free voice assistant.
interpreter.llm.api_key = openai_api_key
interpreter.auto_run = True

Whisper

python
import whisper
# "base" is a small, fast Whisper checkpoint; larger ones transcribe more
# accurately but load and run slower.
model = whisper.load_model("base")
python
def transcribe(audio):
    """Transcribe an audio file with Whisper and return the recognized text.

    Parameters
    ----------
    audio : str
        Path to an audio file (Gradio's microphone component passes a filepath).

    Returns
    -------
    str
        The decoded transcription.
    """
    # Load audio and pad/trim it to the 30-second window Whisper expects.
    audio = whisper.load_audio(audio)
    audio = whisper.pad_or_trim(audio)

    # Make a log-Mel spectrogram and move it to the same device as the model.
    mel = whisper.log_mel_spectrogram(audio).to(model.device)

    # Decode with default options. (The original also called
    # model.detect_language() here, but its result was never used, so that
    # extra forward pass has been dropped.)
    options = whisper.DecodingOptions()
    result = whisper.decode(model, mel, options)
    return result.text

ElevenLabs

python
from elevenlabs import generate, play, set_api_key

# Register the key globally so generate()/play() below are authenticated.
set_api_key(eleven_labs_api_key)
python
import io
from pydub import AudioSegment

def get_audio_length(audio_bytes):
  """Return the duration, in seconds, of an MP3 supplied as raw bytes."""
  # Wrap the raw bytes in a file-like object so PyDub can decode them.
  segment = AudioSegment.from_mp3(io.BytesIO(audio_bytes))
  # len() of an AudioSegment is its duration in milliseconds.
  return len(segment) / 1000.0
python
def speak(text):
  """Synthesize `text` with ElevenLabs and play it, blocking until it finishes.

  play(..., notebook=True) returns immediately, so we sleep for the clip's
  duration to keep one sentence from overlapping the next.
  (The original also set an unused local `speaking = True`; removed.)
  """
  audio = generate(
      text=text,
      voice="Daniel"
  )
  play(audio, notebook=True)

  # Block until the audio has (approximately) finished playing.
  audio_length = get_audio_length(audio)
  time.sleep(audio_length)

Run

python
# @title JARVIS
# @markdown ### **Setup Instructions**
# @markdown 1. Run this cell, then scroll down to use the interface (don't click the link, and **give the interface 60 seconds to load**).
# @markdown 2. Press the `Record from Microphone` button.
# @markdown 3. Allow access to your microphone, then speak your command.
# @markdown 4. Stop the recording, then press `Submit`.
# @markdown
# @markdown
# @markdown JARVIS will respond verbally + carry out your command.

# Buffer for the sentence currently being assembled; shared with bot() so the
# reply can be voiced sentence-by-sentence as it streams in.
last_sentence = ""

with gr.Blocks() as demo:

    chatbot = gr.Chatbot()
    # NOTE(review): gr.inputs is the deprecated gradio 3.x namespace. It works
    # with the pinned gradio==3.50 but would need gr.Audio(...) under gradio 4+.
    audio_input = gr.inputs.Audio(source="microphone", type="filepath")
    btn = gr.Button("Submit")

    def add_user_message(audio, history):
        """Transcribe the recording (module-level transcribe()) and append it
        to the chat history as a user turn with no bot reply yet."""
        user_message = transcribe(audio)
        return history + [[user_message, None]]

    def bot(history):
        """Stream Open Interpreter's reply into the last history entry.

        Yields the updated history after every chunk so Gradio re-renders
        incrementally; completed sentences are spoken via speak().
        """
        global last_sentence

        user_message = history[-1][0]
        history[-1][1] = ""
        active_block_type = ""
        for chunk in interpreter.chat(user_message, stream=True, display=False):

            # This was written before chunks carried "start"/"end" flags;
            # block boundaries are tracked manually here and the flags are
            # ignored. A rewrite should use the flags instead.

            # Message text from the model.
            if chunk["type"] == "message" and "content" in chunk:
              if active_block_type != "message":
                active_block_type = "message"
              history[-1][1] += chunk["content"]

              last_sentence += chunk["content"]
              # Speak as soon as a sentence-ending character arrives.
              if any(punct in last_sentence for punct in ".?!\n"):
                yield history
                speak(last_sentence)
                last_sentence = ""
              else:
                yield history

            # Code the model wants to run — open a fenced block on transition.
            if chunk["type"] == "code" and "content" in chunk:
              if active_block_type != "code":
                active_block_type = "code"
                history[-1][1] += f"\n```{chunk['format']}"
              history[-1][1] += chunk["content"]
              yield history

            # Execution output.
            if chunk["type"] == "confirmation":
              # Close the code fence, open a text fence for console output.
              history[-1][1] += "\n```\n\n```text\n"
              yield history
            if chunk["type"] == "console":
              if chunk.get("format") == "output":
                if chunk["content"] == "KeyboardInterrupt":
                  break
                history[-1][1] += chunk["content"] + "\n"
                yield history
              if chunk.get("format") == "active_line" and chunk["content"] is None:
                # Active line is None when execution finishes.
                # (Could also be detected with "type": "console", "end": True.)
                history[-1][1] = history[-1][1].strip()
                history[-1][1] += "\n```\n"
                yield history

        # Voice any trailing partial sentence, then reset the buffer so the
        # stale tail isn't re-spoken on the next message (fixes a bug in the
        # original, which never cleared it here).
        if last_sentence:
          speak(last_sentence)
          last_sentence = ""

    btn.click(add_user_message, [audio_input, chatbot], [chatbot]).then(
        bot, chatbot, chatbot
    )

demo.queue()
demo.launch(debug=True)
python
# @title Text-only JARVIS
# @markdown Run this cell for a ChatGPT-like interface.

with gr.Blocks() as demo:
    chatbot = gr.Chatbot()
    msg = gr.Textbox()

    def user(user_message, history):
        """Append the typed message as a new user turn and clear the textbox."""
        return "", history + [[user_message, None]]

    def bot(history):
        """Stream Open Interpreter's reply into the last history entry,
        yielding after every chunk so Gradio re-renders incrementally.

        Text-only variant: nothing is spoken. (The original copy-pasted the
        voice cell's speech code but referenced `last_sentence` without
        initializing it or declaring it global, which raised
        UnboundLocalError on the first message chunk.)
        """
        user_message = history[-1][0]
        history[-1][1] = ""
        active_block_type = ""

        for chunk in interpreter.chat(user_message, stream=True, display=False):

            # Message text from the model.
            if chunk["type"] == "message" and "content" in chunk:
              if active_block_type != "message":
                active_block_type = "message"
              history[-1][1] += chunk["content"]
              yield history

            # Code the model wants to run — open a fenced block on transition.
            if chunk["type"] == "code" and "content" in chunk:
              if active_block_type != "code":
                active_block_type = "code"
                history[-1][1] += f"\n```{chunk['format']}"
              history[-1][1] += chunk["content"]
              yield history

            # Execution output.
            if chunk["type"] == "confirmation":
              # Close the code fence, open a text fence for console output.
              history[-1][1] += "\n```\n\n```text\n"
              yield history
            if chunk["type"] == "console":
              if chunk.get("format") == "output":
                if chunk["content"] == "KeyboardInterrupt":
                  break
                history[-1][1] += chunk["content"] + "\n"
                yield history
              if chunk.get("format") == "active_line" and chunk["content"] is None:
                # Active line is None when execution finishes.
                history[-1][1] = history[-1][1].strip()
                history[-1][1] += "\n```\n"
                yield history

    msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then(
        bot, chatbot, chatbot
    )

demo.queue()
demo.launch(debug=True)