Back to Everyone Can Use English

EdgeTTS

1000-hours/public/jupyter-notebooks/edge-tts-valcab-pronounciation.ipynb

0.7.98.8 KB
Original Source

EdgeTTS

https://github.com/rany2/edge-tts

edge-tts is a Python module that allows you to use Microsoft Edge's online text-to-speech service from within your Python code or using the provided edge-tts or edge-playback command.

python
%pip install edge-tts pygame
python
import edge_tts
import os
import pygame
import time

async def generate_edge_tts_audio(text, file_name, voice='en-US-GuyNeural', style='newscast-formal', verbose=False, play=False, overwrite=False):
    """Synthesize *text* with Microsoft Edge TTS and save it as *file_name*.

    Parameters:
        text: text to synthesize.
        file_name: output MP3 path.
        voice: Edge TTS voice short name (e.g. 'en-US-GuyNeural').
        style: kept for interface compatibility; not passed to
            edge_tts.Communicate here, so it currently has no effect.
        verbose: print progress messages.
        play: play the saved file with pygame after generation.
        overwrite: regenerate even when the file already exists.
    """
    import asyncio  # local import: non-blocking pause below

    communicate = edge_tts.Communicate(text, voice)
    # Skip existing files unless the caller explicitly asked to overwrite.
    if os.path.exists(file_name):
        if overwrite:
            if verbose:
                print(f'{file_name} exists, overwriting...')
        else:
            if verbose:
                print(f'{file_name} exists, skipping...')
            return

    await communicate.save(file_name)
    if play:
        pygame.mixer.init()
        pygame.mixer.music.load(file_name)
        pygame.mixer.music.play()
    if verbose:
        print(f'{file_name} created')

    # Pause so playback can be heard before the next call starts.
    # asyncio.sleep yields to the event loop instead of blocking it
    # the way the original time.sleep(1.5) did inside this coroutine.
    await asyncio.sleep(1.5)
python

voices = ["en-US-GuyNeural", "en-US-AriaNeural", "en-GB-RyanNeural", "en-GB-LibbyNeural"]
regions = ['us', 'us', 'uk', 'uk']
genders = ['male', 'female', 'male', 'female']

# only_us = False
only_us = True
if only_us:
    voices = voices[:2]
    print(voices)

words = """
applying,
carrying,
crying,
denying,
qualifying,
replying,
satisfying,
specifying,
spying,
"""

for word in words.strip().split(','):
    print(word)
    for i, voice in enumerate(voices):
        w = word.strip().lower()
        if len(w) > 0:
            filename = f'../audios/{w.replace(" ", "-")}-{regions[i]}-{genders[i]}.mp3'
            await generate_edge_tts_audio(w, filename, voice=voice, verbose=True, overwrite=False, play=True)

python
def get_openai_tts_audio(text, path, performer='alloy'):
    """Synthesize *text* with OpenAI TTS (model tts-1) and stream the MP3 to *path*."""
    from openai import OpenAI
    from dotenv import load_dotenv

    load_dotenv()
    tts_client = OpenAI(
    )

    request = tts_client.audio.speech.with_streaming_response.create(
        model="tts-1",
        voice=performer,
        input=text.strip()
    )
    with request as response:
        response.stream_to_file(path)
        
def _sentence_slug(sentence):
    """Lowercase dash-slug for a sentence: spaces and ',.?!' become dashes,
    one pass collapses double dashes, and trailing dashes are stripped."""
    return sentence.strip().translate(str.maketrans(',.?! ', '-----')).replace("--", "-").lower().rstrip('-')

sentence = "She will cherish those memories and ever hold them close to her heart."

# remove all punctuation at the end of sentence,
# replace all spaces and punctuations in the sentence with dash
# (the slug chain was duplicated verbatim on two lines; factored into _sentence_slug)
audio_filename_openai = _sentence_slug(sentence) + '_openai.mp3'
audio_filename_msedge = _sentence_slug(sentence) + '_msedge.mp3'

print(audio_filename_openai)
# get_openai_tts_audio(sentence, audio_filename_openai, performer='alloy')
# await generate_edge_tts_audio(sentence, audio_filename_msedge, voice="en-US-GuyNeural", verbose=True, overwrite=True, play=True)

for voice in ["alloy", "nova"]:
    get_openai_tts_audio(sentence, f'../audios/{sentence.replace(" ", "-")}-{voice}.mp3', performer=voice)

python
from openai import OpenAI
import os
import IPython
from datetime import datetime
from mutagen.mp3 import MP3
from mutagen.id3 import ID3, APIC, TPE1, TALB, TCON
from dotenv import load_dotenv
from pydub import AudioSegment

# Load credentials/settings (e.g. the OpenAI API key) from a local .env file
# so the client below can authenticate without hard-coded secrets.
load_dotenv()
client = OpenAI(
)

def get_openai_tts_audio(text, filename, performer="alloy"):
    """Convert markdown *text* to speech with OpenAI TTS and save a combined MP3.

    Each non-empty line is synthesized separately (model tts-1), the clips are
    joined with a 1 s lead-in and 1.5 s of silence between lines, 'ending.mp3'
    is appended, and ID3 metadata plus the 'Artwork.png' cover are attached
    via add_metadata().

    Parameters:
        text: markdown source; converted with markdown_to_text() before TTS.
        filename: output path; '.mp3' is appended when missing. A falsy value
            falls back to a timestamped name including the performer.
        performer: OpenAI TTS voice name.

    Returns:
        A success message string, or None when Artwork.png / ending.mp3 are
        missing from the working directory.
    """
    # Both asset files must exist before any API calls are made.
    if not os.path.isfile('Artwork.png') or not os.path.isfile('ending.mp3'):
        print("Either Artwork.png or ending.mp3 file not found.")
        return

    # Split into lines and drop the empty ones.
    lines = [t for t in markdown_to_text(text).split("\n") if t]

    # Synthesize one temp file per line. enumerate() yields a unique index
    # for every line — the original list.index(t) returned the FIRST match,
    # so duplicate lines overwrote each other's temp files (and it was O(n^2)).
    for i, line in enumerate(lines):
        speech_file_path = f'temp-{i}.mp3'
        rspd_audio = client.audio.speech.create(
            model="tts-1",
            voice=performer,
            input=line.strip()
        )
        rspd_audio.stream_to_file(speech_file_path)
        # Progress percentage, rewritten in place on one line.
        print(f"\rprocessing: {round((i + 1) / len(lines) * 100)}%", end='...')
    print("\n")

    # Stitch: 1 s of leading silence, then each clip followed by 1.5 s of
    # silence; delete each temp file as soon as it has been consumed.
    temp_audio = AudioSegment.silent(duration=1000)
    for i in range(len(lines)):
        seg = AudioSegment.from_file(f'temp-{i}.mp3')
        temp_audio += seg + AudioSegment.silent(duration=1500)
        os.remove(f'temp-{i}.mp3')
    temp_audio.export('~temp.mp3', format='mp3')
    speech = AudioSegment.from_file('~temp.mp3')
    ending = AudioSegment.from_file('ending.mp3')
    combined = speech + ending
    os.remove('~temp.mp3')

    if filename:
        # If filename has no extension, add .mp3. (The original else branch
        # discarded the requested name and wrote a literal '(unknown).mp3'.)
        if filename.endswith('.mp3'):
            speech_file_path = filename
        else:
            speech_file_path = f'{filename}.mp3'
    else:
        speech_file_path = f'{datetime.now().strftime("%Y%m%d_%H%M%S")}_{performer}.mp3'
    combined.export(speech_file_path, format='mp3')
    print(f"Audio file saved as {speech_file_path}")

    image_file = 'Artwork.png'
    artist = 'tts'
    album = 'Daily Speech Training'
    genre = 'SPEECH'

    add_metadata(speech_file_path, image_file, artist, album, genre)
    IPython.display.Audio(speech_file_path)

    return f'{speech_file_path} created successfully.'


English Voices

  • voice = "en-US-GuyNeural" (Male)
  • voice = "en-US-AnaNeural" (Female)
  • voice = "en-US-AndrewNeural" (Male)
  • voice = "en-US-AriaNeural" (Female)
  • voice = "en-US-AvaNeural" (Female)
  • voice = "en-US-BrianNeural" (Male)
  • voice = "en-US-ChristopherNeural" (Male)
  • voice = "en-US-EmmaNeural" (Female)
  • voice = "en-US-EricNeural" (Male)
  • voice = "en-US-GuyNeural" (Male)
  • voice = "en-US-JennyNeural" (Female)
  • voice = "en-US-MichelleNeural" (Female)
  • voice = "en-US-RogerNeural" (Male)
  • voice = "en-US-SteffanNeural" (Male)
  • voice = "en-GB-LibbyNeural" (Female)
  • voice = "en-GB-MaisieNeural" (Female)
  • voice = "en-GB-RyanNeural" (Male)
  • voice = "en-GB-SoniaNeural" (Female)
  • voice = "en-GB-ThomasNeural" (Male)
  • voice = "en-AU-NatashaNeural" (Female)
  • voice = "en-AU-WilliamNeural" (Male)
  • voice = "en-CA-ClaraNeural" (Female)
  • voice = "en-CA-LiamNeural" (Male)
python
# generate sentences using edge-tts
sentences = """
It's our pleasure.
"""
# split the sentences into lines
sentences = [s for s in sentences.strip().split("\n") if s]
for sentence in sentences:
    # Alternative naming scheme: 'sentence-' plus the 2nd and 3rd words.
    # filename = f'../audios/sentence-{sentence.split(" ")[1]}-{sentence.split(" ")[2]}.mp3'
    # for voice in ["en-US-GuyNeural", "en-US-AriaNeural"]:
    for voice in ["alloy", "nova"]:
        # Build the slug outside the f-string: nesting the same quote type
        # inside an f-string is only valid on Python 3.12+.
        slug = sentence.rstrip(",.!?").replace("'", "").replace(" ", "-")
        filename = f'../audios/{slug}-us-{voice}.mp3'.replace("alloy", "male").replace("nova", "female")
        # edge-tts variant: replace("en-US-GuyNeural", "male").replace("en-US-AriaNeural", "female")
        print(filename)
        # await generate_edge_tts_audio(sentence, filename, voice=voice, verbose=True, overwrite=True, play=True)
        get_openai_tts_audio(sentence, filename, performer=voice)

    # get_openai_tts_audio(sentence, f'../audios/{sentence.replace(" ", "-")}-alloy.mp3', performer='alloy')
    # get_openai_tts_audio(sentence, f'../audios/{sentence.replace(" ", "-")}-nova.mp3', performer='nova')

python
def read_lines_from_file(file_path):
    """Return all lines of *file_path* (trailing newlines kept, per readlines()).

    Decodes as UTF-8 explicitly so the result does not depend on the
    platform's default locale encoding.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.readlines()

import re

words = ''
# Collect vocabulary words: numbered ('1.') or bulleted ('* ') markdown list
# items that carry a '<span>' pronunciation tag.
lines = read_lines_from_file('../../sounds-of-american-english/6-vocabulary.md')
for line in lines:
    if (re.match(r'^\d+\.', line) or re.match(r'^\s*\*\s', line)) and 'span>' in line:
        # Strip only the LEADING list marker. The original second alternative
        # (r'\s*\*\s*') was unanchored, so re.sub also deleted asterisks
        # appearing anywhere later in the line.
        line = re.sub(r'^\d+\.\s*|^\s*\*\s*', '', line)
        # first whitespace-separated token is the vocabulary word
        word = line.split(' ')[0]
        # accumulate as a comma-separated string
        words += word + ','
print(words)






python
voices = ["en-US-GuyNeural", "en-US-AriaNeural", "en-GB-RyanNeural", "en-GB-LibbyNeural"]
regions = ['us', 'us', 'uk', 'uk']
genders = ['male', 'female', 'male', 'female']

# only_us = False
only_us = True
if only_us:
    voices = voices[:2]
    print(voices)

for word in words.strip().split(','):
    print(word)
    for i, voice in enumerate(voices):
        w = word.strip().lower()
        if len(w) > 0:
            filename = f'../audios/{w.replace(" ", "-")}-{regions[i]}-{genders[i]}.mp3'
            await generate_edge_tts_audio(w, filename, voice=voice, verbose=True, overwrite=False, play=True)