1000-hours/public/jupyter-notebooks/edge-tts-valcab-pronounciation.ipynb
https://github.com/rany2/edge-tts
edge-tts is a Python module that allows you to use Microsoft Edge's online text-to-speech service from within your Python code or using the provided edge-tts or edge-playback command.
%pip install edge-tts pygame
import edge_tts
import os
import pygame
import time
async def generate_edge_tts_audio(text, file_name, voice='en-US-GuyNeural', style='newscast-formal', verbose=False, play=False, overwrite=False):
    """Synthesize *text* with edge-tts and save the audio to *file_name*.

    Args:
        text: Text to be spoken.
        file_name: Destination path of the audio file.
        voice: Voice name handed to ``edge_tts.Communicate``.
        style: Kept for backward compatibility; this function does not use it.
        verbose: When true, print what happens to the file.
        play: When true, start playing the saved file through pygame.
        overwrite: When true, regenerate the file even if it already exists.

    Returns:
        None. If the file already exists and *overwrite* is false, nothing is
        synthesized and the function returns immediately.
    """
    # Local import so the function does not rely on a file-level asyncio import.
    import asyncio

    # Decide whether any work is needed before building the TTS request.
    if os.path.exists(file_name):
        if not overwrite:
            if verbose:
                print(f'{file_name} exists, skipping...')
            # Skip regardless of verbosity: the flag only controls printing.
            return
        if verbose:
            print(f'{file_name} exists, overwriting...')

    communicate = edge_tts.Communicate(text, voice)
    await communicate.save(file_name)

    if play:
        pygame.mixer.init()
        pygame.mixer.music.load(file_name)
        pygame.mixer.music.play()
    if verbose:
        print(f'{file_name} created')
    # Pause before returning (presumably so a started clip can be heard before
    # the next one is generated). asyncio.sleep is used instead of time.sleep
    # so the event loop is not blocked while waiting.
    await asyncio.sleep(1.5)
# Parallel lists: regions[i] and genders[i] label voices[i] in the file names.
regions = ['us', 'us', 'uk', 'uk']
genders = ['male', 'female', 'male', 'female']
voices = ["en-US-GuyNeural", "en-US-AriaNeural", "en-GB-RyanNeural", "en-GB-LibbyNeural"]

# Set to False to keep all four voices; True keeps only the first two (US) ones.
only_us = True
voices = voices[:2] if only_us else voices
print(voices)
words = """
applying,
carrying,
crying,
denying,
qualifying,
replying,
satisfying,
specifying,
spying,
"""
for word in words.strip().split(','):
print(word)
for i, voice in enumerate(voices):
w = word.strip().lower()
if len(w) > 0:
filename = f'../audios/{w.replace(" ", "-")}-{regions[i]}-{genders[i]}.mp3'
await generate_edge_tts_audio(w, filename, voice=voice, verbose=True, overwrite=False, play=True)
def get_openai_tts_audio(text, path, performer='alloy'):
    """Stream an OpenAI "tts-1" rendering of *text* into the file at *path*.

    Args:
        text: Text to speak; surrounding whitespace is stripped before sending.
        path: Destination file for the streamed audio.
        performer: OpenAI voice name.
    """
    from dotenv import load_dotenv
    from openai import OpenAI

    # Load the .env file before the client is created.
    load_dotenv()
    speech_api = OpenAI().audio.speech
    request = speech_api.with_streaming_response.create(
        model="tts-1",
        voice=performer,
        input=text.strip(),
    )
    with request as response:
        response.stream_to_file(path)
sentence = "She will cherish those memories and ever hold them close to her heart."

# Build both file names from the same slug: punctuation and spaces become
# dashes, doubled dashes are collapsed, the result is lower-cased and any
# trailing dash is dropped before the engine suffix is appended.
audio_filename_openai, audio_filename_msedge = (
    sentence.strip().translate(str.maketrans(',.?! ', '-----')).replace("--", "-").lower().rstrip('-') + suffix
    for suffix in ('_openai.mp3', '_msedge.mp3')
)
print(audio_filename_openai)

# Single-engine alternatives, kept for reference:
# get_openai_tts_audio(sentence, audio_filename_openai, performer='alloy')
# await generate_edge_tts_audio(sentence, audio_filename_msedge, voice="en-US-GuyNeural", verbose=True, overwrite=True, play=True)

for voice in ("alloy", "nova"):
    target = f'../audios/{sentence.replace(" ", "-")}-{voice}.mp3'
    get_openai_tts_audio(sentence, target, performer=voice)
from openai import OpenAI
import os
import IPython
from datetime import datetime
from mutagen.mp3 import MP3
from mutagen.id3 import ID3, APIC, TPE1, TALB, TCON
from dotenv import load_dotenv
from pydub import AudioSegment
# Load variables from a .env file first, then create the shared OpenAI client
# with no explicit arguments (presumably it picks its credentials up from the
# environment - confirm against the openai package documentation).
load_dotenv()
client = OpenAI()
def get_openai_tts_audio(text, filename, performer="alloy"):
    """Render *text* line by line with OpenAI TTS and stitch it into one MP3.

    Each non-blank line is synthesized separately, the clips are joined with
    pauses in between, 'ending.mp3' is appended, and the result is exported and
    tagged via ``add_metadata``. ``markdown_to_text`` and ``add_metadata`` are
    helpers defined elsewhere in the notebook.

    Args:
        text: Source text; passed through ``markdown_to_text`` and split on
            newlines.
        filename: Output path. '.mp3' is appended when missing; a falsy value
            selects a timestamped name that includes *performer*.
        performer: OpenAI voice name.

    Returns:
        A '<path> created successfully.' message, or None when 'Artwork.png'
        or 'ending.mp3' is missing from the working directory.
    """
    # Both assets are needed later (cover art and closing clip); bail out early.
    if not os.path.isfile('Artwork.png') or not os.path.isfile('ending.mp3'):
        print("Either Artwork.png or ending.mp3 file not found.")
        return

    # Keep only lines with real content: a whitespace-only line would be sent
    # to the API as an empty string.
    lines = [line for line in markdown_to_text(text).split("\n") if line.strip()]

    # Number the temp files by position. Looking the index up with
    # list.index() maps duplicate lines to the same file, which is then
    # deleted once and missing on the second read.
    segment_paths = []
    for position, line in enumerate(lines):
        segment_path = f'temp-{position}.mp3'
        rspd_audio = client.audio.speech.create(
            model="tts-1",
            voice=performer,
            input=line.strip()
        )
        rspd_audio.stream_to_file(segment_path)
        segment_paths.append(segment_path)
        # Progress percentage, rewritten in place on a single line.
        print(f"\rprocessing: {round((position + 1) / len(lines) * 100)}%", end='...')
    print("\n")

    # Start with 1 second of silence, then add each clip followed by 1.5 seconds.
    temp_audio = AudioSegment.silent(duration=1000)
    for segment_path in segment_paths:
        seg = AudioSegment.from_file(segment_path)
        temp_audio += seg + AudioSegment.silent(duration=1500)
        os.remove(segment_path)

    # Round-trip through a temporary MP3 before appending the ending clip
    # (kept from the original so the exported audio is unchanged).
    temp_audio.export('~temp.mp3', format='mp3')
    speech = AudioSegment.from_file('~temp.mp3')
    ending = AudioSegment.from_file('ending.mp3')
    combined = speech + ending
    os.remove('~temp.mp3')

    if filename:
        # Add the .mp3 extension when the caller left it off.
        speech_file_path = filename if filename.endswith('.mp3') else f'{filename}.mp3'
    else:
        speech_file_path = f'{datetime.now().strftime("%Y%m%d_%H%M%S")}_{performer}.mp3'

    combined.export(speech_file_path, format='mp3')
    print(f"Audio file saved as {speech_file_path}")

    image_file = 'Artwork.png'
    artist = 'tts'
    album = 'Daily Speech Training'
    genre = 'SPEECH'
    add_metadata(speech_file_path, image_file, artist, album, genre)

    # A bare Audio(...) expression inside a function is discarded; it has to
    # be passed to display() for the player to show up in the notebook.
    IPython.display.display(IPython.display.Audio(speech_file_path))

    return f'{speech_file_path} created successfully.'
# Generate one audio file per sentence and OpenAI voice.
sentences = """
It's our pleasure.
"""

# One sentence per non-empty line.
sentences = [s for s in sentences.strip().split("\n") if s]

# File names carry a gender label instead of the OpenAI voice name.
voice_genders = {"alloy": "male", "nova": "female"}

for sentence in sentences:
    # Build the slug outside the f-string: nesting "'" inside a single-quoted
    # f-string only parses on Python 3.12+.
    slug = sentence.rstrip(",.!?").replace("'", "").replace(" ", "-")
    for voice in ["alloy", "nova"]:
        # Look the label up per voice. Replacing "alloy"/"nova" across the
        # whole file name would also rewrite sentences containing those
        # letters (e.g. "innovation").
        filename = f'../audios/{slug}-us-{voice_genders[voice]}.mp3'
        print(filename)
        # edge-tts alternative (voices "en-US-GuyNeural" / "en-US-AriaNeural"):
        # await generate_edge_tts_audio(sentence, filename, voice=voice, verbose=True, overwrite=True, play=True)
        get_openai_tts_audio(sentence, filename, performer=voice)
def read_lines_from_file(file_path):
    """Return the lines of the text file at *file_path*, newlines included.

    The file is decoded as UTF-8 explicitly so the result does not depend on
    the platform's default encoding.

    Args:
        file_path: Path of the file to read.

    Returns:
        A list with one string per line, as produced by ``readlines()``.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.readlines()
import re
# Collect the first word of every list entry in the vocabulary markdown file
# into a comma-terminated string.
words = ''
lines = read_lines_from_file('../../sounds-of-american-english/6-vocabulary.md')

# Leading list marker: "12. " or " * ". Both alternatives are anchored to the
# start of the line so asterisks later in the line are left untouched.
list_marker = re.compile(r'^(?:\d+\.\s*|\s*\*\s*)')

for line in lines:
    # Only numbered or bulleted list items that contain a span tag qualify.
    if (re.match(r'^\d+\.', line) or re.match(r'^\s*\*\s', line)) and 'span>' in line:
        line = list_marker.sub('', line)
        # The entry's word is everything up to the first space.
        word = line.split(' ')[0]
        words += word + ','
print(words)
# Parallel lists: regions[i] and genders[i] label voices[i] in the file names.
regions = ['us', 'us', 'uk', 'uk']
genders = ['male', 'female', 'male', 'female']
voices = ["en-US-GuyNeural", "en-US-AriaNeural", "en-GB-RyanNeural", "en-GB-LibbyNeural"]

# Set to False to keep all four voices; True keeps only the first two (US) ones.
only_us = True
voices = voices[:2] if only_us else voices
print(voices)
for word in words.strip().split(','):
print(word)
for i, voice in enumerate(voices):
w = word.strip().lower()
if len(w) > 0:
filename = f'../audios/{w.replace(" ", "-")}-{regions[i]}-{genders[i]}.mp3'
await generate_edge_tts_audio(w, filename, voice=voice, verbose=True, overwrite=False, play=True)