This file was created by jhlfrfufyfn to choose a speaker from the Belarusian Mozilla Common Voice corpus - Tts

python

# This file was created by jhlfrfufyfn to choose a speaker from the Belarusian Mozilla Common Voice corpus
#
#
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import librosa

python

# Unpack the corpus archive cv-corpus-12.0-2022-12-07-be.tar.gz (currently disabled)
# import tarfile
# tar = tarfile.open("cv-corpus-12.0-2022-12-07-be.tar.gz", "r:gz")
# tar.extractall()
# tar.close()

# Root folder the corpus is read from, and folder the filtered dataset is written to.
corpuspath = "/a/cv-corpus"
outputpath = "/storage/filtered_dataset"

python

# Load the tab-separated table of validated clips from the 'be' subfolder of the corpus.
df = pd.read_csv(
    f"{corpuspath}/be/validated.tsv",
    sep="\t",
    low_memory=False,
)
df

python

# Drop the age, accents, gender, variant, locale and segment columns.
df = df.drop(columns=['age', 'accents', 'gender', 'variant', 'locale', 'segment'])
df

python

# Per-column count of non-null values among records with at least one down vote.
df.loc[df['down_votes'].gt(0)].count()

python

# Per-column count of non-null values among records without any up vote.
df.loc[df['up_votes'].eq(0)].count()

python

# Keep only records that have no down votes and at least one up vote.
df = df[(df['down_votes'] == 0) & (df['up_votes'] > 0)]
df

python

# The vote columns have served their purpose as a filter; remove them.
df = df.drop(columns=['down_votes', 'up_votes'])
df

python

# Count records per speaker (client_id) and order the speakers by their
# number of non-null 'path' values, largest first.
df_sorted = (
    df.groupby('client_id')
    .count()
    .sort_values('path', ascending=False)
)
df_sorted

python

# The ten speakers with the most records (first ten rows of the sorted counts).
top_10_speakers = df_sorted.iloc[:10]
top_10_speakers

python

# get random paths to audio files of one speaker
def get_speaker_audio_list(speaker_id, n=10):
    """Return up to ``n`` randomly chosen clip paths recorded by one speaker.

    Args:
        speaker_id: value of the ``client_id`` column identifying the speaker.
        n: maximum number of clips to sample (without replacement).

    Returns:
        A list with the ``path`` values of the sampled rows. The list is
        shorter than ``n`` when the speaker has fewer than ``n`` rows and
        empty when the speaker has none.
    """
    speaker_rows = df[df['client_id'] == speaker_id]
    # DataFrame.sample raises ValueError when asked for more rows than exist,
    # so cap the request at what this speaker actually has.
    sample_size = min(n, len(speaker_rows))
    return speaker_rows.sample(sample_size)['path'].tolist()

python

# CHOOSE: position within top_10_speakers of the speaker to work with
speaker_index = 0
chosen_speaker_id = top_10_speakers.index[speaker_index]
speaker_audio_list = get_speaker_audio_list(chosen_speaker_id)
print(speaker_audio_list)

python

# Render an audio player for every sampled clip of the chosen speaker.
# Clip files are read from <corpuspath>/be/clips.
import IPython.display as ipd
for clip_name in speaker_audio_list:
    clip_path = corpuspath + '/be/clips/' + clip_name
    # Call display() through the imported module rather than relying on the
    # bare name being injected into the namespace by the notebook front end.
    ipd.display(ipd.Audio(clip_path))

python

# 0 is pretty good
# 1 is bad
# 2 is partly 0, other are different
# 3 is bad
# 4 is pretty fast and clear, but not good
# 5 is echoing, sometimes mic cracks
# 6 is really slow and clear, but accent?
# 7 has a lot of intonation, but is pretty clear
# 8 is clear and slow, sometimes little mic crack
# 9 has background noise, whispering

# options: 0, 6, 8

python

# calculate speech rate in words per minute for one speaker
def get_speech_rate(speaker_id, n_samples=1000):
    """Estimate a speaker's speech rate in words per minute.

    A random sample of the speaker's clips is taken; for each clip the number
    of whitespace-separated words in ``sentence`` is divided by the clip
    duration reported by ``librosa.get_duration`` and scaled to one minute.

    Args:
        speaker_id: value of the ``client_id`` column identifying the speaker.
        n_samples: maximum number of clips to measure. Speakers with fewer
            clips are measured on all of them.

    Returns:
        The mean of the per-clip speech rates, or NaN when the speaker has
        no clips.
    """
    df_speaker = df[df['client_id'] == speaker_id]
    # sample() raises ValueError if asked for more rows than are available,
    # so never request more than this speaker has.
    sample = df_speaker.sample(min(n_samples, len(df_speaker)))
    if sample.empty:
        return float('nan')
    # Work on standalone Series instead of adding columns to a derived frame.
    durations = sample['path'].apply(
        lambda x: librosa.get_duration(path=corpuspath + '/be/clips/' + x)
    )
    word_counts = sample['sentence'].apply(lambda x: len(x.split()))
    # words / seconds * 60 -> words per minute, averaged over the clips
    return (word_counts / durations * 60).mean()

python

# Report the speech rate (words per minute) of the chosen speaker.
speech_rate = get_speech_rate(top_10_speakers.index[speaker_index])
print(f'Speech rate for speaker {speaker_index}: ', speech_rate)

python

def get_average_duration(df_speaker, n_samples=1000):
    """Estimate the mean clip duration of a speaker from a random sample.

    Args:
        df_speaker: DataFrame with a ``path`` column holding clip file names
            relative to ``<corpuspath>/be/clips``.
        n_samples: maximum number of clips to measure. Frames with fewer rows
            are measured in full.

    Returns:
        The mean of the durations reported by ``librosa.get_duration`` for
        the sampled clips, or NaN when ``df_speaker`` is empty.
    """
    # sample() raises ValueError if asked for more rows than are available,
    # so never request more than the frame holds.
    sample = df_speaker.sample(min(n_samples, len(df_speaker)))
    if sample.empty:
        return float('nan')
    durations = sample['path'].apply(
        lambda x: librosa.get_duration(path=corpuspath + '/be/clips/' + x)
    )
    return durations.mean()

python

# All records of the chosen speaker.
df_speaker = df.loc[df['client_id'] == top_10_speakers.index[speaker_index]]

# Extrapolate the sampled mean clip duration to the speaker's full clip count.
avg_duration = get_average_duration(df_speaker)
avg_total_duration = avg_duration * len(df_speaker)
total_hours = avg_total_duration/60.0/60.0
print(f'Average duration for speaker {speaker_index}: ', avg_duration, ", average total duration(hours): ",(total_hours))

python

# Records of the chosen speaker, without the client_id column.
df_speaker = df.loc[df['client_id'] == top_10_speakers.index[speaker_index]].drop(columns=['client_id'])

# Keep only the last rows of the frame: as many files as are estimated
# (from avg_duration) to add up to limit_hours of audio.
limit_hours = 30
limit_files = round(limit_hours * 60 * 60 / avg_duration)
df_speaker = df_speaker.tail(limit_files)

df_speaker

python

# # move all files of that speaker to another folder
# # use multiprocessing to speed up
# # add progress bar
# from tqdm import tqdm
# import multiprocessing
# from multiprocessing import Pool
# import shutil

# def move_file(file):
#     shutil.move(corpuspath+'/be/clips/' + file, corpuspath+'/be/speaker_0/' + file)

# # get list of files to move
# files = df_speaker['path'].values.tolist()

# # move files
# with Pool(multiprocessing.cpu_count()) as p:
#     r = list(tqdm(p.imap(move_file, files), total=len(files)))

python

# Empty the output folder (or create it), then save the transcript lines to CSV.
if os.path.isdir(outputpath):
    # Collect the entries first and close the scandir iterator deterministically,
    # so the directory is not modified while it is still being iterated.
    with os.scandir(outputpath) as entries:
        stale_paths = [entry.path for entry in entries]
    for stale_path in stale_paths:
        os.remove(stale_path)
else:
    # makedirs also creates missing parent folders, which os.mkdir would not.
    os.makedirs(outputpath)

# Raw string: '\.' is not a valid escape sequence in a regular string literal.
df_speaker['path2'] = df_speaker['path'].str.replace(r'\.mp3$', '.wav', regex=True)
# One "<wav file name>|<sentence>" line per clip, without header or index.
df_speaker[['path2', 'sentence']].to_csv(outputpath + '/df_speaker.csv', sep='|', header=False, index=False)

python

# Convert the selected speaker's mp3 clips to 22050 Hz wav files in outputpath, using multiprocessing and a tqdm progress bar
import multiprocessing
from multiprocessing import Pool
from tqdm import tqdm
from pydub import AudioSegment

def convert_mp3_to_wav(file, frame_rate=22050):
    """Convert one mp3 clip from the corpus into a wav file in the output folder.

    Args:
        file: clip file name relative to ``<corpuspath>/be/clips``.
        frame_rate: frame rate passed to ``set_frame_rate`` before export;
            defaults to 22050.

    The result is written to ``<outputpath>/<file name without extension>.wav``.
    """
    sound = AudioSegment.from_mp3(corpuspath + '/be/clips/' + file)
    sound = sound.set_frame_rate(frame_rate)
    # splitext removes the real extension instead of assuming it is exactly
    # four characters long.
    stem = os.path.splitext(file)[0]
    sound.export(outputpath + '/' + stem + '.wav', format='wav')

# File names of the clips to convert, in dataframe order.
files = df_speaker['path'].tolist()

# Run the conversion in a process pool with one worker per CPU; tqdm shows
# progress as results come back from imap.
worker_count = multiprocessing.cpu_count()
with Pool(worker_count) as pool:
    r = list(tqdm(pool.imap(convert_mp3_to_wav, files), total=len(files)))