recipes/bel-alex73/choose_speaker.ipynb
# This file was created by jhlfrfufyfn to choose a speaker from the Belarusian Mozilla Voice corpus
#
#
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import librosa
# unpack the tar.gz archive cv-corpus-12.0-2022-12-07-be.tar.gz
# import tarfile
# tar = tarfile.open("cv-corpus-12.0-2022-12-07-be.tar.gz", "r:gz")
# tar.extractall()
# tar.close()
# Where the extracted corpus lives and where the filtered dataset is written.
corpuspath = '/a/cv-corpus'
outputpath = '/storage/filtered_dataset'

# Load the table of validated clips (tab-separated).
validated_tsv = f'{corpuspath}/be/validated.tsv'
df = pd.read_csv(validated_tsv, sep='\t', low_memory=False)
df

# Discard the metadata columns that are not used when picking a speaker.
unused_columns = ['age', 'accents', 'gender', 'variant', 'locale', 'segment']
df = df.drop(columns=unused_columns)
df

# How many records have at least one down-vote?
has_down_votes = df['down_votes'] > 0
df[has_down_votes].count()

# How many records have no up-votes at all?
no_up_votes = df['up_votes'] == 0
df[no_up_votes].count()

# Keep only records with zero down-votes and at least one up-vote.
df = df[(df['down_votes'] == 0) & (df['up_votes'] > 0)]
df

# The vote columns are no longer needed after filtering.
df = df.drop(columns=['down_votes', 'up_votes'])
df

# Count records per speaker (client_id) and order by the 'path' count, largest first.
records_per_speaker = df.groupby('client_id').count()
df_sorted = records_per_speaker.sort_values(by='path', ascending=False)
df_sorted

# The ten speakers with the most records.
top_10_speakers = df_sorted.head(10)
top_10_speakers
# get n random audio file paths (ten by default) for the given speaker
def get_speaker_audio_list(speaker_id, n=10):
    """Return up to ``n`` randomly chosen clip paths recorded by one speaker.

    Args:
        speaker_id: Value of the ``client_id`` column identifying the speaker.
        n: Maximum number of paths to return.

    Returns:
        A list of values from the ``path`` column. It holds fewer than ``n``
        items when the speaker has fewer than ``n`` rows in ``df``.
    """
    speaker_rows = df[df['client_id'] == speaker_id]
    # DataFrame.sample draws without replacement by default and raises
    # ValueError when asked for more rows than exist, so cap the request.
    sample_size = min(n, len(speaker_rows))
    return speaker_rows.sample(sample_size)['path'].values.tolist()
# CHOOSE: index (within top_10_speakers) of the speaker we will use
speaker_index = 0
speaker_audio_list = get_speaker_audio_list(top_10_speakers.index[speaker_index])
print(speaker_audio_list)

# Play the sampled clips; the audio files lie in <corpuspath>/be/clips.
import IPython.display as ipd

for clip_name in speaker_audio_list:
    clip_path = corpuspath + '/be/clips/' + clip_name
    # Call display through the imported module: the bare name `display` is
    # only predefined inside IPython/Jupyter sessions.
    ipd.display(ipd.Audio(clip_path))
# 0 is pretty good
# 1 is bad
# 2 is partly 0, other are different
# 3 is bad
# 4 is pretty fast and clear, but not good
# 5 is echoing, sometimes mic cracks
# 6 is really slow and clear, but accent?
# 7 has a lot of intonation, but is pretty clear
# 8 is clear and slow, sometimes little mic crack
# 9 has background noise, whispering
# options: 0, 6, 8
# calculate the mean speech rate in words per minute for a given speaker
def get_speech_rate(speaker_id, n_samples=1000):
    """Estimate a speaker's mean speech rate in words per minute.

    A random sample of the speaker's clips is measured: each clip's duration
    is read with ``librosa.get_duration`` and its word count is the number of
    whitespace-separated tokens in the ``sentence`` column.

    Args:
        speaker_id: Value of the ``client_id`` column identifying the speaker.
        n_samples: Maximum number of clips to measure (default 1000).

    Returns:
        The mean of the per-clip rates (``words / duration * 60``).
    """
    df_speaker = df[df['client_id'] == speaker_id]
    # Cap the sample size: DataFrame.sample raises ValueError when asked for
    # more rows than the speaker has.
    df_speaker = df_speaker.sample(min(n_samples, len(df_speaker)))
    clips_dir = corpuspath + '/be/clips/'
    # Duration of each sampled audio file.
    durations = df_speaker['path'].apply(
        lambda name: librosa.get_duration(path=clips_dir + name)
    )
    # Number of words in each sampled sentence.
    word_counts = df_speaker['sentence'].apply(lambda text: len(text.split()))
    # Per-clip rate, averaged over the sample.
    return (word_counts / durations * 60).mean()
# Report the estimated speech rate of the chosen speaker.
chosen_speaker_id = top_10_speakers.index[speaker_index]
speech_rate = get_speech_rate(chosen_speaker_id)
print(f'Speech rate for speaker {speaker_index}: ', speech_rate)
def get_average_duration(df_speaker, n_samples=1000):
    """Estimate the mean clip duration from a random sample of clips.

    Args:
        df_speaker: DataFrame whose ``path`` column holds clip file names
            relative to ``<corpuspath>/be/clips/``.
        n_samples: Maximum number of clips to measure (default 1000).

    Returns:
        The mean of ``librosa.get_duration`` over the sampled clips.
    """
    # Cap the sample size: DataFrame.sample raises ValueError when asked for
    # more rows than the frame holds.
    sampled = df_speaker.sample(min(n_samples, len(df_speaker)))
    clips_dir = corpuspath + '/be/clips/'
    # Duration of each sampled audio file.
    durations = sampled['path'].apply(
        lambda name: librosa.get_duration(path=clips_dir + name)
    )
    return durations.mean()
# All rows of the chosen speaker, used to estimate how much audio there is.
chosen_id = top_10_speakers.index[speaker_index]
df_speaker = df.loc[df['client_id'] == chosen_id]
avg_duration = get_average_duration(df_speaker)
avg_total_duration = avg_duration * len(df_speaker.index)
total_hours = avg_total_duration/60.0/60.0
print(f'Average duration for speaker {speaker_index}: ', avg_duration, ", average total duration(hours): ", total_hours)

# Re-select the chosen speaker's rows; client_id is constant there, so drop it.
df_speaker = df.loc[df['client_id'] == chosen_id].drop(columns=['client_id'])

# Keep only the last rows, as many as fit into limit_hours at the average clip duration.
limit_hours = 30
seconds_budget = limit_hours*60*60
limit_files = round(seconds_budget / avg_duration)
df_speaker = df_speaker.tail(limit_files)
df_speaker
# # move all files of that speaker to another folder
# # use multiprocessing to speed up
# # add progress bar
# from tqdm import tqdm
# import multiprocessing
# from multiprocessing import Pool
# import shutil
# def move_file(file):
# shutil.move(corpuspath+'/be/clips/' + file, corpuspath+'/be/speaker_0/' + file)
# # get list of files to move
# files = df_speaker['path'].values.tolist()
# # move files
# with Pool(multiprocessing.cpu_count()) as p:
# r = list(tqdm(p.imap(move_file, files), total=len(files)))
# Clean up the output directory, then save the text lines to CSV.
if os.path.isdir(outputpath):
    # Remove leftovers of a previous run. Real directories are skipped:
    # os.remove cannot delete them and would abort the whole cleanup.
    with os.scandir(outputpath) as entries:
        for entry in entries:
            if not entry.is_dir(follow_symlinks=False):
                os.remove(entry.path)
else:
    # makedirs also creates missing parent directories (os.mkdir does not).
    os.makedirs(outputpath)

# The regex is a raw string: '\.' in a normal string literal is an invalid
# escape sequence. assign() returns a new frame, so the new column is not
# written into a slice of another DataFrame.
df_speaker = df_speaker.assign(
    path2=df_speaker['path'].str.replace(r'\.mp3$', '.wav', regex=True)
)
# One "wav name|sentence" line per clip, without header or index.
df_speaker[['path2', 'sentence']].to_csv(
    outputpath + '/df_speaker.csv', sep='|', header=False, index=False
)
# convert the selected speaker's mp3 clips to wav files with frame rate 22050 in outputpath, using multiprocessing and tqdm
import multiprocessing
from multiprocessing import Pool
from tqdm import tqdm
from pydub import AudioSegment
def convert_mp3_to_wav(file, sample_rate=22050):
    """Convert one corpus mp3 clip to a wav file in the output directory.

    Args:
        file: Clip file name relative to ``<corpuspath>/be/clips/``.
        sample_rate: Frame rate passed to ``set_frame_rate`` before export
            (default 22050).
    """
    sound = AudioSegment.from_mp3(corpuspath + '/be/clips/' + file)
    sound = sound.set_frame_rate(sample_rate)
    # Strip the extension with splitext instead of cutting a fixed four
    # characters, so the stem stays intact whatever the extension length.
    stem = os.path.splitext(file)[0]
    sound.export(outputpath + '/' + stem + '.wav', format='wav')
# Names of the clips to convert.
files = df_speaker['path'].values.tolist()
# Spread the conversions over as many worker processes as there are CPUs,
# showing progress with tqdm.
worker_count = multiprocessing.cpu_count()
with Pool(worker_count) as pool:
    conversions = pool.imap(convert_mp3_to_wav, files)
    r = list(tqdm(conversions, total=len(files)))