vits.ipynb
from utils.hparams import load_hparams_json
from utils.util import intersperse
import json
from models.synthesizer.models.vits import Vits
import torch
import numpy as np
import IPython.display as ipd
from models.synthesizer.utils.symbols import symbols
from models.synthesizer.utils.text import text_to_sequence
# Load the hyper-parameters stored next to the "vits5" checkpoint.
hps = load_hparams_json("data/ckpt/synthesizer/vits5/config.json")
print(hps.train)

# Positional constructor arguments for Vits, all derived from the config:
# the number of text symbols, filter_length // 2 + 1, and
# segment_size // hop_length.
n_symbols = len(symbols)
n_spec_bins = hps["data"]["filter_length"] // 2 + 1
n_segment_frames = hps["train"]["segment_size"] // hps["data"]["hop_length"]
model = Vits(
    n_symbols,
    n_spec_bins,
    n_segment_frames,
    n_speakers=hps["data"]["n_speakers"],
    **hps["model"])
# The return value is bound to a throwaway name - presumably so the notebook
# cell does not echo it.
_ = model.eval()

# Read the checkpoint with all tensors mapped onto the CPU.
device = torch.device("cpu")
checkpoint = torch.load("data/ckpt/synthesizer/vits5/G_56000.pth", map_location=device)

# The weights live under "model_state" when that key exists, else under "model".
state = checkpoint["model_state"] if "model_state" in checkpoint else checkpoint["model"]
# strict=False: keys that differ between checkpoint and model do not raise.
model.load_state_dict(state, strict=False)
# Root directory from which emotion references are drawn at random.
random_emotion_root = "D:\\audiodata\\SV2TTS\\synthesizer\\emo\\"
import random, re
from pypinyin import lazy_pinyin, Style
import os
def tts(txt, emotion, sid=0):
    """Synthesize ``txt`` with the loaded model and play the result inline.

    Args:
        txt: Text to speak. It is converted to space-separated TONE3 pinyin
            with ``lazy_pinyin`` and then to a symbol id sequence.
        emotion: Selects the emotion embedding passed to ``model.infer``:

            * a path ending in ``"wav"``: the embedding is extracted from
              that audio file with ``extract_emo``;
            * ``"random_sample"``: a randomly chosen file from
              ``random_emotion_root`` is loaded;
            * a file name ending in ``"npy"``: that file is loaded from
              ``random_emotion_root``.
        sid: Integer speaker id passed to ``model.infer``.

    Raises:
        ValueError: If ``emotion`` matches none of the forms above.
    """
    txt = " ".join(lazy_pinyin(txt, style=Style.TONE3, neutral_tone_with_five=False))
    text_norm = text_to_sequence(txt, hps["data"]["text_cleaners"])
    # Interspersing blanks is intentionally disabled:
    # if hps["data"]["add_blank"]:
    #     text_norm = intersperse(text_norm, 0)
    stn_tst = torch.LongTensor(text_norm)
    with torch.no_grad():  # inference only, no gradients needed
        # Add a leading batch dimension of size 1 to the id sequence.
        x_tst = stn_tst.unsqueeze(0)
        x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
        sid_tensor = torch.LongTensor([sid])
        if emotion.endswith("wav"):
            # Imported lazily: only this branch needs the extractor and librosa.
            from models.synthesizer.preprocess_audio import extract_emo
            import librosa
            # sr is passed by keyword; recent librosa releases no longer
            # accept it as a positional argument.
            wav, sr = librosa.load(emotion, sr=16000)
            emo = torch.FloatTensor(extract_emo(np.expand_dims(wav, 0), sr, embeddings=True))
        elif emotion == "random_sample":
            rand_emo = random.sample(os.listdir(random_emotion_root), 1)[0]
            print(rand_emo)
            emo = torch.FloatTensor(np.load(f"{random_emotion_root}\\{rand_emo}")).unsqueeze(0)
        elif emotion.endswith("npy"):
            print(emotion)
            emo = torch.FloatTensor(np.load(f"{random_emotion_root}\\{emotion}")).unsqueeze(0)
        else:
            # Fail fast: without an embedding the infer call below cannot run.
            raise ValueError("emotion参数不正确")
        audio = model.infer(
            x_tst,
            x_tst_lengths,
            sid=sid_tensor,
            noise_scale=0.667,
            noise_scale_w=0.8,
            length_scale=1,
            emo=emo,
        )[0][0, 0].data.float().numpy()
    ipd.display(ipd.Audio(audio, rate=hps["data"]["sampling_rate"], normalize=False))
推理:
txt = "我们将其拓展到文本驱动数字人形象领域"

# One reference embedding per speaking style, synthesized in this order.
emotion_refs = [
    'emo-T0055G4906S0052.wav_00.npy',         # neutral
    'emo-15_4581_20170825202626.wav_00.npy',  # sad
    'emo-T0055G2412S0498.wav_00.npy',         # happy (T0055G2412S0498.wav)
    'emo-T0055G1344S0160.wav_00.npy',         # angry (also noted: T0055G1371S0363.wav)
    'emo-T0055G2294S0476.wav_00.npy',         # tired
    'emo-T0055G1671S0170.wav_00.npy',         # anxious
]
# A "fast" reference is noted as well but not synthesized here:
# emo-T0055G2323S0179.wav_00.npy
for emotion_ref in emotion_refs:
    tts(txt, emotion=emotion_ref, sid=100)
txt = "我们将其拓展到文本驱动数字人形象领域"

# Synthesize the same sentence six times; each call asks tts() for a
# randomly sampled emotion reference.
for _attempt in range(6):
    tts(txt, emotion='random_sample', sid=100)
txt = "我们将其拓展到文本驱动数字人形象领域"

# Style names (calm, agitated, tired, excited, dejected, happy); each one is
# the stem of a reference recording "<name>.wav" in the Music folder below.
types = ["平淡", "激动", "疲惫", "兴奋", "沮丧", "开心"]
for t in types:
    print(t)
    reference_wav = f'C:\\Users\\babys\\Music\\{t}.wav'
    tts(txt, emotion=reference_wav, sid=100)

# Alternative: use a recording from the corpus as the emotion reference.
# tts(txt, emotion='D:\\audiodata\\aidatatang_200zh\\corpus\\train\\G1858\\T0055G1858S0342.wav', sid=5)
预处理:
from models.synthesizer.preprocess import preprocess_dataset
from pathlib import Path
from utils.hparams import HParams
datasets_root = Path("../audiodata/")

# Audio / feature settings, listed once in a mapping and expanded into HParams.
preprocess_settings = {
    "n_fft": 1024,  # filter_length
    "num_mels": 80,
    "hop_size": 256,  # 256 samples = 16 ms at the 16 kHz sample_rate below
    "win_size": 1024,  # 1024 samples = 64 ms at the 16 kHz sample_rate below
    "fmin": 55,
    "min_level_db": -100,
    "ref_level_db": 20,
    "max_abs_value": 4.,  # Gradient explodes if too big, premature convergence if too small.
    "sample_rate": 16000,
    "rescale": True,
    "max_mel_frames": 900,
    "rescaling_max": 0.9,
    "preemphasis": 0.97,  # Filter coefficient to use if preemphasize is True
    "preemphasize": True,
    # --- Mel visualization and Griffin-Lim ---
    "signal_normalization": True,
    "utterance_min_duration": 1.6,  # Duration in seconds below which utterances are discarded
    # --- Audio processing options ---
    "fmax": 7600,  # Should not exceed (sample_rate // 2)
    "allow_clipping_in_normalization": True,  # Used when signal_normalization = True
    "clip_mels_length": True,  # If true, discards samples exceeding max_mel_frames
    "use_lws": False,  # "Fast spectrogram phase recovery using local weighted sums"
    # Sets mel range to [-max_abs_value, max_abs_value] if True,
    # and [0, max_abs_value] if False
    "symmetric_mels": True,
    "trim_silence": False,  # Use with sample_rate of 16000 for best results
}
hparams = HParams(**preprocess_settings)

# Preprocessed output is written below <datasets_root>/SV2TTS/synthesizer.
synthesizer_out_dir = datasets_root / "SV2TTS" / "synthesizer"
preprocess_dataset(
    datasets_root=datasets_root,
    out_dir=synthesizer_out_dir,
    n_processes=8,
    skip_existing=True,
    hparams=hparams,
    no_alignments=False,
    dataset="aidatatang_200zh",
    emotion_extract=True,
)
训练:
from models.synthesizer.train_vits import run
from pathlib import Path
from utils.hparams import HParams
import torch, os
import torch.multiprocessing as mp
datasets_root = Path("../audiodata/SV2TTS/synthesizer")
hparams = HParams(
    model_dir="data/ckpt/synthesizer/vits",
)
# Fill the remaining settings from the config.json inside the model directory.
hparams.loadJson(Path(hparams.model_dir).joinpath("config.json"))

train_list = str(datasets_root.joinpath("train.txt"))
hparams.data["training_files"] = train_list
# NOTE(review): validation reads the same list as training; point this at a
# held-out list (e.g. eval.txt) if a real validation signal is wanted.
hparams.data["validation_files"] = train_list
hparams.data["datasets_root"] = str(datasets_root)

n_gpus = torch.cuda.device_count()
if n_gpus < 1:
    # With nprocs=0 mp.spawn would start no worker and return without doing
    # anything, so a missing GPU is reported explicitly instead.
    raise RuntimeError("No CUDA device available: cannot spawn VITS training workers.")

# Address/port read from the environment by the spawned workers
# (presumably for the distributed rendezvous inside `run` - confirm).
os.environ['MASTER_ADDR'] = 'localhost'
os.environ['MASTER_PORT'] = '8899'
# One worker process per visible GPU; each is called as run(rank, n_gpus, hparams).
mp.spawn(run, nprocs=n_gpus, args=(n_gpus, hparams))
挑选只有对应emo文件的meta数据
from pathlib import Path
import os
root = Path('../audiodata/SV2TTS/synthesizer')
emo_dir = root.joinpath("emo")

# Keep only the metadata lines whose emotion file exists on disk.
dict_info = []
with open(root.joinpath("train.txt"), "r", encoding="utf-8") as dict_meta:
    for raw in dict_meta:
        # Lines keep their trailing newline, so test the stripped text to
        # recognise blank lines.
        if not raw.strip():
            continue
        # The first "|" field names the audio file; the emotion file uses the
        # same name with "audio" replaced by "emo".
        v = raw.split("|")[0].replace("audio", "emo")
        if emo_dir.joinpath(v).exists():
            dict_info.append(raw)

# Write the surviving lines unchanged; the context manager closes the file
# even if a write fails.
meta2 = Path('../audiodata/SV2TTS/synthesizer/train2.txt')
with meta2.open("w", encoding="utf-8") as metadata_file:
    metadata_file.writelines(dict_info)
从训练集中抽取10%作为测试集
from pathlib import Path
root = Path('../audiodata/SV2TTS/synthesizer')
dict_info1 = []  # lines that go to train1.txt
dict_info2 = []  # lines that go to eval.txt

with open(root.joinpath("train.txt"), "r", encoding="utf-8") as dict_meta:
    # Lines are numbered from 1; every 10th line goes to the evaluation list
    # and all others to the training list.
    for count, raw in enumerate(dict_meta, start=1):
        if count % 10 == 0:
            dict_info2.append(raw)
        else:
            dict_info1.append(raw)

# Both outputs are written inside context managers so the handles are closed
# even if a write fails.
meta1 = Path('../audiodata/SV2TTS/synthesizer/train1.txt')
with meta1.open("w", encoding="utf-8") as metadata_file:
    metadata_file.writelines(dict_info1)

meta2 = Path('../audiodata/SV2TTS/synthesizer/eval.txt')
with meta2.open("w", encoding="utf-8") as metadata_file:
    metadata_file.writelines(dict_info2)
evaluation
from pathlib import Path
root = Path('../audiodata/SV2TTS/synthesizer')

# Every non-blank line of eval.txt is kept and later gets a speaker index
# appended. A blank line has no "-" to split on, so it is skipped rather
# than raising IndexError.
with open(root.joinpath("eval.txt"), "r", encoding="utf-8") as dict_meta:
    rows = [raw for raw in dict_meta if raw.strip()]

# The speaker tag is characters 6..9 of the text following the first "-",
# e.g. "audio-T0055G1858S0342..." gives "1858".
spks = sorted({row.split("-")[1][6:10] for row in rows})
# Speakers are numbered 0..N-1 in sorted order; ids are stored as strings
# because they are concatenated into the output line.
spk_id = {speaker_name: str(index) for index, speaker_name in enumerate(spks)}
print(len(spks))

# Each output line is the stripped input line plus "|<speaker index>".
meta2 = Path('../audiodata/SV2TTS/synthesizer/eval2.txt')
with meta2.open("w", encoding="utf-8") as metadata_file:
    for row in rows:
        speaker_n = row.split("-")[1][6:10]
        metadata_file.write(row.strip() + "|" + spk_id[speaker_n] + "\n")
[Not Recommended] Try mapping the transcript to a more detailed phoneme format: ni3 hao3 -> n i3 <pad> h ao3
After a couple of tests, I think this method does not improve the quality of the results and may crash the monotonic alignment.
from pathlib import Path
datasets_root = Path("../audiodata/SV2TTS/synthesizer/")
dictionary_fp = Path("../audiodata/ProDiff/processed/mandarin_pinyin.dict")

# Build the lookup from the tab-separated dictionary: first column -> second
# column (with the line break removed).
dict_map = {}
with open(dictionary_fp, encoding='utf-8') as dict_file:
    for entry in dict_file:
        if not entry.strip():
            continue  # a blank line has no second column
        item = entry.split("\t")
        dict_map[item[0]] = item[1].replace("\n", "")

# The input is opened before the output so that a missing train.txt does not
# leave behind a freshly truncated train2.txt.
with open(datasets_root.joinpath('train.txt'), encoding='utf-8') as src, \
        datasets_root.joinpath('train2.txt').open("w+", encoding='utf-8') as f:
    for line in src:
        if not line.strip():
            continue  # a blank line has no field 5
        items = line.strip().replace("\n", "").replace("\t", " ").split("|")
        # Field 5 holds the space-separated transcript. Each word is replaced
        # by its dictionary entry when one exists, and every word (including
        # the last) is followed by the " _ " pad marker.
        items[5] = "".join(dict_map.get(word, word) + " _ " for word in items[5].split(" "))
        f.write("|".join(items) + "\n")
预处理后的数据可视化
import matplotlib.pyplot as plt
import librosa.display
import librosa, torch
import numpy as np
from utils.audio_utils import spectrogram, mel_spectrogram, load_wav_to_torch, spec_to_mel
# Waveform stored as a NumPy array in the synthesizer's "audio" folder.
x = np.load("D:\\audiodata\\SV2TTS\\synthesizer\\audio\\audio-T0055G1858S0342.wav_00.npy")
# The preprocessing cell uses sample_rate = 16000; pass it to the plotting
# helpers, which otherwise label the axes with librosa's own default rate.
sample_rate = 16000

# 1) Waveform.
plt.figure(figsize=(14, 5))
# `waveplot` was superseded by `waveshow` in newer librosa releases; use
# whichever this installation provides.
plot_wave = getattr(librosa.display, "waveshow", None) or librosa.display.waveplot
plot_wave(x, sr=sample_rate)

# 2) STFT magnitude in dB.
X = librosa.stft(x)
Xdb = librosa.amplitude_to_db(abs(X))
plt.figure(figsize=(14, 5))
librosa.display.specshow(Xdb, sr=sample_rate, x_axis='time', y_axis='hz')

# 3) Mel spectrogram computed with the project's own helpers.
# Alternative inputs that were tried:
# spectrogram = np.load("D:\\audiodata\\SV2TTS\\synthesizer\\mels\\mel-T0055G1858S0342.wav_00.npy")
# audio, sampling_rate = load_wav_to_torch("D:\\audiodata\\aidatatang_200zh\\corpus\\train\\G1858\\T0055G1858S0342.wav")
# audio_norm = audio / 32768.0
audio = torch.from_numpy(x.astype(np.float32))
audio_norm = audio.unsqueeze(0)  # add a leading dimension of size 1
spec = spectrogram(audio_norm, 1024, 256, 1024, center=False)
spec = torch.squeeze(spec, 0)  # drop the leading dimension again
mel = spec_to_mel(spec, 1024, 80, 16000, 0, None)
fig = plt.figure(figsize=(10, 8))
ax2 = fig.add_subplot(211)
im = ax2.imshow(mel, interpolation="none")
情感聚类
# from sklearn import metrics
# from sklearn.mixture import GaussianMixture # 高斯混合模型
import os
import numpy as np
import librosa
import IPython.display as ipd
from random import sample
embs = []
wavnames = []
emo_root_path = "D:\\audiodata\\SV2TTS\\synthesizer\\emo\\"
wav_root_path = "D:\\audiodata\\aidatatang_200zh\\corpus\\train\\"

emo_files = os.listdir(emo_root_path)
# random.sample raises ValueError when asked for more items than exist, so
# the draw is capped at the number of files actually present.
n_draw = min(10000, len(emo_files))
for emo_fpath in sample(emo_files, n_draw):
    if emo_fpath.endswith(".npy") and emo_fpath.startswith("emo-T"):
        # Each embedding gets a leading axis so they can be stacked below.
        embs.append(np.expand_dims(np.load(emo_root_path + emo_fpath), axis=0))
        # Rebuild the source wav path from the embedding file name, e.g.
        # "emo-T0055G1858S0342.wav_00.npy" -> "<wav_root>G1858\T0055G1858S0342.wav":
        # characters 9..13 are the speaker folder, and the text before "_00"
        # minus the "emo-" prefix is the wav file name.
        wav_fpath = wav_root_path + emo_fpath[9:14] + "\\" + emo_fpath.split("_00")[0][4:]
        wavnames.append(wav_fpath)
print(len(embs))
# One row per embedding; row i corresponds to wavnames[i].
x = np.concatenate(embs, axis=0)
# Number of clusters requested from the clustering algorithm.
n_clusters = 20
# Explicit names instead of a star import; the extra classes are the
# alternatives listed in the comments below.
from sklearn.cluster import AgglomerativeClustering, Birch, DBSCAN, KMeans, SpectralClustering
# Feel free to try other clustering algorithms:
# cluster_model = KMeans(n_clusters=n_clusters, random_state=10)
# cluster_model = DBSCAN(eps=0.002, min_samples=2)
# cluster_model = Birch(n_clusters=n_clusters, threshold=0.2)
# cluster_model = SpectralClustering(n_clusters=n_clusters)
# Bound to its own name so the global `model` that tts() reads is not
# overwritten by the clustering estimator.
cluster_model = AgglomerativeClustering(n_clusters=n_clusters)
import random
# One cluster label per row of x.
y_predict = cluster_model.fit_predict(x)
def disp(wavname):
    """Load the audio file ``wavname`` at 16 kHz and show a player for it."""
    # sr is passed by keyword; recent librosa releases no longer accept it
    # as a positional argument.
    wav, sr = librosa.load(wavname, sr=16000)
    # Use the explicitly imported IPython display function rather than the
    # `display` name that only exists inside a notebook session.
    ipd.display(ipd.Audio(wav, rate=sr))
# Group the wav paths by predicted cluster label: classes[k] holds the paths
# assigned to cluster k.
n_classes = y_predict.max() + 1
classes = [[] for _slot in range(n_classes)]
for label, wavname in zip(y_predict, wavnames):
    classes[label].append(wavname)

for i, members in enumerate(classes):
    print("类别:", i, "本类中样本数量:", len(members))
    # Preview only two clips per class; they are drawn independently, so the
    # same clip may be picked twice.
    for _preview in range(2):
        chosen = members[random.randrange(len(members))]
        print(chosen)
        disp(chosen)