随机抽取情感参考音频的根目录 - Mockingbird

python

from utils.hparams import load_hparams_json
from utils.util import intersperse
import json
from models.synthesizer.models.vits import Vits
import torch
import numpy as np
import IPython.display as ipd
from models.synthesizer.utils.symbols import symbols
from models.synthesizer.utils.text import text_to_sequence


# Read the hyper-parameters that were saved next to the checkpoint.
hps = load_hparams_json("data/ckpt/synthesizer/vits5/config.json")
print(hps.train)

# Build the synthesizer from the config values and switch it to eval mode.
model = Vits(
    len(symbols),
    hps["data"]["filter_length"] // 2 + 1,
    hps["train"]["segment_size"] // hps["data"]["hop_length"],
    n_speakers=hps["data"]["n_speakers"],
    **hps["model"],
)
_ = model.eval()

# Load the generator weights on the CPU. The state dict is stored under
# "model_state" when that key is present, otherwise under "model".
device = torch.device("cpu")
checkpoint = torch.load("data/ckpt/synthesizer/vits5/G_56000.pth", map_location=device)
state = checkpoint["model_state"] if "model_state" in checkpoint else checkpoint["model"]
# strict=False: keys that do not match between checkpoint and model are ignored.
model.load_state_dict(state, strict=False)

# Root directory of the emotion reference files. tts() picks a random file
# from it for emotion="random_sample" and resolves "*.npy" names against it.
random_emotion_root = "D:\\audiodata\\SV2TTS\\synthesizer\\emo\\"
import random, re
from pypinyin import lazy_pinyin, Style

import os

def tts(txt, emotion, sid=0):
    """Synthesize ``txt`` with the loaded model and show an inline audio player.

    Args:
        txt: Text to speak. It is converted to space-separated pinyin with
            ``lazy_pinyin`` before being turned into a symbol sequence.
        emotion: Selects the emotion embedding passed to the model:

            * a path ending in ``wav`` -- the audio is loaded at 16 kHz and
              the embedding is computed with ``extract_emo``;
            * ``"random_sample"`` -- a random file from
              ``random_emotion_root`` is loaded;
            * a file name ending in ``npy`` -- that file is loaded from
              ``random_emotion_root``.
        sid: Integer speaker id forwarded to ``model.infer``.

    Raises:
        ValueError: If ``emotion`` matches none of the forms above.
    """
    txt = " ".join(lazy_pinyin(txt, style=Style.TONE3, neutral_tone_with_five=False))
    text_norm = text_to_sequence(txt, hps["data"]["text_cleaners"])
    # Blank interspersing is intentionally disabled:
    # if hps["data"]["add_blank"]:
    # text_norm = intersperse(text_norm, 0)
    stn_tst = torch.LongTensor(text_norm)

    with torch.no_grad():  # inference only, no gradients needed
        x_tst = stn_tst.unsqueeze(0)
        x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
        # Keep the caller's ``sid`` untouched; the model gets its own tensor.
        sid_tensor = torch.LongTensor([sid])

        if emotion.endswith("wav"):
            # Imported lazily so the heavier dependencies are only loaded
            # when an audio reference is actually used.
            from models.synthesizer.preprocess_audio import extract_emo
            import librosa
            # ``sr`` is passed by keyword; newer librosa releases no longer
            # accept it positionally.
            wav, sr = librosa.load(emotion, sr=16000)
            emo = torch.FloatTensor(extract_emo(np.expand_dims(wav, 0), sr, embeddings=True))
        elif emotion == "random_sample":
            rand_emo = random.choice(os.listdir(random_emotion_root))
            print(rand_emo)
            emo = torch.FloatTensor(np.load(os.path.join(random_emotion_root, rand_emo))).unsqueeze(0)
        elif emotion.endswith("npy"):
            print(emotion)
            emo = torch.FloatTensor(np.load(os.path.join(random_emotion_root, emotion))).unsqueeze(0)
        else:
            # Fail here with a clear message instead of hitting an unbound
            # ``emo`` at the ``model.infer`` call below.
            raise ValueError(f"emotion参数不正确: {emotion!r}")

        audio = model.infer(
            x_tst,
            x_tst_lengths,
            sid=sid_tensor,
            noise_scale=0.667,
            noise_scale_w=0.8,
            length_scale=1,
            emo=emo,
        )[0][0, 0].data.float().numpy()
    ipd.display(ipd.Audio(audio, rate=hps["data"]["sampling_rate"], normalize=False))

推理：

python

# Synthesize the same sentence with several hand-picked emotion embeddings.
# Each file name is resolved against random_emotion_root inside tts().
txt = "我们将其拓展到文本驱动数字人形象领域"
# Neutral:
tts(txt, emotion='emo-T0055G4906S0052.wav_00.npy', sid=100)
# Fast: emo-T0055G2323S0179.wav_00.npy

# Sad:
tts(txt, emotion='emo-15_4581_20170825202626.wav_00.npy', sid=100)

# Happy: T0055G2412S0498.wav
tts(txt, emotion='emo-T0055G2412S0498.wav_00.npy', sid=100)

# Angry: T0055G1371S0363.wav T0055G1344S0160.wav
tts(txt, emotion='emo-T0055G1344S0160.wav_00.npy', sid=100)

# Tired
tts(txt, emotion='emo-T0055G2294S0476.wav_00.npy', sid=100)

# Anxious
tts(txt, emotion='emo-T0055G1671S0170.wav_00.npy', sid=100)

python

txt = "我们将其拓展到文本驱动数字人形象领域"
# Six runs of the same sentence; tts() draws a new random emotion file each time.
for _ in range(6):
    tts(txt, emotion='random_sample', sid=100)

python

# Synthesize the sentence once per reference recording.
txt = "我们将其拓展到文本驱动数字人形象领域"
# Each entry is the stem of a "<name>.wav" file in the folder used below.
types = ["平淡", "激动", "疲惫", "兴奋", "沮丧", "开心"]
for t in types:
    print(t)
    # A path ending in "wav" makes tts() compute the emotion embedding from the audio.
    tts(txt, emotion=f'C:\\Users\\babys\\Music\\{t}.wav', sid=100)
# tts(txt, emotion='D:\\audiodata\\aidatatang_200zh\\corpus\\train\\G1858\\T0055G1858S0342.wav', sid=5)

预处理：

python

from models.synthesizer.preprocess import preprocess_dataset
from pathlib import Path
from utils.hparams import HParams
# Preprocess the aidatatang_200zh corpus found under datasets_root and write
# the results to <datasets_root>/SV2TTS/synthesizer.
datasets_root = Path("../audiodata/")
# Audio / spectrogram settings handed to preprocess_dataset.
hparams = HParams(
        n_fft = 1024, # filter_length
        num_mels = 80,
        hop_size = 256,                             # 256 samples = 16 ms at sample_rate 16000
        win_size = 1024,                             # 1024 samples = 64 ms at sample_rate 16000
        fmin = 55,
        min_level_db = -100,
        ref_level_db = 20,
        max_abs_value = 4.,                         # Gradient explodes if too big, premature convergence if too small.
        sample_rate = 16000,
        rescale = True,
        max_mel_frames = 900,
        rescaling_max = 0.9,        
        preemphasis = 0.97,                         # Filter coefficient to use if preemphasize is True
        preemphasize = True,
        ### Mel Visualization and Griffin-Lim
        signal_normalization = True,

        utterance_min_duration = 1.6,               # Duration in seconds below which utterances are discarded
        ### Audio processing options
        fmax = 7600,                                # Should not exceed (sample_rate // 2)
        allow_clipping_in_normalization = True,     # Used when signal_normalization = True
        clip_mels_length = True,                    # If true, discards samples exceeding max_mel_frames
        use_lws = False,                            # "Fast spectrogram phase recovery using local weighted sums"
        symmetric_mels = True,                      # Sets mel range to [-max_abs_value, max_abs_value] if True,
                                                    #               and [0, max_abs_value] if False
        trim_silence = False,                        # Use with sample_rate of 16000 for best results

)
# skip_existing=True: presumably leaves already-processed items alone -- TODO confirm.
# emotion_extract=True: presumably also writes the emotion embeddings that the
# inference cells load from the "emo" folder -- TODO confirm in preprocess_dataset.
preprocess_dataset(datasets_root=datasets_root, 
        out_dir=datasets_root.joinpath("SV2TTS", "synthesizer"),
        n_processes=8,
        skip_existing=True, 
        hparams=hparams, 
        no_alignments=False, 
        dataset="aidatatang_200zh", 
        emotion_extract=True)

训练：

python

from models.synthesizer.train_vits import run
from pathlib import Path
from utils.hparams import HParams
import torch, os
import torch.multiprocessing as mp

# Launch VITS training on the preprocessed synthesizer data.
datasets_root = Path("../audiodata/SV2TTS/synthesizer")
hparams= HParams(
  model_dir = "data/ckpt/synthesizer/vits",
)
# Load the remaining settings from <model_dir>/config.json, then point the
# data section at the local dataset files.
hparams.loadJson(Path(hparams.model_dir).joinpath("config.json"))
hparams.data["training_files"] = str(datasets_root.joinpath("train.txt"))
# NOTE(review): validation reads the same train.txt as training -- confirm this
# is intended (a later cell writes a separate eval.txt).
hparams.data["validation_files"] = str(datasets_root.joinpath("train.txt"))
hparams.data["datasets_root"] = str(datasets_root)

n_gpus = torch.cuda.device_count()
# for spawn
os.environ['MASTER_ADDR'] = 'localhost'
os.environ['MASTER_PORT'] = '8899'
# One worker process per visible GPU; mp.spawn calls run(process_index, n_gpus, hparams).
# NOTE(review): n_gpus is 0 on a CPU-only machine -- confirm how run/spawn behave then.
mp.spawn(run, nprocs=n_gpus, args=(n_gpus, hparams))

挑选只有对应emo文件的meta数据

python

from pathlib import Path
import os
# Keep only the metadata lines whose emotion file exists, and write them to
# train2.txt next to the original train.txt.
root = Path('../audiodata/SV2TTS/synthesizer')
emo_dir = root.joinpath("emo")
dict_info = []
with open(root.joinpath("train.txt"), "r", encoding="utf-8") as dict_meta:
    for raw in dict_meta:
        # Lines read from a file keep their newline, so a blank line is
        # "\n" (truthy); test the stripped text instead.
        if not raw.strip():
            continue
        # The first "|" field is the audio file name; the emotion file uses
        # the same name with "audio" replaced by "emo".
        v = raw.split("|")[0].replace("audio", "emo")
        if emo_dir.joinpath(v).exists():
            dict_info.append(raw)

meta2 = Path('../audiodata/SV2TTS/synthesizer/train2.txt')
# The context manager closes the file even if a write fails.
with meta2.open("w", encoding="utf-8") as metadata_file:
    metadata_file.writelines(dict_info)

从训练集中抽取10%作为测试集

python

from pathlib import Path
# Split train.txt: every 10th non-blank line goes to eval.txt, all others to
# train1.txt. Lines are written back unchanged.
root = Path('../audiodata/SV2TTS/synthesizer')
dict_info1 = []  # lines kept for training
dict_info2 = []  # held-out lines (every 10th)
with open(root.joinpath("train.txt"), "r", encoding="utf-8") as dict_meta:
    # Lines keep their newline, so blank lines must be detected on the
    # stripped text; otherwise they would be counted and written out.
    non_blank = (raw for raw in dict_meta if raw.strip())
    for count, raw in enumerate(non_blank, start=1):
        if count % 10 == 0:
            dict_info2.append(raw)
        else:
            dict_info1.append(raw)

meta1 = Path('../audiodata/SV2TTS/synthesizer/train1.txt')
with meta1.open("w", encoding="utf-8") as metadata_file:
    metadata_file.writelines(dict_info1)

meta2 = Path('../audiodata/SV2TTS/synthesizer/eval.txt')
with meta2.open("w", encoding="utf-8") as metadata_file:
    metadata_file.writelines(dict_info2)

evaluation

python

from pathlib import Path
# Append a speaker index to every line of eval.txt and write the result to
# eval2.txt. Indices are assigned in sorted order of the speaker names.
root = Path('../audiodata/SV2TTS/synthesizer')


def _speaker_name(row):
    """Return characters 6-9 of the text after the first "-" in ``row``.

    Assumes rows start like "audio-T0055G1858S0342.wav_00.npy|...", where this
    slice is the speaker number ("1858") -- TODO confirm against the metadata.
    """
    return row.split("-")[1][6:10]


with open(root.joinpath("eval.txt"), "r", encoding="utf-8") as dict_meta:
    # Skip blank lines; they have no "-" and would raise IndexError above.
    rows = [raw for raw in dict_meta if raw.strip()]

spks = sorted({_speaker_name(row) for row in rows})
spk_id = {name: str(index) for index, name in enumerate(spks)}
print(len(spks))

meta2 = Path('../audiodata/SV2TTS/synthesizer/eval2.txt')
with meta2.open("w", encoding="utf-8") as metadata_file:
    for row in rows:
        metadata_file.write(row.strip() + "|" + spk_id[_speaker_name(row)] + "\n")

[Not Recommended] Map the transcript to a more detailed phoneme format: ni3 hao3 -> n i3 <pad> h ao3

After a couple of tests, I found that this method does not improve the quality of the results and may cause monotonic alignment to crash.

python


from pathlib import Path
# Rewrite field 5 of every train.txt line: each space-separated word is
# replaced by its dictionary entry (when it has one) and followed by " _ ".
# The result is written to train2.txt.
datasets_root = Path("../audiodata/SV2TTS/synthesizer/")

dictionary_fp = Path("../audiodata/ProDiff/processed/mandarin_pinyin.dict")
dict_map = {}
# Dictionary lines are "<word>\t<replacement>".
with open(dictionary_fp, encoding='utf-8') as dict_file:
    for entry in dict_file:
        item = entry.split("\t")
        dict_map[item[0]] = item[1].replace("\n", "")

# Open the input first so a missing train.txt does not leave an empty
# train2.txt behind; both files are closed when the block exits.
with datasets_root.joinpath('train.txt').open(encoding='utf-8') as src, \
        datasets_root.joinpath('train2.txt').open("w", encoding='utf-8') as f:
    for line in src:
        items = line.strip().replace("\n", "").replace("\t", " ").split("|")
        items[5] = "".join(
            dict_map.get(word, word) + " _ " for word in items[5].split(" ")
        )
        f.write("|".join(items) + "\n")

预处理后的数据可视化

python

import matplotlib.pyplot as plt
import librosa.display
import librosa, torch
import numpy as np
from utils.audio_utils import spectrogram, mel_spectrogram, load_wav_to_torch, spec_to_mel

# Plot one preprocessed utterance three ways: waveform, STFT magnitude in dB,
# and the mel spectrogram computed with the project's audio utilities.
# x, sr = librosa.load("D:\audiodata\SV2TTS\synthesizer\audio\audio-T0055G2333S0196.wav_00.npy")
x = np.load("D:\\audiodata\\SV2TTS\\synthesizer\\audio\\audio-T0055G1858S0342.wav_00.npy")

plt.figure(figsize=(14, 5))
# NOTE(review): newer librosa releases renamed waveplot to waveshow -- confirm
# against the installed librosa version.
librosa.display.waveplot(x)

X = librosa.stft(x)
Xdb = librosa.amplitude_to_db(abs(X))
plt.figure(figsize=(14, 5))
librosa.display.specshow(Xdb,  x_axis='time', y_axis='hz')

# spectrogram = np.load("D:\\audiodata\\SV2TTS\\synthesizer\\mels\\mel-T0055G1858S0342.wav_00.npy")
audio = torch.from_numpy(x.astype(np.float32))

# audio, sampling_rate = load_wav_to_torch("D:\\audiodata\\aidatatang_200zh\\corpus\\train\\G1858\\T0055G1858S0342.wav")
# audio_norm = audio / 32768.0
# Add a leading dimension before calling spectrogram().
audio_norm = audio.unsqueeze(0)
# Positional arguments are presumably n_fft, hop size and window size -- TODO confirm in utils.audio_utils.
spec = spectrogram(audio_norm, 1024, 256, 1024, center=False)
# spec = spec_to_mel()
spec = torch.squeeze(spec, 0)
# Positional arguments are presumably n_fft, num_mels, sampling rate, fmin, fmax -- TODO confirm.
mel = spec_to_mel(spec, 1024, 80, 16000, 0, None)

fig = plt.figure(figsize=(10, 8))
ax2 = fig.add_subplot(211)
im = ax2.imshow(mel, interpolation="none")

情感聚类

python


# from sklearn import metrics
# from sklearn.mixture import GaussianMixture  # 高斯混合模型
import os
import numpy as np
import librosa
import IPython.display as ipd
from random import sample

# Load up to 10000 randomly chosen emotion embeddings and remember the path
# of the source wav for each one, so clusters can be auditioned later.
embs = []
wavnames = []
emo_root_path = "D:\\audiodata\\SV2TTS\\synthesizer\\emo\\"
wav_root_path = "D:\\audiodata\\aidatatang_200zh\\corpus\\train\\"
emo_files = os.listdir(emo_root_path)
# sample() raises ValueError when asked for more items than exist, so cap
# the request at the number of files actually present.
for emo_fpath in sample(emo_files, min(10000, len(emo_files))):
    if emo_fpath.endswith(".npy") and emo_fpath.startswith("emo-T"):
        embs.append(np.expand_dims(np.load(emo_root_path + emo_fpath), axis=0))
        # For "emo-T0055G1858S0342.wav_00.npy": [9:14] is the speaker folder
        # "G1858", and the part before "_00" minus "emo-" is the wav file name.
        wav_fpath = wav_root_path + emo_fpath[9:14] + "\\" + emo_fpath.split("_00")[0][4:]
        wavnames.append(wav_fpath)
print(len(embs))


# One row per embedding.
x = np.concatenate(embs, axis=0)

python

# Number of clusters to produce
n_clusters = 20
import random

# Explicit imports instead of a star import; every estimator mentioned in the
# commented alternatives below stays available.
from sklearn.cluster import (
    AgglomerativeClustering,
    Birch,
    DBSCAN,
    KMeans,
    SpectralClustering,
)

# Feel free to try other clustering algorithms:
# cluster_model = KMeans(n_clusters=n_clusters, random_state=10)
# cluster_model = DBSCAN(eps=0.002, min_samples=2)
# cluster_model = Birch(n_clusters=n_clusters, threshold=0.2)
# cluster_model = SpectralClustering(n_clusters=n_clusters)
# Named ``cluster_model`` so it does not rebind the global ``model`` that
# tts() uses for synthesis.
cluster_model = AgglomerativeClustering(n_clusters=n_clusters)

# Cluster label for each row of ``x``.
y_predict = cluster_model.fit_predict(x)

def disp(wavname):
    """Load the audio file ``wavname`` at 16 kHz and show an inline player."""
    # ``sr`` is passed by keyword; newer librosa releases no longer accept
    # it positionally.
    wav, sr = librosa.load(wavname, sr=16000)
    # Use ipd.display explicitly (as tts() does) rather than the bare
    # ``display`` name, which only exists inside an IPython session.
    ipd.display(ipd.Audio(wav, rate=sr))

# Group the wav paths by predicted cluster label.
classes = [[] for _ in range(y_predict.max() + 1)]
for position, wavname in enumerate(wavnames):
    classes[y_predict[position]].append(wavname)

# Print the size of every cluster and audition two randomly chosen members
# (drawn with replacement, so the same file may be picked twice).
for i, members in enumerate(classes):
    print("类别:", i, "本类中样本数量:", len(members))
    for _ in range(2):
        picked = members[random.randint(0, len(members) - 1)]
        print(picked)
        disp(picked)