# Marco-Voice-TTS / extract_embedding.py
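# Extracts CAM++ speaker embeddings from emotion-specific prompt recordings and
# saves the unit-normalized (angry - neutral) direction vector for later use.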
from concurrent.futures import ThreadPoolExecutor, as_completed

import onnxruntime
import torch
import torchaudio
import torchaudio.compliance.kaldi as kaldi
from tqdm import tqdm

eps = 1e-8  # numerical floor used when normalizing direction vectors
def single_job(utt):
    """Compute a speaker embedding for one utterance with the CAM++ ONNX model."""
    audio, sample_rate = torchaudio.load(utt)
    # CAM++ expects 16 kHz input; resample if needed.
    if sample_rate != 16000:
        audio = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)(audio)
    # 80-dim log-mel filterbank features; dither disabled for deterministic output.
    feat = kaldi.fbank(audio,
                       num_mel_bins=80,
                       dither=0,
                       sample_frequency=16000)
    # Per-utterance cepstral mean normalization.
    feat = feat - feat.mean(dim=0, keepdim=True)
    embedding = ort_session.run(
        None,
        {ort_session.get_inputs()[0].name: feat.unsqueeze(dim=0).cpu().numpy()}
    )[0].flatten().tolist()
    return utt, embedding
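
# Hedged sketch: the imports of ThreadPoolExecutor, as_completed, and tqdm
# suggest batch extraction was intended. This helper is an assumption about
# that use, not part of the original flow; it is defined here but never called.
def extract_embeddings_parallel(utts, max_workers=4):
    """Run single_job over many utterances concurrently; returns {utt: embedding}."""
    results = {}
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(single_job, utt) for utt in utts]
        for future in tqdm(as_completed(futures), total=len(futures)):
            utt, embedding = future.result()
            results[utt] = embedding
    return results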
# CPU-only ONNX Runtime session for the CAM++ speaker-embedding model.
option = onnxruntime.SessionOptions()
option.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
option.intra_op_num_threads = 1
providers = ["CPUExecutionProvider"]
ort_session = onnxruntime.InferenceSession(
    "/mnt/workspace/fengping/tools/CosyVoice_emosphere/pretrained_models/CosyVoice-300M-Instruct/campplus.onnx",
    sess_options=option,
    providers=providers)
# Extract embeddings for the same speaker (hanmo) in five emotional styles.
utt, neutral = single_job("/mnt/workspace/fengping/tools/prompt/hanmo_neutral.wav")
utt, happy = single_job("/mnt/workspace/fengping/tools/prompt/hanmo_happy.wav")
utt, sad = single_job("/mnt/workspace/fengping/tools/prompt/hanmo_sad.wav")
utt, surprise = single_job("/mnt/workspace/fengping/tools/prompt/hanmo_surprise.wav")
utt, angry = single_job("/mnt/workspace/fengping/tools/prompt/hanmo_angry.wav")
# Emotion direction vector: (angry - neutral), unit-normalized before saving.
diff = torch.tensor(angry, dtype=torch.float64) - torch.tensor(neutral, dtype=torch.float64)
norm = torch.norm(diff)
if norm > eps:  # avoid division by zero
    torch.save(diff / norm, "/mnt/workspace/fengping/tools/prompt/angry.pt")
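
# Hedged sketch: the happy, sad, and surprise embeddings above are computed but
# never saved. A natural generalization (an assumption, not the original
# behavior) would export a unit direction vector for each remaining emotion:
#
#     for name, emb in [("happy", happy), ("sad", sad), ("surprise", surprise)]:
#         d = torch.tensor(emb, dtype=torch.float64) - torch.tensor(neutral, dtype=torch.float64)
#         n = torch.norm(d)
#         if n > eps:
#             torch.save(d / n, f"/mnt/workspace/fengping/tools/prompt/{name}.pt")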