import onnxruntime
import torch
import torchaudio
import torchaudio.compliance.kaldi as kaldi

eps = 1e-8


def single_job(utt):
    """Load an utterance, extract 80-dim fbank features, and return its speaker embedding."""
    audio, sample_rate = torchaudio.load(utt)
    # Resample to 16 kHz if needed, since the fbank config below assumes 16 kHz input.
    if sample_rate != 16000:
        audio = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)(audio)
    feat = kaldi.fbank(audio, num_mel_bins=80, dither=0, sample_frequency=16000)
    # Cepstral mean normalization over the time axis.
    feat = feat - feat.mean(dim=0, keepdim=True)
    embedding = ort_session.run(
        None,
        {ort_session.get_inputs()[0].name: feat.unsqueeze(dim=0).cpu().numpy()},
    )[0].flatten().tolist()
    return utt, embedding


# Single-threaded CPU inference session for the CAM++ speaker-embedding model.
option = onnxruntime.SessionOptions()
option.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
option.intra_op_num_threads = 1
providers = ["CPUExecutionProvider"]
ort_session = onnxruntime.InferenceSession(
    "/mnt/workspace/fengping/tools/CosyVoice_emosphere/pretrained_models/CosyVoice-300M-Instruct/campplus.onnx",
    sess_options=option,
    providers=providers,
)

# Embeddings for one speaker's neutral and emotional reference recordings.
_, neutral = single_job("/mnt/workspace/fengping/tools/prompt/hanmo_neutral.wav")
_, happy = single_job("/mnt/workspace/fengping/tools/prompt/hanmo_happy.wav")
_, sad = single_job("/mnt/workspace/fengping/tools/prompt/hanmo_sad.wav")
_, surprise = single_job("/mnt/workspace/fengping/tools/prompt/hanmo_surprise.wav")
_, angry = single_job("/mnt/workspace/fengping/tools/prompt/hanmo_angry.wav")

# The "angry" emotion direction is the unit vector pointing from the neutral
# embedding to the angry embedding in the speaker-embedding space.
diff = torch.tensor(angry, dtype=torch.float64) - torch.tensor(neutral, dtype=torch.float64)
norm = torch.norm(diff)
if norm > eps:  # avoid division by zero
    torch.save(diff / norm, "/mnt/workspace/fengping/tools/prompt/angry.pt")
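
# The same normalized-difference construction presumably extends to the other
# emotion embeddings extracted above. A minimal sketch, assuming output paths
# that follow the naming of angry.pt (the paths are an assumption, not from
# the original script):
for name, emb in {"happy": happy, "sad": sad, "surprise": surprise}.items():
    direction = torch.tensor(emb, dtype=torch.float64) - torch.tensor(neutral, dtype=torch.float64)
    n = torch.norm(direction)
    if n > eps:  # skip degenerate (near-zero) directions
        torch.save(direction / n, f"/mnt/workspace/fengping/tools/prompt/{name}.pt")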