Spaces:
Running
Running
| import argparse | |
| from concurrent.futures import ThreadPoolExecutor, as_completed | |
| import onnxruntime | |
| import torch | |
| import torchaudio | |
| import torchaudio.compliance.kaldi as kaldi | |
| from tqdm import tqdm | |
| import random | |
| from collections import defaultdict | |
| import numpy as np | |
# Small positive threshold; the script compares vector norms against it and
# adds it to the denominator when normalising.
eps = 1e-8


def single_job(utt):
    """Compute the model embedding for a single audio file.

    The audio at ``utt`` is loaded with torchaudio and resampled to 16 kHz
    when its native rate differs. 80-bin Kaldi-style fbank features are
    extracted with dithering disabled, the mean along dim 0 is subtracted,
    and the features are fed (with a leading batch axis) to the module-level
    ``ort_session``.

    Args:
        utt: Path of the audio file to load.

    Returns:
        A ``(utt, embedding)`` tuple where ``embedding`` is the first model
        output flattened into a plain Python list.
    """
    target_rate = 16000

    waveform, native_rate = torchaudio.load(utt)
    if native_rate != target_rate:
        resampler = torchaudio.transforms.Resample(orig_freq=native_rate, new_freq=target_rate)
        waveform = resampler(waveform)

    fbank = kaldi.fbank(
        waveform,
        num_mel_bins=80,
        dither=0,
        sample_frequency=target_rate,
    )
    # Mean normalisation along dim 0 (presumably the frame axis of the
    # fbank output — confirm against torchaudio's documented layout).
    fbank = fbank - fbank.mean(dim=0, keepdim=True)

    input_name = ort_session.get_inputs()[0].name
    model_input = fbank.unsqueeze(dim=0).cpu().numpy()
    outputs = ort_session.run(None, {input_name: model_input})
    return utt, outputs[0].flatten().tolist()
# Configure a CPU-only ONNX Runtime session with all graph optimisations
# enabled and a single intra-op thread, then load the model used by
# single_job().
option = onnxruntime.SessionOptions()
option.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
option.intra_op_num_threads = 1
providers = ["CPUExecutionProvider"]
ort_session = onnxruntime.InferenceSession("/mnt/workspace/fengping/tools/CosyVoice_emosphere/pretrained_models/CosyVoice-300M-Instruct/campplus.onnx", sess_options=option, providers=providers)

# Embed each prompt recording. Only `neutral` and `angry` are consumed in the
# statements below; the other embeddings are kept as module-level names.
utt, neutral = single_job("/mnt/workspace/fengping/tools/prompt/hanmo_neutral.wav")
utt, happy = single_job("/mnt/workspace/fengping/tools/prompt/hanmo_happy.wav")
utt, sad = single_job("/mnt/workspace/fengping/tools/prompt/hanmo_sad.wav")
utt, surprise = single_job("/mnt/workspace/fengping/tools/prompt/hanmo_surprise.wav")
utt, angry = single_job("/mnt/workspace/fengping/tools/prompt/hanmo_angry.wav")

# Direction from the neutral embedding to the angry embedding, in float64.
diff = torch.tensor(angry, dtype=torch.float64) - torch.tensor(neutral, dtype=torch.float64)
norm = torch.norm(diff)
if norm > eps:  # avoid division by zero
    # `diff` is already a float64 tensor, so the scaled result is saved
    # directly. Re-wrapping it in torch.tensor() would only make a redundant
    # copy and raise PyTorch's copy-construct UserWarning.
    torch.save(diff / (norm + eps), "/mnt/workspace/fengping/tools/prompt/angry.pt")