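"""Gradio demo for two CosyVoice-300M TTS variants.

Tab TTS-v1 runs the base model with zero-shot voice cloning from an uploaded
reference clip (or a built-in talk-show speaker prompt); tab TTS-v3 runs the
"speaker-minus" variant, which additionally conditions on a precomputed
emotion embedding (Angry / Happy / Surprise / Sad).
"""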
import gradio as gr
import sys, os
import torch
from cosyvoice.utils.file_utils import load_wav
from tts_model.base_model.cosyvoice import CosyVoice as CosyVoiceTTS_base
from tts_model.speaker_minus.cosyvoice import CosyVoice as CosyVoiceTTS_speakerminus
# from tts_model.model_cosy2_instruct import CosyVoiceTTS as CosyVoiceTTS_cosy2
from pydub import AudioSegment
import tempfile
import soundfile as sf
import subprocess
import numpy as np
import random
# AudioSegment.converter = "/mnt/by079416/fengping/ffmpeg-7.0.2-amd64-static/ffmpeg"
# AudioSegment.ffprobe = "/mnt/by079416/fengping/ffmpeg-7.0.2-amd64-static/ffprobe"
# Put the directory holding the static ffmpeg/ffprobe binaries on PATH so pydub can find them.
ffmpeg_path = os.path.expanduser("/mnt/by079416/fengping/ffmpeg-7.0.2-amd64-static/")
os.environ["PATH"] += os.pathsep + ffmpeg_path
sys.path.append('third_party/Matcha-TTS')
os.environ["PYTHONPATH"] = 'third_party/Matcha-TTS'  # os.system("export ...") would only affect a throwaway subshell
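# Load both model variants once at startup so every request reuses the warm models.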
tts_base = CosyVoiceTTS_base(model_dir="./pretrained_models/CosyVoice-300M/")
tts_speakerminus = CosyVoiceTTS_speakerminus(model_dir="./pretrained_models/CosyVoice-300M-speakerminus/")
# tts_cosy2_instruct = CosyVoiceTTS_cosy2(model_path="./pretrained_models/CosyVoice-300M-Instruct_cosy2/")
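# Built-in zero-shot prompts: text_prompt maps each speaker to the transcript of
# their reference clip (kept in Chinese because it must match the prompt audio),
# and audio_prompt maps the speaker to the wav filename under audio_prompt_path.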
text_prompt = {
    "翟佳宁": "这个节目就是把四个男嘉宾,四个女嘉宾放一个大别墅里让他们朝夕相处一整个月,月末选择心动的彼此。",
    "范志毅": "没这个能力知道吗,我已经说了,你像这样的比赛本身就没有打好基础。",
    "呼兰": "发完之后那个工作人员说,老师,呼兰老师你还要再加个标签儿,我说加什么标签儿,他说你就加一个呼兰太好笑了。",
    "江梓浩": "就是很多我们这帮演员一整年也就上这么一个脱口秀类型的节目。",
    "李雪琴": "我就劝他,我说你呀,你没事儿也放松放松,你那身体都亮红灯儿了你还不注意。",
    "刘旸": "比如这三年我在街上开车,会在开车的时候进行一些哲思,我有天开车的时候路过一个地方。",
    "唐香玉": "大家好我叫唐香玉, 我年前把我的工作辞了,成了一个全职脱口秀演员。",
    "小鹿": "然后我就老家的亲戚太多了,我也记不清谁该叫谁,所以我妈带着我和我。",
    "于祥宇": "我大学专业学的是哲学,然后节目组就说那这期主题你可以刚好聊一下哲学专业毕业之后的就业方向。",
    "赵晓卉": "终于没有人问我为什么不辞职了,结果谈到现在,谈恋爱第一天人家问我,能打个电话吗?我说你有啥事儿。",
    "徐志胜": "最舒服的一个方式,这个舞台也不一定就是说是来第一年就好嘛,只要你坚持,肯定会有发光发热的那天嘛。"
}
audio_prompt = {
    "翟佳宁": "zhaijianing",
    "范志毅": "fanzhiyi",
    "呼兰": "hulan",
    "江梓浩": "jiangzhihao",
    "李雪琴": "lixueqin",
    "刘旸": "liuchang",
    "唐香玉": "tangxiangyu",
    "小鹿": "xiaolu",
    "于祥宇": "yuxiangyu",
    "赵晓卉": "zhaoxiaohui",
    "徐志胜": "xuzhisheng"
}
audio_prompt_path = "/mnt/by079416/fengping/CosyVoice2/talk_show_prompt/"
def load_audio_and_convert_to_16bit(file_path, target_sample_rate=16000):
    """Load any audio file, downmix to mono at target_sample_rate, and return int16 samples."""
    audio = AudioSegment.from_file(file_path)
    if audio.channels > 1:
        audio = audio.set_channels(1)
    if audio.frame_rate != target_sample_rate:
        audio = audio.set_frame_rate(target_sample_rate)
    audio_data = np.array(audio.get_array_of_samples(), dtype=np.float32)
    peak = np.max(np.abs(audio_data))
    if peak > 0:  # avoid a divide-by-zero on silent input
        audio_data = audio_data / peak
    audio_data = (audio_data * 32767).astype(np.int16)
    return torch.tensor(audio_data), target_sample_rate
def convert_audio_with_sox(input_file, output_file, target_sample_rate=16000):
    """Resample input_file to mono 16-bit PCM at target_sample_rate via the sox CLI."""
    try:
        command = [
            'sox', input_file,
            '-r', str(target_sample_rate),
            '-b', '16',
            '-c', '1',
            output_file
        ]
        subprocess.run(command, check=True)
        print(f"Audio converted successfully: {output_file}")
    except subprocess.CalledProcessError as e:
        print(f"Error during conversion: {e}")
def generate_speech_base(tts_text, speed, speaker, emotion, ref_audio, ref_text):
    """Zero-shot synthesis with the base model; with no uploaded reference, fall back to the built-in prompt for `speaker`."""
    if not ref_audio and not ref_text:
        ref_text = text_prompt.get(speaker, "")
        ref_audio = os.path.join(audio_prompt_path, f"{audio_prompt.get(speaker)}.wav")
    else:
        # Normalize the uploaded reference to mono 16 kHz 16-bit PCM before loading.
        random_int = random.randint(0, 90)
        soxed_ref_audio = f"/tmp/{random_int}_ref.wav"
        convert_audio_with_sox(ref_audio, soxed_ref_audio)
        ref_audio = soxed_ref_audio
    ref_audio = load_wav(ref_audio, 16000)
    # ref_audio, target_sample_rate = load_audio_and_convert_to_16bit(ref_audio)
    sample_rate, full_audio = tts_base.inference_zero_shot(
        tts_text,
        prompt_text=ref_text,
        prompt_speech_16k=ref_audio,
        speed=speed,
        # speaker=speaker,
        # emotion=emotion,
    )
    print("sample_rate:", sample_rate, "full_audio:", full_audio.min(), full_audio.max())
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio_file:
        output_audio_path = temp_audio_file.name
        audio_segment = AudioSegment(
            full_audio.tobytes(),
            frame_rate=sample_rate,
            sample_width=full_audio.dtype.itemsize,
            channels=1
        )
        audio_segment.export(output_audio_path, format="wav")
    print(f"Audio saved to {output_audio_path}")
    return output_audio_path
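# Example call (hypothetical text; passing None/"" falls back to the built-in 徐志胜 prompt):
#   wav_path = generate_speech_base("大家好,欢迎收听。", 1.0, "徐志胜", "peace", None, "")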
def generate_speech_speakerminus(tts_text, speed, speaker, key, ref_audio, ref_text):
    """Synthesize with the speaker-minus model, steering emotion via the precomputed embedding selected by `key`."""
    if not ref_audio and not ref_text:
        ref_text = text_prompt.get(speaker, "")
        ref_audio = os.path.join(audio_prompt_path, f"{audio_prompt.get(speaker)}.wav")
    else:
        # Normalize the uploaded reference to mono 16 kHz 16-bit PCM before loading.
        random_int = random.randint(0, 90)
        soxed_ref_audio = f"/tmp/{random_int}_ref.wav"
        convert_audio_with_sox(ref_audio, soxed_ref_audio)
        ref_audio = soxed_ref_audio
    # ref_audio, target_sample_rate = load_audio_and_convert_to_16bit(ref_audio)
    ref_audio = load_wav(ref_audio, 16000)
    # if key == "Surprise":
    #     emotion_info = torch.load("/mnt/by079416/surprise.pt")
    # if key == "Sad":
    #     emotion_info = torch.load("/mnt/by079416/sad.pt")
    # if key == "Angry":
    #     emotion_info = torch.load("/mnt/by079416/angry.pt")
    # if key == "Happy":
    #     emotion_info = torch.load("/mnt/by079416/happy.pt")
    # Per-emotion embeddings precomputed for reference speaker "0002".
    emotion_info = torch.load("/mnt/by079416/fengping/CosyVoice2/embedding_info.pt")["0002"][key]
    sample_rate, full_audio = tts_speakerminus.inference_zero_shot(
        tts_text,
        prompt_text=ref_text,
        # speaker=speaker,
        prompt_speech_16k=ref_audio,
        key=key,
        emotion_speakerminus=emotion_info,
        # ref_audio=ref_audio,
        speed=speed
    )
    print("sample_rate:", sample_rate, "full_audio:", full_audio.min(), full_audio.max())
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio_file:
        output_audio_path = temp_audio_file.name
        audio_segment = AudioSegment(
            full_audio.tobytes(),
            frame_rate=sample_rate,
            sample_width=full_audio.dtype.itemsize,
            channels=1
        )
        audio_segment.export(output_audio_path, format="wav")
    print(f"Audio saved to {output_audio_path}")
    return output_audio_path
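# Example call (hypothetical text; assumes "Happy" is a key under "0002" in embedding_info.pt):
#   wav_path = generate_speech_speakerminus("大家好,欢迎收听。", 1.0, "徐志胜", "Happy", None, "")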
names = [
    "于祥宇",
    "刘旸",
    "呼兰",
    "唐香玉",
    "小鹿",
    "李雪琴",
    "江梓浩",
    "翟佳宁",
    "范志毅",
    "赵晓卉",
    "徐志胜"
]
with gr.Blocks() as demo:
    gr.Markdown("Base model and instruct model demos")
    # base
    with gr.Tab("TTS-v1"):
        gr.Markdown("## Base model testing")
        tts_base_inputs = [
            gr.Textbox(lines=2, placeholder="Enter text here...", label="Input Text"),
            gr.Slider(minimum=0.5, maximum=2.0, value=1.0, step=0.1, label="Speed", info="Adjust speech rate (0.5x to 2.0x)"),
            gr.Radio(choices=names, value="徐志胜", label="Speaker", info="Choose the voice to clone"),
            gr.Radio(choices=["peace"], value="peace", label="Emotion", info="Choose the emotion style"),
            gr.Audio(type="filepath", label="Reference audio"),
            gr.Textbox(lines=2, placeholder="Enter audio corresponding text here...", label="Transcript of the reference audio")
        ]
        tts_base_output = gr.Audio(type="filepath", label="Generated speech")
        tts_base_button = gr.Button("Generate speech")
        tts_base_button.click(
            fn=generate_speech_base,
            inputs=tts_base_inputs,
            outputs=tts_base_output
        )
# with gr.Tab("TTS-v2"):
# gr.Markdown("## base model testing ##")
# tts_base_inputs = [
# gr.Textbox(lines=2, placeholder="Enter text here...", label="Input Text"),
# gr.Slider(minimum=0.5, maximum=2.0, value=1.0, step=0.1, label="speed", info="Adjust speech rate (0.5x to 2.0x)"),
# gr.Radio(choices=names, value="徐志胜", label="说话人", info="选择语音说话人"),
# gr.Radio(choices=["peace", "excited", "mixed"], value="peace", label="情感", info="选择情感风格"),
# gr.Textbox(lines=2, placeholder="Enter your instruct text here...", label="Input Text"),
# gr.Audio(type="filepath", label="输入音频"),
# gr.Textbox(lines=2, placeholder="Enter audio corresponding text here...", label="音频对应的文本")
# ]
# tts_base_output = gr.Audio(type="filepath", label="生成的语音")
# tts_base_button = gr.Button("生成语音")
# tts_base_button.click(
# fn=generate_speech_cosy2_instruct,
# inputs=tts_base_inputs,
# outputs=tts_base_output
# )
# # instruct
# def generate_speech_speakerminus(tts_text, speed, speaker, emotion_speakerminus, key, ref_audio, ref_text):
with gr.Tab("TTS-v3"):
gr.Markdown("## instruct model testing")
tts2_inputs = [
gr.Textbox(lines=2, placeholder="输入文本...", label="输入文本"),
gr.Slider(minimum=0.5, maximum=2.0, value=1.0, step=0.1, label="语速", info="调整语速 (0.5x 到 2.0x)"),
gr.Radio(choices=names, value="徐志胜", label="可选说话人", info="选择语音说话人"),
gr.Radio(choices=["Angry", "Happy", "Surprise", "Sad"], value="peace", label="情感", info="选择情感风格"),
gr.Audio(type="filepath", label="输入音频"),
gr.Textbox(lines=2, placeholder="Enter audio corresponding text here...", label="音频对应的文本")
]
tts2_output = gr.Audio(type="filepath", label="生成的语音")
tts2_button = gr.Button("生成语音")
tts2_button.click(
fn=generate_speech_speakerminus,
inputs=tts2_inputs,
outputs=tts2_output
)
if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",
        server_port=10132,
        share=False
    )
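# A minimal way to try the app locally (assuming this file is saved as app.py and
# the model dirs / prompt wavs above exist):
#   python app.py   ->   open http://localhost:10132 in a browser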