File size: 11,125 Bytes
149fbcd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
import gradio as gr
import sys, os
import torch
from cosyvoice.utils.file_utils import load_wav
from tts_model.base_model.cosyvoice import CosyVoice as CosyVoiceTTS_base
from tts_model.speaker_minus.cosyvoice import CosyVoice as CosyVoiceTTS_speakerminus
# from tts_model.model_cosy2_instruct import CosyVoiceTTS as CosyVoiceTTS_cosy2
from pydub import AudioSegment  # (was imported a second time below; duplicate removed)
import tempfile
import soundfile as sf
import subprocess
import numpy as np
import random


# AudioSegment.converter = "/mnt/by079416/fengping/ffmpeg-7.0.2-amd64-static/ffmpeg"
# AudioSegment.ffprobe = "/mnt/by079416/fengping/ffmpeg-7.0.2-amd64-static/ffprobe"

# Expose the static ffmpeg build to pydub / subprocess via PATH.
# NOTE(review): this path ends at the "ffmpeg" binary itself ("/ffmpeg/");
# PATH entries should normally be directories — confirm the intended location.
ffmpeg_path = os.path.expanduser("/mnt/by079416/fengping/ffmpeg-7.0.2-amd64-static/ffmpeg/")
os.environ["PATH"] += os.pathsep + ffmpeg_path

# Make the vendored Matcha-TTS package importable.  The original
# `os.system('export PYTHONPATH=...')` ran in a throw-away subshell and never
# affected this interpreter; set the variable in-process instead so child
# processes inherit it too (sys.path.append covers the current process).
sys.path.append('third_party/Matcha-TTS')
os.environ["PYTHONPATH"] = os.pathsep.join(
    p for p in ('third_party/Matcha-TTS', os.environ.get("PYTHONPATH")) if p
)

# Load both model variants once at import time (slow; requires local weights).
tts_base = CosyVoiceTTS_base(model_dir="./pretrained_models/CosyVoice-300M/")
tts_speakerminus = CosyVoiceTTS_speakerminus(model_dir="./pretrained_models/CosyVoice-300M-speakerminus/")
# tts_cosy2_instruct = CosyVoiceTTS_cosy2(model_path="./pretrained_models/CosyVoice-300M-Instruct_cosy2/")

# Default reference transcript for each built-in speaker, used when the user
# supplies no reference audio/text of their own.
text_prompt = {
"翟佳宁": "这个节目就是把四个男嘉宾,四个女嘉宾放一个大别墅里让他们朝夕相处一整个月,月末选择心动的彼此。",
"范志毅": "没这个能力知道吗,我已经说了,你像这样的比赛本身就没有打好基础。",
"呼兰": "发完之后那个工作人员说,老师,呼兰老师你还要再加个标签儿,我说加什么标签儿,他说你就加一个呼兰太好笑了。",
"江梓浩": "就是很多我们这帮演员一整年也就上这么一个脱口秀类型的节目。",
"李雪琴": "我就劝他,我说你呀,你没事儿也放松放松,你那身体都亮红灯儿了你还不注意。",
"刘旸": "比如这三年我在街上开车,会在开车的时候进行一些哲思,我有天开车的时候路过一个地方。",
"唐香玉": "大家好我叫唐香玉, 我年前把我的工作辞了,成了一个全职脱口秀演员。",
"小鹿":  "然后我就老家的亲戚太多了,我也记不清谁该叫谁,所以我妈带着我和我。",
"于祥宇": "我大学专业学的是哲学,然后节目组就说那这期主题你可以刚好聊一下哲学专业毕业之后的就业方向。",
"赵晓卉": "终于没有人问我为什么不辞职了,结果谈到现在,谈恋爱第一天人家问我,能打个电话吗?我说你有啥事儿。",
"徐志胜": "最舒服的一个方式,这个舞台也不一定就是说是来第一年就好嘛,只要你坚持,肯定会有发光发热的那天嘛。"
} 
# Maps speaker display name -> reference wav basename under `audio_prompt_path`.
audio_prompt = {
    "翟佳宁": "zhaijianing",
    "范志毅": "fanzhiyi",
    "呼兰": "hulan",
    "江梓浩": "jiangzhihao",
    "李雪琴": "lixueqin",
    "刘旸": "liuchang",
    "唐香玉": "tangxiangyu",
    "小鹿": "xiaolu",
    "于祥宇": "yuxiangyu",
    "赵晓卉": "zhaoxiaohui",
    "徐志胜": "xuzhisheng"
}
# Directory holding the per-speaker reference prompt wavs.
audio_prompt_path = "/mnt/by079416/fengping/CosyVoice2/talk_show_prompt/"

def load_audio_and_convert_to_16bit(file_path, target_sample_rate=16000):
    """Load an audio file, downmix to mono, resample, and return 16-bit PCM.

    Args:
        file_path: Path to any file pydub/ffmpeg can decode.
        target_sample_rate: Desired output sample rate in Hz.

    Returns:
        Tuple of (1-D torch.int16 tensor of peak-normalized samples,
        target_sample_rate).
    """
    audio = AudioSegment.from_file(file_path)

    # Downmix to a single channel.
    if audio.channels > 1:
        audio = audio.set_channels(1)

    # Resample only when needed.
    if audio.frame_rate != target_sample_rate:
        audio = audio.set_frame_rate(target_sample_rate)

    audio_data = np.array(audio.get_array_of_samples(), dtype=np.float32)

    # Peak-normalize to [-1, 1].  Guard against empty or all-zero (silent)
    # input, which previously caused a divide-by-zero / NaN samples.
    peak = np.max(np.abs(audio_data)) if audio_data.size else 0.0
    if peak > 0:
        audio_data = audio_data / peak

    # Re-quantize to signed 16-bit PCM.
    audio_data = (audio_data * 32767).astype(np.int16)

    return torch.tensor(audio_data), target_sample_rate

def convert_audio_with_sox(input_file, output_file, target_sample_rate=16000):
    """Convert `input_file` to 16-bit mono audio at `target_sample_rate` using sox.

    Args:
        input_file: Source audio path.
        output_file: Destination path sox writes to.
        target_sample_rate: Output rate in Hz.

    Returns:
        True on success, False on failure (sox missing from PATH, or sox
        exited non-zero).  Previous version returned None and let a missing
        sox binary raise FileNotFoundError; callers ignoring the return value
        are unaffected.
    """
    command = [
        'sox', input_file,
        '-r', str(target_sample_rate),  # output sample rate
        '-b', '16',                     # 16-bit samples
        '-c', '1',                      # mono
        output_file,
    ]
    try:
        subprocess.run(command, check=True)
    except FileNotFoundError as e:
        # sox binary not installed / not on PATH.
        print(f"Error during conversion: {e}")
        return False
    except subprocess.CalledProcessError as e:
        print(f"Error during conversion: {e}")
        return False
    print(f"Audio converted successfully: {output_file}")
    return True

def generate_speech_base(tts_text, speed, speaker, emotion, ref_audio, ref_text):
    """Synthesize `tts_text` with the base CosyVoice model via zero-shot cloning.

    When the user supplies no reference audio/text, falls back to the canned
    prompt for `speaker`.  `emotion` is currently unused by the base model.

    Returns:
        Path to a temporary wav file containing the generated speech.
    """
    if not ref_audio and not ref_text:
        # No user reference: use the built-in prompt for the chosen speaker.
        ref_text = text_prompt.get(speaker, "")
        ref_audio = os.path.join(audio_prompt_path, f"{audio_prompt.get(speaker)}.wav")
    else:
        # Normalize the user's upload to 16 kHz / 16-bit / mono via sox.
        # Fixed: the path template was missing its f-prefix (it was the
        # literal "/tmp/{random_int}_ref.wav"), and the converted file was
        # never used afterwards.
        random_int = random.randint(0, 90)
        soxsed_ref_audio = f"/tmp/{random_int}_ref.wav"
        convert_audio_with_sox(ref_audio, soxsed_ref_audio)
        if os.path.exists(soxsed_ref_audio):
            # Use the converted file; fall back to the original upload if
            # sox failed and produced nothing.
            ref_audio = soxsed_ref_audio
    ref_audio = load_wav(ref_audio, 16000)
    # ref_audio, target_sample_rate = load_audio_and_convert_to_16bit(ref_audio)
    sample_rate, full_audio = tts_base.inference_zero_shot(
        tts_text,
        prompt_text=ref_text,
        prompt_speech_16k=ref_audio,
        speed=speed,
        # speaker=speaker,
        # emotion=emotion,
    )
    print("sample_rate:", sample_rate, "full_audio:", full_audio.min(), full_audio.max())
    # Wrap the raw PCM in an AudioSegment and export to a temp wav whose
    # path is handed back to Gradio.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio_file:
        output_audio_path = temp_audio_file.name
        audio_segment = AudioSegment(
            full_audio.tobytes(),
            frame_rate=sample_rate,
            sample_width=full_audio.dtype.itemsize,
            channels=1,
        )
        audio_segment.export(output_audio_path, format="wav")
    print(f"Audio saved to {output_audio_path}")

    return output_audio_path

def generate_speech_speakerminus(tts_text, speed, speaker, key, ref_audio, ref_text):
    """Synthesize `tts_text` with the speaker-minus model plus an emotion embedding.

    `key` selects the emotion embedding (e.g. "Angry"/"Happy"/"Surprise"/"Sad")
    from a precomputed embedding table on disk.  When the user supplies no
    reference audio/text, falls back to the canned prompt for `speaker`.

    Returns:
        Path to a temporary wav file containing the generated speech.
    """
    if not ref_audio and not ref_text:
        # No user reference: use the built-in prompt for the chosen speaker.
        ref_text = text_prompt.get(speaker, "")
        ref_audio = os.path.join(audio_prompt_path, f"{audio_prompt.get(speaker)}.wav")
    else:
        # Normalize the user's upload to 16 kHz / 16-bit / mono via sox.
        # Fixed: the path template was missing its f-prefix (it was the
        # literal "/tmp/{random_int}_ref.wav"), and the converted file was
        # never used afterwards.
        random_int = random.randint(0, 90)
        soxsed_ref_audio = f"/tmp/{random_int}_ref.wav"
        convert_audio_with_sox(ref_audio, soxsed_ref_audio)
        if os.path.exists(soxsed_ref_audio):
            # Use the converted file; fall back to the original upload if
            # sox failed and produced nothing.
            ref_audio = soxsed_ref_audio
    # ref_audio, target_sample_rate = load_audio_and_convert_to_16bit(ref_audio)
    ref_audio = load_wav(ref_audio, 16000)

    # Emotion embedding for speaker id "0002", keyed by `key`.
    # NOTE(review): path and speaker id are hard-coded; a missing `key`
    # raises KeyError here — confirm the table always has all four emotions.
    emotion_info = torch.load("/mnt/by079416/fengping/CosyVoice2/embedding_info.pt")["0002"][key]
    sample_rate, full_audio = tts_speakerminus.inference_zero_shot(
        tts_text,
        prompt_text=ref_text,
        # speaker=speaker,
        prompt_speech_16k=ref_audio,
        key=key,
        emotion_speakerminus=emotion_info,
        # ref_audio = ref_audio,
        speed=speed,
    )
    print("sample_rate:", sample_rate, "full_audio:", full_audio.min(), full_audio.max())
    # Wrap the raw PCM in an AudioSegment and export to a temp wav whose
    # path is handed back to Gradio.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio_file:
        output_audio_path = temp_audio_file.name
        audio_segment = AudioSegment(
            full_audio.tobytes(),
            frame_rate=sample_rate,
            sample_width=full_audio.dtype.itemsize,
            channels=1,
        )
        audio_segment.export(output_audio_path, format="wav")
    print(f"Audio saved to {output_audio_path}")
    return output_audio_path

# Speaker display names offered in the UI radio controls; each entry is a
# key into `text_prompt` / `audio_prompt` above.
names = [
    "于祥宇",
    "刘旸",
    "呼兰",
    "唐香玉",
    "小鹿",
    "李雪琴",
    "江梓浩",
    "翟佳宁",
    "范志毅",
    "赵晓卉",
    "徐志胜"
]

# Build the two-tab Gradio UI: TTS-v1 drives the base model, TTS-v3 drives
# the speaker-minus (emotion) model.  Input lists must match the positional
# parameters of the corresponding generate_speech_* handler.
with gr.Blocks() as demo:
    gr.Markdown("base model and instruct model")
    # base
    with gr.Tab("TTS-v1"):
        gr.Markdown("## base model testing ##")
        tts_base_inputs = [
            gr.Textbox(lines=2, placeholder="Enter text here...", label="Input Text"),
            gr.Slider(minimum=0.5, maximum=2.0, value=1.0, step=0.1, label="speed", info="Adjust speech rate (0.5x to 2.0x)"),
            gr.Radio(choices=names, value="徐志胜", label="可选说话人", info="选择语音说话人"),
            gr.Radio(choices=["peace"], value="peace", label="情感", info="选择情感风格"),
            gr.Audio(type="filepath", label="输入音频"),
            gr.Textbox(lines=2, placeholder="Enter audio corresponding text here...", label="音频对应的文本")
        ]
        tts_base_output = gr.Audio(type="filepath", label="生成的语音")
        tts_base_button = gr.Button("生成语音")
        tts_base_button.click(
            fn=generate_speech_base,
            inputs=tts_base_inputs,
            outputs=tts_base_output
        )

    # with gr.Tab("TTS-v2"):
    #     gr.Markdown("## base model testing ##")
    #     tts_base_inputs = [
    #         gr.Textbox(lines=2, placeholder="Enter text here...", label="Input Text"),
    #         gr.Slider(minimum=0.5, maximum=2.0, value=1.0, step=0.1, label="speed", info="Adjust speech rate (0.5x to 2.0x)"),
    #         gr.Radio(choices=names, value="徐志胜", label="说话人", info="选择语音说话人"),
    #         gr.Radio(choices=["peace", "excited", "mixed"], value="peace", label="情感", info="选择情感风格"),
    #         gr.Textbox(lines=2, placeholder="Enter your instruct text here...", label="Input Text"),
    #         gr.Audio(type="filepath", label="输入音频"),
    #         gr.Textbox(lines=2, placeholder="Enter audio corresponding text here...", label="音频对应的文本")
    #     ]
    #     tts_base_output = gr.Audio(type="filepath", label="生成的语音")
    #     tts_base_button = gr.Button("生成语音")
    #     tts_base_button.click(
    #         fn=generate_speech_cosy2_instruct,
    #         inputs=tts_base_inputs,
    #         outputs=tts_base_output
    #     )

    # # instruct

    # def generate_speech_speakerminus(tts_text, speed, speaker, emotion_speakerminus, key, ref_audio, ref_text):

    with gr.Tab("TTS-v3"):
        gr.Markdown("## instruct model testing")
        tts2_inputs = [
            gr.Textbox(lines=2, placeholder="输入文本...", label="输入文本"),
            gr.Slider(minimum=0.5, maximum=2.0, value=1.0, step=0.1, label="语速", info="调整语速 (0.5x 到 2.0x)"),
            gr.Radio(choices=names, value="徐志胜", label="可选说话人", info="选择语音说话人"),
            # Fixed: default value was "peace", which is not among the choices
            # and left the radio in an invalid state.
            gr.Radio(choices=["Angry", "Happy", "Surprise", "Sad"], value="Happy", label="情感", info="选择情感风格"),
            gr.Audio(type="filepath", label="输入音频"),
            gr.Textbox(lines=2, placeholder="Enter audio corresponding text here...", label="音频对应的文本")
        ]
        tts2_output = gr.Audio(type="filepath", label="生成的语音")
        tts2_button = gr.Button("生成语音")
        tts2_button.click(
            fn=generate_speech_speakerminus,
            inputs=tts2_inputs,
            outputs=tts2_output
        )

if __name__ == "__main__":
    # Serve the demo on all interfaces at port 10132, without a public
    # Gradio share link.
    demo.launch(
        server_name="0.0.0.0",
        server_port=10132,
        share=False
    )