import os
import sys
import random
import subprocess
import tempfile
import uuid

import numpy as np
import torch
import soundfile as sf
import gradio as gr
from pydub import AudioSegment

from cosyvoice.utils.file_utils import load_wav
from tts_model.base_model.cosyvoice import CosyVoice as CosyVoiceTTS_base
from tts_model.sft_model.cosyvoice import CosyVoice as CosyVoiceTTS_sft
from tts_model.speaker_minus.cosyvoice import CosyVoice as CosyVoiceTTS_speakerminus
# from tts_model.model_cosy2_instruct import CosyVoiceTTS as CosyVoiceTTS_cosy2

# pydub resolves ffmpeg/ffprobe via PATH, so append the directory that contains
# the static binaries (the binary itself lives at .../ffmpeg-7.0.2-amd64-static/ffmpeg).
# AudioSegment.converter = "/mnt/by079416/fengping/ffmpeg-7.0.2-amd64-static/ffmpeg"
# AudioSegment.ffprobe = "/mnt/by079416/fengping/ffmpeg-7.0.2-amd64-static/ffprobe"
ffmpeg_dir = os.path.expanduser("/mnt/by079416/fengping/ffmpeg-7.0.2-amd64-static/")
os.environ["PATH"] += os.pathsep + ffmpeg_dir

# Make Matcha-TTS importable in this process and in any subprocesses.
sys.path.append('third_party/Matcha-TTS')
os.environ['PYTHONPATH'] = 'third_party/Matcha-TTS' + os.pathsep + os.environ.get('PYTHONPATH', '')

tts_base = CosyVoiceTTS_base(model_dir="./pretrained_models/CosyVoice-300M/")
tts_speakerminus = CosyVoiceTTS_speakerminus(model_dir="./pretrained_models/CosyVoice-300M-speakerminus/")
# tts_cosy2_instruct = CosyVoiceTTS_cosy2(model_path="./pretrained_models/CosyVoice-300M-Instruct_cosy2/")
tts_sft = CosyVoiceTTS_sft(model_dir="./pretrained_models/CosyVoice-300M-SFT/")

# Transcripts of the bundled reference clips; prompt_text must match the audio content.
text_prompt = {
    "翟佳宁": "这个节目就是把四个男嘉宾,四个女嘉宾放一个大别墅里让他们朝夕相处一整个月,月末选择心动的彼此。",
    "范志毅": "没这个能力知道吗,我已经说了,你像这样的比赛本身就没有打好基础。",
    "呼兰": "发完之后那个工作人员说,老师,呼兰老师你还要再加个标签儿,我说加什么标签儿,他说你就加一个呼兰太好笑了。",
    "江梓浩": "就是很多我们这帮演员一整年也就上这么一个脱口秀类型的节目。",
    "李雪琴": "我就劝他,我说你呀,你没事儿也放松放松,你那身体都亮红灯儿了你还不注意。",
    "刘旸": "比如这三年我在街上开车,会在开车的时候进行一些哲思,我有天开车的时候路过一个地方。",
    "唐香玉": "大家好我叫唐香玉, 我年前把我的工作辞了,成了一个全职脱口秀演员。",
    "小鹿": "然后我就老家的亲戚太多了,我也记不清谁该叫谁,所以我妈带着我和我。",
    "于祥宇": "我大学专业学的是哲学,然后节目组就说那这期主题你可以刚好聊一下哲学专业毕业之后的就业方向。",
    "赵晓卉": "终于没有人问我为什么不辞职了,结果谈到现在,谈恋爱第一天人家问我,能打个电话吗?我说你有啥事儿。",
    "徐志胜": "最舒服的一个方式,这个舞台也不一定就是说是来第一年就好嘛,只要你坚持,肯定会有发光发热的那天嘛。"
}

# File stems of the preset reference WAVs under audio_prompt_path.
audio_prompt = {
    "翟佳宁": "zhaijianing",
    "范志毅": "fanzhiyi",
    "呼兰": "hulan",
    "江梓浩": "jiangzhihao",
    "李雪琴": "lixueqin",
    "刘旸": "liuchang",
    "唐香玉": "tangxiangyu",
    "小鹿": "xiaolu",
    "于祥宇": "yuxiangyu",
    "赵晓卉": "zhaoxiaohui",
    "徐志胜": "xuzhisheng"
}

audio_prompt_path = "/mnt/by079416/fengping/CosyVoice2/talk_show_prompt/"


def load_audio_and_convert_to_16bit(file_path, target_sample_rate=16000):
    """Load any audio file, downmix to mono, resample, and return 16-bit samples."""
    audio = AudioSegment.from_file(file_path)
    if audio.channels > 1:
        audio = audio.set_channels(1)
    if audio.frame_rate != target_sample_rate:
        audio = audio.set_frame_rate(target_sample_rate)
    audio_data = np.array(audio.get_array_of_samples(), dtype=np.float32)
    peak = np.max(np.abs(audio_data))
    if peak > 0:  # avoid division by zero on silent input
        audio_data = audio_data / peak
    audio_data = (audio_data * 32767).astype(np.int16)
    return torch.tensor(audio_data), target_sample_rate


def convert_audio_with_sox(input_file, output_file, target_sample_rate=16000):
    """Resample an uploaded clip to 16 kHz mono WAV (uses ffmpeg despite the name)."""
    try:
        command = [
            './ffmpeg-7.0.2-amd64-static/ffmpeg',
            '-i', input_file,                 # input file must be flagged explicitly with -i
            '-ar', str(target_sample_rate),   # audio sample rate
            '-ac', '1',                       # mono
            '-b:a', '16k',                    # bitrate hint; effectively ignored for PCM WAV
            '-f', 'wav',                      # force WAV output
            output_file
        ]
        subprocess.run(command, check=True)
        print(f"Audio converted successfully: {output_file}")
    except subprocess.CalledProcessError as e:
        print(f"Error during conversion: {e}")


os.makedirs("./tmp", exist_ok=True)
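# A minimal alternative writer, not wired into the handlers below: soundfile
# (imported above as `sf`) writes WAV directly and accepts float arrays in
# [-1, 1], which sidesteps the manual PCM rescaling that pydub's raw-bytes
# constructor requires. A hedged sketch, assuming mono input:
def write_wav_sf(path, sample_rate, audio):
    audio = np.asarray(audio)
    if audio.dtype.kind == 'f':
        # assume float audio is already normalized to [-1, 1]
        audio = np.clip(audio, -1.0, 1.0).astype(np.float32)
    sf.write(path, audio, sample_rate, subtype='PCM_16')  # 16-bit PCM WAV
    return path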
def generate_speech_sft(tts_text, speaker):
    """Multilingual SFT synthesis: preset speaker id, no reference audio needed."""
    sample_rate, full_audio = tts_sft.inference_sft(
        tts_text,
        spk_id=speaker
    )
    full_audio = full_audio.astype(np.float32)
    if full_audio.max() > 1.0 or full_audio.min() < -1.0:
        full_audio /= 32768.0  # int16 range -> [-1, 1]
    print("dtype:", full_audio.dtype, "shape:", full_audio.shape,
          "max:", full_audio.max(), "min:", full_audio.min())

    # Keep a debug copy on disk; pydub's raw constructor expects integer PCM,
    # so rescale the float buffer to int16 before export.
    out_path = os.path.join("./tmp", f"{uuid.uuid4().hex}.wav")
    pcm16 = (np.clip(full_audio, -1.0, 1.0) * 32767).astype(np.int16)
    audio_segment = AudioSegment(
        pcm16.tobytes(),
        frame_rate=sample_rate,
        sample_width=pcm16.dtype.itemsize,
        channels=1
    )
    audio_segment.export(out_path, format="wav")
    print(">>> audio path:", os.path.abspath(out_path))

    # The Gradio output component is type="numpy", so return a (rate, array) tuple.
    return (sample_rate, full_audio)


def generate_speech_base(tts_text, speed, speaker, ref_audio, ref_text):
    """Zero-shot voice cloning from a short reference clip."""
    if not ref_audio and not ref_text:
        # Fall back to the bundled prompt for the selected preset speaker.
        ref_text = text_prompt.get(speaker, "")
        ref_audio = os.path.join(audio_prompt_path, f"{audio_prompt.get(speaker)}.wav")
        ref_audio = load_wav(ref_audio, 16000)
    else:
        # Normalize the uploaded clip to 16 kHz mono WAV before loading it.
        random_int = random.randint(0, 90000)
        soxsed_ref_audio = f"/tmp/{random_int}_ref.wav"
        convert_audio_with_sox(ref_audio, soxsed_ref_audio)
        ref_audio = load_wav(soxsed_ref_audio, 16000)

    sample_rate, full_audio = tts_base.inference_zero_shot(
        tts_text,
        prompt_text=ref_text,
        prompt_speech_16k=ref_audio,
        speed=speed
    )
    print("sample_rate:", sample_rate, "full_audio:", full_audio.min(), full_audio.max())

    if full_audio.dtype.kind == 'f':
        # pydub's raw constructor expects integer PCM; rescale float output.
        full_audio = (np.clip(full_audio, -1.0, 1.0) * 32767).astype(np.int16)
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio_file:
        output_audio_path = temp_audio_file.name
        audio_segment = AudioSegment(
            full_audio.tobytes(),
            frame_rate=sample_rate,
            sample_width=full_audio.dtype.itemsize,
            channels=1
        )
        audio_segment.export(output_audio_path, format="wav")
        print(f"Audio saved to {output_audio_path}")
    return output_audio_path
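# Hedged sketch, left commented out: the cloning handlers re-read the preset
# reference WAVs from disk on every request. If the presets are static, they
# could be loaded once at startup instead (this assumes every stem listed in
# `audio_prompt` actually exists under `audio_prompt_path`):
# preset_prompts = {
#     name: load_wav(os.path.join(audio_prompt_path, f"{stem}.wav"), 16000)
#     for name, stem in audio_prompt.items()
# }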
def generate_speech_speakerminus(tts_text, speed, speaker, key, ref_audio, ref_text):
    """Emotion-controllable cloning: zero-shot voice plus a per-emotion embedding."""
    if not ref_audio and not ref_text:
        ref_text = text_prompt.get(speaker, "")
        ref_audio = os.path.join(audio_prompt_path, f"{audio_prompt.get(speaker)}.wav")
        ref_audio = load_wav(ref_audio, 16000)
    else:
        random_int = random.randint(0, 90000)
        soxsed_ref_audio = f"/tmp/{random_int}_ref.wav"
        convert_audio_with_sox(ref_audio, soxsed_ref_audio)
        # Load the converted clip, not the original upload.
        ref_audio = load_wav(soxsed_ref_audio, 16000)

    # Per-emotion embedding (Angry/Happy/Surprise/Sad), indexed by speaker id "0002".
    emotion_info = torch.load("/mnt/by079416/fengping/CosyVoice2/embedding_info.pt")["0002"][key]

    sample_rate, full_audio = tts_speakerminus.inference_zero_shot(
        tts_text,
        prompt_text=ref_text,
        prompt_speech_16k=ref_audio,
        key=key,
        emotion_speakerminus=emotion_info,
        speed=speed
    )
    print("sample_rate:", sample_rate, "full_audio:", full_audio.min(), full_audio.max())

    if full_audio.dtype.kind == 'f':
        # pydub's raw constructor expects integer PCM; rescale float output.
        full_audio = (np.clip(full_audio, -1.0, 1.0) * 32767).astype(np.int16)
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio_file:
        output_audio_path = temp_audio_file.name
        audio_segment = AudioSegment(
            full_audio.tobytes(),
            frame_rate=sample_rate,
            sample_width=full_audio.dtype.itemsize,
            channels=1
        )
        audio_segment.export(output_audio_path, format="wav")
        print(f"Audio saved to {output_audio_path}")
    return output_audio_path


names = [
    "于祥宇", "刘旸", "呼兰", "唐香玉", "小鹿", "李雪琴",
    "江梓浩", "翟佳宁", "范志毅", "赵晓卉", "徐志胜"
]
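# Hedged note: judging by the lookup in generate_speech_speakerminus,
# embedding_info.pt is assumed to map speaker ids to per-emotion tensors, e.g.
# {"0002": {"Angry": ..., "Happy": ..., "Surprise": ..., "Sad": ...}}.
# A quick sanity check under that assumption:
# info = torch.load("/mnt/by079416/fengping/CosyVoice2/embedding_info.pt")
# assert {"Angry", "Happy", "Surprise", "Sad"} <= set(info["0002"])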
custom_css = """
:root {
    --primary-color: #6a11cb;
    --secondary-color: #2575fc;
    --accent-color: #ff6b6b;
    --light-bg: #f8f9fa;
    --dark-bg: #212529;
    --card-bg: #ffffff;
    --text-color: #343a40;
    --border-radius: 12px;
    --box-shadow: 0 6px 16px rgba(0,0,0,0.1);
    --transition: all 0.3s ease;
}

body {
    background: linear-gradient(135deg, var(--light-bg) 0%, #e9ecef 100%);
    min-height: 100vh;
    font-family: 'Segoe UI', 'PingFang SC', 'Microsoft YaHei', sans-serif;
    color: var(--text-color);
    line-height: 1.6;
}

.gradio-container { max-width: 1200px !important; margin: 2rem auto !important; padding: 0 1rem; }

.header {
    padding: 0 !important;
    border-radius: 10px; /* rounded corners for the whole header */
    overflow: hidden;    /* clip content outside the rounded corners -- important! */
}

/* 2. Row container: flex layout so the two halves share the same height */
#header-row {
    display: flex;
    align-items: stretch; /* children stretch to fill the container height */
}

/* 3. Logo container: strip all of Gradio's default styling to get a clean box */
#logo-container {
    padding: 0 !important;
    border: none !important;
    background: none !important;
    box-shadow: none !important;
    min-width: 150px; /* fixed minimum width for the logo area */
    flex-shrink: 0;   /* don't shrink when space is tight */
}

/* 4. The logo image itself: fill its container like a background image */
#logo-container img {
    width: 100%;
    height: 100%;
    object-fit: cover; /* crop and fill while keeping the aspect ratio */
    display: block;    /* remove the tiny gap below inline images */
}

/* 5. Title area: background, padding, and text alignment */
#title-area {
    background: linear-gradient(to right, #5e57c2, #42a5f5); /* gradient on the right half */
    padding: 20px 25px; /* room around the title text */
    display: flex;
    flex-direction: column;
    justify-content: center; /* vertically center the title */
    flex-grow: 1;            /* take up all remaining space */
}

/* 6. Title text styling */
#header-title h1 {
    color: white;
    font-size: 28px;
    margin: 0;
    font-weight: 600;
    text-shadow: 1px 1px 2px rgba(0,0,0,0.2);
}

.tabs { background: transparent !important; border: none !important; box-shadow: none !important; }

.tab-nav {
    background: var(--card-bg) !important;
    border-radius: var(--border-radius) !important;
    padding: 0.5rem !important;
    margin-bottom: 1.5rem !important;
    box-shadow: var(--box-shadow) !important;
}

.tab-button {
    padding: 1rem 1.5rem !important;
    border-radius: 8px !important;
    font-weight: 600 !important;
    transition: var(--transition) !important;
    border: none !important;
}

.tab-button.selected {
    background: linear-gradient(135deg, var(--primary-color) 0%, var(--secondary-color) 100%) !important;
    color: white !important;
    box-shadow: 0 4px 12px rgba(106, 17, 203, 0.3) !important;
}

.tab-content {
    background: var(--card-bg) !important;
    border-radius: var(--border-radius) !important;
    padding: 2rem !important;
    box-shadow: var(--box-shadow) !important;
    margin-bottom: 2rem;
    border: none !important;
}

.input-section {
    background: #f9fafb;
    padding: 1.5rem;
    border-radius: var(--border-radius);
    margin-bottom: 1.5rem;
    border: 1px solid #e9ecef;
}

.output-section {
    background: #edf2f7;
    padding: 1.5rem;
    border-radius: var(--border-radius);
    border: 1px solid #e9ecef;
    display: flex;
    flex-direction: column;
    height: 100%;
}

.control-group { margin-bottom: 1.2rem; }

.control-group label {
    display: block;
    margin-bottom: 0.5rem;
    font-weight: 600;
    color: #495057;
    font-size: 0.95rem;
}

input[type="text"], textarea {
    border-radius: 8px !important;
    padding: 0.8rem 1rem !important;
    border: 1px solid #ced4da !important;
    transition: var(--transition) !important;
}

input[type="text"]:focus, textarea:focus {
    border-color: var(--primary-color) !important;
    box-shadow: 0 0 0 3px rgba(106, 17, 203, 0.1) !important;
}

.slider { margin-top: 0.5rem !important; }

.btn-generate {
    background: linear-gradient(135deg, var(--primary-color) 0%, var(--secondary-color) 100%) !important;
    color: white !important;
    font-weight: 600 !important;
    padding: 1rem 1.8rem !important;
    border-radius: 8px !important;
    border: none !important;
    transition: var(--transition) !important;
    font-size: 1rem !important;
    margin-top: auto;
    width: 100%;
    box-shadow: 0 4px 6px rgba(0,0,0,0.1) !important;
}

.btn-generate:hover {
    transform: translateY(-3px);
    box-shadow: 0 6px 12px rgba(106, 17, 203, 0.25) !important;
}

.example-text {
    background: #e9ecef;
    padding: 0.8rem;
    border-radius: 8px;
    font-style: italic;
    margin-top: 0.5rem;
    font-size: 0.9rem;
    color: #495057;
}

.audio-player { width: 100%; margin-top: 1rem; border-radius: 8px; overflow: hidden; }

.model-info {
    background: #e6f7ff;
    padding: 1rem;
    border-radius: 8px;
    margin-top: 1.5rem;
    border-left: 4px solid #1890ff;
    font-size: 0.9rem;
}

.info-icon { color: #1890ff; margin-right: 8px; font-weight: bold; }

.footer {
    text-align: center;
    color: #6c757d;
    font-size: 0.9rem;
    padding: 1.5rem 0;
    border-top: 1px solid #e9ecef;
    margin-top: 2rem;
}

.accordion {
    background: #f8f9fa !important;
    border-radius: 8px !important;
    padding: 0.8rem !important;
    margin-top: 1rem;
    border: 1px solid #e9ecef !important;
}

.accordion-title { font-weight: 600 !important; color: var(--primary-color) !important; }

.audio-upload {
    border: 2px dashed #ced4da !important;
    border-radius: 8px !important;
    padding: 1.5rem !important;
    background: #f8f9fa !important;
    transition: var(--transition) !important;
}

.audio-upload:hover {
    border-color: var(--primary-color) !important;
    background: #f1f3f5 !important;
}

.audio-upload-label {
    font-weight: 500 !important;
    color: #495057 !important;
    margin-bottom: 0.5rem !important;
}

.radio-group { display: flex; flex-wrap: wrap; gap: 0.8rem; margin-top: 0.5rem; }

.radio-item {
    flex: 1;
    min-width: 100px;
    text-align: center;
    padding: 0.8rem;
    border: 1px solid #ced4da;
    border-radius: 8px;
    cursor: pointer;
    transition: var(--transition);
}

.radio-item.selected {
    border-color: var(--primary-color);
    background: rgba(106, 17, 203, 0.05);
    color: var(--primary-color);
    font-weight: 500;
}

.radio-item:hover { border-color: var(--primary-color); }

@media (max-width: 768px) {
    .header h1 { font-size: 2.2rem; }
    .header p { font-size: 1rem; }
    .gradio-container { padding: 0 0.5rem; }
    .tab-button { padding: 0.8rem 1rem !important; font-size: 0.9rem !important; }
}
"""
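# Note: the selectors above attach to components through elem_id / elem_classes
# below, e.g. gr.Image(..., elem_id="logo-container") picks up the #logo-container
# rules and gr.Column(elem_classes="input-section") picks up .input-section.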
# Build the interface
logo_path = "/mnt/by079416/fengping/CosyVoice2/logo2.png"

with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as demo:
    with gr.Column(elem_classes="header"):
        # Row container for the side-by-side header layout
        with gr.Row(elem_id="header-row", variant="compact"):
            # Left: logo
            gr.Image(
                value=logo_path,
                elem_id="logo-container",
                show_label=False,
                show_download_button=False,
                show_share_button=False  # hide the share button
            )
            # Right: title area
            with gr.Column(elem_id="title-area"):
                gr.Markdown("# 🎤 Marco-Voice 语音合成系统", elem_id="header-title")

    # Tabs
    with gr.Tabs(elem_classes="tabs") as tabs:
        # Tab 1: voice cloning
        with gr.TabItem("🎭 音色克隆", id=0):
            with gr.Row():
                with gr.Column(scale=2, elem_classes="input-section"):
                    gr.Markdown("### 输入设置")
                    tts_text_v1 = gr.Textbox(
                        lines=3,
                        placeholder="请输入要合成的文本内容...",
                        label="合成文本",
                        value="大家好,欢迎使用Marco-Voice语音合成系统,这是一个强大的语音生成工具。"
                    )
                    with gr.Row():
                        with gr.Column():
                            speed_v1 = gr.Slider(
                                minimum=0.5, maximum=2.0, value=1.0, step=0.1,
                                label="语速控制",
                                interactive=True
                            )
                        with gr.Column():
                            speaker_v1 = gr.Dropdown(
                                choices=names,
                                value="徐志胜",
                                label="预设音色",
                                info="选择脱口秀演员音色"
                            )
                    with gr.Accordion("高级设置", open=False, elem_classes="accordion"):
                        gr.Markdown("上传3-10秒清晰人声作为参考音频")
                        with gr.Row():
                            ref_audio_v1 = gr.Audio(
                                type="filepath",
                                label="上传参考音频",
                                elem_classes="audio-upload"
                            )
                            ref_text_v1 = gr.Textbox(
                                lines=2,
                                placeholder="参考音频对应的文本...",
                                label="参考文本"
                            )
                    gr.Markdown("""

ℹ️ **模型说明:** 此模型使用零样本音色克隆技术,只需3-10秒参考音频即可模仿目标音色。

""") with gr.Column(scale=1, elem_classes="output-section"): gr.Markdown("### 输出结果") tts_base_output = gr.Audio( type="filepath", label="生成语音", elem_id="tts_output_audio", interactive=False ) tts_base_button = gr.Button( "🚀 生成语音", variant="primary", elem_classes="btn-generate" ) gr.Examples( examples=[ ["大家好,欢迎使用Marco-Voice语音合成系统,这是一个强大的语音生成工具。", "徐志胜"], ["科技改变生活,创新引领未来。人工智能正在深刻改变我们的世界。", "李雪琴"], ["在这个充满机遇的时代,我们要勇于探索,敢于创新,不断突破自我。", "范志毅"] ], inputs=[tts_text_v1, speaker_v1], label="示例文本" ) # Tab 2: 多语种合成 with gr.TabItem("🌍 多语种合成", id=1): with gr.Row(): with gr.Column(scale=2, elem_classes="input-section"): gr.Markdown("### 输入设置") tts_text_sft = gr.Textbox( lines=3, placeholder="请输入要合成的文本内容...", label="合成文本", value="Hello, welcome to Marco-Voice text-to-speech system. This is a powerful multilingual TTS tool." ) speaker_sft = gr.Dropdown( choices=["中文男", "中文女", "英文男", "英文女", "韩语女", "日语男"], value="英文男", label="说话人", info="选择语言和性别" ) gr.Markdown("""

ℹ️ **模型说明:** 此模型支持多个语种,无需参考音频即可生成自然语音。

💡 **使用技巧:** 输入文本语言应与选择的说话人语言一致以获得最佳效果。

""") with gr.Column(scale=1, elem_classes="output-section"): gr.Markdown("### 输出结果") tts_sft_output = gr.Audio( type="numpy", label="生成语音", interactive=False ) tts_sft_button = gr.Button( "🚀 生成语音", variant="primary", elem_classes="btn-generate" ) gr.Examples( examples=[ ["Hello, welcome to Marco-Voice text-to-speech system.", "英文男"], ["こんにちは、Marco-Voiceテキスト読み上げシステムへようこそ。", "日语男"], ["안녕하세요, Marco-Voice 텍스트 음성 변환 시스템에 오신 것을 환영합니다.", "韩语女"] ], inputs=[tts_text_sft, speaker_sft], label="多语种示例" ) # Tab 3: 情感控制 with gr.TabItem("😄 情感控制", id=2): with gr.Row(): with gr.Column(scale=2, elem_classes="input-section"): gr.Markdown("### 输入设置") tts_text_v3 = gr.Textbox( lines=3, placeholder="请输入要合成的文本内容...", label="合成文本", value="这真是太令人兴奋了!我们刚刚完成了一个重大突破!" ) with gr.Row(): with gr.Column(): speed_v3 = gr.Slider( minimum=0.5, maximum=2.0, value=1.0, step=0.1, label="语速控制" ) with gr.Column(): emotion_v3 = gr.Radio( choices=["Angry", "Happy", "Surprise", "Sad"], value="Happy", label="情感选择" ) with gr.Row(): with gr.Column(): speaker_v3 = gr.Dropdown( choices=names, value="徐志胜", label="预设音色" ) with gr.Column(): gr.Markdown("### 或使用自定义音色") with gr.Accordion("上传参考音频", open=False, elem_classes="accordion"): gr.Markdown("上传3-10秒清晰人声作为参考音频") ref_audio_v3 = gr.Audio( type="filepath", label="上传参考音频", elem_classes="audio-upload" ) ref_text_v3 = gr.Textbox( lines=2, placeholder="参考音频对应的文本...", label="参考文本" ) gr.Markdown("""

ℹ️ **模型说明:** 此模型在音色克隆基础上增加了情感控制能力,可生成带有特定情感的语音。

💡 **使用技巧:** 情感表达效果与文本内容相关,请确保文本与所选情感匹配。

""") with gr.Column(scale=1, elem_classes="output-section"): gr.Markdown("### 输出结果") tts_v3_output = gr.Audio( type="filepath", label="生成语音", interactive=False ) tts_v3_button = gr.Button( "🚀 生成语音", variant="primary", elem_classes="btn-generate" ) gr.Examples( examples=[ ["这真是太令人兴奋了!我们刚刚完成了一个重大突破!", "Happy", "徐志胜"], ["我简直不敢相信!这怎么可能发生?", "Surprise", "李雪琴"], ["这太让人失望了,我们所有的努力都白费了。", "Sad", "范志毅"] ], inputs=[tts_text_v3, emotion_v3, speaker_v3], label="情感示例" ) # 页脚 gr.Markdown(""" """) # 绑定事件 # tts_text, speed, speaker, emotion, ref_audio, ref_text tts_base_button.click( fn=generate_speech_base, inputs=[tts_text_v1, speed_v1, speaker_v1, ref_audio_v1, ref_text_v1], outputs=tts_base_output ) tts_sft_button.click( fn=generate_speech_sft, inputs=[tts_text_sft, speaker_sft], outputs=tts_sft_output ) # tts_text, speed, speaker, key, ref_audio, ref_text tts_v3_button.click( fn=generate_speech_speakerminus, inputs=[tts_text_v3, speed_v3, speaker_v3, emotion_v3, ref_audio_v3, ref_text_v3], outputs=tts_v3_output ) if __name__ == "__main__": demo.launch( server_name="0.0.0.0", server_port=10163, share=True, favicon_path="/mnt/by079416/fengping/CosyVoice2/logo.png" )