import gradio as gr
import sys
import os
import threading
import time
import tempfile
import subprocess
import uuid
import random

import torch
import numpy as np
import soundfile as sf
from pydub import AudioSegment
from huggingface_hub import snapshot_download, hf_hub_download

from cosyvoice.utils.file_utils import load_wav
from cosyvoice_rodis.cli.cosyvoice import CosyVoice as CosyVoiceTTS_speakerminus

# import imageio_ffmpeg
# ffmpeg_path = imageio_ffmpeg.get_ffmpeg_exe()
# print(f"FFmpeg path: {ffmpeg_path}")
# user_bin = os.path.expanduser("~/bin")
# if not os.path.exists(user_bin):
#     os.makedirs(user_bin)
# ffmpeg_link = os.path.join(user_bin, "ffmpeg")
# if os.path.exists(ffmpeg_link):
#     os.remove(ffmpeg_link)
# os.symlink(ffmpeg_path, ffmpeg_link)
# print(f"create symbolic link: {ffmpeg_link}")
# os.environ["PATH"] = f"{user_bin}:{os.environ.get('PATH', '')}"

# Make the vendored Matcha-TTS package importable in this process and in any
# subprocesses (os.system('export ...') would only affect a throwaway shell).
sys.path.append('third_party/Matcha-TTS')
os.environ["PYTHONPATH"] = 'third_party/Matcha-TTS'

# Download assets and logos in background to avoid blocking startup
assets_dir = None
logo_path = None
logo_path2 = None


def load_assets():
    """Load assets lazily"""
    global assets_dir, logo_path, logo_path2
    if assets_dir is None:
        try:
            print("Downloading assets and logos...")
            assets_dir = snapshot_download(
                repo_id="tienfeng/prompt",
                repo_type="dataset",
            )
            logo_path = hf_hub_download(
                repo_id="tienfeng/prompt",
                filename="logo2.png",
                repo_type="dataset",
            )
            logo_path2 = hf_hub_download(
                repo_id="tienfeng/prompt",
                filename="logo.png",
                repo_type="dataset",
            )
            print("Assets downloaded successfully")
        except Exception as e:
            print(f"Warning: Failed to download assets/logos: {e}")
            assets_dir = None
            logo_path = None
            logo_path2 = None


# Start downloading assets in background (non-blocking)
assets_download_thread = threading.Thread(target=load_assets, daemon=True)
assets_download_thread.start()

# Wait for the assets to download (with timeout) before creating the UI,
# so the logo is available when the interface is built.
max_wait_time = 30   # maximum wait time in seconds
wait_interval = 0.5  # check every 0.5 seconds
elapsed = 0
while logo_path is None and elapsed < max_wait_time:
    time.sleep(wait_interval)
    elapsed += wait_interval

if logo_path is None:
    print("Warning: Logo download timed out, UI will be created without logo")

# Delay model download to avoid blocking startup
model_repo_id = "AIDC-AI/Marco-Voice"
local_model = None
local_model_path = None
local_model_path_enhenced = None


def load_models():
    """Load models lazily when needed"""
    global local_model, local_model_path, local_model_path_enhenced
    if local_model is None:
        print("Downloading models...")
        local_model = snapshot_download(
            repo_id=model_repo_id,
            repo_type="model"
            # token=os.getenv("HF_TOKEN")
        )
        local_model_path = os.path.join(local_model, "marco_voice")
        local_model_path_enhenced = os.path.join(local_model, "marco_voice_enhenced")
        print("Models downloaded successfully")


# Delay model loading to avoid blocking startup.
# Models will be loaded lazily when first used.
tts_speakerminus = None
tts_sft = None

# Reference transcripts for each preset speaker (must match the prompt audio below)
text_prompt = {
    "翟佳宁": "这个节目就是把四个男嘉宾,四个女嘉宾放一个大别墅里让他们朝夕相处一整个月,月末选择心动的彼此。",
    "范志毅": "没这个能力知道吗,我已经说了,你像这样的比赛本身就没有打好基础。",
    "呼兰": "发完之后那个工作人员说,老师,呼兰老师你还要再加个标签儿,我说加什么标签儿,他说你就加一个呼兰太好笑了。",
    "江梓浩": "就是很多我们这帮演员一整年也就上这么一个脱口秀类型的节目。",
    "李雪琴": "我就劝他,我说你呀,你没事儿也放松放松,你那身体都亮红灯儿了你还不注意。",
    "刘旸": "比如这三年我在街上开车,会在开车的时候进行一些哲思,我有天开车的时候路过一个地方。",
    "唐香玉": "大家好我叫唐香玉, 我年前把我的工作辞了,成了一个全职脱口秀演员。",
    "小鹿": "然后我就老家的亲戚太多了,我也记不清谁该叫谁,所以我妈带着我和我。",
    "于祥宇": "我大学专业学的是哲学,然后节目组就说那这期主题你可以刚好聊一下哲学专业毕业之后的就业方向。",
    "赵晓卉": "终于没有人问我为什么不辞职了,结果谈到现在,谈恋爱第一天人家问我,能打个电话吗?我说你有啥事儿。",
    "徐志胜": "最舒服的一个方式,这个舞台也不一定就是说是来第一年就好嘛,只要你坚持,肯定会有发光发热的那天嘛。"
}

# Prompt audio file stem (inside the downloaded assets dataset) for each preset speaker
audio_prompt = {
    "翟佳宁": "zhaijianing",
    "范志毅": "fanzhiyi",
    "呼兰": "hulan",
    "江梓浩": "jiangzhihao",
    "李雪琴": "lixueqin",
    "刘旸": "liuchang",
    "唐香玉": "tangxiangyu",
    "小鹿": "xiaolu",
    "于祥宇": "yuxiangyu",
    "赵晓卉": "zhaoxiaohui",
    "徐志胜": "xuzhisheng"
}

# audio_prompt_path = assets_dir


def load_audio_and_convert_to_16bit(file_path, target_sample_rate=16000):
    audio = AudioSegment.from_file(file_path)
    if audio.channels > 1:
        audio = audio.set_channels(1)
    if audio.frame_rate != target_sample_rate:
        audio = audio.set_frame_rate(target_sample_rate)
    audio_data = np.array(audio.get_array_of_samples(), dtype=np.float32)
    audio_data = audio_data / np.max(np.abs(audio_data))
    audio_data = (audio_data * 32767).astype(np.int16)
    return torch.tensor(audio_data), target_sample_rate


def convert_audio_with_sox(input_file, output_file, target_sample_rate=16000):
    try:
        # command = [
        #     './ffmpeg-7.0.2-amd64-static/ffmpeg', input_file,
        #     '-r', str(target_sample_rate),
        #     '-b', '16',
        #     '-c', '1',
        #     output_file
        # ]
        command = [
            './ffmpeg-7.0.2-amd64-static/ffmpeg',
            '-i', input_file,
            '-ar', str(target_sample_rate),
            '-ac', '1',
            '-b:a', '16k',
            '-f', 'wav',
            output_file
        ]
        subprocess.run(command, check=True)
        print(f"Audio converted successfully: {output_file}")
    except subprocess.CalledProcessError as e:
        print(f"Error during conversion: {e}")
os.makedirs("./tmp", exist_ok=True)


def generate_speech_speakerminus(tts_text, speed, speaker, key, ref_audio, ref_text):
    # import pdb;pdb.set_trace()
    global tts_speakerminus_global, local_model_path

    # Ensure models are downloaded (this may take time on first use)
    if local_model_path is None:
        print("Downloading models (this may take a few minutes on first use)...")
        load_models()
    if 'tts_speakerminus_global' not in globals() or tts_speakerminus_global is None:
        print("Loading CosyVoice (speakerminus) model...")
        tts_speakerminus_global = CosyVoiceTTS_speakerminus(model_dir=local_model_path)

    if not ref_audio or not ref_text:
        # No custom reference provided: fall back to the preset speaker prompts.
        # Ensure assets are loaded first.
        if assets_dir is None:
            load_assets()
        audio_prompt_path = assets_dir
        if audio_prompt_path is None or assets_dir is None:
            raise ValueError("Audio prompt path is not available. Please wait a moment and try again, or provide reference audio and text.")
        ref_text = text_prompt.get(speaker, "")
        speaker_audio_name = audio_prompt.get(speaker)
        if speaker_audio_name:
            ref_audio = os.path.join(audio_prompt_path, f"{speaker_audio_name}.wav")
        else:
            raise ValueError(f"Speaker '{speaker}' not found in audio_prompt dictionary")
    else:
        # Validate the user-supplied reference audio.
        try:
            info = sf.info(ref_audio)
            sample_rate = info.samplerate
            channels = info.channels
            if sample_rate != 16000:
                raise ValueError(f"Invalid audio sample rate. Expected: 16000 Hz, got: {sample_rate} Hz. Please use a 16kHz audio file.")
            if channels != 1:
                raise ValueError(f"Invalid audio channel count. Expected: 1 (mono), got: {channels}. Please use a mono audio file.")
            file_ext = os.path.splitext(ref_audio)[1].lower()
            if file_ext != '.wav':
                raise ValueError(f"Invalid audio format. Expected: WAV format, got: {file_ext}. Please use a WAV format audio file.")
        except Exception as e:
            if isinstance(e, ValueError):
                raise e
            else:
                raise ValueError(f"Failed to read audio file: {str(e)}. Please ensure the audio file is in the correct format (16kHz mono WAV format).")

    if not ref_audio:
        raise ValueError("Reference audio is required but not provided")

    ref_audio = load_wav(ref_audio, 16000)

    # Map UI emotion labels to the Chinese keys expected by the model.
    emo = {"Sad": "伤心", "Fearful": "恐惧", "Happy": "快乐", "Surprise": "惊喜", "Angry": "生气", "Jolliest": "戏谑"}
    # key="快乐"

    emotion_file = "./emotion_info.pt"
    if not os.path.exists(emotion_file):
        raise FileNotFoundError(f"Emotion info file not found: {emotion_file}. Please ensure this file exists in the workspace.")
    emotion_data = torch.load(emotion_file)

    # Pick a reference emotion embedding depending on the requested emotion.
    if key in ["Angry", "Surprise", "Happy"]:
        emotion_info = emotion_data["male005"][key]
    elif key in ["Sad"]:
        emotion_info = emotion_data["female005"][key]
    elif key in ["Fearful"]:
        emotion_info = emotion_data["female003"][key]
    else:
        emotion_info = emotion_data["male005"][key]

    sample_rate, full_audio = tts_speakerminus_global.synthesize(
        tts_text,
        prompt_text=ref_text,
        # speaker=speaker,
        prompt_speech_16k=ref_audio,
        key=emo.get(key),
        emotion_embedding=emotion_info,
        # ref_audio=ref_audio,
        # speed=speed
    )
    print("sample_rate:", sample_rate, "full_audio:", full_audio.min(), full_audio.max())

    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio_file:
        output_audio_path = temp_audio_file.name
    audio_segment = AudioSegment(
        full_audio.tobytes(),
        frame_rate=sample_rate,
        sample_width=full_audio.dtype.itemsize,
        channels=1
    )
    audio_segment.export(output_audio_path, format="wav")
    print(f"Audio saved to {output_audio_path}")

    return output_audio_path
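
# Optional helper (not wired into the UI): the reference-audio validators in
# generate_speech_speakerminus / generate_speech_sft reject anything that is
# not a 16 kHz mono WAV. A sketch like this, built on the already-imported
# pydub and tempfile, could convert an arbitrary upload into that format
# first. The function name and its use are assumptions, not part of the
# original app.
def resample_to_16k_mono_wav(input_path, target_sample_rate=16000):
    """Convert any audio file pydub can read into a 16 kHz mono 16-bit WAV."""
    audio = AudioSegment.from_file(input_path)
    audio = audio.set_channels(1).set_frame_rate(target_sample_rate).set_sample_width(2)
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
        output_path = tmp.name
    audio.export(output_path, format="wav")
    return output_path
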
def generate_speech_sft(tts_text, speed, speaker, key, ref_audio, ref_text):
    global tts_sft_global, local_model_path_enhenced

    # Ensure models are downloaded (this may take time on first use)
    if local_model_path_enhenced is None:
        print("Downloading models (this may take a few minutes on first use)...")
        load_models()
    if 'tts_sft_global' not in globals() or tts_sft_global is None:
        print("Loading CosyVoice (SFT enhanced) model...")
        tts_sft_global = CosyVoiceTTS_speakerminus(model_dir=local_model_path_enhenced)

    if not ref_audio and not ref_text:
        # No custom reference provided: fall back to the preset speaker prompts.
        # Ensure assets are loaded first.
        if assets_dir is None:
            load_assets()
        audio_prompt_path = assets_dir
        if audio_prompt_path is None or assets_dir is None:
            raise ValueError("Audio prompt path is not available. Please wait a moment and try again, or provide reference audio and text.")
        ref_text = text_prompt.get(speaker, "")
        speaker_audio_name = audio_prompt.get(speaker)
        if speaker_audio_name:
            ref_audio = os.path.join(audio_prompt_path, f"{speaker_audio_name}.wav")
        else:
            raise ValueError(f"Speaker '{speaker}' not found in audio_prompt dictionary")
    else:
        # Validate the user-supplied reference audio.
        try:
            info = sf.info(ref_audio)
            sample_rate = info.samplerate
            channels = info.channels
            if sample_rate != 16000:
                raise ValueError(f"Invalid audio sample rate. Expected: 16000 Hz, got: {sample_rate} Hz. Please use a 16kHz audio file.")
            if channels != 1:
                raise ValueError(f"Invalid audio channel count. Expected: 1 (mono), got: {channels}. Please use a mono audio file.")
            file_ext = os.path.splitext(ref_audio)[1].lower()
            if file_ext != '.wav':
                raise ValueError(f"Invalid audio format. Expected: WAV format, got: {file_ext}. Please use a WAV format audio file.")
        except Exception as e:
            if isinstance(e, ValueError):
                raise e
            else:
                raise ValueError(f"Failed to read audio file: {str(e)}. Please ensure the audio file is in the correct format (16kHz mono WAV format).")

    if not ref_audio:
        raise ValueError("Reference audio is required but not provided")

    ref_audio = load_wav(ref_audio, 16000)

    # Map UI emotion labels to the Chinese keys expected by the model.
    emo = {"Sad": "伤心", "Fearful": "恐惧", "Happy": "快乐", "Surprise": "惊喜", "Angry": "生气", "Jolliest": "戏谑"}
    # key="快乐"

    emotion_file = "./emotion_info.pt"
    if not os.path.exists(emotion_file):
        raise FileNotFoundError(f"Emotion info file not found: {emotion_file}. Please ensure this file exists in the workspace.")
    emotion_data = torch.load(emotion_file)

    # Pick a reference emotion embedding depending on the requested emotion.
    if key in ["Angry", "Surprise", "Happy"]:
        emotion_info = emotion_data["male005"][key]
    elif key in ["Sad"]:
        emotion_info = emotion_data["female005"][key]
    elif key in ["Fearful"]:
        emotion_info = emotion_data["female003"][key]
    else:
        emotion_info = emotion_data["male005"][key]

    sample_rate, full_audio = tts_sft_global.synthesize(
        tts_text,
        prompt_text=ref_text,
        # speaker=speaker,
        prompt_speech_16k=ref_audio,
        key=emo.get(key),
        emotion_embedding=emotion_info,
        # ref_audio=ref_audio,
        # speed=speed
    )
    print("sample_rate:", sample_rate, "full_audio:", full_audio.min(), full_audio.max())

    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio_file:
        output_audio_path = temp_audio_file.name
    audio_segment = AudioSegment(
        full_audio.tobytes(),
        frame_rate=sample_rate,
        sample_width=full_audio.dtype.itemsize,
        channels=1
    )
    audio_segment.export(output_audio_path, format="wav")
    print(f"Audio saved to {output_audio_path}")

    return output_audio_path
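
# Illustrative refactoring sketch (not called by the app): both synthesis
# functions above choose a reference emotion embedding with the same if/elif
# chain, which could be centralized like this. The function name is an
# assumption, not part of the original code.
def select_emotion_embedding(emotion_data, key):
    """Return the reference embedding used for a given English emotion key."""
    if key in ("Angry", "Surprise", "Happy"):
        return emotion_data["male005"][key]
    if key == "Sad":
        return emotion_data["female005"][key]
    if key == "Fearful":
        return emotion_data["female003"][key]
    return emotion_data["male005"][key]
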
names = [
    "于祥宇", "刘旸", "呼兰", "唐香玉", "小鹿", "李雪琴",
    "江梓浩", "翟佳宁", "范志毅", "赵晓卉", "徐志胜"
]

custom_css = """
:root { --primary-color: #6a11cb; --secondary-color: #2575fc; --accent-color: #ff6b6b; --light-bg: #f8f9fa; --dark-bg: #212529; --card-bg: #ffffff; --text-color: #343a40; --border-radius: 12px; --box-shadow: 0 6px 16px rgba(0,0,0,0.1); --transition: all 0.3s ease; }
body { background: linear-gradient(135deg, var(--light-bg) 0%, #e9ecef 100%); min-height: 100vh; font-family: 'Segoe UI', 'PingFang SC', 'Microsoft YaHei', sans-serif; color: var(--text-color); line-height: 1.6; }
.gradio-container { max-width: 1200px !important; margin: 2rem auto !important; padding: 0 1rem; }

/* Main header with blue background */
#main-header { background: linear-gradient(135deg, #1e3a8a 0%, #3b82f6 50%, #60a5fa 100%) !important; padding: 60px 40px !important; border-radius: 0 !important; margin-bottom: 2rem !important; text-align: center; }
#header-content { max-width: 1200px; margin: 0 auto; }

/* Main title */
#main-title { color: white !important; font-size: 48px !important; font-weight: 700 !important; margin: 0 0 10px 0 !important; text-align: center; letter-spacing: -1px; }
#main-title h1 { color: white !important; font-size: 48px !important; font-weight: 700 !important; margin: 0 !important; text-align: center; letter-spacing: -1px; }

/* Subtitle */
#subtitle { color: rgba(255, 255, 255, 0.9) !important; font-size: 18px !important; font-weight: 400 !important; margin: 0 0 40px 0 !important; text-align: center; }
#subtitle p { color: rgba(255, 255, 255, 0.9) !important; font-size: 18px !important; margin: 0 !important; }

/* Button row */
#button-row { display: flex; justify-content: center; gap: 12px; flex-wrap: wrap; margin-top: 30px; }
.header-btn { padding: 12px 24px !important; border-radius: 6px !important; font-weight: 600 !important; font-size: 14px !important; border: none !important; cursor: pointer; transition: all 0.3s ease !important; min-width: 120px; }
.header-btn-secondary { background: rgba(255, 255, 255, 0.15) !important; color: white !important; backdrop-filter: blur(10px); }
.header-btn-secondary:hover { background: rgba(255, 255, 255, 0.25) !important; transform: translateY(-2px); box-shadow: 0 4px 12px rgba(0, 0, 0, 0.2) !important; }
.header-btn-primary { background: #fbbf24 !important; color: #1e3a8a !important; }
.header-btn-primary:hover { background: #f59e0b !important; transform: translateY(-2px); box-shadow: 0 4px 12px rgba(251, 191, 36, 0.4) !important; }

.tabs { background: transparent !important; border: none !important; box-shadow: none !important; }
.tab-nav { background: var(--card-bg) !important; border-radius: var(--border-radius) !important; padding: 0.5rem !important; margin-bottom: 1.5rem !important; box-shadow: var(--box-shadow) !important; }
.tab-button { padding: 1rem 1.5rem !important; border-radius: 8px !important; font-weight: 600 !important; transition: var(--transition) !important; border: none !important; }
.tab-button.selected { background: linear-gradient(135deg, var(--primary-color) 0%, var(--secondary-color) 100%) !important; color: white !important; box-shadow: 0 4px 12px rgba(106, 17, 203, 0.3) !important; }
.tab-content { background: var(--card-bg) !important; border-radius: var(--border-radius) !important; padding: 2rem !important; box-shadow: var(--box-shadow) !important; margin-bottom: 2rem; border: none !important; }

.input-section { background: #f9fafb; padding: 1.5rem; border-radius: var(--border-radius); margin-bottom: 1.5rem; border: 1px solid #e9ecef; }
.output-section { background: #edf2f7; padding: 1.5rem; border-radius: var(--border-radius); border: 1px solid #e9ecef; display: flex; flex-direction: column; height: 100%; }
.control-group { margin-bottom: 1.2rem; }
.control-group label { display: block; margin-bottom: 0.5rem; font-weight: 600; color: #495057; font-size: 0.95rem; }

input[type="text"], textarea { border-radius: 8px !important; padding: 0.8rem 1rem !important; border: 1px solid #ced4da !important; transition: var(--transition) !important; }
input[type="text"]:focus, textarea:focus { border-color: var(--primary-color) !important; box-shadow: 0 0 0 3px rgba(106, 17, 203, 0.1) !important; }
.slider { margin-top: 0.5rem !important; }

.btn-generate { background: linear-gradient(135deg, var(--primary-color) 0%, var(--secondary-color) 100%) !important; color: white !important; font-weight: 600 !important; padding: 1rem 1.8rem !important; border-radius: 8px !important; border: none !important; transition: var(--transition) !important; font-size: 1rem !important; margin-top: auto; width: 100%; box-shadow: 0 4px 6px rgba(0,0,0,0.1) !important; }
.btn-generate:hover { transform: translateY(-3px); box-shadow: 0 6px 12px rgba(106, 17, 203, 0.25) !important; }

.example-text { background: #e9ecef; padding: 0.8rem; border-radius: 8px; font-style: italic; margin-top: 0.5rem; font-size: 0.9rem; color: #495057; }
.audio-player { width: 100%; margin-top: 1rem; border-radius: 8px; overflow: hidden; }
.model-info { background: #e6f7ff; padding: 1rem; border-radius: 8px; margin-top: 1.5rem; border-left: 4px solid #1890ff; font-size: 0.9rem; }
.info-icon { color: #1890ff; margin-right: 8px; font-weight: bold; }
.footer { text-align: center; color: #6c757d; font-size: 0.9rem; padding: 1.5rem 0; border-top: 1px solid #e9ecef; margin-top: 2rem; }

.accordion { background: #f8f9fa !important; border-radius: 8px !important; padding: 0.8rem !important; margin-top: 1rem; border: 1px solid #e9ecef !important; }
.accordion-title { font-weight: 600 !important; color: var(--primary-color) !important; }
.audio-upload { border: 2px dashed #ced4da !important; border-radius: 8px !important; padding: 1.5rem !important; background: #f8f9fa !important; transition: var(--transition) !important; }
.audio-upload:hover { border-color: var(--primary-color) !important; background: #f1f3f5 !important; }
.audio-upload-label { font-weight: 500 !important; color: #495057 !important; margin-bottom: 0.5rem !important; }

.radio-group { display: flex; flex-wrap: wrap; gap: 0.8rem; margin-top: 0.5rem; }
.radio-item { flex: 1; min-width: 100px; text-align: center; padding: 0.8rem; border: 1px solid #ced4da; border-radius: 8px; cursor: pointer; transition: var(--transition); }
.radio-item.selected { border-color: var(--primary-color); background: rgba(106, 17, 203, 0.05); color: var(--primary-color); font-weight: 500; }
.radio-item:hover { border-color: var(--primary-color); }

@media (max-width: 768px) {
    #main-header { padding: 40px 20px !important; }
    #main-title h1 { font-size: 32px !important; }
    #subtitle p { font-size: 16px !important; }
    #button-row { gap: 8px; }
    .header-btn { padding: 10px 16px !important; font-size: 12px !important; min-width: 100px; }
    .gradio-container { padding: 0 0.5rem; }
    .tab-button { padding: 0.8rem 1rem !important; font-size: 0.9rem !important; }
}
"""

with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as demo:
    # Header section with blue background
    gr.HTML(
        """

        <div id="main-header">
            <div id="header-content">
                <div id="main-title"><h1>Marco-Voice-TTS</h1></div>
                <div id="subtitle"><p>Alibaba International Digital Commerce</p></div>
                <div id="button-row">
                    <span class="header-btn header-btn-secondary">GitHub</span>
                    <span class="header-btn header-btn-secondary">Hugging Face Model</span>
                    <span class="header-btn header-btn-primary">Demo HF Space</span>
                </div>
            </div>
        </div>
        """
    )

    with gr.Tabs(elem_classes="tabs") as tabs:
        with gr.TabItem("😄 Emotion control", id=0):
            with gr.Row():
                with gr.Column(scale=2, elem_classes="input-section"):
                    gr.Markdown("### Input Settings")
                    tts_text_v1 = gr.Textbox(
                        lines=3,
                        placeholder="Enter the text you want to synthesize...",
                        label="Text to synthesize",
                        value="这真是太令人兴奋了!我们刚刚完成了一个重大突破!"
                    )
                    with gr.Row():
                        with gr.Column():
                            speed_v1 = gr.Slider(
                                minimum=0.5,
                                maximum=2.0,
                                value=1.0,
                                step=0.1,
                                label="Speaking rate"
                            )
                        with gr.Column():
                            emotion_v1 = gr.Radio(
                                choices=["Angry", "Happy", "Surprise", "Sad", "Fearful", "Jolliest"],
                                value="Happy",
                                label="Emotion"
                            )
                    with gr.Row():
                        with gr.Column():
                            speaker_v1 = gr.Dropdown(
                                choices=names,
                                value="徐志胜",
                                label="Preset timbre"
                            )
                        with gr.Column():
                            gr.Markdown("### Or use a custom timbre")
                            with gr.Accordion("Upload reference audio", open=False, elem_classes="accordion"):
                                gr.Markdown("Upload 3-10 seconds of clear speech as reference audio")
                                ref_audio_v1 = gr.Audio(
                                    type="filepath",
                                    label="Reference audio",
                                    elem_classes="audio-upload"
                                )
                                ref_text_v1 = gr.Textbox(
                                    lines=2,
                                    placeholder="Transcript of the reference audio...",
                                    label="Reference text"
                                )

                    gr.Markdown(
                        """
                        ℹ️ **Model notes**: This model adds emotion control on top of timbre cloning and can generate speech with a specified emotion.

                        💡 **Tip**: Emotional expressiveness depends on the text itself, so make sure the text matches the selected emotion.
                        """
                    )

                with gr.Column(scale=1, elem_classes="output-section"):
                    gr.Markdown("### Output")
                    tts_v1_output = gr.Audio(
                        type="filepath",
                        label="Generated speech",
                        interactive=False
                    )
                    tts_v1_button = gr.Button(
                        "🚀 Generate speech",
                        variant="primary",
                        elem_classes="btn-generate"
                    )
            gr.Examples(
                examples=[
                    ["这真是太令人兴奋了!我们刚刚完成了一个重大突破!", "Happy", "徐志胜"],
                    ["我简直不敢相信!这怎么可能发生?", "Surprise", "李雪琴"],
                    ["这太让人失望了,我们所有的努力都白费了。", "Sad", "范志毅"]
                ],
                inputs=[tts_text_v1, emotion_v1, speaker_v1],
                label="Emotion examples"
            )

        with gr.TabItem("😄 Emotion control (enhanced)", id=1):
            with gr.Row():
                with gr.Column(scale=2, elem_classes="input-section"):
                    gr.Markdown("### Input Settings")
                    tts_text_v2 = gr.Textbox(
                        lines=3,
                        placeholder="Enter the text you want to synthesize...",
                        label="Text to synthesize",
                        value="这真是太令人兴奋了!我们刚刚完成了一个重大突破!"
                    )
                    with gr.Row():
                        with gr.Column():
                            speed_v2 = gr.Slider(
                                minimum=0.5,
                                maximum=2.0,
                                value=1.0,
                                step=0.1,
                                label="Speaking rate"
                            )
                        with gr.Column():
                            emotion_v2 = gr.Radio(
                                choices=["Angry", "Happy", "Surprise", "Sad", "Fearful", "Jolliest"],
                                value="Happy",
                                label="Emotion"
                            )
                    with gr.Row():
                        with gr.Column():
                            speaker_v2 = gr.Dropdown(
                                choices=names,
                                value="徐志胜",
                                label="Preset timbre"
                            )
                        with gr.Column():
                            gr.Markdown("### Or use a custom timbre")
                            with gr.Accordion("Upload reference audio", open=False, elem_classes="accordion"):
                                gr.Markdown("Upload 3-10 seconds of clear speech as reference audio")
                                ref_audio_v2 = gr.Audio(
                                    type="filepath",
                                    label="Reference audio",
                                    elem_classes="audio-upload"
                                )
                                ref_text_v2 = gr.Textbox(
                                    lines=2,
                                    placeholder="Transcript of the reference audio...",
                                    label="Reference text"
                                )

                    gr.Markdown(
                        """
                        ℹ️ **Model notes**: This enhanced model adds emotion control on top of timbre cloning and can generate speech with a specified emotion.

                        💡 **Tip**: Emotional expressiveness depends on the text itself, so make sure the text matches the selected emotion.
                        """
                    )

                with gr.Column(scale=1, elem_classes="output-section"):
                    gr.Markdown("### Output")
                    tts_v2_output = gr.Audio(
                        type="filepath",
                        label="Generated speech",
                        interactive=False
                    )
                    tts_v2_button = gr.Button(
                        "🚀 Generate speech",
                        variant="primary",
                        elem_classes="btn-generate"
                    )
            gr.Examples(
                examples=[
                    ["这真是太令人兴奋了!我们刚刚完成了一个重大突破!", "Happy", "徐志胜"],
                    ["我简直不敢相信!这怎么可能发生?", "Surprise", "李雪琴"],
                    ["这太让人失望了,我们所有的努力都白费了。", "Sad", "范志毅"]
                ],
                inputs=[tts_text_v2, emotion_v2, speaker_v2],
                label="Emotion examples"
            )

    gr.Markdown(""" """)

    # Wire the generate buttons to the two synthesis backends.
    # Inputs: tts_text, speed, speaker, key (emotion), ref_audio, ref_text
    tts_v1_button.click(
        fn=generate_speech_speakerminus,
        inputs=[tts_text_v1, speed_v1, speaker_v1, emotion_v1, ref_audio_v1, ref_text_v1],
        outputs=tts_v1_output
    )
    tts_v2_button.click(
        fn=generate_speech_sft,
        inputs=[tts_text_v2, speed_v2, speaker_v2, emotion_v2, ref_audio_v2, ref_text_v2],
        outputs=tts_v2_output
    )

# Don't preload models - let them download on first use to avoid a startup timeout.
# Models are downloaded and loaded lazily when first requested by the user.
if __name__ == "__main__":
    # Use environment variable for port (Hugging Face Spaces uses 7860 by default)
    server_port = int(os.environ.get("SERVER_PORT", 7860))
    launch_kwargs = {
        "server_name": "0.0.0.0",
        "server_port": server_port,
        "share": False,
    }
    # Only add a favicon if a logo was successfully downloaded
    if logo_path is not None or logo_path2 is not None:
        launch_kwargs["favicon_path"] = logo_path or logo_path2
    demo.launch(**launch_kwargs)