"""Gradio front end for the one-click video dubbing/translation pipeline.

Builds a single ``gr.Interface`` whose inputs are forwarded, in order, to
``tools.do_everything.do_everything`` through :func:`run_with_emotion`, which
additionally passes along the emotion preset and strength chosen in the UI.
"""

import inspect
import os

import gradio as gr

from tools.do_everything import do_everything
from tools.utils import SUPPORT_VOICE


def _accepts_keywords(func, *names):
    """Return True if ``func`` can be called with every keyword in ``names``.

    A callable qualifies when it declares each name explicitly or takes
    ``**kwargs``. If the signature cannot be introspected, False is returned
    so that callers choose their conservative fallback.
    """
    try:
        params = inspect.signature(func).parameters
    except (TypeError, ValueError):
        return False
    if any(p.kind is inspect.Parameter.VAR_KEYWORD for p in params.values()):
        return True
    return all(name in params for name in names)


# --- Wrapper: forwards new emotion controls to your pipeline safely ---
def run_with_emotion(
    output_folder,
    video_url,
    num_videos,
    download_resolution,
    demucs_model,
    device,
    num_shifts,
    asr_backend,
    whisperx_size,
    batch_size,
    enable_diar,
    min_speakers,
    max_speakers,
    translation_method,
    subtitle_language,
    tts_method,
    tts_target_language,
    edgetts_voice,
    subtitles,
    playback_speed,
    fps,
    bgm_path,
    bgm_vol,
    video_vol,
    output_resolution,
    max_workers,
    max_retries,
    emotion,  # <--- NEW (UI dropdown)
    emotion_strength,  # <--- NEW (UI slider)
):
    """Run ``do_everything`` with the UI values plus the emotion controls.

    The first 27 parameters are passed positionally, unchanged and in the
    same order, to ``do_everything``.

    Args:
        emotion: Emotion preset selected in the UI dropdown.
        emotion_strength: Emotion intensity selected in the UI slider;
            converted with ``float()`` when passed as a keyword argument.

    Returns:
        Whatever ``do_everything`` returns.

    The pipeline is invoked exactly once. Whether it understands the emotion
    keywords is decided from its signature *before* the call, so a
    ``TypeError`` raised inside the pipeline propagates to the caller instead
    of being mistaken for "keywords unsupported" and triggering a second
    full run.
    """
    pipeline_args = (
        output_folder,
        video_url,
        num_videos,
        download_resolution,
        demucs_model,
        device,
        num_shifts,
        asr_backend,
        whisperx_size,
        batch_size,
        enable_diar,
        min_speakers,
        max_speakers,
        translation_method,
        subtitle_language,
        tts_method,
        tts_target_language,
        edgetts_voice,
        subtitles,
        playback_speed,
        fps,
        bgm_path,
        bgm_vol,
        video_vol,
        output_resolution,
        max_workers,
        max_retries,
    )

    if _accepts_keywords(do_everything, "emotion", "emotion_strength"):
        # Preferred kwarg path.
        return do_everything(
            *pipeline_args,
            emotion=emotion,
            emotion_strength=float(emotion_strength),
        )

    # Backward-compat: ENV bridge if do_everything doesn't yet accept these
    # kwargs. NOTE: os.environ is process-wide, so these values are shared by
    # every request handled by this process.
    os.environ["EMOTION_PRESET"] = str(emotion)
    os.environ["EMOTION_STRENGTH"] = str(emotion_strength)
    return do_everything(*pipeline_args)


my_theme = gr.themes.Soft(primary_hue="blue", secondary_hue="green")

# One-click pipeline.
# The order of `inputs` must match the parameter order of run_with_emotion,
# because Gradio passes component values positionally.
full_auto_interface = gr.Interface(
    theme=my_theme,
    title="Smart Multilingual Video Dubbing/Translation",
    fn=run_with_emotion,  # <--- use wrapper
    inputs=[
        gr.Textbox(label="Output folder", value="videos"),
        gr.Textbox(
            label="Video URL",
            placeholder="Enter a YouTube/Bilibili video, playlist, or channel URL",
            value="https://www.youtube.com/watch?v=VowXFWlAXIU",
        ),
        gr.Slider(minimum=1, maximum=100, step=1, label="Number of videos to download", value=5, visible=False),
        gr.Radio(
            ["4320p", "2160p", "1440p", "1080p", "720p", "480p", "360p", "240p", "144p"],
            label="Download resolution",
            value="1080p",
            visible=False,
        ),
        gr.Radio(
            ["htdemucs", "htdemucs_ft", "htdemucs_6s", "hdemucs_mmi", "mdx", "mdx_extra", "mdx_q", "mdx_extra_q", "SIG"],
            label="Demucs model",
            value="htdemucs_ft",
            visible=False,
        ),
        gr.Radio(["auto", "cuda", "cpu"], label="Device", value="auto", visible=False),
        gr.Slider(minimum=0, maximum=10, step=1, label="Number of shifts", value=5, visible=False),
        # ASR
        gr.Dropdown(["Higgs"], label="ASR backend", value="Higgs"),
        gr.Radio(["large", "medium", "small", "base", "tiny"], label="WhisperX size", value="large", visible=False),
        gr.Slider(minimum=1, maximum=128, step=1, label="Batch size", value=32, visible=False),
        gr.Checkbox(label="Enable speaker diarization", value=True, visible=False),
        gr.Radio([None, 1, 2, 3, 4, 5, 6, 7, 8, 9], label="Min speakers", value=None, visible=False),
        gr.Radio([None, 1, 2, 3, 4, 5, 6, 7, 8, 9], label="Max speakers", value=None, visible=False),
        # Translation
        gr.Dropdown(["LLM"], label="Translation method (LLM uses Boson/Qwen)", value="LLM"),
        gr.Dropdown(
            ["Simplified Chinese (简体中文)", "Traditional Chinese (繁体中文)", "English", "Korean", "Spanish"],
            label="Subtitle language",
            value="Simplified Chinese (简体中文)",
        ),
        # TTS
        gr.Dropdown(["Higgs", "xtts", "cosyvoice"], label="TTS method", value="Higgs"),
        gr.Dropdown(
            ["Chinese (中文)", "English", "Korean", "Spanish", "French"],
            label="TTS target language",
            value="Chinese (中文)",
        ),
        gr.Dropdown(SUPPORT_VOICE, value="zh-CN-XiaoxiaoNeural", label="EdgeTTS voice", visible=False),
        gr.Checkbox(label="Subtitles", value=True),
        gr.Slider(minimum=0.5, maximum=2, step=0.05, label="Playback speed", value=1.00, visible=False),
        gr.Slider(minimum=1, maximum=60, step=1, label="FPS", value=30, visible=False),
        gr.Audio(label="Background music", sources=["upload"], type="filepath", visible=False),
        gr.Slider(minimum=0, maximum=1, step=0.05, label="BGM volume", value=0.5, visible=False),
        gr.Slider(minimum=0, maximum=1, step=0.05, label="Video volume", value=1.0, visible=False),
        gr.Radio(
            ["4320p", "2160p", "1440p", "1080p", "720p", "480p", "360p", "240p", "144p"],
            label="Output resolution",
            value="1080p",
            visible=False,
        ),
        gr.Slider(minimum=1, maximum=100, step=1, label="Max workers", value=1, visible=False),
        gr.Slider(minimum=1, maximum=10, step=1, label="Max retries", value=3, visible=False),
        # --- NEW: Emotion controls (auto-tuned via Higgs-understanding in pipeline) ---
        gr.Dropdown(
            ["natural", "happy", "sad", "angry"],
            label="Emotion",
            value="natural",
            info="Auto-tuned after TTS via Higgs understanding. 'natural' skips shaping.",
        ),
        gr.Slider(
            minimum=0.0,
            maximum=1.0,
            step=0.05,
            value=0.6,
            label="Emotion strength",
            info="0=no change, 1=max intensity. Used by the auto-tuner.",
        ),
    ],
    outputs=[gr.Text(label="Status"), gr.Video(label="Sample output")],
    allow_flagging="never",
)

demo = full_auto_interface

# Newer Gradio releases replace queue(concurrency_count=...) with
# queue(default_concurrency_limit=...) and reject the old keyword; older
# releases only know the old one. Pick whichever the installed version
# declares so the app starts on both.
if "default_concurrency_limit" in inspect.signature(demo.queue).parameters:
    demo = demo.queue(default_concurrency_limit=1, max_size=8)
else:
    demo = demo.queue(concurrency_count=1, max_size=8)

if __name__ == "__main__":
    demo.launch()  # no host/port/share/inbrowser args