# app.py
"""Gradio front end for PlayDiffusion voice conversion.

On start-up this script:

1. downloads any model asset listed in ``MODEL_FILES`` that is not already
   present in the current working directory,
2. builds a ``PlayDiffusion`` instance on the GPU when CUDA is available and
   on the CPU otherwise, and
3. launches a Gradio UI where a source-speech clip and a target-voice clip
   are submitted for voice conversion.
"""
import os

import gradio as gr
import torch  # Used only to check for CUDA availability
import wget
from playdiffusion import PlayDiffusion, RVCInput

# --- Model Downloading ---
print("--- Checking and Downloading Model Assets ---")
MODEL_FILES = {
    "kmeans_10k.npy": "https://huggingface.co/PlayHT/PlayDiffusion/resolve/main/kmeans_10k.npy",
    "last_250k_fixed.pkl": "https://huggingface.co/PlayHT/PlayDiffusion/resolve/main/last_250k_fixed.pkl",
    "tokenizer-multi_bpe16384_merged_extended_58M.json": "https://huggingface.co/PlayHT/PlayDiffusion/resolve/main/tokenizer-multi_bpe16384_merged_extended_58M.json",
    "v090_g_01105000": "https://huggingface.co/PlayHT/PlayDiffusion/resolve/main/v090_g_01105000",
    "voice_encoder_1992000.pt": "https://huggingface.co/PlayHT/PlayDiffusion/resolve/main/voice_encoder_1992000.pt",
    "xlsr2_1b_v2_custom.pt": "https://huggingface.co/PlayHT/PlayDiffusion/resolve/main/xlsr2_1b_v2_custom.pt",
}

# Files are stored under their bare names (relative to the working directory);
# anything already on disk is left untouched.
for filename, url in MODEL_FILES.items():
    if not os.path.exists(filename):
        print(f"Downloading {filename}...")
        wget.download(url, filename)
    else:
        print(f"{filename} already exists. Skipping download.")

# --- Gradio App ---

# Use the GPU when CUDA is available; otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"--- Device selected: {device.upper()} ---")

print("Initializing PlayDiffusion... This will load the models into memory.")
# Pass the selected device to the PlayDiffusion constructor
inpainter = PlayDiffusion(device=device)
print("PlayDiffusion initialized successfully.")
# ----------------------------------------------------


def speech_rvc(rvc_source_speech, rvc_target_voice):
    """Convert the source speech so that it is spoken in the target voice.

    Args:
        rvc_source_speech: Value of the "Source Speech" audio component. The
            component is declared with ``type="filepath"``; ``None`` when the
            user supplied no audio.
        rvc_target_voice: Value of the "Target Voice" audio component, same
            conventions as ``rvc_source_speech``.

    Returns:
        Whatever ``inpainter.rvc`` returns; it is handed unchanged to the
        "Result" audio component.

    Raises:
        gr.Error: If either input is ``None``.
    """
    if rvc_source_speech is None or rvc_target_voice is None:
        raise gr.Error("Please provide both a source speech audio and a target voice audio.")

    print("Starting voice conversion...")
    converted_audio = inpainter.rvc(
        RVCInput(source_speech=rvc_source_speech, target_voice=rvc_target_voice)
    )
    print("Voice conversion finished.")
    return converted_audio


with gr.Blocks(theme=gr.themes.Soft(), title="PlayDiffusion Voice Conversion") as demo:
    gr.Markdown("# 🗣️ PlayDiffusion Voice Conversion")
    gr.Markdown(f"### Running on: **{device.upper()}**")
    gr.Markdown("Upload a **Source Speech** audio and a **Target Voice** audio to convert the speech.")

    if device == 'cpu':
        # gr.Warning only produces a pop-up when called from inside an event
        # handler; during layout construction nothing reaches the page. Render
        # the notice as part of the layout so users actually see it.
        gr.Markdown(
            "⚠️ **Warning:** Running on CPU. The voice conversion process "
            "will be extremely slow and may time out."
        )

    with gr.Row():
        rvc_source_speech = gr.Audio(label="Source Speech", sources=["upload", "microphone"], type="filepath")
        rvc_target_voice = gr.Audio(label="Target Voice", sources=["upload", "microphone"], type="filepath")

    rvc_submit = gr.Button("🚀 Run Voice Conversion", variant="primary")

    gr.Markdown("### Converted Speech Output")
    rvc_output = gr.Audio(label="Result", interactive=False)

    rvc_submit.click(fn=speech_rvc, inputs=[rvc_source_speech, rvc_target_voice], outputs=[rvc_output])

demo.launch()