Spaces:

Mohaddz
/

speach

Sleeping

App Files Files Community

Mohaddz committed on Mar 26, 2025

Commit

580bcb7

verified ·

1 Parent(s): e360346

Update app.py

Browse files

Files changed (1) hide show

app.py +198 -105

app.py CHANGED Viewed

@@ -3,66 +3,102 @@ from snac import SNAC
 import torch
 import gradio as gr
 from transformers import AutoModelForCausalLM, AutoTokenizer
-from huggingface_hub import snapshot_download
 from dotenv import load_dotenv
 load_dotenv()
 # Check if CUDA is available
 device = "cuda" if torch.cuda.is_available() else "cpu"
 print("Loading SNAC model...")
-snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz")
-snac_model = snac_model.to(device)
-model_name = "Mohaddz/orpheus-3b-0.1-ft-ar"
-# Download only model config and safetensors
-snapshot_download(
-    repo_id=model_name,
-    allow_patterns=[
-        "config.json",
-        "*.safetensors",
-        "model.safetensors.index.json",
-    ],
-    ignore_patterns=[
-        "optimizer.pt",
-        "pytorch_model.bin",
-        "training_args.bin",
-        "scheduler.pt",
-        "tokenizer.json",
-        "tokenizer_config.json",
-        "special_tokens_map.json",
-        "vocab.json",
-        "merges.txt",
-        "tokenizer.*"
-    ]
-)
-model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)
-model.to(device)
-tokenizer = AutoTokenizer.from_pretrained(model_name)
-print(f"Orpheus model loaded to {device}")
-# Process text prompt
-def process_prompt(prompt, voice, tokenizer, device):
     prompt = f"{voice}: {prompt}"
-    input_ids = tokenizer(prompt, return_tensors="pt").input_ids
     start_token = torch.tensor([[128259]], dtype=torch.int64)  # Start of human
     end_tokens = torch.tensor([[128009, 128260]], dtype=torch.int64)  # End of text, End of human
     modified_input_ids = torch.cat([start_token, input_ids, end_tokens], dim=1)  # SOH SOT Text EOT EOH
-    # No padding needed for single input
     attention_mask = torch.ones_like(modified_input_ids)
     return modified_input_ids.to(device), attention_mask.to(device)
-# Parse output tokens to audio
 def parse_output(generated_ids):
     token_to_find = 128257
     token_to_remove = 128258
     token_indices = (generated_ids == token_to_find).nonzero(as_tuple=True)
     if len(token_indices[1]) > 0:
@@ -81,19 +117,23 @@ def parse_output(generated_ids):
         row_length = row.size(0)
         new_length = (row_length // 7) * 7
         trimmed_row = row[:new_length]
-        trimmed_row = [t - 128266 for t in trimmed_row]
         code_lists.append(trimmed_row)
-    return code_lists[0]  # Return just the first one for single sample
-# Redistribute codes for audio generation
-def redistribute_codes(code_list, snac_model):
-    device = next(snac_model.parameters()).device  # Get the device of SNAC model
     layer_1 = []
     layer_2 = []
     layer_3 = []
-    for i in range((len(code_list)+1)//7):
         layer_1.append(code_list[7*i])
         layer_2.append(code_list[7*i+1]-4096)
         layer_3.append(code_list[7*i+2]-(2*4096))
@@ -101,137 +141,190 @@ def redistribute_codes(code_list, snac_model):
         layer_2.append(code_list[7*i+4]-(4*4096))
         layer_3.append(code_list[7*i+5]-(5*4096))
         layer_3.append(code_list[7*i+6]-(6*4096))
-    # Move tensors to the same device as the SNAC model
     codes = [
-        torch.tensor(layer_1, device=device).unsqueeze(0),
-        torch.tensor(layer_2, device=device).unsqueeze(0),
-        torch.tensor(layer_3, device=device).unsqueeze(0)
     ]
-    audio_hat = snac_model.decode(codes)
-    return audio_hat.detach().squeeze().cpu().numpy()  # Always return CPU numpy array
-# Main generation function
 @spaces.GPU()
-def generate_speech(text, voice, temperature, top_p, repetition_penalty, max_new_tokens, progress=gr.Progress()):
     if not text.strip():
         return None
     try:
         progress(0.1, "Processing text...")
-        input_ids, attention_mask = process_prompt(text, voice, tokenizer, device)
         progress(0.3, "Generating speech tokens...")
         with torch.no_grad():
-            generated_ids = model.generate(
                 input_ids=input_ids,
                 attention_mask=attention_mask,
                 max_new_tokens=max_new_tokens,
                 do_sample=True,
-                temperature=temperature,
                 top_p=top_p,
                 repetition_penalty=repetition_penalty,
                 num_return_sequences=1,
-                eos_token_id=128258,
             )
         progress(0.6, "Processing speech tokens...")
         code_list = parse_output(generated_ids)
         progress(0.8, "Converting to audio...")
         audio_samples = redistribute_codes(code_list, snac_model)
         return (24000, audio_samples)  # Return sample rate and audio
     except Exception as e:
         print(f"Error generating speech: {e}")
         return None
 # Examples for the UI
 examples = [
-    ["Hey there my name is Tara, <chuckle> and I'm a speech generation model that can sound like a person.", "tara", 0.6, 0.95, 1.1, 1200],
-    ["I've also been taught to understand and produce paralinguistic things like sighing, or chuckling, or yawning!", "dan", 0.7, 0.95, 1.1, 1200],
-    ["I live in San Francisco, and have, uhm let's see, 3 billion 7 hundred ... well, lets just say a lot of parameters.", "emma", 0.6, 0.9, 1.2, 1200]
 ]
-# Available voices
-VOICES = ["tara", "dan", "josh", "emma"]
 # Create Gradio interface
 with gr.Blocks(title="Orpheus Text-to-Speech") as demo:
     gr.Markdown("""
-    # 🎵 [Orpheus Text-to-Speech](https://github.com/canopyai/Orpheus-TTS)
-    Enter your text below and hear it converted to natural-sounding speech with the Orpheus TTS model.
-    ## Tips for better prompts:
-    - Add paralinguistic elements like `<chuckle>`, `<sigh>`, or `uhm` for more human-like speech.
-    - Longer text prompts generally work better than very short phrases
-    - Adjust the temperature slider for more varied (higher) or consistent (lower) speech patterns
-    """)
     with gr.Row():
         with gr.Column(scale=3):
             text_input = gr.Textbox(
-                label="Text to speak",
-                placeholder="Enter your text here...",
-                lines=5
             )
             voice = gr.Dropdown(
-                choices=VOICES,
-                value="tara",
-                label="Voice"
             )
-            with gr.Accordion("Advanced Settings", open=False):
                 temperature = gr.Slider(
                     minimum=0.1, maximum=1.5, value=0.6, step=0.05,
-                    label="Temperature",
                     info="Higher values (0.7-1.0) create more expressive but less stable speech"
                 )
                 top_p = gr.Slider(
                     minimum=0.1, maximum=1.0, value=0.95, step=0.05,
-                    label="Top P",
                     info="Nucleus sampling threshold"
                 )
                 repetition_penalty = gr.Slider(
                     minimum=1.0, maximum=2.0, value=1.1, step=0.05,
-                    label="Repetition Penalty",
                     info="Higher values discourage repetitive patterns"
                 )
                 max_new_tokens = gr.Slider(
                     minimum=100, maximum=2000, value=1200, step=100,
-                    label="Max Length",
                     info="Maximum length of generated audio (in tokens)"
                 )
             with gr.Row():
-                submit_btn = gr.Button("Generate Speech", variant="primary")
-                clear_btn = gr.Button("Clear")
         with gr.Column(scale=2):
-            audio_output = gr.Audio(label="Generated Speech", type="numpy")
     # Set up examples
     gr.Examples(
         examples=examples,
         inputs=[text_input, voice, temperature, top_p, repetition_penalty, max_new_tokens],
         outputs=audio_output,
-        fn=generate_speech,
-        cache_examples=True,
     )
-    # Set up event handlers
     submit_btn.click(
         fn=generate_speech,
         inputs=[text_input, voice, temperature, top_p, repetition_penalty, max_new_tokens],
         outputs=audio_output
     )
     clear_btn.click(
         fn=lambda: (None, None),
         inputs=[],
         outputs=[text_input, audio_output]
     )
 # Launch the app
 if __name__ == "__main__":
-    demo.queue().launch(share=False, ssr_mode=False)

 import torch
 import gradio as gr
 from transformers import AutoModelForCausalLM, AutoTokenizer
+# Removed snapshot_download as from_pretrained handles caching
 from dotenv import load_dotenv
+import gc # Import garbage collector for memory management
 load_dotenv()
+# --- Global Variables ---
+current_model = None
+current_tokenizer = None
+current_model_name = None
+model_choices = ["Mohaddz/orpheus-3b-0.1-ft-ar", "Mohaddz/orpheus-arabic-exp"]
+default_model_name = "Mohaddz/orpheus-3b-0.1-ft-ar" # Or your preferred default
+# --- End Global Variables ---
 # Check if CUDA is available
 device = "cuda" if torch.cuda.is_available() else "cpu"
+dtype = torch.bfloat16 if device == "cuda" else torch.float32 # Use float32 on CPU
 print("Loading SNAC model...")
+try:
+    snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz")
+    snac_model = snac_model.to(device)
+    print("SNAC model loaded.")
+except Exception as e:
+    print(f"Error loading SNAC model: {e}")
+    snac_model = None # Handle case where SNAC fails
+# --- Model Loading Function ---
+def load_model_and_tokenizer(model_name_to_load, progress=gr.Progress(track_tqdm=True)):
+    """Make ``model_name_to_load`` the active Orpheus model and tokenizer.
+
+    The active model, tokenizer and model name are kept in the module-level
+    globals ``current_model``, ``current_tokenizer`` and ``current_model_name``.
+    Any previously loaded model is released *before* the new one is loaded,
+    so if loading fails no model is left active afterwards.
+
+    Args:
+        model_name_to_load: Model identifier passed to ``from_pretrained``.
+        progress: Gradio progress tracker. It is not referenced in the body;
+            presumably it is declared so Gradio can mirror tqdm progress in
+            the UI -- TODO confirm.
+
+    Returns:
+        A human-readable status string (already loaded / loaded / error).
+        Load failures are caught and reported through the return value and
+        a ``gr.Warning`` rather than raised.
+    """
+    global current_model, current_tokenizer, current_model_name, device, dtype
+    # Nothing to do when the requested model is the one already in memory.
+    if model_name_to_load == current_model_name and current_model is not None:
+        print(f"Model {model_name_to_load} is already loaded.")
+        gr.Info(f"Model {model_name_to_load} is already loaded.")
+        return f"Model {model_name_to_load} already loaded." # Return status message
+    print(f"Unloading previous model if exists...")
+    # Explicitly delete previous model and clear cache to free VRAM
+    if current_model is not None:
+        del current_model
+        current_model = None
+    if current_tokenizer is not None:
+        del current_tokenizer
+        current_tokenizer = None
+    gc.collect() # Run garbage collection
+    if device == "cuda":
+        torch.cuda.empty_cache() # Clear CUDA cache
+    print(f"Loading Orpheus model: {model_name_to_load}...")
+    try:
+        # Use from_pretrained which handles download and caching
+        new_model = AutoModelForCausalLM.from_pretrained(model_name_to_load, torch_dtype=dtype)
+        new_model.to(device)
+        new_tokenizer = AutoTokenizer.from_pretrained(model_name_to_load)
+        # Update global variables
+        # (only after both loads succeeded, so the globals never mix models)
+        current_model = new_model
+        current_tokenizer = new_tokenizer
+        current_model_name = model_name_to_load
+        print(f"Orpheus model {current_model_name} loaded successfully to {device}")
+        gr.Info(f"Model {current_model_name} loaded.")
+        return f"Model {current_model_name} loaded." # Return status message
+    except Exception as e:
+        print(f"Error loading model {model_name_to_load}: {e}")
+        # Reset globals if loading fails
+        current_model = None
+        current_tokenizer = None
+        current_model_name = None
+        gr.Warning(f"Failed to load model {model_name_to_load}. Please try again or select another model.")
+        return f"Error loading {model_name_to_load}." # Return status message
+# --- End Model Loading Function ---
+# Process text prompt (Uses global tokenizer now)
+def process_prompt(prompt, voice, device):
+    """Turn a text prompt into model-ready ``input_ids`` and ``attention_mask``.
+
+    The text is prefixed with ``"<voice>: "``, tokenized with the global
+    ``current_tokenizer`` and wrapped in the special token ids shown below.
+
+    Args:
+        prompt: Text to be spoken.
+        voice: Voice name placed in front of the text.
+        device: Device the returned tensors are moved to.
+
+    Returns:
+        Tuple ``(input_ids, attention_mask)``; the mask is all ones because a
+        single, unpadded sequence is built.
+
+    Raises:
+        ValueError: If no tokenizer is currently loaded.
+    """
+    if current_tokenizer is None:
+        raise ValueError("Tokenizer not loaded.")
     prompt = f"{voice}: {prompt}"
+    input_ids = current_tokenizer(prompt, return_tensors="pt").input_ids
     start_token = torch.tensor([[128259]], dtype=torch.int64)  # Start of human
     end_tokens = torch.tensor([[128009, 128260]], dtype=torch.int64)  # End of text, End of human
+    # NOTE(review): the "SOT" in the layout below presumably comes from the
+    # tokenizer's own start-of-text token -- confirm against the tokenizer config.
     modified_input_ids = torch.cat([start_token, input_ids, end_tokens], dim=1)  # SOH SOT Text EOT EOH
     attention_mask = torch.ones_like(modified_input_ids)
     return modified_input_ids.to(device), attention_mask.to(device)
+# Parse output tokens to audio (no change needed)
 def parse_output(generated_ids):
     token_to_find = 128257
     token_to_remove = 128258
     token_indices = (generated_ids == token_to_find).nonzero(as_tuple=True)
     if len(token_indices[1]) > 0:
         row_length = row.size(0)
         new_length = (row_length // 7) * 7
         trimmed_row = row[:new_length]
+        trimmed_row = [t - 128266 for t in trimmed_row] # Adjust based on actual token IDs if needed
         code_lists.append(trimmed_row)
+    return code_lists[0] if code_lists else [] # Handle empty case
+# Redistribute codes for audio generation (no change needed)
+def redistribute_codes(code_list, snac_model_instance):
+    if not snac_model_instance or not code_list:
+         print("SNAC model not loaded or code list empty.")
+         return None
+    snac_device = next(snac_model_instance.parameters()).device
     layer_1 = []
     layer_2 = []
     layer_3 = []
+    num_frames = len(code_list) // 7 # Use integer division
+    for i in range(num_frames):
         layer_1.append(code_list[7*i])
         layer_2.append(code_list[7*i+1]-4096)
         layer_3.append(code_list[7*i+2]-(2*4096))
         layer_2.append(code_list[7*i+4]-(4*4096))
         layer_3.append(code_list[7*i+5]-(5*4096))
         layer_3.append(code_list[7*i+6]-(6*4096))
+    if not layer_1: # Check if any codes were processed
+        print("No valid frames found in code list.")
+        return None
     codes = [
+        torch.tensor(layer_1, device=snac_device).unsqueeze(0),
+        torch.tensor(layer_2, device=snac_device).unsqueeze(0),
+        torch.tensor(layer_3, device=snac_device).unsqueeze(0)
     ]
+    with torch.no_grad():
+        audio_hat = snac_model_instance.decode(codes)
+    return audio_hat.detach().squeeze().cpu().numpy()
+# Main generation function (Uses global model now)
 @spaces.GPU()
+def generate_speech(text, voice, temperature, top_p, repetition_penalty, max_new_tokens, progress=gr.Progress(track_tqdm=True)):
+    """Generate speech audio for ``text`` with the currently loaded model.
+
+    Args:
+        text: Text to speak. ``None``, empty or whitespace-only input is
+            rejected with an info message.
+        voice: Voice name forwarded to ``process_prompt``.
+        temperature: Sampling temperature; clamped to at least 0.01.
+        top_p: Nucleus sampling threshold.
+        repetition_penalty: Repetition penalty passed to ``generate``.
+        max_new_tokens: Upper bound on the number of generated tokens.
+        progress: Gradio progress tracker used to report pipeline stages.
+
+    Returns:
+        ``(24000, audio_samples)`` on success, or ``None`` when a model is
+        missing, the text is empty, or no audio could be decoded.
+
+    Raises:
+        gr.Error: If anything fails during prompt processing, generation or
+            decoding, so that the failure is shown in the UI.
+    """
+    if current_model is None or current_tokenizer is None:
+        gr.Warning("Orpheus model not loaded. Please select a model and wait for it to load.")
+        return None
+    if snac_model is None:
+        gr.Warning("SNAC vocoder model failed to load. Cannot generate audio.")
+        return None
+    # The Clear handler writes None into the textbox, so guard before strip().
+    if not text or not text.strip():
+        gr.Info("Please enter some text.")
         return None
     try:
         progress(0.1, "Processing text...")
+        input_ids, attention_mask = process_prompt(text, voice, device)
         progress(0.3, "Generating speech tokens...")
+        # Fall back to the EOS id when the tokenizer defines no pad token.
+        pad_token_id = current_tokenizer.pad_token_id
+        if pad_token_id is None:
+            pad_token_id = current_tokenizer.eos_token_id
         with torch.no_grad():
+            generated_ids = current_model.generate(
                 input_ids=input_ids,
                 attention_mask=attention_mask,
                 max_new_tokens=max_new_tokens,
                 do_sample=True,
+                temperature=max(temperature, 0.01), # Ensure temp is not zero
                 top_p=top_p,
                 repetition_penalty=repetition_penalty,
                 num_return_sequences=1,
+                eos_token_id=128258, # Make sure this is correct for the models
+                pad_token_id=pad_token_id,
             )
         progress(0.6, "Processing speech tokens...")
         code_list = parse_output(generated_ids)
         progress(0.8, "Converting to audio...")
         audio_samples = redistribute_codes(code_list, snac_model)
+        if audio_samples is None:
+            gr.Warning("Failed to generate audio samples.")
+            return None
         return (24000, audio_samples)  # Return sample rate and audio
     except Exception as e:
         print(f"Error generating speech: {e}")
+        import traceback
+        traceback.print_exc() # Print full traceback for debugging
+        # gr.Error is an exception class: it only reaches the UI when raised.
+        raise gr.Error(f"An error occurred during generation: {e}") from e
+# --- Load Default Model at Startup ---
+# Moved initial loading to happen *before* launching the UI
+# This ensures a model is ready when the interface appears.
+print("Loading default model...")
+initial_status = load_model_and_tokenizer(default_model_name)
+print(initial_status)
+# --- End Load Default Model ---
 # Examples for the UI
 examples = [
+    # Examples might need adjusting if voices/behavior differ between models
+    ["السلام عليكم كيف حالكم اليوم؟", "tara", 0.6, 0.95, 1.1, 1200],
+    ["أنا نموذج لتحويل النص إلى كلام يمكنه التحدث باللغة العربية.", "dan", 0.7, 0.95, 1.1, 1200],
+    # ["I live in San Francisco, and have, uhm let's see, 3 billion 7 hundred ... well, lets just say a lot of parameters.", "emma", 0.6, 0.9, 1.2, 1200] # Keep or remove English examples
 ]
+# Available voices (Might need updating based on your fine-tuned models)
+# You might need different voice lists per model, or just use 'tara'/'dan' if they exist in both
+VOICES = ["tara", "dan", "josh", "emma"] # Adjust as needed
 # Create Gradio interface
 with gr.Blocks(title="Orpheus Text-to-Speech") as demo:
     gr.Markdown("""
+    # 🎵 Orpheus Text-to-Speech (Arabic Fine-tuned)
+    Enter your text below and hear it converted to natural-sounding speech.
+    Select the desired fine-tuned model below.
+    """)
+    with gr.Row():
+        # Model Selection Dropdown
+        model_selector = gr.Dropdown(
+            choices=model_choices,
+            value=current_model_name, # Default to the loaded model
+            label="Select Fine-Tuned Model",
+            interactive=True
+        )
+        # Status Textbox (Optional)
+        status_display = gr.Textbox(label="Model Status", value=initial_status, interactive=False)
     with gr.Row():
         with gr.Column(scale=3):
             text_input = gr.Textbox(
+                label="Text to speak (النص)",
+                placeholder="أدخل النص هنا...",
+                lines=5,
+                text_align="right" # Align text right for Arabic
             )
             voice = gr.Dropdown(
+                choices=VOICES,
+                value="tara", # Default voice
+                label="Voice (الصوت)"
             )
+            with gr.Accordion("Advanced Settings (إعدادات متقدمة)", open=False):
                 temperature = gr.Slider(
                     minimum=0.1, maximum=1.5, value=0.6, step=0.05,
+                    label="Temperature (درجة الحرارة)",
                     info="Higher values (0.7-1.0) create more expressive but less stable speech"
                 )
                 top_p = gr.Slider(
                     minimum=0.1, maximum=1.0, value=0.95, step=0.05,
+                    label="Top P",
                     info="Nucleus sampling threshold"
                 )
                 repetition_penalty = gr.Slider(
                     minimum=1.0, maximum=2.0, value=1.1, step=0.05,
+                    label="Repetition Penalty (عقوبة التكرار)",
                     info="Higher values discourage repetitive patterns"
                 )
                 max_new_tokens = gr.Slider(
                     minimum=100, maximum=2000, value=1200, step=100,
+                    label="Max Length (الطول الأقصى)",
                     info="Maximum length of generated audio (in tokens)"
                 )
             with gr.Row():
+                submit_btn = gr.Button("Generate Speech (توليد الكلام)", variant="primary")
+                clear_btn = gr.Button("Clear (مسح)")
         with gr.Column(scale=2):
+            audio_output = gr.Audio(label="Generated Speech (الكلام المولّد)", type="numpy")
     # Set up examples
     gr.Examples(
         examples=examples,
         inputs=[text_input, voice, temperature, top_p, repetition_penalty, max_new_tokens],
         outputs=audio_output,
+        fn=generate_speech, # Function to call for examples
+        cache_examples=False, # Disable caching if models change behavior
+    )
+    # --- Event Handlers ---
+    # Trigger model loading when dropdown changes
+    model_selector.change(
+        fn=load_model_and_tokenizer,
+        inputs=[model_selector],
+        outputs=[status_display] # Update status display
     )
+    # Generate speech button click
     submit_btn.click(
         fn=generate_speech,
         inputs=[text_input, voice, temperature, top_p, repetition_penalty, max_new_tokens],
         outputs=audio_output
     )
+    # Clear button click
     clear_btn.click(
         fn=lambda: (None, None),
         inputs=[],
         outputs=[text_input, audio_output]
     )
+    # --- End Event Handlers ---
 # Launch the app
 if __name__ == "__main__":
+    demo.queue().launch(share=False) # Removed ssr_mode=False, queue is usually enough