Viet-SpeechT5-TTS-finetuning

Sleeping

App Files Files Community

danhtran2mind committed on Oct 1

Commit

1efd737

verified ·

1 Parent(s): 7a64576

Update app.py

Browse files

Files changed (1) hide show

app.py +28 -5

app.py CHANGED Viewed

@@ -4,6 +4,27 @@ import soundfile as sf
 from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
 from datasets import load_dataset
 import numpy as np
 # Load the processor, model, and vocoder
 processor = SpeechT5Processor.from_pretrained("danhtran2mind/Viet-SpeechT5-TTS-finetuning")
@@ -15,9 +36,7 @@ embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validat
 def generate_speech(text, voice):
     # Select speaker embedding based on voice choice
-    speaker_dict = {"male": 2000,
-                    "female": 7000}
     speaker_id = speaker_dict[voice.lower()]
     speaker_embedding = torch.tensor(embeddings_dataset[speaker_id]["xvector"]).unsqueeze(0)
@@ -38,6 +57,9 @@ def generate_speech(text, voice):
     sf.write(output_path, speech.numpy(), samplerate=16000)
     return output_path
 # Create Gradio interface
 iface = gr.Interface(
     fn=generate_speech,
@@ -47,9 +69,10 @@ iface = gr.Interface(
     ],
     outputs=gr.Audio(label="Generated Speech"),
     title="Vietnamese Text-to-Speech with SpeechT5",
-    description="Enter Vietnamese text and select a voice (Male or Female) to generate speech."
 )
 # Launch the app
 if __name__ == "__main__":
-    iface.launch(debug=True)

 from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
 from datasets import load_dataset
 import numpy as np
+import json
+import os
+# Directory containing config files
+CONFIG_DIR = "assets/Viet-SpeechT5-TTS-finetuning"
+# Load all config.json files from the directory
+def load_configs(directory):
+    """Collect example rows from every ``config.json`` found under *directory*.
+
+    The tree rooted at *directory* is walked recursively. Each file named
+    ``config.json`` is parsed as UTF-8 JSON; when the parsed value is an
+    object containing both ``"input_text"`` and ``"voice"`` keys, the pair
+    ``[input_text, voice]`` is appended to the result.
+
+    Loading is best-effort: a file that cannot be read or parsed is reported
+    with ``print`` and skipped, and a missing *directory* simply yields an
+    empty list.
+
+    Args:
+        directory: Path of the root folder to scan.
+
+    Returns:
+        A list of ``[input_text, voice]`` lists, ordered by sorted
+        subdirectory name at each level of the walk.
+    """
+    examples = []
+    for root, dirs, files in os.walk(directory):
+        # Sorting in place makes os.walk descend in a stable order, so the
+        # examples come out the same regardless of filesystem enumeration.
+        dirs.sort()
+        if "config.json" not in files:
+            continue
+        file_path = os.path.join(root, "config.json")
+        try:
+            with open(file_path, 'r', encoding='utf-8') as f:
+                config = json.load(f)
+        except (OSError, ValueError) as e:
+            # OSError covers unreadable files; ValueError covers both
+            # json.JSONDecodeError and UnicodeDecodeError.
+            print(f"Error reading {file_path}: {e}")
+            continue
+        # Valid JSON may still be a list/str/number; only objects qualify.
+        if isinstance(config, dict) and "input_text" in config and "voice" in config:
+            examples.append([config["input_text"], config["voice"]])
+    return examples
 # Load the processor, model, and vocoder
 processor = SpeechT5Processor.from_pretrained("danhtran2mind/Viet-SpeechT5-TTS-finetuning")
 def generate_speech(text, voice):
     # Select speaker embedding based on voice choice
+    speaker_dict = {"male": 2000, "female": 7000}
     speaker_id = speaker_dict[voice.lower()]
     speaker_embedding = torch.tensor(embeddings_dataset[speaker_id]["xvector"]).unsqueeze(0)
     sf.write(output_path, speech.numpy(), samplerate=16000)
     return output_path
+# Load examples from config files
+examples = load_configs(CONFIG_DIR)
 # Create Gradio interface
 iface = gr.Interface(
     fn=generate_speech,
     ],
     outputs=gr.Audio(label="Generated Speech"),
     title="Vietnamese Text-to-Speech with SpeechT5",
+    description="Enter Vietnamese text and select a voice (Male or Female) to generate speech.",
+    examples=examples
 )
 # Launch the app
 if __name__ == "__main__":
+    iface.launch()