Spaces:
Running
Running
| import gradio as gr | |
| import torch | |
| from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer | |
| import numpy as np | |
| from scipy.io.wavfile import write | |
| import os | |
# ---------------------------------------------------------------------------
# Model setup. Everything below runs once, at import time, and leaves the
# module-level objects used by the generation functions further down:
# `lyrics_generator`, `bark_processor`, `bark_model` and `device`.
# ---------------------------------------------------------------------------
print("Initializing models...")

# Lyrics generation: plain GPT-2 is used as an example checkpoint; a model
# fine-tuned on lyrics can be substituted by changing this name.
lyrics_model_name = "gpt2"
lyrics_tokenizer = AutoTokenizer.from_pretrained(lyrics_model_name)
lyrics_model = AutoModelForCausalLM.from_pretrained(lyrics_model_name)
lyrics_generator = pipeline(
    "text-generation",
    model=lyrics_model,
    tokenizer=lyrics_tokenizer,
)

# Bark is used for both the vocal track and the backing "music" track.
from transformers import BarkModel, BarkProcessor

print("Loading Bark model...")
bark_processor = BarkProcessor.from_pretrained("suno/bark")
bark_model = BarkModel.from_pretrained("suno/bark")

# Prefer the GPU when one is available, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")
bark_model = bark_model.to(device)
def generate_lyrics(prompt, max_length=150):
    """Generate song lyrics based on the input prompt.

    Args:
        prompt: Theme or description of the song; it is embedded in an
            instruction sentence before being sent to the model.
        max_length: Forwarded unchanged to the text-generation pipeline as
            its ``max_length`` argument.

    Returns:
        The generated text with the instruction sentence removed and
        leading/trailing whitespace stripped.
    """
    # Add specific instructions to guide the model to generate lyrics
    enhanced_prompt = f"Write song lyrics about {prompt}. Include a verse and chorus structure:"

    generated = lyrics_generator(
        enhanced_prompt,
        max_length=max_length,
        num_return_sequences=1,
        # temperature / top_k / top_p are sampling parameters; request
        # sampling explicitly so they apply whatever checkpoint is loaded,
        # instead of relying on that checkpoint's default configuration.
        do_sample=True,
        temperature=0.9,
        top_k=50,
        top_p=0.95,
    )

    # The pipeline output includes the prompt; drop it so only lyrics remain.
    lyrics = generated[0]['generated_text'].replace(enhanced_prompt, "").strip()
    return lyrics
def generate_vocals(lyrics, voice_preset="v2/en_speaker_6"):
    """Generate vocals using Bark and save them as a WAV file.

    Args:
        lyrics: Text to be voiced.
        voice_preset: Bark speaker preset name passed to the processor.

    Returns:
        A tuple ``(output_path, sample_rate, audio_array)`` where
        ``output_path`` is ``"outputs/vocals.wav"``, ``sample_rate`` is
        24000 and ``audio_array`` is the generated waveform as a squeezed
        NumPy array.
    """
    print(f"Generating vocals with lyrics: {lyrics[:50]}...")

    # Process text for better vocal generation by adding musical notation
    vocals_text = f"♪ {lyrics} ♪"
    inputs = bark_processor(text=vocals_text, voice_preset=voice_preset)
    # The model lives on `device`; the inputs have to be on the same device
    # or generation fails with a device mismatch when running on a GPU.
    inputs = inputs.to(device)

    audio_array = bark_model.generate(**inputs)
    audio_array = audio_array.cpu().numpy().squeeze()

    sample_rate = 24000  # Bark's output sample rate

    # Save to a fixed location and hand back both the path and the raw data
    os.makedirs("outputs", exist_ok=True)
    output_path = "outputs/vocals.wav"
    write(output_path, sample_rate, audio_array)
    return output_path, sample_rate, audio_array
def generate_simple_music(prompt, voice_preset="v2/en_speaker_9"):
    """Generate simple music using Bark's capability to create singing/humming.

    Args:
        prompt: Theme of the song; it is embedded in a bracketed music
            instruction followed by a hummed phrase.
        voice_preset: Bark speaker preset name passed to the processor.

    Returns:
        A tuple ``(output_path, sample_rate, audio_array)`` where
        ``output_path`` is ``"outputs/music.wav"``, ``sample_rate`` is
        24000 and ``audio_array`` is the generated waveform as a squeezed
        NumPy array.
    """
    print(f"Generating music for theme: {prompt}...")

    # Create a prompt that instructs Bark to generate instrumental sounds
    music_text = f"[music: {prompt}, instrumental, background music without lyrics] ♪ hmm hmm hmm ♪"
    inputs = bark_processor(text=music_text, voice_preset=voice_preset)
    # The model lives on `device`; the inputs have to be on the same device
    # or generation fails with a device mismatch when running on a GPU.
    inputs = inputs.to(device)

    audio_array = bark_model.generate(**inputs)
    audio_array = audio_array.cpu().numpy().squeeze()

    sample_rate = 24000  # Bark's output sample rate

    # Save to a fixed location and hand back both the path and the raw data
    os.makedirs("outputs", exist_ok=True)
    output_path = "outputs/music.wav"
    write(output_path, sample_rate, audio_array)
    return output_path, sample_rate, audio_array
def mix_audio(vocals_data, music_data, vocals_volume=0.7, music_volume=0.4):
    """Combine vocals and music with basic mixing and write the result to disk.

    Args:
        vocals_data: ``(path, sample_rate, samples)`` tuple for the vocals.
        music_data: ``(path, sample_rate, samples)`` tuple for the music.
        vocals_volume: Gain applied to the vocal samples.
        music_volume: Gain applied to the music samples.

    Returns:
        ``"outputs/final_song.wav"``, the path of the written mix.

    Note:
        The output file is written with the vocals' sample rate; the
        music's sample rate is not consulted.
    """
    _, vocals_sr, vocals_array = vocals_data
    _, _, music_array = music_data

    # Bring both tracks to the length of the longer one by zero-padding the
    # end of the shorter one (a pad width of 0 leaves a track unchanged).
    target_len = max(len(vocals_array), len(music_array))
    vocals_array = np.pad(vocals_array, (0, target_len - len(vocals_array)))
    music_array = np.pad(music_array, (0, target_len - len(music_array)))

    # Mix - make vocals louder than music
    mixed_audio = vocals_volume * vocals_array + music_volume * music_array

    # Normalize to a peak of 0.9. A silent (or empty) mix has no peak to
    # scale by; dividing anyway would fill the file with NaN samples.
    peak = np.max(np.abs(mixed_audio)) if mixed_audio.size else 0.0
    if peak > 0:
        mixed_audio = mixed_audio / peak * 0.9

    # Save final mix
    os.makedirs("outputs", exist_ok=True)
    output_path = "outputs/final_song.wav"
    write(output_path, vocals_sr, mixed_audio)
    return output_path
def text_to_song(prompt, voice_selection):
    """Turn a text prompt into a song and return its lyrics and audio path.

    The prompt is used twice: once to write the lyrics (which are then
    voiced with the chosen preset) and once to generate the backing track.
    Both tracks are mixed into a single file.

    Args:
        prompt: Description of the song to create.
        voice_selection: Label of the voice chosen in the UI; unknown
            labels fall back to ``"v2/en_speaker_6"``.

    Returns:
        A tuple ``(lyrics, final_song_path)``.
    """
    print(f"Processing prompt: {prompt}")

    # Map the UI label to a Bark speaker preset.
    fallback_preset = "v2/en_speaker_6"
    preset_by_label = {
        "Female Singer": "v2/en_speaker_6",
        "Male Singer": "v2/en_speaker_5",
        "Female Alto": "v2/en_speaker_9",
        "Male Baritone": "v2/en_speaker_0",
    }
    chosen_preset = preset_by_label.get(voice_selection, fallback_preset)

    # Lyrics first, then the sung track, then the backing track.
    song_lyrics = generate_lyrics(prompt)
    vocal_track = generate_vocals(song_lyrics, voice_preset=chosen_preset)
    backing_track = generate_simple_music(prompt)

    # Blend both tracks into the final file.
    mixed_path = mix_audio(vocal_track, backing_track)
    return song_lyrics, mixed_path
# Gradio front end: a prompt box and voice picker feed `text_to_song`, whose
# two results are shown as the lyrics text and the playable song.
# NOTE(review): the source lost its indentation; the layout below assumes the
# button sits under the input row rather than inside it — confirm.
with gr.Blocks() as demo:
    gr.Markdown("# Text to Song Generation App")
    gr.Markdown("Enter a prompt describing the song you want to generate")

    # Inputs side by side.
    with gr.Row():
        prompt_input = gr.Textbox(
            label="Prompt",
            placeholder="Enter a description for your song...",
            value="a love song about summer",
        )
        voice_selection = gr.Dropdown(
            choices=["Female Singer", "Male Singer", "Female Alto", "Male Baritone"],
            label="Select Voice",
            value="Female Singer",
        )

    generate_button = gr.Button("Generate Song")

    # Outputs, one per row.
    with gr.Row():
        lyrics_output = gr.Textbox(label="Generated Lyrics")
    with gr.Row():
        audio_output = gr.Audio(label="Generated Song")

    # Wire the button to the generation pipeline.
    generate_button.click(
        fn=text_to_song,
        inputs=[prompt_input, voice_selection],
        outputs=[lyrics_output, audio_output],
    )

    # Clickable sample inputs that fill in the prompt and voice fields.
    gr.Examples(
        examples=[
            ["a heartfelt country ballad about lost love", "Male Singer"],
            ["an upbeat pop song about friendship", "Female Singer"],
            ["a rock anthem about overcoming challenges", "Male Baritone"],
        ],
        inputs=[prompt_input, voice_selection],
    )

# Start the web server only when this file is executed directly.
if __name__ == "__main__":
    demo.launch()