import gradio as gr
import torch
import torchaudio
from diffusers import StableDiffusionPipeline
from TTS.api import TTS
import moviepy.editor as mp
import numpy as np
import os
from PIL import Image


def estimate_chunk_durations(text, words_per_second=2.5, min_sec=5, max_sec=7):
    """Split text into chunks whose estimated narration time falls roughly between min_sec and max_sec."""
    words = text.split()
    chunks = []
    current_chunk = []
    current_duration = 0
    for word in words:
        current_chunk.append(word)
        current_duration += 1 / words_per_second
        if current_duration >= min_sec:
            # Close the chunk once it is long enough, or if it grows past 20 words
            if current_duration >= max_sec or len(current_chunk) > 20:
                chunks.append(" ".join(current_chunk))
                current_chunk = []
                current_duration = 0
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks


def generate_speech(text):
    """Synthesize the full text to a single WAV file with Coqui TTS."""
    # tts = TTS("tts_models/en/ljspeech/tacotron2-DDC")
    tts = TTS("tts_models/en/ljspeech/glow-tts")
    # tts.synthesizer.model.decoder.max_decoder_steps = 30000  # Increase limit
    wav_path = "speech.wav"
    tts.tts_to_file(text=text, file_path=wav_path)
    return wav_path


def generate_images(chunks, image_size=(640, 480), use_diffusion=True, num_steps=40):
    """Generate one image per text chunk, either with Stable Diffusion or as a plain black frame."""
    image_paths = []
    if use_diffusion:
        pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4")
        pipe.to("cuda" if torch.cuda.is_available() else "cpu")
    for i, chunk in enumerate(chunks):
        if use_diffusion:
            image = pipe(chunk, num_inference_steps=int(num_steps)).images[0]
            image = image.resize(image_size)
        else:
            image = Image.new("RGB", image_size, (0, 0, 0))
        img_path = f"image_{i}.png"
        image.save(img_path)
        image_paths.append(img_path)
    return image_paths


def create_video(images, durations, speech_path, movie_title, add_subtitles, chunks, image_size=(640, 480)):
    """Assemble title card, per-chunk image clips (optionally subtitled), and narration audio into output.mp4."""
    clips = []
    # One-second title card over a dark background
    title_clip = mp.TextClip(movie_title, fontsize=50, color='white', size=image_size)
    title_clip = title_clip.set_duration(1).set_position('center')
    black_start = mp.ColorClip(image_size, color=(0, 0, 0), duration=1).set_opacity(0.8)
    clips.append(mp.CompositeVideoClip([black_start, title_clip]))
    for img, dur, chunk in zip(images, durations, chunks):
        frame = np.array(Image.open(img).resize(image_size, Image.Resampling.LANCZOS))
        clip = mp.ImageClip(frame).set_duration(dur)
        if add_subtitles:
            txt_clip = mp.TextClip(chunk, fontsize=30, color='white',
                                   size=(image_size[0] - 20, None), method='caption')
            txt_clip = txt_clip.set_duration(dur).set_position(('center', 'bottom'))
            clip = mp.CompositeVideoClip([clip, txt_clip])
        clips.append(clip)
    # Two seconds of black at the end
    black_end = mp.ColorClip(image_size, color=(0, 0, 0), duration=2)
    video = mp.concatenate_videoclips(clips + [black_end])
    audio = mp.AudioFileClip(speech_path)
    final_video = video.set_audio(audio)
    final_video.write_videofile("output.mp4", fps=24)
    return "output.mp4"


def process_text(text, movie_title, image_size, use_diffusion, num_steps, add_subtitles):
    """Run the full pipeline: chunk text, synthesize speech, generate images, and render the video."""
    chunks = estimate_chunk_durations(text)
    speech_path = generate_speech(text)
    image_paths = generate_images(chunks, image_size, use_diffusion, num_steps)
    # Per-chunk display time estimated from word count, clamped to 5-10 seconds
    durations = [min(10, max(5, len(chunk.split()) / 2.5)) for chunk in chunks]
    video_path = create_video(image_paths, durations, speech_path, movie_title, add_subtitles, chunks, image_size)
    return video_path


with gr.Blocks() as demo:
    gr.Markdown("# Text-to-Video Generator using AI 🎥")
    text_input = gr.Textbox(label="Enter your text")
    movie_title_input = gr.Textbox(label="Movie Title", value="")
    file_input = gr.File(label="Or upload a .txt file")
    image_size_input = gr.Radio(choices=["640x480", "800x600", "1024x768"],
                                label="Select Image Size", value="640x480")
    use_diffusion_input = gr.Checkbox(label="Use Diffusion Images", value=True)
    num_steps_input = gr.Slider(minimum=1, maximum=50, step=1, value=5, label="Diffusion Model Steps")
    add_subtitles_input = gr.Checkbox(label="Add Subtitles", value=False)
    process_btn = gr.Button("Generate Video")
    output_video = gr.Video()

    def handle_request(text, movie_title, file, image_size, use_diffusion, num_steps, add_subtitles):
        # An uploaded .txt file takes precedence over the textbox contents
        if file is not None:
            with open(file.name, "r") as f:
                text = f.read()
        image_size_dict = {"640x480": (640, 480), "800x600": (800, 600), "1024x768": (1024, 768)}
        return process_text(text, movie_title, image_size_dict[image_size],
                            use_diffusion, num_steps, add_subtitles)

    process_btn.click(handle_request,
                      inputs=[text_input, movie_title_input, file_input, image_size_input,
                              use_diffusion_input, num_steps_input, add_subtitles_input],
                      outputs=output_video)

demo.launch()