import gradio as gr
import torch
import torchaudio
from diffusers import StableDiffusionPipeline
from TTS.api import TTS
import moviepy.editor as mp
import numpy as np
import os
from PIL import Image
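# Pipeline overview: split the input text into ~5-7 s chunks, synthesize narration
# with Coqui TTS, render one Stable Diffusion image per chunk, then assemble the
# frames, optional subtitles, and the audio track into output.mp4 with moviepy.
# Assumed (not pinned in this file): `moviepy.editor` implies moviepy 1.x, and
# `Image.Resampling.LANCZOS` implies Pillow >= 9.1.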
def estimate_chunk_durations(text, words_per_second=2.5, min_sec=5, max_sec=7):
    """Split text into chunks that take roughly min_sec..max_sec to narrate."""
    words = text.split()
    chunks = []
    current_chunk = []
    current_duration = 0
    for word in words:
        current_chunk.append(word)
        current_duration += 1 / words_per_second
        if current_duration >= min_sec:
            if current_duration >= max_sec or len(current_chunk) > 20:
                chunks.append(" ".join(current_chunk))
                current_chunk = []
                current_duration = 0
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks
def generate_speech(text):
    # Coqui TTS with the glow-tts LJSpeech model; the Tacotron2 variant is kept
    # commented out for reference.
    #tts = TTS("tts_models/en/ljspeech/tacotron2-DDC")
    tts = TTS("tts_models/en/ljspeech/glow-tts")
    #tts.synthesizer.model.decoder.max_decoder_steps = 30000  # Increase limit
    wav_path = "speech.wav"
    tts.tts_to_file(text=text, file_path=wav_path)
    return wav_path
def generate_images(chunks, image_size=(640, 480), use_diffusion=True, num_steps=40):
    image_paths = []
    if use_diffusion:
        # Load Stable Diffusion once and reuse the pipeline for every chunk.
        pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4")
        pipe.to("cuda" if torch.cuda.is_available() else "cpu")
    for i, chunk in enumerate(chunks):
        if use_diffusion:
            image = pipe(chunk, num_inference_steps=num_steps).images[0]
            image = image.resize(image_size)
        else:
            # Fallback: plain black frame when diffusion is disabled.
            image = Image.new("RGB", image_size, (0, 0, 0))
        img_path = f"image_{i}.png"
        image.save(img_path)
        image_paths.append(img_path)
    return image_paths
def create_video(images, durations, speech_path, movie_title, add_subtitles, chunks, image_size=(640, 480)):
    clips = []
    # Title card: one second of the movie title over a black background.
    title_clip = mp.TextClip(movie_title, fontsize=50, color='white', size=image_size)
    title_clip = title_clip.set_duration(1).set_position('center')
    black_start = mp.ColorClip(image_size, color=(0, 0, 0), duration=1).set_opacity(0.8)
    clips.append(mp.CompositeVideoClip([black_start, title_clip]))
    for img, dur, chunk in zip(images, durations, chunks):
        frame = np.array(Image.open(img).resize(image_size, Image.Resampling.LANCZOS))
        clip = mp.ImageClip(frame).set_duration(dur)
        if add_subtitles:
            txt_clip = mp.TextClip(chunk, fontsize=30, color='white', size=(image_size[0] - 20, None), method='caption')
            txt_clip = txt_clip.set_duration(dur).set_position(('center', 'bottom'))
            clip = mp.CompositeVideoClip([clip, txt_clip])
        clips.append(clip)
    # Trailing black frame, then the narration audio over the concatenated clips.
    black_end = mp.ColorClip(image_size, color=(0, 0, 0), duration=2)
    video = mp.concatenate_videoclips(clips + [black_end])
    audio = mp.AudioFileClip(speech_path)
    final_video = video.set_audio(audio)
    final_video.write_videofile("output.mp4", fps=24)
    return "output.mp4"
def process_text(text, movie_title, image_size, use_diffusion, num_steps, add_subtitles):
    chunks = estimate_chunk_durations(text)
    speech_path = generate_speech(text)
    image_paths = generate_images(chunks, image_size, use_diffusion, num_steps)
    # Per-chunk display time: word count at ~2.5 words/sec, clamped to 5-10 seconds.
    durations = [min(10, max(5, len(chunk.split()) / 2.5)) for chunk in chunks]
    video_path = create_video(image_paths, durations, speech_path, movie_title, add_subtitles, chunks, image_size)
    return video_path
with gr.Blocks() as demo:
    gr.Markdown("# Text-to-Video Generator using AI 🎥")
    text_input = gr.Textbox(label="Enter your text")
    movie_title_input = gr.Textbox(label="Movie Title", value="")
    file_input = gr.File(label="Or upload a .txt file")
    image_size_input = gr.Radio(choices=["640x480", "800x600", "1024x768"], label="Select Image Size", value="640x480")
    use_diffusion_input = gr.Checkbox(label="Use Diffusion Images", value=True)
    num_steps_input = gr.Slider(minimum=1, maximum=50, step=1, value=5, label="Diffusion Model Steps")
    add_subtitles_input = gr.Checkbox(label="Add Subtitles", value=False)
    process_btn = gr.Button("Generate Video")
    output_video = gr.Video()

    def handle_request(text, movie_title, file, image_size, use_diffusion, num_steps, add_subtitles):
        # An uploaded .txt file takes precedence over the textbox contents.
        if file is not None:
            with open(file.name, "r") as f:
                text = f.read()
        image_size_dict = {"640x480": (640, 480), "800x600": (800, 600), "1024x768": (1024, 768)}
        return process_text(text, movie_title, image_size_dict[image_size], use_diffusion, num_steps, add_subtitles)

    process_btn.click(handle_request, inputs=[text_input, movie_title_input, file_input, image_size_input, use_diffusion_input, num_steps_input, add_subtitles_input], outputs=output_video)

demo.launch()