import gradio as gr
import torch
from diffusers import StableDiffusionPipeline
from TTS.api import TTS
import moviepy.editor as mp
import numpy as np
from PIL import Image
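# Assumed environment (not pinned by this file): moviepy 1.x (the moviepy.editor
# module was removed in moviepy 2.x), an ImageMagick install for mp.TextClip, and the
# Coqui "TTS" package. A CUDA GPU is optional but speeds up Stable Diffusion greatly.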

def estimate_chunk_durations(text, words_per_second=2.5, min_sec=5, max_sec=7):
    """Split text into chunks worth roughly min_sec-max_sec of narration (at most ~20 words)."""
    words = text.split()
    chunks = []
    current_chunk = []
    current_duration = 0
    for word in words:
        current_chunk.append(word)
        current_duration += 1 / words_per_second  # each word adds ~0.4 s at the default rate
        # Close the chunk once it is long enough and hits the upper duration or word limit.
        if current_duration >= min_sec and (current_duration >= max_sec or len(current_chunk) > 20):
            chunks.append(" ".join(current_chunk))
            current_chunk = []
            current_duration = 0
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks
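# Rough illustration of the chunking above (assuming the default 2.5 words/second): a
# 40-word paragraph reaches max_sec after about 18 words, so it would split into chunks
# of roughly 18, 18, and 4 words.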

def generate_speech(text):
    # Glow-TTS copes with long inputs; the Tacotron2 model below is kept as a commented-out
    # alternative and would need its max_decoder_steps raised for long texts.
    #tts = TTS("tts_models/en/ljspeech/tacotron2-DDC")
    tts = TTS("tts_models/en/ljspeech/glow-tts")
    #tts.synthesizer.model.decoder.max_decoder_steps = 30000  # Increase limit
    wav_path = "speech.wav"
    tts.tts_to_file(text=text, file_path=wav_path)
    return wav_path
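# Optional: a sketch of moving synthesis to the GPU, assuming a recent Coqui TTS release
# where the TTS object supports .to(); left as a comment so the CPU path above is unchanged:
# tts = TTS("tts_models/en/ljspeech/glow-tts").to("cuda" if torch.cuda.is_available() else "cpu")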

def generate_images(chunks, image_size=(640, 480), use_diffusion=True, num_steps=40):
    image_paths = []
    if use_diffusion:
        # Load the Stable Diffusion pipeline once and reuse it for every chunk.
        pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4")
        pipe.to("cuda" if torch.cuda.is_available() else "cpu")

    for i, chunk in enumerate(chunks):
        if use_diffusion:
            # The chunk text itself is used as the prompt for its frame.
            image = pipe(chunk, num_inference_steps=num_steps).images[0]
            image = image.resize(image_size)
        else:
            # Without diffusion, fall back to a plain black frame.
            image = Image.new("RGB", image_size, (0, 0, 0))
        img_path = f"image_{i}.png"
        image.save(img_path)
        image_paths.append(img_path)
    return image_paths
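# Possible VRAM saving (an assumption, not used above): diffusers supports loading the
# pipeline in half precision on GPU, e.g.
# StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16),
# which roughly halves memory use; keep float32 when running on CPU.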

def create_video(images, durations, speech_path, movie_title, add_subtitles, chunks, image_size=(640, 480)):
    clips = []
    # One-second title card over a dark background (mp.TextClip relies on ImageMagick in moviepy 1.x).
    title_clip = mp.TextClip(movie_title, fontsize=50, color='white', size=image_size)
    title_clip = title_clip.set_duration(1).set_position('center')
    black_start = mp.ColorClip(image_size, color=(0, 0, 0), duration=1).set_opacity(0.8)
    clips.append(mp.CompositeVideoClip([black_start, title_clip]))

    for img, dur, chunk in zip(images, durations, chunks):
        frame = np.array(Image.open(img).resize(image_size, Image.Resampling.LANCZOS))
        clip = mp.ImageClip(frame).set_duration(dur)
        if add_subtitles:
            # 'caption' mode wraps the chunk text to fit the frame width.
            txt_clip = mp.TextClip(chunk, fontsize=30, color='white', size=(image_size[0] - 20, None), method='caption')
            txt_clip = txt_clip.set_duration(dur).set_position(('center', 'bottom'))
            clip = mp.CompositeVideoClip([clip, txt_clip])
        clips.append(clip)

    # Two seconds of black at the end, then attach the narration track.
    black_end = mp.ColorClip(image_size, color=(0, 0, 0), duration=2)
    video = mp.concatenate_videoclips(clips + [black_end])
    audio = mp.AudioFileClip(speech_path)
    final_video = video.set_audio(audio)
    final_video.write_videofile("output.mp4", fps=24)
    return "output.mp4"

def process_text(text, movie_title, image_size, use_diffusion, num_steps, add_subtitles):
    chunks = estimate_chunk_durations(text)
    speech_path = generate_speech(text)
    image_paths = generate_images(chunks, image_size, use_diffusion, num_steps)
    # Per-chunk display time: words / 2.5 words-per-second, clamped to 5-10 seconds.
    durations = [min(10, max(5, len(chunk.split()) / 2.5)) for chunk in chunks]
    video_path = create_video(image_paths, durations, speech_path, movie_title, add_subtitles, chunks, image_size)
    return video_path

with gr.Blocks() as demo:
    gr.Markdown("# Text-to-Video Generator using AI 🎥")
    text_input = gr.Textbox(label="Enter your text")
    movie_title_input = gr.Textbox(label="Movie Title", value="")
    file_input = gr.File(label="Or upload a .txt file")
    image_size_input = gr.Radio(choices=["640x480", "800x600", "1024x768"], label="Select Image Size", value="640x480")
    use_diffusion_input = gr.Checkbox(label="Use Diffusion Images", value=True)
    num_steps_input = gr.Slider(minimum=1, maximum=50, step=1, value=5, label="Diffusion Model Steps")
    add_subtitles_input = gr.Checkbox(label="Add Subtitles", value=False)
    process_btn = gr.Button("Generate Video")
    output_video = gr.Video()
    
    def handle_request(text, movie_title, file, image_size, use_diffusion, num_steps, add_subtitles):
        # An uploaded .txt file overrides the textbox contents.
        if file is not None:
            with open(file.name, "r") as f:
                text = f.read()
        image_size_dict = {"640x480": (640, 480), "800x600": (800, 600), "1024x768": (1024, 768)}
        return process_text(text, movie_title, image_size_dict[image_size], use_diffusion, num_steps, add_subtitles)
    
    process_btn.click(handle_request, inputs=[text_input, movie_title_input, file_input, image_size_input, use_diffusion_input, num_steps_input, add_subtitles_input], outputs=output_video)

demo.launch()
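# For long generations it may help to call demo.queue() before launch(); demo.launch(share=True)
# would additionally expose a temporary public Gradio link (both are standard Gradio options).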