|
|
--- |
|
|
license: mit |
|
|
--- |
|
|
Use the standard diffusers `LTX2Pipeline` API to run this model, as shown in the example below.
|
|
example: |
|
|
``` |
|
|
|
|
|
"""Example: run the 4-bit LTX-2 checkpoint with LTX2Pipeline and export the result."""
import os

import torch
from diffusers import AutoModel, LTX2Pipeline  # AutoModel was missing from the original imports
from diffusers.pipelines.ltx2.export_utils import encode_video
from transformers import Gemma3ForConditionalGeneration

repo = 'smthem/ltx-2-19b-dev-diffusers-4bit'


### text_encoder
# Load the pre-quantized 4-bit text encoder from the repo.
text_encoder = Gemma3ForConditionalGeneration.from_pretrained(
    repo,
    subfolder="text_encoder",
)

### transformer
# Load the pre-quantized 4-bit transformer from the repo.
transformer_4bit = AutoModel.from_pretrained(
    repo,
    subfolder="transformer",
)

# Assemble the pipeline around the quantized components; offload to CPU
# between submodule calls to reduce peak GPU memory.
pipeline = LTX2Pipeline.from_pretrained(
    "smthem/ltx-2-19b-dev-diffusers-test",
    transformer=transformer_4bit,
    text_encoder=text_encoder,
    torch_dtype=torch.float16,
)
pipeline.enable_model_cpu_offload()

prompt = 'A video of a dog dancing to energetic electronic dance music'
# NOTE: adjacent string literals only concatenate inside parentheses —
# without them each line below would be a separate (no-op) statement.
negative_prompt = (
    "blurry, out of focus, overexposed, underexposed, low contrast, washed out colors, excessive noise, "
    "grainy texture, poor lighting, flickering, motion blur, distorted proportions, unnatural skin tones, "
    "deformed facial features, asymmetrical face, missing facial features, extra limbs, disfigured hands, "
    "wrong hand count, artifacts around text, inconsistent perspective, camera shake, incorrect depth of "
    "field, background too sharp, background clutter, distracting reflections, harsh shadows, inconsistent "
    "lighting direction, color banding, cartoonish rendering, 3D CGI look, unrealistic materials, uncanny "
    "valley effect, incorrect ethnicity, wrong gender, exaggerated expressions, wrong gaze direction, "
    "mismatched lip sync, silent or muted audio, distorted voice, robotic voice, echo, background noise, "
    "off-sync audio,incorrect dialogue, added dialogue, repetitive speech, jittery movement, awkward "
    "pauses, incorrect timing, unnatural transitions, inconsistent framing, tilted camera, flat lighting, "
    "inconsistent tone, cinematic oversaturation, stylized filters, or AI artifacts."
)

# Generation settings (guidance_scale / frame_rate / output paths were
# previously undefined `guidance_scale` and `args.*` references).
guidance_scale = 5.0  # TODO: tune for your prompt
frame_rate = 25
output_dir = "."
output_filename = "output.mp4"

video, audio = pipeline(
    prompt=prompt,
    negative_prompt=negative_prompt,
    height=512,
    width=768,
    num_frames=121,
    frame_rate=frame_rate,
    num_inference_steps=20,
    guidance_scale=guidance_scale,
    generator=torch.Generator(device="cuda").manual_seed(42),
    output_type="np",
    return_dict=False,
)

# Convert video to uint8 (but keep as NumPy array)
video = (video * 255).round().astype("uint8")
video = torch.from_numpy(video)

# Mux the first video/audio pair into a single file.
encode_video(
    video[0],
    fps=frame_rate,
    audio=audio[0].float().cpu(),
    audio_sample_rate=pipeline.vocoder.config.output_sampling_rate,  # should be 24000
    output_path=os.path.join(output_dir, output_filename),
)
|
|
|
|
|
|
|
|
``` |
|
|
|