Update README.md
Browse files
README.md
CHANGED
|
@@ -1,3 +1,73 @@
|
|
| 1 |
-
---
|
| 2 |
-
license: mit
|
| 3 |
-
---
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
license: mit
|
| 3 |
+
---
|
| 4 |
+
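Use the standard diffusers `LTX2Pipeline` to run this model, loading the 4-bit text encoder and transformer from this repo and passing them to the pipeline.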
|
| 5 |
+
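Example (the snippet assumes `os`, `torch`, and `AutoModel` are already imported, and that `quant_config`, `guidance_scale`, and `args` are defined by your own script):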
|
| 6 |
+
```python
|
| 7 |
+
|
| 8 |
+
from diffusers import LTX2Pipeline
|
| 9 |
+
from diffusers.pipelines.ltx2.export_utils import encode_video
|
| 10 |
+
repo= 'smthem/ltx-2-19b-dev-diffusers-4bit'
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
### text_encoder
|
| 14 |
+
from transformers import Gemma3ForConditionalGeneration
|
| 15 |
+
text_encoder = Gemma3ForConditionalGeneration.from_pretrained(
|
| 16 |
+
repo,
|
| 17 |
+
subfolder="text_encoder",
|
| 18 |
+
quantization_config=quant_config,
|
| 19 |
+
torch_dtype=torch.float16,
|
| 20 |
+
)
|
| 21 |
+
|
| 22 |
+
### transformer
|
| 23 |
+
transformer_4bit = AutoModel.from_pretrained(
|
| 24 |
+
repo,
|
| 25 |
+
subfolder="transformer",
|
| 26 |
+
quantization_config=quant_config,
|
| 27 |
+
torch_dtype=torch.float16,
|
| 28 |
+
)
|
| 29 |
+
pipeline = LTX2Pipeline.from_pretrained("smthem/ltx-2-19b-dev-diffusers-test",transformer=transformer_4bit,text_encoder=text_encoder,torch_dtype=torch.float16,)
|
| 30 |
+
pipeline.enable_model_cpu_offload()
|
| 31 |
+
|
| 32 |
+
prompt='A video of a dog dancing to energetic electronic dance music'
|
| 33 |
+
negative_prompt="blurry, out of focus, overexposed, underexposed, low contrast, washed out colors, excessive noise, "
|
| 34 |
+
"grainy texture, poor lighting, flickering, motion blur, distorted proportions, unnatural skin tones, "
|
| 35 |
+
"deformed facial features, asymmetrical face, missing facial features, extra limbs, disfigured hands, "
|
| 36 |
+
"wrong hand count, artifacts around text, inconsistent perspective, camera shake, incorrect depth of "
|
| 37 |
+
"field, background too sharp, background clutter, distracting reflections, harsh shadows, inconsistent "
|
| 38 |
+
"lighting direction, color banding, cartoonish rendering, 3D CGI look, unrealistic materials, uncanny "
|
| 39 |
+
"valley effect, incorrect ethnicity, wrong gender, exaggerated expressions, wrong gaze direction, "
|
| 40 |
+
"mismatched lip sync, silent or muted audio, distorted voice, robotic voice, echo, background noise, "
|
| 41 |
+
"off-sync audio,incorrect dialogue, added dialogue, repetitive speech, jittery movement, awkward "
|
| 42 |
+
"pauses, incorrect timing, unnatural transitions, inconsistent framing, tilted camera, flat lighting, "
|
| 43 |
+
"inconsistent tone, cinematic oversaturation, stylized filters, or AI artifacts."
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
video, audio = pipeline(
|
| 47 |
+
prompt=prompt,
|
| 48 |
+
negative_prompt=negative_prompt,
|
| 49 |
+
height=512,
|
| 50 |
+
width=768,
|
| 51 |
+
num_frames=121,
|
| 52 |
+
frame_rate=25,
|
| 53 |
+
num_inference_steps=20,
|
| 54 |
+
guidance_scale=guidance_scale,
|
| 55 |
+
generator=torch.Generator(device="cuda").manual_seed(42),
|
| 56 |
+
output_type="np",
|
| 57 |
+
return_dict=False,
|
| 58 |
+
)
|
| 59 |
+
|
| 60 |
+
# Convert video to uint8 (but keep as NumPy array)
|
| 61 |
+
video = (video * 255).round().astype("uint8")
|
| 62 |
+
video = torch.from_numpy(video)
|
| 63 |
+
|
| 64 |
+
encode_video(
|
| 65 |
+
video[0],
|
| 66 |
+
fps=args.frame_rate,
|
| 67 |
+
audio=audio[0].float().cpu(),
|
| 68 |
+
audio_sample_rate=pipeline.vocoder.config.output_sampling_rate, # should be 24000
|
| 69 |
+
output_path=os.path.join(args.output_dir, args.output_filename),
|
| 70 |
+
)
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
```
|