smthem
/

ltx-2-19b-dev-diffusers-4bit

Model card Files Files and versions

smthem committed 8 days ago

Commit

d47fef9

·

verified ·

1 Parent(s): a4d6110

Update README.md

Files changed (1) hide show

README.md +73 -3

README.md CHANGED Viewed

@@ -1,3 +1,73 @@
----
-license: mit
----

+---
+license: mit
+---
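+Use the standard diffusers `LTX2Pipeline` to run this model, passing in the 4-bit text encoder and transformer from this repo.
+Example:
+```python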
+from diffusers import LTX2Pipeline
+from diffusers.pipelines.ltx2.export_utils import encode_video
+repo= 'smthem/ltx-2-19b-dev-diffusers-4bit'
+### text_encoder
+from transformers import Gemma3ForConditionalGeneration
+text_encoder = Gemma3ForConditionalGeneration.from_pretrained(
+                  repo,
+                  subfolder="text_encoder",
+                  quantization_config=quant_config,
+                  torch_dtype=torch.float16,
+                )
+### transformer
+transformer_4bit = AutoModel.from_pretrained(
+                    repo,
+                    subfolder="transformer",
+                    quantization_config=quant_config,
+                    torch_dtype=torch.float16,
+                )
+pipeline = LTX2Pipeline.from_pretrained("smthem/ltx-2-19b-dev-diffusers-test",transformer=transformer_4bit,text_encoder=text_encoder,torch_dtype=torch.float16,)
+pipeline.enable_model_cpu_offload()
+prompt='A video of a dog dancing to energetic electronic dance music'
+negative_prompt="blurry, out of focus, overexposed, underexposed, low contrast, washed out colors, excessive noise, "
+            "grainy texture, poor lighting, flickering, motion blur, distorted proportions, unnatural skin tones, "
+            "deformed facial features, asymmetrical face, missing facial features, extra limbs, disfigured hands, "
+            "wrong hand count, artifacts around text, inconsistent perspective, camera shake, incorrect depth of "
+            "field, background too sharp, background clutter, distracting reflections, harsh shadows, inconsistent "
+            "lighting direction, color banding, cartoonish rendering, 3D CGI look, unrealistic materials, uncanny "
+            "valley effect, incorrect ethnicity, wrong gender, exaggerated expressions, wrong gaze direction, "
+            "mismatched lip sync, silent or muted audio, distorted voice, robotic voice, echo, background noise, "
+            "off-sync audio,incorrect dialogue, added dialogue, repetitive speech, jittery movement, awkward "
+            "pauses, incorrect timing, unnatural transitions, inconsistent framing, tilted camera, flat lighting, "
+            "inconsistent tone, cinematic oversaturation, stylized filters, or AI artifacts."
+video, audio = pipeline(
+      prompt=prompt,
+      negative_prompt=negative_prompt,
+      height=512,
+      width=768,
+      num_frames=121,
+      frame_rate=25,
+      num_inference_steps=20,
+      guidance_scale=guidance_scale,
+      generator=torch.Generator(device="cuda").manual_seed(42),
+      output_type="np",
+      return_dict=False,
+  )
+# Convert video to uint8 (but keep as NumPy array)
+video = (video * 255).round().astype("uint8")
+video = torch.from_numpy(video)
+encode_video(
+      video[0],
+      fps=args.frame_rate,
+      audio=audio[0].float().cpu(),
+      audio_sample_rate=pipeline.vocoder.config.output_sampling_rate,  # should be 24000
+      output_path=os.path.join(args.output_dir, args.output_filename),
+  )
+```