import importlib.util

from .cogvideox_xfuser import CogVideoXMultiGPUsAttnProcessor2_0
from .flux2_xfuser import Flux2MultiGPUsAttnProcessor2_0
from .flux_xfuser import FluxMultiGPUsAttnProcessor2_0
from .fsdp import shard_model
from .fuser import (get_sequence_parallel_rank,
                    get_sequence_parallel_world_size, get_sp_group,
                    get_world_group, init_distributed_environment,
                    initialize_model_parallel, sequence_parallel_all_gather,
                    sequence_parallel_chunk, set_multi_gpus_devices,
                    xFuserLongContextAttention)
from .hunyuanvideo_xfuser import HunyuanVideoMultiGPUsAttnProcessor2_0
from .qwen_xfuser import QwenImageMultiGPUsAttnProcessor2_0
from .wan_xfuser import usp_attn_forward, usp_attn_s2v_forward
from .z_image_xfuser import ZMultiGPUsSingleStreamAttnProcessor

# paifuser is an internally developed acceleration package, available on PAI.
if importlib.util.find_spec("paifuser") is not None:
    # --------------------------------------------------------------- #
    #   simple_wrapper works around conflicts between Cython-compiled
    #   functions and torch.compile.
    # --------------------------------------------------------------- #
    def simple_wrapper(func):
        def inner(*args, **kwargs):
            return func(*args, **kwargs)
        return inner

    # --------------------------------------------------------------- #
    #   Sparse Attention Kernel
    # --------------------------------------------------------------- #
    from paifuser.models import parallel_magvit_vae
    from paifuser.ops import wan_usp_sparse_attention_wrapper

    from . import wan_xfuser

    # --------------------------------------------------------------- #
    #   Sparse Attention
    # --------------------------------------------------------------- #
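    # wan_usp_sparse_attention_wrapper() is used here as a decorator factory:
    # it wraps the existing USP attention forward with the PAI sparse-attention
    # kernel, and simple_wrapper re-exposes the result through a plain Python
    # closure so it stays usable under torch.compile (see the comment above).
    # Both the wan_xfuser module attribute and this package's re-exported name
    # are rebound below, so later imports pick up the sparse version.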
    usp_sparse_attn_wrap_forward = simple_wrapper(
        wan_usp_sparse_attention_wrapper()(wan_xfuser.usp_attn_forward))

    wan_xfuser.usp_attn_forward = usp_sparse_attn_wrap_forward
    usp_attn_forward = usp_sparse_attn_wrap_forward
    print("Imported PAI VAE Turbo and Sparse Attention")

    # --------------------------------------------------------------- #
    #   Fast Rope Kernel
    # --------------------------------------------------------------- #
    import types

    import torch
    from paifuser.ops import (ENABLE_KERNEL, usp_fast_rope_apply_qk,
                              usp_rope_apply_real_qk)

    def deepcopy_function(f):
        return types.FunctionType(f.__code__, f.__globals__, name=f.__name__,
                                  argdefs=f.__defaults__, closure=f.__closure__)
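
    # Keep an independent copy of the original (pure Python) rope_apply_qk as a
    # new function object, so the adaptive wrapper below can still fall back to
    # the unpatched implementation after wan_xfuser.rope_apply_qk is replaced.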
    local_rope_apply_qk = deepcopy_function(wan_xfuser.rope_apply_qk)

    if ENABLE_KERNEL:
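        # When the fused kernel is enabled, use it only while autograd is off
        # (inference); with gradients enabled (e.g. training), fall back to the
        # original Python RoPE implementation.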
        def adaptive_fast_usp_rope_apply_qk(q, k, grid_sizes, freqs):
            if torch.is_grad_enabled():
                return local_rope_apply_qk(q, k, grid_sizes, freqs)
            else:
                return usp_fast_rope_apply_qk(q, k, grid_sizes, freqs)
    else:
        def adaptive_fast_usp_rope_apply_qk(q, k, grid_sizes, freqs):
            return usp_rope_apply_real_qk(q, k, grid_sizes, freqs)
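
    # As with sparse attention above, rebind both the module attribute and the
    # package-level name so subsequent imports use the adaptive RoPE.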
    wan_xfuser.rope_apply_qk = adaptive_fast_usp_rope_apply_qk
    rope_apply_qk = adaptive_fast_usp_rope_apply_qk
    print("Imported PAI Fast RoPE")