import importlib.util

from .cogvideox_xfuser import CogVideoXMultiGPUsAttnProcessor2_0
from .flux2_xfuser import Flux2MultiGPUsAttnProcessor2_0
from .flux_xfuser import FluxMultiGPUsAttnProcessor2_0
from .fsdp import shard_model
from .fuser import (get_sequence_parallel_rank,
                    get_sequence_parallel_world_size, get_sp_group,
                    get_world_group, init_distributed_environment,
                    initialize_model_parallel, sequence_parallel_all_gather,
                    sequence_parallel_chunk, set_multi_gpus_devices,
                    xFuserLongContextAttention)
from .hunyuanvideo_xfuser import HunyuanVideoMultiGPUsAttnProcessor2_0
from .qwen_xfuser import QwenImageMultiGPUsAttnProcessor2_0
from .wan_xfuser import usp_attn_forward, usp_attn_s2v_forward
from .z_image_xfuser import ZMultiGPUsSingleStreamAttnProcessor
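
# --------------------------------------------------------------- #
# Rough usage sketch for the sequence-parallel helpers re-exported
# above (illustrative only; the argument names are assumptions, see
# fuser.py for the actual signatures):
#   init_distributed_environment(rank=rank, world_size=world_size)
#   initialize_model_parallel(sequence_parallel_degree=world_size)
#   sp_rank = get_sequence_parallel_rank()
#   chunk = sequence_parallel_chunk(hidden_states)
#   hidden_states = sequence_parallel_all_gather(chunk)
# --------------------------------------------------------------- #
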
# paifuser is an internally developed acceleration package, which can be used on PAI.
if importlib.util.find_spec("paifuser") is not None:
    # --------------------------------------------------------------- #
    # simple_wrapper hides a Cython-compiled callable behind a plain
    # Python frame, working around conflicts between Cython and
    # torch.compile.
    # --------------------------------------------------------------- #
    def simple_wrapper(func):
        def inner(*args, **kwargs):
            return func(*args, **kwargs)
        return inner
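
    # Sketch of the intended pattern (illustrative; `cython_kernel` is a
    # hypothetical Cython-compiled callable): the plain Python closure
    # gives torch.compile an ordinary frame to trace, e.g.
    #   compiled = torch.compile(simple_wrapper(cython_kernel))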

    # --------------------------------------------------------------- #
    # Sparse Attention Kernel
    # --------------------------------------------------------------- #
    # parallel_magvit_vae is imported for its side effects (the VAE
    # acceleration reported in the log message below).
    from paifuser.models import parallel_magvit_vae
    from paifuser.ops import wan_usp_sparse_attention_wrapper

    from . import wan_xfuser

    # --------------------------------------------------------------- #
    # Sparse Attention: wrap the USP attention forward with the sparse
    # attention kernel.
    # --------------------------------------------------------------- #
    usp_sparse_attn_wrap_forward = simple_wrapper(
        wan_usp_sparse_attention_wrapper()(wan_xfuser.usp_attn_forward))
    wan_xfuser.usp_attn_forward = usp_sparse_attn_wrap_forward
    usp_attn_forward = usp_sparse_attn_wrap_forward
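
    # Rebinding note: both wan_xfuser.usp_attn_forward and this module's
    # usp_attn_forward are repointed, so later lookups through either
    # import path resolve to the sparse kernel. Sanity-check sketch (the
    # import path `videox_fun.dist` is hypothetical):
    #   from videox_fun import dist
    #   assert dist.usp_attn_forward is dist.wan_xfuser.usp_attn_forward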
print("Import PAI VAE Turbo and Sparse Attention")

    # --------------------------------------------------------------- #
    # Fast Rope Kernel
    # --------------------------------------------------------------- #
    import types

    import torch
    from paifuser.ops import (ENABLE_KERNEL, usp_fast_rope_apply_qk,
                              usp_rope_apply_real_qk)

    def deepcopy_function(f):
        # Rebuild the function object so a reference to the original
        # rope_apply_qk survives the monkey patch below.
        return types.FunctionType(f.__code__, f.__globals__, name=f.__name__,
                                  argdefs=f.__defaults__, closure=f.__closure__)

    local_rope_apply_qk = deepcopy_function(wan_xfuser.rope_apply_qk)

    if ENABLE_KERNEL:
        def adaptive_fast_usp_rope_apply_qk(q, k, grid_sizes, freqs):
            # Autograd still needs the original implementation, so only
            # take the fused kernel when gradients are disabled.
            if torch.is_grad_enabled():
                return local_rope_apply_qk(q, k, grid_sizes, freqs)
            else:
                return usp_fast_rope_apply_qk(q, k, grid_sizes, freqs)
    else:
        def adaptive_fast_usp_rope_apply_qk(q, k, grid_sizes, freqs):
            return usp_rope_apply_real_qk(q, k, grid_sizes, freqs)

    wan_xfuser.rope_apply_qk = adaptive_fast_usp_rope_apply_qk
    rope_apply_qk = adaptive_fast_usp_rope_apply_qk
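
    # Dispatch sketch (illustrative; shapes and return values follow
    # wan_xfuser.rope_apply_qk): under torch.no_grad() the fused kernel
    # runs, while any grad-enabled context falls back to the original
    # Python rope:
    #   with torch.no_grad():
    #       q, k = rope_apply_qk(q, k, grid_sizes, freqs)  # fused path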
print("Import PAI Fast rope")