Update gradio_app.py
gradio_app.py  CHANGED  (+35 -0)
@@ -11,6 +11,7 @@ import spaces
 import torch
 import cv2
 import numpy as np
+import time
 
 from huggingface_hub import snapshot_download
 
@@ -297,16 +298,26 @@ def _load_models_cpu_once():
     from omegaconf import OmegaConf
 
     # Config
+    t0 = time.perf_counter()
+    t = time.perf_counter()
     G_INFER_CONFIG = OmegaConf.load('./configs/inference/inference_v2.yaml')
+    print(f"[timing:init] load infer config: {time.perf_counter()-t:.2f}s", flush=True)
 
     # Tokenizer / encoders / vae (CPU)
+    t = time.perf_counter()
     G_TOKENIZER = AutoTokenizer.from_pretrained(G_ARGS.pretrained_model_name_or_path, subfolder="tokenizer",
                                                 revision=G_ARGS.revision)
+    print(f"[timing:init] tokenizer: {time.perf_counter()-t:.2f}s", flush=True)
+    t = time.perf_counter()
     G_IMAGE_ENCODER = CLIPVisionModelWithProjection.from_pretrained(G_ARGS.image_encoder, revision=G_ARGS.revision)
+    print(f"[timing:init] image_encoder: {time.perf_counter()-t:.2f}s", flush=True)
+    t = time.perf_counter()
     G_VAE = AutoencoderKL.from_pretrained(G_ARGS.pretrained_model_name_or_path, subfolder="vae",
                                           revision=G_ARGS.revision)
+    print(f"[timing:init] vae: {time.perf_counter()-t:.2f}s", flush=True)
 
     # UNet2D with 8-channel conv_in (CPU)
+    t = time.perf_counter()
     G_UNET2 = UNet2DConditionModel.from_pretrained(
         G_ARGS.pretrained_model_name_or_path, subfolder="unet", revision=G_ARGS.revision, torch_dtype=torch.float32
     )
@@ -318,13 +329,17 @@ def _load_models_cpu_once():
     conv_in_8.weight[:, :4, :, :].copy_(G_UNET2.conv_in.weight)
     conv_in_8.bias.copy_(G_UNET2.conv_in.bias)
     G_UNET2.conv_in = conv_in_8
+    print(f"[timing:init] unet2 + conv_in adapt: {time.perf_counter()-t:.2f}s", flush=True)
 
     # ControlNet (CPU)
+    t = time.perf_counter()
     G_CONTROLNET = ControlNetModel.from_unet(G_UNET2)
     state_dict2 = torch.load(os.path.join(G_ARGS.model_path, "pytorch_model.bin"), map_location="cpu")
     G_CONTROLNET.load_state_dict(state_dict2, strict=False)
+    print(f"[timing:init] controlnet load_state: {time.perf_counter()-t:.2f}s", flush=True)
 
     # UNet3D (CPU)
+    t = time.perf_counter()
     prefix = "motion_module"
     ckpt_num = "4140000"
     save_path = os.path.join(G_ARGS.model_path, f"{prefix}-{ckpt_num}.pth")
@@ -334,13 +349,17 @@ def _load_models_cpu_once():
         subfolder="unet",
         unet_additional_kwargs=G_INFER_CONFIG.unet_additional_kwargs,
     )
+    print(f"[timing:init] unet3d from_pretrained_2d: {time.perf_counter()-t:.2f}s", flush=True)
 
     # CC projection (CPU)
+    t = time.perf_counter()
     G_CC_PROJ = CCProjection()
     state_dict3 = torch.load(os.path.join(G_ARGS.model_path, "pytorch_model_1.bin"), map_location="cpu")
     G_CC_PROJ.load_state_dict(state_dict3, strict=False)
+    print(f"[timing:init] cc_projection load_state: {time.perf_counter()-t:.2f}s", flush=True)
 
     # Hair encoder (CPU)
+    t = time.perf_counter()
     from ref_encoder.reference_unet import ref_unet
     G_HAIR_ENCODER = ref_unet.from_pretrained(
         G_ARGS.pretrained_model_name_or_path, subfolder="unet", revision=G_ARGS.revision, low_cpu_mem_usage=False,
@@ -348,6 +367,8 @@ def _load_models_cpu_once():
     )
     state_dict4 = torch.load(os.path.join(G_ARGS.model_path, "pytorch_model_2.bin"), map_location="cpu")
     G_HAIR_ENCODER.load_state_dict(state_dict4, strict=False)
+    print(f"[timing:init] hair_encoder load_state: {time.perf_counter()-t:.2f}s", flush=True)
+    print(f"[timing:init] total preload: {time.perf_counter()-t0:.2f}s", flush=True)
 
 
 try:
@@ -381,10 +402,12 @@ def _ensure_models_loaded():
 with open("imgs/background.png", "rb") as f:
     _b64_bg = base64.b64encode(f.read()).decode()
 
+
 @spaces.GPU(duration=300)
 def inference(id_image, hair_image):
     # ZeroGPU: force the 'cuda' device (torch.cuda.is_available may report False under ZeroGPU).
     device = torch.device("cuda")
+    t_total = time.perf_counter()
 
     # Make sure the global models are loaded
     _ensure_models_loaded()
@@ -412,8 +435,10 @@ def inference(id_image, hair_image):
     hair_image.save(hair_path)
 
     # Align
+    t = time.perf_counter()
     aligned_id = _maybe_align_image(id_path, output_size=1024, prefer_cuda=True)
     aligned_hair = _maybe_align_image(hair_path, output_size=1024, prefer_cuda=True)
+    print(f"[timing] align total: {time.perf_counter()-t:.2f}s", flush=True)
 
     aligned_id_path = "gradio_outputs/aligned_id.png"
     aligned_hair_path = "gradio_outputs/aligned_hair.png"
@@ -421,9 +446,11 @@ def inference(id_image, hair_image):
     cv2.imwrite(aligned_hair_path, cv2.cvtColor(aligned_hair, cv2.COLOR_RGB2BGR))
 
     # Balding
+    t = time.perf_counter()
     bald_id_path = "gradio_outputs/bald_id.png"
     cv2.imwrite(bald_id_path, cv2.cvtColor(aligned_id, cv2.COLOR_RGB2BGR))
     bald_head(bald_id_path, bald_id_path)
+    print(f"[timing] bald_head: {time.perf_counter()-t:.2f}s", flush=True)
 
     # Resolve trained model dir
     trained_model_dir = os.path.abspath("trained_model") if os.path.isdir("trained_model") else None
@@ -459,6 +486,7 @@ def inference(id_image, hair_image):
     logger = logging.getLogger(__name__)
 
     # Move the preloaded global models to the GPU
+    t = time.perf_counter()
     tokenizer = G_TOKENIZER
     image_encoder = G_IMAGE_ENCODER.to(device)
     vae = G_VAE.to(device, dtype=torch.float32)
@@ -467,17 +495,21 @@ def inference(id_image, hair_image):
     denoising_unet = G_DENOISING_UNET.to(device)
     cc_projection = G_CC_PROJ.to(device)
     Hair_Encoder = G_HAIR_ENCODER.to(device)
+    print(f"[timing] move models to cuda: {time.perf_counter()-t:.2f}s", flush=True)
 
     # Run inference
+    t = time.perf_counter()
     log_validation(
         vae, tokenizer, image_encoder, denoising_unet,
         args, device, logger,
         cc_projection, controlnet, Hair_Encoder
     )
+    print(f"[timing] sd pipeline (log_validation): {time.perf_counter()-t:.2f}s", flush=True)
 
     output_video = os.path.join(args.output_dir, "validation", "generated_video_0.mp4")
 
     # Extract frames for slider preview
+    t = time.perf_counter()
     frames_dir = os.path.join(args.output_dir, "frames", uuid.uuid4().hex)
     os.makedirs(frames_dir, exist_ok=True)
     cap = cv2.VideoCapture(output_video)
@@ -492,6 +524,9 @@ def inference(id_image, hair_image):
         frames_list.append(fp)
         idx += 1
     cap.release()
+    print(f"[timing] extract frames: {time.perf_counter()-t:.2f}s", flush=True)
+
+    print(f"[timing] total inference: {time.perf_counter()-t_total:.2f}s", flush=True)
 
     max_frames = len(frames_list) if frames_list else 1
     first_frame = frames_list[0] if frames_list else None
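Note: the instrumentation above repeats the same measure-and-print pattern at every step. A minimal sketch of how that pattern could be factored into a reusable helper follows; the `timed` context manager and its label format are illustrative only and are not part of this commit.

import time
from contextlib import contextmanager

@contextmanager
def timed(label):
    # Measure the wall-clock time of the wrapped block and print it in the
    # same "[timing] <label>: <seconds>s" style used by the lines added above.
    start = time.perf_counter()
    try:
        yield
    finally:
        print(f"[timing] {label}: {time.perf_counter() - start:.2f}s", flush=True)

# Hypothetical usage inside inference():
#     with timed("align total"):
#         aligned_id = _maybe_align_image(id_path, output_size=1024, prefer_cuda=True)
#         aligned_hair = _maybe_align_image(hair_path, output_size=1024, prefer_cuda=True)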