import torch
from config import Config
from utils import resize_image_to_1mp, get_caption, draw_kps
from PIL import Image

class Generator:
    def __init__(self, model_handler):
        self.mh = model_handler

    def prepare_control_images(self, image, width, height):
        """
        Generates conditioning maps, ensuring they are resized
        to the exact target dimensions (width, height).
        """
        print(f"Generating control maps for {width}x{height}...")
        
        # Generate depth map
        depth_map_raw = self.mh.leres_detector(image) 
        
        # Generate lineart map
        lineart_map_raw = self.mh.lineart_anime_detector(image)
        
        # Manually resize maps to match the exact output resolution
        depth_map = depth_map_raw.resize((width, height), Image.LANCZOS)
        lineart_map = lineart_map_raw.resize((width, height), Image.LANCZOS)
        
        return depth_map, lineart_map

    def predict(
        self, 
        input_image, 
        user_prompt="",
        negative_prompt="",
        guidance_scale=1.5,
        num_inference_steps=6,
        img2img_strength=0.3,
        face_strength=0.3,
        depth_strength=0.3,
        lineart_strength=0.3,
        seed=-1
    ):
        # 1. Pre-process Inputs
        print("Processing Input...")
        processed_image = resize_image_to_1mp(input_image)
        target_width, target_height = processed_image.size
        
        # 2. Get Face Info (replaces get_face_embedding)
        face_info = self.mh.get_face_info(processed_image)
        
        # 3. Generate Prompt
        if not user_prompt.strip():
            try:
                generated_caption = get_caption(processed_image)
                final_prompt = f"{Config.STYLE_TRIGGER}, {generated_caption}"
            except Exception as e:
                print(f"Captioning failed: {e}, using default prompt.")
                final_prompt = f"{Config.STYLE_TRIGGER}, a beautiful pixel art image"
        else:
            final_prompt = f"{Config.STYLE_TRIGGER}, {user_prompt}"
            
        print(f"Prompt: {final_prompt}")
        print(f"Negative Prompt: {negative_prompt}")

        # 4. Generate OTHER Control Maps (Structure)
        print("Generating Control Maps (Depth, LineArt)...")
        depth_map, lineart_map = self.prepare_control_images(processed_image, target_width, target_height)
        
        # 5. Logic for Face vs No-Face (NOW INCLUDES KPS)
        # ControlNet order: [InstantID_KPS, Depth (maps from LeReS), LineArt]
        
        if face_info is not None:
            print("Face detected: Applying InstantID with keypoints.")
            
            # Use face_info['embedding'] (raw) rather than normed_embedding:
            # the raw embedding's higher magnitude (~20-30) is what the
            # adapter requires.
            face_emb = torch.tensor(
                face_info['embedding'], 
                dtype=Config.DTYPE,
                device=Config.DEVICE
            ).unsqueeze(0)
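            # Resulting shape: (1, 512), matching the dummy embedding used
            # in the no-face branch below.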

            # Create keypoint image
            face_kps = draw_kps(processed_image, face_info['kps'])
            
            # Set strengths
            controlnet_conditioning_scale = [face_strength, depth_strength, lineart_strength] 
            
            # --- UPDATED: Reduced IP Adapter Scale ---
            # Lowered from 0.8 to 0.7 to allow LoRA style (pixel art) to 
            # override realistic skin textures while keeping identity.
            self.mh.pipeline.set_ip_adapter_scale(0.7)
        else:
            print("No face detected: Disabling InstantID.")
            # Create dummy embedding
            face_emb = torch.zeros((1, 512), dtype=Config.DTYPE, device=Config.DEVICE)
            # Create dummy keypoint image (black)
            face_kps = Image.new('RGB', (target_width, target_height), (0, 0, 0))
            
            # Set strengths
            controlnet_conditioning_scale = [0.0, depth_strength, lineart_strength] 
            self.mh.pipeline.set_ip_adapter_scale(0.0)

        # --- UPDATED: Control Guidance End Strategy ---
        # Cap the Face ControlNet's duration: even at a strength of 1.0,
        # it stops at 0.6 (60%) of the steps. This leaves the final 40% of
        # the steps free for the Pixel Art LoRA to "pixelize" the face,
        # without the ControlNet pulling it back toward a photo.
        
        face_end_step = min(0.6, face_strength)
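        # e.g. face_strength=1.0 -> face ControlNet stops at 60% of the steps;
        # face_strength=0.3 (the default) -> it stops at 30%.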
        
        control_guidance_end = [
            face_end_step,      # InstantID: Stop early for style
            depth_strength,     # Depth: Keep structure longer
            lineart_strength    # Lineart: Keep outlines longer
        ] 

        # --- Seed/Generator Logic ---
        if seed == -1 or seed is None:
            seed = torch.Generator().seed()
        generator = torch.Generator(device=Config.DEVICE).manual_seed(int(seed))
        print(f"Using seed: {seed}")
        # --- END ---

        # 6. Run Inference
        print("Running pipeline...")
        result = self.mh.pipeline(
            prompt=final_prompt,
            negative_prompt=negative_prompt,
            image=processed_image,  # Base img2img image
            control_image=[face_kps, depth_map, lineart_map],
            image_embeds=face_emb,  # Face identity embedding
            generator=generator,
            
            # --- Parameters from UI ---
            strength=img2img_strength,
            num_inference_steps=num_inference_steps, 
            guidance_scale=guidance_scale,
            # --- End Parameters from UI ---
            
            controlnet_conditioning_scale=controlnet_conditioning_scale,
            control_guidance_end=control_guidance_end,
            
            clip_skip=Config.CLIP_SKIP,
            
        ).images[0]
        
        return result
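
# --- Usage sketch ---
# A minimal example of driving Generator end to end. The ModelHandler
# import path and class name are assumptions (this file only receives the
# handler via __init__); adjust both to match the actual module.
if __name__ == "__main__":
    from model_handler import ModelHandler  # assumed module/class name

    gen = Generator(ModelHandler())
    source = Image.open("input.png").convert("RGB")
    output = gen.predict(
        source,
        user_prompt="a portrait of a knight",
        num_inference_steps=6,
        seed=42,
    )
    output.save("output.png")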