NAME = 'XL Vec'

from torch import Tensor, FloatTensor, nn
import gradio as gr
from modules.processing import StableDiffusionProcessing
from modules import scripts

from scripts.sdhook import SDHook
from scripts.xl_clip import CLIP_SDXL, get_pooled
from scripts.xl_vec_xyz import init_xyz

# Incoming `aesthetic_score` value that the hooks interpret as "this call is
# for the positive prompt"; any other value is treated as the negative prompt.
# NOTE(review): presumably the host application's default "high" aesthetic
# score -- confirm against the caller that builds the conditioning dict.
_POSITIVE_AESTHETIC_MARKER = 6.0


def hook_input(
    args: 'Hook',
    mod: nn.Module,
    inputs: tuple[dict[str, Tensor]],
):
    """Pre-forward hook: overwrite size/crop/aesthetic conditioning values.

    The first element of ``inputs`` is a dict of conditioning tensors. Each of
    the keys ``original_size_as_tuple``, ``crop_coords_top_left``,
    ``target_size_as_tuple`` and ``aesthetic_score`` that is present is
    replaced *in place* by a tensor built from the values stored on ``args``,
    keeping the dtype, device and shape of the tensor it replaces.

    Whether the call belongs to the positive or the negative prompt is decided
    from the incoming ``aesthetic_score`` *before* it is overwritten, and the
    decision is stored on ``args.is_positive_prompt`` so that ``hook_output``
    does not have to re-derive it from the already modified dict.

    Args:
        args: The active ``Hook`` carrying the user-selected values.
        mod: The hooked module; must be a ``CLIP_SDXL`` instance.
        inputs: Positional inputs of the module's forward call.

    Returns:
        ``inputs`` (with its dict mutated), or ``None`` when the hook is
        disabled.

    Raises:
        KeyError: If the dict has no ``aesthetic_score`` entry.
    """
    if not args.enabled:
        return

    assert isinstance(mod, CLIP_SDXL)

    cond = inputs[0]
    overwritten_keys = set()

    def put(name: str, values: list[float]):
        # Replace cond[name] only if the key exists, mirroring the original
        # tensor's dtype/device/shape so downstream code sees the same layout.
        if name in cond:
            src = cond[name]
            replacement = FloatTensor(values).to(dtype=src.dtype, device=src.device)
            cond[name] = replacement.reshape(src.shape)
            overwritten_keys.add(name)

    # Shallow snapshot of the tensors before they are replaced (for logging).
    old = dict(cond)

    put('original_size_as_tuple', [args.original_height, args.original_width])
    put('crop_coords_top_left', [args.crop_top, args.crop_left])
    put('target_size_as_tuple', [args.target_height, args.target_width])

    # Decide polarity from the *incoming* score and remember it: the value is
    # overwritten just below, so hook_output can no longer recover it from
    # the dict once the user picks a score other than the marker value.
    is_positive = cond['aesthetic_score'].item() == _POSITIVE_AESTHETIC_MARKER
    args.is_positive_prompt = is_positive

    if is_positive:
        # positive prompt
        put('aesthetic_score', [args.aesthetic_score])
    else:
        # negative prompt
        put('aesthetic_score', [args.negative_aesthetic_score])

    for k in overwritten_keys:
        print(f"{k}: {old[k].tolist()} -> {cond[k].tolist()}")

    return inputs


def hook_output(
    args: 'Hook',
    mod: nn.Module,
    inputs: tuple[dict[str, Tensor]],
    output: dict,
):
    """Post-forward hook: replace the first 1280 entries of ``output['vector']``.

    Chooses the positive or negative set of (extra prompt, token index,
    multiplier) from ``args``, computes a pooled embedding for the chosen
    prompt via ``get_pooled`` and writes ``pooled * multiplier`` into
    ``output['vector'][:, 0:1280]``.

    The polarity recorded by ``hook_input`` on ``args.is_positive_prompt`` is
    used (and cleared) when available. Only if no record exists does this
    function fall back to comparing ``aesthetic_score`` with the marker
    value, which is unreliable once ``hook_input`` has overwritten the score.

    Args:
        args: The active ``Hook`` carrying the user-selected values.
        mod: The hooked module; must be a ``CLIP_SDXL`` instance.
        inputs: Positional inputs of the module's forward call.
        output: The module's output dict; modified in place.

    Returns:
        ``output`` after modification, or ``None`` when the hook is disabled
        or all relevant settings are at their defaults (nothing to change).
    """
    if not args.enabled:
        return

    is_positive = getattr(args, 'is_positive_prompt', None)
    if is_positive is None:
        # No record from hook_input: fall back to inspecting the score.
        is_positive = inputs[0]['aesthetic_score'].item() == _POSITIVE_AESTHETIC_MARKER
    else:
        # Consume the record so it can never leak into an unrelated call.
        args.is_positive_prompt = None

    if is_positive:
        # positive prompt
        prompt = args.extra_prompt
        index = args.token_index
        multiplier = args.eot_multiplier
    else:
        # negative prompt
        prompt = args.extra_negative_prompt
        index = args.negative_token_index
        multiplier = args.negative_eot_multiplier

    if not prompt:
        if index == -1 and multiplier == 1.0:
            # default
            return
        # use original prompt
        prompt = inputs[0]['txt'][0]

    assert isinstance(mod, CLIP_SDXL)

    try:
        # Disable the hooks while get_pooled runs so they are skipped for any
        # forward pass it performs on `mod`.
        args.enabled = False
        pooled, at = get_pooled(mod, prompt, index=index)  # (1,1280)
        assert pooled.shape == (1, 1280), f'pooled.shape={pooled.shape}'
    finally:
        args.enabled = True

    output['vector'][:, 0:1280] = pooled[:] * multiplier

    print(f"vector[:, 0:1280]: {inputs[0]['txt']} -> {[prompt]} @ {at} [M={multiplier:.3f}]")

    return output


class Hook(SDHook):
    """Hook state holding the user-selected values for one generation run."""

    def __init__(
        self,
        enabled: bool,
        p: StableDiffusionProcessing,
        crop_left: float,
        crop_top: float,
        original_width: float,
        original_height: float,
        target_width: float,
        target_height: float,
        aesthetic_score: float,
        negative_aesthetic_score: float,
        extra_prompt: str|None,
        extra_negative_prompt: str|None,
        token_index: int|float,
        negative_token_index: int|float,
        eot_multiplier: float,
        negative_eot_multiplier: float,
        with_hr: bool,
    ):
        """Store all settings, coercing numeric ones to ``float``/``int``."""
        super().__init__(enabled)
        self.p = p

        self.crop_left = float(crop_left)
        self.crop_top = float(crop_top)
        self.original_width = float(original_width)
        self.original_height = float(original_height)
        self.target_width = float(target_width)
        self.target_height = float(target_height)
        self.aesthetic_score = float(aesthetic_score)
        self.negative_aesthetic_score = float(negative_aesthetic_score)

        self.extra_prompt = extra_prompt
        self.extra_negative_prompt = extra_negative_prompt
        # Slider values may arrive as floats; indices must be ints.
        self.token_index = int(token_index)
        self.negative_token_index = int(negative_token_index)
        self.eot_multiplier = float(eot_multiplier)
        self.negative_eot_multiplier = float(negative_eot_multiplier)

        self.with_hr = bool(with_hr)

        # Polarity of the forward call currently in flight: set by hook_input,
        # consumed by hook_output. None means "no record available".
        self.is_positive_prompt: bool|None = None

    def hook_clip(self, p: StableDiffusionProcessing, clip: nn.Module):
        """Attach the input/output hooks to ``clip`` when the model is SDXL."""
        if not getattr(p.sd_model, 'is_sdxl', False):
            return

        def inp(*args, **kwargs):
            return hook_input(self, *args, **kwargs)

        def outp(*args, **kwargs):
            return hook_output(self, *args, **kwargs)

        self.hook_layer_pre(clip, inp)
        self.hook_layer(clip, outp)


class Script(scripts.Script):
    """Always-visible script exposing the XL Vec controls."""

    def __init__(self):
        super().__init__()
        # Hook installed by the previous `process` call, if any.
        self.last_hooker: SDHook|None = None

    def title(self):
        """Return the script's display name."""
        return NAME

    def show(self, is_img2img):
        """Show the script in every tab."""
        return scripts.AlwaysVisible

    def ui(self, is_img2img):
        """Build the controls and return them in ``process`` argument order."""
        # NOTE(review): the nesting below was reconstructed; it assumes the
        # row holds only the two checkboxes -- confirm against the intended
        # layout.
        with gr.Accordion(NAME, open=False):
            with gr.Row():
                enabled = gr.Checkbox(label='Enabled', value=False)
                with_hr = gr.Checkbox(label='Also enable on Hires fix', value=False, visible=False)

            crop_left = gr.Slider(minimum=-512, maximum=512, step=1, value=0, label='Crop Left')
            crop_top = gr.Slider(minimum=-512, maximum=512, step=1, value=0, label='Crop Top')
            original_width = gr.Slider(minimum=-1, maximum=4096, step=1, value=-1, label='Original Width (-1 is original size)')
            original_height = gr.Slider(minimum=-1, maximum=4096, step=1, value=-1, label='Original Height (-1 is original size)')
            target_width = gr.Slider(minimum=-1, maximum=4096, step=1, value=-1, label='Target Width (-1 is original size)')
            target_height = gr.Slider(minimum=-1, maximum=4096, step=1, value=-1, label='Target Height (-1 is original size)')
            aesthetic_score = gr.Slider(minimum=0.0, maximum=10.0, step=0.05, value=6.0, label="Aesthetic Score (0..10)")
            negative_aesthetic_score = gr.Slider(minimum=0.0, maximum=10.0, step=0.05, value=2.5, label="Negative Aesthetic Score (0..10)")

            extra_prompt = gr.Textbox(lines=3, label='Extra prompt (set empty to be disabled)')
            extra_negative_prompt = gr.Textbox(lines=3, label='Extra negative prompt (set empty to be disabled)')

            token_index = gr.Slider(minimum=-77, maximum=76, step=1, value=-1, label='Token index in the prompt for the vector (-1 is first EOT)')
            negative_token_index = gr.Slider(minimum=-77, maximum=76, step=1, value=-1, label='Token index in the negative prompt for the vector (-1 is first EOT)')
            eot_multiplier = gr.Slider(minimum=-4.0, maximum=8.0, step=0.05, value=1.0, label='Token multiplier')
            negative_eot_multiplier = gr.Slider(minimum=-4.0, maximum=8.0, step=0.05, value=1.0, label='Negative token multiplier')

        return [
            enabled,
            crop_left,
            crop_top,
            original_width,
            original_height,
            target_width,
            target_height,
            aesthetic_score,
            negative_aesthetic_score,
            extra_prompt,
            extra_negative_prompt,
            token_index,
            negative_token_index,
            eot_multiplier,
            negative_eot_multiplier,
            with_hr,
        ]

    def process(
        self,
        p: StableDiffusionProcessing,
        enabled: bool,
        crop_left: float,
        crop_top: float,
        original_width: float,
        original_height: float,
        target_width: float,
        target_height: float,
        aesthetic_score: float,
        negative_aesthetic_score: float,
        extra_prompt: str,
        extra_negative_prompt: str,
        token_index: float,
        negative_token_index: float,
        eot_multiplier: float,
        negative_eot_multiplier: float,
        with_hr: bool,
    ):
        """Tear down the previous hook and, if enabled, install a fresh one.

        Negative width/height values are replaced by ``p.width``/``p.height``.
        The effective settings are recorded in ``p.extra_generation_params``
        and ``p.cached_c``/``p.cached_uc`` are reset when those attributes
        exist.
        """
        # Always remove the hook left over from the previous run first.
        if self.last_hooker is not None:
            self.last_hooker.__exit__(None, None, None)
            self.last_hooker = None

        if not enabled:
            return

        # A negative size means "use the processing object's own size".
        if original_width < 0:
            original_width = p.width
        if original_height < 0:
            original_height = p.height
        if target_width < 0:
            target_width = p.width
        if target_height < 0:
            target_height = p.height

        self.last_hooker = Hook(
            enabled=True,
            p=p,
            crop_left=crop_left,
            crop_top=crop_top,
            original_width=original_width,
            original_height=original_height,
            target_width=target_width,
            target_height=target_height,
            aesthetic_score=aesthetic_score,
            negative_aesthetic_score=negative_aesthetic_score,
            extra_prompt=extra_prompt,
            extra_negative_prompt=extra_negative_prompt,
            token_index=token_index,
            negative_token_index=negative_token_index,
            eot_multiplier=eot_multiplier,
            negative_eot_multiplier=negative_eot_multiplier,
            with_hr=with_hr,
        )

        self.last_hooker.setup(p)
        self.last_hooker.__enter__()

        p.extra_generation_params.update({
            f'[{NAME}] Enabled': enabled,
            #f'[{NAME}] With HR': with_hr,
            f'[{NAME}] Crop Left': crop_left,
            f'[{NAME}] Crop Top': crop_top,
            f'[{NAME}] Original Width': original_width,
            f'[{NAME}] Original Height': original_height,
            f'[{NAME}] Target Width': target_width,
            f'[{NAME}] Target Height': target_height,
            f'[{NAME}] Aesthetic Score': aesthetic_score,
            f'[{NAME}] Negative Aesthetic Score': negative_aesthetic_score,
            f'[{NAME}] Extra Prompt': repr(extra_prompt),
            f'[{NAME}] Extra Negative Prompt': repr(extra_negative_prompt),
            f'[{NAME}] Token Index': token_index,
            f'[{NAME}] Negative Token Index': negative_token_index,
            f'[{NAME}] EOT Multiplier': eot_multiplier,
            f'[{NAME}] Negative EOT Multiplier': negative_eot_multiplier,
        })

        # Reset the cached conditionings when the attributes exist.
        # NOTE(review): presumably so results cached under different settings
        # are not reused -- confirm against the host's caching logic.
        if hasattr(p, 'cached_c'):
            p.cached_c = [None, None]
        if hasattr(p, 'cached_uc'):
            p.cached_uc = [None, None]


init_xyz(Script, NAME)