licyk committed
Commit d827b6a · verified · Parent: 9b5d11e

Upload 2 files
configs/stable-diffusion/sd_xl_inpaint.yaml ADDED
@@ -0,0 +1,98 @@
model:
  target: sgm.models.diffusion.DiffusionEngine
  params:
    scale_factor: 0.13025
    disable_first_stage_autocast: True

    denoiser_config:
      target: sgm.modules.diffusionmodules.denoiser.DiscreteDenoiser
      params:
        num_idx: 1000

        weighting_config:
          target: sgm.modules.diffusionmodules.denoiser_weighting.EpsWeighting
        scaling_config:
          target: sgm.modules.diffusionmodules.denoiser_scaling.EpsScaling
        discretization_config:
          target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization

    network_config:
      target: sgm.modules.diffusionmodules.openaimodel.UNetModel
      params:
        adm_in_channels: 2816
        num_classes: sequential
        use_checkpoint: True
        in_channels: 9
        out_channels: 4
        model_channels: 320
        attention_resolutions: [4, 2]
        num_res_blocks: 2
        channel_mult: [1, 2, 4]
        num_head_channels: 64
        use_spatial_transformer: True
        use_linear_in_transformer: True
        transformer_depth: [1, 2, 10]  # note: the first is unused (due to attn_res starting at 2) 32, 16, 8 --> 64, 32, 16
        context_dim: 2048
        spatial_transformer_attn_type: softmax-xformers
        legacy: False

    conditioner_config:
      target: sgm.modules.GeneralConditioner
      params:
        emb_models:
          # crossattn cond
          - is_trainable: False
            input_key: txt
            target: sgm.modules.encoders.modules.FrozenCLIPEmbedder
            params:
              layer: hidden
              layer_idx: 11
          # crossattn and vector cond
          - is_trainable: False
            input_key: txt
            target: sgm.modules.encoders.modules.FrozenOpenCLIPEmbedder2
            params:
              arch: ViT-bigG-14
              version: laion2b_s39b_b160k
              freeze: True
              layer: penultimate
              always_return_pooled: True
              legacy: False
          # vector cond
          - is_trainable: False
            input_key: original_size_as_tuple
            target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
            params:
              outdim: 256  # multiplied by two
          # vector cond
          - is_trainable: False
            input_key: crop_coords_top_left
            target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
            params:
              outdim: 256  # multiplied by two
          # vector cond
          - is_trainable: False
            input_key: target_size_as_tuple
            target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
            params:
              outdim: 256  # multiplied by two

    first_stage_config:
      target: sgm.models.autoencoder.AutoencoderKLInferenceWrapper
      params:
        embed_dim: 4
        monitor: val/rec_loss
        ddconfig:
          attn_type: vanilla-xformers
          double_z: true
          z_channels: 4
          resolution: 256
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult: [1, 2, 4, 4]
          num_res_blocks: 2
          attn_resolutions: []
          dropout: 0.0
        lossconfig:
          target: torch.nn.Identity
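
The `target:`/`params:` pairs above follow the `instantiate_from_config` convention used throughout the sgm codebase: each `target` is a dotted import path and `params` becomes the constructor's keyword arguments. The UNet's `in_channels: 9` is what makes this an inpainting config: the 4-channel noisy latent is concatenated with a 4-channel masked-image latent plus a 1-channel mask. A minimal loading sketch, assuming the Stability AI generative-models repo (which provides `sgm`) is installed; the checkpoint path below is hypothetical:

import torch
from omegaconf import OmegaConf
from sgm.util import instantiate_from_config

# Parse the YAML; every `target:` node names a class to import, and its
# `params:` become constructor keyword arguments (applied recursively).
config = OmegaConf.load("configs/stable-diffusion/sd_xl_inpaint.yaml")
model = instantiate_from_config(config.model)

# Hypothetical checkpoint path; the weights must match the 9-channel
# UNet input (4 noisy latent + 4 masked-image latent + 1 mask).
state = torch.load("checkpoints/sd_xl_inpaint.ckpt", map_location="cpu")
model.load_state_dict(state.get("state_dict", state), strict=False)
model.eval()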
configs/stable-diffusion/v2-midas-inference.yaml ADDED
@@ -0,0 +1,74 @@
model:
  base_learning_rate: 5.0e-07
  target: ldm.models.diffusion.ddpm.LatentDepth2ImageDiffusion
  params:
    linear_start: 0.00085
    linear_end: 0.0120
    num_timesteps_cond: 1
    log_every_t: 200
    timesteps: 1000
    first_stage_key: "jpg"
    cond_stage_key: "txt"
    image_size: 64
    channels: 4
    cond_stage_trainable: false
    conditioning_key: hybrid
    scale_factor: 0.18215
    monitor: val/loss_simple_ema
    finetune_keys: null
    use_ema: False

    depth_stage_config:
      target: ldm.modules.midas.api.MiDaSInference
      params:
        model_type: "dpt_hybrid"

    unet_config:
      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
      params:
        use_checkpoint: True
        image_size: 32  # unused
        in_channels: 5
        out_channels: 4
        model_channels: 320
        attention_resolutions: [ 4, 2, 1 ]
        num_res_blocks: 2
        channel_mult: [ 1, 2, 4, 4 ]
        num_head_channels: 64  # need to fix for flash-attn
        use_spatial_transformer: True
        use_linear_in_transformer: True
        transformer_depth: 1
        context_dim: 1024
        legacy: False

    first_stage_config:
      target: ldm.models.autoencoder.AutoencoderKL
      params:
        embed_dim: 4
        monitor: val/rec_loss
        ddconfig:
          # attn_type: "vanilla-xformers"
          double_z: true
          z_channels: 4
          resolution: 256
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult:
          - 1
          - 2
          - 4
          - 4
          num_res_blocks: 2
          attn_resolutions: [ ]
          dropout: 0.0
        lossconfig:
          target: torch.nn.Identity

    cond_stage_config:
      target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder
      params:
        freeze: True
        layer: "penultimate"
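
This config is consumed the same way, via `instantiate_from_config` from the `ldm` package of the stablediffusion repo. The UNet's `in_channels: 5` reflects the `hybrid` conditioning: a 1-channel MiDaS depth map is concatenated to the 4-channel latent, while the OpenCLIP text embedding enters through cross-attention (`context_dim: 1024`). A minimal loading sketch, assuming the stablediffusion repo is on the Python path; the checkpoint filename is the usual one for the SD 2.0 depth release but should be treated as an assumption:

import torch
from omegaconf import OmegaConf
from ldm.util import instantiate_from_config

config = OmegaConf.load("configs/stable-diffusion/v2-midas-inference.yaml")

# LatentDepth2ImageDiffusion is built from the `model` node; the MiDaS
# depth estimator (depth_stage_config) is constructed as part of it.
model = instantiate_from_config(config.model)

# Assumed checkpoint name (SD 2.0 depth model); adjust to your local path.
state = torch.load("512-depth-ema.ckpt", map_location="cpu")["state_dict"]
model.load_state_dict(state, strict=False)
model.eval()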