{
  "vae": {
    "_class_name": "CausalVideoAutoencoder",
    "dims": 3,
    "in_channels": 3,
    "out_channels": 3,
    "latent_channels": 128,
    "encoder_blocks": [
      ["res_x", { "num_layers": 4 }],
      ["compress_space_res", { "multiplier": 2 }],
      ["res_x", { "num_layers": 6 }],
      ["compress_time_res", { "multiplier": 2 }],
      ["res_x", { "num_layers": 6 }],
      ["compress_all_res", { "multiplier": 2 }],
      ["res_x", { "num_layers": 2 }],
      ["compress_all_res", { "multiplier": 2 }],
      ["res_x", { "num_layers": 2 }]
    ],
    "decoder_blocks": [
      ["res_x", { "num_layers": 5, "inject_noise": false }],
      ["compress_all", { "residual": true, "multiplier": 2 }],
      ["res_x", { "num_layers": 5, "inject_noise": false }],
      ["compress_all", { "residual": true, "multiplier": 2 }],
      ["res_x", { "num_layers": 5, "inject_noise": false }],
      ["compress_all", { "residual": true, "multiplier": 2 }],
      ["res_x", { "num_layers": 5, "inject_noise": false }]
    ],
    "scaling_factor": 1.0,
    "norm_layer": "pixel_norm",
    "patch_size": 4,
    "latent_log_var": "uniform",
    "use_quant_conv": false,
    "causal_decoder": false,
    "timestep_conditioning": false,
    "normalize_latent_channels": false,
    "encoder_base_channels": 128,
    "decoder_base_channels": 128
  },
  "_class_name": "CausalVideoAutoencoder"
}