# UNet backbone hyperparameters.
# NOTE(review): source was corrupted — every key appeared twice (two interleaved
# copies) and all `key:value` pairs lacked the space after `:` required for YAML
# to parse them as mapping entries. Deduplicated and reformatted; nesting under
# first_stage_config reconstructed from the key structure — confirm against the
# upstream config.
use_checkpoint: true
num_classes: 1000  # timesteps for noise conditioning (here constant, just need one)
image_size: 128
in_channels: 7
out_channels: 4
model_channels: 256
attention_resolutions: [2, 4, 8]
num_res_blocks: 2
channel_mult: [1, 2, 2, 4]
disable_self_attentions: [true, true, true, false]
disable_middle_self_attn: false
num_heads: 8
use_spatial_transformer: true
transformer_depth: 1
context_dim: 1024
legacy: false
use_linear_in_transformer: true

first_stage_config:
  target: ldm.models.autoencoder.AutoencoderKL
  params:
    embed_dim: 4
    ddconfig:
      # attn_type: "vanilla-xformers" this model needs efficient attention to be feasible on HR data, also the decoder seems to break in half precision (UNet is fine though)
      double_z: true