# Flat run of network hyperparameters. The mapping header that owns these keys
# is not visible in this part of the file, so they stay at the column they
# were found at.
# NOTE(review): key names such as attention_resolutions and
# use_spatial_transformer suggest these are the UNet's params -- confirm
# against the enclosing key.
# NOTE(review): each key is listed exactly once; mapping keys must be unique,
# and most parsers silently keep only the last occurrence of a repeated key.

# timesteps for noise conditioning (here constant, just need one)
num_classes: 1000
image_size: 128
in_channels: 7
out_channels: 4
model_channels: 256
attention_resolutions: [2, 4, 8]
num_res_blocks: 2
# Four entries here and four in disable_self_attentions below.
# NOTE(review): presumably one flag per channel_mult level -- confirm.
channel_mult: [1, 2, 2, 4]
disable_self_attentions: [true, true, true, false]
disable_middle_self_attn: false
num_heads: 8
use_spatial_transformer: true
transformer_depth: 1
context_dim: 1024
legacy: false
use_linear_in_transformer: true
use_checkpoint: true
# First-stage model configuration.
# NOTE(review): `target` looks like a dotted import path of the class that is
# built with `params` -- confirm against the code that loads this file.
# NOTE(review): the nesting below is derived from the key names
# (first_stage_config -> params -> ddconfig); verify it against the
# consumer's expected schema.
first_stage_config:
  target: ldm.models.autoencoder.AutoencoderKL
  params:
    embed_dim: 4
    ddconfig:
      # attn_type: "vanilla-xformers"
      # ^ commented out. Note kept from the authors: this model needs
      # efficient attention to be feasible on HR data, also the decoder seems
      # to break in half precision (UNet is fine though).
      double_z: true
      z_channels: 4
      resolution: 256
      in_channels: 3
      out_ch: 3
      ch: 128
      ch_mult: [1, 2, 4]  # num_down = len(ch_mult)-1
      num_res_blocks: 2
      attn_resolutions: []
      dropout: 0.0
embed_dim:4
ddconfig:
# attn_type: "vanilla-xformers" this model needs efficient attention to be feasible on HR data, also the decoder seems to break in half precision (UNet is fine though)