num_classes: 1000  # timesteps for noise conditioning (here constant, just need one)
image_size: 128
...
...
@@ -49,8 +45,6 @@ model:
# Boolean flag; a space after the colon is required for this to parse as key/value.
use_linear_in_transformer: True
first_stage_config:
target:ldm.models.autoencoder.AutoencoderKL
params:
embed_dim:4
ddconfig:
# attn_type: "vanilla-xformers" this model needs efficient attention to be feasible on HR data, also the decoder seems to break in half precision (UNet is fine though)