---
# Inference configuration (mode: inference) with two top-level sections:
#   args  - run-level settings (checkpoint path, batch size, I/O, precision)
#   model - component definitions, each given as a `target` dotted path with
#           optional `params`
#
# NOTE(review): this file was restored from a copy whose line breaks and
# indentation had been collapsed. Keys, values and their order are unchanged;
# the nesting below was rebuilt from the `target`/`params` pattern and should
# be confirmed against the code that consumes this config.
#
# NOTE(review): `load`, `model_dir` and `ckpt_path` are absolute,
# machine-specific paths; adjust them for the local environment.

args:
  mode: inference
  relay_model: false
  load: "/home/models/CogView4/CogView3/cogview3-base/transformer"
  batch_size: 4
  grid_num_columns: 2
  input_type: txt
  input_file: "configs/test.txt"
  fp16: true
  force_inference: true
  sampling_image_size: 512
  output_dir: "outputs/cogview3_base-512x512"
  deepspeed_config: {}

model:
  scale_factor: 0.13025
  disable_first_stage_autocast: true
  log_keys:
    - txt

  denoiser_config:
    target: sgm.modules.diffusionmodules.denoiser.DiscreteDenoiser
    params:
      num_idx: 1000

      weighting_config:
        target: sgm.modules.diffusionmodules.denoiser_weighting.EpsWeighting
      scaling_config:
        target: sgm.modules.diffusionmodules.denoiser_scaling.EpsScaling
      discretization_config:
        target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization

  network_config:
    target: sgm.modules.diffusionmodules.openaimodel.UNetModel
    params:
      # 1536 equals 3 vector conditioners x (256 x 2) as configured under
      # conditioner_config below - presumably these must stay in sync.
      adm_in_channels: 1536
      num_classes: sequential
      use_checkpoint: true
      use_fp16: true
      in_channels: 4
      out_channels: 4
      model_channels: 320
      attention_resolutions: [4, 2]
      num_res_blocks: 2
      channel_mult: [1, 2, 4]
      num_head_channels: 64
      use_spatial_transformer: true
      use_linear_in_transformer: true
      # note: the first is unused (due to attn_res starting at 2)
      # 32, 16, 8 --> 64, 32, 16
      transformer_depth: [1, 2, 10]
      context_dim: 4096
      spatial_transformer_attn_type: softmax-xformers
      legacy: false

  conditioner_config:
    target: sgm.modules.GeneralConditioner
    params:
      emb_models:
        # crossattn cond
        - is_trainable: false
          input_key: txt
          target: sgm.modules.encoders.modules.FrozenT5Embedder
          params:
            model_dir: "/home/models/CogView4/t5-v1_1-xxl"
            max_length: 225
        # vector cond
        - is_trainable: false
          input_key: original_size_as_tuple
          target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
          params:
            outdim: 256  # multiplied by two
        # vector cond
        - is_trainable: false
          input_key: crop_coords_top_left
          target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
          params:
            outdim: 256  # multiplied by two
        # vector cond
        - is_trainable: false
          input_key: target_size_as_tuple
          target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
          params:
            outdim: 256  # multiplied by two

  first_stage_config:
    target: sgm.models.autoencoder.AutoencoderKLInferenceWrapper
    params:
      ckpt_path: "/home/models/CogView4/CogView3/cogview3-base/vae/sdxl_vae.safetensors"
      embed_dim: 4
      monitor: val/rec_loss
      ddconfig:
        attn_type: vanilla-xformers
        double_z: true
        z_channels: 4
        resolution: 256
        in_channels: 3
        out_ch: 3
        ch: 128
        ch_mult: [1, 2, 4, 4]
        num_res_blocks: 2
        attn_resolutions: []
        dropout: 0.0
      lossconfig:
        target: torch.nn.Identity

  # Identity placeholder for the loss function; `mode: inference` above
  # suggests no training loss is needed - TODO confirm.
  loss_fn_config:
    target: torch.nn.Identity

  sampler_config:
    target: sgm.modules.diffusionmodules.sampling.EulerEDMSampler
    params:
      num_steps: 10
      verbose: true

      discretization_config:
        target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization

      guider_config:
        target: sgm.modules.diffusionmodules.guiders.VanillaCFG
        params:
          scale: 7.5