# cogview3_base.yaml

# Run-level arguments: mode, checkpoint location, prompt input, precision and
# output settings.
args:
  mode: inference
  relay_model: false
  # Absolute, machine-specific checkpoint path; adjust for the local setup.
  load: "/home/models/CogView4/CogView3/cogview3-base/transformer"
  batch_size: 4
  # NOTE(review): with batch_size 4 this presumably yields a 2x2 image grid
  # — confirm against the sampling script.
  grid_num_columns: 2
  # Input source: type and path of the prompt file.
  input_type: txt
  input_file: "configs/test.txt"
  fp16: true
  force_inference: true
  # Keep in sync with the "512x512" suffix of output_dir below.
  sampling_image_size: 512
  output_dir: "outputs/cogview3_base-512x512"
  deepspeed_config: { }

# Model definition: top-level scalars first, then one sub-config per component.
model:
  scale_factor: 0.13025
  disable_first_stage_autocast: true
  # Only the txt key is listed for logging.
  log_keys: [ txt ]
  
  # Denoiser: DiscreteDenoiser with 1000 indices, built from the Eps weighting
  # and Eps scaling classes plus the LegacyDDPMDiscretization discretizer.
  denoiser_config:
    target: sgm.modules.diffusionmodules.denoiser.DiscreteDenoiser
    params:
      num_idx: 1000

      # Each sub-config below names only a target class and passes no params.
      weighting_config:
        target: sgm.modules.diffusionmodules.denoiser_weighting.EpsWeighting
      scaling_config:
        target: sgm.modules.diffusionmodules.denoiser_scaling.EpsScaling
      discretization_config:
        target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization

  # Denoising network: UNetModel with spatial-transformer attention enabled.
  network_config:
    target: sgm.modules.diffusionmodules.openaimodel.UNetModel
    params:
      # NOTE(review): presumably 3 vector conditioners x 2 values x outdim 256
      # from conditioner_config — confirm.
      adm_in_channels: 1536
      num_classes: sequential
      use_checkpoint: true
      use_fp16: true
      # in/out channels equal z_channels (4) in first_stage_config.ddconfig.
      in_channels: 4
      out_channels: 4
      model_channels: 320
      attention_resolutions: [ 4, 2 ]
      num_res_blocks: 2
      channel_mult: [ 1, 2, 4 ]
      num_head_channels: 64
      use_spatial_transformer: true
      use_linear_in_transformer: true
      # note: the first is unused (due to attn_res starting at 2)
      # 32, 16, 8 --> 64, 32, 16
      transformer_depth: [ 1, 2, 10 ]
      # NOTE(review): presumably the feature width of the text embedder in
      # conditioner_config — confirm.
      context_dim: 4096
      spatial_transformer_attn_type: softmax-xformers
      legacy: false

  # Conditioning: one text embedder (cross-attention) followed by three
  # ConcatTimestepEmbedderND vector embedders for size/crop inputs.
  # All embedders are frozen (is_trainable: false).
  conditioner_config:
    target: sgm.modules.GeneralConditioner
    params:
      emb_models:

        # crossattn cond
        - is_trainable: false
          input_key: txt
          target: sgm.modules.encoders.modules.FrozenT5Embedder
          params:
            # Absolute, machine-specific model path; adjust for the local setup.
            model_dir: "/home/models/CogView4/t5-v1_1-xxl"
            max_length: 225

        # vector cond
        - is_trainable: false
          input_key: original_size_as_tuple
          target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
          params:
            outdim: 256  # multiplied by two
        # vector cond
        - is_trainable: false
          input_key: crop_coords_top_left
          target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
          params:
            outdim: 256  # multiplied by two
        # vector cond
        - is_trainable: false
          input_key: target_size_as_tuple
          target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
          params:
            outdim: 256  # multiplied by two

  # First-stage autoencoder; weights come from the sdxl_vae.safetensors
  # checkpoint referenced below.
  first_stage_config:
    target: sgm.models.autoencoder.AutoencoderKLInferenceWrapper
    params:
      # Absolute, machine-specific checkpoint path; adjust for the local setup.
      ckpt_path: "/home/models/CogView4/CogView3/cogview3-base/vae/sdxl_vae.safetensors"
      embed_dim: 4
      monitor: val/rec_loss
      ddconfig:
        attn_type: vanilla-xformers
        double_z: true
        # z_channels equals in_channels/out_channels (4) in network_config.
        z_channels: 4
        resolution: 256
        in_channels: 3
        out_ch: 3
        ch: 128
        ch_mult: [ 1, 2, 4, 4 ]
        num_res_blocks: 2
        # Empty list: no attention resolutions are configured.
        attn_resolutions: [ ]
        dropout: 0.0
      # torch.nn.Identity is a pass-through module.
      # NOTE(review): presumably a placeholder because args.mode is
      # inference — confirm.
      lossconfig:
        target: torch.nn.Identity
  
  # Loss function target is torch.nn.Identity, a pass-through module.
  # NOTE(review): presumably a placeholder because args.mode is inference
  # — confirm.
  loss_fn_config:
    target: torch.nn.Identity

  # Sampler: EulerEDMSampler running 10 steps with verbose output enabled.
  sampler_config:
    target: sgm.modules.diffusionmodules.sampling.EulerEDMSampler
    params:
      num_steps: 10
      verbose: true

      # Same discretization target class as denoiser_config uses.
      discretization_config:
        target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization

      # Guidance: VanillaCFG with scale 7.5.
      guider_config:
        target: sgm.modules.diffusionmodules.guiders.VanillaCFG
        params:
          scale: 7.5