# cogview3_relay.yaml
# Inference configuration for the CogView3 relay model (see `args` and `model` below).
# Runtime arguments for a relay-model inference run.
args:
  mode: inference
  relay_model: true
  fp16: true
  force_inference: true
  deepspeed_config: {}

  # Checkpoint location; absolute, machine-specific path.
  load: '/home/models/CogView4/CogView3/cogview3-relay/transformer'

  # Inputs.
  input_type: txt
  input_file: 'configs/test.txt'
  # NOTE(review): presumably the directory written by the base-stage run; confirm.
  input_dir: 'outputs/cogview3_base-512x512'

  # Sampling and outputs.
  batch_size: 4
  grid_num_columns: 2
  sampling_image_size: 1024
  output_dir: 'outputs/cogview3_relay-1024x1024'

# Model definition. Each nested *_config entry below names a class through a
# dotted `target` path, optionally with a `params` mapping.
model:
  scale_factor: 0.13025
  disable_first_stage_autocast: true
  lr_scale: 2
  log_keys: [txt]
  
  # Denoiser: a discrete denoiser over 1000 indices, with its discretization,
  # scaling and weighting helpers each given as a `target`.
  denoiser_config:
    target: sgm.modules.diffusionmodules.denoiser.DiscreteDenoiser
    params:
      num_idx: 1000
      discretization_config:
        target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization
      scaling_config:
        target: sgm.modules.diffusionmodules.denoiser_scaling.EpsScaling
      weighting_config:
        target: sgm.modules.diffusionmodules.denoiser_weighting.EpsWeighting

  # Denoising network (UNet) and its hyper-parameters.
  network_config:
    target: sgm.modules.diffusionmodules.openaimodel.UNetModel
    params:
      # Channel layout.
      in_channels: 4
      out_channels: 4
      model_channels: 320
      channel_mult: [1, 2, 4]
      num_res_blocks: 2

      # Conditioning inputs.
      adm_in_channels: 1536
      num_classes: sequential
      context_dim: 4096

      # Attention / transformer blocks.
      attention_resolutions: [4, 2]
      num_head_channels: 64
      use_spatial_transformer: true
      use_linear_in_transformer: true
      # The first depth entry is unused because attention only starts at
      # resolution factor 2.
      # NOTE(review): the original note also listed "32, 16, 8 --> 64, 32, 16",
      # presumably per-level feature-map sizes; confirm.
      transformer_depth: [1, 2, 10]
      spatial_transformer_attn_type: softmax-xformers

      # Execution options.
      use_checkpoint: true
      use_fp16: true
      legacy: false

  # Conditioning: one text embedder (cross-attention) followed by three
  # size/crop embedders (vector conditioning). None of them is trainable.
  # The order of the list entries is kept exactly as in the original.
  conditioner_config:
    target: sgm.modules.GeneralConditioner
    params:
      emb_models:
        # Cross-attention conditioning from the text prompt.
        - target: sgm.modules.encoders.modules.FrozenT5Embedder
          input_key: txt
          is_trainable: false
          params:
            model_dir: '/home/models/CogView4/t5-v1_1-xxl'
            max_length: 225

        # Vector conditioning: original image size.
        - target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
          input_key: original_size_as_tuple
          is_trainable: false
          params:
            outdim: 256  # multiplied by two

        # Vector conditioning: crop offset (top-left corner).
        - target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
          input_key: crop_coords_top_left
          is_trainable: false
          params:
            outdim: 256  # multiplied by two

        # Vector conditioning: target image size.
        - target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
          input_key: target_size_as_tuple
          is_trainable: false
          params:
            outdim: 256  # multiplied by two

  # First stage (autoencoder), loaded from a safetensors checkpoint.
  first_stage_config:
    target: sgm.models.autoencoder.AutoencoderKLInferenceWrapper
    params:
      # Absolute, machine-specific checkpoint path.
      ckpt_path: '/home/models/CogView4/CogView3/cogview3-relay/vae/sdxl_vae.safetensors'
      embed_dim: 4
      monitor: val/rec_loss
      # The loss is an identity module here.
      lossconfig:
        target: torch.nn.Identity
      # Encoder/decoder architecture settings.
      ddconfig:
        resolution: 256
        in_channels: 3
        out_ch: 3
        z_channels: 4
        double_z: true
        ch: 128
        ch_mult: [1, 2, 4, 4]
        num_res_blocks: 2
        attn_type: vanilla-xformers
        attn_resolutions: []
        dropout: 0.0
  
  # Loss definition with its sigma sampler.
  # NOTE(review): args.mode is `inference`, so this is presumably not
  # exercised by this run; confirm.
  loss_fn_config:
    target: sgm.modules.diffusionmodules.loss.LinearRelayDiffusionLoss
    params:
      partial_num_steps: 500
      offset_noise_level: 0.05
      sigma_sampler_config:
        target: sgm.modules.diffusionmodules.sigma_sampling.DiscreteSampling
        params:
          num_idx: 1000
          discretization_config:
            target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization

  # Sampler used at inference time, with classifier-free guidance.
  sampler_config:
    target: sgm.modules.diffusionmodules.sampling.LinearRelayEDMSampler
    params:
      # Suggested step configuration.
      num_steps: 24
      partial_num_steps: 12
      verbose: true

      discretization_config:
        target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization

      # Guidance settings.
      guider_config:
        target: sgm.modules.diffusionmodules.guiders.VanillaCFG
        params:
          scale: 7.5