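# nvcomposer.yaml
#
# Training configuration for NVComposer's DualStreamMultiViewDiffusionModel.
# The YAML anchors defined first (&num_frames, &resolution) are referenced
# elsewhere via *aliases so the frame count and resolution stay consistent
# across sections.
#
# Each target/params node follows the instantiate_from_config convention
# common to latent-diffusion-style codebases; a minimal sketch of how such a
# node is typically consumed (an assumption about this repo's loader, not
# verified):
#
#   import importlib
#
#   def instantiate_from_config(cfg):
#       # Split "pkg.module.ClassName" into module path and class name,
#       # import the module, and build the class from its params dict.
#       module, cls = cfg["target"].rsplit(".", 1)
#       return getattr(importlib.import_module(module), cls)(**cfg.get("params", {}))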
num_frames: &num_frames 16
resolution: &resolution [576, 1024]
model:
  base_learning_rate: 1.0e-5
  scale_lr: false
  target: core.models.diffusion.DualStreamMultiViewDiffusionModel
  params:
    use_task_embedding: false
    ray_as_image: false
    apply_condition_mask_in_training_loss: true
    separate_noise_and_condition: true
    condition_padding_with_anchor: false
    use_ray_decoder_loss_high_frequency_isolation: false
    train_with_multi_view_feature_alignment: true
    use_text_cross_attention_condition: false

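    # Diffusion schedule: the standard Stable-Diffusion linear beta schedule
    # (0.00085 -> 0.012) over 1000 steps; v-prediction and zero-terminal-SNR
    # beta rescaling are enabled further below.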
    linear_start: 0.00085
    linear_end: 0.012
    num_time_steps_cond: 1
    log_every_t: 200
    time_steps: 1000
    
    data_key_images: combined_images
    data_key_rays: combined_rays
    data_key_text_condition: caption
    cond_stage_trainable: false
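    # Latent-space size: 576x1024 input / 8x VAE downsampling = 72x128.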
    image_size: [72, 128]
    
    channels: 10
    monitor: global_step
    scale_by_std: false
    scale_factor: 0.18215
    use_dynamic_rescale: true
    base_scale: 0.3
    
    use_ema: false
    uncond_prob: 0.05
    uncond_type: 'empty_seq'
    
    use_camera_pose_query_transformer: false
    random_cond: false
    cond_concat: true
    frame_mask: false
    padding: true
    per_frame_auto_encoding: true
    parameterization: "v"
    rescale_betas_zero_snr: true
    use_noise_offset: false
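    # LR schedule: lambda-based decay to final_decay_ratio (0.1x the base LR)
    # over decay_steps. interval/frequency read like a PyTorch Lightning
    # scheduler dict, i.e. stepped per training step, every 100 steps
    # (assumption; exact decay shape depends on utils.lr_scheduler).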
    scheduler_config:
      target: utils.lr_scheduler.LambdaLRScheduler
      interval: 'step'
      frequency: 100
      params:
        start_step: 0
        final_decay_ratio: 0.1
        decay_steps: 100
    bd_noise: false

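    # Denoising U-Net. in_channels (20) is 2x the latent channels (10),
    # presumably because condition frames are concatenated along the channel
    # axis (cond_concat: true above); the 10 latent channels are presumably
    # 4 VAE channels + 6 ray channels (matching ray_channels: 6 below).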
    unet_config:
      target: core.modules.networks.unet_modules.UNetModel
      params:
        in_channels: 20
        out_channels: 10
        model_channels: 320
        attention_resolutions: [4, 2, 1]
        num_res_blocks: 2
        channel_mult: [1, 2, 4, 4]
        dropout: 0.1
        num_head_channels: 64
        transformer_depth: 1
        context_dim: 1024
        use_linear: true
        use_checkpoint: true
        temporal_conv: true
        temporal_attention: true
        temporal_selfatt_only: true
        use_relative_position: false
        use_causal_attention: false
        temporal_length: *num_frames
        addition_attention: true
        image_cross_attention: true
        image_cross_attention_scale_learnable: true
        default_fs: 3
        fs_condition: false
        use_spatial_temporal_attention: true
        use_addition_ray_output_head: true
        ray_channels: 6
        use_lora_for_rays_in_output_blocks: false
        use_task_embedding: false
        use_ray_decoder: true
        use_ray_decoder_residual: true
        full_spatial_temporal_attention: true
        enhance_multi_view_correspondence: false
        camera_pose_condition: true
        use_feature_alignment: true

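    # First stage: KL-regularized VAE in the standard SD autoencoder layout
    # (4 latent channels, ch_mult [1, 2, 4, 4] -> 8x downsampling).
    # lossconfig is torch.nn.Identity, consistent with the VAE being used
    # frozen for encode/decode only.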
    first_stage_config:
      target: core.models.autoencoder.AutoencoderKL
      params:
        embed_dim: 4
        monitor: val/rec_loss
        ddconfig:
          double_z: true
          z_channels: 4
          resolution: 256
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult: [1, 2, 4, 4]
          num_res_blocks: 2
          attn_resolutions: []
          dropout: 0.0
        lossconfig:
          target: torch.nn.Identity

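    # Image-conditioning encoder: a frozen OpenCLIP image embedder,
    # presumably feeding its token features to the Resampler below.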
    cond_img_config:
      target: core.modules.encoders.condition.FrozenOpenCLIPImageEmbedderV2
      params:
        freeze: true

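    # Perceiver-style Resampler projecting 1280-dim CLIP image tokens into
    # num_queries=16 tokens of dim 1024, matching the U-Net's context_dim for
    # cross-attention; 1280 is consistent with an OpenCLIP ViT-H image
    # backbone (assumption).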
    image_proj_model_config:
      target: core.modules.encoders.resampler.Resampler
      params:
        dim: 1024
        depth: 4
        dim_head: 64
        heads: 12
        num_queries: 16
        embedding_dim: 1280
        output_dim: 1024
        ff_mult: 4
        video_length: *num_frames