# higen_train.yaml
TASK_TYPE: train_t2v_higen_entrance
ENABLE: true
use_ema: true
num_workers: 6
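# frame_lens and sample_fps are indexed per rank in VGen-style trainers, so every
# worker here would see 32-frame clips at 8 fps; this mapping is an assumption.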
frame_lens: [32, 32, 32, 32, 32, 32, 32, 32]
sample_fps: [8, 8, 8, 8, 8, 8, 8, 8]
resolution: [448, 256]
vit_resolution: [224, 224]
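# Training data: each data_list file is expected to index samples stored under the
# matching data_dir_list entry; joint video/image training is assumed.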
vid_dataset: {
    'type': 'VideoDataset',
    'data_list': ['data/vid_list.txt', ],
    'data_dir_list': ['data/videos/', ],
    'vit_resolution': [224, 224],
    'resolution': [448, 256],
    'get_first_frame': True,
    'max_words': 1000,
}
img_dataset: {
    'type': 'ImageDataset',
    'data_list': ['data/img_list.txt', ],
    'data_dir_list': ['data/images', ],
    'vit_resolution': [224, 224],
    'resolution': [448, 256],
    'max_words': 1000
}
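# Frozen OpenCLIP text/visual encoder used for conditioning; its 1024-dim features
# are presumably why y_dim and context_dim in the UNet below are set to 1024.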
embedder: {
    'type': 'FrozenOpenCLIPTextVisualEmbedder',
    'layer': 'penultimate',
    'vit_resolution': [224, 224],
    'pretrained': 'models/open_clip_pytorch_model.bin'
}
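# HiGen denoising UNet in a 4-channel latent space (in_dim/out_dim = 4, matching the
# Stable Diffusion VAE latents); use_checkpoint presumably enables gradient checkpointing.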
UNet: {
    'type': 'UNetSD_HiGen',
    'in_dim': 4,
    'y_dim': 1024,
    'upper_len': 128,
    'context_dim': 1024,
    'concat_dim': 4,
    'out_dim': 4,
    'dim_mult': [1, 2, 4, 4],
    'num_heads': 8,
    'default_fps': 8,
    'head_dim': 64,
    'num_res_blocks': 2,
    'dropout': 0.1,
    'temporal_attention': True,
    'temporal_attn_times': 1,
    'use_checkpoint': True,
    'use_fps_condition': False,
    'use_sim_mask': False,
    'context_embedding_depth': 2,
    'num_tokens': 16
}
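# DDIM diffusion with v-prediction and MSE loss; init_beta/last_beta follow the
# standard Stable Diffusion linear schedule, rescaled for zero terminal SNR.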
Diffusion: {
    'type': 'DiffusionDDIM',
    'schedule': 'linear_sd', # linear_sd
    'schedule_param': {
        'num_timesteps': 1000,
        'zero_terminal_snr': True,
        'init_beta': 0.00085,
        'last_beta': 0.0120
    },
    'mean_type': 'v',
    'loss_type': 'mse',
    'var_type': 'fixed_small',
    'rescale_timesteps': False,
    'noise_strength': 0.1
}
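# Batch size keyed by clip length in frames; with frame_lens of 32 above, batches of
# 10 clips are used (reading these as per-GPU sizes is an assumption).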
batch_sizes: {
    "1": 256,
    "4": 96,
    "8": 48,
    "16": 32,
    "24": 24,
    "32": 10
}
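# Periodic visualization during training; partial_keys lists the conditioning
# combinations to preview, and guide_scale 9.0 is assumed to apply only to these samples.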
visual_train: {
    'type': 'VisualTrainTextImageToVideo',
    'partial_keys': [
        # ['y', 'local_image', 'fps'],
        # ['image', 'local_image', 'fps'],
        ['y', 'image', 'local_image', 'fps']
    ],
    'use_offset_noise': True,
    'guide_scale': 9.0, 
}

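# Warm start from an i2vgen_xl checkpoint; fix_weight False keeps the loaded weights
# trainable, and grad_scale 0.5 presumably down-weights their gradients.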
Pretrain: {
    'type': 'pretrain_specific_strategies',
    'fix_weight': False,
    'grad_scale': 0.5,
    'resume_checkpoint': 'models/i2vgen_xl_00854500.pth',
    'sd_keys_path': 'models/stable_diffusion_image_key_temporal_attention_x1.json',
}

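# chunk_size and decoder_bs likely control chunked VAE encode/decode batching during
# sampling; lr is the base learning rate.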
chunk_size: 4
decoder_bs: 4
lr: 0.00003

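# Read here as the offset-noise strength paired with use_offset_noise in visual_train;
# it mirrors Diffusion.noise_strength above (assumption).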
noise_strength: 0.1
# classifier-free guidance
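# p_zero is taken to be the probability of dropping the condition during training
# (0.0 disables dropout); guide_scale 3.0 is assumed to be the evaluation CFG scale.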
p_zero: 0.0
guide_scale: 3.0
num_steps: 1000000

use_zero_infer: True
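# Visualization and checkpoint cadence in training steps; the inline comments appear
# to record earlier interval values (200 / 500).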
viz_interval: 50        # 200
save_ckp_interval: 50   # 500

# Log
log_dir: "workspace/experiments"
log_interval: 1
seed: 6666