# Training configuration for the `train_i2v_vs_img_text_entrance` task.
#
# Layout: one setting per line, block style. Nested sections carry a `type`
# key naming the component to use (presumably resolved by the project's
# registry/loader -- confirm against the code that reads this file).

TASK_TYPE: train_i2v_vs_img_text_entrance
ENABLE: true
use_ema: true
num_workers: 6

# Both lists have 8 entries.
# NOTE(review): presumably entry i of frame_lens pairs with entry i of
# sample_fps -- confirm against the data loader.
frame_lens: [16, 16, 16, 16, 16, 32, 32, 32]
sample_fps: [8, 8, 16, 16, 16, 8, 16, 16]

resolution: [1280, 704]
vit_resolution: [224, 224]

# Video dataset spec. `resolution` / `vit_resolution` repeat the top-level
# values above.
vid_dataset:
  type: 'VideoDataset'
  data_list:
    - 'data/vid_list.txt'
  data_dir_list:
    - 'data/videos/'
  vit_resolution: [224, 224]
  resolution: [1280, 704]
  get_first_frame: true
  max_words: 1000

# Image dataset spec. `resolution` / `vit_resolution` repeat the top-level
# values above.
img_dataset:
  type: 'ImageDataset'
  data_list:
    - 'data/img_list.txt'
  data_dir_list:
    - 'data/images'
  vit_resolution: [224, 224]
  resolution: [1280, 704]
  max_words: 1000

# Text/visual embedder spec; `pretrained` is a path to a weights file.
embedder:
  type: 'FrozenOpenCLIPTextVisualEmbedder'
  layer: 'penultimate'
  vit_resolution: [224, 224]
  pretrained: 'i2vgen-xl/open_clip_pytorch_model.bin'

# Denoising network spec.
UNet:
  type: 'UNetSD_I2VGen'
  in_dim: 4
  y_dim: 1024
  upper_len: 128
  context_dim: 1024
  concat_dim: 4
  out_dim: 4
  dim_mult: [1, 2, 4, 4]
  num_heads: 8
  default_fps: 8
  head_dim: 64
  num_res_blocks: 2
  dropout: 0.1
  temporal_attention: true
  temporal_attn_times: 1
  use_checkpoint: true
  use_fps_condition: false
  use_sim_mask: false

# Diffusion process spec.
Diffusion:
  type: 'DiffusionDDIM'
  schedule: 'cosine'  # cosine
  schedule_param:
    num_timesteps: 1000
    cosine_s: 0.008
    zero_terminal_snr: true
  mean_type: 'v'
  loss_type: 'mse'
  var_type: 'fixed_small'
  rescale_timesteps: false
  noise_strength: 0.1

# Keys are quoted on purpose so they load as strings, not integers.
# NOTE(review): presumably keyed by clip frame count ('1' for still images)
# with the per-step batch size as the value -- confirm against the trainer.
batch_sizes:
  '1': 32
  '4': 8
  '8': 4
  '16': 2
  '32': 1

# Visualisation-during-training spec. `partial_keys` is a list of key lists;
# only the last combination is active, the other two are kept commented out.
visual_train:
  type: 'VisualTrainTextImageToVideo'
  partial_keys:
    # - ['y', 'local_image', 'fps']
    # - ['image', 'local_image', 'fps']
    - ['y', 'image', 'local_image', 'fps']
  use_offset_noise: true
  guide_scale: 9.0

# Pretrained-weight initialisation spec; both paths are under `i2vgen-xl/`.
Pretrain:
  type: 'pretrain_specific_strategies'
  fix_weight: false
  grad_scale: 0.5
  resume_checkpoint: 'i2vgen-xl/i2vgen_xl_00854500.pth'
  sd_keys_path: 'i2vgen-xl/stable_diffusion_image_key_temporal_attention_x1.json'

chunk_size: 4
decoder_bs: 4
lr: 0.00003
# Same value as Diffusion.noise_strength above.
noise_strength: 0.1

# classifier-free guidance
p_zero: 0.0
# Top-level guide_scale (3.0) is a separate key from
# visual_train.guide_scale (9.0).
guide_scale: 3.0

num_steps: 1000000
use_zero_infer: true
viz_interval: 50  # 200
save_ckp_interval: 50  # 500

# Log
log_dir: 'workspace/experiments'
log_interval: 1
seed: 6666