# ffs_img_train.yaml

# Dataset selection and input locations.
# Every '/path/to/...' value below is a placeholder and must be replaced with
# a real location before this config is used.
dataset: 'ffs_img'

data_path: '/path/to/datasets/preprocessed_ffs/train/videos/'
frame_data_path: '/path/to/datasets/preprocessed_ffs/train/images/'
frame_data_txt: '/path/to/datasets/preprocessed_ffs/train_list.txt'
pretrained_model_path: '/path/to/pretrained/Latte/'

# Save and load settings.
results_dir: './results_img'
# Unset by default.
# NOTE(review): presumably a checkpoint to initialize from; confirm against
# the training script.
pretrained: null

# Model configuration.
model: 'LatteIMG-XL/2'
num_frames: 16
image_size: 256  # choices: 256 or 512
num_sampling_steps: 250
frame_interval: 3
fixed_spatial: false
attention_bias: true
learn_sigma: true  # important
extras: 1  # listed options: 1, 2, 78

# Training configuration.
save_ceph: true  # important
use_image_num: 8
# Written with an explicit mantissa dot so that YAML 1.1 loaders (e.g. PyYAML)
# resolve it as a float; a dotless `1e-4` is read as a string there.
learning_rate: 1.0e-4
ckpt_every: 10000
clip_max_norm: 0.1
start_clip_iter: 500000
local_batch_size: 4  # important
max_train_steps: 1000000
global_seed: 3407
num_workers: 8
log_every: 100
lr_warmup_steps: 0
resume_from_checkpoint: null  # unset
gradient_accumulation_steps: 1  # TODO
num_classes: null  # unset

# Options for lowering VRAM use and speeding up training (all disabled here).
use_compile: false
mixed_precision: false
enable_xformers_memory_efficient_attention: false
gradient_checkpointing: false