Commit 78bae405 authored by mashun1: open_sora_inference
import collections
import importlib
import logging
import os
import time
from collections import OrderedDict
from collections.abc import Sequence
from itertools import repeat
import numpy as np
import torch
import torch.distributed as dist
def print_rank(var_name, var_value, rank=0):
if dist.get_rank() == rank:
print(f"[Rank {rank}] {var_name}: {var_value}")
def print_0(*args, **kwargs):
if dist.get_rank() == 0:
print(*args, **kwargs)
def requires_grad(model: torch.nn.Module, flag: bool = True) -> None:
"""
Set requires_grad flag for all parameters in a model.
"""
for p in model.parameters():
p.requires_grad = flag
def format_numel_str(numel: int) -> str:
B = 1024**3
M = 1024**2
K = 1024
if numel >= B:
return f"{numel / B:.2f} B"
elif numel >= M:
return f"{numel / M:.2f} M"
elif numel >= K:
return f"{numel / K:.2f} K"
else:
return f"{numel}"
def all_reduce_mean(tensor: torch.Tensor) -> torch.Tensor:
dist.all_reduce(tensor=tensor, op=dist.ReduceOp.SUM)
tensor.div_(dist.get_world_size())
return tensor
def get_model_numel(model: torch.nn.Module) -> tuple[int, int]:
num_params = 0
num_params_trainable = 0
for p in model.parameters():
num_params += p.numel()
if p.requires_grad:
num_params_trainable += p.numel()
return num_params, num_params_trainable
def try_import(name):
"""Try to import a module.
Args:
name (str): Specifies what module to import in absolute or relative
terms (e.g. either pkg.mod or ..mod).
Returns:
ModuleType or None: If importing successfully, returns the imported
module, otherwise returns None.
"""
try:
return importlib.import_module(name)
except ImportError:
return None
def transpose(x):
"""
transpose a list of list
Args:
x (list[list]):
"""
ret = list(map(list, zip(*x)))
return ret
def get_timestamp():
timestamp = time.strftime("%Y%m%d-%H%M%S", time.localtime(time.time()))
return timestamp
def format_time(seconds):
days = int(seconds / 3600 / 24)
seconds = seconds - days * 3600 * 24
hours = int(seconds / 3600)
seconds = seconds - hours * 3600
minutes = int(seconds / 60)
seconds = seconds - minutes * 60
secondsf = int(seconds)
seconds = seconds - secondsf
millis = int(seconds * 1000)
f = ""
i = 1
if days > 0:
f += str(days) + "D"
i += 1
if hours > 0 and i <= 2:
f += str(hours) + "h"
i += 1
if minutes > 0 and i <= 2:
f += str(minutes) + "m"
i += 1
if secondsf > 0 and i <= 2:
f += str(secondsf) + "s"
i += 1
if millis > 0 and i <= 2:
f += str(millis) + "ms"
i += 1
if f == "":
f = "0ms"
return f
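A small sketch showing that format_time keeps at most the two most significant non-zero units:

```python
# Illustrative only: at most two units are reported, from days down to milliseconds.
print(format_time(3661.5))  # "1h1m"  (trailing seconds/millis dropped)
print(format_time(90))      # "1m30s"
print(format_time(0.25))    # "250ms"
```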
def to_tensor(data):
"""Convert objects of various python types to :obj:`torch.Tensor`.
Supported types are: :class:`numpy.ndarray`, :class:`torch.Tensor`,
:class:`Sequence`, :class:`int` and :class:`float`.
Args:
data (torch.Tensor | numpy.ndarray | Sequence | int | float): Data to
be converted.
"""
if isinstance(data, torch.Tensor):
return data
elif isinstance(data, np.ndarray):
return torch.from_numpy(data)
elif isinstance(data, Sequence) and not isinstance(data, str):
return torch.tensor(data)
elif isinstance(data, int):
return torch.LongTensor([data])
elif isinstance(data, float):
return torch.FloatTensor([data])
else:
raise TypeError(f"type {type(data)} cannot be converted to tensor.")
def to_ndarray(data):
if isinstance(data, torch.Tensor):
return data.numpy()
elif isinstance(data, np.ndarray):
return data
elif isinstance(data, Sequence):
return np.array(data)
elif isinstance(data, int):
return np.array([data], dtype=int)
elif isinstance(data, float):
return np.array([data], dtype=float)
else:
raise TypeError(f"type {type(data)} cannot be converted to ndarray.")
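A minimal sketch exercising the two converters above (CPU tensors only, since to_ndarray calls .numpy()):

```python
# Illustrative round trip between Python sequences, tensors, and ndarrays.
t = to_tensor([1.0, 2.0, 3.0])   # float32 tensor of shape (3,)
a = to_ndarray(t)                # numpy array holding the same data
assert to_tensor(a).shape == t.shape
```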
def to_torch_dtype(dtype):
if isinstance(dtype, torch.dtype):
return dtype
elif isinstance(dtype, str):
dtype_mapping = {
"float64": torch.float64,
"float32": torch.float32,
"float16": torch.float16,
"fp32": torch.float32,
"fp16": torch.float16,
"half": torch.float16,
"bf16": torch.bfloat16,
}
if dtype not in dtype_mapping:
raise ValueError(f"Unsupported dtype string: {dtype}")
dtype = dtype_mapping[dtype]
return dtype
else:
raise ValueError(f"Cannot convert {type(dtype)} to a torch dtype.")
def count_params(model):
return sum(p.numel() for p in model.parameters() if p.requires_grad)
def _ntuple(n):
def parse(x):
if isinstance(x, collections.abc.Iterable) and not isinstance(x, str):
return x
return tuple(repeat(x, n))
return parse
to_1tuple = _ntuple(1)
to_2tuple = _ntuple(2)
to_3tuple = _ntuple(3)
to_4tuple = _ntuple(4)
to_ntuple = _ntuple
def convert_SyncBN_to_BN2d(model_cfg):
for k in model_cfg:
v = model_cfg[k]
if k == "norm_cfg" and v["type"] == "SyncBN":
v["type"] = "BN2d"
elif isinstance(v, dict):
convert_SyncBN_to_BN2d(v)
def get_topk(x, dim=4, k=5):
x = to_tensor(x)
inds = x[..., dim].topk(k)[1]
return x[inds]
def param_sigmoid(x, alpha):
ret = 1 / (1 + (-alpha * x).exp())
return ret
def inverse_param_sigmoid(x, alpha, eps=1e-5):
x = x.clamp(min=0, max=1)
x1 = x.clamp(min=eps)
x2 = (1 - x).clamp(min=eps)
return torch.log(x1 / x2) / alpha
def inverse_sigmoid(x, eps=1e-5):
"""Inverse function of sigmoid.
Args:
x (Tensor): The tensor to do the
inverse.
eps (float): EPS avoid numerical
overflow. Defaults 1e-5.
Returns:
Tensor: The x has passed the inverse
function of sigmoid, has same
shape with input.
"""
x = x.clamp(min=0, max=1)
x1 = x.clamp(min=eps)
x2 = (1 - x).clamp(min=eps)
return torch.log(x1 / x2)
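A quick numerical check of the helper above: up to the eps clamping, inverse_sigmoid undoes torch.sigmoid:

```python
# Illustrative only: round-trip through sigmoid and its inverse.
x = torch.tensor([-2.0, 0.0, 3.0])
assert torch.allclose(inverse_sigmoid(torch.sigmoid(x)), x, atol=1e-4)
```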
def count_columns(df, columns):
cnt_dict = OrderedDict()
num_samples = len(df)
for col in columns:
d_i = df[col].value_counts().to_dict()
for k in d_i:
d_i[k] = (d_i[k], d_i[k] / num_samples)
cnt_dict[col] = d_i
return cnt_dict
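A hypothetical example of count_columns on a small pandas DataFrame; each value maps to a (count, fraction) pair:

```python
# Illustrative only; requires pandas.
import pandas as pd

df = pd.DataFrame({"lang": ["en", "en", "zh"], "split": ["train", "val", "train"]})
print(count_columns(df, ["lang"]))
# OrderedDict([('lang', {'en': (2, 0.666...), 'zh': (1, 0.333...)})])
```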
def build_logger(work_dir, cfgname):
log_file = cfgname + ".log"
log_path = os.path.join(work_dir, log_file)
logger = logging.getLogger(cfgname)
logger.setLevel(logging.INFO)
# formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
formatter = logging.Formatter("%(asctime)s: %(message)s", datefmt="%Y-%m-%d %H:%M:%S")
handler1 = logging.FileHandler(log_path)
handler1.setFormatter(formatter)
handler2 = logging.StreamHandler()
handler2.setFormatter(formatter)
logger.addHandler(handler1)
logger.addHandler(handler2)
logger.propagate = False
return logger
from collections import OrderedDict
import torch
@torch.no_grad()
def update_ema(
ema_model: torch.nn.Module, model: torch.nn.Module, optimizer=None, decay: float = 0.9999, sharded: bool = True
) -> None:
"""
Step the EMA model towards the current model.
"""
ema_params = OrderedDict(ema_model.named_parameters())
model_params = OrderedDict(model.named_parameters())
for name, param in model_params.items():
if name == "pos_embed":
continue
if not param.requires_grad:
continue
if not sharded:
param_data = param.data
ema_params[name].mul_(decay).add_(param_data, alpha=1 - decay)
else:
if param.data.dtype != torch.float32:
param_id = id(param)
master_param = optimizer._param_store.working_to_master_param[param_id]
param_data = master_param.data
else:
param_data = param.data
ema_params[name].mul_(decay).add_(param_data, alpha=1 - decay)
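A minimal sketch (not part of the repository) of how update_ema can be driven in the non-sharded case; the toy linear model and SGD optimizer are assumptions for illustration only:

```python
# Illustrative usage of update_ema: ema = decay * ema + (1 - decay) * param.
from copy import deepcopy

import torch

net = torch.nn.Linear(8, 8)
ema_net = deepcopy(net)
for p in ema_net.parameters():
    p.requires_grad = False

opt = torch.optim.SGD(net.parameters(), lr=1e-2)
loss = net(torch.randn(4, 8)).pow(2).mean()
loss.backward()
opt.step()
opt.zero_grad()
update_ema(ema_net, net, decay=0.999, sharded=False)
```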
import os
import torch
import colossalai
import torch.distributed as dist
from mmengine.runner import set_random_seed
from opensora.datasets import save_sample
from opensora.registry import MODELS, SCHEDULERS, build_module
from opensora.utils.config_utils import parse_configs
from opensora.utils.misc import to_torch_dtype
from opensora.acceleration.parallel_states import set_sequence_parallel_group
from colossalai.cluster import DistCoordinator
def load_prompts(prompt_path):
with open(prompt_path, "r") as f:
prompts = [line.strip() for line in f.readlines()]
return prompts
def main():
# ======================================================
# 1. cfg and init distributed env
# ======================================================
cfg = parse_configs(training=False)
print(cfg)
# init distributed
colossalai.launch_from_torch({})
coordinator = DistCoordinator()
if coordinator.world_size > 1:
set_sequence_parallel_group(dist.group.WORLD)
enable_sequence_parallelism = True
else:
enable_sequence_parallelism = False
# ======================================================
# 2. runtime variables
# ======================================================
torch.set_grad_enabled(False)
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
device = "cuda" if torch.cuda.is_available() else "cpu"
dtype = to_torch_dtype(cfg.dtype)
set_random_seed(seed=cfg.seed)
prompts = load_prompts(cfg.prompt_path)
# ======================================================
# 3. build model & load weights
# ======================================================
# 3.1. build model
input_size = (cfg.num_frames, *cfg.image_size)
vae = build_module(cfg.vae, MODELS)
latent_size = vae.get_latent_size(input_size)
text_encoder = build_module(cfg.text_encoder, MODELS, device=device) # T5 must be fp32
model = build_module(
cfg.model,
MODELS,
input_size=latent_size,
in_channels=vae.out_channels,
caption_channels=text_encoder.output_dim,
model_max_length=text_encoder.model_max_length,
dtype=dtype,
enable_sequence_parallelism=enable_sequence_parallelism,
)
text_encoder.y_embedder = model.y_embedder # hack for classifier-free guidance
# 3.2. move to device & eval
vae = vae.to(device, dtype).eval()
model = model.to(device, dtype).eval()
# 3.3. build scheduler
scheduler = build_module(cfg.scheduler, SCHEDULERS)
# 3.4. support for multi-resolution
model_args = dict()
if cfg.multi_resolution:
image_size = cfg.image_size
hw = torch.tensor([image_size], device=device, dtype=dtype).repeat(cfg.batch_size, 1)
ar = torch.tensor([[image_size[0] / image_size[1]]], device=device, dtype=dtype).repeat(cfg.batch_size, 1)
model_args["data_info"] = dict(ar=ar, hw=hw)
# ======================================================
# 4. inference
# ======================================================
sample_idx = 0
save_dir = cfg.save_dir
os.makedirs(save_dir, exist_ok=True)
for i in range(0, len(prompts), cfg.batch_size):
batch_prompts = prompts[i : i + cfg.batch_size]
samples = scheduler.sample(
model,
text_encoder,
z_size=(vae.out_channels, *latent_size),
prompts=batch_prompts,
device=device,
additional_args=model_args,
)
samples = vae.decode(samples.to(dtype))
if coordinator.is_master():
for idx, sample in enumerate(samples):
print(f"Prompt: {batch_prompts[idx]}")
save_path = os.path.join(save_dir, f"sample_{sample_idx}")
save_sample(sample, fps=cfg.fps, save_path=save_path)
sample_idx += 1
if __name__ == "__main__":
main()
from copy import deepcopy
import colossalai
import torch
import torch.distributed as dist
import wandb
from colossalai.booster import Booster
from colossalai.booster.plugin import LowLevelZeroPlugin
from colossalai.cluster import DistCoordinator
from colossalai.nn.optimizer import HybridAdam
from colossalai.utils import get_current_device
from tqdm import tqdm
from opensora.acceleration.checkpoint import set_grad_checkpoint
from opensora.acceleration.parallel_states import (
get_data_parallel_group,
set_data_parallel_group,
set_sequence_parallel_group,
)
from opensora.acceleration.plugin import ZeroSeqParallelPlugin
from opensora.datasets import DatasetFromCSV, get_transforms_image, get_transforms_video, prepare_dataloader
from opensora.registry import MODELS, SCHEDULERS, build_module
from opensora.utils.ckpt_utils import create_logger, load, model_sharding, record_model_param_shape, save
from opensora.utils.config_utils import (
create_experiment_workspace,
create_tensorboard_writer,
parse_configs,
save_training_config,
)
from opensora.utils.misc import all_reduce_mean, format_numel_str, get_model_numel, requires_grad, to_torch_dtype
from opensora.utils.train_utils import update_ema
def main():
# ======================================================
# 1. args & cfg
# ======================================================
cfg = parse_configs(training=True)
print(cfg)
exp_name, exp_dir = create_experiment_workspace(cfg)
save_training_config(cfg._cfg_dict, exp_dir)
# ======================================================
# 2. runtime variables & colossalai launch
# ======================================================
assert torch.cuda.is_available(), "Training currently requires at least one GPU."
assert cfg.dtype in ["fp16", "bf16"], f"Unknown mixed precision {cfg.dtype}"
# 2.1. colossalai init distributed training
colossalai.launch_from_torch({})
coordinator = DistCoordinator()
device = get_current_device()
dtype = to_torch_dtype(cfg.dtype)
# 2.2. init logger, tensorboard & wandb
if not coordinator.is_master():
logger = create_logger(None)
else:
logger = create_logger(exp_dir)
logger.info(f"Experiment directory created at {exp_dir}")
writer = create_tensorboard_writer(exp_dir)
if cfg.wandb:
wandb.init(project="minisora", name=exp_name, config=cfg._cfg_dict)
# 2.3. initialize ColossalAI booster
if cfg.plugin == "zero2":
plugin = LowLevelZeroPlugin(
stage=2,
precision=cfg.dtype,
initial_scale=2**16,
max_norm=cfg.grad_clip,
)
set_data_parallel_group(dist.group.WORLD)
elif cfg.plugin == "zero2-seq":
plugin = ZeroSeqParallelPlugin(
sp_size=cfg.sp_size,
stage=2,
precision=cfg.dtype,
initial_scale=2**16,
max_norm=cfg.grad_clip,
)
set_sequence_parallel_group(plugin.sp_group)
set_data_parallel_group(plugin.dp_group)
else:
raise ValueError(f"Unknown plugin {cfg.plugin}")
booster = Booster(plugin=plugin)
# ======================================================
# 3. build dataset and dataloader
# ======================================================
dataset = DatasetFromCSV(
cfg.data_path,
# TODO: change transforms
transform=(
get_transforms_video(cfg.image_size[0])
if not cfg.use_image_transform
else get_transforms_image(cfg.image_size[0])
),
num_frames=cfg.num_frames,
frame_interval=cfg.frame_interval,
root=cfg.root,
)
# TODO: use plugin's prepare dataloader
# a batch contains:
# {
# "video": torch.Tensor, # [B, C, T, H, W],
# "text": List[str],
# }
dataloader = prepare_dataloader(
dataset,
batch_size=cfg.batch_size,
num_workers=cfg.num_workers,
shuffle=True,
drop_last=True,
pin_memory=True,
process_group=get_data_parallel_group(),
)
logger.info(f"Dataset contains {len(dataset):,} videos ({cfg.data_path})")
total_batch_size = cfg.batch_size * dist.get_world_size() // cfg.sp_size
logger.info(f"Total batch size: {total_batch_size}")
# ======================================================
# 4. build model
# ======================================================
# 4.1. build model
input_size = (cfg.num_frames, *cfg.image_size)
vae = build_module(cfg.vae, MODELS)
latent_size = vae.get_latent_size(input_size)
text_encoder = build_module(cfg.text_encoder, MODELS, device=device) # T5 must be fp32
model = build_module(
cfg.model,
MODELS,
input_size=latent_size,
in_channels=vae.out_channels,
caption_channels=text_encoder.output_dim,
model_max_length=text_encoder.model_max_length,
dtype=dtype,
)
model_numel, model_numel_trainable = get_model_numel(model)
logger.info(
f"Trainable model params: {format_numel_str(model_numel_trainable)}, Total model params: {format_numel_str(model_numel)}"
)
# 4.2. create ema
ema = deepcopy(model).to(torch.float32).to(device)
requires_grad(ema, False)
ema_shape_dict = record_model_param_shape(ema)
# 4.3. move to device
vae = vae.to(device, dtype)
model = model.to(device, dtype)
# 4.4. build scheduler
scheduler = build_module(cfg.scheduler, SCHEDULERS)
# 4.5. setup optimizer
optimizer = HybridAdam(
filter(lambda p: p.requires_grad, model.parameters()), lr=cfg.lr, weight_decay=0, adamw_mode=True
)
lr_scheduler = None
# 4.6. prepare for training
if cfg.grad_checkpoint:
set_grad_checkpoint(model)
model.train()
update_ema(ema, model, decay=0, sharded=False)
ema.eval()
# =======================================================
# 5. boost model for distributed training with colossalai
# =======================================================
torch.set_default_dtype(dtype)
model, optimizer, _, dataloader, lr_scheduler = booster.boost(
model=model, optimizer=optimizer, lr_scheduler=lr_scheduler, dataloader=dataloader
)
torch.set_default_dtype(torch.float)
num_steps_per_epoch = len(dataloader)
logger.info("Boost model for distributed training")
# =======================================================
# 6. training loop
# =======================================================
start_epoch = start_step = log_step = sampler_start_idx = 0
running_loss = 0.0
# 6.1. resume training
if cfg.load is not None:
logger.info("Loading checkpoint")
start_epoch, start_step, sampler_start_idx = load(booster, model, ema, optimizer, lr_scheduler, cfg.load)
logger.info(f"Loaded checkpoint {cfg.load} at epoch {start_epoch} step {start_step}")
logger.info(f"Training for {cfg.epochs} epochs with {num_steps_per_epoch} steps per epoch")
dataloader.sampler.set_start_index(sampler_start_idx)
model_sharding(ema)
# 6.2. training loop
for epoch in range(start_epoch, cfg.epochs):
dataloader.sampler.set_epoch(epoch)
dataloader_iter = iter(dataloader)
logger.info(f"Beginning epoch {epoch}...")
with tqdm(
range(start_step, num_steps_per_epoch),
desc=f"Epoch {epoch}",
disable=not coordinator.is_master(),
total=num_steps_per_epoch,
initial=start_step,
) as pbar:
for step in pbar:
batch = next(dataloader_iter)
x = batch["video"].to(device, dtype) # [B, C, T, H, W]
y = batch["text"]
with torch.no_grad():
# Prepare visual inputs
x = vae.encode(x) # [B, C, T, H/P, W/P]
# Prepare text inputs
model_args = text_encoder.encode(y)
# Diffusion
t = torch.randint(0, scheduler.num_timesteps, (x.shape[0],), device=device)
loss_dict = scheduler.training_losses(model, x, t, model_args)
# Backward & update
loss = loss_dict["loss"].mean()
booster.backward(loss=loss, optimizer=optimizer)
optimizer.step()
optimizer.zero_grad()
# Update EMA
update_ema(ema, model.module, optimizer=optimizer)
# Log loss values:
all_reduce_mean(loss)
running_loss += loss.item()
global_step = epoch * num_steps_per_epoch + step
log_step += 1
# Log to tensorboard
if coordinator.is_master() and (global_step + 1) % cfg.log_every == 0:
avg_loss = running_loss / log_step
pbar.set_postfix({"loss": avg_loss, "step": step, "global_step": global_step})
running_loss = 0
log_step = 0
writer.add_scalar("loss", loss.item(), global_step)
if cfg.wandb:
wandb.log(
{
"iter": global_step,
"num_samples": global_step * total_batch_size,
"epoch": epoch,
"loss": loss.item(),
"avg_loss": avg_loss,
},
step=global_step,
)
# Save checkpoint
if cfg.ckpt_every > 0 and (global_step + 1) % cfg.ckpt_every == 0:
save(
booster,
model,
ema,
optimizer,
lr_scheduler,
epoch,
step + 1,
global_step + 1,
cfg.batch_size,
coordinator,
exp_dir,
ema_shape_dict,
)
logger.info(
f"Saved checkpoint at epoch {epoch} step {step + 1} global_step {global_step + 1} to {exp_dir}"
)
# subsequent epochs are not resumed from a checkpoint, so reset the sampler start index and start step
dataloader.sampler.set_start_index(0)
start_step = 0
if __name__ == "__main__":
main()
from typing import List
from setuptools import find_packages, setup
def fetch_requirements(path) -> List[str]:
"""
This function reads the requirements file.
Args:
path (str): the path to the requirements file.
Returns:
The lines in the requirements file.
"""
with open(path, "r") as fd:
return [r.strip() for r in fd.readlines()]
def fetch_readme() -> str:
"""
This function reads the README.md file in the current directory.
Returns:
The lines in the README file.
"""
with open("README.md", encoding="utf-8") as f:
return f.read()
setup(
name="opensora",
version="1.0.0",
packages=find_packages(
exclude=(
"assets",
"configs",
"docs",
"outputs",
"pretrained_models",
"scripts",
"tests",
"tools",
"*.egg-info",
)
),
description="Democratizing Efficient Video Production for All",
long_description=fetch_readme(),
long_description_content_type="text/markdown",
license="Apache Software License 2.0",
install_requires=fetch_requirements("requirements.txt"),
python_requires=">=3.6",
classifiers=[
"Programming Language :: Python :: 3",
"License :: OSI Approved :: Apache Software License",
"Environment :: GPU :: NVIDIA CUDA",
"Topic :: Scientific/Engineering :: Artificial Intelligence",
"Topic :: System :: Distributed Computing",
],
)
import colossalai
import torch
import torch.distributed as dist
from colossalai.testing import spawn
from opensora.acceleration.communications import gather_forward_split_backward, split_forward_gather_backward
from opensora.acceleration.parallel_states import set_sequence_parallel_group
from opensora.models.layers.blocks import (
Attention,
MultiHeadCrossAttention,
SeqParallelAttention,
SeqParallelMultiHeadCrossAttention,
)
def run_attention(rank, world_size):
# create model
torch.manual_seed(1024)
set_sequence_parallel_group(dist.group.WORLD)
seq_parallel_attention = SeqParallelAttention(dim=256, num_heads=4, qkv_bias=True, enable_flashattn=False).cuda()
torch.manual_seed(1024)
attention = Attention(
dim=256,
num_heads=4,
qkv_bias=True,
enable_flashattn=False,
).cuda()
# create inputs
torch.manual_seed(1024)
x = torch.randn(4, 64, 256).cuda()
seq_x = x.clone().detach()
x.requires_grad = True
x.retain_grad()
seq_x.requires_grad = True
seq_x.retain_grad()
sub_seq_x = split_forward_gather_backward(seq_x, dist.group.WORLD, dim=1, grad_scale="down")
# run model
out = attention(x)
sub_seq_out = seq_parallel_attention(sub_seq_x)
seq_out = gather_forward_split_backward(sub_seq_out, dist.group.WORLD, dim=1, grad_scale="up")
assert torch.allclose(seq_out, out, atol=1e-7), f"{seq_out}\nvs\n{out}"
# run backward
seq_out.mean().backward()
out.mean().backward()
# all reduce gradient for sp
for p in seq_parallel_attention.parameters():
if p.grad is not None:
dist.all_reduce(p.grad, group=dist.group.WORLD)
p.grad.div_(world_size)
# check grad
for p1, p2 in zip(seq_parallel_attention.parameters(), attention.parameters()):
assert torch.allclose(p1.grad, p2.grad, atol=1e-7), f"{p1.grad}\nvs\n{p2.grad}"
# check input grad
assert torch.allclose(x.grad, seq_x.grad, atol=1e-7), f"{x.grad}\nvs\n{seq_x.grad}"
def run_cross_attention(rank, world_size):
# create model
torch.manual_seed(1024)
set_sequence_parallel_group(dist.group.WORLD)
seq_parallel_attention = SeqParallelMultiHeadCrossAttention(
d_model=256,
num_heads=4,
).cuda().to(torch.bfloat16)
torch.manual_seed(1024)
attention = MultiHeadCrossAttention(
d_model=256,
num_heads=4,
).cuda().to(torch.bfloat16)
# make sure the weights are the same
for p1, p2 in zip(seq_parallel_attention.parameters(), attention.parameters()):
p1.data.copy_(p2.data)
# create inputs
torch.manual_seed(1024)
x = torch.randn(4, 64, 256).cuda().to(torch.bfloat16)
y = torch.randn(4, 32, 256).cuda().to(torch.bfloat16)
mask = [2, 10, 8, 16]
mask = None  # the masked path is disabled here; drop this line to test variable-length inputs with the list above
seq_x = x.clone().detach()
seq_y = y.clone().detach()
# set grad
x.requires_grad = True
x.retain_grad()
seq_x.requires_grad = True
seq_x.retain_grad()
y.requires_grad = True
y.retain_grad()
seq_y.requires_grad = True
seq_y.retain_grad()
# split by sequence
sub_seq_x = split_forward_gather_backward(seq_x, dist.group.WORLD, dim=1, grad_scale="down")
# run model
out = attention(x, y, mask)
sub_seq_out = seq_parallel_attention(sub_seq_x, seq_y, mask)
seq_out = gather_forward_split_backward(sub_seq_out, dist.group.WORLD, dim=1, grad_scale="up")
assert torch.allclose(seq_out, out, rtol=1e-5, atol=1e-6), f"\n{seq_out}\nvs\n{out}"
# run backward
seq_out.mean().backward()
out.mean().backward()
# all reduce gradient for sp
for name, p in seq_parallel_attention.named_parameters():
if p.grad is not None:
dist.all_reduce(p.grad, group=dist.group.WORLD)
p.grad.div_(world_size)
else:
print(f"grad of {name} is None")
# # check grad
for p1, p2 in zip(seq_parallel_attention.named_parameters(), attention.named_parameters()):
assert torch.allclose(p1[1].grad, p2[1].grad, rtol=1e-3, atol=1e-4), f"\n{p1[0]}\nvs\n{p2[0]}:\n{p1[1].grad}\nvs\n{p2[1].grad}"
# # check input grad
assert torch.allclose(x.grad, seq_x.grad, atol=1e-7), f"{x.grad}\nvs\n{seq_x.grad}"
assert torch.allclose(y.grad, seq_y.grad, atol=1e-7), f"{y.grad}\nvs\n{seq_y.grad}"
def run_dist(rank, world_size, port):
colossalai.launch({}, rank=rank, world_size=world_size, host="localhost", port=port)
# run_attention(rank, world_size)
run_cross_attention(rank, world_size)
def test_seq_parallel_attention():
spawn(run_dist, nprocs=2)
if __name__ == "__main__":
test_seq_parallel_attention()
import time
from copy import deepcopy
import colossalai
import torch
from colossalai.shardformer import ShardConfig, ShardFormer
from colossalai.testing import spawn
from opensora.acceleration.shardformer.policy.t5_encoder import T5EncoderPolicy
from opensora.models.text_encoder.t5 import T5Embedder
def run_t5_encoder(rank, world_size, port):
colossalai.launch({}, rank=rank, world_size=world_size, port=port, host="localhost")
# t5 embedder
t5_path = "./pretrained_models/t5_ckpts"
hf_t5 = T5Embedder(device="cuda", local_cache=True, cache_dir=t5_path, torch_dtype=torch.float)
sf_t5 = deepcopy(hf_t5)
# create huggingface model as normal
shard_config = ShardConfig(
tensor_parallel_process_group=None,
pipeline_stage_manager=None,
enable_tensor_parallelism=False,
enable_fused_normalization=False,
enable_flash_attention=False,
enable_jit_fused=True,
enable_sequence_parallelism=False,
enable_sequence_overlap=False,
)
shard_former = ShardFormer(shard_config=shard_config)
sharded_model, _ = shard_former.optimize(sf_t5.model, policy=T5EncoderPolicy())
sf_t5.model = sharded_model
# test t5 embedder
texts = ["Who is the best player in the history of NBA?", "How to study computer science?"]
for i in range(5):
hf_embs, hf_masks = hf_t5.get_text_embeddings(texts)
sf_embs, sf_masks = sf_t5.get_text_embeddings(texts)
# check accuracy
assert torch.allclose(hf_embs, sf_embs, rtol=1e-4, atol=1e-5), f"{hf_embs} \nvs\n{sf_embs}"
assert torch.allclose(hf_masks, sf_masks), f"{hf_masks} \nvs\n{sf_masks}"
# measure perf
torch.cuda.synchronize()
hf_start = time.time()
for i in range(20):
hf_embs, hf_masks = hf_t5.get_text_embeddings(texts)
torch.cuda.synchronize()
hf_end = time.time()
# convert sf to fp16 and time the sharded model
sf_t5.model = sf_t5.model.half()
torch.cuda.synchronize()
sf_start = time.time()
for i in range(20):
sf_embs, sf_masks = sf_t5.get_text_embeddings(texts)
torch.cuda.synchronize()
sf_end = time.time()
print(f"[Performance] native: {hf_end - hf_start}s, shardformer: {sf_end - sf_start} s")
def test_t5_encoder():
spawn(run_t5_encoder)
if __name__ == "__main__":
test_t5_encoder()
# Video Captioning
Human labeling of videos is expensive and time-consuming, so we adopt powerful image captioning models to generate captions for videos. Although GPT-4V achieves better quality, its speed of roughly 20 s per sample is too slow for us. With batch inference, LLaVA reaches about 3 s per sample with comparable quality. LLaVA is the second-best open-source model on [MMMU](https://mmmu-benchmark.github.io/) and accepts images of any resolution.
![Caption](https://i0.imgs.ovh/2024/03/16/eXdvC.png)
## GPT-4V Captioning
Run the following command to generate captions for videos with GPT-4V:
```bash
python -m tools.caption.caption_gpt4 FOLDER_WITH_VIDEOS output.csv --key $OPENAI_API_KEY
```
The cost is approximately $0.01 per video (3 frames per video). The output is a CSV file with path and caption.
## LLaVA Captioning
First, install LLaVA according to their [official instructions](https://github.com/haotian-liu/LLaVA?tab=readme-ov-file#install). We use the `liuhaotian/llava-v1.6-34b` model for captioning, which can be downloaded [here](https://huggingface.co/liuhaotian/llava-v1.6-vicuna-7b). Then, run the following command to generate captions for videos with LLaVA:
```bash
CUDA_VISIBLE_DEVICES=0,1 python -m tools.caption.caption_llava samples output.csv
```
The Yi-34B model requires two 80GB GPUs and takes about 3 s per sample. The output is a CSV file with the video path and caption.
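For downstream use, a minimal sketch of reading the resulting CSV; the three-column layout (video path, caption, length) matches the `writer.writerow` calls in the captioning scripts below, and the file has no header row:

```python
# Illustrative only: parse the caption CSV produced by the captioning scripts.
import csv

with open("output.csv", newline="") as f:
    for path, caption, length in csv.reader(f):
        print(path, caption[:60], length)
```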
import argparse
import csv
import os
import requests
import tqdm
from .utils import extract_frames, prompts, read_video_list
def get_caption(frame, prompt, api_key):
headers = {"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"}
payload = {
"model": "gpt-4-vision-preview",
"messages": [
{
"role": "user",
"content": [
{
"type": "text",
"text": prompt,
},
{"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{frame[0]}"}},
{"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{frame[1]}"}},
{"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{frame[2]}"}},
],
}
],
"max_tokens": 300,
}
response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload, timeout=60)
caption = response.json()["choices"][0]["message"]["content"]
caption = caption.replace("\n", " ")
return caption
def main(args):
# ======================================================
# 1. read video list
# ======================================================
videos = read_video_list(args.video_folder, args.output_file)
f = open(args.output_file, "a")
writer = csv.writer(f)
# ======================================================
# 2. generate captions
# ======================================================
for video in tqdm.tqdm(videos):
video_path = os.path.join(args.video_folder, video)
frame, length = extract_frames(video_path, base_64=True)
if len(frame) < 3:
continue
prompt = prompts[args.prompt]
caption = get_caption(frame, prompt, args.key)
writer.writerow((video, caption, length))
f.close()
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("video_folder", type=str)
parser.add_argument("output_file", type=str)
parser.add_argument("--prompt", type=str, default="three_frames")
parser.add_argument("--key", type=str)
args = parser.parse_args()
main(args)
import argparse
import csv
import os
import warnings
import torch
from llava.constants import DEFAULT_IMAGE_TOKEN, IGNORE_INDEX, IMAGE_TOKEN_INDEX
from llava.conversation import conv_templates
from llava.mm_utils import get_anyres_image_grid_shape, get_model_name_from_path, process_images, tokenizer_image_token
from llava.model.builder import load_pretrained_model
from llava.model.llava_arch import unpad_image
from llava.utils import disable_torch_init
from tqdm import tqdm
from .utils import extract_frames, prompts, read_video_list
disable_torch_init()
def prepare_inputs_labels_for_multimodal(
self, input_ids, position_ids, attention_mask, past_key_values, labels, images, image_sizes=None
):
# llava_arch.py
vision_tower = self.get_vision_tower()
if vision_tower is None or images is None or input_ids.shape[1] == 1:
return input_ids, position_ids, attention_mask, past_key_values, None, labels
if type(images) is list or images.ndim == 5:
if type(images) is list:
images = [x.unsqueeze(0) if x.ndim == 3 else x for x in images]
concat_images = torch.cat([image for image in images], dim=0)
image_features = self.encode_images(concat_images)
split_sizes = [image.shape[0] for image in images]
image_features = torch.split(image_features, split_sizes, dim=0)
mm_patch_merge_type = getattr(self.config, "mm_patch_merge_type", "flat")
image_aspect_ratio = getattr(self.config, "image_aspect_ratio", "square")
if mm_patch_merge_type == "flat":
image_features = [x.flatten(0, 1) for x in image_features]
elif mm_patch_merge_type.startswith("spatial"):
new_image_features = []
for image_idx, image_feature in enumerate(image_features):
if image_feature.shape[0] > 1:
base_image_feature = image_feature[0]
image_feature = image_feature[1:]
height = width = self.get_vision_tower().num_patches_per_side
assert height * width == base_image_feature.shape[0]
if image_aspect_ratio == "anyres":
num_patch_width, num_patch_height = get_anyres_image_grid_shape(
image_sizes[image_idx],
self.config.image_grid_pinpoints,
self.get_vision_tower().config.image_size,
)
image_feature = image_feature.view(num_patch_height, num_patch_width, height, width, -1)
else:
raise NotImplementedError
if "unpad" in mm_patch_merge_type:
image_feature = image_feature.permute(4, 0, 2, 1, 3).contiguous()
image_feature = image_feature.flatten(1, 2).flatten(2, 3)
image_feature = unpad_image(image_feature, image_sizes[image_idx])
image_feature = torch.cat(
(
image_feature,
self.model.image_newline[:, None, None]
.expand(*image_feature.shape[:-1], 1)
.to(image_feature.device),
),
dim=-1,
)
image_feature = image_feature.flatten(1, 2).transpose(0, 1)
else:
image_feature = image_feature.permute(0, 2, 1, 3, 4).contiguous()
image_feature = image_feature.flatten(0, 3)
image_feature = torch.cat((base_image_feature, image_feature), dim=0)
else:
image_feature = image_feature[0]
if "unpad" in mm_patch_merge_type:
image_feature = torch.cat(
(image_feature, self.model.image_newline[None].to(image_feature.device)), dim=0
)
new_image_features.append(image_feature)
image_features = new_image_features
else:
raise ValueError(f"Unexpected mm_patch_merge_type: {self.config.mm_patch_merge_type}")
else:
image_features = self.encode_images(images)
# TODO: image start / end is not implemented here to support pretraining.
if getattr(self.config, "tune_mm_mlp_adapter", False) and getattr(self.config, "mm_use_im_start_end", False):
raise NotImplementedError
# Let's just add dummy tensors if they do not exist,
# it is a headache to deal with None all the time.
# But it is not ideal, and if you have a better idea,
# please open an issue / submit a PR, thanks.
_labels = labels
_position_ids = position_ids
_attention_mask = attention_mask
if attention_mask is None:
attention_mask = torch.ones_like(input_ids, dtype=torch.bool)
else:
attention_mask = attention_mask.bool()
if position_ids is None:
position_ids = torch.arange(0, input_ids.shape[1], dtype=torch.long, device=input_ids.device)
if labels is None:
labels = torch.full_like(input_ids, IGNORE_INDEX)
# remove the padding using attention_mask -- FIXME
input_ids = [
cur_input_ids[cur_attention_mask] for cur_input_ids, cur_attention_mask in zip(input_ids, attention_mask)
]
labels = [cur_labels[cur_attention_mask] for cur_labels, cur_attention_mask in zip(labels, attention_mask)]
new_input_embeds = []
new_labels = []
cur_image_idx = 0
for batch_idx, cur_input_ids in enumerate(input_ids):
num_images = (cur_input_ids == IMAGE_TOKEN_INDEX).sum()
if num_images == 0:
cur_image_features = image_features[cur_image_idx]
cur_input_embeds_1 = self.get_model().embed_tokens(cur_input_ids)
cur_input_embeds = torch.cat([cur_input_embeds_1, cur_image_features[0:0]], dim=0)
new_input_embeds.append(cur_input_embeds)
new_labels.append(labels[batch_idx])
cur_image_idx += 1
continue
image_token_indices = (
[-1] + torch.where(cur_input_ids == IMAGE_TOKEN_INDEX)[0].tolist() + [cur_input_ids.shape[0]]
)
cur_input_ids_noim = []
cur_labels = labels[batch_idx]
cur_labels_noim = []
for i in range(len(image_token_indices) - 1):
cur_input_ids_noim.append(cur_input_ids[image_token_indices[i] + 1 : image_token_indices[i + 1]])
cur_labels_noim.append(cur_labels[image_token_indices[i] + 1 : image_token_indices[i + 1]])
split_sizes = [x.shape[0] for x in cur_labels_noim]
cur_input_embeds = self.get_model().embed_tokens(torch.cat(cur_input_ids_noim))
cur_input_embeds_no_im = torch.split(cur_input_embeds, split_sizes, dim=0)
cur_new_input_embeds = []
cur_new_labels = []
for i in range(num_images + 1):
cur_new_input_embeds.append(cur_input_embeds_no_im[i])
cur_new_labels.append(cur_labels_noim[i])
if i < num_images:
cur_image_features = image_features[cur_image_idx]
cur_image_idx += 1
cur_new_input_embeds.append(cur_image_features)
cur_new_labels.append(
torch.full(
(cur_image_features.shape[0],),
IGNORE_INDEX,
device=cur_labels.device,
dtype=cur_labels.dtype,
)
)
cur_new_input_embeds = [x.to(self.device) for x in cur_new_input_embeds]
cur_new_input_embeds = torch.cat(cur_new_input_embeds)
cur_new_labels = torch.cat(cur_new_labels)
new_input_embeds.append(cur_new_input_embeds)
new_labels.append(cur_new_labels)
# Truncate sequences to max length as image embeddings can make the sequence longer
tokenizer_model_max_length = getattr(self.config, "tokenizer_model_max_length", None)
if tokenizer_model_max_length is not None:
new_input_embeds = [x[:tokenizer_model_max_length] for x in new_input_embeds]
new_labels = [x[:tokenizer_model_max_length] for x in new_labels]
# Combine them
max_len = max(x.shape[0] for x in new_input_embeds)
batch_size = len(new_input_embeds)
new_input_embeds_padded = []
new_labels_padded = torch.full(
(batch_size, max_len), IGNORE_INDEX, dtype=new_labels[0].dtype, device=new_labels[0].device
)
attention_mask = torch.zeros((batch_size, max_len), dtype=attention_mask.dtype, device=attention_mask.device)
position_ids = torch.zeros((batch_size, max_len), dtype=position_ids.dtype, device=position_ids.device)
for i, (cur_new_embed, cur_new_labels) in enumerate(zip(new_input_embeds, new_labels)):
cur_len = cur_new_embed.shape[0]
if getattr(self.config, "tokenizer_padding_side", "right") == "left":
new_input_embeds_padded.append(
torch.cat(
(
torch.zeros(
(max_len - cur_len, cur_new_embed.shape[1]),
dtype=cur_new_embed.dtype,
device=cur_new_embed.device,
),
cur_new_embed,
),
dim=0,
)
)
if cur_len > 0:
new_labels_padded[i, -cur_len:] = cur_new_labels
attention_mask[i, -cur_len:] = True
position_ids[i, -cur_len:] = torch.arange(
0, cur_len, dtype=position_ids.dtype, device=position_ids.device
)
else:
new_input_embeds_padded.append(
torch.cat(
(
cur_new_embed,
torch.zeros(
(max_len - cur_len, cur_new_embed.shape[1]),
dtype=cur_new_embed.dtype,
device=cur_new_embed.device,
),
),
dim=0,
)
)
if cur_len > 0:
new_labels_padded[i, :cur_len] = cur_new_labels
attention_mask[i, :cur_len] = True
position_ids[i, :cur_len] = torch.arange(
0, cur_len, dtype=position_ids.dtype, device=position_ids.device
)
new_input_embeds = torch.stack(new_input_embeds_padded, dim=0)
if _labels is None:
new_labels = None
else:
new_labels = new_labels_padded
if _attention_mask is None:
attention_mask = None
else:
attention_mask = attention_mask.to(dtype=_attention_mask.dtype)
if _position_ids is None:
position_ids = None
return None, position_ids, attention_mask, past_key_values, new_input_embeds, new_labels
@torch.inference_mode()
def main(args):
# ======================================================
# 1. read video list
# ======================================================
videos = read_video_list(args.video_folder, args.output_file)
f = open(args.output_file, "a")
writer = csv.writer(f)
# ======================================================
# 2. load model and prepare prompts
# ======================================================
model_path = "liuhaotian/llava-v1.6-34b"
query = prompts[args.prompt]
print(f"Prompt: {query}")
conv = conv_templates["chatml_direct"].copy()
conv.append_message(conv.roles[0], DEFAULT_IMAGE_TOKEN + "\n" + query)
prompt = conv.get_prompt()
with warnings.catch_warnings():
warnings.simplefilter("ignore") # Pytorch non-meta copying warning fills out the console
tokenizer, model, image_processor, context_len = load_pretrained_model(
model_path=model_path,
model_base=None,
model_name=get_model_name_from_path(model_path),
)
input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt")
input_ids = input_ids.unsqueeze(0).to(model.device)
# ======================================================
# 3. generate captions
# ======================================================
bs = args.bs
for i in tqdm(range(0, len(videos), bs)):
# prepare a batch of inputs
video_files = videos[i : i + bs]
frames = []
video_lengths = []
for video_file in video_files:
frame, length = extract_frames(os.path.join(args.video_folder, video_file))
if len(frame) < 3:
continue
frames.append(frame)
video_lengths.append(length)
if len(frames) == 0:
continue
# encode the batch of inputs
samples = []
for imgs in frames:
imgs_size = [img.size for img in imgs]
imgs = process_images(imgs, image_processor, model.config)
imgs = imgs.to(model.device, dtype=torch.float16)
with torch.inference_mode():
_, _, _, _, inputs_embeds, _ = prepare_inputs_labels_for_multimodal(
model, input_ids, None, None, None, None, images=imgs, image_sizes=imgs_size
)
samples.append(inputs_embeds)
# padding
max_len = max([sample.shape[1] for sample in samples])
attention_mask = torch.tensor(
[[0] * (max_len - samples[i].shape[1]) + [1] * samples[i].shape[1] for i in range(len(samples))]
).to(model.device)
inputs_embeds = [
torch.cat(
[
torch.zeros(
(1, max_len - samples[i].shape[1], samples[i].shape[-1]),
device=model.device,
dtype=torch.float16,
),
samples[i],
],
dim=1,
)
for i in range(len(samples))
]
inputs_embeds = torch.cat(inputs_embeds, dim=0)
# generate outputs
output_ids = super(type(model), model).generate(
inputs_embeds=inputs_embeds,
attention_mask=attention_mask,
do_sample=True,
temperature=0.2,
max_new_tokens=512,
use_cache=True,
)
outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
outputs = [output.replace("\n", " ").strip() for output in outputs]
# save results
result = list(zip(video_files, outputs, video_lengths))
for t in result:
writer.writerow(t)
f.close()
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("video_folder", type=str)
parser.add_argument("output_file", type=str)
parser.add_argument("--bs", type=int, default=32)
parser.add_argument("--prompt", type=str, default="three_frames")
args = parser.parse_args()
main(args)