import logging
import os
from pathlib import Path
import json
import tarfile
from collections import defaultdict
from einops import rearrange
from typing import List
import torch
import torchvision
import numpy as np
import imageio
import PIL.Image
from PIL import Image
CODE_SUFFIXES = {
".py", # Python codes
".sh", # Shell scripts
".yaml",
".yml", # Configuration files
}
def build_pretraining_data_loader():
pass
def logger_filter(name):
def filter_(record):
return record["extra"].get("name") == name
return filter_
def resolve_resume_path(resume, results_dir):
# Detect the resume path. Support both the experiment index and the full path.
if resume.isnumeric():
tmp_dirs = list(Path(results_dir).glob("*"))
id2exp_dir = defaultdict(list)
for tmp_dir in tmp_dirs:
part0 = tmp_dir.name.split("_")[0]
if part0.isnumeric():
id2exp_dir[int(part0)].append(tmp_dir)
resume_id = int(resume)
valid_exp_dir = id2exp_dir.get(resume_id, [])  # default to [] so the length checks below are safe
if len(valid_exp_dir) == 0:
raise ValueError(
f"No valid experiment directories found in {results_dir} with the experiment "
f"index {resume}."
)
elif len(valid_exp_dir) > 1:
raise ValueError(
f"Multiple valid experiment directories found in {results_dir} with the experiment "
f"index {resume}: {valid_exp_dir}."
)
resume_path = valid_exp_dir[0] / "checkpoints"
else:
resume_path = Path(resume)
if not resume_path.exists():
raise FileNotFoundError(f"Resume path {resume_path} not found.")
return resume_path
def dump_codes(save_path, root, sub_dirs=None, valid_suffixes=None, save_prefix="./"):
"""
Dump source code files to the experiment directory as a tarball.
Args:
save_path (str): Path to the experiment directory.
root (Path): Path to the root directory of the codes.
sub_dirs (list): List of subdirectories to be dumped. If None, all files in the root directory will
be dumped. (default: None)
valid_suffixes (tuple, optional): Valid suffixes of the files to be dumped. If None, CODE_SUFFIXES will be used.
(default: None)
save_prefix (str, optional): Prefix to be added to the files in the tarball. (default: './')
"""
if valid_suffixes is None:
valid_suffixes = CODE_SUFFIXES
# Force to use tar.gz suffix
save_path = safe_file(save_path)
assert save_path.name.endswith(
".tar.gz"
), f"save_path should end with .tar.gz, got {save_path.name}."
# Make root absolute
root = Path(root).absolute()
# Make a tarball of the codes
with tarfile.open(save_path, "w:gz") as tar:
# Recursively add all files in the root directory
if sub_dirs is None:
sub_dirs = list(root.iterdir())
for sub_dir in sub_dirs:
for file in Path(sub_dir).rglob("*"):
if file.is_file() and file.suffix in valid_suffixes:
# make file absolute
file = file.absolute()
arcname = Path(save_prefix) / file.relative_to(root)
tar.add(file, arcname=arcname)
return root
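# Hedged usage sketch for dump_codes: the save path below is an illustrative assumption,
# not part of the original code. It archives every .py/.sh/.yaml/.yml file under root.
def _demo_dump_codes():
    dump_codes(
        save_path="results/000_demo/codes.tar.gz",  # must end with .tar.gz
        root=".",
        sub_dirs=None,  # None -> every entry directly under root is walked recursively
    )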
def dump_args(args, save_path, extra_args=None):
args_dict = vars(args)
if extra_args:
assert isinstance(
extra_args, dict
), f"extra_args should be a dictionary, got {type(extra_args)}."
args_dict.update(extra_args)
# Save to file
with safe_file(save_path).open("w") as f:
json.dump(args_dict, f, indent=4, sort_keys=True, ensure_ascii=False)
def empty_logger():
logger = logging.getLogger("hymm_empty_logger")
logger.addHandler(logging.NullHandler())
logger.setLevel(logging.CRITICAL)
return logger
def is_valid_experiment(path):
path = Path(path)
if path.is_dir() and path.name.split("_")[0].isdigit():
return True
return False
def get_experiment_max_number(experiments):
valid_experiment_numbers = []
for exp in experiments:
if is_valid_experiment(exp):
valid_experiment_numbers.append(int(Path(exp).name.split("_")[0]))
if valid_experiment_numbers:
return max(valid_experiment_numbers)
return 0
def safe_dir(path):
"""
Create a directory (or the parent directory of a file) if it does not exist.
Args:
path (str or Path): Path to the directory.
Returns:
path (Path): Path object of the directory.
"""
path = Path(path)
path.mkdir(exist_ok=True, parents=True)
return path
def safe_file(path):
"""
Create the parent directory of a file if it does not exist.
Args:
path (str or Path): Path to the file.
Returns:
path (Path): Path object of the file.
"""
path = Path(path)
path.parent.mkdir(exist_ok=True, parents=True)
return path
def save_videos_grid(videos: torch.Tensor, path: str, rescale=False, n_rows=1, fps=24):
"""save videos by video tensor
copy from https://github.com/guoyww/AnimateDiff/blob/e92bd5671ba62c0d774a32951453e328018b7c5b/animatediff/utils/util.py#L61
Args:
videos (torch.Tensor): video tensor predicted by the model
path (str): path to save video
rescale (bool, optional): rescale the video tensor from [-1, 1] to . Defaults to False.
n_rows (int, optional): Defaults to 1.
fps (int, optional): video save fps. Defaults to 8.
"""
videos = rearrange(videos, "b c t h w -> t b c h w")
outputs = []
for x in videos:
x = torchvision.utils.make_grid(x, nrow=n_rows)
x = x.transpose(0, 1).transpose(1, 2).squeeze(-1)
if rescale:
x = (x + 1.0) / 2.0 # -1,1 -> 0,1
x = torch.clamp(x, 0, 1)
x = (x * 255).numpy().astype(np.uint8)
outputs.append(x)
os.makedirs(os.path.dirname(path), exist_ok=True)
imageio.mimsave(path, outputs, fps=fps)
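# Hedged usage sketch for save_videos_grid: the tensor shape and output path are
# illustrative assumptions. A (b, c, t, h, w) tensor with values in [0, 1] is written
# as an mp4 containing a grid of the b videos.
def _demo_save_videos_grid():
    dummy = torch.rand(2, 3, 16, 64, 64)  # b=2 videos, 3 channels, 16 frames, 64x64
    save_videos_grid(dummy, "results/demo/sample.mp4", rescale=False, n_rows=2, fps=24)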
import collections.abc
from itertools import repeat
import contextlib
import os
import random
import numpy as np
import torch
import deepspeed
import torch.distributed as dist
from torch.utils.tensorboard import SummaryWriter
def all_gather_sum(running_value, device):
value = torch.tensor(running_value, device=device)
dist.all_reduce(value, op=dist.ReduceOp.SUM)
return value.item()
class EventsMonitor(object):
def __init__(self, events_root, rank):
self.rank = rank
if rank == 0:
self.writer = SummaryWriter(log_dir=events_root)
else:
self.writer = None
def write_events(self, events):
for event in events:
name, val, count = event
if self.rank == 0:
self.writer.add_scalar(name, val, global_step=count)
def profiler_context(enable, exp_dir, worker_name):
if enable:
return torch.profiler.profile(
activities=[
torch.profiler.ProfilerActivity.CPU,
torch.profiler.ProfilerActivity.CUDA,
],
schedule=torch.profiler.schedule(
skip_first=10,
wait=5,
warmup=1,
active=3,
repeat=2,
),
profile_memory=True,
on_trace_ready=torch.profiler.tensorboard_trace_handler(
exp_dir, worker_name=worker_name
),
)
else:
# return empty python context manager
return contextlib.nullcontext()
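# Hedged usage sketch for profiler_context: wrap a training loop and call prof.step()
# each iteration so the skip_first/wait/warmup/active schedule above advances. The
# iteration count and exp_dir are illustrative assumptions.
def _demo_profiler_context():
    with profiler_context(enable=True, exp_dir="results/demo/profile", worker_name="rank0") as prof:
        for _ in range(30):
            torch.randn(64, 64) @ torch.randn(64, 64)  # stand-in for a training step
            if prof is not None:
                prof.step()  # nullcontext yields None when profiling is disabled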
def set_reproducibility(enable, global_seed=None):
if enable:
# Configure the seed for reproducibility
set_manual_seed(global_seed)
# Set following debug environment variable
# See the link for details: https://docs.nvidia.com/cuda/cublas/index.html#results-reproducibility
os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
# Cudnn benchmarking
torch.backends.cudnn.benchmark = not enable
# Use deterministic algorithms in PyTorch
torch.use_deterministic_algorithms(enable)
# LSTM and RNN networks are not deterministic
def set_manual_seed(global_seed):
# Seed the RNG for Python
random.seed(global_seed)
# Seed the RNG for Numpy
np.random.seed(global_seed)
# Seed the RNG for all devices (both CPU and CUDA)
torch.manual_seed(global_seed)
# Seed cuda
torch.cuda.manual_seed_all(global_seed)
def _ntuple(n):
def parse(x):
if isinstance(x, collections.abc.Iterable) and not isinstance(x, str):
x = tuple(x)
if len(x) == 1:
x = tuple(repeat(x[0], n))
return x
return tuple(repeat(x, n))
return parse
to_1tuple = _ntuple(1)
to_2tuple = _ntuple(2)
to_3tuple = _ntuple(3)
to_4tuple = _ntuple(4)
def as_tuple(x):
if isinstance(x, collections.abc.Iterable) and not isinstance(x, str):
return tuple(x)
if x is None or isinstance(x, (int, float, str)):
return (x,)
else:
raise ValueError(f"Unknown type {type(x)}")
def as_list_of_2tuple(x):
x = as_tuple(x)
if len(x) == 1:
x = (x[0], x[0])
assert len(x) % 2 == 0, f"Expect even length, got {len(x)}."
lst = []
for i in range(0, len(x), 2):
lst.append((x[i], x[i + 1]))
return lst
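# Hedged illustration of the tuple helpers above; the asserted values follow directly
# from the definitions and can be verified by hand.
def _demo_tuple_helpers():
    assert to_2tuple(3) == (3, 3)
    assert to_2tuple((4, 5)) == (4, 5)
    assert as_tuple(None) == (None,)
    assert as_list_of_2tuple(7) == [(7, 7)]
    assert as_list_of_2tuple((1, 2, 3, 4)) == [(1, 2), (3, 4)]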
import torch
from safetensors.torch import load_file
# load kohya lora for diffusers pipeline
def load_lora_for_pipeline(
pipeline,
lora_path,
LORA_PREFIX_TRANSFORMER="",
LORA_PREFIX_TEXT_ENCODER="",
alpha=1.0,
device=0,
):
# load LoRA weight from .safetensors
state_dict = load_file(lora_path, device=device)
visited = []
# directly update weight in diffusers model
for key in state_dict:
# it is suggested to print out the key, it usually will be something like below
# "lora_te_text_model_encoder_layers_0_self_attn_k_proj.lora_down.weight"
# as we have set the alpha beforehand, so just skip
if ".alpha" in key or key in visited:
continue
if "text" in key:
layer_infos = (
key.split(".")[0].split(LORA_PREFIX_TEXT_ENCODER + "_")[-1].split("_")
)
curr_layer = pipeline.text_encoder
else:
layer_infos = (
key.split(".")[0].split(LORA_PREFIX_TRANSFORMER + "_")[-1].split("_")
)
curr_layer = pipeline.transformer
# find the target layer
temp_name = layer_infos.pop(0)
while len(layer_infos) > -1:
try:
curr_layer = curr_layer.__getattr__(temp_name)
if len(layer_infos) > 0:
temp_name = layer_infos.pop(0)
elif len(layer_infos) == 0:
break
except Exception:
if len(temp_name) > 0:
temp_name += "_" + layer_infos.pop(0)
else:
temp_name = layer_infos.pop(0)
pair_keys = []
if "lora_down" in key:
pair_keys.append(key.replace("lora_down", "lora_up"))
pair_keys.append(key)
else:
pair_keys.append(key)
pair_keys.append(key.replace("lora_up", "lora_down"))
# update weight
if len(state_dict[pair_keys[0]].shape) == 4:
weight_up = state_dict[pair_keys[0]].squeeze(3).squeeze(2).to(torch.float32)
weight_down = (
state_dict[pair_keys[1]].squeeze(3).squeeze(2).to(torch.float32)
)
curr_layer.weight.data += alpha * torch.mm(
weight_up, weight_down
).unsqueeze(2).unsqueeze(3)
else:
weight_up = state_dict[pair_keys[0]].to(torch.float32)
weight_down = state_dict[pair_keys[1]].to(torch.float32)
curr_layer.weight.data += alpha * torch.mm(weight_up, weight_down)
# update visited list
for item in pair_keys:
visited.append(item)
del state_dict
return pipeline
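# Hedged usage sketch: the LoRA file name and prefix below are illustrative assumptions.
# The prefix must match the leading part of the kohya keys in the .safetensors file
# (everything before the module path in each key), e.g. "lora_unet" for a typical export.
def _demo_load_lora_for_pipeline(pipeline):
    return load_lora_for_pipeline(
        pipeline,
        lora_path="lora/adapter.safetensors",       # hypothetical path
        LORA_PREFIX_TRANSFORMER="lora_unet",        # hypothetical prefix; depends on the exporter
        alpha=1.0,
        device="cpu",                               # safetensors.load_file accepts a device string
    )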
import argparse
import torch
from transformers import (
AutoProcessor,
LlavaForConditionalGeneration,
)
def preprocess_text_encoder_tokenizer(args):
processor = AutoProcessor.from_pretrained(args.input_dir)
model = LlavaForConditionalGeneration.from_pretrained(
args.input_dir,
torch_dtype=torch.float16,
low_cpu_mem_usage=True,
).to(0)
model.language_model.save_pretrained(f"{args.output_dir}")
processor.tokenizer.save_pretrained(f"{args.output_dir}")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--input_dir",
type=str,
required=True,
help="The path to the llava-llama-3-8b-v1_1-transformers.",
)
parser.add_argument(
"--output_dir",
type=str,
default="",
help="The output path of the llava-llama-3-8b-text-encoder-tokenizer."
"if '', the parent dir of output will be the same as input dir.",
)
args = parser.parse_args()
if len(args.output_dir) == 0:
args.output_dir = "/".join(args.input_dir.split("/")[:-1])
preprocess_text_encoder_tokenizer(args)
import random
import torchvision.transforms as transforms
import numpy as np
import torch
import imageio
import os
import PIL.Image
from typing import Union, Optional, List
from peft import get_peft_model_state_dict
from hyvideo.modules.posemb_layers import get_nd_rotary_pos_embed
from hyvideo.vae import AutoencoderKLCausal3D
from pathlib import Path
from einops import rearrange
from PIL import Image
from hyvideo.constants import PRECISION_TO_TYPE
from safetensors.torch import load_file
def convert_kohya_to_peft_keys(
kohya_dict: dict,
kohya_prefix="",
peft_prefix: str = "base_model.model",
device="cpu",
) -> dict:
peft_dict = {}
for k, v in kohya_dict.items():
if ".alpha" in k:
continue
new_key = k.replace(f"{kohya_prefix}_lora_", f"{peft_prefix}.")
new_key = new_key.replace("single_blocks_", "single_blocks.")
new_key = new_key.replace("double_blocks_", "double_blocks.")
new_key = new_key.replace("_img_attn_proj", ".img_attn_proj")
new_key = new_key.replace("_img_attn_qkv", ".img_attn_qkv")
new_key = new_key.replace("_img_mlp_fc", ".img_mlp.fc")
new_key = new_key.replace("_txt_mlp_fc", ".txt_mlp.fc")
new_key = new_key.replace("_img_mod", ".img_mod")
new_key = new_key.replace("_txt", ".txt")
new_key = new_key.replace("_modulation", ".modulation")
new_key = new_key.replace("_linear", ".linear")
new_key = new_key.replace("lora_down", "lora_A.default")
new_key = new_key.replace("lora_up", "lora_B.default")
new_key = new_key.replace(
"_individual_token_refiner_blocks_", ".individual_token_refiner.blocks."
)
new_key = new_key.replace("_mlp_fc", ".mlp.fc")
peft_dict[new_key] = v.to(device)
return peft_dict
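# Hedged illustration of the key mapping performed by convert_kohya_to_peft_keys; the
# sample key below is hypothetical but follows the "<prefix>_lora_<module path>" pattern
# that the replacements above expect.
def _demo_convert_kohya_to_peft_keys():
    kohya = {
        "Hunyuan_video_I2V_lora_double_blocks_0_img_attn_qkv.lora_down.weight": torch.zeros(4, 8),
    }
    peft = convert_kohya_to_peft_keys(kohya, kohya_prefix="Hunyuan_video_I2V")
    # -> {"base_model.model.double_blocks.0.img_attn_qkv.lora_A.default.weight": tensor(...)}
    return peft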
def load_lora(model, lora_path, device):
kohya_weights = load_file(lora_path)
peft_weights = convert_kohya_to_peft_keys(
kohya_weights, kohya_prefix="Hunyuan_video_I2V", device=device
)
model.load_state_dict(peft_weights, strict=False)
return model
def black_image(width, height):
black_image = Image.new("RGB", (width, height), (0, 0, 0))
return black_image
def numpy_to_pil(images: np.ndarray) -> List[PIL.Image.Image]:
if images.ndim == 3:
images = images[None, ...]
images = (images * 255).round().astype("uint8")
if images.shape[-1] == 1:
# special case for grayscale (single channel) images
pil_images = [Image.fromarray(image.squeeze(), mode="L") for image in images]
else:
pil_images = [Image.fromarray(image) for image in images]
return pil_images
def get_cond_latents(args, latents, vae):
"""get conditioned latent by decode and encode the first frame latents"""
first_image_latents = latents[:, :, 0, ...] if len(latents.shape) == 5 else latents
first_image_latents = 1 / vae.config.scaling_factor * first_image_latents
first_images = vae.decode(
first_image_latents.unsqueeze(2).to(vae.dtype), return_dict=False
)[0]
first_images = first_images.squeeze(2)
first_images = (first_images / 2 + 0.5).clamp(0, 1)
first_images = first_images.cpu().permute(0, 2, 3, 1).float().numpy()
first_images = numpy_to_pil(first_images)
image_transform = transforms.Compose(
[transforms.ToTensor(), transforms.Normalize([0.5], [0.5])]
)
first_images_pixel_values = [image_transform(image) for image in first_images]
first_images_pixel_values = (
torch.cat(first_images_pixel_values).unsqueeze(0).unsqueeze(2).to(vae.device)
)
vae_dtype = PRECISION_TO_TYPE[args.vae_precision]
with torch.autocast(
device_type="cuda", dtype=vae_dtype, enabled=vae_dtype != torch.float32
):
cond_latents = vae.encode(
first_images_pixel_values
).latent_dist.sample() # B, C, F, H, W
cond_latents.mul_(vae.config.scaling_factor)
return cond_latents
def get_cond_images(args, latents, vae, is_uncond=False):
"""get conditioned images by decode the first frame latents"""
semantic_image_latents = (
latents[:, :, 0, ...] if len(latents.shape) == 5 else latents
)
semantic_image_latents = 1 / vae.config.scaling_factor * semantic_image_latents
semantic_images = vae.decode(
semantic_image_latents.unsqueeze(2).to(vae.dtype), return_dict=False
)[0]
semantic_images = semantic_images.squeeze(2)
semantic_images = (semantic_images / 2 + 0.5).clamp(0, 1)
semantic_images = semantic_images.cpu().permute(0, 2, 3, 1).float().numpy()
semantic_images = numpy_to_pil(semantic_images)
if is_uncond:
semantic_images = [
black_image(img.size[0], img.size[1]) for img in semantic_images
]
return semantic_images
def load_state_dict(args, model, logger):
pretrained_model_path = Path(args.model_base)
if not pretrained_model_path.exists():
raise ValueError(f"`models_root` not exists: {pretrained_model_path}")
load_key = args.load_key
if args.i2v_mode:
dit_weight = Path(args.i2v_dit_weight)
else:
dit_weight = Path(args.dit_weight)
if dit_weight is None:
model_dir = pretrained_model_path / f"t2v_{args.model_resolution}"
files = list(model_dir.glob("*.pt"))
if len(files) == 0:
raise ValueError(f"No model weights found in {model_dir}")
if files[0].name.startswith("pytorch_model_"):
model_path = model_dir / f"pytorch_model_{load_key}.pt"
bare_model = True
elif any(str(f).endswith("_model_states.pt") for f in files):
files = [f for f in files if str(f).endswith("_model_states.pt")]
model_path = files[0]
if len(files) > 1:
logger.warning(
f"Multiple model weights found in {dit_weight}, using {model_path}"
)
bare_model = False
else:
raise ValueError(
f"Invalid model path: {dit_weight} with unrecognized weight format: "
f"{list(map(str, files))}. When given a directory as --dit-weight, only "
f"`pytorch_model_*.pt`(provided by HunyuanVideo official) and "
f"`*_model_states.pt`(saved by deepspeed) can be parsed. If you want to load a "
f"specific weight file, please provide the full path to the file."
)
else:
if dit_weight.is_dir():
files = list(dit_weight.glob("*.pt"))
if len(files) == 0:
raise ValueError(f"No model weights found in {dit_weight}")
if files[0].name.startswith("pytorch_model_"):
model_path = dit_weight / f"pytorch_model_{load_key}.pt"
bare_model = True
elif any(str(f).endswith("_model_states.pt") for f in files):
files = [f for f in files if str(f).endswith("_model_states.pt")]
model_path = files[0]
if len(files) > 1:
logger.warning(
f"Multiple model weights found in {dit_weight}, using {model_path}"
)
bare_model = False
else:
raise ValueError(
f"Invalid model path: {dit_weight} with unrecognized weight format: "
f"{list(map(str, files))}. When given a directory as --dit-weight, only "
f"`pytorch_model_*.pt`(provided by HunyuanVideo official) and "
f"`*_model_states.pt`(saved by deepspeed) can be parsed. If you want to load a "
f"specific weight file, please provide the full path to the file."
)
elif dit_weight.is_file():
model_path = dit_weight
bare_model = "unknown"
else:
raise ValueError(f"Invalid model path: {dit_weight}")
if not model_path.exists():
raise ValueError(f"model_path not exists: {model_path}")
logger.info(f"Loading torch model {model_path}...")
state_dict = torch.load(model_path, map_location=lambda storage, loc: storage)
if bare_model == "unknown" and ("ema" in state_dict or "module" in state_dict):
bare_model = False
if bare_model is False:
if load_key in state_dict:
state_dict = state_dict[load_key]
else:
raise KeyError(
f"Missing key: `{load_key}` in the checkpoint: {model_path}. The keys in the checkpoint "
f"are: {list(state_dict.keys())}."
)
model.load_state_dict(state_dict, strict=True)
return model
class set_worker_seed_builder:
def __init__(self, global_rank):
self.global_rank = global_rank
def __call__(self, worker_id):
set_manual_seed(torch.initial_seed() % (2 ** 32 - 1))
def set_reproducibility(enable, global_seed=None):
if enable:
# Configure the seed for reproducibility
set_manual_seed(global_seed)
# Set following debug environment variable
# See the link for details: https://docs.nvidia.com/cuda/cublas/index.html#results-reproducibility
os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
# Cudnn benchmarking
torch.backends.cudnn.benchmark = not enable
# Use deterministic algorithms in PyTorch
torch.use_deterministic_algorithms(enable)
# LSTM and RNN networks are not deterministic
def prepare_model_inputs(
args,
batch: tuple,
device: Union[int, str],
model,
vae,
text_encoder,
text_encoder_2=None,
rope_theta_rescale_factor: Union[float, List[float]] = 1.0,
rope_interpolation_factor: Union[float, List[float]] = 1.0,
):
media, latents, *batch_args = batch
if len(batch_args) == 3:
text_ids, text_mask, kwargs = batch_args
text_ids_2, text_mask_2 = None, None
elif len(batch_args) == 5:
text_ids, text_mask, text_ids_2, text_mask_2, kwargs = batch_args
else:
raise ValueError(f"Unexpected batch_args.")
data_type = kwargs["type"][0]
# Move batch to device
media = media.to(device)
latents = latents.to(device)
text_ids = text_ids.to(device)
text_mask = text_mask.to(device)
# ======================================== Encode media ======================================
# Used for a 3D VAE with 2D inputs (images).
# Prepare media shape for 2D/3D VAE
if len(latents.shape) == 1:
if len(media.shape) == 4:
# media is a batch of image with shape [b, c, h, w]
if isinstance(vae, AutoencoderKLCausal3D):
media = media.unsqueeze(2) # [b, c, 1, h, w]
elif len(media.shape) == 5:
# media is a batch of video with shape [b, c, f, h, w]
if not isinstance(vae, AutoencoderKLCausal3D):
media = rearrange(media, "b c f h w -> (b f) c h w")
else:
raise ValueError(
f"Only support media with shape (b, c, h, w) or (b, c, f, h, w), but got {media.shape}."
)
vae_dtype = PRECISION_TO_TYPE[args.vae_precision]
with torch.autocast(
device_type="cuda", dtype=vae_dtype, enabled=vae_dtype != torch.float32
):
latents = vae.encode(media).latent_dist.sample()
if hasattr(vae.config, "shift_factor") and vae.config.shift_factor:
latents.sub_(vae.config.shift_factor).mul_(vae.config.scaling_factor)
else:
latents.mul_(vae.config.scaling_factor)
elif len(latents.shape) == 5 or len(latents.shape) == 4: # Using video/image cache
latents = (
latents * vae.config.scaling_factor
) # vae cache is not multiplied by scaling_factor
else:
raise ValueError(
f"Only support media/latent with shape (b, c, h, w) or (b, c, f, h, w), but got {media.shape} {latents.shape}."
)
cond_latents = get_cond_latents(args, latents, vae)
is_uncond = (
torch.tensor(1).to(torch.int64)
if random.random() < args.sematic_cond_drop_p
else torch.tensor(0).to(torch.int64)
)
semantic_images = get_cond_images(args, latents, vae, is_uncond=is_uncond)
# ======================================== Encode text ======================================
# Autocast is handled by text_encoder itself.
# Whether to apply text_mask is determined by args.use_attention_mask.
text_outputs = text_encoder.encode(
{"input_ids": text_ids, "attention_mask": text_mask},
data_type=batch_args[-1]["type"][0],
semantic_images=semantic_images,
)
text_states = text_outputs.hidden_state
text_mask = text_outputs.attention_mask
text_states_2 = (
text_encoder_2.encode(
{"input_ids": text_ids_2, "attention_mask": text_mask_2},
data_type=data_type,
).hidden_state
if text_encoder_2 is not None
else None
)
# ======================================== Build RoPE ======================================
target_ndim = 3 # n-d RoPE
ndim = len(latents.shape) - 2
latents_size = list(latents.shape[-ndim:])
freqs_cos, freqs_sin = get_rope_freq_from_size(
args,
model,
latents_size,
ndim,
target_ndim,
rope_theta_rescale_factor=rope_theta_rescale_factor,
rope_interpolation_factor=rope_interpolation_factor,
)
# ===================================== Pack model kwargs ==================================
model_kwargs = dict(
text_states=text_states, # [b, 256, 4096]
text_mask=text_mask, # [b, 256]
text_states_2=text_states_2, # [b, 768]
freqs_cos=freqs_cos, # [seqlen, head_dim]
freqs_sin=freqs_sin, # [seqlen, head_dim]
return_dict=True,
)
return latents, model_kwargs, freqs_cos.shape[0], cond_latents
def format_params(params):
if params < 1e6:
return f"{params} (less than 1M)"
elif params < 1e9:
return f"{params / 1e6:.2f}M"
else:
return f"{params / 1e9:.2f}B"
def set_manual_seed(global_seed):
random.seed(global_seed)
np.random.seed(global_seed)
torch.manual_seed(global_seed)
def get_rope_freq_from_size(
args,
model,
latents_size,
ndim,
target_ndim,
rope_theta_rescale_factor=1.0,
rope_interpolation_factor=1.0,
):
if isinstance(model.patch_size, int):
assert all(s % model.patch_size == 0 for s in latents_size), (
f"Latent size(last {ndim} dimensions) should be divisible by patch size({model.patch_size}), "
f"but got {latents_size}."
)
rope_sizes = [s // model.patch_size for s in latents_size]
elif isinstance(model.patch_size, list):
assert all(
s % model.patch_size[idx] == 0 for idx, s in enumerate(latents_size)
), (
f"Latent size(last {ndim} dimensions) should be divisible by patch size({model.patch_size}), "
f"but got {latents_size}."
)
rope_sizes = [s // model.patch_size[idx] for idx, s in enumerate(latents_size)]
if len(rope_sizes) != target_ndim:
rope_sizes = [1] * (target_ndim - len(rope_sizes)) + rope_sizes # time axis
head_dim = model.hidden_size // model.heads_num
rope_dim_list = model.rope_dim_list
if rope_dim_list is None:
rope_dim_list = [head_dim // target_ndim for _ in range(target_ndim)]
assert (
sum(rope_dim_list) == head_dim
), "sum(rope_dim_list) should equal to head_dim of attention layer"
freqs_cos, freqs_sin = get_nd_rotary_pos_embed(
rope_dim_list,
rope_sizes,
theta=args.rope_theta,
use_real=True,
theta_rescale_factor=rope_theta_rescale_factor,
interpolation_factor=rope_interpolation_factor,
)
return freqs_cos, freqs_sin
# copy from https://github.com/huggingface/diffusers/blob/ec9bfa9e148b7764137dd92247ce859d915abcb0/examples/consistency_distillation/train_lcm_distill_lora_sd_wds.py#L258
# get kohya lora state dict
def get_module_kohya_state_dict(module, prefix, dtype, adapter_name="default"):
kohya_ss_state_dict = {}
for peft_key, weight in get_peft_model_state_dict(
module, adapter_name=adapter_name
).items():
kohya_key = peft_key.replace("base_model.model", prefix)
kohya_key = kohya_key.replace("lora_A", "lora_down")
kohya_key = kohya_key.replace("lora_B", "lora_up")
kohya_key = kohya_key.replace(".", "_", kohya_key.count(".") - 2)
kohya_ss_state_dict[kohya_key] = weight.to(dtype)
# Set alpha parameter
if "lora_down" in kohya_key:
alpha_key = f'{kohya_key.split(".")[0]}.alpha'
kohya_ss_state_dict[alpha_key] = torch.tensor(
module.peft_config[adapter_name].lora_alpha
).to(dtype)
return kohya_ss_state_dict
# get diffusers lora state dict
def get_module_diffusers_state_dict(module, dtype, adapter_name="default"):
diffusers_ss_state_dict = {}
for peft_key, weight in get_peft_model_state_dict(
module, adapter_name=adapter_name
).items():
diffusers_key = peft_key.replace("base_model.model", "diffusion_model")
diffusers_ss_state_dict[diffusers_key] = weight.to(dtype)
return diffusers_ss_state_dict
from pathlib import Path
import torch
from .autoencoder_kl_causal_3d import AutoencoderKLCausal3D
from ..constants import VAE_PATH, PRECISION_TO_TYPE
def load_vae(vae_type: str="884-16c-hy",
vae_precision: str=None,
sample_size: tuple=None,
vae_path: str=None,
logger=None,
device=None
):
"""the fucntion to load the 3D VAE model
Args:
vae_type (str): the type of the 3D VAE model. Defaults to "884-16c-hy".
vae_precision (str, optional): the precision to load vae. Defaults to None.
sample_size (tuple, optional): the tiling size. Defaults to None.
vae_path (str, optional): the path to vae. Defaults to None.
logger (_type_, optional): logger. Defaults to None.
device (_type_, optional): device to load vae. Defaults to None.
"""
if vae_path is None:
vae_path = VAE_PATH[vae_type]
if logger is not None:
logger.info(f"Loading 3D VAE model ({vae_type}) from: {vae_path}")
config = AutoencoderKLCausal3D.load_config(vae_path)
if sample_size:
vae = AutoencoderKLCausal3D.from_config(config, sample_size=sample_size)
else:
vae = AutoencoderKLCausal3D.from_config(config)
vae_ckpt = Path(vae_path) / "pytorch_model.pt"
assert vae_ckpt.exists(), f"VAE checkpoint not found: {vae_ckpt}"
ckpt = torch.load(vae_ckpt, map_location=vae.device)
if "state_dict" in ckpt:
ckpt = ckpt["state_dict"]
if any(k.startswith("vae.") for k in ckpt.keys()):
ckpt = {k.replace("vae.", ""): v for k, v in ckpt.items() if k.startswith("vae.")}
vae.load_state_dict(ckpt)
spatial_compression_ratio = vae.config.spatial_compression_ratio
time_compression_ratio = vae.config.time_compression_ratio
if vae_precision is not None:
vae = vae.to(dtype=PRECISION_TO_TYPE[vae_precision])
vae.requires_grad_(False)
if logger is not None:
logger.info(f"VAE to dtype: {vae.dtype}")
if device is not None:
vae = vae.to(device)
vae.eval()
return vae, vae_path, spatial_compression_ratio, time_compression_ratio
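# Hedged usage sketch for load_vae: the precision, tiling sample size, and device are
# illustrative; "fp16" is assumed to be a valid key of PRECISION_TO_TYPE, and vae_path
# falls back to VAE_PATH["884-16c-hy"] when left as None.
def _demo_load_vae(logger=None):
    vae, vae_path, spatial_ratio, time_ratio = load_vae(
        vae_type="884-16c-hy",
        vae_precision="fp16",
        sample_size=(256, 256),
        vae_path=None,
        logger=logger,
        device="cuda" if torch.cuda.is_available() else "cpu",
    )
    return vae, spatial_ratio, time_ratio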
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
#
# Modified from diffusers==0.29.2
#
# ==============================================================================
from typing import Dict, Optional, Tuple, Union
from dataclasses import dataclass
import torch
import torch.nn as nn
from diffusers.configuration_utils import ConfigMixin, register_to_config
try:
# This diffusers is modified and packed in the mirror.
from diffusers.loaders import FromOriginalVAEMixin
except ImportError:
# Use this to be compatible with the original diffusers.
from diffusers.loaders.single_file_model import FromOriginalModelMixin as FromOriginalVAEMixin
from diffusers.utils.accelerate_utils import apply_forward_hook
from diffusers.models.attention_processor import (
ADDED_KV_ATTENTION_PROCESSORS,
CROSS_ATTENTION_PROCESSORS,
Attention,
AttentionProcessor,
AttnAddedKVProcessor,
AttnProcessor,
)
from diffusers.models.modeling_outputs import AutoencoderKLOutput
from diffusers.models.modeling_utils import ModelMixin
from .vae import DecoderCausal3D, BaseOutput, DecoderOutput, DiagonalGaussianDistribution, EncoderCausal3D
@dataclass
class DecoderOutput2(BaseOutput):
sample: torch.FloatTensor
posterior: Optional[DiagonalGaussianDistribution] = None
class AutoencoderKLCausal3D(ModelMixin, ConfigMixin, FromOriginalVAEMixin):
r"""
A VAE model with KL loss for encoding images/videos into latents and decoding latent representations into images/videos.
This model inherits from [`ModelMixin`]. Check the superclass documentation for its generic methods implemented
for all models (such as downloading or saving).
"""
_supports_gradient_checkpointing = True
@register_to_config
def __init__(
self,
in_channels: int = 3,
out_channels: int = 3,
down_block_types: Tuple[str] = ("DownEncoderBlockCausal3D",),
up_block_types: Tuple[str] = ("UpDecoderBlockCausal3D",),
block_out_channels: Tuple[int] = (64,),
layers_per_block: int = 1,
act_fn: str = "silu",
latent_channels: int = 4,
norm_num_groups: int = 32,
sample_size: int = 32,
sample_tsize: int = 64,
scaling_factor: float = 0.18215,
force_upcast: float = True,
spatial_compression_ratio: int = 8,
time_compression_ratio: int = 4,
mid_block_add_attention: bool = True,
):
super().__init__()
self.time_compression_ratio = time_compression_ratio
self.encoder = EncoderCausal3D(
in_channels=in_channels,
out_channels=latent_channels,
down_block_types=down_block_types,
block_out_channels=block_out_channels,
layers_per_block=layers_per_block,
act_fn=act_fn,
norm_num_groups=norm_num_groups,
double_z=True,
time_compression_ratio=time_compression_ratio,
spatial_compression_ratio=spatial_compression_ratio,
mid_block_add_attention=mid_block_add_attention,
)
self.decoder = DecoderCausal3D(
in_channels=latent_channels,
out_channels=out_channels,
up_block_types=up_block_types,
block_out_channels=block_out_channels,
layers_per_block=layers_per_block,
norm_num_groups=norm_num_groups,
act_fn=act_fn,
time_compression_ratio=time_compression_ratio,
spatial_compression_ratio=spatial_compression_ratio,
mid_block_add_attention=mid_block_add_attention,
)
self.quant_conv = nn.Conv3d(2 * latent_channels, 2 * latent_channels, kernel_size=1)
self.post_quant_conv = nn.Conv3d(latent_channels, latent_channels, kernel_size=1)
self.use_slicing = False
self.use_spatial_tiling = False
self.use_temporal_tiling = False
# only relevant if vae tiling is enabled
self.tile_sample_min_tsize = sample_tsize
self.tile_latent_min_tsize = sample_tsize // time_compression_ratio
self.tile_sample_min_size = self.config.sample_size
sample_size = (
self.config.sample_size[0]
if isinstance(self.config.sample_size, (list, tuple))
else self.config.sample_size
)
self.tile_latent_min_size = int(sample_size / (2 ** (len(self.config.block_out_channels) - 1)))
self.tile_overlap_factor = 0.25
def _set_gradient_checkpointing(self, module, value=False):
if isinstance(module, (EncoderCausal3D, DecoderCausal3D)):
module.gradient_checkpointing = value
def enable_temporal_tiling(self, use_tiling: bool = True):
self.use_temporal_tiling = use_tiling
def disable_temporal_tiling(self):
self.enable_temporal_tiling(False)
def enable_spatial_tiling(self, use_tiling: bool = True):
self.use_spatial_tiling = use_tiling
def disable_spatial_tiling(self):
self.enable_spatial_tiling(False)
def enable_tiling(self, use_tiling: bool = True):
r"""
Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
compute decoding and encoding in several steps. This is useful to save a large amount of memory and to allow
processing larger videos.
"""
self.enable_spatial_tiling(use_tiling)
self.enable_temporal_tiling(use_tiling)
def disable_tiling(self):
r"""
Disable tiled VAE decoding. If `enable_tiling` was previously enabled, this method will go back to computing
decoding in one step.
"""
self.disable_spatial_tiling()
self.disable_temporal_tiling()
def enable_slicing(self):
r"""
Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
"""
self.use_slicing = True
def disable_slicing(self):
r"""
Disable sliced VAE decoding. If `enable_slicing` was previously enabled, this method will go back to computing
decoding in one step.
"""
self.use_slicing = False
@property
# Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.attn_processors
def attn_processors(self) -> Dict[str, AttentionProcessor]:
r"""
Returns:
`dict` of attention processors: A dictionary containing all attention processors used in the model,
indexed by their weight names.
"""
# set recursively
processors = {}
def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
if hasattr(module, "get_processor"):
processors[f"{name}.processor"] = module.get_processor(return_deprecated_lora=True)
for sub_name, child in module.named_children():
fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
return processors
for name, module in self.named_children():
fn_recursive_add_processors(name, module, processors)
return processors
# Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.set_attn_processor
def set_attn_processor(
self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]], _remove_lora=False
):
r"""
Sets the attention processor to use to compute attention.
Parameters:
processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
The instantiated processor class or a dictionary of processor classes that will be set as the processor
for **all** `Attention` layers.
If `processor` is a dict, the key needs to define the path to the corresponding cross attention
processor. This is strongly recommended when setting trainable attention processors.
"""
count = len(self.attn_processors.keys())
if isinstance(processor, dict) and len(processor) != count:
raise ValueError(
f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
)
def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
if hasattr(module, "set_processor"):
if not isinstance(processor, dict):
module.set_processor(processor, _remove_lora=_remove_lora)
else:
module.set_processor(processor.pop(f"{name}.processor"), _remove_lora=_remove_lora)
for sub_name, child in module.named_children():
fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
for name, module in self.named_children():
fn_recursive_attn_processor(name, module, processor)
# Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.set_default_attn_processor
def set_default_attn_processor(self):
"""
Disables custom attention processors and sets the default attention implementation.
"""
if all(proc.__class__ in ADDED_KV_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
processor = AttnAddedKVProcessor()
elif all(proc.__class__ in CROSS_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
processor = AttnProcessor()
else:
raise ValueError(
f"Cannot call `set_default_attn_processor` when attention processors are of type {next(iter(self.attn_processors.values()))}"
)
self.set_attn_processor(processor, _remove_lora=True)
@apply_forward_hook
def encode(
self, x: torch.FloatTensor, return_dict: bool = True
) -> Union[AutoencoderKLOutput, Tuple[DiagonalGaussianDistribution]]:
"""
Encode a batch of images/videos into latents.
Args:
x (`torch.FloatTensor`): Input batch of images/videos.
return_dict (`bool`, *optional*, defaults to `True`):
Whether to return a [`~models.autoencoder_kl.AutoencoderKLOutput`] instead of a plain tuple.
Returns:
The latent representations of the encoded images/videos. If `return_dict` is True, a
[`~models.autoencoder_kl.AutoencoderKLOutput`] is returned, otherwise a plain `tuple` is returned.
"""
assert len(x.shape) == 5, "The input tensor should have 5 dimensions."
if self.use_temporal_tiling and x.shape[2] > self.tile_sample_min_tsize:
return self.temporal_tiled_encode(x, return_dict=return_dict)
if self.use_spatial_tiling and (x.shape[-1] > self.tile_sample_min_size or x.shape[-2] > self.tile_sample_min_size):
return self.spatial_tiled_encode(x, return_dict=return_dict)
if self.use_slicing and x.shape[0] > 1:
encoded_slices = [self.encoder(x_slice) for x_slice in x.split(1)]
h = torch.cat(encoded_slices)
else:
h = self.encoder(x)
moments = self.quant_conv(h)
posterior = DiagonalGaussianDistribution(moments)
if not return_dict:
return (posterior,)
return AutoencoderKLOutput(latent_dist=posterior)
def _decode(self, z: torch.FloatTensor, return_dict: bool = True) -> Union[DecoderOutput, torch.FloatTensor]:
assert len(z.shape) == 5, "The input tensor should have 5 dimensions."
if self.use_temporal_tiling and z.shape[2] > self.tile_latent_min_tsize:
return self.temporal_tiled_decode(z, return_dict=return_dict)
if self.use_spatial_tiling and (z.shape[-1] > self.tile_latent_min_size or z.shape[-2] > self.tile_latent_min_size):
return self.spatial_tiled_decode(z, return_dict=return_dict)
z = self.post_quant_conv(z)
dec = self.decoder(z)
if not return_dict:
return (dec,)
return DecoderOutput(sample=dec)
@apply_forward_hook
def decode(
self, z: torch.FloatTensor, return_dict: bool = True, generator=None
) -> Union[DecoderOutput, torch.FloatTensor]:
"""
Decode a batch of images/videos.
Args:
z (`torch.FloatTensor`): Input batch of latent vectors.
return_dict (`bool`, *optional*, defaults to `True`):
Whether to return a [`~models.vae.DecoderOutput`] instead of a plain tuple.
Returns:
[`~models.vae.DecoderOutput`] or `tuple`:
If return_dict is True, a [`~models.vae.DecoderOutput`] is returned, otherwise a plain `tuple` is
returned.
"""
if self.use_slicing and z.shape[0] > 1:
decoded_slices = [self._decode(z_slice).sample for z_slice in z.split(1)]
decoded = torch.cat(decoded_slices)
else:
decoded = self._decode(z).sample
if not return_dict:
return (decoded,)
return DecoderOutput(sample=decoded)
def blend_v(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor:
blend_extent = min(a.shape[-2], b.shape[-2], blend_extent)
for y in range(blend_extent):
b[:, :, :, y, :] = a[:, :, :, -blend_extent + y, :] * (1 - y / blend_extent) + b[:, :, :, y, :] * (y / blend_extent)
return b
def blend_h(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor:
blend_extent = min(a.shape[-1], b.shape[-1], blend_extent)
for x in range(blend_extent):
b[:, :, :, :, x] = a[:, :, :, :, -blend_extent + x] * (1 - x / blend_extent) + b[:, :, :, :, x] * (x / blend_extent)
return b
def blend_t(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor:
blend_extent = min(a.shape[-3], b.shape[-3], blend_extent)
for x in range(blend_extent):
b[:, :, x, :, :] = a[:, :, -blend_extent + x, :, :] * (1 - x / blend_extent) + b[:, :, x, :, :] * (x / blend_extent)
return b
def spatial_tiled_encode(self, x: torch.FloatTensor, return_dict: bool = True, return_moments: bool = False) -> AutoencoderKLOutput:
r"""Encode a batch of images/videos using a tiled encoder.
When this option is enabled, the VAE will split the input tensor into tiles to compute encoding in several
steps. This is useful to keep memory use constant regardless of image/video size. The end result of tiled encoding is
different from non-tiled encoding because each tile uses a different encoder. To avoid tiling artifacts, the
tiles overlap and are blended together to form a smooth output. You may still see tile-sized changes in the
output, but they should be much less noticeable.
Args:
x (`torch.FloatTensor`): Input batch of images/videos.
return_dict (`bool`, *optional*, defaults to `True`):
Whether or not to return a [`~models.autoencoder_kl.AutoencoderKLOutput`] instead of a plain tuple.
Returns:
[`~models.autoencoder_kl.AutoencoderKLOutput`] or `tuple`:
If return_dict is True, a [`~models.autoencoder_kl.AutoencoderKLOutput`] is returned, otherwise a plain
`tuple` is returned.
"""
overlap_size = int(self.tile_sample_min_size * (1 - self.tile_overlap_factor))
blend_extent = int(self.tile_latent_min_size * self.tile_overlap_factor)
row_limit = self.tile_latent_min_size - blend_extent
# Split video into tiles and encode them separately.
rows = []
for i in range(0, x.shape[-2], overlap_size):
row = []
for j in range(0, x.shape[-1], overlap_size):
tile = x[:, :, :, i: i + self.tile_sample_min_size, j: j + self.tile_sample_min_size]
tile = self.encoder(tile)
tile = self.quant_conv(tile)
row.append(tile)
rows.append(row)
result_rows = []
for i, row in enumerate(rows):
result_row = []
for j, tile in enumerate(row):
# blend the above tile and the left tile
# to the current tile and add the current tile to the result row
if i > 0:
tile = self.blend_v(rows[i - 1][j], tile, blend_extent)
if j > 0:
tile = self.blend_h(row[j - 1], tile, blend_extent)
result_row.append(tile[:, :, :, :row_limit, :row_limit])
result_rows.append(torch.cat(result_row, dim=-1))
moments = torch.cat(result_rows, dim=-2)
if return_moments:
return moments
posterior = DiagonalGaussianDistribution(moments)
if not return_dict:
return (posterior,)
return AutoencoderKLOutput(latent_dist=posterior)
def spatial_tiled_decode(self, z: torch.FloatTensor, return_dict: bool = True) -> Union[DecoderOutput, torch.FloatTensor]:
r"""
Decode a batch of images/videos using a tiled decoder.
Args:
z (`torch.FloatTensor`): Input batch of latent vectors.
return_dict (`bool`, *optional*, defaults to `True`):
Whether or not to return a [`~models.vae.DecoderOutput`] instead of a plain tuple.
Returns:
[`~models.vae.DecoderOutput`] or `tuple`:
If return_dict is True, a [`~models.vae.DecoderOutput`] is returned, otherwise a plain `tuple` is
returned.
"""
overlap_size = int(self.tile_latent_min_size * (1 - self.tile_overlap_factor))
blend_extent = int(self.tile_sample_min_size * self.tile_overlap_factor)
row_limit = self.tile_sample_min_size - blend_extent
# Split z into overlapping tiles and decode them separately.
# The tiles have an overlap to avoid seams between tiles.
rows = []
for i in range(0, z.shape[-2], overlap_size):
row = []
for j in range(0, z.shape[-1], overlap_size):
tile = z[:, :, :, i: i + self.tile_latent_min_size, j: j + self.tile_latent_min_size]
tile = self.post_quant_conv(tile)
decoded = self.decoder(tile)
row.append(decoded)
rows.append(row)
result_rows = []
for i, row in enumerate(rows):
result_row = []
for j, tile in enumerate(row):
# blend the above tile and the left tile
# to the current tile and add the current tile to the result row
if i > 0:
tile = self.blend_v(rows[i - 1][j], tile, blend_extent)
if j > 0:
tile = self.blend_h(row[j - 1], tile, blend_extent)
result_row.append(tile[:, :, :, :row_limit, :row_limit])
result_rows.append(torch.cat(result_row, dim=-1))
dec = torch.cat(result_rows, dim=-2)
if not return_dict:
return (dec,)
return DecoderOutput(sample=dec)
def temporal_tiled_encode(self, x: torch.FloatTensor, return_dict: bool = True) -> AutoencoderKLOutput:
B, C, T, H, W = x.shape
overlap_size = int(self.tile_sample_min_tsize * (1 - self.tile_overlap_factor))
blend_extent = int(self.tile_latent_min_tsize * self.tile_overlap_factor)
t_limit = self.tile_latent_min_tsize - blend_extent
# Split the video into tiles and encode them separately.
row = []
for i in range(0, T, overlap_size):
tile = x[:, :, i: i + self.tile_sample_min_tsize + 1, :, :]
if self.use_spatial_tiling and (tile.shape[-1] > self.tile_sample_min_size or tile.shape[-2] > self.tile_sample_min_size):
tile = self.spatial_tiled_encode(tile, return_moments=True)
else:
tile = self.encoder(tile)
tile = self.quant_conv(tile)
if i > 0:
tile = tile[:, :, 1:, :, :]
row.append(tile)
result_row = []
for i, tile in enumerate(row):
if i > 0:
tile = self.blend_t(row[i - 1], tile, blend_extent)
result_row.append(tile[:, :, :t_limit, :, :])
else:
result_row.append(tile[:, :, :t_limit + 1, :, :])
moments = torch.cat(result_row, dim=2)
posterior = DiagonalGaussianDistribution(moments)
if not return_dict:
return (posterior,)
return AutoencoderKLOutput(latent_dist=posterior)
def temporal_tiled_decode(self, z: torch.FloatTensor, return_dict: bool = True) -> Union[DecoderOutput, torch.FloatTensor]:
# Split z into overlapping tiles and decode them separately.
B, C, T, H, W = z.shape
overlap_size = int(self.tile_latent_min_tsize * (1 - self.tile_overlap_factor))
blend_extent = int(self.tile_sample_min_tsize * self.tile_overlap_factor)
t_limit = self.tile_sample_min_tsize - blend_extent
row = []
for i in range(0, T, overlap_size):
tile = z[:, :, i: i + self.tile_latent_min_tsize + 1, :, :]
if self.use_spatial_tiling and (tile.shape[-1] > self.tile_latent_min_size or tile.shape[-2] > self.tile_latent_min_size):
decoded = self.spatial_tiled_decode(tile, return_dict=True).sample
else:
tile = self.post_quant_conv(tile)
decoded = self.decoder(tile)
if i > 0:
decoded = decoded[:, :, 1:, :, :]
row.append(decoded)
result_row = []
for i, tile in enumerate(row):
if i > 0:
tile = self.blend_t(row[i - 1], tile, blend_extent)
result_row.append(tile[:, :, :t_limit, :, :])
else:
result_row.append(tile[:, :, :t_limit + 1, :, :])
dec = torch.cat(result_row, dim=2)
if not return_dict:
return (dec,)
return DecoderOutput(sample=dec)
def forward(
self,
sample: torch.FloatTensor,
sample_posterior: bool = False,
return_dict: bool = True,
return_posterior: bool = False,
generator: Optional[torch.Generator] = None,
) -> Union[DecoderOutput2, torch.FloatTensor]:
r"""
Args:
sample (`torch.FloatTensor`): Input sample.
sample_posterior (`bool`, *optional*, defaults to `False`):
Whether to sample from the posterior.
return_dict (`bool`, *optional*, defaults to `True`):
Whether or not to return a [`DecoderOutput`] instead of a plain tuple.
"""
x = sample
posterior = self.encode(x).latent_dist
if sample_posterior:
z = posterior.sample(generator=generator)
else:
z = posterior.mode()
dec = self.decode(z).sample
if not return_dict:
if return_posterior:
return (dec, posterior)
else:
return (dec,)
if return_posterior:
return DecoderOutput2(sample=dec, posterior=posterior)
else:
return DecoderOutput2(sample=dec)
# Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.fuse_qkv_projections
def fuse_qkv_projections(self):
"""
Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query,
key, value) are fused. For cross-attention modules, key and value projection matrices are fused.
<Tip warning={true}>
This API is 🧪 experimental.
</Tip>
"""
self.original_attn_processors = None
for _, attn_processor in self.attn_processors.items():
if "Added" in str(attn_processor.__class__.__name__):
raise ValueError("`fuse_qkv_projections()` is not supported for models having added KV projections.")
self.original_attn_processors = self.attn_processors
for module in self.modules():
if isinstance(module, Attention):
module.fuse_projections(fuse=True)
# Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.unfuse_qkv_projections
def unfuse_qkv_projections(self):
"""Disables the fused QKV projection if enabled.
<Tip warning={true}>
This API is 🧪 experimental.
</Tip>
"""
if self.original_attn_processors is not None:
self.set_attn_processor(self.original_attn_processors)
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
#
# Modified from diffusers==0.29.2
#
# ==============================================================================
from typing import Optional, Tuple, Union
import torch
import torch.nn.functional as F
from torch import nn
from einops import rearrange
from diffusers.utils import logging
from diffusers.models.activations import get_activation
from diffusers.models.attention_processor import SpatialNorm
from diffusers.models.attention_processor import Attention
from diffusers.models.normalization import AdaGroupNorm
from diffusers.models.normalization import RMSNorm
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
def prepare_causal_attention_mask(n_frame: int, n_hw: int, dtype, device, batch_size: int = None):
seq_len = n_frame * n_hw
mask = torch.full((seq_len, seq_len), float("-inf"), dtype=dtype, device=device)
for i in range(seq_len):
i_frame = i // n_hw
mask[i, : (i_frame + 1) * n_hw] = 0
if batch_size is not None:
mask = mask.unsqueeze(0).expand(batch_size, -1, -1)
return mask
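# Hedged illustration of prepare_causal_attention_mask: with 3 frames of 2 spatial
# tokens each, a row may attend to every token of its own frame and of earlier frames
# (0.0) and is blocked (-inf) from later frames.
def _demo_prepare_causal_attention_mask():
    mask = prepare_causal_attention_mask(n_frame=3, n_hw=2, dtype=torch.float32, device="cpu")
    assert mask.shape == (6, 6)
    assert mask[0, 1].item() == 0.0            # same frame: visible
    assert mask[0, 2].item() == float("-inf")  # later frame: masked
    assert mask[5, 0].item() == 0.0            # earlier frame: visible
    return mask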
class CausalConv3d(nn.Module):
"""
Implements a causal 3D convolution layer where each position only depends on previous timesteps and current spatial locations.
This maintains temporal causality in video generation tasks.
"""
def __init__(
self,
chan_in,
chan_out,
kernel_size: Union[int, Tuple[int, int, int]],
stride: Union[int, Tuple[int, int, int]] = 1,
dilation: Union[int, Tuple[int, int, int]] = 1,
pad_mode='replicate',
**kwargs
):
super().__init__()
self.pad_mode = pad_mode
padding = (kernel_size // 2, kernel_size // 2, kernel_size // 2, kernel_size // 2, kernel_size - 1, 0) # W, H, T
self.time_causal_padding = padding
self.conv = nn.Conv3d(chan_in, chan_out, kernel_size, stride=stride, dilation=dilation, **kwargs)
def forward(self, x):
x = F.pad(x, self.time_causal_padding, mode=self.pad_mode)
return self.conv(x)
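# Hedged shape check for CausalConv3d: with the (W, H, T) padding above, a kernel-3
# convolution keeps the temporal and spatial sizes unchanged while only looking at the
# current and previous frames. The channel counts below are illustrative.
def _demo_causal_conv3d():
    conv = CausalConv3d(chan_in=4, chan_out=8, kernel_size=3)
    x = torch.randn(1, 4, 5, 16, 16)  # (B, C, T, H, W)
    y = conv(x)
    assert y.shape == (1, 8, 5, 16, 16)
    return y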
class UpsampleCausal3D(nn.Module):
"""
A 3D upsampling layer with an optional convolution.
"""
def __init__(
self,
channels: int,
use_conv: bool = False,
use_conv_transpose: bool = False,
out_channels: Optional[int] = None,
name: str = "conv",
kernel_size: Optional[int] = None,
padding=1,
norm_type=None,
eps=None,
elementwise_affine=None,
bias=True,
interpolate=True,
upsample_factor=(2, 2, 2),
):
super().__init__()
self.channels = channels
self.out_channels = out_channels or channels
self.use_conv = use_conv
self.use_conv_transpose = use_conv_transpose
self.name = name
self.interpolate = interpolate
self.upsample_factor = upsample_factor
if norm_type == "ln_norm":
self.norm = nn.LayerNorm(channels, eps, elementwise_affine)
elif norm_type == "rms_norm":
self.norm = RMSNorm(channels, eps, elementwise_affine)
elif norm_type is None:
self.norm = None
else:
raise ValueError(f"unknown norm_type: {norm_type}")
conv = None
if use_conv_transpose:
raise NotImplementedError
elif use_conv:
if kernel_size is None:
kernel_size = 3
conv = CausalConv3d(self.channels, self.out_channels, kernel_size=kernel_size, bias=bias)
if name == "conv":
self.conv = conv
else:
self.Conv2d_0 = conv
def forward(
self,
hidden_states: torch.FloatTensor,
output_size: Optional[int] = None,
scale: float = 1.0,
) -> torch.FloatTensor:
assert hidden_states.shape[1] == self.channels
if self.norm is not None:
raise NotImplementedError
if self.use_conv_transpose:
return self.conv(hidden_states)
# Cast to float32 as the 'upsample_nearest2d_out_frame' op does not support bfloat16
dtype = hidden_states.dtype
if dtype == torch.bfloat16:
hidden_states = hidden_states.to(torch.float32)
# upsample_nearest_nhwc fails with large batch sizes. see https://github.com/huggingface/diffusers/issues/984
if hidden_states.shape[0] >= 64:
hidden_states = hidden_states.contiguous()
# if `output_size` is passed we force the interpolation output
# size and do not make use of `scale_factor=2`
if self.interpolate:
B, C, T, H, W = hidden_states.shape
first_h, other_h = hidden_states.split((1, T - 1), dim=2)
if output_size is None:
if T > 1:
other_h = F.interpolate(other_h, scale_factor=self.upsample_factor, mode="nearest")
first_h = first_h.squeeze(2)
first_h = F.interpolate(first_h, scale_factor=self.upsample_factor[1:], mode="nearest")
first_h = first_h.unsqueeze(2)
else:
raise NotImplementedError
if T > 1:
hidden_states = torch.cat((first_h, other_h), dim=2)
else:
hidden_states = first_h
# If the input is bfloat16, we cast back to bfloat16
if dtype == torch.bfloat16:
hidden_states = hidden_states.to(dtype)
if self.use_conv:
if self.name == "conv":
hidden_states = self.conv(hidden_states)
else:
hidden_states = self.Conv2d_0(hidden_states)
return hidden_states
class DownsampleCausal3D(nn.Module):
"""
A 3D downsampling layer with an optional convolution.
"""
def __init__(
self,
channels: int,
use_conv: bool = False,
out_channels: Optional[int] = None,
padding: int = 1,
name: str = "conv",
kernel_size=3,
norm_type=None,
eps=None,
elementwise_affine=None,
bias=True,
stride=2,
):
super().__init__()
self.channels = channels
self.out_channels = out_channels or channels
self.use_conv = use_conv
self.padding = padding
stride = stride
self.name = name
if norm_type == "ln_norm":
self.norm = nn.LayerNorm(channels, eps, elementwise_affine)
elif norm_type == "rms_norm":
self.norm = RMSNorm(channels, eps, elementwise_affine)
elif norm_type is None:
self.norm = None
else:
raise ValueError(f"unknown norm_type: {norm_type}")
if use_conv:
conv = CausalConv3d(
self.channels, self.out_channels, kernel_size=kernel_size, stride=stride, bias=bias
)
else:
raise NotImplementedError
if name == "conv":
self.Conv2d_0 = conv
self.conv = conv
elif name == "Conv2d_0":
self.conv = conv
else:
self.conv = conv
def forward(self, hidden_states: torch.FloatTensor, scale: float = 1.0) -> torch.FloatTensor:
assert hidden_states.shape[1] == self.channels
if self.norm is not None:
hidden_states = self.norm(hidden_states.permute(0, 2, 3, 1)).permute(0, 3, 1, 2)
assert hidden_states.shape[1] == self.channels
hidden_states = self.conv(hidden_states)
return hidden_states
class ResnetBlockCausal3D(nn.Module):
r"""
A Resnet block.
"""
def __init__(
self,
*,
in_channels: int,
out_channels: Optional[int] = None,
conv_shortcut: bool = False,
dropout: float = 0.0,
temb_channels: int = 512,
groups: int = 32,
groups_out: Optional[int] = None,
pre_norm: bool = True,
eps: float = 1e-6,
non_linearity: str = "swish",
skip_time_act: bool = False,
# default, scale_shift, ada_group, spatial
time_embedding_norm: str = "default",
kernel: Optional[torch.FloatTensor] = None,
output_scale_factor: float = 1.0,
use_in_shortcut: Optional[bool] = None,
up: bool = False,
down: bool = False,
conv_shortcut_bias: bool = True,
conv_3d_out_channels: Optional[int] = None,
):
super().__init__()
self.pre_norm = pre_norm
self.pre_norm = True
self.in_channels = in_channels
out_channels = in_channels if out_channels is None else out_channels
self.out_channels = out_channels
self.use_conv_shortcut = conv_shortcut
self.up = up
self.down = down
self.output_scale_factor = output_scale_factor
self.time_embedding_norm = time_embedding_norm
self.skip_time_act = skip_time_act
linear_cls = nn.Linear
if groups_out is None:
groups_out = groups
if self.time_embedding_norm == "ada_group":
self.norm1 = AdaGroupNorm(temb_channels, in_channels, groups, eps=eps)
elif self.time_embedding_norm == "spatial":
self.norm1 = SpatialNorm(in_channels, temb_channels)
else:
self.norm1 = torch.nn.GroupNorm(num_groups=groups, num_channels=in_channels, eps=eps, affine=True)
self.conv1 = CausalConv3d(in_channels, out_channels, kernel_size=3, stride=1)
if temb_channels is not None:
if self.time_embedding_norm == "default":
self.time_emb_proj = linear_cls(temb_channels, out_channels)
elif self.time_embedding_norm == "scale_shift":
self.time_emb_proj = linear_cls(temb_channels, 2 * out_channels)
elif self.time_embedding_norm == "ada_group" or self.time_embedding_norm == "spatial":
self.time_emb_proj = None
else:
raise ValueError(f"Unknown time_embedding_norm : {self.time_embedding_norm} ")
else:
self.time_emb_proj = None
if self.time_embedding_norm == "ada_group":
self.norm2 = AdaGroupNorm(temb_channels, out_channels, groups_out, eps=eps)
elif self.time_embedding_norm == "spatial":
self.norm2 = SpatialNorm(out_channels, temb_channels)
else:
self.norm2 = torch.nn.GroupNorm(num_groups=groups_out, num_channels=out_channels, eps=eps, affine=True)
self.dropout = torch.nn.Dropout(dropout)
conv_3d_out_channels = conv_3d_out_channels or out_channels
self.conv2 = CausalConv3d(out_channels, conv_3d_out_channels, kernel_size=3, stride=1)
self.nonlinearity = get_activation(non_linearity)
self.upsample = self.downsample = None
if self.up:
self.upsample = UpsampleCausal3D(in_channels, use_conv=False)
elif self.down:
self.downsample = DownsampleCausal3D(in_channels, use_conv=False, name="op")
self.use_in_shortcut = self.in_channels != conv_3d_out_channels if use_in_shortcut is None else use_in_shortcut
self.conv_shortcut = None
if self.use_in_shortcut:
self.conv_shortcut = CausalConv3d(
in_channels,
conv_3d_out_channels,
kernel_size=1,
stride=1,
bias=conv_shortcut_bias,
)
def forward(
self,
input_tensor: torch.FloatTensor,
temb: torch.FloatTensor,
scale: float = 1.0,
) -> torch.FloatTensor:
hidden_states = input_tensor
if self.time_embedding_norm == "ada_group" or self.time_embedding_norm == "spatial":
hidden_states = self.norm1(hidden_states, temb)
else:
hidden_states = self.norm1(hidden_states)
hidden_states = self.nonlinearity(hidden_states)
if self.upsample is not None:
# upsample_nearest_nhwc fails with large batch sizes. see https://github.com/huggingface/diffusers/issues/984
if hidden_states.shape[0] >= 64:
input_tensor = input_tensor.contiguous()
hidden_states = hidden_states.contiguous()
            input_tensor = self.upsample(input_tensor, scale=scale)
            hidden_states = self.upsample(hidden_states, scale=scale)
        elif self.downsample is not None:
            input_tensor = self.downsample(input_tensor, scale=scale)
            hidden_states = self.downsample(hidden_states, scale=scale)
hidden_states = self.conv1(hidden_states)
        if self.time_emb_proj is not None:
            if not self.skip_time_act:
                temb = self.nonlinearity(temb)
            # nn.Linear takes a single argument; add singleton T/H/W dims for broadcasting
            temb = self.time_emb_proj(temb)[:, :, None, None, None]
if temb is not None and self.time_embedding_norm == "default":
hidden_states = hidden_states + temb
if self.time_embedding_norm == "ada_group" or self.time_embedding_norm == "spatial":
hidden_states = self.norm2(hidden_states, temb)
else:
hidden_states = self.norm2(hidden_states)
if temb is not None and self.time_embedding_norm == "scale_shift":
scale, shift = torch.chunk(temb, 2, dim=1)
hidden_states = hidden_states * (1 + scale) + shift
hidden_states = self.nonlinearity(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.conv2(hidden_states)
if self.conv_shortcut is not None:
            input_tensor = self.conv_shortcut(input_tensor)
output_tensor = (input_tensor + hidden_states) / self.output_scale_factor
return output_tensor
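# Minimal usage sketch for ResnetBlockCausal3D as it is used in this VAE: no time
# embedding (temb_channels=None), with a 1x1x1 conv shortcut added automatically
# when in_channels != out_channels.
if __name__ == "__main__":
    _block = ResnetBlockCausal3D(in_channels=64, out_channels=128, temb_channels=None, groups=32, eps=1e-6)
    _x = torch.randn(1, 64, 5, 16, 16)
    print(_block(_x, temb=None).shape)  # channels become 128; T/H/W are preserved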
def get_down_block3d(
down_block_type: str,
num_layers: int,
in_channels: int,
out_channels: int,
temb_channels: int,
add_downsample: bool,
downsample_stride: int,
resnet_eps: float,
resnet_act_fn: str,
transformer_layers_per_block: int = 1,
num_attention_heads: Optional[int] = None,
resnet_groups: Optional[int] = None,
cross_attention_dim: Optional[int] = None,
downsample_padding: Optional[int] = None,
dual_cross_attention: bool = False,
use_linear_projection: bool = False,
only_cross_attention: bool = False,
upcast_attention: bool = False,
resnet_time_scale_shift: str = "default",
attention_type: str = "default",
resnet_skip_time_act: bool = False,
resnet_out_scale_factor: float = 1.0,
cross_attention_norm: Optional[str] = None,
attention_head_dim: Optional[int] = None,
downsample_type: Optional[str] = None,
dropout: float = 0.0,
):
# If attn head dim is not defined, we default it to the number of heads
if attention_head_dim is None:
        logger.warning(
            f"It is recommended to provide `attention_head_dim` when calling `get_down_block`. Defaulting `attention_head_dim` to {num_attention_heads}."
        )
attention_head_dim = num_attention_heads
down_block_type = down_block_type[7:] if down_block_type.startswith("UNetRes") else down_block_type
if down_block_type == "DownEncoderBlockCausal3D":
return DownEncoderBlockCausal3D(
num_layers=num_layers,
in_channels=in_channels,
out_channels=out_channels,
dropout=dropout,
add_downsample=add_downsample,
downsample_stride=downsample_stride,
resnet_eps=resnet_eps,
resnet_act_fn=resnet_act_fn,
resnet_groups=resnet_groups,
downsample_padding=downsample_padding,
resnet_time_scale_shift=resnet_time_scale_shift,
)
raise ValueError(f"{down_block_type} does not exist.")
def get_up_block3d(
up_block_type: str,
num_layers: int,
in_channels: int,
out_channels: int,
prev_output_channel: int,
temb_channels: int,
add_upsample: bool,
upsample_scale_factor: Tuple,
resnet_eps: float,
resnet_act_fn: str,
resolution_idx: Optional[int] = None,
transformer_layers_per_block: int = 1,
num_attention_heads: Optional[int] = None,
resnet_groups: Optional[int] = None,
cross_attention_dim: Optional[int] = None,
dual_cross_attention: bool = False,
use_linear_projection: bool = False,
only_cross_attention: bool = False,
upcast_attention: bool = False,
resnet_time_scale_shift: str = "default",
attention_type: str = "default",
resnet_skip_time_act: bool = False,
resnet_out_scale_factor: float = 1.0,
cross_attention_norm: Optional[str] = None,
attention_head_dim: Optional[int] = None,
upsample_type: Optional[str] = None,
dropout: float = 0.0,
) -> nn.Module:
# If attn head dim is not defined, we default it to the number of heads
if attention_head_dim is None:
        logger.warning(
            f"It is recommended to provide `attention_head_dim` when calling `get_up_block`. Defaulting `attention_head_dim` to {num_attention_heads}."
        )
attention_head_dim = num_attention_heads
up_block_type = up_block_type[7:] if up_block_type.startswith("UNetRes") else up_block_type
if up_block_type == "UpDecoderBlockCausal3D":
return UpDecoderBlockCausal3D(
num_layers=num_layers,
in_channels=in_channels,
out_channels=out_channels,
resolution_idx=resolution_idx,
dropout=dropout,
add_upsample=add_upsample,
upsample_scale_factor=upsample_scale_factor,
resnet_eps=resnet_eps,
resnet_act_fn=resnet_act_fn,
resnet_groups=resnet_groups,
resnet_time_scale_shift=resnet_time_scale_shift,
temb_channels=temb_channels,
)
raise ValueError(f"{up_block_type} does not exist.")
class UNetMidBlockCausal3D(nn.Module):
"""
A 3D UNet mid-block [`UNetMidBlockCausal3D`] with multiple residual blocks and optional attention blocks.
"""
def __init__(
self,
in_channels: int,
temb_channels: int,
dropout: float = 0.0,
num_layers: int = 1,
resnet_eps: float = 1e-6,
resnet_time_scale_shift: str = "default", # default, spatial
resnet_act_fn: str = "swish",
resnet_groups: int = 32,
attn_groups: Optional[int] = None,
resnet_pre_norm: bool = True,
add_attention: bool = True,
attention_head_dim: int = 1,
output_scale_factor: float = 1.0,
):
super().__init__()
resnet_groups = resnet_groups if resnet_groups is not None else min(in_channels // 4, 32)
self.add_attention = add_attention
if attn_groups is None:
attn_groups = resnet_groups if resnet_time_scale_shift == "default" else None
# there is always at least one resnet
resnets = [
ResnetBlockCausal3D(
in_channels=in_channels,
out_channels=in_channels,
temb_channels=temb_channels,
eps=resnet_eps,
groups=resnet_groups,
dropout=dropout,
time_embedding_norm=resnet_time_scale_shift,
non_linearity=resnet_act_fn,
output_scale_factor=output_scale_factor,
pre_norm=resnet_pre_norm,
)
]
attentions = []
if attention_head_dim is None:
            logger.warning(
                f"It is not recommended to pass `attention_head_dim=None`. Defaulting `attention_head_dim` to `in_channels`: {in_channels}."
            )
attention_head_dim = in_channels
for _ in range(num_layers):
if self.add_attention:
attentions.append(
Attention(
in_channels,
heads=in_channels // attention_head_dim,
dim_head=attention_head_dim,
rescale_output_factor=output_scale_factor,
eps=resnet_eps,
norm_num_groups=attn_groups,
spatial_norm_dim=temb_channels if resnet_time_scale_shift == "spatial" else None,
residual_connection=True,
bias=True,
upcast_softmax=True,
_from_deprecated_attn_block=True,
)
)
else:
attentions.append(None)
resnets.append(
ResnetBlockCausal3D(
in_channels=in_channels,
out_channels=in_channels,
temb_channels=temb_channels,
eps=resnet_eps,
groups=resnet_groups,
dropout=dropout,
time_embedding_norm=resnet_time_scale_shift,
non_linearity=resnet_act_fn,
output_scale_factor=output_scale_factor,
pre_norm=resnet_pre_norm,
)
)
self.attentions = nn.ModuleList(attentions)
self.resnets = nn.ModuleList(resnets)
def forward(self, hidden_states: torch.FloatTensor, temb: Optional[torch.FloatTensor] = None) -> torch.FloatTensor:
hidden_states = self.resnets[0](hidden_states, temb)
for attn, resnet in zip(self.attentions, self.resnets[1:]):
if attn is not None:
B, C, T, H, W = hidden_states.shape
hidden_states = rearrange(hidden_states, "b c f h w -> b (f h w) c")
attention_mask = prepare_causal_attention_mask(
T, H * W, hidden_states.dtype, hidden_states.device, batch_size=B
)
hidden_states = attn(hidden_states, temb=temb, attention_mask=attention_mask)
hidden_states = rearrange(hidden_states, "b (f h w) c -> b c f h w", f=T, h=H, w=W)
hidden_states = resnet(hidden_states, temb)
return hidden_states
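# The mid-block flattens (T, H, W) into a single token sequence and relies on
# prepare_causal_attention_mask (defined earlier in this file) to make attention
# block-causal over frames: tokens may attend to their own and earlier frames only.
# The sketch below illustrates that mask pattern in plain torch; it is a conceptual
# stand-in and does not claim to match the helper's exact signature or output.
if __name__ == "__main__":
    def _block_causal_mask(n_frames: int, tokens_per_frame: int) -> torch.Tensor:
        frame_idx = torch.arange(n_frames).repeat_interleave(tokens_per_frame)
        allowed = frame_idx[:, None] >= frame_idx[None, :]  # query frame >= key frame
        mask = torch.zeros(allowed.shape)
        mask[~allowed] = float("-inf")  # additive attention mask: future frames blocked
        return mask
    print(_block_causal_mask(3, 4).shape)  # (12, 12); upper-right frame blocks are -inf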
class DownEncoderBlockCausal3D(nn.Module):
def __init__(
self,
in_channels: int,
out_channels: int,
dropout: float = 0.0,
num_layers: int = 1,
resnet_eps: float = 1e-6,
resnet_time_scale_shift: str = "default",
resnet_act_fn: str = "swish",
resnet_groups: int = 32,
resnet_pre_norm: bool = True,
output_scale_factor: float = 1.0,
add_downsample: bool = True,
downsample_stride: int = 2,
downsample_padding: int = 1,
):
super().__init__()
resnets = []
for i in range(num_layers):
in_channels = in_channels if i == 0 else out_channels
resnets.append(
ResnetBlockCausal3D(
in_channels=in_channels,
out_channels=out_channels,
temb_channels=None,
eps=resnet_eps,
groups=resnet_groups,
dropout=dropout,
time_embedding_norm=resnet_time_scale_shift,
non_linearity=resnet_act_fn,
output_scale_factor=output_scale_factor,
pre_norm=resnet_pre_norm,
)
)
self.resnets = nn.ModuleList(resnets)
if add_downsample:
self.downsamplers = nn.ModuleList(
[
DownsampleCausal3D(
out_channels,
use_conv=True,
out_channels=out_channels,
padding=downsample_padding,
name="op",
stride=downsample_stride,
)
]
)
else:
self.downsamplers = None
def forward(self, hidden_states: torch.FloatTensor, scale: float = 1.0) -> torch.FloatTensor:
for resnet in self.resnets:
hidden_states = resnet(hidden_states, temb=None, scale=scale)
if self.downsamplers is not None:
for downsampler in self.downsamplers:
hidden_states = downsampler(hidden_states, scale)
return hidden_states
class UpDecoderBlockCausal3D(nn.Module):
def __init__(
self,
in_channels: int,
out_channels: int,
resolution_idx: Optional[int] = None,
dropout: float = 0.0,
num_layers: int = 1,
resnet_eps: float = 1e-6,
resnet_time_scale_shift: str = "default", # default, spatial
resnet_act_fn: str = "swish",
resnet_groups: int = 32,
resnet_pre_norm: bool = True,
output_scale_factor: float = 1.0,
add_upsample: bool = True,
upsample_scale_factor=(2, 2, 2),
temb_channels: Optional[int] = None,
):
super().__init__()
resnets = []
for i in range(num_layers):
input_channels = in_channels if i == 0 else out_channels
resnets.append(
ResnetBlockCausal3D(
in_channels=input_channels,
out_channels=out_channels,
temb_channels=temb_channels,
eps=resnet_eps,
groups=resnet_groups,
dropout=dropout,
time_embedding_norm=resnet_time_scale_shift,
non_linearity=resnet_act_fn,
output_scale_factor=output_scale_factor,
pre_norm=resnet_pre_norm,
)
)
self.resnets = nn.ModuleList(resnets)
if add_upsample:
self.upsamplers = nn.ModuleList(
[
UpsampleCausal3D(
out_channels,
use_conv=True,
out_channels=out_channels,
upsample_factor=upsample_scale_factor,
)
]
)
else:
self.upsamplers = None
self.resolution_idx = resolution_idx
def forward(
self, hidden_states: torch.FloatTensor, temb: Optional[torch.FloatTensor] = None, scale: float = 1.0
) -> torch.FloatTensor:
for resnet in self.resnets:
hidden_states = resnet(hidden_states, temb=temb, scale=scale)
if self.upsamplers is not None:
for upsampler in self.upsamplers:
hidden_states = upsampler(hidden_states)
return hidden_states
from dataclasses import dataclass
from typing import Optional, Tuple
import numpy as np
import torch
import torch.nn as nn
from diffusers.utils import BaseOutput, is_torch_version
from diffusers.utils.torch_utils import randn_tensor
from diffusers.models.attention_processor import SpatialNorm
from .unet_causal_3d_blocks import (
CausalConv3d,
UNetMidBlockCausal3D,
get_down_block3d,
get_up_block3d,
)
@dataclass
class DecoderOutput(BaseOutput):
r"""
Output of decoding method.
Args:
sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
The decoded output sample from the last layer of the model.
"""
sample: torch.FloatTensor
class EncoderCausal3D(nn.Module):
r"""
The `EncoderCausal3D` layer of a variational autoencoder that encodes its input into a latent representation.
"""
def __init__(
self,
in_channels: int = 3,
out_channels: int = 3,
down_block_types: Tuple[str, ...] = ("DownEncoderBlockCausal3D",),
block_out_channels: Tuple[int, ...] = (64,),
layers_per_block: int = 2,
norm_num_groups: int = 32,
act_fn: str = "silu",
double_z: bool = True,
mid_block_add_attention=True,
time_compression_ratio: int = 4,
spatial_compression_ratio: int = 8,
):
super().__init__()
self.layers_per_block = layers_per_block
self.conv_in = CausalConv3d(in_channels, block_out_channels[0], kernel_size=3, stride=1)
self.mid_block = None
self.down_blocks = nn.ModuleList([])
# down
output_channel = block_out_channels[0]
for i, down_block_type in enumerate(down_block_types):
input_channel = output_channel
output_channel = block_out_channels[i]
is_final_block = i == len(block_out_channels) - 1
num_spatial_downsample_layers = int(np.log2(spatial_compression_ratio))
num_time_downsample_layers = int(np.log2(time_compression_ratio))
if time_compression_ratio == 4:
add_spatial_downsample = bool(i < num_spatial_downsample_layers)
add_time_downsample = bool(
i >= (len(block_out_channels) - 1 - num_time_downsample_layers)
and not is_final_block
)
else:
raise ValueError(f"Unsupported time_compression_ratio: {time_compression_ratio}.")
downsample_stride_HW = (2, 2) if add_spatial_downsample else (1, 1)
downsample_stride_T = (2,) if add_time_downsample else (1,)
downsample_stride = tuple(downsample_stride_T + downsample_stride_HW)
down_block = get_down_block3d(
down_block_type,
num_layers=self.layers_per_block,
in_channels=input_channel,
out_channels=output_channel,
add_downsample=bool(add_spatial_downsample or add_time_downsample),
downsample_stride=downsample_stride,
resnet_eps=1e-6,
downsample_padding=0,
resnet_act_fn=act_fn,
resnet_groups=norm_num_groups,
attention_head_dim=output_channel,
temb_channels=None,
)
self.down_blocks.append(down_block)
# mid
self.mid_block = UNetMidBlockCausal3D(
in_channels=block_out_channels[-1],
resnet_eps=1e-6,
resnet_act_fn=act_fn,
output_scale_factor=1,
resnet_time_scale_shift="default",
attention_head_dim=block_out_channels[-1],
resnet_groups=norm_num_groups,
temb_channels=None,
add_attention=mid_block_add_attention,
)
# out
self.conv_norm_out = nn.GroupNorm(num_channels=block_out_channels[-1], num_groups=norm_num_groups, eps=1e-6)
self.conv_act = nn.SiLU()
conv_out_channels = 2 * out_channels if double_z else out_channels
self.conv_out = CausalConv3d(block_out_channels[-1], conv_out_channels, kernel_size=3)
def forward(self, sample: torch.FloatTensor) -> torch.FloatTensor:
r"""The forward method of the `EncoderCausal3D` class."""
assert len(sample.shape) == 5, "The input tensor should have 5 dimensions"
sample = self.conv_in(sample)
# down
for down_block in self.down_blocks:
sample = down_block(sample)
# middle
sample = self.mid_block(sample)
# post-process
sample = self.conv_norm_out(sample)
sample = self.conv_act(sample)
sample = self.conv_out(sample)
return sample
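# Minimal shape sketch for EncoderCausal3D with toy widths: 8x spatial / 4x temporal
# compression maps (B, 3, 1 + 4k, H, W) to (B, 2 * out_channels, 1 + k, H/8, W/8)
# when double_z=True (mean and logvar stacked along channels). The printed shape
# assumes the default CausalConv3d padding and is illustrative.
if __name__ == "__main__":
    _enc = EncoderCausal3D(
        in_channels=3,
        out_channels=4,
        down_block_types=("DownEncoderBlockCausal3D",) * 4,
        block_out_channels=(32, 64, 128, 128),
        layers_per_block=2,
        double_z=True,
        time_compression_ratio=4,
        spatial_compression_ratio=8,
    )
    _video = torch.randn(1, 3, 9, 64, 64)  # T = 1 + 4 * 2
    print(_enc(_video).shape)  # expected: (1, 8, 3, 8, 8)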
class DecoderCausal3D(nn.Module):
r"""
The `DecoderCausal3D` layer of a variational autoencoder that decodes its latent representation into an output sample.
"""
def __init__(
self,
in_channels: int = 3,
out_channels: int = 3,
up_block_types: Tuple[str, ...] = ("UpDecoderBlockCausal3D",),
block_out_channels: Tuple[int, ...] = (64,),
layers_per_block: int = 2,
norm_num_groups: int = 32,
act_fn: str = "silu",
norm_type: str = "group", # group, spatial
mid_block_add_attention=True,
time_compression_ratio: int = 4,
spatial_compression_ratio: int = 8,
):
super().__init__()
self.layers_per_block = layers_per_block
self.conv_in = CausalConv3d(in_channels, block_out_channels[-1], kernel_size=3, stride=1)
self.mid_block = None
self.up_blocks = nn.ModuleList([])
temb_channels = in_channels if norm_type == "spatial" else None
# mid
self.mid_block = UNetMidBlockCausal3D(
in_channels=block_out_channels[-1],
resnet_eps=1e-6,
resnet_act_fn=act_fn,
output_scale_factor=1,
resnet_time_scale_shift="default" if norm_type == "group" else norm_type,
attention_head_dim=block_out_channels[-1],
resnet_groups=norm_num_groups,
temb_channels=temb_channels,
add_attention=mid_block_add_attention,
)
# up
reversed_block_out_channels = list(reversed(block_out_channels))
output_channel = reversed_block_out_channels[0]
for i, up_block_type in enumerate(up_block_types):
prev_output_channel = output_channel
output_channel = reversed_block_out_channels[i]
is_final_block = i == len(block_out_channels) - 1
num_spatial_upsample_layers = int(np.log2(spatial_compression_ratio))
num_time_upsample_layers = int(np.log2(time_compression_ratio))
if time_compression_ratio == 4:
add_spatial_upsample = bool(i < num_spatial_upsample_layers)
add_time_upsample = bool(
i >= len(block_out_channels) - 1 - num_time_upsample_layers
and not is_final_block
)
else:
raise ValueError(f"Unsupported time_compression_ratio: {time_compression_ratio}.")
upsample_scale_factor_HW = (2, 2) if add_spatial_upsample else (1, 1)
upsample_scale_factor_T = (2,) if add_time_upsample else (1,)
upsample_scale_factor = tuple(upsample_scale_factor_T + upsample_scale_factor_HW)
up_block = get_up_block3d(
up_block_type,
num_layers=self.layers_per_block + 1,
in_channels=prev_output_channel,
out_channels=output_channel,
prev_output_channel=None,
add_upsample=bool(add_spatial_upsample or add_time_upsample),
upsample_scale_factor=upsample_scale_factor,
resnet_eps=1e-6,
resnet_act_fn=act_fn,
resnet_groups=norm_num_groups,
attention_head_dim=output_channel,
temb_channels=temb_channels,
resnet_time_scale_shift=norm_type,
)
self.up_blocks.append(up_block)
prev_output_channel = output_channel
# out
if norm_type == "spatial":
self.conv_norm_out = SpatialNorm(block_out_channels[0], temb_channels)
else:
self.conv_norm_out = nn.GroupNorm(num_channels=block_out_channels[0], num_groups=norm_num_groups, eps=1e-6)
self.conv_act = nn.SiLU()
self.conv_out = CausalConv3d(block_out_channels[0], out_channels, kernel_size=3)
self.gradient_checkpointing = False
def forward(
self,
sample: torch.FloatTensor,
latent_embeds: Optional[torch.FloatTensor] = None,
) -> torch.FloatTensor:
r"""The forward method of the `DecoderCausal3D` class."""
assert len(sample.shape) == 5, "The input tensor should have 5 dimensions."
sample = self.conv_in(sample)
upscale_dtype = next(iter(self.up_blocks.parameters())).dtype
if self.training and self.gradient_checkpointing:
def create_custom_forward(module):
def custom_forward(*inputs):
return module(*inputs)
return custom_forward
if is_torch_version(">=", "1.11.0"):
# middle
sample = torch.utils.checkpoint.checkpoint(
create_custom_forward(self.mid_block),
sample,
latent_embeds,
use_reentrant=False,
)
sample = sample.to(upscale_dtype)
# up
for up_block in self.up_blocks:
sample = torch.utils.checkpoint.checkpoint(
create_custom_forward(up_block),
sample,
latent_embeds,
use_reentrant=False,
)
else:
# middle
sample = torch.utils.checkpoint.checkpoint(
create_custom_forward(self.mid_block), sample, latent_embeds
)
sample = sample.to(upscale_dtype)
# up
for up_block in self.up_blocks:
sample = torch.utils.checkpoint.checkpoint(create_custom_forward(up_block), sample, latent_embeds)
else:
# middle
sample = self.mid_block(sample, latent_embeds)
sample = sample.to(upscale_dtype)
# up
for up_block in self.up_blocks:
sample = up_block(sample, latent_embeds)
# post-process
if latent_embeds is None:
sample = self.conv_norm_out(sample)
else:
sample = self.conv_norm_out(sample, latent_embeds)
sample = self.conv_act(sample)
sample = self.conv_out(sample)
return sample
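# Mirror-image sketch for DecoderCausal3D with the same toy widths: a latent of
# shape (B, 4, 1 + k, h, w) decodes back to (B, 3, 1 + 4k, 8h, 8w). Shapes are
# illustrative and assume the default CausalConv3d padding.
if __name__ == "__main__":
    _dec = DecoderCausal3D(
        in_channels=4,
        out_channels=3,
        up_block_types=("UpDecoderBlockCausal3D",) * 4,
        block_out_channels=(32, 64, 128, 128),
        layers_per_block=2,
        time_compression_ratio=4,
        spatial_compression_ratio=8,
    )
    _latent = torch.randn(1, 4, 3, 8, 8)
    print(_dec(_latent).shape)  # expected: (1, 3, 9, 64, 64)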
class DiagonalGaussianDistribution(object):
def __init__(self, parameters: torch.Tensor, deterministic: bool = False):
if parameters.ndim == 3:
dim = 2 # (B, L, C)
elif parameters.ndim == 5 or parameters.ndim == 4:
dim = 1 # (B, C, T, H ,W) / (B, C, H, W)
else:
raise NotImplementedError
self.parameters = parameters
self.mean, self.logvar = torch.chunk(parameters, 2, dim=dim)
self.logvar = torch.clamp(self.logvar, -30.0, 20.0)
self.deterministic = deterministic
self.std = torch.exp(0.5 * self.logvar)
self.var = torch.exp(self.logvar)
if self.deterministic:
self.var = self.std = torch.zeros_like(
self.mean, device=self.parameters.device, dtype=self.parameters.dtype
)
def sample(self, generator: Optional[torch.Generator] = None) -> torch.FloatTensor:
# make sure sample is on the same device as the parameters and has same dtype
sample = randn_tensor(
self.mean.shape,
generator=generator,
device=self.parameters.device,
dtype=self.parameters.dtype,
)
x = self.mean + self.std * sample
return x
def kl(self, other: "DiagonalGaussianDistribution" = None) -> torch.Tensor:
if self.deterministic:
return torch.Tensor([0.0])
else:
reduce_dim = list(range(1, self.mean.ndim))
if other is None:
return 0.5 * torch.sum(
torch.pow(self.mean, 2) + self.var - 1.0 - self.logvar,
dim=reduce_dim,
)
else:
return 0.5 * torch.sum(
torch.pow(self.mean - other.mean, 2) / other.var
+ self.var / other.var
- 1.0
- self.logvar
+ other.logvar,
dim=reduce_dim,
)
def nll(self, sample: torch.Tensor, dims: Tuple[int, ...] = [1, 2, 3]) -> torch.Tensor:
if self.deterministic:
return torch.Tensor([0.0])
logtwopi = np.log(2.0 * np.pi)
return 0.5 * torch.sum(
logtwopi + self.logvar +
torch.pow(sample - self.mean, 2) / self.var,
dim=dims,
)
def mode(self) -> torch.Tensor:
return self.mean
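# Minimal sketch for DiagonalGaussianDistribution: the encoder's 2*C output channels
# are split into (mean, logvar) along dim=1 for 4D/5D inputs, and sample() draws
# z = mean + std * eps via reparameterisation.
if __name__ == "__main__":
    _params = torch.randn(1, 8, 3, 8, 8)  # e.g. the (mean, logvar) tensor from the encoder
    _posterior = DiagonalGaussianDistribution(_params)
    _z = _posterior.sample(generator=torch.Generator().manual_seed(0))
    print(_z.shape)  # (1, 4, 3, 8, 8)
    print(_posterior.kl().shape)  # per-sample KL against N(0, I): shape (1,)
    print(_posterior.mode().shape)  # deterministic latent (the mean): (1, 4, 3, 8, 8)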
icon.png (70.5 KB)
# Model code
modelCode=1462
# Model name
modelName=HunyuanVideo-I2V_pytorch
# Model description
modelDescription=Tencent Hunyuan series ultra-high-quality video generation model
# Application scenarios
appScenario=Inference,Video generation,E-commerce,Education,Broadcast media
# Framework type
frameType=pytorch
import os
import torch
import torch.distributed as dist
from packaging import version
from dataclasses import dataclass, fields
from xfuser.logger import init_logger
import xfuser.envs as envs
# from xfuser.envs import CUDA_VERSION, TORCH_VERSION, PACKAGES_CHECKER
from xfuser.envs import TORCH_VERSION, PACKAGES_CHECKER
logger = init_logger(__name__)
from typing import Union, Optional, List
env_info = PACKAGES_CHECKER.get_packages_info()
HAS_LONG_CTX_ATTN = env_info["has_long_ctx_attn"]
HAS_FLASH_ATTN = env_info["has_flash_attn"]
def check_packages():
import diffusers
    if not version.parse(diffusers.__version__) > version.parse("0.30.2"):
        raise RuntimeError(
            "This project requires diffusers > 0.30.2. A suitable version currently "
            "cannot be installed via pip; please install diffusers from source."
        )
def check_env():
# https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/usage/cudagraph.html
#if CUDA_VERSION < version.parse("11.3"):
# raise RuntimeError("NCCL CUDA Graph support requires CUDA 11.3 or above")
if TORCH_VERSION < version.parse("2.2.0"):
# https://pytorch.org/blog/accelerating-pytorch-with-cuda-graphs/
raise RuntimeError(
"CUDAGraph with NCCL support requires PyTorch 2.2.0 or above. "
"If it is not released yet, please install nightly built PyTorch "
"with `pip3 install --pre torch torchvision torchaudio --index-url "
"https://download.pytorch.org/whl/nightly/cu121`"
)
@dataclass
class ModelConfig:
model: str
download_dir: Optional[str] = None
trust_remote_code: bool = False
@dataclass
class RuntimeConfig:
warmup_steps: int = 1
dtype: torch.dtype = torch.float16
use_cuda_graph: bool = False
use_parallel_vae: bool = False
use_profiler: bool = False
use_torch_compile: bool = False
use_onediff: bool = False
use_fp8_t5_encoder: bool = False
def __post_init__(self):
check_packages()
if self.use_cuda_graph:
check_env()
@dataclass
class FastAttnConfig:
use_fast_attn: bool = False
n_step: int = 20
n_calib: int = 8
threshold: float = 0.5
window_size: int = 64
coco_path: Optional[str] = None
use_cache: bool = False
def __post_init__(self):
assert self.n_calib > 0, "n_calib must be greater than 0"
assert self.threshold > 0.0, "threshold must be greater than 0"
@dataclass
class DataParallelConfig:
dp_degree: int = 1
use_cfg_parallel: bool = False
world_size: int = 1
def __post_init__(self):
        assert self.dp_degree >= 1, "dp_degree must be greater than or equal to 1"
# set classifier_free_guidance_degree parallel for split batch
if self.use_cfg_parallel:
self.cfg_degree = 2
else:
self.cfg_degree = 1
assert self.dp_degree * self.cfg_degree <= self.world_size, (
"dp_degree * cfg_degree must be less than or equal to "
"world_size because of classifier free guidance"
)
assert (
self.world_size % (self.dp_degree * self.cfg_degree) == 0
), "world_size must be divisible by dp_degree * cfg_degree"
@dataclass
class SequenceParallelConfig:
ulysses_degree: Optional[int] = None
ring_degree: Optional[int] = None
world_size: int = 1
def __post_init__(self):
if self.ulysses_degree is None:
self.ulysses_degree = 1
logger.info(
f"Ulysses degree not set, " f"using default value {self.ulysses_degree}"
)
if self.ring_degree is None:
self.ring_degree = 1
logger.info(
f"Ring degree not set, " f"using default value {self.ring_degree}"
)
self.sp_degree = self.ulysses_degree * self.ring_degree
if not HAS_LONG_CTX_ATTN and self.sp_degree > 1:
raise ImportError(
f"Sequence Parallel kit 'yunchang' not found but "
f"sp_degree is {self.sp_degree}, please set it "
f"to 1 or install 'yunchang' to use it"
)
@dataclass
class TensorParallelConfig:
tp_degree: int = 1
split_scheme: Optional[str] = "row"
world_size: int = 1
def __post_init__(self):
        assert self.tp_degree >= 1, "tp_degree must be greater than or equal to 1"
assert (
self.tp_degree <= self.world_size
), "tp_degree must be less than or equal to world_size"
@dataclass
class PipeFusionParallelConfig:
pp_degree: int = 1
num_pipeline_patch: Optional[int] = None
    attn_layer_num_for_pp: Optional[List[int]] = None
world_size: int = 1
def __post_init__(self):
        assert (
            self.pp_degree is not None and self.pp_degree >= 1
        ), "pipefusion_degree must be set and greater than or equal to 1 to use pipefusion"
assert (
self.pp_degree <= self.world_size
), "pipefusion_degree must be less than or equal to world_size"
if self.num_pipeline_patch is None:
self.num_pipeline_patch = self.pp_degree
logger.info(
f"Pipeline patch number not set, "
f"using default value {self.pp_degree}"
)
        if self.attn_layer_num_for_pp is not None:
            logger.info(
                f"attn_layer_num_for_pp set, splitting attention layers "
                f"to {self.attn_layer_num_for_pp}"
            )
            assert len(self.attn_layer_num_for_pp) == self.pp_degree, (
                "attn_layer_num_for_pp must have length equal to pp_degree "
                "if it is not None"
            )
        if self.pp_degree == 1 and self.num_pipeline_patch > 1:
            logger.warning(
                "Pipefusion degree is 1, pipeline parallelism will not be used, "
                "num_pipeline_patch will be ignored"
            )
self.num_pipeline_patch = 1
@dataclass
class ParallelConfig:
dp_config: DataParallelConfig
sp_config: SequenceParallelConfig
pp_config: PipeFusionParallelConfig
tp_config: TensorParallelConfig
world_size: int = 1 # FIXME: remove this
worker_cls: str = "xfuser.ray.worker.worker.Worker"
def __post_init__(self):
assert self.tp_config is not None, "tp_config must be set"
assert self.dp_config is not None, "dp_config must be set"
assert self.sp_config is not None, "sp_config must be set"
assert self.pp_config is not None, "pp_config must be set"
parallel_world_size = (
self.dp_config.dp_degree
* self.dp_config.cfg_degree
* self.sp_config.sp_degree
* self.tp_config.tp_degree
* self.pp_config.pp_degree
)
world_size = self.world_size
assert parallel_world_size == world_size, (
f"parallel_world_size {parallel_world_size} "
f"must be equal to world_size {self.world_size}"
)
assert (
world_size % (self.dp_config.dp_degree * self.dp_config.cfg_degree) == 0
), "world_size must be divisible by dp_degree * cfg_degree"
assert (
world_size % self.pp_config.pp_degree == 0
), "world_size must be divisible by pp_degree"
assert (
world_size % self.sp_config.sp_degree == 0
), "world_size must be divisible by sp_degree"
assert (
world_size % self.tp_config.tp_degree == 0
), "world_size must be divisible by tp_degree"
self.dp_degree = self.dp_config.dp_degree
self.cfg_degree = self.dp_config.cfg_degree
self.sp_degree = self.sp_config.sp_degree
self.pp_degree = self.pp_config.pp_degree
self.tp_degree = self.tp_config.tp_degree
self.ulysses_degree = self.sp_config.ulysses_degree
self.ring_degree = self.sp_config.ring_degree
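# Minimal sketch: the product dp * cfg * sp * tp * pp must equal world_size. For
# example, 8 GPUs with data parallel 4 and CFG parallel 2 give 4 * 2 * 1 * 1 * 1 = 8,
# built directly from the dataclasses above (the real entry points derive these
# values from CLI arguments).
if __name__ == "__main__":
    _parallel = ParallelConfig(
        dp_config=DataParallelConfig(dp_degree=4, use_cfg_parallel=True, world_size=8),
        sp_config=SequenceParallelConfig(world_size=8),
        pp_config=PipeFusionParallelConfig(pp_degree=1, world_size=8),
        tp_config=TensorParallelConfig(tp_degree=1, world_size=8),
        world_size=8,
    )
    print(_parallel.dp_degree, _parallel.cfg_degree, _parallel.sp_degree)  # 4 2 1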
@dataclass(frozen=True)
class EngineConfig:
model_config: ModelConfig
runtime_config: RuntimeConfig
parallel_config: ParallelConfig
fast_attn_config: FastAttnConfig
def __post_init__(self):
world_size = self.parallel_config.world_size
if self.fast_attn_config.use_fast_attn:
assert self.parallel_config.dp_degree == world_size, f"world_size must be equal to dp_degree when using DiTFastAttn"
def to_dict(self):
"""Return the configs as a dictionary, for use in **kwargs."""
return dict((field.name, getattr(self, field.name)) for field in fields(self))
@dataclass
class InputConfig:
height: int = 1024
width: int = 1024
num_frames: int = 49
    use_resolution_binning: bool = True
batch_size: Optional[int] = None
img_file_path: Optional[str] = None
prompt: Union[str, List[str]] = ""
negative_prompt: Union[str, List[str]] = ""
num_inference_steps: int = 20
max_sequence_length: int = 256
seed: int = 42
output_type: str = "pil"
def __post_init__(self):
if isinstance(self.prompt, list):
            assert (
                len(self.prompt) == len(self.negative_prompt)
                or len(self.negative_prompt) == 0
            ), "prompt and negative_prompt must have the same length, or negative_prompt must be empty"
self.batch_size = self.batch_size or len(self.prompt)
else:
self.batch_size = self.batch_size or 1
        assert self.output_type in [
            "pil",
            "latent",
            "pt",
        ], "output_type must be one of 'pil', 'latent', or 'pt'"
import os
import torch
import diffusers
from typing import TYPE_CHECKING, Any, Callable, Dict, Optional
from packaging import version
from xfuser.logger import init_logger
logger = init_logger(__name__)
if TYPE_CHECKING:
MASTER_ADDR: str = ""
MASTER_PORT: Optional[int] = None
CUDA_HOME: Optional[str] = None
LOCAL_RANK: int = 0
CUDA_VISIBLE_DEVICES: Optional[str] = None
XDIT_LOGGING_LEVEL: str = "INFO"
CUDA_VERSION: version.Version
TORCH_VERSION: version.Version
environment_variables: Dict[str, Callable[[], Any]] = {
# ================== Runtime Env Vars ==================
# used in distributed environment to determine the master address
"MASTER_ADDR": lambda: os.getenv("MASTER_ADDR", ""),
# used in distributed environment to manually set the communication port
"MASTER_PORT": lambda: (
int(os.getenv("MASTER_PORT", "0")) if "MASTER_PORT" in os.environ else None
),
# path to cudatoolkit home directory, under which should be bin, include,
# and lib directories.
"CUDA_HOME": lambda: os.environ.get("CUDA_HOME", None),
# local rank of the process in the distributed setting, used to determine
# the GPU device id
"LOCAL_RANK": lambda: int(os.environ.get("LOCAL_RANK", "0")),
# used to control the visible devices in the distributed setting
"CUDA_VISIBLE_DEVICES": lambda: os.environ.get("CUDA_VISIBLE_DEVICES", None),
# this is used for configuring the default logging level
"XDIT_LOGGING_LEVEL": lambda: os.getenv("XDIT_LOGGING_LEVEL", "INFO"),
}
variables: Dict[str, Callable[[], Any]] = {
# ================== Other Vars ==================
# used in version checking
# "CUDA_VERSION": lambda: version.parse(torch.version.cuda),
"CUDA_VERSION": "gfx928",
"TORCH_VERSION": lambda: version.parse(
version.parse(torch.__version__).base_version
),
}
class PackagesEnvChecker:
_instance = None
def __new__(cls):
if cls._instance is None:
cls._instance = super(PackagesEnvChecker, cls).__new__(cls)
cls._instance.initialize()
return cls._instance
def initialize(self):
self.packages_info = {
"has_flash_attn": self.check_flash_attn(),
"has_long_ctx_attn": self.check_long_ctx_attn(),
"diffusers_version": self.check_diffusers_version(),
}
def check_flash_attn(self):
try:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
gpu_name = torch.cuda.get_device_name(device)
if "Turing" in gpu_name or "Tesla" in gpu_name or "T4" in gpu_name:
return False
else:
from flash_attn import flash_attn_func
from flash_attn import __version__
if __version__ < "2.6.0":
raise ImportError(f"install flash_attn >= 2.6.0")
return True
except ImportError:
logger.warning(
f'Flash Attention library "flash_attn" not found, '
f"using pytorch attention implementation"
)
return False
def check_long_ctx_attn(self):
try:
from yunchang import (
set_seq_parallel_pg,
ring_flash_attn_func,
UlyssesAttention,
LongContextAttention,
LongContextAttentionQKVPacked,
)
return True
except ImportError:
logger.warning(
f'Ring Flash Attention library "yunchang" not found, '
f"using pytorch attention implementation"
)
return False
def check_diffusers_version(self):
if version.parse(
version.parse(diffusers.__version__).base_version
) < version.parse("0.30.0"):
            raise RuntimeError(
                f"Diffusers version {version.parse(version.parse(diffusers.__version__).base_version)} is not supported, "
                "please upgrade to a version > 0.30.0"
            )
return version.parse(version.parse(diffusers.__version__).base_version)
def get_packages_info(self):
return self.packages_info
PACKAGES_CHECKER = PackagesEnvChecker()
def __getattr__(name):
# lazy evaluation of environment variables
if name in environment_variables:
return environment_variables[name]()
if name in variables:
return variables[name]()
raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
def __dir__():
return list(environment_variables.keys())
#!/bin/bash
cp modified/config.py /usr/local/lib/python3.10/site-packages/xfuser/config/
cp modified/envs.py /usr/local/lib/python3.10/site-packages/xfuser/
opencv-python==4.9.0.80
diffusers==0.31.0
accelerate==1.1.1
pandas==2.0.3
# numpy==1.24.4
einops==0.7.0
tqdm==4.66.2
loguru==0.7.2
imageio==2.34.0
imageio-ffmpeg==0.5.1
safetensors==0.4.3
peft==0.13.2
transformers==4.39.3
tokenizers==0.15.0
# deepspeed==0.15.1
pyarrow==14.0.1
tensorboard==2.19.0
# git+https://github.com/openai/CLIP.git
import os
import time
from pathlib import Path
from loguru import logger
from datetime import datetime
from hyvideo.utils.file_utils import save_videos_grid
from hyvideo.config import parse_args
from hyvideo.inference import HunyuanVideoSampler
def main():
args = parse_args()
print(args)
models_root_path = Path(args.model_base)
if not models_root_path.exists():
raise ValueError(f"`models_root` not exists: {models_root_path}")
# Create save folder to save the samples
save_path = args.save_path if args.save_path_suffix=="" else f'{args.save_path}_{args.save_path_suffix}'
if not os.path.exists(save_path):
os.makedirs(save_path, exist_ok=True)
# Load models
hunyuan_video_sampler = HunyuanVideoSampler.from_pretrained(models_root_path, args=args)
# Get the updated args
args = hunyuan_video_sampler.args
# Start sampling
# TODO: batch inference check
outputs = hunyuan_video_sampler.predict(
prompt=args.prompt,
height=args.video_size[0],
width=args.video_size[1],
video_length=args.video_length,
seed=args.seed,
negative_prompt=args.neg_prompt,
infer_steps=args.infer_steps,
guidance_scale=args.cfg_scale,
num_videos_per_prompt=args.num_videos,
flow_shift=args.flow_shift,
batch_size=args.batch_size,
embedded_guidance_scale=args.embedded_cfg_scale,
i2v_mode=args.i2v_mode,
i2v_resolution=args.i2v_resolution,
i2v_image_path=args.i2v_image_path,
i2v_condition_type=args.i2v_condition_type,
i2v_stability=args.i2v_stability,
ulysses_degree=args.ulysses_degree,
ring_degree=args.ring_degree,
xdit_adaptive_size=args.xdit_adaptive_size
)
samples = outputs['samples']
# Save samples
if 'LOCAL_RANK' not in os.environ or int(os.environ['LOCAL_RANK']) == 0:
for i, sample in enumerate(samples):
sample = samples[i].unsqueeze(0)
time_flag = datetime.fromtimestamp(time.time()).strftime("%Y-%m-%d-%H:%M:%S")
cur_save_path = f"{save_path}/{time_flag}_seed{outputs['seeds'][i]}_{outputs['prompts'][i][:100].replace('/','')}.mp4"
save_videos_grid(sample, cur_save_path, fps=24)
            logger.info(f'Sample saved to: {cur_save_path}')
if __name__ == "__main__":
main()
#!/bin/bash
python3 sample_image2video.py \
--prompt "An Asian man with short hair in black tactical uniform and white clothes waves a firework stick." \
--i2v-image-path ./assets/demo/i2v/imgs/0.jpg \
--model HYVideo-T/2 \
--i2v-mode \
--i2v-resolution 720p \
--infer-steps 50 \
--video-length 129 \
--flow-reverse \
--flow-shift 17.0 \
--embedded-cfg-scale 6.0 \
--seed 0 \
--use-cpu-offload \
    --save-path ./results
# More example
# --prompt "A girl walks on the road, shooting stars pass by." \
# --i2v-image-path ./assets/demo/i2v/imgs/1.png \