import logging
import os
from pathlib import Path
import json
import tarfile
from collections import defaultdict
from einops import rearrange
from typing import List
import torch
import torchvision
import numpy as np
import imageio
import PIL.Image
from PIL import Image
CODE_SUFFIXES = {
".py", # Python codes
".sh", # Shell scripts
".yaml",
".yml", # Configuration files
}
def build_pretraining_data_loader():
pass
def logger_filter(name):
def filter_(record):
return record["extra"].get("name") == name
return filter_
def resolve_resume_path(resume, results_dir):
# Detect the resume path. Support both the experiment index and the full path.
if resume.isnumeric():
tmp_dirs = list(Path(results_dir).glob("*"))
id2exp_dir = defaultdict(list)
for tmp_dir in tmp_dirs:
part0 = tmp_dir.name.split("_")[0]
if part0.isnumeric():
id2exp_dir[int(part0)].append(tmp_dir)
resume_id = int(resume)
valid_exp_dir = id2exp_dir.get(resume_id, [])  # default to [] so the length checks below are safe
if len(valid_exp_dir) == 0:
raise ValueError(
f"No valid experiment directories found in {results_dir} with the experiment "
f"index {resume}."
)
elif len(valid_exp_dir) > 1:
raise ValueError(
f"Multiple valid experiment directories found in {results_dir} with the experiment "
f"index {resume}: {valid_exp_dir}."
)
resume_path = valid_exp_dir[0] / "checkpoints"
else:
resume_path = Path(resume)
if not resume_path.exists():
raise FileNotFoundError(f"Resume path {resume_path} not found.")
return resume_path
def dump_codes(save_path, root, sub_dirs=None, valid_suffixes=None, save_prefix="./"):
"""
Dump source code files to the experiment directory as a tarball.
Args:
save_path (str): Path to the experiment directory.
root (Path): Path to the root directory of the codes.
sub_dirs (list): List of subdirectories to be dumped. If None, all files in the root directory will
be dumped. (default: None)
valid_suffixes (tuple, optional): Valid suffixes of the files to be dumped. If None, CODE_SUFFIXES will be used.
(default: None)
save_prefix (str, optional): Prefix to be added to the files in the tarball. (default: './')
"""
if valid_suffixes is None:
valid_suffixes = CODE_SUFFIXES
# Force to use tar.gz suffix
save_path = safe_file(save_path)
assert save_path.name.endswith(
".tar.gz"
), f"save_path should end with .tar.gz, got {save_path.name}."
# Make root absolute
root = Path(root).absolute()
# Make a tarball of the codes
with tarfile.open(save_path, "w:gz") as tar:
# Recursively add all files in the root directory
if sub_dirs is None:
sub_dirs = list(root.iterdir())
for sub_dir in sub_dirs:
for file in Path(sub_dir).rglob("*"):
if file.is_file() and file.suffix in valid_suffixes:
# make file absolute
file = file.absolute()
arcname = Path(save_prefix) / file.relative_to(root)
tar.add(file, arcname=arcname)
return root
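# Hedged usage sketch for dump_codes: the save path below is an illustrative assumption,
# not part of the original code. It archives every .py/.sh/.yaml/.yml file under root.
def _demo_dump_codes():
    dump_codes(
        save_path="results/000_demo/codes.tar.gz",  # must end with .tar.gz
        root=".",
        sub_dirs=None,  # None -> every entry directly under root is walked recursively
    )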
def dump_args(args, save_path, extra_args=None):
args_dict = vars(args)
if extra_args:
assert isinstance(
extra_args, dict
), f"extra_args should be a dictionary, got {type(extra_args)}."
args_dict.update(extra_args)
# Save to file
with safe_file(save_path).open("w") as f:
json.dump(args_dict, f, indent=4, sort_keys=True, ensure_ascii=False)
def empty_logger():
logger = logging.getLogger("hymm_empty_logger")
logger.addHandler(logging.NullHandler())
logger.setLevel(logging.CRITICAL)
return logger
def is_valid_experiment(path):
path = Path(path)
if path.is_dir() and path.name.split("_")[0].isdigit():
return True
return False
def get_experiment_max_number(experiments):
valid_experiment_numbers = []
for exp in experiments:
if is_valid_experiment(exp):
valid_experiment_numbers.append(int(Path(exp).name.split("_")[0]))
if valid_experiment_numbers:
return max(valid_experiment_numbers)
return 0
def safe_dir(path):
"""
Create a directory (or the parent directory of a file) if it does not exist.
Args:
path (str or Path): Path to the directory.
Returns:
path (Path): Path object of the directory.
"""
path = Path(path)
path.mkdir(exist_ok=True, parents=True)
return path
def safe_file(path):
"""
Create the parent directory of a file if it does not exist.
Args:
path (str or Path): Path to the file.
Returns:
path (Path): Path object of the file.
"""
path = Path(path)
path.parent.mkdir(exist_ok=True, parents=True)
return path
def save_videos_grid(videos: torch.Tensor, path: str, rescale=False, n_rows=1, fps=24):
"""save videos by video tensor
copy from https://github.com/guoyww/AnimateDiff/blob/e92bd5671ba62c0d774a32951453e328018b7c5b/animatediff/utils/util.py#L61
Args:
videos (torch.Tensor): video tensor predicted by the model
path (str): path to save video
rescale (bool, optional): rescale the video tensor from [-1, 1] to . Defaults to False.
n_rows (int, optional): Defaults to 1.
fps (int, optional): video save fps. Defaults to 8.
"""
videos = rearrange(videos, "b c t h w -> t b c h w")
outputs = []
for x in videos:
x = torchvision.utils.make_grid(x, nrow=n_rows)
x = x.transpose(0, 1).transpose(1, 2).squeeze(-1)
if rescale:
x = (x + 1.0) / 2.0 # -1,1 -> 0,1
x = torch.clamp(x, 0, 1)
x = (x * 255).numpy().astype(np.uint8)
outputs.append(x)
os.makedirs(os.path.dirname(path), exist_ok=True)
imageio.mimsave(path, outputs, fps=fps)
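# Hedged usage sketch for save_videos_grid: the tensor shape and output path are
# illustrative assumptions. A (b, c, t, h, w) tensor with values in [0, 1] is written
# as an mp4 containing a grid of the b videos.
def _demo_save_videos_grid():
    dummy = torch.rand(2, 3, 16, 64, 64)  # b=2 videos, 3 channels, 16 frames, 64x64
    save_videos_grid(dummy, "results/demo/sample.mp4", rescale=False, n_rows=2, fps=24)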
import collections.abc
from itertools import repeat
import contextlib
import os
import random
import numpy as np
import torch
import deepspeed
import torch.distributed as dist
from torch.utils.tensorboard import SummaryWriter
def all_gather_sum(running_value, device):
value = torch.tensor(running_value, device=device)
dist.all_reduce(value, op=dist.ReduceOp.SUM)
return value.item()
class EventsMonitor(object):
def __init__(self, events_root, rank):
self.rank = rank
if rank == 0:
self.writer = SummaryWriter(log_dir=events_root)
else:
self.writer = None
def write_events(self, events):
for event in events:
name, val, count = event
if self.rank == 0:
self.writer.add_scalar(name, val, global_step=count)
def profiler_context(enable, exp_dir, worker_name):
if enable:
return torch.profiler.profile(
activities=[
torch.profiler.ProfilerActivity.CPU,
torch.profiler.ProfilerActivity.CUDA,
],
schedule=torch.profiler.schedule(
skip_first=10,
wait=5,
warmup=1,
active=3,
repeat=2,
),
profile_memory=True,
on_trace_ready=torch.profiler.tensorboard_trace_handler(
exp_dir, worker_name=worker_name
),
)
else:
# return empty python context manager
return contextlib.nullcontext()
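# Hedged usage sketch for profiler_context: wrap a training loop and call prof.step()
# each iteration so the skip_first/wait/warmup/active schedule above advances. The
# iteration count and exp_dir are illustrative assumptions.
def _demo_profiler_context():
    with profiler_context(enable=True, exp_dir="results/demo/profile", worker_name="rank0") as prof:
        for _ in range(30):
            torch.randn(64, 64) @ torch.randn(64, 64)  # stand-in for a training step
            if prof is not None:
                prof.step()  # nullcontext yields None when profiling is disabled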
def set_reproducibility(enable, global_seed=None):
if enable:
# Configure the seed for reproducibility
set_manual_seed(global_seed)
# Set following debug environment variable
# See the link for details: https://docs.nvidia.com/cuda/cublas/index.html#results-reproducibility
os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
# Cudnn benchmarking
torch.backends.cudnn.benchmark = not enable
# Use deterministic algorithms in PyTorch
torch.use_deterministic_algorithms(enable)
# LSTM and RNN networks are not deterministic
def set_manual_seed(global_seed):
# Seed the RNG for Python
random.seed(global_seed)
# Seed the RNG for Numpy
np.random.seed(global_seed)
# Seed the RNG for all devices (both CPU and CUDA)
torch.manual_seed(global_seed)
# Seed cuda
torch.cuda.manual_seed_all(global_seed)
def _ntuple(n):
def parse(x):
if isinstance(x, collections.abc.Iterable) and not isinstance(x, str):
x = tuple(x)
if len(x) == 1:
x = tuple(repeat(x[0], n))
return x
return tuple(repeat(x, n))
return parse
to_1tuple = _ntuple(1)
to_2tuple = _ntuple(2)
to_3tuple = _ntuple(3)
to_4tuple = _ntuple(4)
def as_tuple(x):
if isinstance(x, collections.abc.Iterable) and not isinstance(x, str):
return tuple(x)
if x is None or isinstance(x, (int, float, str)):
return (x,)
else:
raise ValueError(f"Unknown type {type(x)}")
def as_list_of_2tuple(x):
x = as_tuple(x)
if len(x) == 1:
x = (x[0], x[0])
assert len(x) % 2 == 0, f"Expect even length, got {len(x)}."
lst = []
for i in range(0, len(x), 2):
lst.append((x[i], x[i + 1]))
return lst
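# Hedged illustration of the tuple helpers above; the asserted values follow directly
# from the definitions and can be verified by hand.
def _demo_tuple_helpers():
    assert to_2tuple(3) == (3, 3)
    assert to_2tuple((4, 5)) == (4, 5)
    assert as_tuple(None) == (None,)
    assert as_list_of_2tuple(7) == [(7, 7)]
    assert as_list_of_2tuple((1, 2, 3, 4)) == [(1, 2), (3, 4)]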
import torch
from safetensors.torch import load_file
# load kohya lora for diffusers pipeline
def load_lora_for_pipeline(
pipeline,
lora_path,
LORA_PREFIX_TRANSFORMER="",
LORA_PREFIX_TEXT_ENCODER="",
alpha=1.0,
device=0,
):
# load LoRA weight from .safetensors
state_dict = load_file(lora_path, device=device)
visited = []
# directly update weight in diffusers model
for key in state_dict:
# it is suggested to print out the key, it usually will be something like below
# "lora_te_text_model_encoder_layers_0_self_attn_k_proj.lora_down.weight"
# as we have set the alpha beforehand, so just skip
if ".alpha" in key or key in visited:
continue
if "text" in key:
layer_infos = (
key.split(".")[0].split(LORA_PREFIX_TEXT_ENCODER + "_")[-1].split("_")
)
curr_layer = pipeline.text_encoder
else:
layer_infos = (
key.split(".")[0].split(LORA_PREFIX_TRANSFORMER + "_")[-1].split("_")
)
curr_layer = pipeline.transformer
# find the target layer
temp_name = layer_infos.pop(0)
while len(layer_infos) > -1:
try:
curr_layer = curr_layer.__getattr__(temp_name)
if len(layer_infos) > 0:
temp_name = layer_infos.pop(0)
elif len(layer_infos) == 0:
break
except Exception:
if len(temp_name) > 0:
temp_name += "_" + layer_infos.pop(0)
else:
temp_name = layer_infos.pop(0)
pair_keys = []
if "lora_down" in key:
pair_keys.append(key.replace("lora_down", "lora_up"))
pair_keys.append(key)
else:
pair_keys.append(key)
pair_keys.append(key.replace("lora_up", "lora_down"))
# update weight
if len(state_dict[pair_keys[0]].shape) == 4:
weight_up = state_dict[pair_keys[0]].squeeze(3).squeeze(2).to(torch.float32)
weight_down = (
state_dict[pair_keys[1]].squeeze(3).squeeze(2).to(torch.float32)
)
curr_layer.weight.data += alpha * torch.mm(
weight_up, weight_down
).unsqueeze(2).unsqueeze(3)
else:
weight_up = state_dict[pair_keys[0]].to(torch.float32)
weight_down = state_dict[pair_keys[1]].to(torch.float32)
curr_layer.weight.data += alpha * torch.mm(weight_up, weight_down)
# update visited list
for item in pair_keys:
visited.append(item)
del state_dict
return pipeline
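# Hedged usage sketch: the LoRA file name and prefix below are illustrative assumptions.
# The prefix must match the leading part of the kohya keys in the .safetensors file
# (everything before the module path in each key), e.g. "lora_unet" for a typical export.
def _demo_load_lora_for_pipeline(pipeline):
    return load_lora_for_pipeline(
        pipeline,
        lora_path="lora/adapter.safetensors",       # hypothetical path
        LORA_PREFIX_TRANSFORMER="lora_unet",        # hypothetical prefix; depends on the exporter
        alpha=1.0,
        device="cpu",                               # safetensors.load_file accepts a device string
    )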
import argparse
import torch
from transformers import (
AutoProcessor,
LlavaForConditionalGeneration,
)
def preprocess_text_encoder_tokenizer(args):
processor = AutoProcessor.from_pretrained(args.input_dir)
model = LlavaForConditionalGeneration.from_pretrained(
args.input_dir,
torch_dtype=torch.float16,
low_cpu_mem_usage=True,
).to(0)
model.language_model.save_pretrained(f"{args.output_dir}")
processor.tokenizer.save_pretrained(f"{args.output_dir}")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--input_dir",
type=str,
required=True,
help="The path to the llava-llama-3-8b-v1_1-transformers.",
)
parser.add_argument(
"--output_dir",
type=str,
default="",
help="The output path of the llava-llama-3-8b-text-encoder-tokenizer."
"if '', the parent dir of output will be the same as input dir.",
)
args = parser.parse_args()
if len(args.output_dir) == 0:
args.output_dir = "/".join(args.input_dir.split("/")[:-1])
preprocess_text_encoder_tokenizer(args)
import random
import torchvision.transforms as transforms
import numpy as np
import torch
import imageio
import os
import PIL.Image
from typing import Union, Optional, List
from peft import get_peft_model_state_dict
from hyvideo.modules.posemb_layers import get_nd_rotary_pos_embed
from hyvideo.vae import AutoencoderKLCausal3D
from pathlib import Path
from einops import rearrange
from PIL import Image
from hyvideo.constants import PRECISION_TO_TYPE
from safetensors.torch import load_file
def convert_kohya_to_peft_keys(
kohya_dict: dict,
kohya_prefix="",
peft_prefix: str = "base_model.model",
device="cpu",
) -> dict:
peft_dict = {}
for k, v in kohya_dict.items():
if ".alpha" in k:
continue
new_key = k.replace(f"{kohya_prefix}_lora_", f"{peft_prefix}.")
new_key = new_key.replace("single_blocks_", "single_blocks.")
new_key = new_key.replace("double_blocks_", "double_blocks.")
new_key = new_key.replace("_img_attn_proj", ".img_attn_proj")
new_key = new_key.replace("_img_attn_qkv", ".img_attn_qkv")
new_key = new_key.replace("_img_mlp_fc", ".img_mlp.fc")
new_key = new_key.replace("_txt_mlp_fc", ".txt_mlp.fc")
new_key = new_key.replace("_img_mod", ".img_mod")
new_key = new_key.replace("_txt", ".txt")
new_key = new_key.replace("_modulation", ".modulation")
new_key = new_key.replace("_linear", ".linear")
new_key = new_key.replace("lora_down", "lora_A.default")
new_key = new_key.replace("lora_up", "lora_B.default")
new_key = new_key.replace(
"_individual_token_refiner_blocks_", ".individual_token_refiner.blocks."
)
new_key = new_key.replace("_mlp_fc", ".mlp.fc")
peft_dict[new_key] = v.to(device)
return peft_dict
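# Hedged illustration of the key mapping performed by convert_kohya_to_peft_keys; the
# sample key below is hypothetical but follows the "<prefix>_lora_<module path>" pattern
# that the replacements above expect.
def _demo_convert_kohya_to_peft_keys():
    kohya = {
        "Hunyuan_video_I2V_lora_double_blocks_0_img_attn_qkv.lora_down.weight": torch.zeros(4, 8),
    }
    peft = convert_kohya_to_peft_keys(kohya, kohya_prefix="Hunyuan_video_I2V")
    # -> {"base_model.model.double_blocks.0.img_attn_qkv.lora_A.default.weight": tensor(...)}
    return peft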
def load_lora(model, lora_path, device):
kohya_weights = load_file(lora_path)
peft_weights = convert_kohya_to_peft_keys(
kohya_weights, kohya_prefix="Hunyuan_video_I2V", device=device
)
model.load_state_dict(peft_weights, strict=False)
return model
def black_image(width, height):
black_image = Image.new("RGB", (width, height), (0, 0, 0))
return black_image
def numpy_to_pil(images: np.ndarray) -> List[PIL.Image.Image]:
if images.ndim == 3:
images = images[None, ...]
images = (images * 255).round().astype("uint8")
if images.shape[-1] == 1:
# special case for grayscale (single channel) images
pil_images = [Image.fromarray(image.squeeze(), mode="L") for image in images]
else:
pil_images = [Image.fromarray(image) for image in images]
return pil_images
def get_cond_latents(args, latents, vae):
"""get conditioned latent by decode and encode the first frame latents"""
first_image_latents = latents[:, :, 0, ...] if len(latents.shape) == 5 else latents
first_image_latents = 1 / vae.config.scaling_factor * first_image_latents
first_images = vae.decode(
first_image_latents.unsqueeze(2).to(vae.dtype), return_dict=False
)[0]
first_images = first_images.squeeze(2)
first_images = (first_images / 2 + 0.5).clamp(0, 1)
first_images = first_images.cpu().permute(0, 2, 3, 1).float().numpy()
first_images = numpy_to_pil(first_images)
image_transform = transforms.Compose(
[transforms.ToTensor(), transforms.Normalize([0.5], [0.5])]
)
first_images_pixel_values = [image_transform(image) for image in first_images]
first_images_pixel_values = (
torch.cat(first_images_pixel_values).unsqueeze(0).unsqueeze(2).to(vae.device)
)
vae_dtype = PRECISION_TO_TYPE[args.vae_precision]
with torch.autocast(
device_type="cuda", dtype=vae_dtype, enabled=vae_dtype != torch.float32
):
cond_latents = vae.encode(
first_images_pixel_values
).latent_dist.sample() # B, C, F, H, W
cond_latents.mul_(vae.config.scaling_factor)
return cond_latents
def get_cond_images(args, latents, vae, is_uncond=False):
"""get conditioned images by decode the first frame latents"""
semantic_image_latents = (
latents[:, :, 0, ...] if len(latents.shape) == 5 else latents
)
semantic_image_latents = 1 / vae.config.scaling_factor * semantic_image_latents
semantic_images = vae.decode(
semantic_image_latents.unsqueeze(2).to(vae.dtype), return_dict=False
)[0]
semantic_images = semantic_images.squeeze(2)
semantic_images = (semantic_images / 2 + 0.5).clamp(0, 1)
semantic_images = semantic_images.cpu().permute(0, 2, 3, 1).float().numpy()
semantic_images = numpy_to_pil(semantic_images)
if is_uncond:
semantic_images = [
black_image(img.size[0], img.size[1]) for img in semantic_images
]
return semantic_images
def load_state_dict(args, model, logger):
pretrained_model_path = Path(args.model_base)
if not pretrained_model_path.exists():
raise ValueError(f"`models_root` not exists: {pretrained_model_path}")
load_key = args.load_key
if args.i2v_mode:
dit_weight = Path(args.i2v_dit_weight)
else:
dit_weight = Path(args.dit_weight)
if dit_weight is None:
model_dir = pretrained_model_path / f"t2v_{args.model_resolution}"
files = list(model_dir.glob("*.pt"))
if len(files) == 0:
raise ValueError(f"No model weights found in {model_dir}")
if files[0].name.startswith("pytorch_model_"):
model_path = model_dir / f"pytorch_model_{load_key}.pt"
bare_model = True
elif any(str(f).endswith("_model_states.pt") for f in files):
files = [f for f in files if str(f).endswith("_model_states.pt")]
model_path = files[0]
if len(files) > 1:
logger.warning(
f"Multiple model weights found in {dit_weight}, using {model_path}"
)
bare_model = False
else:
raise ValueError(
f"Invalid model path: {dit_weight} with unrecognized weight format: "
f"{list(map(str, files))}. When given a directory as --dit-weight, only "
f"`pytorch_model_*.pt`(provided by HunyuanVideo official) and "
f"`*_model_states.pt`(saved by deepspeed) can be parsed. If you want to load a "
f"specific weight file, please provide the full path to the file."
)
else:
if dit_weight.is_dir():
files = list(dit_weight.glob("*.pt"))
if len(files) == 0:
raise ValueError(f"No model weights found in {dit_weight}")
if files[0].name.startswith("pytorch_model_"):
model_path = dit_weight / f"pytorch_model_{load_key}.pt"
bare_model = True
elif any(str(f).endswith("_model_states.pt") for f in files):
files = [f for f in files if str(f).endswith("_model_states.pt")]
model_path = files[0]
if len(files) > 1:
logger.warning(
f"Multiple model weights found in {dit_weight}, using {model_path}"
)
bare_model = False
else:
raise ValueError(
f"Invalid model path: {dit_weight} with unrecognized weight format: "
f"{list(map(str, files))}. When given a directory as --dit-weight, only "
f"`pytorch_model_*.pt`(provided by HunyuanVideo official) and "
f"`*_model_states.pt`(saved by deepspeed) can be parsed. If you want to load a "
f"specific weight file, please provide the full path to the file."
)
elif dit_weight.is_file():
model_path = dit_weight
bare_model = "unknown"
else:
raise ValueError(f"Invalid model path: {dit_weight}")
if not model_path.exists():
raise ValueError(f"model_path not exists: {model_path}")
logger.info(f"Loading torch model {model_path}...")
state_dict = torch.load(model_path, map_location=lambda storage, loc: storage)
if bare_model == "unknown" and ("ema" in state_dict or "module" in state_dict):
bare_model = False
if bare_model is False:
if load_key in state_dict:
state_dict = state_dict[load_key]
else:
raise KeyError(
f"Missing key: `{load_key}` in the checkpoint: {model_path}. The keys in the checkpoint "
f"are: {list(state_dict.keys())}."
)
model.load_state_dict(state_dict, strict=True)
return model
class set_worker_seed_builder:
def __init__(self, global_rank):
self.global_rank = global_rank
def __call__(self, worker_id):
set_manual_seed(torch.initial_seed() % (2 ** 32 - 1))
def set_reproducibility(enable, global_seed=None):
if enable:
# Configure the seed for reproducibility
set_manual_seed(global_seed)
# Set following debug environment variable
# See the link for details: https://docs.nvidia.com/cuda/cublas/index.html#results-reproducibility
os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
# Cudnn benchmarking
torch.backends.cudnn.benchmark = not enable
# Use deterministic algorithms in PyTorch
torch.use_deterministic_algorithms(enable)
# LSTM and RNN networks are not deterministic
def prepare_model_inputs(
args,
batch: tuple,
device: Union[int, str],
model,
vae,
text_encoder,
text_encoder_2=None,
rope_theta_rescale_factor: Union[float, List[float]] = 1.0,
rope_interpolation_factor: Union[float, List[float]] = 1.0,
):
media, latents, *batch_args = batch
if len(batch_args) == 3:
text_ids, text_mask, kwargs = batch_args
text_ids_2, text_mask_2 = None, None
elif len(batch_args) == 5:
text_ids, text_mask, text_ids_2, text_mask_2, kwargs = batch_args
else:
raise ValueError(f"Unexpected batch_args.")
data_type = kwargs["type"][0]
# Move batch to device
media = media.to(device)
latents = latents.to(device)
text_ids = text_ids.to(device)
text_mask = text_mask.to(device)
# ======================================== Encode media ======================================
# Used for a 3D VAE with 2D inputs (images).
# Prepare media shape for 2D/3D VAE
if len(latents.shape) == 1:
if len(media.shape) == 4:
# media is a batch of image with shape [b, c, h, w]
if isinstance(vae, AutoencoderKLCausal3D):
media = media.unsqueeze(2) # [b, c, 1, h, w]
elif len(media.shape) == 5:
# media is a batch of video with shape [b, c, f, h, w]
if not isinstance(vae, AutoencoderKLCausal3D):
media = rearrange(media, "b c f h w -> (b f) c h w")
else:
raise ValueError(
f"Only support media with shape (b, c, h, w) or (b, c, f, h, w), but got {media.shape}."
)
vae_dtype = PRECISION_TO_TYPE[args.vae_precision]
with torch.autocast(
device_type="cuda", dtype=vae_dtype, enabled=vae_dtype != torch.float32
):
latents = vae.encode(media).latent_dist.sample()
if hasattr(vae.config, "shift_factor") and vae.config.shift_factor:
latents.sub_(vae.config.shift_factor).mul_(vae.config.scaling_factor)
else:
latents.mul_(vae.config.scaling_factor)
elif len(latents.shape) == 5 or len(latents.shape) == 4: # Using video/image cache
latents = (
latents * vae.config.scaling_factor
) # vae cache is not multiplied by scaling_factor
else:
raise ValueError(
f"Only support media/latent with shape (b, c, h, w) or (b, c, f, h, w), but got {media.shape} {latents.shape}."
)
cond_latents = get_cond_latents(args, latents, vae)
is_uncond = (
torch.tensor(1).to(torch.int64)
if random.random() < args.sematic_cond_drop_p
else torch.tensor(0).to(torch.int64)
)
semantic_images = get_cond_images(args, latents, vae, is_uncond=is_uncond)
# ======================================== Encode text ======================================
# Autocast is handled by text_encoder itself.
# Whether to apply text_mask is determined by args.use_attention_mask.
text_outputs = text_encoder.encode(
{"input_ids": text_ids, "attention_mask": text_mask},
data_type=batch_args[-1]["type"][0],
semantic_images=semantic_images,
)
text_states = text_outputs.hidden_state
text_mask = text_outputs.attention_mask
text_states_2 = (
text_encoder_2.encode(
{"input_ids": text_ids_2, "attention_mask": text_mask_2},
data_type=data_type,
).hidden_state
if text_encoder_2 is not None
else None
)
# ======================================== Build RoPE ======================================
target_ndim = 3 # n-d RoPE
ndim = len(latents.shape) - 2
latents_size = list(latents.shape[-ndim:])
freqs_cos, freqs_sin = get_rope_freq_from_size(
args,
model,
latents_size,
ndim,
target_ndim,
rope_theta_rescale_factor=rope_theta_rescale_factor,
rope_interpolation_factor=rope_interpolation_factor,
)
# ===================================== Pack model kwargs ==================================
model_kwargs = dict(
text_states=text_states, # [b, 256, 4096]
text_mask=text_mask, # [b, 256]
text_states_2=text_states_2, # [b, 768]
freqs_cos=freqs_cos, # [seqlen, head_dim]
freqs_sin=freqs_sin, # [seqlen, head_dim]
return_dict=True,
)
return latents, model_kwargs, freqs_cos.shape[0], cond_latents
def format_params(params):
if params < 1e6:
return f"{params} (less than 1M)"
elif params < 1e9:
return f"{params / 1e6:.2f}M"
else:
return f"{params / 1e9:.2f}B"
def set_manual_seed(global_seed):
random.seed(global_seed)
np.random.seed(global_seed)
torch.manual_seed(global_seed)
def get_rope_freq_from_size(
args,
model,
latents_size,
ndim,
target_ndim,
rope_theta_rescale_factor=1.0,
rope_interpolation_factor=1.0,
):
if isinstance(model.patch_size, int):
assert all(s % model.patch_size == 0 for s in latents_size), (
f"Latent size(last {ndim} dimensions) should be divisible by patch size({model.patch_size}), "
f"but got {latents_size}."
)
rope_sizes = [s // model.patch_size for s in latents_size]
elif isinstance(model.patch_size, list):
assert all(
s % model.patch_size[idx] == 0 for idx, s in enumerate(latents_size)
), (
f"Latent size(last {ndim} dimensions) should be divisible by patch size({model.patch_size}), "
f"but got {latents_size}."
)
rope_sizes = [s // model.patch_size[idx] for idx, s in enumerate(latents_size)]
if len(rope_sizes) != target_ndim:
rope_sizes = [1] * (target_ndim - len(rope_sizes)) + rope_sizes # time axis
head_dim = model.hidden_size // model.heads_num
rope_dim_list = model.rope_dim_list
if rope_dim_list is None:
rope_dim_list = [head_dim // target_ndim for _ in range(target_ndim)]
assert (
sum(rope_dim_list) == head_dim
), "sum(rope_dim_list) should equal to head_dim of attention layer"
freqs_cos, freqs_sin = get_nd_rotary_pos_embed(
rope_dim_list,
rope_sizes,
theta=args.rope_theta,
use_real=True,
theta_rescale_factor=rope_theta_rescale_factor,
interpolation_factor=rope_interpolation_factor,
)
return freqs_cos, freqs_sin
# copy from https://github.com/huggingface/diffusers/blob/ec9bfa9e148b7764137dd92247ce859d915abcb0/examples/consistency_distillation/train_lcm_distill_lora_sd_wds.py#L258
# get kohya lora state dict
def get_module_kohya_state_dict(module, prefix, dtype, adapter_name="default"):
kohya_ss_state_dict = {}
for peft_key, weight in get_peft_model_state_dict(
module, adapter_name=adapter_name
).items():
kohya_key = peft_key.replace("base_model.model", prefix)
kohya_key = kohya_key.replace("lora_A", "lora_down")
kohya_key = kohya_key.replace("lora_B", "lora_up")
kohya_key = kohya_key.replace(".", "_", kohya_key.count(".") - 2)
kohya_ss_state_dict[kohya_key] = weight.to(dtype)
# Set alpha parameter
if "lora_down" in kohya_key:
alpha_key = f'{kohya_key.split(".")[0]}.alpha'
kohya_ss_state_dict[alpha_key] = torch.tensor(
module.peft_config[adapter_name].lora_alpha
).to(dtype)
return kohya_ss_state_dict
# get diffusers lora state dict
def get_module_diffusers_state_dict(module, dtype, adapter_name="default"):
diffusers_ss_state_dict = {}
for peft_key, weight in get_peft_model_state_dict(
module, adapter_name=adapter_name
).items():
diffusers_key = peft_key.replace("base_model.model", "diffusion_model")
diffusers_ss_state_dict[diffusers_key] = weight.to(dtype)
return diffusers_ss_state_dict
from pathlib import Path
import torch
from .autoencoder_kl_causal_3d import AutoencoderKLCausal3D
from ..constants import VAE_PATH, PRECISION_TO_TYPE
def load_vae(vae_type: str="884-16c-hy",
vae_precision: str=None,
sample_size: tuple=None,
vae_path: str=None,
logger=None,
device=None
):
"""the fucntion to load the 3D VAE model
Args:
vae_type (str): the type of the 3D VAE model. Defaults to "884-16c-hy".
vae_precision (str, optional): the precision to load vae. Defaults to None.
sample_size (tuple, optional): the tiling size. Defaults to None.
vae_path (str, optional): the path to vae. Defaults to None.
logger (_type_, optional): logger. Defaults to None.
device (_type_, optional): device to load vae. Defaults to None.
"""
if vae_path is None:
vae_path = VAE_PATH[vae_type]
if logger is not None:
logger.info(f"Loading 3D VAE model ({vae_type}) from: {vae_path}")
config = AutoencoderKLCausal3D.load_config(vae_path)
if sample_size:
vae = AutoencoderKLCausal3D.from_config(config, sample_size=sample_size)
else:
vae = AutoencoderKLCausal3D.from_config(config)
vae_ckpt = Path(vae_path) / "pytorch_model.pt"
assert vae_ckpt.exists(), f"VAE checkpoint not found: {vae_ckpt}"
ckpt = torch.load(vae_ckpt, map_location=vae.device)
if "state_dict" in ckpt:
ckpt = ckpt["state_dict"]
if any(k.startswith("vae.") for k in ckpt.keys()):
ckpt = {k.replace("vae.", ""): v for k, v in ckpt.items() if k.startswith("vae.")}
vae.load_state_dict(ckpt)
spatial_compression_ratio = vae.config.spatial_compression_ratio
time_compression_ratio = vae.config.time_compression_ratio
if vae_precision is not None:
vae = vae.to(dtype=PRECISION_TO_TYPE[vae_precision])
vae.requires_grad_(False)
if logger is not None:
logger.info(f"VAE to dtype: {vae.dtype}")
if device is not None:
vae = vae.to(device)
vae.eval()
return vae, vae_path, spatial_compression_ratio, time_compression_ratio
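# Hedged usage sketch for load_vae: the precision, tiling sample size, and device are
# illustrative; "fp16" is assumed to be a valid key of PRECISION_TO_TYPE, and vae_path
# falls back to VAE_PATH["884-16c-hy"] when left as None.
def _demo_load_vae(logger=None):
    vae, vae_path, spatial_ratio, time_ratio = load_vae(
        vae_type="884-16c-hy",
        vae_precision="fp16",
        sample_size=(256, 256),
        vae_path=None,
        logger=logger,
        device="cuda" if torch.cuda.is_available() else "cpu",
    )
    return vae, spatial_ratio, time_ratio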
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
#
# Modified from diffusers==0.29.2
#
# ==============================================================================
from typing import Dict, Optional, Tuple, Union
from dataclasses import dataclass
import torch
import torch.nn as nn
from diffusers.configuration_utils import ConfigMixin, register_to_config
try:
# This diffusers is modified and packed in the mirror.
from diffusers.loaders import FromOriginalVAEMixin
except ImportError:
# Use this to be compatible with the original diffusers.
from diffusers.loaders.single_file_model import FromOriginalModelMixin as FromOriginalVAEMixin
from diffusers.utils.accelerate_utils import apply_forward_hook
from diffusers.models.attention_processor import (
ADDED_KV_ATTENTION_PROCESSORS,
CROSS_ATTENTION_PROCESSORS,
Attention,
AttentionProcessor,
AttnAddedKVProcessor,
AttnProcessor,
)
from diffusers.models.modeling_outputs import AutoencoderKLOutput
from diffusers.models.modeling_utils import ModelMixin
from .vae import DecoderCausal3D, BaseOutput, DecoderOutput, DiagonalGaussianDistribution, EncoderCausal3D
@dataclass
class DecoderOutput2(BaseOutput):
sample: torch.FloatTensor
posterior: Optional[DiagonalGaussianDistribution] = None
class AutoencoderKLCausal3D(ModelMixin, ConfigMixin, FromOriginalVAEMixin):
r"""
A VAE model with KL loss for encoding images/videos into latents and decoding latent representations into images/videos.
This model inherits from [`ModelMixin`]. Check the superclass documentation for its generic methods implemented
for all models (such as downloading or saving).
"""
_supports_gradient_checkpointing = True
@register_to_config
def __init__(
self,
in_channels: int = 3,
out_channels: int = 3,
down_block_types: Tuple[str] = ("DownEncoderBlockCausal3D",),
up_block_types: Tuple[str] = ("UpDecoderBlockCausal3D",),
block_out_channels: Tuple[int] = (64,),
layers_per_block: int = 1,
act_fn: str = "silu",
latent_channels: int = 4,
norm_num_groups: int = 32,
sample_size: int = 32,
sample_tsize: int = 64,
scaling_factor: float = 0.18215,
force_upcast: float = True,
spatial_compression_ratio: int = 8,
time_compression_ratio: int = 4,
mid_block_add_attention: bool = True,
):
super().__init__()
self.time_compression_ratio = time_compression_ratio
self.encoder = EncoderCausal3D(
in_channels=in_channels,
out_channels=latent_channels,
down_block_types=down_block_types,
block_out_channels=block_out_channels,
layers_per_block=layers_per_block,
act_fn=act_fn,
norm_num_groups=norm_num_groups,
double_z=True,
time_compression_ratio=time_compression_ratio,
spatial_compression_ratio=spatial_compression_ratio,
mid_block_add_attention=mid_block_add_attention,
)
self.decoder = DecoderCausal3D(
in_channels=latent_channels,
out_channels=out_channels,
up_block_types=up_block_types,
block_out_channels=block_out_channels,
layers_per_block=layers_per_block,
norm_num_groups=norm_num_groups,
act_fn=act_fn,
time_compression_ratio=time_compression_ratio,
spatial_compression_ratio=spatial_compression_ratio,
mid_block_add_attention=mid_block_add_attention,
)
self.quant_conv = nn.Conv3d(2 * latent_channels, 2 * latent_channels, kernel_size=1)
self.post_quant_conv = nn.Conv3d(latent_channels, latent_channels, kernel_size=1)
self.use_slicing = False
self.use_spatial_tiling = False
self.use_temporal_tiling = False
# only relevant if vae tiling is enabled
self.tile_sample_min_tsize = sample_tsize
self.tile_latent_min_tsize = sample_tsize // time_compression_ratio
self.tile_sample_min_size = self.config.sample_size
sample_size = (
self.config.sample_size[0]
if isinstance(self.config.sample_size, (list, tuple))
else self.config.sample_size
)
self.tile_latent_min_size = int(sample_size / (2 ** (len(self.config.block_out_channels) - 1)))
self.tile_overlap_factor = 0.25
def _set_gradient_checkpointing(self, module, value=False):
if isinstance(module, (EncoderCausal3D, DecoderCausal3D)):
module.gradient_checkpointing = value
def enable_temporal_tiling(self, use_tiling: bool = True):
self.use_temporal_tiling = use_tiling
def disable_temporal_tiling(self):
self.enable_temporal_tiling(False)
def enable_spatial_tiling(self, use_tiling: bool = True):
self.use_spatial_tiling = use_tiling
def disable_spatial_tiling(self):
self.enable_spatial_tiling(False)
def enable_tiling(self, use_tiling: bool = True):
r"""
Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
compute decoding and encoding in several steps. This is useful to save a large amount of memory and to allow
processing larger videos.
"""
self.enable_spatial_tiling(use_tiling)
self.enable_temporal_tiling(use_tiling)
def disable_tiling(self):
r"""
Disable tiled VAE decoding. If `enable_tiling` was previously enabled, this method will go back to computing
decoding in one step.
"""
self.disable_spatial_tiling()
self.disable_temporal_tiling()
def enable_slicing(self):
r"""
Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
"""
self.use_slicing = True
def disable_slicing(self):
r"""
Disable sliced VAE decoding. If `enable_slicing` was previously enabled, this method will go back to computing
decoding in one step.
"""
self.use_slicing = False
@property
# Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.attn_processors
def attn_processors(self) -> Dict[str, AttentionProcessor]:
r"""
Returns:
`dict` of attention processors: A dictionary containing all attention processors used in the model,
indexed by their weight names.
"""
# set recursively
processors = {}
def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
if hasattr(module, "get_processor"):
processors[f"{name}.processor"] = module.get_processor(return_deprecated_lora=True)
for sub_name, child in module.named_children():
fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
return processors
for name, module in self.named_children():
fn_recursive_add_processors(name, module, processors)
return processors
# Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.set_attn_processor
def set_attn_processor(
self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]], _remove_lora=False
):
r"""
Sets the attention processor to use to compute attention.
Parameters:
processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
The instantiated processor class or a dictionary of processor classes that will be set as the processor
for **all** `Attention` layers.
If `processor` is a dict, the key needs to define the path to the corresponding cross attention
processor. This is strongly recommended when setting trainable attention processors.
"""
count = len(self.attn_processors.keys())
if isinstance(processor, dict) and len(processor) != count:
raise ValueError(
f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
)
def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
if hasattr(module, "set_processor"):
if not isinstance(processor, dict):
module.set_processor(processor, _remove_lora=_remove_lora)
else:
module.set_processor(processor.pop(f"{name}.processor"), _remove_lora=_remove_lora)
for sub_name, child in module.named_children():
fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
for name, module in self.named_children():
fn_recursive_attn_processor(name, module, processor)
# Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.set_default_attn_processor
def set_default_attn_processor(self):
"""
Disables custom attention processors and sets the default attention implementation.
"""
if all(proc.__class__ in ADDED_KV_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
processor = AttnAddedKVProcessor()
elif all(proc.__class__ in CROSS_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
processor = AttnProcessor()
else:
raise ValueError(
f"Cannot call `set_default_attn_processor` when attention processors are of type {next(iter(self.attn_processors.values()))}"
)
self.set_attn_processor(processor, _remove_lora=True)
@apply_forward_hook
def encode(
self, x: torch.FloatTensor, return_dict: bool = True
) -> Union[AutoencoderKLOutput, Tuple[DiagonalGaussianDistribution]]:
"""
Encode a batch of images/videos into latents.
Args:
x (`torch.FloatTensor`): Input batch of images/videos.
return_dict (`bool`, *optional*, defaults to `True`):
Whether to return a [`~models.autoencoder_kl.AutoencoderKLOutput`] instead of a plain tuple.
Returns:
The latent representations of the encoded images/videos. If `return_dict` is True, a
[`~models.autoencoder_kl.AutoencoderKLOutput`] is returned, otherwise a plain `tuple` is returned.
"""
assert len(x.shape) == 5, "The input tensor should have 5 dimensions."
if self.use_temporal_tiling and x.shape[2] > self.tile_sample_min_tsize:
return self.temporal_tiled_encode(x, return_dict=return_dict)
if self.use_spatial_tiling and (x.shape[-1] > self.tile_sample_min_size or x.shape[-2] > self.tile_sample_min_size):
return self.spatial_tiled_encode(x, return_dict=return_dict)
if self.use_slicing and x.shape[0] > 1:
encoded_slices = [self.encoder(x_slice) for x_slice in x.split(1)]
h = torch.cat(encoded_slices)
else:
h = self.encoder(x)
moments = self.quant_conv(h)
posterior = DiagonalGaussianDistribution(moments)
if not return_dict:
return (posterior,)
return AutoencoderKLOutput(latent_dist=posterior)
def _decode(self, z: torch.FloatTensor, return_dict: bool = True) -> Union[DecoderOutput, torch.FloatTensor]:
assert len(z.shape) == 5, "The input tensor should have 5 dimensions."
if self.use_temporal_tiling and z.shape[2] > self.tile_latent_min_tsize:
return self.temporal_tiled_decode(z, return_dict=return_dict)
if self.use_spatial_tiling and (z.shape[-1] > self.tile_latent_min_size or z.shape[-2] > self.tile_latent_min_size):
return self.spatial_tiled_decode(z, return_dict=return_dict)
z = self.post_quant_conv(z)
dec = self.decoder(z)
if not return_dict:
return (dec,)
return DecoderOutput(sample=dec)
@apply_forward_hook
def decode(
self, z: torch.FloatTensor, return_dict: bool = True, generator=None
) -> Union[DecoderOutput, torch.FloatTensor]:
"""
Decode a batch of images/videos.
Args:
z (`torch.FloatTensor`): Input batch of latent vectors.
return_dict (`bool`, *optional*, defaults to `True`):
Whether to return a [`~models.vae.DecoderOutput`] instead of a plain tuple.
Returns:
[`~models.vae.DecoderOutput`] or `tuple`:
If return_dict is True, a [`~models.vae.DecoderOutput`] is returned, otherwise a plain `tuple` is
returned.
"""
if self.use_slicing and z.shape[0] > 1:
decoded_slices = [self._decode(z_slice).sample for z_slice in z.split(1)]
decoded = torch.cat(decoded_slices)
else:
decoded = self._decode(z).sample
if not return_dict:
return (decoded,)
return DecoderOutput(sample=decoded)
def blend_v(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor:
blend_extent = min(a.shape[-2], b.shape[-2], blend_extent)
for y in range(blend_extent):
b[:, :, :, y, :] = a[:, :, :, -blend_extent + y, :] * (1 - y / blend_extent) + b[:, :, :, y, :] * (y / blend_extent)
return b
def blend_h(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor:
blend_extent = min(a.shape[-1], b.shape[-1], blend_extent)
for x in range(blend_extent):
b[:, :, :, :, x] = a[:, :, :, :, -blend_extent + x] * (1 - x / blend_extent) + b[:, :, :, :, x] * (x / blend_extent)
return b
def blend_t(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor:
blend_extent = min(a.shape[-3], b.shape[-3], blend_extent)
for x in range(blend_extent):
b[:, :, x, :, :] = a[:, :, -blend_extent + x, :, :] * (1 - x / blend_extent) + b[:, :, x, :, :] * (x / blend_extent)
return b
def spatial_tiled_encode(self, x: torch.FloatTensor, return_dict: bool = True, return_moments: bool = False) -> AutoencoderKLOutput:
r"""Encode a batch of images/videos using a tiled encoder.
When this option is enabled, the VAE will split the input tensor into tiles to compute encoding in several
steps. This is useful to keep memory use constant regardless of image/video size. The end result of tiled encoding is
different from non-tiled encoding because each tile uses a different encoder. To avoid tiling artifacts, the
tiles overlap and are blended together to form a smooth output. You may still see tile-sized changes in the
output, but they should be much less noticeable.
Args:
x (`torch.FloatTensor`): Input batch of images/videos.
return_dict (`bool`, *optional*, defaults to `True`):
Whether or not to return a [`~models.autoencoder_kl.AutoencoderKLOutput`] instead of a plain tuple.
Returns:
[`~models.autoencoder_kl.AutoencoderKLOutput`] or `tuple`:
If return_dict is True, a [`~models.autoencoder_kl.AutoencoderKLOutput`] is returned, otherwise a plain
`tuple` is returned.
"""
overlap_size = int(self.tile_sample_min_size * (1 - self.tile_overlap_factor))
blend_extent = int(self.tile_latent_min_size * self.tile_overlap_factor)
row_limit = self.tile_latent_min_size - blend_extent
# Split video into tiles and encode them separately.
rows = []
for i in range(0, x.shape[-2], overlap_size):
row = []
for j in range(0, x.shape[-1], overlap_size):
tile = x[:, :, :, i: i + self.tile_sample_min_size, j: j + self.tile_sample_min_size]
tile = self.encoder(tile)
tile = self.quant_conv(tile)
row.append(tile)
rows.append(row)
result_rows = []
for i, row in enumerate(rows):
result_row = []
for j, tile in enumerate(row):
# blend the above tile and the left tile
# to the current tile and add the current tile to the result row
if i > 0:
tile = self.blend_v(rows[i - 1][j], tile, blend_extent)
if j > 0:
tile = self.blend_h(row[j - 1], tile, blend_extent)
result_row.append(tile[:, :, :, :row_limit, :row_limit])
result_rows.append(torch.cat(result_row, dim=-1))
moments = torch.cat(result_rows, dim=-2)
if return_moments:
return moments
posterior = DiagonalGaussianDistribution(moments)
if not return_dict:
return (posterior,)
return AutoencoderKLOutput(latent_dist=posterior)
def spatial_tiled_decode(self, z: torch.FloatTensor, return_dict: bool = True) -> Union[DecoderOutput, torch.FloatTensor]:
r"""
Decode a batch of images/videos using a tiled decoder.
Args:
z (`torch.FloatTensor`): Input batch of latent vectors.
return_dict (`bool`, *optional*, defaults to `True`):
Whether or not to return a [`~models.vae.DecoderOutput`] instead of a plain tuple.
Returns:
[`~models.vae.DecoderOutput`] or `tuple`:
If return_dict is True, a [`~models.vae.DecoderOutput`] is returned, otherwise a plain `tuple` is
returned.
"""
overlap_size = int(self.tile_latent_min_size * (1 - self.tile_overlap_factor))
blend_extent = int(self.tile_sample_min_size * self.tile_overlap_factor)
row_limit = self.tile_sample_min_size - blend_extent
# Split z into overlapping tiles and decode them separately.
# The tiles have an overlap to avoid seams between tiles.
rows = []
for i in range(0, z.shape[-2], overlap_size):
row = []
for j in range(0, z.shape[-1], overlap_size):
tile = z[:, :, :, i: i + self.tile_latent_min_size, j: j + self.tile_latent_min_size]
tile = self.post_quant_conv(tile)
decoded = self.decoder(tile)
row.append(decoded)
rows.append(row)
result_rows = []
for i, row in enumerate(rows):
result_row = []
for j, tile in enumerate(row):
# blend the above tile and the left tile
# to the current tile and add the current tile to the result row
if i > 0:
tile = self.blend_v(rows[i - 1][j], tile, blend_extent)
if j > 0:
tile = self.blend_h(row[j - 1], tile, blend_extent)
result_row.append(tile[:, :, :, :row_limit, :row_limit])
result_rows.append(torch.cat(result_row, dim=-1))
dec = torch.cat(result_rows, dim=-2)
if not return_dict:
return (dec,)
return DecoderOutput(sample=dec)
def temporal_tiled_encode(self, x: torch.FloatTensor, return_dict: bool = True) -> AutoencoderKLOutput:
B, C, T, H, W = x.shape
overlap_size = int(self.tile_sample_min_tsize * (1 - self.tile_overlap_factor))
blend_extent = int(self.tile_latent_min_tsize * self.tile_overlap_factor)
t_limit = self.tile_latent_min_tsize - blend_extent
# Split the video into tiles and encode them separately.
row = []
for i in range(0, T, overlap_size):
tile = x[:, :, i: i + self.tile_sample_min_tsize + 1, :, :]
if self.use_spatial_tiling and (tile.shape[-1] > self.tile_sample_min_size or tile.shape[-2] > self.tile_sample_min_size):
tile = self.spatial_tiled_encode(tile, return_moments=True)
else:
tile = self.encoder(tile)
tile = self.quant_conv(tile)
if i > 0:
tile = tile[:, :, 1:, :, :]
row.append(tile)
result_row = []
for i, tile in enumerate(row):
if i > 0:
tile = self.blend_t(row[i - 1], tile, blend_extent)
result_row.append(tile[:, :, :t_limit, :, :])
else:
result_row.append(tile[:, :, :t_limit + 1, :, :])
moments = torch.cat(result_row, dim=2)
posterior = DiagonalGaussianDistribution(moments)
if not return_dict:
return (posterior,)
return AutoencoderKLOutput(latent_dist=posterior)
def temporal_tiled_decode(self, z: torch.FloatTensor, return_dict: bool = True) -> Union[DecoderOutput, torch.FloatTensor]:
# Split z into overlapping tiles and decode them separately.
B, C, T, H, W = z.shape
overlap_size = int(self.tile_latent_min_tsize * (1 - self.tile_overlap_factor))
blend_extent = int(self.tile_sample_min_tsize * self.tile_overlap_factor)
t_limit = self.tile_sample_min_tsize - blend_extent
row = []
for i in range(0, T, overlap_size):
tile = z[:, :, i: i + self.tile_latent_min_tsize + 1, :, :]
if self.use_spatial_tiling and (tile.shape[-1] > self.tile_latent_min_size or tile.shape[-2] > self.tile_latent_min_size):
decoded = self.spatial_tiled_decode(tile, return_dict=True).sample
else:
tile = self.post_quant_conv(tile)
decoded = self.decoder(tile)
if i > 0:
decoded = decoded[:, :, 1:, :, :]
row.append(decoded)
result_row = []
for i, tile in enumerate(row):
if i > 0:
tile = self.blend_t(row[i - 1], tile, blend_extent)
result_row.append(tile[:, :, :t_limit, :, :])
else:
result_row.append(tile[:, :, :t_limit + 1, :, :])
dec = torch.cat(result_row, dim=2)
if not return_dict:
return (dec,)
return DecoderOutput(sample=dec)
def forward(
self,
sample: torch.FloatTensor,
sample_posterior: bool = False,
return_dict: bool = True,
return_posterior: bool = False,
generator: Optional[torch.Generator] = None,
) -> Union[DecoderOutput2, torch.FloatTensor]:
r"""
Args:
sample (`torch.FloatTensor`): Input sample.
sample_posterior (`bool`, *optional*, defaults to `False`):
Whether to sample from the posterior.
return_dict (`bool`, *optional*, defaults to `True`):
Whether or not to return a [`DecoderOutput`] instead of a plain tuple.
"""
x = sample
posterior = self.encode(x).latent_dist
if sample_posterior:
z = posterior.sample(generator=generator)
else:
z = posterior.mode()
dec = self.decode(z).sample
if not return_dict:
if return_posterior:
return (dec, posterior)
else:
return (dec,)
if return_posterior:
return DecoderOutput2(sample=dec, posterior=posterior)
else:
return DecoderOutput2(sample=dec)
# Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.fuse_qkv_projections
def fuse_qkv_projections(self):
"""
Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query,
key, value) are fused. For cross-attention modules, key and value projection matrices are fused.
<Tip warning={true}>
This API is 🧪 experimental.
</Tip>
"""
self.original_attn_processors = None
for _, attn_processor in self.attn_processors.items():
if "Added" in str(attn_processor.__class__.__name__):
raise ValueError("`fuse_qkv_projections()` is not supported for models having added KV projections.")
self.original_attn_processors = self.attn_processors
for module in self.modules():
if isinstance(module, Attention):
module.fuse_projections(fuse=True)
# Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.unfuse_qkv_projections
def unfuse_qkv_projections(self):
"""Disables the fused QKV projection if enabled.
<Tip warning={true}>
This API is 🧪 experimental.
</Tip>
"""
if self.original_attn_processors is not None:
self.set_attn_processor(self.original_attn_processors)
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
#
# Modified from diffusers==0.29.2
#
# ==============================================================================
from typing import Optional, Tuple, Union
import torch
import torch.nn.functional as F
from torch import nn
from einops import rearrange
from diffusers.utils import logging
from diffusers.models.activations import get_activation
from diffusers.models.attention_processor import SpatialNorm
from diffusers.models.attention_processor import Attention
from diffusers.models.normalization import AdaGroupNorm
from diffusers.models.normalization import RMSNorm
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
def prepare_causal_attention_mask(n_frame: int, n_hw: int, dtype, device, batch_size: int = None):
seq_len = n_frame * n_hw
mask = torch.full((seq_len, seq_len), float("-inf"), dtype=dtype, device=device)
for i in range(seq_len):
i_frame = i // n_hw
mask[i, : (i_frame + 1) * n_hw] = 0
if batch_size is not None:
mask = mask.unsqueeze(0).expand(batch_size, -1, -1)
return mask
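# Hedged illustration of prepare_causal_attention_mask: with 3 frames of 2 spatial
# tokens each, a row may attend to every token of its own frame and of earlier frames
# (0.0) and is blocked (-inf) from later frames.
def _demo_prepare_causal_attention_mask():
    mask = prepare_causal_attention_mask(n_frame=3, n_hw=2, dtype=torch.float32, device="cpu")
    assert mask.shape == (6, 6)
    assert mask[0, 1].item() == 0.0            # same frame: visible
    assert mask[0, 2].item() == float("-inf")  # later frame: masked
    assert mask[5, 0].item() == 0.0            # earlier frame: visible
    return mask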
class CausalConv3d(nn.Module):
"""
Implements a causal 3D convolution layer where each position only depends on previous timesteps and current spatial locations.
This maintains temporal causality in video generation tasks.
"""
def __init__(
self,
chan_in,
chan_out,
kernel_size: Union[int, Tuple[int, int, int]],
stride: Union[int, Tuple[int, int, int]] = 1,
dilation: Union[int, Tuple[int, int, int]] = 1,
pad_mode='replicate',
**kwargs
):
super().__init__()
self.pad_mode = pad_mode
padding = (kernel_size // 2, kernel_size // 2, kernel_size // 2, kernel_size // 2, kernel_size - 1, 0) # W, H, T
self.time_causal_padding = padding
self.conv = nn.Conv3d(chan_in, chan_out, kernel_size, stride=stride, dilation=dilation, **kwargs)
def forward(self, x):
x = F.pad(x, self.time_causal_padding, mode=self.pad_mode)
return self.conv(x)
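# Hedged shape check for CausalConv3d: with the (W, H, T) padding above, a kernel-3
# convolution keeps the temporal and spatial sizes unchanged while only looking at the
# current and previous frames. The channel counts below are illustrative.
def _demo_causal_conv3d():
    conv = CausalConv3d(chan_in=4, chan_out=8, kernel_size=3)
    x = torch.randn(1, 4, 5, 16, 16)  # (B, C, T, H, W)
    y = conv(x)
    assert y.shape == (1, 8, 5, 16, 16)
    return y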
class UpsampleCausal3D(nn.Module):
"""
A 3D upsampling layer with an optional convolution.
"""
def __init__(
self,
channels: int,
use_conv: bool = False,
use_conv_transpose: bool = False,
out_channels: Optional[int] = None,
name: str = "conv",
kernel_size: Optional[int] = None,
padding=1,
norm_type=None,
eps=None,
elementwise_affine=None,
bias=True,
interpolate=True,
upsample_factor=(2, 2, 2),
):
super().__init__()
self.channels = channels
self.out_channels = out_channels or channels
self.use_conv = use_conv
self.use_conv_transpose = use_conv_transpose
self.name = name
self.interpolate = interpolate
self.upsample_factor = upsample_factor
if norm_type == "ln_norm":
self.norm = nn.LayerNorm(channels, eps, elementwise_affine)
elif norm_type == "rms_norm":
self.norm = RMSNorm(channels, eps, elementwise_affine)
elif norm_type is None:
self.norm = None
else:
raise ValueError(f"unknown norm_type: {norm_type}")
conv = None
if use_conv_transpose:
raise NotImplementedError
elif use_conv:
if kernel_size is None:
kernel_size = 3
conv = CausalConv3d(self.channels, self.out_channels, kernel_size=kernel_size, bias=bias)
if name == "conv":
self.conv = conv
else:
self.Conv2d_0 = conv
def forward(
self,
hidden_states: torch.FloatTensor,
output_size: Optional[int] = None,
scale: float = 1.0,
) -> torch.FloatTensor:
assert hidden_states.shape[1] == self.channels
if self.norm is not None:
raise NotImplementedError
if self.use_conv_transpose:
return self.conv(hidden_states)
# Cast to float32 as the 'upsample_nearest2d_out_frame' op does not support bfloat16
dtype = hidden_states.dtype
if dtype == torch.bfloat16:
hidden_states = hidden_states.to(torch.float32)
# upsample_nearest_nhwc fails with large batch sizes. see https://github.com/huggingface/diffusers/issues/984
if hidden_states.shape[0] >= 64:
hidden_states = hidden_states.contiguous()
# if `output_size` is passed we force the interpolation output
# size and do not make use of `scale_factor=2`
if self.interpolate:
B, C, T, H, W = hidden_states.shape
first_h, other_h = hidden_states.split((1, T - 1), dim=2)
if output_size is None:
if T > 1:
other_h = F.interpolate(other_h, scale_factor=self.upsample_factor, mode="nearest")
first_h = first_h.squeeze(2)
first_h = F.interpolate(first_h, scale_factor=self.upsample_factor[1:], mode="nearest")
first_h = first_h.unsqueeze(2)
else:
raise NotImplementedError
if T > 1:
hidden_states = torch.cat((first_h, other_h), dim=2)
else:
hidden_states = first_h
# If the input is bfloat16, we cast back to bfloat16
if dtype == torch.bfloat16:
hidden_states = hidden_states.to(dtype)
if self.use_conv:
if self.name == "conv":
hidden_states = self.conv(hidden_states)
else:
hidden_states = self.Conv2d_0(hidden_states)
return hidden_states
class DownsampleCausal3D(nn.Module):
"""
A 3D downsampling layer with an optional convolution.
"""
def __init__(
self,
channels: int,
use_conv: bool = False,
out_channels: Optional[int] = None,
padding: int = 1,
name: str = "conv",
kernel_size=3,
norm_type=None,
eps=None,
elementwise_affine=None,
bias=True,
stride=2,
):
super().__init__()
self.channels = channels
self.out_channels = out_channels or channels
self.use_conv = use_conv
self.padding = padding
stride = stride
self.name = name
if norm_type == "ln_norm":
self.norm = nn.LayerNorm(channels, eps, elementwise_affine)
elif norm_type == "rms_norm":
self.norm = RMSNorm(channels, eps, elementwise_affine)
elif norm_type is None:
self.norm = None
else:
raise ValueError(f"unknown norm_type: {norm_type}")
if use_conv:
conv = CausalConv3d(
self.channels, self.out_channels, kernel_size=kernel_size, stride=stride, bias=bias
)
else:
raise NotImplementedError
if name == "conv":
self.Conv2d_0 = conv
self.conv = conv
elif name == "Conv2d_0":
self.conv = conv
else:
self.conv = conv
def forward(self, hidden_states: torch.FloatTensor, scale: float = 1.0) -> torch.FloatTensor:
assert hidden_states.shape[1] == self.channels
if self.norm is not None:
hidden_states = self.norm(hidden_states.permute(0, 2, 3, 1)).permute(0, 3, 1, 2)
assert hidden_states.shape[1] == self.channels
hidden_states = self.conv(hidden_states)
return hidden_states
class ResnetBlockCausal3D(nn.Module):
r"""
A Resnet block.
"""
def __init__(
self,
*,
in_channels: int,
out_channels: Optional[int] = None,
conv_shortcut: bool = False,
dropout: float = 0.0,
temb_channels: int = 512,
groups: int = 32,
groups_out: Optional[int] = None,
pre_norm: bool = True,
eps: float = 1e-6,
non_linearity: str = "swish",
skip_time_act: bool = False,
# default, scale_shift, ada_group, spatial
time_embedding_norm: str = "default",
kernel: Optional[torch.FloatTensor] = None,
output_scale_factor: float = 1.0,
use_in_shortcut: Optional[bool] = None,
up: bool = False,
down: bool = False,
conv_shortcut_bias: bool = True,
conv_3d_out_channels: Optional[int] = None,
):
super().__init__()
self.pre_norm = pre_norm
self.pre_norm = True
self.in_channels = in_channels
out_channels = in_channels if out_channels is None else out_channels
self.out_channels = out_channels
self.use_conv_shortcut = conv_shortcut
self.up = up
self.down = down
self.output_scale_factor = output_scale_factor
self.time_embedding_norm = time_embedding_norm
self.skip_time_act = skip_time_act
linear_cls = nn.Linear
if groups_out is None:
groups_out = groups
if self.time_embedding_norm == "ada_group":
self.norm1 = AdaGroupNorm(temb_channels, in_channels, groups, eps=eps)
elif self.time_embedding_norm == "spatial":
self.norm1 = SpatialNorm(in_channels, temb_channels)
else:
self.norm1 = torch.nn.GroupNorm(num_groups=groups, num_channels=in_channels, eps=eps, affine=True)
self.conv1 = CausalConv3d(in_channels, out_channels, kernel_size=3, stride=1)
if temb_channels is not None:
if self.time_embedding_norm == "default":
self.time_emb_proj = linear_cls(temb_channels, out_channels)
elif self.time_embedding_norm == "scale_shift":
self.time_emb_proj = linear_cls(temb_channels, 2 * out_channels)
elif self.time_embedding_norm == "ada_group" or self.time_embedding_norm == "spatial":
self.time_emb_proj = None
else:
raise ValueError(f"Unknown time_embedding_norm : {self.time_embedding_norm} ")
else:
self.time_emb_proj = None
if self.time_embedding_norm == "ada_group":
self.norm2 = AdaGroupNorm(temb_channels, out_channels, groups_out, eps=eps)
elif self.time_embedding_norm == "spatial":
self.norm2 = SpatialNorm(out_channels, temb_channels)
else:
self.norm2 = torch.nn.GroupNorm(num_groups=groups_out, num_channels=out_channels, eps=eps, affine=True)
self.dropout = torch.nn.Dropout(dropout)
conv_3d_out_channels = conv_3d_out_channels or out_channels
self.conv2 = CausalConv3d(out_channels, conv_3d_out_channels, kernel_size=3, stride=1)
self.nonlinearity = get_activation(non_linearity)
self.upsample = self.downsample = None
if self.up:
self.upsample = UpsampleCausal3D(in_channels, use_conv=False)
elif self.down:
self.downsample = DownsampleCausal3D(in_channels, use_conv=False, name="op")
self.use_in_shortcut = self.in_channels != conv_3d_out_channels if use_in_shortcut is None else use_in_shortcut
self.conv_shortcut = None
if self.use_in_shortcut:
self.conv_shortcut = CausalConv3d(
in_channels,
conv_3d_out_channels,
kernel_size=1,
stride=1,
bias=conv_shortcut_bias,
)
def forward(
self,
input_tensor: torch.FloatTensor,
temb: torch.FloatTensor,
scale: float = 1.0,
) -> torch.FloatTensor:
hidden_states = input_tensor
if self.time_embedding_norm == "ada_group" or self.time_embedding_norm == "spatial":
hidden_states = self.norm1(hidden_states, temb)
else:
hidden_states = self.norm1(hidden_states)
hidden_states = self.nonlinearity(hidden_states)
if self.upsample is not None:
# upsample_nearest_nhwc fails with large batch sizes. see https://github.com/huggingface/diffusers/issues/984
if hidden_states.shape[0] >= 64:
input_tensor = input_tensor.contiguous()
hidden_states = hidden_states.contiguous()
            input_tensor = self.upsample(input_tensor, scale=scale)
            hidden_states = self.upsample(hidden_states, scale=scale)
        elif self.downsample is not None:
            input_tensor = self.downsample(input_tensor, scale=scale)
            hidden_states = self.downsample(hidden_states, scale=scale)
hidden_states = self.conv1(hidden_states)
        if self.time_emb_proj is not None:
            if not self.skip_time_act:
                temb = self.nonlinearity(temb)
            # nn.Linear takes a single argument; add singleton T/H/W dims for broadcasting
            temb = self.time_emb_proj(temb)[:, :, None, None, None]
if temb is not None and self.time_embedding_norm == "default":
hidden_states = hidden_states + temb
if self.time_embedding_norm == "ada_group" or self.time_embedding_norm == "spatial":
hidden_states = self.norm2(hidden_states, temb)
else:
hidden_states = self.norm2(hidden_states)
if temb is not None and self.time_embedding_norm == "scale_shift":
scale, shift = torch.chunk(temb, 2, dim=1)
hidden_states = hidden_states * (1 + scale) + shift
hidden_states = self.nonlinearity(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.conv2(hidden_states)
if self.conv_shortcut is not None:
            input_tensor = self.conv_shortcut(input_tensor)
output_tensor = (input_tensor + hidden_states) / self.output_scale_factor
return output_tensor
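# Minimal usage sketch for ResnetBlockCausal3D as it is used in this VAE: no time
# embedding (temb_channels=None), with a 1x1x1 conv shortcut added automatically
# when in_channels != out_channels.
if __name__ == "__main__":
    _block = ResnetBlockCausal3D(in_channels=64, out_channels=128, temb_channels=None, groups=32, eps=1e-6)
    _x = torch.randn(1, 64, 5, 16, 16)
    print(_block(_x, temb=None).shape)  # channels become 128; T/H/W are preserved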
def get_down_block3d(
down_block_type: str,
num_layers: int,
in_channels: int,
out_channels: int,
temb_channels: int,
add_downsample: bool,
downsample_stride: int,
resnet_eps: float,
resnet_act_fn: str,
transformer_layers_per_block: int = 1,
num_attention_heads: Optional[int] = None,
resnet_groups: Optional[int] = None,
cross_attention_dim: Optional[int] = None,
downsample_padding: Optional[int] = None,
dual_cross_attention: bool = False,
use_linear_projection: bool = False,
only_cross_attention: bool = False,
upcast_attention: bool = False,
resnet_time_scale_shift: str = "default",
attention_type: str = "default",
resnet_skip_time_act: bool = False,
resnet_out_scale_factor: float = 1.0,
cross_attention_norm: Optional[str] = None,
attention_head_dim: Optional[int] = None,
downsample_type: Optional[str] = None,
dropout: float = 0.0,
):
# If attn head dim is not defined, we default it to the number of heads
if attention_head_dim is None:
        logger.warning(
            f"It is recommended to provide `attention_head_dim` when calling `get_down_block`. Defaulting `attention_head_dim` to {num_attention_heads}."
        )
attention_head_dim = num_attention_heads
down_block_type = down_block_type[7:] if down_block_type.startswith("UNetRes") else down_block_type
if down_block_type == "DownEncoderBlockCausal3D":
return DownEncoderBlockCausal3D(
num_layers=num_layers,
in_channels=in_channels,
out_channels=out_channels,
dropout=dropout,
add_downsample=add_downsample,
downsample_stride=downsample_stride,
resnet_eps=resnet_eps,
resnet_act_fn=resnet_act_fn,
resnet_groups=resnet_groups,
downsample_padding=downsample_padding,
resnet_time_scale_shift=resnet_time_scale_shift,
)
raise ValueError(f"{down_block_type} does not exist.")
def get_up_block3d(
up_block_type: str,
num_layers: int,
in_channels: int,
out_channels: int,
prev_output_channel: int,
temb_channels: int,
add_upsample: bool,
upsample_scale_factor: Tuple,
resnet_eps: float,
resnet_act_fn: str,
resolution_idx: Optional[int] = None,
transformer_layers_per_block: int = 1,
num_attention_heads: Optional[int] = None,
resnet_groups: Optional[int] = None,
cross_attention_dim: Optional[int] = None,
dual_cross_attention: bool = False,
use_linear_projection: bool = False,
only_cross_attention: bool = False,
upcast_attention: bool = False,
resnet_time_scale_shift: str = "default",
attention_type: str = "default",
resnet_skip_time_act: bool = False,
resnet_out_scale_factor: float = 1.0,
cross_attention_norm: Optional[str] = None,
attention_head_dim: Optional[int] = None,
upsample_type: Optional[str] = None,
dropout: float = 0.0,
) -> nn.Module:
# If attn head dim is not defined, we default it to the number of heads
if attention_head_dim is None:
        logger.warning(
            f"It is recommended to provide `attention_head_dim` when calling `get_up_block`. Defaulting `attention_head_dim` to {num_attention_heads}."
        )
attention_head_dim = num_attention_heads
up_block_type = up_block_type[7:] if up_block_type.startswith("UNetRes") else up_block_type
if up_block_type == "UpDecoderBlockCausal3D":
return UpDecoderBlockCausal3D(
num_layers=num_layers,
in_channels=in_channels,
out_channels=out_channels,
resolution_idx=resolution_idx,
dropout=dropout,
add_upsample=add_upsample,
upsample_scale_factor=upsample_scale_factor,
resnet_eps=resnet_eps,
resnet_act_fn=resnet_act_fn,
resnet_groups=resnet_groups,
resnet_time_scale_shift=resnet_time_scale_shift,
temb_channels=temb_channels,
)
raise ValueError(f"{up_block_type} does not exist.")
class UNetMidBlockCausal3D(nn.Module):
"""
A 3D UNet mid-block [`UNetMidBlockCausal3D`] with multiple residual blocks and optional attention blocks.
"""
def __init__(
self,
in_channels: int,
temb_channels: int,
dropout: float = 0.0,
num_layers: int = 1,
resnet_eps: float = 1e-6,
resnet_time_scale_shift: str = "default", # default, spatial
resnet_act_fn: str = "swish",
resnet_groups: int = 32,
attn_groups: Optional[int] = None,
resnet_pre_norm: bool = True,
add_attention: bool = True,
attention_head_dim: int = 1,
output_scale_factor: float = 1.0,
):
super().__init__()
resnet_groups = resnet_groups if resnet_groups is not None else min(in_channels // 4, 32)
self.add_attention = add_attention
if attn_groups is None:
attn_groups = resnet_groups if resnet_time_scale_shift == "default" else None
# there is always at least one resnet
resnets = [
ResnetBlockCausal3D(
in_channels=in_channels,
out_channels=in_channels,
temb_channels=temb_channels,
eps=resnet_eps,
groups=resnet_groups,
dropout=dropout,
time_embedding_norm=resnet_time_scale_shift,
non_linearity=resnet_act_fn,
output_scale_factor=output_scale_factor,
pre_norm=resnet_pre_norm,
)
]
attentions = []
if attention_head_dim is None:
            logger.warning(
                f"It is not recommended to pass `attention_head_dim=None`. Defaulting `attention_head_dim` to `in_channels`: {in_channels}."
            )
attention_head_dim = in_channels
for _ in range(num_layers):
if self.add_attention:
attentions.append(
Attention(
in_channels,
heads=in_channels // attention_head_dim,
dim_head=attention_head_dim,
rescale_output_factor=output_scale_factor,
eps=resnet_eps,
norm_num_groups=attn_groups,
spatial_norm_dim=temb_channels if resnet_time_scale_shift == "spatial" else None,
residual_connection=True,
bias=True,
upcast_softmax=True,
_from_deprecated_attn_block=True,
)
)
else:
attentions.append(None)
resnets.append(
ResnetBlockCausal3D(
in_channels=in_channels,
out_channels=in_channels,
temb_channels=temb_channels,
eps=resnet_eps,
groups=resnet_groups,
dropout=dropout,
time_embedding_norm=resnet_time_scale_shift,
non_linearity=resnet_act_fn,
output_scale_factor=output_scale_factor,
pre_norm=resnet_pre_norm,
)
)
self.attentions = nn.ModuleList(attentions)
self.resnets = nn.ModuleList(resnets)
def forward(self, hidden_states: torch.FloatTensor, temb: Optional[torch.FloatTensor] = None) -> torch.FloatTensor:
hidden_states = self.resnets[0](hidden_states, temb)
for attn, resnet in zip(self.attentions, self.resnets[1:]):
if attn is not None:
B, C, T, H, W = hidden_states.shape
hidden_states = rearrange(hidden_states, "b c f h w -> b (f h w) c")
attention_mask = prepare_causal_attention_mask(
T, H * W, hidden_states.dtype, hidden_states.device, batch_size=B
)
hidden_states = attn(hidden_states, temb=temb, attention_mask=attention_mask)
hidden_states = rearrange(hidden_states, "b (f h w) c -> b c f h w", f=T, h=H, w=W)
hidden_states = resnet(hidden_states, temb)
return hidden_states
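# The mid-block flattens (T, H, W) into a single token sequence and relies on
# prepare_causal_attention_mask (defined earlier in this file) to make attention
# block-causal over frames: tokens may attend to their own and earlier frames only.
# The sketch below illustrates that mask pattern in plain torch; it is a conceptual
# stand-in and does not claim to match the helper's exact signature or output.
if __name__ == "__main__":
    def _block_causal_mask(n_frames: int, tokens_per_frame: int) -> torch.Tensor:
        frame_idx = torch.arange(n_frames).repeat_interleave(tokens_per_frame)
        allowed = frame_idx[:, None] >= frame_idx[None, :]  # query frame >= key frame
        mask = torch.zeros(allowed.shape)
        mask[~allowed] = float("-inf")  # additive attention mask: future frames blocked
        return mask
    print(_block_causal_mask(3, 4).shape)  # (12, 12); upper-right frame blocks are -inf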
class DownEncoderBlockCausal3D(nn.Module):
def __init__(
self,
in_channels: int,
out_channels: int,
dropout: float = 0.0,
num_layers: int = 1,
resnet_eps: float = 1e-6,
resnet_time_scale_shift: str = "default",
resnet_act_fn: str = "swish",
resnet_groups: int = 32,
resnet_pre_norm: bool = True,
output_scale_factor: float = 1.0,
add_downsample: bool = True,
downsample_stride: int = 2,
downsample_padding: int = 1,
):
super().__init__()
resnets = []
for i in range(num_layers):
in_channels = in_channels if i == 0 else out_channels
resnets.append(
ResnetBlockCausal3D(
in_channels=in_channels,
out_channels=out_channels,
temb_channels=None,
eps=resnet_eps,
groups=resnet_groups,
dropout=dropout,
time_embedding_norm=resnet_time_scale_shift,
non_linearity=resnet_act_fn,
output_scale_factor=output_scale_factor,
pre_norm=resnet_pre_norm,
)
)
self.resnets = nn.ModuleList(resnets)
if add_downsample:
self.downsamplers = nn.ModuleList(
[
DownsampleCausal3D(
out_channels,
use_conv=True,
out_channels=out_channels,
padding=downsample_padding,
name="op",
stride=downsample_stride,
)
]
)
else:
self.downsamplers = None
def forward(self, hidden_states: torch.FloatTensor, scale: float = 1.0) -> torch.FloatTensor:
for resnet in self.resnets:
hidden_states = resnet(hidden_states, temb=None, scale=scale)
if self.downsamplers is not None:
for downsampler in self.downsamplers:
hidden_states = downsampler(hidden_states, scale)
return hidden_states
class UpDecoderBlockCausal3D(nn.Module):
def __init__(
self,
in_channels: int,
out_channels: int,
resolution_idx: Optional[int] = None,
dropout: float = 0.0,
num_layers: int = 1,
resnet_eps: float = 1e-6,
resnet_time_scale_shift: str = "default", # default, spatial
resnet_act_fn: str = "swish",
resnet_groups: int = 32,
resnet_pre_norm: bool = True,
output_scale_factor: float = 1.0,
add_upsample: bool = True,
upsample_scale_factor=(2, 2, 2),
temb_channels: Optional[int] = None,
):
super().__init__()
resnets = []
for i in range(num_layers):
input_channels = in_channels if i == 0 else out_channels
resnets.append(
ResnetBlockCausal3D(
in_channels=input_channels,
out_channels=out_channels,
temb_channels=temb_channels,
eps=resnet_eps,
groups=resnet_groups,
dropout=dropout,
time_embedding_norm=resnet_time_scale_shift,
non_linearity=resnet_act_fn,
output_scale_factor=output_scale_factor,
pre_norm=resnet_pre_norm,
)
)
self.resnets = nn.ModuleList(resnets)
if add_upsample:
self.upsamplers = nn.ModuleList(
[
UpsampleCausal3D(
out_channels,
use_conv=True,
out_channels=out_channels,
upsample_factor=upsample_scale_factor,
)
]
)
else:
self.upsamplers = None
self.resolution_idx = resolution_idx
def forward(
self, hidden_states: torch.FloatTensor, temb: Optional[torch.FloatTensor] = None, scale: float = 1.0
) -> torch.FloatTensor:
for resnet in self.resnets:
hidden_states = resnet(hidden_states, temb=temb, scale=scale)
if self.upsamplers is not None:
for upsampler in self.upsamplers:
hidden_states = upsampler(hidden_states)
return hidden_states
from dataclasses import dataclass
from typing import Optional, Tuple
import numpy as np
import torch
import torch.nn as nn
from diffusers.utils import BaseOutput, is_torch_version
from diffusers.utils.torch_utils import randn_tensor
from diffusers.models.attention_processor import SpatialNorm
from .unet_causal_3d_blocks import (
CausalConv3d,
UNetMidBlockCausal3D,
get_down_block3d,
get_up_block3d,
)
@dataclass
class DecoderOutput(BaseOutput):
r"""
Output of decoding method.
Args:
sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
The decoded output sample from the last layer of the model.
"""
sample: torch.FloatTensor
class EncoderCausal3D(nn.Module):
r"""
The `EncoderCausal3D` layer of a variational autoencoder that encodes its input into a latent representation.
"""
def __init__(
self,
in_channels: int = 3,
out_channels: int = 3,
down_block_types: Tuple[str, ...] = ("DownEncoderBlockCausal3D",),
block_out_channels: Tuple[int, ...] = (64,),
layers_per_block: int = 2,
norm_num_groups: int = 32,
act_fn: str = "silu",
double_z: bool = True,
mid_block_add_attention=True,
time_compression_ratio: int = 4,
spatial_compression_ratio: int = 8,
):
super().__init__()
self.layers_per_block = layers_per_block
self.conv_in = CausalConv3d(in_channels, block_out_channels[0], kernel_size=3, stride=1)
self.mid_block = None
self.down_blocks = nn.ModuleList([])
# down
output_channel = block_out_channels[0]
for i, down_block_type in enumerate(down_block_types):
input_channel = output_channel
output_channel = block_out_channels[i]
is_final_block = i == len(block_out_channels) - 1
num_spatial_downsample_layers = int(np.log2(spatial_compression_ratio))
num_time_downsample_layers = int(np.log2(time_compression_ratio))
if time_compression_ratio == 4:
add_spatial_downsample = bool(i < num_spatial_downsample_layers)
add_time_downsample = bool(
i >= (len(block_out_channels) - 1 - num_time_downsample_layers)
and not is_final_block
)
else:
raise ValueError(f"Unsupported time_compression_ratio: {time_compression_ratio}.")
downsample_stride_HW = (2, 2) if add_spatial_downsample else (1, 1)
downsample_stride_T = (2,) if add_time_downsample else (1,)
downsample_stride = tuple(downsample_stride_T + downsample_stride_HW)
down_block = get_down_block3d(
down_block_type,
num_layers=self.layers_per_block,
in_channels=input_channel,
out_channels=output_channel,
add_downsample=bool(add_spatial_downsample or add_time_downsample),
downsample_stride=downsample_stride,
resnet_eps=1e-6,
downsample_padding=0,
resnet_act_fn=act_fn,
resnet_groups=norm_num_groups,
attention_head_dim=output_channel,
temb_channels=None,
)
self.down_blocks.append(down_block)
# mid
self.mid_block = UNetMidBlockCausal3D(
in_channels=block_out_channels[-1],
resnet_eps=1e-6,
resnet_act_fn=act_fn,
output_scale_factor=1,
resnet_time_scale_shift="default",
attention_head_dim=block_out_channels[-1],
resnet_groups=norm_num_groups,
temb_channels=None,
add_attention=mid_block_add_attention,
)
# out
self.conv_norm_out = nn.GroupNorm(num_channels=block_out_channels[-1], num_groups=norm_num_groups, eps=1e-6)
self.conv_act = nn.SiLU()
conv_out_channels = 2 * out_channels if double_z else out_channels
self.conv_out = CausalConv3d(block_out_channels[-1], conv_out_channels, kernel_size=3)
def forward(self, sample: torch.FloatTensor) -> torch.FloatTensor:
r"""The forward method of the `EncoderCausal3D` class."""
assert len(sample.shape) == 5, "The input tensor should have 5 dimensions"
sample = self.conv_in(sample)
# down
for down_block in self.down_blocks:
sample = down_block(sample)
# middle
sample = self.mid_block(sample)
# post-process
sample = self.conv_norm_out(sample)
sample = self.conv_act(sample)
sample = self.conv_out(sample)
return sample
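# Minimal shape sketch for EncoderCausal3D with toy widths: 8x spatial / 4x temporal
# compression maps (B, 3, 1 + 4k, H, W) to (B, 2 * out_channels, 1 + k, H/8, W/8)
# when double_z=True (mean and logvar stacked along channels). The printed shape
# assumes the default CausalConv3d padding and is illustrative.
if __name__ == "__main__":
    _enc = EncoderCausal3D(
        in_channels=3,
        out_channels=4,
        down_block_types=("DownEncoderBlockCausal3D",) * 4,
        block_out_channels=(32, 64, 128, 128),
        layers_per_block=2,
        double_z=True,
        time_compression_ratio=4,
        spatial_compression_ratio=8,
    )
    _video = torch.randn(1, 3, 9, 64, 64)  # T = 1 + 4 * 2
    print(_enc(_video).shape)  # expected: (1, 8, 3, 8, 8)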
class DecoderCausal3D(nn.Module):
r"""
The `DecoderCausal3D` layer of a variational autoencoder that decodes its latent representation into an output sample.
"""
def __init__(
self,
in_channels: int = 3,
out_channels: int = 3,
up_block_types: Tuple[str, ...] = ("UpDecoderBlockCausal3D",),
block_out_channels: Tuple[int, ...] = (64,),
layers_per_block: int = 2,
norm_num_groups: int = 32,
act_fn: str = "silu",
norm_type: str = "group", # group, spatial
mid_block_add_attention=True,
time_compression_ratio: int = 4,
spatial_compression_ratio: int = 8,
):
super().__init__()
self.layers_per_block = layers_per_block
self.conv_in = CausalConv3d(in_channels, block_out_channels[-1], kernel_size=3, stride=1)
self.mid_block = None
self.up_blocks = nn.ModuleList([])
temb_channels = in_channels if norm_type == "spatial" else None
# mid
self.mid_block = UNetMidBlockCausal3D(
in_channels=block_out_channels[-1],
resnet_eps=1e-6,
resnet_act_fn=act_fn,
output_scale_factor=1,
resnet_time_scale_shift="default" if norm_type == "group" else norm_type,
attention_head_dim=block_out_channels[-1],
resnet_groups=norm_num_groups,
temb_channels=temb_channels,
add_attention=mid_block_add_attention,
)
# up
reversed_block_out_channels = list(reversed(block_out_channels))
output_channel = reversed_block_out_channels[0]
for i, up_block_type in enumerate(up_block_types):
prev_output_channel = output_channel
output_channel = reversed_block_out_channels[i]
is_final_block = i == len(block_out_channels) - 1
num_spatial_upsample_layers = int(np.log2(spatial_compression_ratio))
num_time_upsample_layers = int(np.log2(time_compression_ratio))
if time_compression_ratio == 4:
add_spatial_upsample = bool(i < num_spatial_upsample_layers)
add_time_upsample = bool(
i >= len(block_out_channels) - 1 - num_time_upsample_layers
and not is_final_block
)
else:
raise ValueError(f"Unsupported time_compression_ratio: {time_compression_ratio}.")
upsample_scale_factor_HW = (2, 2) if add_spatial_upsample else (1, 1)
upsample_scale_factor_T = (2,) if add_time_upsample else (1,)
upsample_scale_factor = tuple(upsample_scale_factor_T + upsample_scale_factor_HW)
up_block = get_up_block3d(
up_block_type,
num_layers=self.layers_per_block + 1,
in_channels=prev_output_channel,
out_channels=output_channel,
prev_output_channel=None,
add_upsample=bool(add_spatial_upsample or add_time_upsample),
upsample_scale_factor=upsample_scale_factor,
resnet_eps=1e-6,
resnet_act_fn=act_fn,
resnet_groups=norm_num_groups,
attention_head_dim=output_channel,
temb_channels=temb_channels,
resnet_time_scale_shift=norm_type,
)
self.up_blocks.append(up_block)
prev_output_channel = output_channel
# out
if norm_type == "spatial":
self.conv_norm_out = SpatialNorm(block_out_channels[0], temb_channels)
else:
self.conv_norm_out = nn.GroupNorm(num_channels=block_out_channels[0], num_groups=norm_num_groups, eps=1e-6)
self.conv_act = nn.SiLU()
self.conv_out = CausalConv3d(block_out_channels[0], out_channels, kernel_size=3)
self.gradient_checkpointing = False
def forward(
self,
sample: torch.FloatTensor,
latent_embeds: Optional[torch.FloatTensor] = None,
) -> torch.FloatTensor:
r"""The forward method of the `DecoderCausal3D` class."""
assert len(sample.shape) == 5, "The input tensor should have 5 dimensions."
sample = self.conv_in(sample)
upscale_dtype = next(iter(self.up_blocks.parameters())).dtype
if self.training and self.gradient_checkpointing:
def create_custom_forward(module):
def custom_forward(*inputs):
return module(*inputs)
return custom_forward
if is_torch_version(">=", "1.11.0"):
# middle
sample = torch.utils.checkpoint.checkpoint(
create_custom_forward(self.mid_block),
sample,
latent_embeds,
use_reentrant=False,
)
sample = sample.to(upscale_dtype)
# up
for up_block in self.up_blocks:
sample = torch.utils.checkpoint.checkpoint(
create_custom_forward(up_block),
sample,
latent_embeds,
use_reentrant=False,
)
else:
# middle
sample = torch.utils.checkpoint.checkpoint(
create_custom_forward(self.mid_block), sample, latent_embeds
)
sample = sample.to(upscale_dtype)
# up
for up_block in self.up_blocks:
sample = torch.utils.checkpoint.checkpoint(create_custom_forward(up_block), sample, latent_embeds)
else:
# middle
sample = self.mid_block(sample, latent_embeds)
sample = sample.to(upscale_dtype)
# up
for up_block in self.up_blocks:
sample = up_block(sample, latent_embeds)
# post-process
if latent_embeds is None:
sample = self.conv_norm_out(sample)
else:
sample = self.conv_norm_out(sample, latent_embeds)
sample = self.conv_act(sample)
sample = self.conv_out(sample)
return sample
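# Mirror-image sketch for DecoderCausal3D with the same toy widths: a latent of
# shape (B, 4, 1 + k, h, w) decodes back to (B, 3, 1 + 4k, 8h, 8w). Shapes are
# illustrative and assume the default CausalConv3d padding.
if __name__ == "__main__":
    _dec = DecoderCausal3D(
        in_channels=4,
        out_channels=3,
        up_block_types=("UpDecoderBlockCausal3D",) * 4,
        block_out_channels=(32, 64, 128, 128),
        layers_per_block=2,
        time_compression_ratio=4,
        spatial_compression_ratio=8,
    )
    _latent = torch.randn(1, 4, 3, 8, 8)
    print(_dec(_latent).shape)  # expected: (1, 3, 9, 64, 64)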
class DiagonalGaussianDistribution(object):
def __init__(self, parameters: torch.Tensor, deterministic: bool = False):
if parameters.ndim == 3:
dim = 2 # (B, L, C)
elif parameters.ndim == 5 or parameters.ndim == 4:
dim = 1 # (B, C, T, H ,W) / (B, C, H, W)
else:
raise NotImplementedError
self.parameters = parameters
self.mean, self.logvar = torch.chunk(parameters, 2, dim=dim)
self.logvar = torch.clamp(self.logvar, -30.0, 20.0)
self.deterministic = deterministic
self.std = torch.exp(0.5 * self.logvar)
self.var = torch.exp(self.logvar)
if self.deterministic:
self.var = self.std = torch.zeros_like(
self.mean, device=self.parameters.device, dtype=self.parameters.dtype
)
def sample(self, generator: Optional[torch.Generator] = None) -> torch.FloatTensor:
# make sure sample is on the same device as the parameters and has same dtype
sample = randn_tensor(
self.mean.shape,
generator=generator,
device=self.parameters.device,
dtype=self.parameters.dtype,
)
x = self.mean + self.std * sample
return x
def kl(self, other: "DiagonalGaussianDistribution" = None) -> torch.Tensor:
if self.deterministic:
return torch.Tensor([0.0])
else:
reduce_dim = list(range(1, self.mean.ndim))
if other is None:
return 0.5 * torch.sum(
torch.pow(self.mean, 2) + self.var - 1.0 - self.logvar,
dim=reduce_dim,
)
else:
return 0.5 * torch.sum(
torch.pow(self.mean - other.mean, 2) / other.var
+ self.var / other.var
- 1.0
- self.logvar
+ other.logvar,
dim=reduce_dim,
)
def nll(self, sample: torch.Tensor, dims: Tuple[int, ...] = [1, 2, 3]) -> torch.Tensor:
if self.deterministic:
return torch.Tensor([0.0])
logtwopi = np.log(2.0 * np.pi)
return 0.5 * torch.sum(
logtwopi + self.logvar +
torch.pow(sample - self.mean, 2) / self.var,
dim=dims,
)
def mode(self) -> torch.Tensor:
return self.mean
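# Minimal sketch for DiagonalGaussianDistribution: the encoder's 2*C output channels
# are split into (mean, logvar) along dim=1 for 4D/5D inputs, and sample() draws
# z = mean + std * eps via reparameterisation.
if __name__ == "__main__":
    _params = torch.randn(1, 8, 3, 8, 8)  # e.g. the (mean, logvar) tensor from the encoder
    _posterior = DiagonalGaussianDistribution(_params)
    _z = _posterior.sample(generator=torch.Generator().manual_seed(0))
    print(_z.shape)  # (1, 4, 3, 8, 8)
    print(_posterior.kl().shape)  # per-sample KL against N(0, I): shape (1,)
    print(_posterior.mode().shape)  # deterministic latent (the mean): (1, 4, 3, 8, 8)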
icon.png (70.5 KB)
# Model code
modelCode=1462
# Model name
modelName=HunyuanVideo-I2V_pytorch
# Model description
modelDescription=Tencent Hunyuan series ultra-high-quality video generation model
# Application scenarios
appScenario=Inference,Video generation,E-commerce,Education,Broadcast media
# Framework type
frameType=pytorch
import os
import torch
import torch.distributed as dist
from packaging import version
from dataclasses import dataclass, fields
from xfuser.logger import init_logger
import xfuser.envs as envs
# from xfuser.envs import CUDA_VERSION, TORCH_VERSION, PACKAGES_CHECKER
from xfuser.envs import TORCH_VERSION, PACKAGES_CHECKER
logger = init_logger(__name__)
from typing import Union, Optional, List
env_info = PACKAGES_CHECKER.get_packages_info()
HAS_LONG_CTX_ATTN = env_info["has_long_ctx_attn"]
HAS_FLASH_ATTN = env_info["has_flash_attn"]
def check_packages():
import diffusers
    if not version.parse(diffusers.__version__) > version.parse("0.30.2"):
        raise RuntimeError(
            "This project requires diffusers > 0.30.2. A suitable version currently "
            "cannot be installed via pip; please install diffusers from source."
        )
def check_env():
# https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/usage/cudagraph.html
#if CUDA_VERSION < version.parse("11.3"):
# raise RuntimeError("NCCL CUDA Graph support requires CUDA 11.3 or above")
if TORCH_VERSION < version.parse("2.2.0"):
# https://pytorch.org/blog/accelerating-pytorch-with-cuda-graphs/
raise RuntimeError(
"CUDAGraph with NCCL support requires PyTorch 2.2.0 or above. "
"If it is not released yet, please install nightly built PyTorch "
"with `pip3 install --pre torch torchvision torchaudio --index-url "
"https://download.pytorch.org/whl/nightly/cu121`"
)
@dataclass
class ModelConfig:
model: str
download_dir: Optional[str] = None
trust_remote_code: bool = False
@dataclass
class RuntimeConfig:
warmup_steps: int = 1
dtype: torch.dtype = torch.float16
use_cuda_graph: bool = False
use_parallel_vae: bool = False
use_profiler: bool = False
use_torch_compile: bool = False
use_onediff: bool = False
use_fp8_t5_encoder: bool = False
def __post_init__(self):
check_packages()
if self.use_cuda_graph:
check_env()
@dataclass
class FastAttnConfig:
use_fast_attn: bool = False
n_step: int = 20
n_calib: int = 8
threshold: float = 0.5
window_size: int = 64
coco_path: Optional[str] = None
use_cache: bool = False
def __post_init__(self):
assert self.n_calib > 0, "n_calib must be greater than 0"
assert self.threshold > 0.0, "threshold must be greater than 0"
@dataclass
class DataParallelConfig:
dp_degree: int = 1
use_cfg_parallel: bool = False
world_size: int = 1
def __post_init__(self):
        assert self.dp_degree >= 1, "dp_degree must be greater than or equal to 1"
# set classifier_free_guidance_degree parallel for split batch
if self.use_cfg_parallel:
self.cfg_degree = 2
else:
self.cfg_degree = 1
assert self.dp_degree * self.cfg_degree <= self.world_size, (
"dp_degree * cfg_degree must be less than or equal to "
"world_size because of classifier free guidance"
)
assert (
self.world_size % (self.dp_degree * self.cfg_degree) == 0
), "world_size must be divisible by dp_degree * cfg_degree"
@dataclass
class SequenceParallelConfig:
ulysses_degree: Optional[int] = None
ring_degree: Optional[int] = None
world_size: int = 1
def __post_init__(self):
if self.ulysses_degree is None:
self.ulysses_degree = 1
logger.info(
f"Ulysses degree not set, " f"using default value {self.ulysses_degree}"
)
if self.ring_degree is None:
self.ring_degree = 1
logger.info(
f"Ring degree not set, " f"using default value {self.ring_degree}"
)
self.sp_degree = self.ulysses_degree * self.ring_degree
if not HAS_LONG_CTX_ATTN and self.sp_degree > 1:
raise ImportError(
f"Sequence Parallel kit 'yunchang' not found but "
f"sp_degree is {self.sp_degree}, please set it "
f"to 1 or install 'yunchang' to use it"
)
@dataclass
class TensorParallelConfig:
tp_degree: int = 1
split_scheme: Optional[str] = "row"
world_size: int = 1
def __post_init__(self):
        assert self.tp_degree >= 1, "tp_degree must be greater than or equal to 1"
assert (
self.tp_degree <= self.world_size
), "tp_degree must be less than or equal to world_size"
@dataclass
class PipeFusionParallelConfig:
pp_degree: int = 1
num_pipeline_patch: Optional[int] = None
    attn_layer_num_for_pp: Optional[List[int]] = None
world_size: int = 1
def __post_init__(self):
        assert (
            self.pp_degree is not None and self.pp_degree >= 1
        ), "pipefusion_degree must be set and greater than or equal to 1 to use pipefusion"
assert (
self.pp_degree <= self.world_size
), "pipefusion_degree must be less than or equal to world_size"
if self.num_pipeline_patch is None:
self.num_pipeline_patch = self.pp_degree
logger.info(
f"Pipeline patch number not set, "
f"using default value {self.pp_degree}"
)
        if self.attn_layer_num_for_pp is not None:
            logger.info(
                f"attn_layer_num_for_pp set, splitting attention layers "
                f"to {self.attn_layer_num_for_pp}"
            )
            assert len(self.attn_layer_num_for_pp) == self.pp_degree, (
                "attn_layer_num_for_pp must have length equal to pp_degree "
                "if it is not None"
            )
        if self.pp_degree == 1 and self.num_pipeline_patch > 1:
            logger.warning(
                "Pipefusion degree is 1, pipeline parallelism will not be used, "
                "num_pipeline_patch will be ignored"
            )
self.num_pipeline_patch = 1
@dataclass
class ParallelConfig:
dp_config: DataParallelConfig
sp_config: SequenceParallelConfig
pp_config: PipeFusionParallelConfig
tp_config: TensorParallelConfig
world_size: int = 1 # FIXME: remove this
worker_cls: str = "xfuser.ray.worker.worker.Worker"
def __post_init__(self):
assert self.tp_config is not None, "tp_config must be set"
assert self.dp_config is not None, "dp_config must be set"
assert self.sp_config is not None, "sp_config must be set"
assert self.pp_config is not None, "pp_config must be set"
parallel_world_size = (
self.dp_config.dp_degree
* self.dp_config.cfg_degree
* self.sp_config.sp_degree
* self.tp_config.tp_degree
* self.pp_config.pp_degree
)
world_size = self.world_size
assert parallel_world_size == world_size, (
f"parallel_world_size {parallel_world_size} "
f"must be equal to world_size {self.world_size}"
)
assert (
world_size % (self.dp_config.dp_degree * self.dp_config.cfg_degree) == 0
), "world_size must be divisible by dp_degree * cfg_degree"
assert (
world_size % self.pp_config.pp_degree == 0
), "world_size must be divisible by pp_degree"
assert (
world_size % self.sp_config.sp_degree == 0
), "world_size must be divisible by sp_degree"
assert (
world_size % self.tp_config.tp_degree == 0
), "world_size must be divisible by tp_degree"
self.dp_degree = self.dp_config.dp_degree
self.cfg_degree = self.dp_config.cfg_degree
self.sp_degree = self.sp_config.sp_degree
self.pp_degree = self.pp_config.pp_degree
self.tp_degree = self.tp_config.tp_degree
self.ulysses_degree = self.sp_config.ulysses_degree
self.ring_degree = self.sp_config.ring_degree
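# Minimal sketch: the product dp * cfg * sp * tp * pp must equal world_size. For
# example, 8 GPUs with data parallel 4 and CFG parallel 2 give 4 * 2 * 1 * 1 * 1 = 8,
# built directly from the dataclasses above (the real entry points derive these
# values from CLI arguments).
if __name__ == "__main__":
    _parallel = ParallelConfig(
        dp_config=DataParallelConfig(dp_degree=4, use_cfg_parallel=True, world_size=8),
        sp_config=SequenceParallelConfig(world_size=8),
        pp_config=PipeFusionParallelConfig(pp_degree=1, world_size=8),
        tp_config=TensorParallelConfig(tp_degree=1, world_size=8),
        world_size=8,
    )
    print(_parallel.dp_degree, _parallel.cfg_degree, _parallel.sp_degree)  # 4 2 1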
@dataclass(frozen=True)
class EngineConfig:
model_config: ModelConfig
runtime_config: RuntimeConfig
parallel_config: ParallelConfig
fast_attn_config: FastAttnConfig
def __post_init__(self):
world_size = self.parallel_config.world_size
if self.fast_attn_config.use_fast_attn:
assert self.parallel_config.dp_degree == world_size, f"world_size must be equal to dp_degree when using DiTFastAttn"
def to_dict(self):
"""Return the configs as a dictionary, for use in **kwargs."""
return dict((field.name, getattr(self, field.name)) for field in fields(self))
@dataclass
class InputConfig:
height: int = 1024
width: int = 1024
num_frames: int = 49
    use_resolution_binning: bool = True
batch_size: Optional[int] = None
img_file_path: Optional[str] = None
prompt: Union[str, List[str]] = ""
negative_prompt: Union[str, List[str]] = ""
num_inference_steps: int = 20
max_sequence_length: int = 256
seed: int = 42
output_type: str = "pil"
def __post_init__(self):
if isinstance(self.prompt, list):
            assert (
                len(self.prompt) == len(self.negative_prompt)
                or len(self.negative_prompt) == 0
            ), "prompt and negative_prompt must have the same length, or negative_prompt must be empty"
self.batch_size = self.batch_size or len(self.prompt)
else:
self.batch_size = self.batch_size or 1
        assert self.output_type in [
            "pil",
            "latent",
            "pt",
        ], "output_type must be one of 'pil', 'latent', or 'pt'"
import os
import torch
import diffusers
from typing import TYPE_CHECKING, Any, Callable, Dict, Optional
from packaging import version
from xfuser.logger import init_logger
logger = init_logger(__name__)
if TYPE_CHECKING:
MASTER_ADDR: str = ""
MASTER_PORT: Optional[int] = None
CUDA_HOME: Optional[str] = None
LOCAL_RANK: int = 0
CUDA_VISIBLE_DEVICES: Optional[str] = None
XDIT_LOGGING_LEVEL: str = "INFO"
CUDA_VERSION: version.Version
TORCH_VERSION: version.Version
environment_variables: Dict[str, Callable[[], Any]] = {
# ================== Runtime Env Vars ==================
# used in distributed environment to determine the master address
"MASTER_ADDR": lambda: os.getenv("MASTER_ADDR", ""),
# used in distributed environment to manually set the communication port
"MASTER_PORT": lambda: (
int(os.getenv("MASTER_PORT", "0")) if "MASTER_PORT" in os.environ else None
),
# path to cudatoolkit home directory, under which should be bin, include,
# and lib directories.
"CUDA_HOME": lambda: os.environ.get("CUDA_HOME", None),
# local rank of the process in the distributed setting, used to determine
# the GPU device id
"LOCAL_RANK": lambda: int(os.environ.get("LOCAL_RANK", "0")),
# used to control the visible devices in the distributed setting
"CUDA_VISIBLE_DEVICES": lambda: os.environ.get("CUDA_VISIBLE_DEVICES", None),
# this is used for configuring the default logging level
"XDIT_LOGGING_LEVEL": lambda: os.getenv("XDIT_LOGGING_LEVEL", "INFO"),
}
variables: Dict[str, Callable[[], Any]] = {
# ================== Other Vars ==================
# used in version checking
# "CUDA_VERSION": lambda: version.parse(torch.version.cuda),
"CUDA_VERSION": "gfx928",
"TORCH_VERSION": lambda: version.parse(
version.parse(torch.__version__).base_version
),
}
class PackagesEnvChecker:
_instance = None
def __new__(cls):
if cls._instance is None:
cls._instance = super(PackagesEnvChecker, cls).__new__(cls)
cls._instance.initialize()
return cls._instance
def initialize(self):
self.packages_info = {
"has_flash_attn": self.check_flash_attn(),
"has_long_ctx_attn": self.check_long_ctx_attn(),
"diffusers_version": self.check_diffusers_version(),
}
def check_flash_attn(self):
try:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
gpu_name = torch.cuda.get_device_name(device)
if "Turing" in gpu_name or "Tesla" in gpu_name or "T4" in gpu_name:
return False
else:
from flash_attn import flash_attn_func
from flash_attn import __version__
if __version__ < "2.6.0":
raise ImportError(f"install flash_attn >= 2.6.0")
return True
except ImportError:
logger.warning(
f'Flash Attention library "flash_attn" not found, '
f"using pytorch attention implementation"
)
return False
def check_long_ctx_attn(self):
try:
from yunchang import (
set_seq_parallel_pg,
ring_flash_attn_func,
UlyssesAttention,
LongContextAttention,
LongContextAttentionQKVPacked,
)
return True
except ImportError:
logger.warning(
f'Ring Flash Attention library "yunchang" not found, '
f"using pytorch attention implementation"
)
return False
def check_diffusers_version(self):
if version.parse(
version.parse(diffusers.__version__).base_version
) < version.parse("0.30.0"):
            raise RuntimeError(
                f"Diffusers version {version.parse(version.parse(diffusers.__version__).base_version)} is not supported, "
                "please upgrade to a version > 0.30.0"
            )
return version.parse(version.parse(diffusers.__version__).base_version)
def get_packages_info(self):
return self.packages_info
PACKAGES_CHECKER = PackagesEnvChecker()
def __getattr__(name):
# lazy evaluation of environment variables
if name in environment_variables:
return environment_variables[name]()
if name in variables:
return variables[name]()
raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
def __dir__():
return list(environment_variables.keys())
#!/bin/bash
cp modified/config.py /usr/local/lib/python3.10/site-packages/xfuser/config/
cp modified/envs.py /usr/local/lib/python3.10/site-packages/xfuser/
opencv-python==4.9.0.80
diffusers==0.31.0
accelerate==1.1.1
pandas==2.0.3
# numpy==1.24.4
einops==0.7.0
tqdm==4.66.2
loguru==0.7.2
imageio==2.34.0
imageio-ffmpeg==0.5.1
safetensors==0.4.3
peft==0.13.2
transformers==4.39.3
tokenizers==0.15.0
# deepspeed==0.15.1
pyarrow==14.0.1
tensorboard==2.19.0
# git+https://github.com/openai/CLIP.git
import os
import time
from pathlib import Path
from loguru import logger
from datetime import datetime
from hyvideo.utils.file_utils import save_videos_grid
from hyvideo.config import parse_args
from hyvideo.inference import HunyuanVideoSampler
def main():
args = parse_args()
print(args)
models_root_path = Path(args.model_base)
if not models_root_path.exists():
raise ValueError(f"`models_root` not exists: {models_root_path}")
# Create save folder to save the samples
save_path = args.save_path if args.save_path_suffix=="" else f'{args.save_path}_{args.save_path_suffix}'
if not os.path.exists(save_path):
os.makedirs(save_path, exist_ok=True)
# Load models
hunyuan_video_sampler = HunyuanVideoSampler.from_pretrained(models_root_path, args=args)
# Get the updated args
args = hunyuan_video_sampler.args
# Start sampling
# TODO: batch inference check
outputs = hunyuan_video_sampler.predict(
prompt=args.prompt,
height=args.video_size[0],
width=args.video_size[1],
video_length=args.video_length,
seed=args.seed,
negative_prompt=args.neg_prompt,
infer_steps=args.infer_steps,
guidance_scale=args.cfg_scale,
num_videos_per_prompt=args.num_videos,
flow_shift=args.flow_shift,
batch_size=args.batch_size,
embedded_guidance_scale=args.embedded_cfg_scale,
i2v_mode=args.i2v_mode,
i2v_resolution=args.i2v_resolution,
i2v_image_path=args.i2v_image_path,
i2v_condition_type=args.i2v_condition_type,
i2v_stability=args.i2v_stability,
ulysses_degree=args.ulysses_degree,
ring_degree=args.ring_degree,
xdit_adaptive_size=args.xdit_adaptive_size
)
samples = outputs['samples']
# Save samples
if 'LOCAL_RANK' not in os.environ or int(os.environ['LOCAL_RANK']) == 0:
for i, sample in enumerate(samples):
sample = samples[i].unsqueeze(0)
time_flag = datetime.fromtimestamp(time.time()).strftime("%Y-%m-%d-%H:%M:%S")
cur_save_path = f"{save_path}/{time_flag}_seed{outputs['seeds'][i]}_{outputs['prompts'][i][:100].replace('/','')}.mp4"
save_videos_grid(sample, cur_save_path, fps=24)
            logger.info(f'Sample saved to: {cur_save_path}')
if __name__ == "__main__":
main()
#!/bin/bash
python3 sample_image2video.py \
--prompt "An Asian man with short hair in black tactical uniform and white clothes waves a firework stick." \
--i2v-image-path ./assets/demo/i2v/imgs/0.jpg \
--model HYVideo-T/2 \
--i2v-mode \
--i2v-resolution 720p \
--infer-steps 50 \
--video-length 129 \
--flow-reverse \
--flow-shift 17.0 \
--embedded-cfg-scale 6.0 \
--seed 0 \
--use-cpu-offload \
    --save-path ./results
# More example
# --prompt "A girl walks on the road, shooting stars pass by." \
# --i2v-image-path ./assets/demo/i2v/imgs/1.png \