"vscode:/vscode.git/clone" did not exist on "8262d4614331aba38e39d1e6d3546f429764ac91"
Commit e1f7729e authored by gushiqiao, committed by GitHub

Support wan2.2 ti2v-5B and fix some bugs.

parents 6943aa52 1bba5529
{
"infer_steps": 50,
"target_video_length": 121,
"text_len": 512,
"target_height": 704,
"target_width": 1280,
"num_channels_latents": 48,
"vae_stride": [4, 16, 16],
"self_attn_1_type": "flash_attn3",
"cross_attn_1_type": "flash_attn3",
"cross_attn_2_type": "flash_attn3",
"seed": 42,
"sample_guide_scale": 5.0,
"sample_shift": 5.0,
"enable_cfg": true,
"cpu_offload": false,
"offload_granularity": "model",
"fps": 24,
"use_image_encoder": false
}
{
"infer_steps": 50,
"target_video_length": 121,
"text_len": 512,
"target_height": 704,
"target_width": 1280,
"num_channels_latents": 48,
"vae_stride": [4, 16, 16],
"self_attn_1_type": "flash_attn3",
"cross_attn_1_type": "flash_attn3",
"cross_attn_2_type": "flash_attn3",
"seed": 42,
"sample_guide_scale": 5.0,
"sample_shift": 5.0,
"enable_cfg": true,
"cpu_offload": false,
"offload_granularity": "model",
"fps": 24
}
......@@ -60,8 +60,8 @@ class VAERunner:
def _run_vae_encoder(self, img):
img = image_transporter.load_image(img)
vae_encode_out, kwargs = self.runner.run_vae_encoder(img)
return vae_encode_out, kwargs
vae_encoder_out, kwargs = self.runner.run_vae_encoder(img)
return vae_encoder_out, kwargs
def _run_vae_decoder(self, latents):
latents = tensor_transporter.load_tensor(latents)
......@@ -72,9 +72,9 @@ class VAERunner:
def run_vae_encoder(message: Message):
try:
global runner
vae_encode_out, kwargs = runner._run_vae_encoder(message.img)
vae_encoder_out, kwargs = runner._run_vae_encoder(message.img)
VAEServiceStatus.complete_task(message)
return vae_encode_out, kwargs
return vae_encoder_out, kwargs
except Exception as e:
logger.error(f"task_id {message.task_id} failed: {str(e)}")
VAEServiceStatus.record_failed_task(message, error=str(e))
......@@ -95,9 +95,9 @@ def run_vae_decoder(message: Message):
def v1_local_vae_model_encoder_generate(message: Message):
try:
task_id = VAEServiceStatus.start_task(message)
vae_encode_out, kwargs = run_vae_encoder(message)
output = tensor_transporter.prepare_tensor(vae_encode_out)
del vae_encode_out
vae_encoder_out, kwargs = run_vae_encoder(message)
output = tensor_transporter.prepare_tensor(vae_encoder_out)
del vae_encoder_out
return {"task_id": task_id, "task_status": "completed", "output": output, "kwargs": kwargs}
except RuntimeError as e:
return {"error": str(e)}
......
......@@ -45,7 +45,7 @@ def main():
"--model_cls",
type=str,
required=True,
choices=["wan2.1", "hunyuan", "wan2.1_distill", "wan2.1_causvid", "wan2.1_skyreels_v2_df", "cogvideox", "wan2.1_audio", "wan2.2_moe", "wan2.2_moe_audio"],
choices=["wan2.1", "hunyuan", "wan2.1_distill", "wan2.1_causvid", "wan2.1_skyreels_v2_df", "cogvideox", "wan2.1_audio", "wan2.2_moe", "wan2.2_moe_audio", "wan2.2"],
default="wan2.1",
)
......
......@@ -52,7 +52,7 @@ class WanAudioPreInfer(WanPreInfer):
seq_len = self.scheduler.seq_len
clip_fea = inputs["image_encoder_output"]["clip_encoder_out"]
ref_image_encoder = inputs["image_encoder_output"]["vae_encode_out"]
ref_image_encoder = inputs["image_encoder_output"]["vae_encoder_out"]
batch_size = len(x)
num_channels, num_frames, height, width = x[0].shape
_, ref_num_channels, ref_num_frames, _, _ = ref_image_encoder.shape
......
......@@ -25,7 +25,7 @@ class WanPostInfer:
if GET_DTYPE() != "BF16":
x = x.float()
x.mul_(1 + e[1].squeeze(0)).add_(e[0].squeeze(0))
x.mul_(1 + e[1].squeeze()).add_(e[0].squeeze())
if GET_DTYPE() != "BF16":
x = x.to(torch.bfloat16)
......
......@@ -35,7 +35,10 @@ class WanPreInfer:
t = self.scheduler.df_timesteps[self.scheduler.step_index].unsqueeze(0)
assert t.dim() == 2  # DF inference models use a 2-D timestep
else:
t = torch.stack([self.scheduler.timesteps[self.scheduler.step_index]])
timestep = self.scheduler.timesteps[self.scheduler.step_index]
t = torch.stack([timestep])
if hasattr(self.scheduler, "mask"):
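# wan2.2 ti2v: expand the scalar timestep to one value per latent token; the mask is subsampled
# spatially by 2 (presumably matching the 2x2 patch embedding), so conditioning tokens (mask == 0) get t = 0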
t = (self.scheduler.mask[0][:, ::2, ::2] * t).flatten()
if positive:
context = inputs["text_encoder_output"]["context"]
......@@ -47,17 +50,18 @@ class WanPreInfer:
clip_fea = inputs["image_encoder_output"]["clip_encoder_out"]
if self.config.get("changing_resolution", False):
image_encoder = inputs["image_encoder_output"]["vae_encode_out"][self.scheduler.changing_resolution_index]
image_encoder = inputs["image_encoder_output"]["vae_encoder_out"][self.scheduler.changing_resolution_index]
else:
image_encoder = inputs["image_encoder_output"]["vae_encode_out"]
image_encoder = inputs["image_encoder_output"]["vae_encoder_out"]
frame_seq_length = (image_encoder.size(2) // 2) * (image_encoder.size(3) // 2)
if kv_end - kv_start >= frame_seq_length:  # for CausalVid, take the matching slice of image_encoder
idx_s = kv_start // frame_seq_length
idx_e = kv_end // frame_seq_length
image_encoder = image_encoder[:, idx_s:idx_e, :, :]
y = image_encoder
x = torch.cat([x, y], dim=0)
if image_encoder is not None:
frame_seq_length = (image_encoder.size(2) // 2) * (image_encoder.size(3) // 2)
if kv_end - kv_start >= frame_seq_length:  # for CausalVid, take the matching slice of image_encoder
idx_s = kv_start // frame_seq_length
idx_e = kv_end // frame_seq_length
image_encoder = image_encoder[:, idx_s:idx_e, :, :]
y = image_encoder
x = torch.cat([x, y], dim=0)
# embeddings
x = weights.patch_embedding.apply(x.unsqueeze(0))
......
......@@ -309,12 +309,13 @@ class WanTransformerInfer(BaseTransformerInfer):
return x
def infer_modulation(self, weights, embed0):
if embed0.dim() == 3:
if embed0.dim() == 3 and embed0.shape[2] == 1:
modulation = weights.modulation.tensor.unsqueeze(2)
embed0 = (modulation + embed0).chunk(6, dim=1)
shift_msa, scale_msa, gate_msa, c_shift_msa, c_scale_msa, c_gate_msa = [ei.squeeze(1) for ei in embed0]
elif embed0.dim() == 2:
else:
shift_msa, scale_msa, gate_msa, c_shift_msa, c_scale_msa, c_gate_msa = (weights.modulation.tensor + embed0).chunk(6, dim=1)
if self.clean_cuda_cache:
del embed0
torch.cuda.empty_cache()
......@@ -330,11 +331,11 @@ class WanTransformerInfer(BaseTransformerInfer):
def infer_self_attn(self, weights, grid_sizes, x, seq_lens, freqs, shift_msa, scale_msa):
if hasattr(weights, "smooth_norm1_weight"):
norm1_weight = (1 + scale_msa.squeeze(0)) * weights.smooth_norm1_weight.tensor
norm1_bias = shift_msa.squeeze(0) * weights.smooth_norm1_bias.tensor
norm1_weight = (1 + scale_msa.squeeze()) * weights.smooth_norm1_weight.tensor
norm1_bias = shift_msa.squeeze() * weights.smooth_norm1_bias.tensor
else:
norm1_weight = 1 + scale_msa.squeeze(0)
norm1_bias = shift_msa.squeeze(0)
norm1_weight = 1 + scale_msa.squeeze()
norm1_bias = shift_msa.squeeze()
norm1_out = weights.norm1.apply(x)
......@@ -398,9 +399,9 @@ class WanTransformerInfer(BaseTransformerInfer):
def infer_cross_attn(self, weights, x, context, y_out, gate_msa):
if GET_DTYPE() != "BF16":
x = x.float() + y_out.float() * gate_msa.squeeze(0)
x = x.float() + y_out.float() * gate_msa.squeeze()
else:
x.add_(y_out * gate_msa.squeeze(0))
x.add_(y_out * gate_msa.squeeze())
norm3_out = weights.norm3.apply(x)
if self.task == "i2v" and self.config.get("use_image_encoder", True):
......@@ -473,11 +474,11 @@ class WanTransformerInfer(BaseTransformerInfer):
torch.cuda.empty_cache()
if hasattr(weights, "smooth_norm2_weight"):
norm2_weight = (1 + c_scale_msa.squeeze(0)) * weights.smooth_norm2_weight.tensor
norm2_bias = c_shift_msa.squeeze(0) * weights.smooth_norm2_bias.tensor
norm2_weight = (1 + c_scale_msa.squeeze()) * weights.smooth_norm2_weight.tensor
norm2_bias = c_shift_msa.squeeze() * weights.smooth_norm2_bias.tensor
else:
norm2_weight = 1 + c_scale_msa.squeeze(0)
norm2_bias = c_shift_msa.squeeze(0)
norm2_weight = 1 + c_scale_msa.squeeze()
norm2_bias = c_shift_msa.squeeze()
norm2_out = weights.norm2.apply(x)
if GET_DTYPE() != "BF16":
......@@ -499,9 +500,9 @@ class WanTransformerInfer(BaseTransformerInfer):
def post_process(self, x, y, c_gate_msa):
if GET_DTYPE() != "BF16":
x = x.float() + y.float() * c_gate_msa.squeeze(0)
x = x.float() + y.float() * c_gate_msa.squeeze()
else:
x.add_(y * c_gate_msa.squeeze(0))
x.add_(y * c_gate_msa.squeeze())
if self.clean_cuda_cache:
del y, c_gate_msa
......
......@@ -112,12 +112,12 @@ class BaseRunner(ABC):
pass
@abstractmethod
def get_encoder_output_i2v(self, clip_encoder_out: Any, vae_encode_out: Any, text_encoder_output: Any, img: Any) -> Dict[str, Any]:
def get_encoder_output_i2v(self, clip_encoder_out: Any, vae_encoder_out: Any, text_encoder_output: Any, img: Any) -> Dict[str, Any]:
"""Combine encoder outputs for i2v task
Args:
clip_encoder_out: CLIP encoder output
vae_encode_out: VAE encoder output
vae_encoder_out: VAE encoder output
text_encoder_output: Text encoder output
img: Original image
......
......@@ -49,7 +49,7 @@ class CogvideoxRunner(DefaultRunner):
# TODO: implement vae encoder for Cogvideox
raise NotImplementedError("I2V inference is not implemented for Cogvideox.")
def get_encoder_output_i2v(self, clip_encoder_out, vae_encode_out, text_encoder_output, img):
def get_encoder_output_i2v(self, clip_encoder_out, vae_encoder_out, text_encoder_output, img):
# TODO: Implement image encoder for Cogvideox-I2V
raise ValueError(f"Unsupported model class: {self.config['model_cls']}")
......
......@@ -10,7 +10,7 @@ import torch.distributed as dist
from lightx2v.utils.envs import *
from lightx2v.utils.generate_task_id import generate_task_id
from lightx2v.utils.profiler import ProfilingContext, ProfilingContext4Debug
from lightx2v.utils.utils import save_to_video, vae_to_comfyui_image
from lightx2v.utils.utils import save_to_video, vae_to_comfyui_image, cache_video
from .base_runner import BaseRunner
......@@ -176,6 +176,8 @@ class DefaultRunner(BaseRunner):
self.model = self.load_transformer()
self.init_scheduler()
self.model.scheduler.prepare(self.inputs["image_encoder_output"])
if self.config.get("model_cls") == "wan2.2":
self.inputs["image_encoder_output"]["vae_encoder_out"] = None
latents, generator = self.run()
self.end_run()
return latents, generator
......@@ -212,13 +214,12 @@ class DefaultRunner(BaseRunner):
self.config["prompt_enhanced"] = self.post_prompt_enhancer()
self.inputs = self.run_input_encoder()
self.set_target_shape()
latents, generator = self.run_dit()
images = self.run_vae_decoder(latents, generator)
images = vae_to_comfyui_image(images)
if self.config["model_cls"] != "wan2.2":
images = vae_to_comfyui_image(images)
if "video_frame_interpolation" in self.config:
assert self.vfi_model is not None and self.config["video_frame_interpolation"].get("target_fps", None) is not None
......@@ -238,7 +239,11 @@ class DefaultRunner(BaseRunner):
if not self.config.get("parallel_attn_type", None) or dist.get_rank() == 0:
logger.info(f"Saving video to {self.config.save_video_path}")
save_to_video(images, self.config.save_video_path, fps=fps, method="ffmpeg") # type: ignore
if self.config["model_cls"] != "wan2.2":
save_to_video(images, self.config.save_video_path, fps=fps, method="ffmpeg") # type: ignore
else:
cache_video(tensor=images, save_file=self.config.save_video_path, fps=fps, nrow=1, normalize=True, value_range=(-1, 1))
del latents, generator
torch.cuda.empty_cache()
......
......@@ -137,8 +137,8 @@ class HunyuanRunner(DefaultRunner):
return img_latents, kwargs
def get_encoder_output_i2v(self, clip_encoder_out, vae_encode_out, text_encoder_output, img):
image_encoder_output = {"img": img, "img_latents": vae_encode_out}
def get_encoder_output_i2v(self, clip_encoder_out, vae_encoder_out, text_encoder_output, img):
image_encoder_output = {"img": img, "img_latents": vae_encoder_out}
return {"text_encoder_output": text_encoder_output, "image_encoder_output": image_encoder_output}
def set_target_shape(self):
......
......@@ -436,10 +436,10 @@ class WanAudioRunner(WanRunner): # type:ignore
if os.path.isfile(self.config.image_path):
with ProfilingContext("Run Img Encoder"):
vae_encode_out, clip_encoder_out = self.run_image_encoder(self.config, self.vae_encoder)
vae_encoder_out, clip_encoder_out = self.run_image_encoder(self.config, self.vae_encoder)
image_encoder_output = {
"clip_encoder_out": clip_encoder_out,
"vae_encode_out": vae_encode_out,
"vae_encoder_out": vae_encoder_out,
}
with ProfilingContext("Run Text Encoder"):
......@@ -660,11 +660,11 @@ class WanAudioRunner(WanRunner): # type:ignore
# vae encode
cond_frms = rearrange(cond_frms, "1 C H W -> 1 C 1 H W")
vae_encode_out = vae_model.encode(cond_frms.to(torch.float), config)
if isinstance(vae_encode_out, list):
vae_encode_out = torch.stack(vae_encode_out, dim=0).to(torch.bfloat16)
vae_encoder_out = vae_model.encode(cond_frms.to(torch.float), config)
if isinstance(vae_encoder_out, list):
vae_encoder_out = torch.stack(vae_encoder_out, dim=0).to(torch.bfloat16)
return vae_encode_out, clip_encoder_out
return vae_encoder_out, clip_encoder_out
def set_target_shape(self):
"""Set target shape for generation"""
......
......@@ -3,6 +3,8 @@ import gc
import numpy as np
import torch
import torchvision.transforms.functional as TF
import torch.distributed as dist
from loguru import logger
from PIL import Image
from lightx2v.utils.registry_factory import RUNNER_REGISTER
from lightx2v.models.runners.default_runner import DefaultRunner
......@@ -14,16 +16,16 @@ from lightx2v.models.schedulers.wan.feature_caching.scheduler import (
WanSchedulerCaching,
WanSchedulerTaylorCaching,
)
from lightx2v.utils.profiler import ProfilingContext
from lightx2v.utils.utils import *
from lightx2v.models.input_encoders.hf.t5.model import T5EncoderModel
from lightx2v.models.input_encoders.hf.xlm_roberta.model import CLIPModel
from lightx2v.models.networks.wan.model import WanModel, Wan22MoeModel
from lightx2v.models.networks.wan.lora_adapter import WanLoraWrapper
from lightx2v.models.video_encoders.hf.wan.vae import WanVAE
from lightx2v.models.video_encoders.hf.wan.vae_2_2 import Wan2_2_VAE
from lightx2v.models.video_encoders.hf.wan.vae_tiny import WanVAE_tiny
from lightx2v.utils.utils import cache_video
from loguru import logger
from lightx2v.utils.utils import cache_video, best_output_size
from lightx2v.utils.profiler import ProfilingContext
@RUNNER_REGISTER("wan2.1")
......@@ -218,8 +220,8 @@ class WanRunner(DefaultRunner):
return vae_encode_out_list
else:
self.config.lat_h, self.config.lat_w = lat_h, lat_w
vae_encode_out = self.get_vae_encoder_output(img, lat_h, lat_w)
return vae_encode_out
vae_encoder_out = self.get_vae_encoder_output(img, lat_h, lat_w)
return vae_encoder_out
def get_vae_encoder_output(self, img, lat_h, lat_w):
h = lat_h * self.config.vae_stride[1]
......@@ -238,7 +240,7 @@ class WanRunner(DefaultRunner):
msk = msk.transpose(1, 2)[0]
if self.config.get("lazy_load", False) or self.config.get("unload_modules", False):
self.vae_encoder = self.load_vae_encoder()
vae_encode_out = self.vae_encoder.encode(
vae_encoder_out = self.vae_encoder.encode(
[
torch.concat(
[
......@@ -254,13 +256,13 @@ class WanRunner(DefaultRunner):
del self.vae_encoder
torch.cuda.empty_cache()
gc.collect()
vae_encode_out = torch.concat([msk, vae_encode_out]).to(torch.bfloat16)
return vae_encode_out
vae_encoder_out = torch.concat([msk, vae_encoder_out]).to(torch.bfloat16)
return vae_encoder_out
def get_encoder_output_i2v(self, clip_encoder_out, vae_encode_out, text_encoder_output, img):
def get_encoder_output_i2v(self, clip_encoder_out, vae_encoder_out, text_encoder_output, img):
image_encoder_output = {
"clip_encoder_out": clip_encoder_out,
"vae_encode_out": vae_encode_out,
"vae_encoder_out": vae_encoder_out,
}
return {
"text_encoder_output": text_encoder_output,
......@@ -363,3 +365,58 @@ class Wan22MoeRunner(WanRunner):
self.init_device,
)
return MultiModelStruct([high_noise_model, low_noise_model], self.config, self.config.boundary)
@RUNNER_REGISTER("wan2.2")
class Wan22DenseRunner(WanRunner):
def __init__(self, config):
super().__init__(config)
def load_vae_decoder(self):
vae_config = {
"vae_pth": find_torch_model_path(self.config, "vae_pth", "Wan2.2_VAE.pth"),
"device": self.init_device,
}
vae_decoder = Wan2_2_VAE(**vae_config)
return vae_decoder
def load_vae_encoder(self):
vae_config = {
"vae_pth": find_torch_model_path(self.config, "vae_pth", "Wan2.2_VAE.pth"),
"device": self.init_device,
}
if self.config.task != "i2v":
return None
else:
return Wan2_2_VAE(**vae_config)
def load_vae(self):
vae_encoder = self.load_vae_encoder()
vae_decoder = self.load_vae_decoder()
return vae_encoder, vae_decoder
def run_vae_encoder(self, img):
max_area = self.config.target_height * self.config.target_width
ih, iw = img.height, img.width
dh, dw = self.config.patch_size[1] * self.config.vae_stride[1], self.config.patch_size[2] * self.config.vae_stride[2]
ow, oh = best_output_size(iw, ih, dw, dh, max_area)
scale = max(ow / iw, oh / ih)
img = img.resize((round(iw * scale), round(ih * scale)), Image.LANCZOS)
# center-crop
x1 = (img.width - ow) // 2
y1 = (img.height - oh) // 2
img = img.crop((x1, y1, x1 + ow, y1 + oh))
assert img.width == ow and img.height == oh
# to tensor
img = TF.to_tensor(img).sub_(0.5).div_(0.5).cuda().unsqueeze(1)
vae_encoder_out = self.get_vae_encoder_output(img)
self.config.lat_w, self.config.lat_h = ow // self.config.vae_stride[2], oh // self.config.vae_stride[1]
return vae_encoder_out
def get_vae_encoder_output(self, img):
z = self.vae_encoder.encode(img)
return z
......@@ -36,9 +36,9 @@ class WanSkyreelsV2DFRunner(WanRunner): # Diffusion forcing for SkyReelsV2 DF I
config.lat_h = lat_h
config.lat_w = lat_w
vae_encode_out = vae_model.encode([torch.nn.functional.interpolate(img[None].cpu(), size=(h, w), mode="bicubic").transpose(0, 1).cuda()], config)[0]
vae_encode_out = vae_encode_out.to(torch.bfloat16)
return vae_encode_out
vae_encoder_out = vae_model.encode([torch.nn.functional.interpolate(img[None].cpu(), size=(h, w), mode="bicubic").transpose(0, 1).cuda()], config)[0]
vae_encoder_out = vae_encoder_out.to(torch.bfloat16)
return vae_encoder_out
def set_target_shape(self):
if os.path.isfile(self.config.image_path):
......
......@@ -2,8 +2,9 @@ import math
import numpy as np
import torch
import gc
from typing import List, Optional, Tuple, Union
from typing import List, Optional, Union
from lightx2v.models.schedulers.scheduler import BaseScheduler
from lightx2v.utils.utils import masks_like
class WanScheduler(BaseScheduler):
......@@ -19,10 +20,11 @@ class WanScheduler(BaseScheduler):
self.solver_order = 2
self.noise_pred = None
self.sample_guide_scale = self.config.sample_guide_scale
self.caching_records_2 = [True] * self.config.infer_steps
def prepare(self, image_encoder_output=None):
if self.config["model_cls"] == "wan2.2" and self.config["task"] == "i2v":
self.vae_encoder_out = image_encoder_output["vae_encoder_out"]
self.generator = torch.Generator(device=self.device)
self.generator.manual_seed(self.config.seed)
......@@ -57,6 +59,12 @@ class WanScheduler(BaseScheduler):
device=self.device,
generator=self.generator,
)
if self.config["model_cls"] == "wan2.2":
if self.config["task"] == "t2v":
self.mask = masks_like(self.latents, zero=False)
elif self.config["task"] == "i2v":
self.mask = masks_like(self.latents, zero=True)
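# i2v: positions where the mask is 0 (the first latent frame) are pinned to the VAE-encoded
# conditioning image; noise only occupies the mask == 1 region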
self.latents = (1.0 - self.mask) * self.vae_encoder_out + self.mask * self.latents
def set_timesteps(
self,
......@@ -354,3 +362,5 @@ class WanScheduler(BaseScheduler):
self.lower_order_nums += 1
self.latents = prev_sample
if self.config["model_cls"] == "wan2.2" and self.config["task"] == "i2v":
self.latents = (1.0 - self.mask) * self.vae_encoder_out + self.mask * self.latents
# Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
import logging
import torch
import torch.cuda.amp as amp
import torch.nn as nn
import torch.nn.functional as F
from einops import rearrange
__all__ = [
"Wan2_2_VAE",
]
CACHE_T = 2
class CausalConv3d(nn.Conv3d):
"""
Causal 3D convolution.
"""
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self._padding = (
self.padding[2],
self.padding[2],
self.padding[1],
self.padding[1],
2 * self.padding[0],
0,
)
self.padding = (0, 0, 0)
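# _padding uses F.pad ordering (W, H, T): the full temporal pad sits in front of the sequence and
# none behind it, so the convolution never sees future frames; spatial padding stays symmetric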
def forward(self, x, cache_x=None):
padding = list(self._padding)
if cache_x is not None and self._padding[4] > 0:
cache_x = cache_x.to(x.device)
x = torch.cat([cache_x, x], dim=2)
padding[4] -= cache_x.shape[2]
x = F.pad(x, padding)
return super().forward(x)
class RMS_norm(nn.Module):
def __init__(self, dim, channel_first=True, images=True, bias=False):
super().__init__()
broadcastable_dims = (1, 1, 1) if not images else (1, 1)
shape = (dim, *broadcastable_dims) if channel_first else (dim,)
self.channel_first = channel_first
self.scale = dim**0.5
self.gamma = nn.Parameter(torch.ones(shape))
self.bias = nn.Parameter(torch.zeros(shape)) if bias else 0.0
def forward(self, x):
return F.normalize(x, dim=(1 if self.channel_first else -1)) * self.scale * self.gamma + self.bias
class Upsample(nn.Upsample):
def forward(self, x):
"""
Fix bfloat16 support for nearest neighbor interpolation.
"""
return super().forward(x.float()).type_as(x)
class Resample(nn.Module):
def __init__(self, dim, mode):
assert mode in (
"none",
"upsample2d",
"upsample3d",
"downsample2d",
"downsample3d",
)
super().__init__()
self.dim = dim
self.mode = mode
# layers
if mode == "upsample2d":
self.resample = nn.Sequential(
Upsample(scale_factor=(2.0, 2.0), mode="nearest-exact"),
nn.Conv2d(dim, dim, 3, padding=1),
)
elif mode == "upsample3d":
self.resample = nn.Sequential(
Upsample(scale_factor=(2.0, 2.0), mode="nearest-exact"),
nn.Conv2d(dim, dim, 3, padding=1),
# nn.Conv2d(dim, dim//2, 3, padding=1)
)
self.time_conv = CausalConv3d(dim, dim * 2, (3, 1, 1), padding=(1, 0, 0))
elif mode == "downsample2d":
self.resample = nn.Sequential(nn.ZeroPad2d((0, 1, 0, 1)), nn.Conv2d(dim, dim, 3, stride=(2, 2)))
elif mode == "downsample3d":
self.resample = nn.Sequential(nn.ZeroPad2d((0, 1, 0, 1)), nn.Conv2d(dim, dim, 3, stride=(2, 2)))
self.time_conv = CausalConv3d(dim, dim, (3, 1, 1), stride=(2, 1, 1), padding=(0, 0, 0))
else:
self.resample = nn.Identity()
def forward(self, x, feat_cache=None, feat_idx=[0]):
b, c, t, h, w = x.size()
if self.mode == "upsample3d":
if feat_cache is not None:
idx = feat_idx[0]
if feat_cache[idx] is None:
feat_cache[idx] = "Rep"
feat_idx[0] += 1
else:
cache_x = x[:, :, -CACHE_T:, :, :].clone()
if cache_x.shape[2] < 2 and feat_cache[idx] is not None and feat_cache[idx] != "Rep":
# cache spans the chunk boundary: prepend the previous chunk's last frame when fewer than CACHE_T frames remain
cache_x = torch.cat(
[
feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(cache_x.device),
cache_x,
],
dim=2,
)
if cache_x.shape[2] < 2 and feat_cache[idx] is not None and feat_cache[idx] == "Rep":
cache_x = torch.cat(
[torch.zeros_like(cache_x).to(cache_x.device), cache_x],
dim=2,
)
if feat_cache[idx] == "Rep":
x = self.time_conv(x)
else:
x = self.time_conv(x, feat_cache[idx])
feat_cache[idx] = cache_x
feat_idx[0] += 1
x = x.reshape(b, 2, c, t, h, w)
x = torch.stack((x[:, 0, :, :, :, :], x[:, 1, :, :, :, :]), 3)
x = x.reshape(b, c, t * 2, h, w)
t = x.shape[2]
x = rearrange(x, "b c t h w -> (b t) c h w")
x = self.resample(x)
x = rearrange(x, "(b t) c h w -> b c t h w", t=t)
if self.mode == "downsample3d":
if feat_cache is not None:
idx = feat_idx[0]
if feat_cache[idx] is None:
feat_cache[idx] = x.clone()
feat_idx[0] += 1
else:
cache_x = x[:, :, -1:, :, :].clone()
x = self.time_conv(torch.cat([feat_cache[idx][:, :, -1:, :, :], x], 2))
feat_cache[idx] = cache_x
feat_idx[0] += 1
return x
def init_weight(self, conv):
conv_weight = conv.weight.detach().clone()
nn.init.zeros_(conv_weight)
c1, c2, t, h, w = conv_weight.size()
one_matrix = torch.eye(c1, c2)
init_matrix = one_matrix
nn.init.zeros_(conv_weight)
conv_weight.data[:, :, 1, 0, 0] = init_matrix # * 0.5
conv.weight = nn.Parameter(conv_weight)
nn.init.zeros_(conv.bias.data)
def init_weight2(self, conv):
conv_weight = conv.weight.data.detach().clone()
nn.init.zeros_(conv_weight)
c1, c2, t, h, w = conv_weight.size()
init_matrix = torch.eye(c1 // 2, c2)
conv_weight[: c1 // 2, :, -1, 0, 0] = init_matrix
conv_weight[c1 // 2 :, :, -1, 0, 0] = init_matrix
conv.weight = nn.Parameter(conv_weight)
nn.init.zeros_(conv.bias.data)
class ResidualBlock(nn.Module):
def __init__(self, in_dim, out_dim, dropout=0.0):
super().__init__()
self.in_dim = in_dim
self.out_dim = out_dim
# layers
self.residual = nn.Sequential(
RMS_norm(in_dim, images=False),
nn.SiLU(),
CausalConv3d(in_dim, out_dim, 3, padding=1),
RMS_norm(out_dim, images=False),
nn.SiLU(),
nn.Dropout(dropout),
CausalConv3d(out_dim, out_dim, 3, padding=1),
)
self.shortcut = CausalConv3d(in_dim, out_dim, 1) if in_dim != out_dim else nn.Identity()
def forward(self, x, feat_cache=None, feat_idx=[0]):
h = self.shortcut(x)
for layer in self.residual:
if isinstance(layer, CausalConv3d) and feat_cache is not None:
idx = feat_idx[0]
cache_x = x[:, :, -CACHE_T:, :, :].clone()
if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
# cache spans the chunk boundary: prepend the previous chunk's last frame when fewer than CACHE_T frames remain
cache_x = torch.cat(
[
feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(cache_x.device),
cache_x,
],
dim=2,
)
x = layer(x, feat_cache[idx])
feat_cache[idx] = cache_x
feat_idx[0] += 1
else:
x = layer(x)
return x + h
class AttentionBlock(nn.Module):
"""
Causal self-attention with a single head.
"""
def __init__(self, dim):
super().__init__()
self.dim = dim
# layers
self.norm = RMS_norm(dim)
self.to_qkv = nn.Conv2d(dim, dim * 3, 1)
self.proj = nn.Conv2d(dim, dim, 1)
# zero out the last layer params
nn.init.zeros_(self.proj.weight)
def forward(self, x):
identity = x
b, c, t, h, w = x.size()
x = rearrange(x, "b c t h w -> (b t) c h w")
x = self.norm(x)
# compute query, key, value
q, k, v = self.to_qkv(x).reshape(b * t, 1, c * 3, -1).permute(0, 1, 3, 2).contiguous().chunk(3, dim=-1)
# apply attention
x = F.scaled_dot_product_attention(
q,
k,
v,
)
x = x.squeeze(1).permute(0, 2, 1).reshape(b * t, c, h, w)
# output
x = self.proj(x)
x = rearrange(x, "(b t) c h w-> b c t h w", t=t)
return x + identity
def patchify(x, patch_size):
if patch_size == 1:
return x
if x.dim() == 4:
x = rearrange(x, "b c (h q) (w r) -> b (c r q) h w", q=patch_size, r=patch_size)
elif x.dim() == 5:
x = rearrange(
x,
"b c f (h q) (w r) -> b (c r q) f h w",
q=patch_size,
r=patch_size,
)
else:
raise ValueError(f"Invalid input shape: {x.shape}")
return x
def unpatchify(x, patch_size):
if patch_size == 1:
return x
if x.dim() == 4:
x = rearrange(x, "b (c r q) h w -> b c (h q) (w r)", q=patch_size, r=patch_size)
elif x.dim() == 5:
x = rearrange(
x,
"b (c r q) f h w -> b c f (h q) (w r)",
q=patch_size,
r=patch_size,
)
return x
class AvgDown3D(nn.Module):
def __init__(
self,
in_channels,
out_channels,
factor_t,
factor_s=1,
):
super().__init__()
self.in_channels = in_channels
self.out_channels = out_channels
self.factor_t = factor_t
self.factor_s = factor_s
self.factor = self.factor_t * self.factor_s * self.factor_s
assert in_channels * self.factor % out_channels == 0
self.group_size = in_channels * self.factor // out_channels
def forward(self, x: torch.Tensor) -> torch.Tensor:
pad_t = (self.factor_t - x.shape[2] % self.factor_t) % self.factor_t
pad = (0, 0, 0, 0, pad_t, 0)
x = F.pad(x, pad)
B, C, T, H, W = x.shape
x = x.view(
B,
C,
T // self.factor_t,
self.factor_t,
H // self.factor_s,
self.factor_s,
W // self.factor_s,
self.factor_s,
)
x = x.permute(0, 1, 3, 5, 7, 2, 4, 6).contiguous()
x = x.view(
B,
C * self.factor,
T // self.factor_t,
H // self.factor_s,
W // self.factor_s,
)
x = x.view(
B,
self.out_channels,
self.group_size,
T // self.factor_t,
H // self.factor_s,
W // self.factor_s,
)
x = x.mean(dim=2)
return x
class DupUp3D(nn.Module):
def __init__(
self,
in_channels: int,
out_channels: int,
factor_t,
factor_s=1,
):
super().__init__()
self.in_channels = in_channels
self.out_channels = out_channels
self.factor_t = factor_t
self.factor_s = factor_s
self.factor = self.factor_t * self.factor_s * self.factor_s
assert out_channels * self.factor % in_channels == 0
self.repeats = out_channels * self.factor // in_channels
def forward(self, x: torch.Tensor, first_chunk=False) -> torch.Tensor:
x = x.repeat_interleave(self.repeats, dim=1)
x = x.view(
x.size(0),
self.out_channels,
self.factor_t,
self.factor_s,
self.factor_s,
x.size(2),
x.size(3),
x.size(4),
)
x = x.permute(0, 1, 5, 2, 6, 3, 7, 4).contiguous()
x = x.view(
x.size(0),
self.out_channels,
x.size(2) * self.factor_t,
x.size(4) * self.factor_s,
x.size(6) * self.factor_s,
)
if first_chunk:
x = x[:, :, self.factor_t - 1 :, :, :]
return x
class Down_ResidualBlock(nn.Module):
def __init__(self, in_dim, out_dim, dropout, mult, temperal_downsample=False, down_flag=False):
super().__init__()
# Shortcut path with downsample
self.avg_shortcut = AvgDown3D(
in_dim,
out_dim,
factor_t=2 if temperal_downsample else 1,
factor_s=2 if down_flag else 1,
)
# Main path with residual blocks and downsample
downsamples = []
for _ in range(mult):
downsamples.append(ResidualBlock(in_dim, out_dim, dropout))
in_dim = out_dim
# Add the final downsample block
if down_flag:
mode = "downsample3d" if temperal_downsample else "downsample2d"
downsamples.append(Resample(out_dim, mode=mode))
self.downsamples = nn.Sequential(*downsamples)
def forward(self, x, feat_cache=None, feat_idx=[0]):
x_copy = x.clone()
for module in self.downsamples:
x = module(x, feat_cache, feat_idx)
return x + self.avg_shortcut(x_copy)
class Up_ResidualBlock(nn.Module):
def __init__(self, in_dim, out_dim, dropout, mult, temperal_upsample=False, up_flag=False):
super().__init__()
# Shortcut path with upsample
if up_flag:
self.avg_shortcut = DupUp3D(
in_dim,
out_dim,
factor_t=2 if temperal_upsample else 1,
factor_s=2 if up_flag else 1,
)
else:
self.avg_shortcut = None
# Main path with residual blocks and upsample
upsamples = []
for _ in range(mult):
upsamples.append(ResidualBlock(in_dim, out_dim, dropout))
in_dim = out_dim
# Add the final upsample block
if up_flag:
mode = "upsample3d" if temperal_upsample else "upsample2d"
upsamples.append(Resample(out_dim, mode=mode))
self.upsamples = nn.Sequential(*upsamples)
def forward(self, x, feat_cache=None, feat_idx=[0], first_chunk=False):
x_main = x.clone()
for module in self.upsamples:
x_main = module(x_main, feat_cache, feat_idx)
if self.avg_shortcut is not None:
x_shortcut = self.avg_shortcut(x, first_chunk)
return x_main + x_shortcut
else:
return x_main
class Encoder3d(nn.Module):
def __init__(
self,
dim=128,
z_dim=4,
dim_mult=[1, 2, 4, 4],
num_res_blocks=2,
attn_scales=[],
temperal_downsample=[True, True, False],
dropout=0.0,
):
super().__init__()
self.dim = dim
self.z_dim = z_dim
self.dim_mult = dim_mult
self.num_res_blocks = num_res_blocks
self.attn_scales = attn_scales
self.temperal_downsample = temperal_downsample
# dimensions
dims = [dim * u for u in [1] + dim_mult]
scale = 1.0
# init block
self.conv1 = CausalConv3d(12, dims[0], 3, padding=1)
# downsample blocks
downsamples = []
for i, (in_dim, out_dim) in enumerate(zip(dims[:-1], dims[1:])):
t_down_flag = temperal_downsample[i] if i < len(temperal_downsample) else False
downsamples.append(
Down_ResidualBlock(
in_dim=in_dim,
out_dim=out_dim,
dropout=dropout,
mult=num_res_blocks,
temperal_downsample=t_down_flag,
down_flag=i != len(dim_mult) - 1,
)
)
scale /= 2.0
self.downsamples = nn.Sequential(*downsamples)
# middle blocks
self.middle = nn.Sequential(
ResidualBlock(out_dim, out_dim, dropout),
AttentionBlock(out_dim),
ResidualBlock(out_dim, out_dim, dropout),
)
# output blocks
self.head = nn.Sequential(
RMS_norm(out_dim, images=False),
nn.SiLU(),
CausalConv3d(out_dim, z_dim, 3, padding=1),
)
def forward(self, x, feat_cache=None, feat_idx=[0]):
if feat_cache is not None:
idx = feat_idx[0]
cache_x = x[:, :, -CACHE_T:, :, :].clone()
if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
cache_x = torch.cat(
[
feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(cache_x.device),
cache_x,
],
dim=2,
)
x = self.conv1(x, feat_cache[idx])
feat_cache[idx] = cache_x
feat_idx[0] += 1
else:
x = self.conv1(x)
## downsamples
for layer in self.downsamples:
if feat_cache is not None:
x = layer(x, feat_cache, feat_idx)
else:
x = layer(x)
## middle
for layer in self.middle:
if isinstance(layer, ResidualBlock) and feat_cache is not None:
x = layer(x, feat_cache, feat_idx)
else:
x = layer(x)
## head
for layer in self.head:
if isinstance(layer, CausalConv3d) and feat_cache is not None:
idx = feat_idx[0]
cache_x = x[:, :, -CACHE_T:, :, :].clone()
if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
cache_x = torch.cat(
[
feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(cache_x.device),
cache_x,
],
dim=2,
)
x = layer(x, feat_cache[idx])
feat_cache[idx] = cache_x
feat_idx[0] += 1
else:
x = layer(x)
return x
class Decoder3d(nn.Module):
def __init__(
self,
dim=128,
z_dim=4,
dim_mult=[1, 2, 4, 4],
num_res_blocks=2,
attn_scales=[],
temperal_upsample=[False, True, True],
dropout=0.0,
):
super().__init__()
self.dim = dim
self.z_dim = z_dim
self.dim_mult = dim_mult
self.num_res_blocks = num_res_blocks
self.attn_scales = attn_scales
self.temperal_upsample = temperal_upsample
# dimensions
dims = [dim * u for u in [dim_mult[-1]] + dim_mult[::-1]]
scale = 1.0 / 2 ** (len(dim_mult) - 2)
# init block
self.conv1 = CausalConv3d(z_dim, dims[0], 3, padding=1)
# middle blocks
self.middle = nn.Sequential(
ResidualBlock(dims[0], dims[0], dropout),
AttentionBlock(dims[0]),
ResidualBlock(dims[0], dims[0], dropout),
)
# upsample blocks
upsamples = []
for i, (in_dim, out_dim) in enumerate(zip(dims[:-1], dims[1:])):
t_up_flag = temperal_upsample[i] if i < len(temperal_upsample) else False
upsamples.append(
Up_ResidualBlock(
in_dim=in_dim,
out_dim=out_dim,
dropout=dropout,
mult=num_res_blocks + 1,
temperal_upsample=t_up_flag,
up_flag=i != len(dim_mult) - 1,
)
)
self.upsamples = nn.Sequential(*upsamples)
# output blocks
self.head = nn.Sequential(
RMS_norm(out_dim, images=False),
nn.SiLU(),
CausalConv3d(out_dim, 12, 3, padding=1),
)
def forward(self, x, feat_cache=None, feat_idx=[0], first_chunk=False):
if feat_cache is not None:
idx = feat_idx[0]
cache_x = x[:, :, -CACHE_T:, :, :].clone()
if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
cache_x = torch.cat(
[
feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(cache_x.device),
cache_x,
],
dim=2,
)
x = self.conv1(x, feat_cache[idx])
feat_cache[idx] = cache_x
feat_idx[0] += 1
else:
x = self.conv1(x)
for layer in self.middle:
if isinstance(layer, ResidualBlock) and feat_cache is not None:
x = layer(x, feat_cache, feat_idx)
else:
x = layer(x)
## upsamples
for layer in self.upsamples:
if feat_cache is not None:
x = layer(x, feat_cache, feat_idx, first_chunk)
else:
x = layer(x)
## head
for layer in self.head:
if isinstance(layer, CausalConv3d) and feat_cache is not None:
idx = feat_idx[0]
cache_x = x[:, :, -CACHE_T:, :, :].clone()
if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
cache_x = torch.cat(
[
feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(cache_x.device),
cache_x,
],
dim=2,
)
x = layer(x, feat_cache[idx])
feat_cache[idx] = cache_x
feat_idx[0] += 1
else:
x = layer(x)
return x
def count_conv3d(model):
count = 0
for m in model.modules():
if isinstance(m, CausalConv3d):
count += 1
return count
class WanVAE_(nn.Module):
def __init__(
self,
dim=160,
dec_dim=256,
z_dim=16,
dim_mult=[1, 2, 4, 4],
num_res_blocks=2,
attn_scales=[],
temperal_downsample=[True, True, False],
dropout=0.0,
):
super().__init__()
self.dim = dim
self.z_dim = z_dim
self.dim_mult = dim_mult
self.num_res_blocks = num_res_blocks
self.attn_scales = attn_scales
self.temperal_downsample = temperal_downsample
self.temperal_upsample = temperal_downsample[::-1]
# modules
self.encoder = Encoder3d(
dim,
z_dim * 2,
dim_mult,
num_res_blocks,
attn_scales,
self.temperal_downsample,
dropout,
)
self.conv1 = CausalConv3d(z_dim * 2, z_dim * 2, 1)
self.conv2 = CausalConv3d(z_dim, z_dim, 1)
self.decoder = Decoder3d(
dec_dim,
z_dim,
dim_mult,
num_res_blocks,
attn_scales,
self.temperal_upsample,
dropout,
)
def forward(self, x, scale=[0, 1]):
mu = self.encode(x, scale)
x_recon = self.decode(mu, scale)
return x_recon, mu
def encode(self, x, scale):
self.clear_cache()
x = patchify(x, patch_size=2)
t = x.shape[2]
iter_ = 1 + (t - 1) // 4
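# frame 0 is encoded on its own, then the remaining frames in chunks of 4 (the overall temporal stride);
# feat_cache carries the causal-conv state across chunks so chunked encoding behaves like one pass over the clip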
for i in range(iter_):
self._enc_conv_idx = [0]
if i == 0:
out = self.encoder(
x[:, :, :1, :, :],
feat_cache=self._enc_feat_map,
feat_idx=self._enc_conv_idx,
)
else:
out_ = self.encoder(
x[:, :, 1 + 4 * (i - 1) : 1 + 4 * i, :, :],
feat_cache=self._enc_feat_map,
feat_idx=self._enc_conv_idx,
)
out = torch.cat([out, out_], 2)
mu, log_var = self.conv1(out).chunk(2, dim=1)
if isinstance(scale[0], torch.Tensor):
mu = (mu - scale[0].view(1, self.z_dim, 1, 1, 1)) * scale[1].view(1, self.z_dim, 1, 1, 1)
else:
mu = (mu - scale[0]) * scale[1]
self.clear_cache()
return mu
def decode(self, z, scale):
self.clear_cache()
if isinstance(scale[0], torch.Tensor):
z = z / scale[1].view(1, self.z_dim, 1, 1, 1) + scale[0].view(1, self.z_dim, 1, 1, 1)
else:
z = z / scale[1] + scale[0]
iter_ = z.shape[2]
x = self.conv2(z)
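# decode one latent frame per iteration; the causal-conv feature cache stitches the chunks into a continuous video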
for i in range(iter_):
self._conv_idx = [0]
if i == 0:
out = self.decoder(
x[:, :, i : i + 1, :, :],
feat_cache=self._feat_map,
feat_idx=self._conv_idx,
first_chunk=True,
)
else:
out_ = self.decoder(
x[:, :, i : i + 1, :, :],
feat_cache=self._feat_map,
feat_idx=self._conv_idx,
)
out = torch.cat([out, out_], 2)
out = unpatchify(out, patch_size=2)
self.clear_cache()
return out
def reparameterize(self, mu, log_var):
std = torch.exp(0.5 * log_var)
eps = torch.randn_like(std)
return eps * std + mu
def sample(self, imgs, deterministic=False):
mu, log_var = self.encode(imgs)
if deterministic:
return mu
std = torch.exp(0.5 * log_var.clamp(-30.0, 20.0))
return mu + std * torch.randn_like(std)
def clear_cache(self):
self._conv_num = count_conv3d(self.decoder)
self._conv_idx = [0]
self._feat_map = [None] * self._conv_num
# cache encode
self._enc_conv_num = count_conv3d(self.encoder)
self._enc_conv_idx = [0]
self._enc_feat_map = [None] * self._enc_conv_num
def _video_vae(pretrained_path=None, z_dim=16, dim=160, device="cpu", **kwargs):
# params
cfg = dict(
dim=dim,
z_dim=z_dim,
dim_mult=[1, 2, 4, 4],
num_res_blocks=2,
attn_scales=[],
temperal_downsample=[True, True, True],
dropout=0.0,
)
cfg.update(**kwargs)
# init model
with torch.device("meta"):
model = WanVAE_(**cfg)
# load checkpoint
logging.info(f"loading {pretrained_path}")
model.load_state_dict(torch.load(pretrained_path, map_location=device), assign=True)
return model
class Wan2_2_VAE:
def __init__(
self,
z_dim=48,
c_dim=160,
vae_pth=None,
dim_mult=[1, 2, 4, 4],
temperal_downsample=[False, True, True],
dtype=torch.float,
device="cuda",
):
self.dtype = dtype
self.device = device
mean = torch.tensor(
[
-0.2289,
-0.0052,
-0.1323,
-0.2339,
-0.2799,
0.0174,
0.1838,
0.1557,
-0.1382,
0.0542,
0.2813,
0.0891,
0.1570,
-0.0098,
0.0375,
-0.1825,
-0.2246,
-0.1207,
-0.0698,
0.5109,
0.2665,
-0.2108,
-0.2158,
0.2502,
-0.2055,
-0.0322,
0.1109,
0.1567,
-0.0729,
0.0899,
-0.2799,
-0.1230,
-0.0313,
-0.1649,
0.0117,
0.0723,
-0.2839,
-0.2083,
-0.0520,
0.3748,
0.0152,
0.1957,
0.1433,
-0.2944,
0.3573,
-0.0548,
-0.1681,
-0.0667,
],
dtype=dtype,
device=device,
)
std = torch.tensor(
[
0.4765,
1.0364,
0.4514,
1.1677,
0.5313,
0.4990,
0.4818,
0.5013,
0.8158,
1.0344,
0.5894,
1.0901,
0.6885,
0.6165,
0.8454,
0.4978,
0.5759,
0.3523,
0.7135,
0.6804,
0.5833,
1.4146,
0.8986,
0.5659,
0.7069,
0.5338,
0.4889,
0.4917,
0.4069,
0.4999,
0.6866,
0.4093,
0.5709,
0.6065,
0.6415,
0.4944,
0.5726,
1.2042,
0.5458,
1.6887,
0.3971,
1.0600,
0.3943,
0.5537,
0.5444,
0.4089,
0.7468,
0.7744,
],
dtype=dtype,
device=device,
)
self.scale = [mean, 1.0 / std]
# init model
self.model = (
_video_vae(
pretrained_path=vae_pth,
z_dim=z_dim,
dim=c_dim,
dim_mult=dim_mult,
temperal_downsample=temperal_downsample,
)
.eval()
.requires_grad_(False)
.to(device)
)
def encode(self, videos):
# expects a single video tensor of shape (C, T, H, W); the list-based Wan2.1 encode API is not supported here
return self.model.encode(videos.unsqueeze(0), self.scale).float().squeeze(0)
def decode(self, zs, generator, config):
return self.model.decode(zs.unsqueeze(0), self.scale).float().clamp_(-1, 1)
......@@ -294,3 +294,43 @@ def find_hf_model_path(config, ckpt_config_key=None, subdir=["original", "fp8",
logger.info(f"Found Hugging Face model files in: {path}")
return path
raise FileNotFoundError(f"No Hugging Face model files (.safetensors) found.\nPlease download the model from: https://huggingface.co/lightx2v/ or specify the model path in the configuration file.")
def masks_like(tensor, zero=False, generator=None, p=0.2):
assert isinstance(tensor, torch.Tensor)
out = torch.ones_like(tensor)
if zero:
if generator is not None:
# draw a random number to decide whether the first frame should be zeroed
random_num = torch.rand(1, generator=generator, device=generator.device).item()
if random_num < p:
out[:, 0] = torch.zeros_like(out[:, 0])
else:
out[:, 0] = torch.zeros_like(out[:, 0])
return out
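# A minimal sketch (hypothetical helper; shapes taken from the ti2v 704x1280 / 121-frame config above) of how
# the wan2.2 scheduler consumes this mask for i2v: the zeroed first latent frame is pinned to the VAE-encoded
# conditioning image while the remaining frames stay as noise.
def _example_i2v_latent_blend():
    latents = torch.randn(48, 31, 44, 80)         # (C, T, H, W): 48 latent channels, (121 - 1) / 4 + 1 = 31 frames
    vae_encoder_out = torch.randn(48, 1, 44, 80)  # encoded conditioning image (a single latent frame)
    mask = masks_like(latents, zero=True)         # ones everywhere, zeros on frame 0
    return (1.0 - mask) * vae_encoder_out + mask * latents  # frame 0 <- condition, other frames <- noise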
def best_output_size(w, h, dw, dh, expected_area):
# float output size
ratio = w / h
ow = (expected_area * ratio) ** 0.5
oh = expected_area / ow
# process width first
ow1 = int(ow // dw * dw)
oh1 = int(expected_area / ow1 // dh * dh)
assert ow1 % dw == 0 and oh1 % dh == 0 and ow1 * oh1 <= expected_area
ratio1 = ow1 / oh1
# process height first
oh2 = int(oh // dh * dh)
ow2 = int(expected_area / oh2 // dw * dw)
assert oh2 % dh == 0 and ow2 % dw == 0 and ow2 * oh2 <= expected_area
ratio2 = ow2 / oh2
# compare ratios
if max(ratio / ratio1, ratio1 / ratio) < max(ratio / ratio2, ratio2 / ratio):
return ow1, oh1
else:
return ow2, oh2
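# Worked example (values assumed from the ti2v config: patch_size [1, 2, 2] and vae_stride [4, 16, 16],
# so dw = dh = 2 * 16 = 32 and expected_area = 704 * 1280 = 901120). For a 1280x720 input, the function
# picks the size on the 32-pixel grid whose aspect ratio is closest to 16:9 while fitting the pixel budget:
#   best_output_size(1280, 720, 32, 32, 704 * 1280)  ->  (1248, 704)
# run_vae_encoder then derives the latent grid as lat_w = 1248 // 16 = 78 and lat_h = 704 // 16 = 44.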
#!/bin/bash
# set these paths first
lightx2v_path=
model_path=
# check section
if [ -z "${CUDA_VISIBLE_DEVICES}" ]; then
cuda_devices=5
echo "Warn: CUDA_VISIBLE_DEVICES is not set, using default value: ${cuda_devices}, change at shell script or set env variable."
export CUDA_VISIBLE_DEVICES=${cuda_devices}
fi
if [ -z "${lightx2v_path}" ]; then
echo "Error: lightx2v_path is not set. Please set this variable first."
exit 1
fi
if [ -z "${model_path}" ]; then
echo "Error: model_path is not set. Please set this variable first."
exit 1
fi
export TOKENIZERS_PARALLELISM=false
export PYTHONPATH=${lightx2v_path}:$PYTHONPATH
export DTYPE=BF16
export ENABLE_PROFILING_DEBUG=true
export ENABLE_GRAPH_MODE=false
python -m lightx2v.infer \
--model_cls wan2.2 \
--task i2v \
--model_path $model_path \
--config_json ${lightx2v_path}/configs/wan22/wan_ti2v_i2v.json \
--prompt "Summer beach vacation style, a white cat wearing sunglasses sits on a surfboard. The fluffy-furred feline gazes directly at the camera with a relaxed expression. Blurred beach scenery forms the background featuring crystal-clear waters, distant green hills, and a blue sky dotted with white clouds. The cat assumes a naturally relaxed posture, as if savoring the sea breeze and warm sunlight. A close-up shot highlights the feline's intricate details and the refreshing atmosphere of the seaside." \
--negative_prompt "色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" \
--image_path ${lightx2v_path}/assets/inputs/imgs/img_0.jpg \
--save_video_path ${lightx2v_path}/save_results/output_lightx2v_wan22_i2v.mp4
#!/bin/bash
# set these paths first
lightx2v_path=
model_path=
# check section
if [ -z "${CUDA_VISIBLE_DEVICES}" ]; then
cuda_devices=1
echo "Warn: CUDA_VISIBLE_DEVICES is not set, using default value: ${cuda_devices}, change at shell script or set env variable."
export CUDA_VISIBLE_DEVICES=${cuda_devices}
fi
if [ -z "${lightx2v_path}" ]; then
echo "Error: lightx2v_path is not set. Please set this variable first."
exit 1
fi
if [ -z "${model_path}" ]; then
echo "Error: model_path is not set. Please set this variable first."
exit 1
fi
export TOKENIZERS_PARALLELISM=false
export PYTHONPATH=${lightx2v_path}:$PYTHONPATH
export DTYPE=BF16
export ENABLE_PROFILING_DEBUG=true
export ENABLE_GRAPH_MODE=false
python -m lightx2v.infer \
--model_cls wan2.2 \
--task t2v \
--model_path $model_path \
--config_json ${lightx2v_path}/configs/wan22/wan_ti2v_t2v.json \
--prompt "Two anthropomorphic cats in comfy boxing gear and bright gloves fight intensely on a spotlighted stage" \
--negative_prompt "色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" \
--save_video_path ${lightx2v_path}/save_results/output_lightx2v_wan22_dense_t2v.mp4