Unverified Commit d7ffe601 authored by Aryan's avatar Aryan Committed by GitHub
Browse files

Hunyuan Video Framepack (#11428)

* add transformer

* add pipeline

* fixes

* make fix-copies

* update

* add flux mu shift

* update example snippet

* debug

* cleanup

* batch_size=1 optimization

* add pipeline test

* fix for model cpu offloading'

* add last_image support; credits: https://github.com/lllyasviel/FramePack/pull/167

* update example with flf2v

* update penguin url

* fix test

* address review comment: https://github.com/huggingface/diffusers/pull/11428#discussion_r2071032371

* address review comment: https://github.com/huggingface/diffusers/pull/11428#discussion_r2071087689



* Update src/diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video_framepack.py

---------
Co-authored-by: default avatarLinoy Tsaban <57615435+linoytsaban@users.noreply.github.com>
parent 10bee525
......@@ -52,6 +52,7 @@ The following models are available for the image-to-video pipeline:
| [`Skywork/SkyReels-V1-Hunyuan-I2V`](https://huggingface.co/Skywork/SkyReels-V1-Hunyuan-I2V) | Skywork's custom finetune of HunyuanVideo (de-distilled). Performs best with `97x544x960` resolution. Performs best at `97x544x960` resolution, `guidance_scale=1.0`, `true_cfg_scale=6.0` and a negative prompt. |
| [`hunyuanvideo-community/HunyuanVideo-I2V-33ch`](https://huggingface.co/hunyuanvideo-community/HunyuanVideo-I2V) | Tecent's official HunyuanVideo 33-channel I2V model. Performs best at resolutions of 480, 720, 960, 1280. A higher `shift` value when initializing the scheduler is recommended (good values are between 7 and 20). |
| [`hunyuanvideo-community/HunyuanVideo-I2V`](https://huggingface.co/hunyuanvideo-community/HunyuanVideo-I2V) | Tecent's official HunyuanVideo 16-channel I2V model. Performs best at resolutions of 480, 720, 960, 1280. A higher `shift` value when initializing the scheduler is recommended (good values are between 7 and 20) |
- [`lllyasviel/FramePackI2V_HY`](https://huggingface.co/lllyasviel/FramePackI2V_HY) | lllyasviel's paper introducing a new technique for long-context video generation called [Framepack](https://arxiv.org/abs/2504.12626). |
## Quantization
......
......@@ -175,6 +175,7 @@ else:
"HunyuanDiT2DControlNetModel",
"HunyuanDiT2DModel",
"HunyuanDiT2DMultiControlNetModel",
"HunyuanVideoFramepackTransformer3DModel",
"HunyuanVideoTransformer3DModel",
"I2VGenXLUNet",
"Kandinsky3UNet",
......@@ -376,6 +377,7 @@ else:
"HunyuanDiTPAGPipeline",
"HunyuanDiTPipeline",
"HunyuanSkyreelsImageToVideoPipeline",
"HunyuanVideoFramepackPipeline",
"HunyuanVideoImageToVideoPipeline",
"HunyuanVideoPipeline",
"I2VGenXLPipeline",
......@@ -770,6 +772,7 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
HunyuanDiT2DControlNetModel,
HunyuanDiT2DModel,
HunyuanDiT2DMultiControlNetModel,
HunyuanVideoFramepackTransformer3DModel,
HunyuanVideoTransformer3DModel,
I2VGenXLUNet,
Kandinsky3UNet,
......@@ -950,6 +953,7 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
HunyuanDiTPAGPipeline,
HunyuanDiTPipeline,
HunyuanSkyreelsImageToVideoPipeline,
HunyuanVideoFramepackPipeline,
HunyuanVideoImageToVideoPipeline,
HunyuanVideoPipeline,
I2VGenXLPipeline,
......
......@@ -79,6 +79,7 @@ if is_torch_available():
_import_structure["transformers.transformer_flux"] = ["FluxTransformer2DModel"]
_import_structure["transformers.transformer_hidream_image"] = ["HiDreamImageTransformer2DModel"]
_import_structure["transformers.transformer_hunyuan_video"] = ["HunyuanVideoTransformer3DModel"]
_import_structure["transformers.transformer_hunyuan_video_framepack"] = ["HunyuanVideoFramepackTransformer3DModel"]
_import_structure["transformers.transformer_ltx"] = ["LTXVideoTransformer3DModel"]
_import_structure["transformers.transformer_lumina2"] = ["Lumina2Transformer2DModel"]
_import_structure["transformers.transformer_mochi"] = ["MochiTransformer3DModel"]
......@@ -156,6 +157,7 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
FluxTransformer2DModel,
HiDreamImageTransformer2DModel,
HunyuanDiT2DModel,
HunyuanVideoFramepackTransformer3DModel,
HunyuanVideoTransformer3DModel,
LatteTransformer3DModel,
LTXVideoTransformer3DModel,
......
......@@ -23,6 +23,7 @@ if is_torch_available():
from .transformer_flux import FluxTransformer2DModel
from .transformer_hidream_image import HiDreamImageTransformer2DModel
from .transformer_hunyuan_video import HunyuanVideoTransformer3DModel
from .transformer_hunyuan_video_framepack import HunyuanVideoFramepackTransformer3DModel
from .transformer_ltx import LTXVideoTransformer3DModel
from .transformer_lumina2 import Lumina2Transformer2DModel
from .transformer_mochi import MochiTransformer3DModel
......
# Copyright 2025 The Framepack Team, The Hunyuan Team and The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Any, Dict, List, Optional, Tuple
import torch
import torch.nn as nn
import torch.nn.functional as F
from ...configuration_utils import ConfigMixin, register_to_config
from ...loaders import FromOriginalModelMixin, PeftAdapterMixin
from ...utils import USE_PEFT_BACKEND, get_logger, scale_lora_layers, unscale_lora_layers
from ..cache_utils import CacheMixin
from ..embeddings import get_1d_rotary_pos_embed
from ..modeling_outputs import Transformer2DModelOutput
from ..modeling_utils import ModelMixin
from ..normalization import AdaLayerNormContinuous
from .transformer_hunyuan_video import (
HunyuanVideoConditionEmbedding,
HunyuanVideoPatchEmbed,
HunyuanVideoSingleTransformerBlock,
HunyuanVideoTokenRefiner,
HunyuanVideoTransformerBlock,
)
logger = get_logger(__name__) # pylint: disable=invalid-name
class HunyuanVideoFramepackRotaryPosEmbed(nn.Module):
def __init__(self, patch_size: int, patch_size_t: int, rope_dim: List[int], theta: float = 256.0) -> None:
super().__init__()
self.patch_size = patch_size
self.patch_size_t = patch_size_t
self.rope_dim = rope_dim
self.theta = theta
def forward(self, frame_indices: torch.Tensor, height: int, width: int, device: torch.device):
height = height // self.patch_size
width = width // self.patch_size
grid = torch.meshgrid(
frame_indices.to(device=device, dtype=torch.float32),
torch.arange(0, height, device=device, dtype=torch.float32),
torch.arange(0, width, device=device, dtype=torch.float32),
indexing="ij",
) # 3 * [W, H, T]
grid = torch.stack(grid, dim=0) # [3, W, H, T]
freqs = []
for i in range(3):
freq = get_1d_rotary_pos_embed(self.rope_dim[i], grid[i].reshape(-1), self.theta, use_real=True)
freqs.append(freq)
freqs_cos = torch.cat([f[0] for f in freqs], dim=1) # (W * H * T, D / 2)
freqs_sin = torch.cat([f[1] for f in freqs], dim=1) # (W * H * T, D / 2)
return freqs_cos, freqs_sin
class FramepackClipVisionProjection(nn.Module):
def __init__(self, in_channels: int, out_channels: int):
super().__init__()
self.up = nn.Linear(in_channels, out_channels * 3)
self.down = nn.Linear(out_channels * 3, out_channels)
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
hidden_states = self.up(hidden_states)
hidden_states = F.silu(hidden_states)
hidden_states = self.down(hidden_states)
return hidden_states
class HunyuanVideoHistoryPatchEmbed(nn.Module):
def __init__(self, in_channels: int, inner_dim: int):
super().__init__()
self.proj = nn.Conv3d(in_channels, inner_dim, kernel_size=(1, 2, 2), stride=(1, 2, 2))
self.proj_2x = nn.Conv3d(in_channels, inner_dim, kernel_size=(2, 4, 4), stride=(2, 4, 4))
self.proj_4x = nn.Conv3d(in_channels, inner_dim, kernel_size=(4, 8, 8), stride=(4, 8, 8))
def forward(
self,
latents_clean: Optional[torch.Tensor] = None,
latents_clean_2x: Optional[torch.Tensor] = None,
latents_clean_4x: Optional[torch.Tensor] = None,
):
if latents_clean is not None:
latents_clean = self.proj(latents_clean)
latents_clean = latents_clean.flatten(2).transpose(1, 2)
if latents_clean_2x is not None:
latents_clean_2x = _pad_for_3d_conv(latents_clean_2x, (2, 4, 4))
latents_clean_2x = self.proj_2x(latents_clean_2x)
latents_clean_2x = latents_clean_2x.flatten(2).transpose(1, 2)
if latents_clean_4x is not None:
latents_clean_4x = _pad_for_3d_conv(latents_clean_4x, (4, 8, 8))
latents_clean_4x = self.proj_4x(latents_clean_4x)
latents_clean_4x = latents_clean_4x.flatten(2).transpose(1, 2)
return latents_clean, latents_clean_2x, latents_clean_4x
class HunyuanVideoFramepackTransformer3DModel(
ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginalModelMixin, CacheMixin
):
_supports_gradient_checkpointing = True
_skip_layerwise_casting_patterns = ["x_embedder", "context_embedder", "norm"]
_no_split_modules = [
"HunyuanVideoTransformerBlock",
"HunyuanVideoSingleTransformerBlock",
"HunyuanVideoHistoryPatchEmbed",
"HunyuanVideoTokenRefiner",
]
@register_to_config
def __init__(
self,
in_channels: int = 16,
out_channels: int = 16,
num_attention_heads: int = 24,
attention_head_dim: int = 128,
num_layers: int = 20,
num_single_layers: int = 40,
num_refiner_layers: int = 2,
mlp_ratio: float = 4.0,
patch_size: int = 2,
patch_size_t: int = 1,
qk_norm: str = "rms_norm",
guidance_embeds: bool = True,
text_embed_dim: int = 4096,
pooled_projection_dim: int = 768,
rope_theta: float = 256.0,
rope_axes_dim: Tuple[int] = (16, 56, 56),
image_condition_type: Optional[str] = None,
has_image_proj: int = False,
image_proj_dim: int = 1152,
has_clean_x_embedder: int = False,
) -> None:
super().__init__()
inner_dim = num_attention_heads * attention_head_dim
out_channels = out_channels or in_channels
# 1. Latent and condition embedders
self.x_embedder = HunyuanVideoPatchEmbed((patch_size_t, patch_size, patch_size), in_channels, inner_dim)
self.context_embedder = HunyuanVideoTokenRefiner(
text_embed_dim, num_attention_heads, attention_head_dim, num_layers=num_refiner_layers
)
self.time_text_embed = HunyuanVideoConditionEmbedding(
inner_dim, pooled_projection_dim, guidance_embeds, image_condition_type
)
# 2. RoPE
self.rope = HunyuanVideoFramepackRotaryPosEmbed(patch_size, patch_size_t, rope_axes_dim, rope_theta)
# 3. Dual stream transformer blocks
self.transformer_blocks = nn.ModuleList(
[
HunyuanVideoTransformerBlock(
num_attention_heads, attention_head_dim, mlp_ratio=mlp_ratio, qk_norm=qk_norm
)
for _ in range(num_layers)
]
)
# 4. Single stream transformer blocks
self.single_transformer_blocks = nn.ModuleList(
[
HunyuanVideoSingleTransformerBlock(
num_attention_heads, attention_head_dim, mlp_ratio=mlp_ratio, qk_norm=qk_norm
)
for _ in range(num_single_layers)
]
)
# 5. Output projection
self.norm_out = AdaLayerNormContinuous(inner_dim, inner_dim, elementwise_affine=False, eps=1e-6)
self.proj_out = nn.Linear(inner_dim, patch_size_t * patch_size * patch_size * out_channels)
# Framepack specific modules
self.image_projection = FramepackClipVisionProjection(image_proj_dim, inner_dim) if has_image_proj else None
self.clean_x_embedder = None
if has_clean_x_embedder:
self.clean_x_embedder = HunyuanVideoHistoryPatchEmbed(in_channels, inner_dim)
self.use_gradient_checkpointing = False
def forward(
self,
hidden_states: torch.Tensor,
timestep: torch.LongTensor,
encoder_hidden_states: torch.Tensor,
encoder_attention_mask: torch.Tensor,
pooled_projections: torch.Tensor,
image_embeds: torch.Tensor,
indices_latents: torch.Tensor,
guidance: Optional[torch.Tensor] = None,
latents_clean: Optional[torch.Tensor] = None,
indices_latents_clean: Optional[torch.Tensor] = None,
latents_history_2x: Optional[torch.Tensor] = None,
indices_latents_history_2x: Optional[torch.Tensor] = None,
latents_history_4x: Optional[torch.Tensor] = None,
indices_latents_history_4x: Optional[torch.Tensor] = None,
attention_kwargs: Optional[Dict[str, Any]] = None,
return_dict: bool = True,
):
if attention_kwargs is not None:
attention_kwargs = attention_kwargs.copy()
lora_scale = attention_kwargs.pop("scale", 1.0)
else:
lora_scale = 1.0
if USE_PEFT_BACKEND:
# weight the lora layers by setting `lora_scale` for each PEFT layer
scale_lora_layers(self, lora_scale)
else:
if attention_kwargs is not None and attention_kwargs.get("scale", None) is not None:
logger.warning(
"Passing `scale` via `attention_kwargs` when not using the PEFT backend is ineffective."
)
batch_size, num_channels, num_frames, height, width = hidden_states.shape
p, p_t = self.config.patch_size, self.config.patch_size_t
post_patch_num_frames = num_frames // p_t
post_patch_height = height // p
post_patch_width = width // p
original_context_length = post_patch_num_frames * post_patch_height * post_patch_width
if indices_latents is None:
indices_latents = torch.arange(0, num_frames).unsqueeze(0).expand(batch_size, -1)
hidden_states = self.x_embedder(hidden_states)
image_rotary_emb = self.rope(
frame_indices=indices_latents, height=height, width=width, device=hidden_states.device
)
latents_clean, latents_history_2x, latents_history_4x = self.clean_x_embedder(
latents_clean, latents_history_2x, latents_history_4x
)
if latents_clean is not None and indices_latents_clean is not None:
image_rotary_emb_clean = self.rope(
frame_indices=indices_latents_clean, height=height, width=width, device=hidden_states.device
)
if latents_history_2x is not None and indices_latents_history_2x is not None:
image_rotary_emb_history_2x = self.rope(
frame_indices=indices_latents_history_2x, height=height, width=width, device=hidden_states.device
)
if latents_history_4x is not None and indices_latents_history_4x is not None:
image_rotary_emb_history_4x = self.rope(
frame_indices=indices_latents_history_4x, height=height, width=width, device=hidden_states.device
)
hidden_states, image_rotary_emb = self._pack_history_states(
hidden_states,
latents_clean,
latents_history_2x,
latents_history_4x,
image_rotary_emb,
image_rotary_emb_clean,
image_rotary_emb_history_2x,
image_rotary_emb_history_4x,
post_patch_height,
post_patch_width,
)
temb, _ = self.time_text_embed(timestep, pooled_projections, guidance)
encoder_hidden_states = self.context_embedder(encoder_hidden_states, timestep, encoder_attention_mask)
encoder_hidden_states_image = self.image_projection(image_embeds)
attention_mask_image = encoder_attention_mask.new_ones((batch_size, encoder_hidden_states_image.shape[1]))
# must cat before (not after) encoder_hidden_states, due to attn masking
encoder_hidden_states = torch.cat([encoder_hidden_states_image, encoder_hidden_states], dim=1)
encoder_attention_mask = torch.cat([attention_mask_image, encoder_attention_mask], dim=1)
latent_sequence_length = hidden_states.shape[1]
condition_sequence_length = encoder_hidden_states.shape[1]
sequence_length = latent_sequence_length + condition_sequence_length
attention_mask = torch.zeros(
batch_size, sequence_length, device=hidden_states.device, dtype=torch.bool
) # [B, N]
effective_condition_sequence_length = encoder_attention_mask.sum(dim=1, dtype=torch.int) # [B,]
effective_sequence_length = latent_sequence_length + effective_condition_sequence_length
if batch_size == 1:
encoder_hidden_states = encoder_hidden_states[:, : effective_condition_sequence_length[0]]
attention_mask = None
else:
for i in range(batch_size):
attention_mask[i, : effective_sequence_length[i]] = True
# [B, 1, 1, N], for broadcasting across attention heads
attention_mask = attention_mask.unsqueeze(1).unsqueeze(1)
if torch.is_grad_enabled() and self.gradient_checkpointing:
for block in self.transformer_blocks:
hidden_states, encoder_hidden_states = self._gradient_checkpointing_func(
block, hidden_states, encoder_hidden_states, temb, attention_mask, image_rotary_emb
)
for block in self.single_transformer_blocks:
hidden_states, encoder_hidden_states = self._gradient_checkpointing_func(
block, hidden_states, encoder_hidden_states, temb, attention_mask, image_rotary_emb
)
else:
for block in self.transformer_blocks:
hidden_states, encoder_hidden_states = block(
hidden_states, encoder_hidden_states, temb, attention_mask, image_rotary_emb
)
for block in self.single_transformer_blocks:
hidden_states, encoder_hidden_states = block(
hidden_states, encoder_hidden_states, temb, attention_mask, image_rotary_emb
)
hidden_states = hidden_states[:, -original_context_length:]
hidden_states = self.norm_out(hidden_states, temb)
hidden_states = self.proj_out(hidden_states)
hidden_states = hidden_states.reshape(
batch_size, post_patch_num_frames, post_patch_height, post_patch_width, -1, p_t, p, p
)
hidden_states = hidden_states.permute(0, 4, 1, 5, 2, 6, 3, 7)
hidden_states = hidden_states.flatten(6, 7).flatten(4, 5).flatten(2, 3)
if USE_PEFT_BACKEND:
# remove `lora_scale` from each PEFT layer
unscale_lora_layers(self, lora_scale)
if not return_dict:
return (hidden_states,)
return Transformer2DModelOutput(sample=hidden_states)
def _pack_history_states(
self,
hidden_states: torch.Tensor,
latents_clean: Optional[torch.Tensor] = None,
latents_history_2x: Optional[torch.Tensor] = None,
latents_history_4x: Optional[torch.Tensor] = None,
image_rotary_emb: Tuple[torch.Tensor, torch.Tensor] = None,
image_rotary_emb_clean: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
image_rotary_emb_history_2x: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
image_rotary_emb_history_4x: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
height: int = None,
width: int = None,
):
image_rotary_emb = list(image_rotary_emb) # convert tuple to list for in-place modification
if latents_clean is not None and image_rotary_emb_clean is not None:
hidden_states = torch.cat([latents_clean, hidden_states], dim=1)
image_rotary_emb[0] = torch.cat([image_rotary_emb_clean[0], image_rotary_emb[0]], dim=0)
image_rotary_emb[1] = torch.cat([image_rotary_emb_clean[1], image_rotary_emb[1]], dim=0)
if latents_history_2x is not None and image_rotary_emb_history_2x is not None:
hidden_states = torch.cat([latents_history_2x, hidden_states], dim=1)
image_rotary_emb_history_2x = self._pad_rotary_emb(image_rotary_emb_history_2x, height, width, (2, 2, 2))
image_rotary_emb[0] = torch.cat([image_rotary_emb_history_2x[0], image_rotary_emb[0]], dim=0)
image_rotary_emb[1] = torch.cat([image_rotary_emb_history_2x[1], image_rotary_emb[1]], dim=0)
if latents_history_4x is not None and image_rotary_emb_history_4x is not None:
hidden_states = torch.cat([latents_history_4x, hidden_states], dim=1)
image_rotary_emb_history_4x = self._pad_rotary_emb(image_rotary_emb_history_4x, height, width, (4, 4, 4))
image_rotary_emb[0] = torch.cat([image_rotary_emb_history_4x[0], image_rotary_emb[0]], dim=0)
image_rotary_emb[1] = torch.cat([image_rotary_emb_history_4x[1], image_rotary_emb[1]], dim=0)
return hidden_states, tuple(image_rotary_emb)
def _pad_rotary_emb(
self,
image_rotary_emb: Tuple[torch.Tensor],
height: int,
width: int,
kernel_size: Tuple[int, int, int],
):
# freqs_cos, freqs_sin have shape [W * H * T, D / 2], where D is attention head dim
freqs_cos, freqs_sin = image_rotary_emb
freqs_cos = freqs_cos.unsqueeze(0).permute(0, 2, 1).unflatten(2, (-1, height, width))
freqs_sin = freqs_sin.unsqueeze(0).permute(0, 2, 1).unflatten(2, (-1, height, width))
freqs_cos = _pad_for_3d_conv(freqs_cos, kernel_size)
freqs_sin = _pad_for_3d_conv(freqs_sin, kernel_size)
freqs_cos = _center_down_sample_3d(freqs_cos, kernel_size)
freqs_sin = _center_down_sample_3d(freqs_sin, kernel_size)
freqs_cos = freqs_cos.flatten(2).permute(0, 2, 1).squeeze(0)
freqs_sin = freqs_sin.flatten(2).permute(0, 2, 1).squeeze(0)
return freqs_cos, freqs_sin
def _pad_for_3d_conv(x, kernel_size):
if isinstance(x, (tuple, list)):
return tuple(_pad_for_3d_conv(i, kernel_size) for i in x)
b, c, t, h, w = x.shape
pt, ph, pw = kernel_size
pad_t = (pt - (t % pt)) % pt
pad_h = (ph - (h % ph)) % ph
pad_w = (pw - (w % pw)) % pw
return torch.nn.functional.pad(x, (0, pad_w, 0, pad_h, 0, pad_t), mode="replicate")
def _center_down_sample_3d(x, kernel_size):
if isinstance(x, (tuple, list)):
return tuple(_center_down_sample_3d(i, kernel_size) for i in x)
return torch.nn.functional.avg_pool3d(x, kernel_size, stride=kernel_size)
......@@ -227,6 +227,7 @@ else:
"HunyuanVideoPipeline",
"HunyuanSkyreelsImageToVideoPipeline",
"HunyuanVideoImageToVideoPipeline",
"HunyuanVideoFramepackPipeline",
]
_import_structure["kandinsky"] = [
"KandinskyCombinedPipeline",
......@@ -589,6 +590,7 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
from .hidream_image import HiDreamImagePipeline
from .hunyuan_video import (
HunyuanSkyreelsImageToVideoPipeline,
HunyuanVideoFramepackPipeline,
HunyuanVideoImageToVideoPipeline,
HunyuanVideoPipeline,
)
......
......@@ -24,6 +24,7 @@ except OptionalDependencyNotAvailable:
else:
_import_structure["pipeline_hunyuan_skyreels_image2video"] = ["HunyuanSkyreelsImageToVideoPipeline"]
_import_structure["pipeline_hunyuan_video"] = ["HunyuanVideoPipeline"]
_import_structure["pipeline_hunyuan_video_framepack"] = ["HunyuanVideoFramepackPipeline"]
_import_structure["pipeline_hunyuan_video_image2video"] = ["HunyuanVideoImageToVideoPipeline"]
if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
......@@ -36,6 +37,7 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
else:
from .pipeline_hunyuan_skyreels_image2video import HunyuanSkyreelsImageToVideoPipeline
from .pipeline_hunyuan_video import HunyuanVideoPipeline
from .pipeline_hunyuan_video_framepack import HunyuanVideoFramepackPipeline
from .pipeline_hunyuan_video_image2video import HunyuanVideoImageToVideoPipeline
else:
......
# Copyright 2025 The Framepack Team, The HunyuanVideo Team and The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import inspect
import math
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
import numpy as np
import torch
from transformers import (
CLIPTextModel,
CLIPTokenizer,
LlamaModel,
LlamaTokenizerFast,
SiglipImageProcessor,
SiglipVisionModel,
)
from ...callbacks import MultiPipelineCallbacks, PipelineCallback
from ...image_processor import PipelineImageInput
from ...loaders import HunyuanVideoLoraLoaderMixin
from ...models import AutoencoderKLHunyuanVideo, HunyuanVideoFramepackTransformer3DModel
from ...schedulers import FlowMatchEulerDiscreteScheduler
from ...utils import is_torch_xla_available, logging, replace_example_docstring
from ...utils.torch_utils import randn_tensor
from ...video_processor import VideoProcessor
from ..pipeline_utils import DiffusionPipeline
from .pipeline_output import HunyuanVideoFramepackPipelineOutput
if is_torch_xla_available():
import torch_xla.core.xla_model as xm
XLA_AVAILABLE = True
else:
XLA_AVAILABLE = False
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
# TODO(yiyi): We can pack the checkpoints nicely with modular loader
EXAMPLE_DOC_STRING = """
Examples:
##### Image-to-Video
```python
>>> import torch
>>> from diffusers import HunyuanVideoFramepackPipeline, HunyuanVideoFramepackTransformer3DModel
>>> from diffusers.utils import export_to_video, load_image
>>> from transformers import SiglipImageProcessor, SiglipVisionModel
>>> transformer = HunyuanVideoFramepackTransformer3DModel.from_pretrained(
... "lllyasviel/FramePackI2V_HY", torch_dtype=torch.bfloat16
... )
>>> feature_extractor = SiglipImageProcessor.from_pretrained(
... "lllyasviel/flux_redux_bfl", subfolder="feature_extractor"
... )
>>> image_encoder = SiglipVisionModel.from_pretrained(
... "lllyasviel/flux_redux_bfl", subfolder="image_encoder", torch_dtype=torch.float16
... )
>>> pipe = HunyuanVideoFramepackPipeline.from_pretrained(
... "hunyuanvideo-community/HunyuanVideo",
... transformer=transformer,
... feature_extractor=feature_extractor,
... image_encoder=image_encoder,
... torch_dtype=torch.float16,
... )
>>> pipe.vae.enable_tiling()
>>> pipe.to("cuda")
>>> image = load_image(
... "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/penguin.png"
... )
>>> output = pipe(
... image=image,
... prompt="A penguin dancing in the snow",
... height=832,
... width=480,
... num_frames=91,
... num_inference_steps=30,
... guidance_scale=9.0,
... generator=torch.Generator().manual_seed(0),
... ).frames[0]
>>> export_to_video(output, "output.mp4", fps=30)
```
##### First and Last Image-to-Video
```python
>>> import torch
>>> from diffusers import HunyuanVideoFramepackPipeline, HunyuanVideoFramepackTransformer3DModel
>>> from diffusers.utils import export_to_video, load_image
>>> from transformers import SiglipImageProcessor, SiglipVisionModel
>>> transformer = HunyuanVideoFramepackTransformer3DModel.from_pretrained(
... "lllyasviel/FramePackI2V_HY", torch_dtype=torch.bfloat16
... )
>>> feature_extractor = SiglipImageProcessor.from_pretrained(
... "lllyasviel/flux_redux_bfl", subfolder="feature_extractor"
... )
>>> image_encoder = SiglipVisionModel.from_pretrained(
... "lllyasviel/flux_redux_bfl", subfolder="image_encoder", torch_dtype=torch.float16
... )
>>> pipe = HunyuanVideoFramepackPipeline.from_pretrained(
... "hunyuanvideo-community/HunyuanVideo",
... transformer=transformer,
... feature_extractor=feature_extractor,
... image_encoder=image_encoder,
... torch_dtype=torch.float16,
... )
>>> pipe.to("cuda")
>>> prompt = "CG animation style, a small blue bird takes off from the ground, flapping its wings. The bird's feathers are delicate, with a unique pattern on its chest. The background shows a blue sky with white clouds under bright sunshine. The camera follows the bird upward, capturing its flight and the vastness of the sky from a close-up, low-angle perspective."
>>> first_image = load_image(
... "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/flf2v_input_first_frame.png"
... )
>>> last_image = load_image(
... "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/flf2v_input_last_frame.png"
... )
>>> output = pipe(
... image=first_image,
... last_image=last_image,
... prompt=prompt,
... height=512,
... width=512,
... num_frames=91,
... num_inference_steps=30,
... guidance_scale=9.0,
... generator=torch.Generator().manual_seed(0),
... ).frames[0]
>>> export_to_video(output, "output.mp4", fps=30)
```
"""
DEFAULT_PROMPT_TEMPLATE = {
"template": (
"<|start_header_id|>system<|end_header_id|>\n\nDescribe the video by detailing the following aspects: "
"1. The main content and theme of the video."
"2. The color, shape, size, texture, quantity, text, and spatial relationships of the objects."
"3. Actions, events, behaviors temporal relationships, physical movement changes of the objects."
"4. background environment, light, style and atmosphere."
"5. camera angles, movements, and transitions used in the video:<|eot_id|>"
"<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|>"
),
"crop_start": 95,
}
# Copied from diffusers.pipelines.flux.pipeline_flux.calculate_shift
def calculate_shift(
image_seq_len,
base_seq_len: int = 256,
max_seq_len: int = 4096,
base_shift: float = 0.5,
max_shift: float = 1.15,
):
m = (max_shift - base_shift) / (max_seq_len - base_seq_len)
b = base_shift - m * base_seq_len
mu = image_seq_len * m + b
return mu
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
def retrieve_timesteps(
scheduler,
num_inference_steps: Optional[int] = None,
device: Optional[Union[str, torch.device]] = None,
timesteps: Optional[List[int]] = None,
sigmas: Optional[List[float]] = None,
**kwargs,
):
r"""
Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
Args:
scheduler (`SchedulerMixin`):
The scheduler to get timesteps from.
num_inference_steps (`int`):
The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
must be `None`.
device (`str` or `torch.device`, *optional*):
The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
timesteps (`List[int]`, *optional*):
Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
`num_inference_steps` and `sigmas` must be `None`.
sigmas (`List[float]`, *optional*):
Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
`num_inference_steps` and `timesteps` must be `None`.
Returns:
`Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
second element is the number of inference steps.
"""
if timesteps is not None and sigmas is not None:
raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
if timesteps is not None:
accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
if not accepts_timesteps:
raise ValueError(
f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
f" timestep schedules. Please check whether you are using the correct scheduler."
)
scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
timesteps = scheduler.timesteps
num_inference_steps = len(timesteps)
elif sigmas is not None:
accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
if not accept_sigmas:
raise ValueError(
f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
f" sigmas schedules. Please check whether you are using the correct scheduler."
)
scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
timesteps = scheduler.timesteps
num_inference_steps = len(timesteps)
else:
scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
timesteps = scheduler.timesteps
return timesteps, num_inference_steps
class HunyuanVideoFramepackPipeline(DiffusionPipeline, HunyuanVideoLoraLoaderMixin):
r"""
Pipeline for text-to-video generation using HunyuanVideo.
This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
implemented for all pipelines (downloading, saving, running on a particular device, etc.).
Args:
text_encoder ([`LlamaModel`]):
[Llava Llama3-8B](https://huggingface.co/xtuner/llava-llama-3-8b-v1_1-transformers).
tokenizer (`LlamaTokenizer`):
Tokenizer from [Llava Llama3-8B](https://huggingface.co/xtuner/llava-llama-3-8b-v1_1-transformers).
transformer ([`HunyuanVideoTransformer3DModel`]):
Conditional Transformer to denoise the encoded image latents.
scheduler ([`FlowMatchEulerDiscreteScheduler`]):
A scheduler to be used in combination with `transformer` to denoise the encoded image latents.
vae ([`AutoencoderKLHunyuanVideo`]):
Variational Auto-Encoder (VAE) Model to encode and decode videos to and from latent representations.
text_encoder_2 ([`CLIPTextModel`]):
[CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
tokenizer_2 (`CLIPTokenizer`):
Tokenizer of class
[CLIPTokenizer](https://huggingface.co/docs/transformers/en/model_doc/clip#transformers.CLIPTokenizer).
"""
model_cpu_offload_seq = "text_encoder->text_encoder_2->image_encoder->transformer->vae"
_callback_tensor_inputs = ["latents", "prompt_embeds"]
def __init__(
self,
text_encoder: LlamaModel,
tokenizer: LlamaTokenizerFast,
transformer: HunyuanVideoFramepackTransformer3DModel,
vae: AutoencoderKLHunyuanVideo,
scheduler: FlowMatchEulerDiscreteScheduler,
text_encoder_2: CLIPTextModel,
tokenizer_2: CLIPTokenizer,
image_encoder: SiglipVisionModel,
feature_extractor: SiglipImageProcessor,
):
super().__init__()
self.register_modules(
vae=vae,
text_encoder=text_encoder,
tokenizer=tokenizer,
transformer=transformer,
scheduler=scheduler,
text_encoder_2=text_encoder_2,
tokenizer_2=tokenizer_2,
image_encoder=image_encoder,
feature_extractor=feature_extractor,
)
self.vae_scale_factor_temporal = self.vae.temporal_compression_ratio if getattr(self, "vae", None) else 4
self.vae_scale_factor_spatial = self.vae.spatial_compression_ratio if getattr(self, "vae", None) else 8
self.video_processor = VideoProcessor(vae_scale_factor=self.vae_scale_factor_spatial)
# Copied from diffusers.pipelines.hunyuan_video.pipeline_hunyuan_video.HunyuanVideoPipeline._get_llama_prompt_embeds
def _get_llama_prompt_embeds(
self,
prompt: Union[str, List[str]],
prompt_template: Dict[str, Any],
num_videos_per_prompt: int = 1,
device: Optional[torch.device] = None,
dtype: Optional[torch.dtype] = None,
max_sequence_length: int = 256,
num_hidden_layers_to_skip: int = 2,
) -> Tuple[torch.Tensor, torch.Tensor]:
device = device or self._execution_device
dtype = dtype or self.text_encoder.dtype
prompt = [prompt] if isinstance(prompt, str) else prompt
batch_size = len(prompt)
prompt = [prompt_template["template"].format(p) for p in prompt]
crop_start = prompt_template.get("crop_start", None)
if crop_start is None:
prompt_template_input = self.tokenizer(
prompt_template["template"],
padding="max_length",
return_tensors="pt",
return_length=False,
return_overflowing_tokens=False,
return_attention_mask=False,
)
crop_start = prompt_template_input["input_ids"].shape[-1]
# Remove <|eot_id|> token and placeholder {}
crop_start -= 2
max_sequence_length += crop_start
text_inputs = self.tokenizer(
prompt,
max_length=max_sequence_length,
padding="max_length",
truncation=True,
return_tensors="pt",
return_length=False,
return_overflowing_tokens=False,
return_attention_mask=True,
)
text_input_ids = text_inputs.input_ids.to(device=device)
prompt_attention_mask = text_inputs.attention_mask.to(device=device)
prompt_embeds = self.text_encoder(
input_ids=text_input_ids,
attention_mask=prompt_attention_mask,
output_hidden_states=True,
).hidden_states[-(num_hidden_layers_to_skip + 1)]
prompt_embeds = prompt_embeds.to(dtype=dtype)
if crop_start is not None and crop_start > 0:
prompt_embeds = prompt_embeds[:, crop_start:]
prompt_attention_mask = prompt_attention_mask[:, crop_start:]
# duplicate text embeddings for each generation per prompt, using mps friendly method
_, seq_len, _ = prompt_embeds.shape
prompt_embeds = prompt_embeds.repeat(1, num_videos_per_prompt, 1)
prompt_embeds = prompt_embeds.view(batch_size * num_videos_per_prompt, seq_len, -1)
prompt_attention_mask = prompt_attention_mask.repeat(1, num_videos_per_prompt)
prompt_attention_mask = prompt_attention_mask.view(batch_size * num_videos_per_prompt, seq_len)
return prompt_embeds, prompt_attention_mask
# Copied from diffusers.pipelines.hunyuan_video.pipeline_hunyuan_video.HunyuanVideoPipeline._get_clip_prompt_embeds
def _get_clip_prompt_embeds(
self,
prompt: Union[str, List[str]],
num_videos_per_prompt: int = 1,
device: Optional[torch.device] = None,
dtype: Optional[torch.dtype] = None,
max_sequence_length: int = 77,
) -> torch.Tensor:
device = device or self._execution_device
dtype = dtype or self.text_encoder_2.dtype
prompt = [prompt] if isinstance(prompt, str) else prompt
batch_size = len(prompt)
text_inputs = self.tokenizer_2(
prompt,
padding="max_length",
max_length=max_sequence_length,
truncation=True,
return_tensors="pt",
)
text_input_ids = text_inputs.input_ids
untruncated_ids = self.tokenizer_2(prompt, padding="longest", return_tensors="pt").input_ids
if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
removed_text = self.tokenizer_2.batch_decode(untruncated_ids[:, max_sequence_length - 1 : -1])
logger.warning(
"The following part of your input was truncated because CLIP can only handle sequences up to"
f" {max_sequence_length} tokens: {removed_text}"
)
prompt_embeds = self.text_encoder_2(text_input_ids.to(device), output_hidden_states=False).pooler_output
# duplicate text embeddings for each generation per prompt, using mps friendly method
prompt_embeds = prompt_embeds.repeat(1, num_videos_per_prompt)
prompt_embeds = prompt_embeds.view(batch_size * num_videos_per_prompt, -1)
return prompt_embeds
# Copied from diffusers.pipelines.hunyuan_video.pipeline_hunyuan_video.HunyuanVideoPipeline.encode_prompt
def encode_prompt(
self,
prompt: Union[str, List[str]],
prompt_2: Union[str, List[str]] = None,
prompt_template: Dict[str, Any] = DEFAULT_PROMPT_TEMPLATE,
num_videos_per_prompt: int = 1,
prompt_embeds: Optional[torch.Tensor] = None,
pooled_prompt_embeds: Optional[torch.Tensor] = None,
prompt_attention_mask: Optional[torch.Tensor] = None,
device: Optional[torch.device] = None,
dtype: Optional[torch.dtype] = None,
max_sequence_length: int = 256,
):
if prompt_embeds is None:
prompt_embeds, prompt_attention_mask = self._get_llama_prompt_embeds(
prompt,
prompt_template,
num_videos_per_prompt,
device=device,
dtype=dtype,
max_sequence_length=max_sequence_length,
)
if pooled_prompt_embeds is None:
if prompt_2 is None:
prompt_2 = prompt
pooled_prompt_embeds = self._get_clip_prompt_embeds(
prompt,
num_videos_per_prompt,
device=device,
dtype=dtype,
max_sequence_length=77,
)
return prompt_embeds, pooled_prompt_embeds, prompt_attention_mask
def encode_image(
self, image: torch.Tensor, device: Optional[torch.device] = None, dtype: Optional[torch.dtype] = None
):
device = device or self._execution_device
image = (image + 1) / 2.0 # [-1, 1] -> [0, 1]
image = self.feature_extractor(images=image, return_tensors="pt", do_rescale=False).to(
device=device, dtype=self.image_encoder.dtype
)
image_embeds = self.image_encoder(**image).last_hidden_state
return image_embeds.to(dtype=dtype)
def check_inputs(
self,
prompt,
prompt_2,
height,
width,
prompt_embeds=None,
callback_on_step_end_tensor_inputs=None,
prompt_template=None,
):
if height % 16 != 0 or width % 16 != 0:
raise ValueError(f"`height` and `width` have to be divisible by 16 but are {height} and {width}.")
if callback_on_step_end_tensor_inputs is not None and not all(
k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
):
raise ValueError(
f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
)
if prompt is not None and prompt_embeds is not None:
raise ValueError(
f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
" only forward one of the two."
)
elif prompt_2 is not None and prompt_embeds is not None:
raise ValueError(
f"Cannot forward both `prompt_2`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
" only forward one of the two."
)
elif prompt is None and prompt_embeds is None:
raise ValueError(
"Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
)
elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
elif prompt_2 is not None and (not isinstance(prompt_2, str) and not isinstance(prompt_2, list)):
raise ValueError(f"`prompt_2` has to be of type `str` or `list` but is {type(prompt_2)}")
if prompt_template is not None:
if not isinstance(prompt_template, dict):
raise ValueError(f"`prompt_template` has to be of type `dict` but is {type(prompt_template)}")
if "template" not in prompt_template:
raise ValueError(
f"`prompt_template` has to contain a key `template` but only found {prompt_template.keys()}"
)
def prepare_latents(
self,
batch_size: int = 1,
num_channels_latents: int = 16,
height: int = 720,
width: int = 1280,
num_frames: int = 129,
dtype: Optional[torch.dtype] = None,
device: Optional[torch.device] = None,
generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
latents: Optional[torch.Tensor] = None,
) -> torch.Tensor:
if latents is not None:
return latents.to(device=device, dtype=dtype)
shape = (
batch_size,
num_channels_latents,
(num_frames - 1) // self.vae_scale_factor_temporal + 1,
int(height) // self.vae_scale_factor_spatial,
int(width) // self.vae_scale_factor_spatial,
)
if isinstance(generator, list) and len(generator) != batch_size:
raise ValueError(
f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
f" size of {batch_size}. Make sure the batch size matches the length of the generators."
)
latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
return latents
def prepare_image_latents(
self,
image: torch.Tensor,
dtype: Optional[torch.dtype] = None,
device: Optional[torch.device] = None,
generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
latents: Optional[torch.Tensor] = None,
) -> torch.Tensor:
device = device or self._execution_device
if latents is None:
image = image.unsqueeze(2).to(device=device, dtype=self.vae.dtype)
latents = self.vae.encode(image).latent_dist.sample(generator=generator)
latents = latents * self.vae.config.scaling_factor
return latents.to(device=device, dtype=dtype)
def enable_vae_slicing(self):
r"""
Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
"""
self.vae.enable_slicing()
def disable_vae_slicing(self):
r"""
Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
computing decoding in one step.
"""
self.vae.disable_slicing()
def enable_vae_tiling(self):
r"""
Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
processing larger images.
"""
self.vae.enable_tiling()
def disable_vae_tiling(self):
r"""
Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
computing decoding in one step.
"""
self.vae.disable_tiling()
@property
def guidance_scale(self):
return self._guidance_scale
@property
def num_timesteps(self):
return self._num_timesteps
@property
def attention_kwargs(self):
return self._attention_kwargs
@property
def current_timestep(self):
return self._current_timestep
@property
def interrupt(self):
return self._interrupt
@torch.no_grad()
@replace_example_docstring(EXAMPLE_DOC_STRING)
def __call__(
self,
image: PipelineImageInput,
last_image: Optional[PipelineImageInput] = None,
prompt: Union[str, List[str]] = None,
prompt_2: Union[str, List[str]] = None,
negative_prompt: Union[str, List[str]] = None,
negative_prompt_2: Union[str, List[str]] = None,
height: int = 720,
width: int = 1280,
num_frames: int = 129,
latent_window_size: int = 9,
num_inference_steps: int = 50,
sigmas: List[float] = None,
true_cfg_scale: float = 1.0,
guidance_scale: float = 6.0,
num_videos_per_prompt: Optional[int] = 1,
generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
image_latents: Optional[torch.Tensor] = None,
last_image_latents: Optional[torch.Tensor] = None,
prompt_embeds: Optional[torch.Tensor] = None,
pooled_prompt_embeds: Optional[torch.Tensor] = None,
prompt_attention_mask: Optional[torch.Tensor] = None,
negative_prompt_embeds: Optional[torch.Tensor] = None,
negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
negative_prompt_attention_mask: Optional[torch.Tensor] = None,
output_type: Optional[str] = "pil",
return_dict: bool = True,
attention_kwargs: Optional[Dict[str, Any]] = None,
callback_on_step_end: Optional[
Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
] = None,
callback_on_step_end_tensor_inputs: List[str] = ["latents"],
prompt_template: Dict[str, Any] = DEFAULT_PROMPT_TEMPLATE,
max_sequence_length: int = 256,
):
r"""
The call function to the pipeline for generation.
Args:
image (`PIL.Image.Image` or `np.ndarray` or `torch.Tensor`):
The image to be used as the starting point for the video generation.
last_image (`PIL.Image.Image` or `np.ndarray` or `torch.Tensor`, *optional*):
The optional last image to be used as the ending point for the video generation. This is useful for
generating transitions between two images.
prompt (`str` or `List[str]`, *optional*):
The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
instead.
prompt_2 (`str` or `List[str]`, *optional*):
The prompt or prompts to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
will be used instead.
negative_prompt (`str` or `List[str]`, *optional*):
The prompt or prompts not to guide the image generation. If not defined, one has to pass
`negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `true_cfg_scale` is
not greater than `1`).
negative_prompt_2 (`str` or `List[str]`, *optional*):
The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
`text_encoder_2`. If not defined, `negative_prompt` is used in all the text-encoders.
height (`int`, defaults to `720`):
The height in pixels of the generated image.
width (`int`, defaults to `1280`):
The width in pixels of the generated image.
num_frames (`int`, defaults to `129`):
The number of frames in the generated video.
num_inference_steps (`int`, defaults to `50`):
The number of denoising steps. More denoising steps usually lead to a higher quality image at the
expense of slower inference.
sigmas (`List[float]`, *optional*):
Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
will be used.
true_cfg_scale (`float`, *optional*, defaults to 1.0):
When > 1.0 and a provided `negative_prompt`, enables true classifier-free guidance.
guidance_scale (`float`, defaults to `6.0`):
Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
`guidance_scale` is defined as `w` of equation 2. of [Imagen
Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
usually at the expense of lower image quality. Note that the only available HunyuanVideo model is
CFG-distilled, which means that traditional guidance between unconditional and conditional latent is
not applied.
num_videos_per_prompt (`int`, *optional*, defaults to 1):
The number of images to generate per prompt.
generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
generation deterministic.
image_latents (`torch.Tensor`, *optional*):
Pre-encoded image latents. If not provided, the image will be encoded using the VAE.
last_image_latents (`torch.Tensor`, *optional*):
Pre-encoded last image latents. If not provided, the last image will be encoded using the VAE.
prompt_embeds (`torch.Tensor`, *optional*):
Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
provided, text embeddings are generated from the `prompt` input argument.
pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
If not provided, pooled text embeddings will be generated from `prompt` input argument.
negative_prompt_embeds (`torch.FloatTensor`, *optional*):
Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
argument.
negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
input argument.
output_type (`str`, *optional*, defaults to `"pil"`):
The output format of the generated image. Choose between `PIL.Image` or `np.array`.
return_dict (`bool`, *optional*, defaults to `True`):
Whether or not to return a [`HunyuanVideoFramepackPipelineOutput`] instead of a plain tuple.
attention_kwargs (`dict`, *optional*):
A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
`self.processor` in
[diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
clip_skip (`int`, *optional*):
Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
the output of the pre-final layer will be used for computing the prompt embeddings.
callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of
each denoising step during the inference. with the following arguments: `callback_on_step_end(self:
DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a
list of all tensors as specified by `callback_on_step_end_tensor_inputs`.
callback_on_step_end_tensor_inputs (`List`, *optional*):
The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
`._callback_tensor_inputs` attribute of your pipeline class.
Examples:
Returns:
[`~HunyuanVideoFramepackPipelineOutput`] or `tuple`:
If `return_dict` is `True`, [`HunyuanVideoFramepackPipelineOutput`] is returned, otherwise a `tuple` is
returned where the first element is a list with the generated images and the second element is a list
of `bool`s indicating whether the corresponding generated image contains "not-safe-for-work" (nsfw)
content.
"""
if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
# 1. Check inputs. Raise error if not correct
self.check_inputs(
prompt,
prompt_2,
height,
width,
prompt_embeds,
callback_on_step_end_tensor_inputs,
prompt_template,
)
has_neg_prompt = negative_prompt is not None or (
negative_prompt_embeds is not None and negative_pooled_prompt_embeds is not None
)
do_true_cfg = true_cfg_scale > 1 and has_neg_prompt
self._guidance_scale = guidance_scale
self._attention_kwargs = attention_kwargs
self._current_timestep = None
self._interrupt = False
device = self._execution_device
transformer_dtype = self.transformer.dtype
vae_dtype = self.vae.dtype
# 2. Define call parameters
if prompt is not None and isinstance(prompt, str):
batch_size = 1
elif prompt is not None and isinstance(prompt, list):
batch_size = len(prompt)
else:
batch_size = prompt_embeds.shape[0]
# 3. Encode input prompt
transformer_dtype = self.transformer.dtype
prompt_embeds, pooled_prompt_embeds, prompt_attention_mask = self.encode_prompt(
prompt=prompt,
prompt_2=prompt_2,
prompt_template=prompt_template,
num_videos_per_prompt=num_videos_per_prompt,
prompt_embeds=prompt_embeds,
pooled_prompt_embeds=pooled_prompt_embeds,
prompt_attention_mask=prompt_attention_mask,
device=device,
max_sequence_length=max_sequence_length,
)
prompt_embeds = prompt_embeds.to(transformer_dtype)
prompt_attention_mask = prompt_attention_mask.to(transformer_dtype)
pooled_prompt_embeds = pooled_prompt_embeds.to(transformer_dtype)
if do_true_cfg:
negative_prompt_embeds, negative_pooled_prompt_embeds, negative_prompt_attention_mask = self.encode_prompt(
prompt=negative_prompt,
prompt_2=negative_prompt_2,
prompt_template=prompt_template,
num_videos_per_prompt=num_videos_per_prompt,
prompt_embeds=negative_prompt_embeds,
pooled_prompt_embeds=negative_pooled_prompt_embeds,
prompt_attention_mask=negative_prompt_attention_mask,
device=device,
max_sequence_length=max_sequence_length,
)
negative_prompt_embeds = negative_prompt_embeds.to(transformer_dtype)
negative_prompt_attention_mask = negative_prompt_attention_mask.to(transformer_dtype)
negative_pooled_prompt_embeds = negative_pooled_prompt_embeds.to(transformer_dtype)
# 4. Prepare image
image = self.video_processor.preprocess(image, height, width)
image_embeds = self.encode_image(image, device=device).to(transformer_dtype)
if last_image is not None:
# Credits: https://github.com/lllyasviel/FramePack/pull/167
# Users can modify the weighting strategy applied here
last_image = self.video_processor.preprocess(last_image, height, width)
last_image_embeds = self.encode_image(last_image, device=device).to(transformer_dtype)
last_image_embeds = (image_embeds + last_image_embeds) / 2
# 5. Prepare latent variables
num_channels_latents = self.transformer.config.in_channels
window_num_frames = (latent_window_size - 1) * self.vae_scale_factor_temporal + 1
num_latent_sections = max(1, (num_frames + window_num_frames - 1) // window_num_frames)
# Specific to the released checkpoint: https://huggingface.co/lllyasviel/FramePackI2V_HY
# TODO: find a more generic way in future if there are more checkpoints
history_sizes = [1, 2, 16]
history_latents = torch.zeros(
batch_size,
num_channels_latents,
sum(history_sizes),
height // self.vae_scale_factor_spatial,
width // self.vae_scale_factor_spatial,
device=device,
dtype=torch.float32,
)
history_video = None
total_generated_latent_frames = 0
image_latents = self.prepare_image_latents(
image, dtype=torch.float32, device=device, generator=generator, latents=image_latents
)
if last_image is not None:
last_image_latents = self.prepare_image_latents(
last_image, dtype=torch.float32, device=device, generator=generator
)
latent_paddings = list(reversed(range(num_latent_sections)))
if num_latent_sections > 4:
latent_paddings = [3] + [2] * (num_latent_sections - 3) + [1, 0]
# 6. Prepare guidance condition
guidance = torch.tensor([guidance_scale] * batch_size, dtype=transformer_dtype, device=device) * 1000.0
# 7. Denoising loop
for k in range(num_latent_sections):
is_first_section = k == 0
is_last_section = k == num_latent_sections - 1
latent_padding_size = latent_paddings[k] * latent_window_size
indices = torch.arange(0, sum([1, latent_padding_size, latent_window_size, *history_sizes]))
(
indices_prefix,
indices_padding,
indices_latents,
indices_postfix,
indices_latents_history_2x,
indices_latents_history_4x,
) = indices.split([1, latent_padding_size, latent_window_size, *history_sizes], dim=0)
# Inverted anti-drifting sampling: Figure 2(c) in the paper
indices_clean_latents = torch.cat([indices_prefix, indices_postfix], dim=0)
latents_prefix = image_latents
latents_postfix, latents_history_2x, latents_history_4x = history_latents[
:, :, : sum(history_sizes)
].split(history_sizes, dim=2)
if last_image is not None and is_first_section:
latents_postfix = last_image_latents
latents_clean = torch.cat([latents_prefix, latents_postfix], dim=2)
latents = self.prepare_latents(
batch_size,
num_channels_latents,
height,
width,
window_num_frames,
dtype=torch.float32,
device=device,
generator=generator,
latents=None,
)
sigmas = np.linspace(1.0, 0.0, num_inference_steps + 1)[:-1] if sigmas is None else sigmas
image_seq_len = (
latents.shape[2] * latents.shape[3] * latents.shape[4] / self.transformer.config.patch_size**2
)
exp_max = 7.0
mu = calculate_shift(
image_seq_len,
self.scheduler.config.get("base_image_seq_len", 256),
self.scheduler.config.get("max_image_seq_len", 4096),
self.scheduler.config.get("base_shift", 0.5),
self.scheduler.config.get("max_shift", 1.15),
)
mu = min(mu, math.log(exp_max))
timesteps, num_inference_steps = retrieve_timesteps(
self.scheduler, num_inference_steps, device, sigmas=sigmas, mu=mu
)
num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
self._num_timesteps = len(timesteps)
with self.progress_bar(total=num_inference_steps) as progress_bar:
for i, t in enumerate(timesteps):
if self.interrupt:
continue
self._current_timestep = t
timestep = t.expand(latents.shape[0])
noise_pred = self.transformer(
hidden_states=latents.to(transformer_dtype),
timestep=timestep,
encoder_hidden_states=prompt_embeds,
encoder_attention_mask=prompt_attention_mask,
pooled_projections=pooled_prompt_embeds,
image_embeds=image_embeds,
indices_latents=indices_latents,
guidance=guidance,
latents_clean=latents_clean.to(transformer_dtype),
indices_latents_clean=indices_clean_latents,
latents_history_2x=latents_history_2x.to(transformer_dtype),
indices_latents_history_2x=indices_latents_history_2x,
latents_history_4x=latents_history_4x.to(transformer_dtype),
indices_latents_history_4x=indices_latents_history_4x,
attention_kwargs=attention_kwargs,
return_dict=False,
)[0]
if do_true_cfg:
neg_noise_pred = self.transformer(
hidden_states=latents.to(transformer_dtype),
timestep=timestep,
encoder_hidden_states=negative_prompt_embeds,
encoder_attention_mask=negative_prompt_attention_mask,
pooled_projections=negative_pooled_prompt_embeds,
image_embeds=image_embeds,
indices_latents=indices_latents,
guidance=guidance,
latents_clean=latents_clean.to(transformer_dtype),
indices_latents_clean=indices_clean_latents,
latents_history_2x=latents_history_2x.to(transformer_dtype),
indices_latents_history_2x=indices_latents_history_2x,
latents_history_4x=latents_history_4x.to(transformer_dtype),
indices_latents_history_4x=indices_latents_history_4x,
attention_kwargs=attention_kwargs,
return_dict=False,
)[0]
noise_pred = neg_noise_pred + true_cfg_scale * (noise_pred - neg_noise_pred)
# compute the previous noisy sample x_t -> x_t-1
latents = self.scheduler.step(noise_pred.float(), t, latents, return_dict=False)[0]
if callback_on_step_end is not None:
callback_kwargs = {}
for k in callback_on_step_end_tensor_inputs:
callback_kwargs[k] = locals()[k]
callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
latents = callback_outputs.pop("latents", latents)
prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
# call the callback, if provided
if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
progress_bar.update()
if XLA_AVAILABLE:
xm.mark_step()
if is_last_section:
latents = torch.cat([image_latents, latents], dim=2)
total_generated_latent_frames += latents.shape[2]
history_latents = torch.cat([latents, history_latents], dim=2)
real_history_latents = history_latents[:, :, :total_generated_latent_frames]
if history_video is None:
if not output_type == "latent":
current_latents = real_history_latents.to(vae_dtype) / self.vae.config.scaling_factor
history_video = self.vae.decode(current_latents, return_dict=False)[0]
else:
history_video = [real_history_latents]
else:
if not output_type == "latent":
section_latent_frames = (
(latent_window_size * 2 + 1) if is_last_section else (latent_window_size * 2)
)
overlapped_frames = (latent_window_size - 1) * self.vae_scale_factor_temporal + 1
current_latents = (
real_history_latents[:, :, :section_latent_frames].to(vae_dtype)
/ self.vae.config.scaling_factor
)
current_video = self.vae.decode(current_latents, return_dict=False)[0]
history_video = self._soft_append(current_video, history_video, overlapped_frames)
else:
history_video.append(real_history_latents)
self._current_timestep = None
if not output_type == "latent":
generated_frames = history_video.size(2)
generated_frames = (
generated_frames - 1
) // self.vae_scale_factor_temporal * self.vae_scale_factor_temporal + 1
history_video = history_video[:, :, :generated_frames]
video = self.video_processor.postprocess_video(history_video, output_type=output_type)
else:
video = history_video
# Offload all models
self.maybe_free_model_hooks()
if not return_dict:
return (video,)
return HunyuanVideoFramepackPipelineOutput(frames=video)
def _soft_append(self, history: torch.Tensor, current: torch.Tensor, overlap: int = 0):
if overlap <= 0:
return torch.cat([history, current], dim=2)
assert history.shape[2] >= overlap, f"Current length ({history.shape[2]}) must be >= overlap ({overlap})"
assert current.shape[2] >= overlap, f"History length ({current.shape[2]}) must be >= overlap ({overlap})"
weights = torch.linspace(1, 0, overlap, dtype=history.dtype, device=history.device).view(1, 1, -1, 1, 1)
blended = weights * history[:, :, -overlap:] + (1 - weights) * current[:, :, :overlap]
output = torch.cat([history[:, :, :-overlap], blended, current[:, :, overlap:]], dim=2)
return output.to(history)
from dataclasses import dataclass
from typing import List, Union
import numpy as np
import PIL.Image
import torch
from diffusers.utils import BaseOutput
......@@ -18,3 +21,19 @@ class HunyuanVideoPipelineOutput(BaseOutput):
"""
frames: torch.Tensor
@dataclass
class HunyuanVideoFramepackPipelineOutput(BaseOutput):
r"""
Output class for HunyuanVideo pipelines.
Args:
frames (`torch.Tensor`, `np.ndarray`, or List[List[PIL.Image.Image]]):
List of video outputs - It can be a nested list of length `batch_size,` with each sub-list containing
denoised PIL image sequences of length `num_frames.` It can also be a NumPy array or Torch tensor of shape
`(batch_size, num_frames, channels, height, width)`. Or, a list of torch tensors where each tensor
corresponds to a latent that decodes to multiple frames.
"""
frames: Union[torch.Tensor, np.ndarray, List[List[PIL.Image.Image]], List[torch.Tensor]]
......@@ -565,6 +565,21 @@ class HunyuanDiT2DMultiControlNetModel(metaclass=DummyObject):
requires_backends(cls, ["torch"])
class HunyuanVideoFramepackTransformer3DModel(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
@classmethod
def from_config(cls, *args, **kwargs):
requires_backends(cls, ["torch"])
@classmethod
def from_pretrained(cls, *args, **kwargs):
requires_backends(cls, ["torch"])
class HunyuanVideoTransformer3DModel(metaclass=DummyObject):
_backends = ["torch"]
......
......@@ -692,6 +692,21 @@ class HunyuanSkyreelsImageToVideoPipeline(metaclass=DummyObject):
requires_backends(cls, ["torch", "transformers"])
class HunyuanVideoFramepackPipeline(metaclass=DummyObject):
_backends = ["torch", "transformers"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch", "transformers"])
@classmethod
def from_config(cls, *args, **kwargs):
requires_backends(cls, ["torch", "transformers"])
@classmethod
def from_pretrained(cls, *args, **kwargs):
requires_backends(cls, ["torch", "transformers"])
class HunyuanVideoImageToVideoPipeline(metaclass=DummyObject):
_backends = ["torch", "transformers"]
......
# Copyright 2025 The HuggingFace Team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import inspect
import unittest
import numpy as np
import torch
from PIL import Image
from transformers import (
CLIPTextConfig,
CLIPTextModel,
CLIPTokenizer,
LlamaConfig,
LlamaModel,
LlamaTokenizer,
SiglipImageProcessor,
SiglipVisionModel,
)
from diffusers import (
AutoencoderKLHunyuanVideo,
FasterCacheConfig,
FlowMatchEulerDiscreteScheduler,
HunyuanVideoFramepackPipeline,
HunyuanVideoFramepackTransformer3DModel,
)
from diffusers.utils.testing_utils import (
enable_full_determinism,
torch_device,
)
from ..test_pipelines_common import (
FasterCacheTesterMixin,
PipelineTesterMixin,
PyramidAttentionBroadcastTesterMixin,
to_np,
)
enable_full_determinism()
class HunyuanVideoFramepackPipelineFastTests(
PipelineTesterMixin, PyramidAttentionBroadcastTesterMixin, FasterCacheTesterMixin, unittest.TestCase
):
pipeline_class = HunyuanVideoFramepackPipeline
params = frozenset(
["image", "prompt", "height", "width", "guidance_scale", "prompt_embeds", "pooled_prompt_embeds"]
)
batch_params = frozenset(["image", "prompt"])
required_optional_params = frozenset(
[
"num_inference_steps",
"generator",
"return_dict",
"callback_on_step_end",
"callback_on_step_end_tensor_inputs",
]
)
supports_dduf = False
# there is no xformers processor for Flux
test_xformers_attention = False
test_layerwise_casting = True
test_group_offloading = True
faster_cache_config = FasterCacheConfig(
spatial_attention_block_skip_range=2,
spatial_attention_timestep_skip_range=(-1, 901),
unconditional_batch_skip_range=2,
attention_weight_callback=lambda _: 0.5,
is_guidance_distilled=True,
)
def get_dummy_components(self, num_layers: int = 1, num_single_layers: int = 1):
torch.manual_seed(0)
transformer = HunyuanVideoFramepackTransformer3DModel(
in_channels=4,
out_channels=4,
num_attention_heads=2,
attention_head_dim=10,
num_layers=num_layers,
num_single_layers=num_single_layers,
num_refiner_layers=1,
patch_size=2,
patch_size_t=1,
guidance_embeds=True,
text_embed_dim=16,
pooled_projection_dim=8,
rope_axes_dim=(2, 4, 4),
image_condition_type=None,
has_image_proj=True,
image_proj_dim=32,
has_clean_x_embedder=True,
)
torch.manual_seed(0)
vae = AutoencoderKLHunyuanVideo(
in_channels=3,
out_channels=3,
latent_channels=4,
down_block_types=(
"HunyuanVideoDownBlock3D",
"HunyuanVideoDownBlock3D",
"HunyuanVideoDownBlock3D",
"HunyuanVideoDownBlock3D",
),
up_block_types=(
"HunyuanVideoUpBlock3D",
"HunyuanVideoUpBlock3D",
"HunyuanVideoUpBlock3D",
"HunyuanVideoUpBlock3D",
),
block_out_channels=(8, 8, 8, 8),
layers_per_block=1,
act_fn="silu",
norm_num_groups=4,
scaling_factor=0.476986,
spatial_compression_ratio=8,
temporal_compression_ratio=4,
mid_block_add_attention=True,
)
torch.manual_seed(0)
scheduler = FlowMatchEulerDiscreteScheduler(shift=7.0)
llama_text_encoder_config = LlamaConfig(
bos_token_id=0,
eos_token_id=2,
hidden_size=16,
intermediate_size=37,
layer_norm_eps=1e-05,
num_attention_heads=4,
num_hidden_layers=2,
pad_token_id=1,
vocab_size=1000,
hidden_act="gelu",
projection_dim=32,
)
clip_text_encoder_config = CLIPTextConfig(
bos_token_id=0,
eos_token_id=2,
hidden_size=8,
intermediate_size=37,
layer_norm_eps=1e-05,
num_attention_heads=4,
num_hidden_layers=2,
pad_token_id=1,
vocab_size=1000,
hidden_act="gelu",
projection_dim=32,
)
torch.manual_seed(0)
text_encoder = LlamaModel(llama_text_encoder_config)
tokenizer = LlamaTokenizer.from_pretrained("finetrainers/dummy-hunyaunvideo", subfolder="tokenizer")
torch.manual_seed(0)
text_encoder_2 = CLIPTextModel(clip_text_encoder_config)
tokenizer_2 = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
feature_extractor = SiglipImageProcessor.from_pretrained(
"hf-internal-testing/tiny-random-SiglipVisionModel", size={"height": 30, "width": 30}
)
image_encoder = SiglipVisionModel.from_pretrained("hf-internal-testing/tiny-random-SiglipVisionModel")
components = {
"transformer": transformer,
"vae": vae,
"scheduler": scheduler,
"text_encoder": text_encoder,
"text_encoder_2": text_encoder_2,
"tokenizer": tokenizer,
"tokenizer_2": tokenizer_2,
"feature_extractor": feature_extractor,
"image_encoder": image_encoder,
}
return components
def get_dummy_inputs(self, device, seed=0):
if str(device).startswith("mps"):
generator = torch.manual_seed(seed)
else:
generator = torch.Generator(device=device).manual_seed(seed)
image_height = 32
image_width = 32
image = Image.new("RGB", (image_width, image_height))
inputs = {
"image": image,
"prompt": "dance monkey",
"prompt_template": {
"template": "{}",
"crop_start": 0,
},
"generator": generator,
"num_inference_steps": 2,
"guidance_scale": 4.5,
"height": image_height,
"width": image_width,
"num_frames": 9,
"latent_window_size": 3,
"max_sequence_length": 256,
"output_type": "pt",
}
return inputs
def test_inference(self):
device = "cpu"
components = self.get_dummy_components()
pipe = self.pipeline_class(**components)
pipe.to(device)
pipe.set_progress_bar_config(disable=None)
inputs = self.get_dummy_inputs(device)
video = pipe(**inputs).frames
generated_video = video[0]
self.assertEqual(generated_video.shape, (13, 3, 32, 32))
expected_video = torch.randn(13, 3, 32, 32)
max_diff = np.abs(generated_video - expected_video).max()
self.assertLessEqual(max_diff, 1e10)
def test_callback_inputs(self):
sig = inspect.signature(self.pipeline_class.__call__)
has_callback_tensor_inputs = "callback_on_step_end_tensor_inputs" in sig.parameters
has_callback_step_end = "callback_on_step_end" in sig.parameters
if not (has_callback_tensor_inputs and has_callback_step_end):
return
components = self.get_dummy_components()
pipe = self.pipeline_class(**components)
pipe = pipe.to(torch_device)
pipe.set_progress_bar_config(disable=None)
self.assertTrue(
hasattr(pipe, "_callback_tensor_inputs"),
f" {self.pipeline_class} should have `_callback_tensor_inputs` that defines a list of tensor variables its callback function can use as inputs",
)
def callback_inputs_subset(pipe, i, t, callback_kwargs):
# iterate over callback args
for tensor_name, tensor_value in callback_kwargs.items():
# check that we're only passing in allowed tensor inputs
assert tensor_name in pipe._callback_tensor_inputs
return callback_kwargs
def callback_inputs_all(pipe, i, t, callback_kwargs):
for tensor_name in pipe._callback_tensor_inputs:
assert tensor_name in callback_kwargs
# iterate over callback args
for tensor_name, tensor_value in callback_kwargs.items():
# check that we're only passing in allowed tensor inputs
assert tensor_name in pipe._callback_tensor_inputs
return callback_kwargs
inputs = self.get_dummy_inputs(torch_device)
# Test passing in a subset
inputs["callback_on_step_end"] = callback_inputs_subset
inputs["callback_on_step_end_tensor_inputs"] = ["latents"]
output = pipe(**inputs)[0]
# Test passing in a everything
inputs["callback_on_step_end"] = callback_inputs_all
inputs["callback_on_step_end_tensor_inputs"] = pipe._callback_tensor_inputs
output = pipe(**inputs)[0]
def callback_inputs_change_tensor(pipe, i, t, callback_kwargs):
is_last = i == (pipe.num_timesteps - 1)
if is_last:
callback_kwargs["latents"] = torch.zeros_like(callback_kwargs["latents"])
return callback_kwargs
inputs["callback_on_step_end"] = callback_inputs_change_tensor
inputs["callback_on_step_end_tensor_inputs"] = pipe._callback_tensor_inputs
output = pipe(**inputs)[0]
assert output.abs().sum() < 1e10
def test_attention_slicing_forward_pass(
self, test_max_difference=True, test_mean_pixel_difference=True, expected_max_diff=1e-3
):
if not self.test_attention_slicing:
return
components = self.get_dummy_components()
pipe = self.pipeline_class(**components)
for component in pipe.components.values():
if hasattr(component, "set_default_attn_processor"):
component.set_default_attn_processor()
pipe.to(torch_device)
pipe.set_progress_bar_config(disable=None)
generator_device = "cpu"
inputs = self.get_dummy_inputs(generator_device)
output_without_slicing = pipe(**inputs)[0]
pipe.enable_attention_slicing(slice_size=1)
inputs = self.get_dummy_inputs(generator_device)
output_with_slicing1 = pipe(**inputs)[0]
pipe.enable_attention_slicing(slice_size=2)
inputs = self.get_dummy_inputs(generator_device)
output_with_slicing2 = pipe(**inputs)[0]
if test_max_difference:
max_diff1 = np.abs(to_np(output_with_slicing1) - to_np(output_without_slicing)).max()
max_diff2 = np.abs(to_np(output_with_slicing2) - to_np(output_without_slicing)).max()
self.assertLess(
max(max_diff1, max_diff2),
expected_max_diff,
"Attention slicing should not affect the inference results",
)
def test_vae_tiling(self, expected_diff_max: float = 0.2):
# Seems to require higher tolerance than the other tests
expected_diff_max = 0.6
generator_device = "cpu"
components = self.get_dummy_components()
pipe = self.pipeline_class(**components)
pipe.to("cpu")
pipe.set_progress_bar_config(disable=None)
# Without tiling
inputs = self.get_dummy_inputs(generator_device)
inputs["height"] = inputs["width"] = 128
output_without_tiling = pipe(**inputs)[0]
# With tiling
pipe.vae.enable_tiling(
tile_sample_min_height=96,
tile_sample_min_width=96,
tile_sample_stride_height=64,
tile_sample_stride_width=64,
)
inputs = self.get_dummy_inputs(generator_device)
inputs["height"] = inputs["width"] = 128
output_with_tiling = pipe(**inputs)[0]
self.assertLess(
(to_np(output_without_tiling) - to_np(output_with_tiling)).max(),
expected_diff_max,
"VAE tiling should not affect the inference results",
)
# TODO(aryan): Create a dummy gemma model with smol vocab size
@unittest.skip(
"A very small vocab size is used for fast tests. So, any kind of prompt other than the empty default used in other tests will lead to a embedding lookup error. This test uses a long prompt that causes the error."
)
def test_inference_batch_consistent(self):
pass
@unittest.skip(
"A very small vocab size is used for fast tests. So, any kind of prompt other than the empty default used in other tests will lead to a embedding lookup error. This test uses a long prompt that causes the error."
)
def test_inference_batch_single_identical(self):
pass
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment