Unverified Commit b8cf84a3 authored by Xin Ma's avatar Xin Ma Committed by GitHub
Browse files

Latte: Latent Diffusion Transformer for Video Generation (#8404)



* add Latte to diffusers

* remove print

* remove print

* remove print

* remove unuse codes

* remove layer_norm_latte and add a flag

* remove layer_norm_latte and add a flag

* update latte_pipeline

* update latte_pipeline

* remove unuse squeeze

* add norm_hidden_states.ndim == 2: # for Latte

* fixed test latte pipeline bugs

* fixed test latte pipeline bugs

* delete sh

* add doc for latte

* add licensing

* Move Transformer3DModelOutput to modeling_outputs

* give a default value to sample_size

* remove the einops dependency

* change norm2 for latte

* modify pipeline of latte

* update test for Latte

* modify some codes for latte

* modify for Latte pipeline

* modify for Latte pipeline

* modify for Latte pipeline

* modify for Latte pipeline

* modify for Latte pipeline

* modify for Latte pipeline

* modify for Latte pipeline

* modify for Latte pipeline

* modify for Latte pipeline

* modify for Latte pipeline

* modify for Latte pipeline

* modify for Latte pipeline

* modify for Latte pipeline

* modify for Latte pipeline

* modify for Latte pipeline

* modify for Latte pipeline

* modify for Latte pipeline

* modify for Latte pipeline

* modify for Latte pipeline

* modify for Latte pipeline

* modify for Latte pipeline

* modify for Latte pipeline

* modify for Latte pipeline

* modify for Latte pipeline

* modify for Latte pipeline

* modify for Latte pipeline

* modify for Latte pipeline

* video_length -> num_frames; update prepare_latents copied from

* make fix-copies

* make style

* typo: videe -> video

* update

* modify for Latte pipeline

* modify latte pipeline

* modify latte pipeline

* modify latte pipeline

* modify latte pipeline

* modify for Latte pipeline

* Delete .vscode directory

* make style

* make fix-copies

* add latte transformer 3d to docs _toctree.yml

* update example

* reduce frames for test

* fixed bug of _text_preprocessing

* set num frame to 1 for testing

* remove unuse print

* add text = self._clean_caption(text) again

---------
Co-authored-by: default avatarSayak Paul <spsayakpaul@gmail.com>
Co-authored-by: default avatarYiYi Xu <yixu310@gmail.com>
Co-authored-by: default avatarAryan <contact.aryanvs@gmail.com>
Co-authored-by: default avatarAryan <aryan@huggingface.co>
parent 673eb60f
......@@ -175,4 +175,4 @@ tags
.ruff_cache
# wandb
wandb
wandb
\ No newline at end of file
......@@ -249,6 +249,8 @@
title: DiTTransformer2DModel
- local: api/models/hunyuan_transformer2d
title: HunyuanDiT2DModel
- local: api/models/latte_transformer3d
title: LatteTransformer3DModel
- local: api/models/lumina_nextdit2d
title: LuminaNextDiT2DModel
- local: api/models/transformer_temporal
......
<!--Copyright 2024 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
-->
## LatteTransformer3DModel
A Diffusion Transformer model for 3D data from [Latte](https://github.com/Vchitect/Latte).
## LatteTransformer3DModel
[[autodoc]] LatteTransformer3DModel
......@@ -88,6 +88,7 @@ else:
"HunyuanDiT2DMultiControlNetModel",
"I2VGenXLUNet",
"Kandinsky3UNet",
"LatteTransformer3DModel",
"LuminaNextDiT2DModel",
"ModelMixin",
"MotionAdapter",
......@@ -269,6 +270,7 @@ else:
"KandinskyV22PriorPipeline",
"LatentConsistencyModelImg2ImgPipeline",
"LatentConsistencyModelPipeline",
"LattePipeline",
"LDMTextToImagePipeline",
"LEditsPPPipelineStableDiffusion",
"LEditsPPPipelineStableDiffusionXL",
......@@ -513,6 +515,7 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
HunyuanDiT2DMultiControlNetModel,
I2VGenXLUNet,
Kandinsky3UNet,
LatteTransformer3DModel,
LuminaNextDiT2DModel,
ModelMixin,
MotionAdapter,
......@@ -672,6 +675,7 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
KandinskyV22PriorPipeline,
LatentConsistencyModelImg2ImgPipeline,
LatentConsistencyModelPipeline,
LattePipeline,
LDMTextToImagePipeline,
LEditsPPPipelineStableDiffusion,
LEditsPPPipelineStableDiffusionXL,
......
......@@ -41,6 +41,7 @@ if is_torch_available():
_import_structure["transformers.dit_transformer_2d"] = ["DiTTransformer2DModel"]
_import_structure["transformers.dual_transformer_2d"] = ["DualTransformer2DModel"]
_import_structure["transformers.hunyuan_transformer_2d"] = ["HunyuanDiT2DModel"]
_import_structure["transformers.latte_transformer_3d"] = ["LatteTransformer3DModel"]
_import_structure["transformers.lumina_nextdit2d"] = ["LuminaNextDiT2DModel"]
_import_structure["transformers.pixart_transformer_2d"] = ["PixArtTransformer2DModel"]
_import_structure["transformers.prior_transformer"] = ["PriorTransformer"]
......@@ -86,6 +87,7 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
DiTTransformer2DModel,
DualTransformer2DModel,
HunyuanDiT2DModel,
LatteTransformer3DModel,
LuminaNextDiT2DModel,
PixArtTransformer2DModel,
PriorTransformer,
......
......@@ -359,7 +359,10 @@ class BasicTransformerBlock(nn.Module):
out_bias=attention_out_bias,
) # is self-attn if encoder_hidden_states is none
else:
self.norm2 = None
if norm_type == "ada_norm_single": # For Latte
self.norm2 = nn.LayerNorm(dim, norm_eps, norm_elementwise_affine)
else:
self.norm2 = None
self.attn2 = None
# 3. Feed-forward
......@@ -439,7 +442,6 @@ class BasicTransformerBlock(nn.Module):
).chunk(6, dim=1)
norm_hidden_states = self.norm1(hidden_states)
norm_hidden_states = norm_hidden_states * (1 + scale_msa) + shift_msa
norm_hidden_states = norm_hidden_states.squeeze(1)
else:
raise ValueError("Incorrect norm used")
......@@ -456,6 +458,7 @@ class BasicTransformerBlock(nn.Module):
attention_mask=attention_mask,
**cross_attention_kwargs,
)
if self.norm_type == "ada_norm_zero":
attn_output = gate_msa.unsqueeze(1) * attn_output
elif self.norm_type == "ada_norm_single":
......
......@@ -5,6 +5,7 @@ if is_torch_available():
from .dit_transformer_2d import DiTTransformer2DModel
from .dual_transformer_2d import DualTransformer2DModel
from .hunyuan_transformer_2d import HunyuanDiT2DModel
from .latte_transformer_3d import LatteTransformer3DModel
from .lumina_nextdit2d import LuminaNextDiT2DModel
from .pixart_transformer_2d import PixArtTransformer2DModel
from .prior_transformer import PriorTransformer
......
# Copyright 2024 the Latte Team and The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Optional
import torch
from torch import nn
from ...configuration_utils import ConfigMixin, register_to_config
from ...models.embeddings import PixArtAlphaTextProjection, get_1d_sincos_pos_embed_from_grid
from ..attention import BasicTransformerBlock
from ..embeddings import PatchEmbed
from ..modeling_outputs import Transformer2DModelOutput
from ..modeling_utils import ModelMixin
from ..normalization import AdaLayerNormSingle
class LatteTransformer3DModel(ModelMixin, ConfigMixin):
_supports_gradient_checkpointing = True
"""
A 3D Transformer model for video-like data, paper: https://arxiv.org/abs/2401.03048, offical code:
https://github.com/Vchitect/Latte
Parameters:
num_attention_heads (`int`, *optional*, defaults to 16): The number of heads to use for multi-head attention.
attention_head_dim (`int`, *optional*, defaults to 88): The number of channels in each head.
in_channels (`int`, *optional*):
The number of channels in the input.
out_channels (`int`, *optional*):
The number of channels in the output.
num_layers (`int`, *optional*, defaults to 1): The number of layers of Transformer blocks to use.
dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
cross_attention_dim (`int`, *optional*): The number of `encoder_hidden_states` dimensions to use.
attention_bias (`bool`, *optional*):
Configure if the `TransformerBlocks` attention should contain a bias parameter.
sample_size (`int`, *optional*): The width of the latent images (specify if the input is **discrete**).
This is fixed during training since it is used to learn a number of position embeddings.
patch_size (`int`, *optional*):
The size of the patches to use in the patch embedding layer.
activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to use in feed-forward.
num_embeds_ada_norm ( `int`, *optional*):
The number of diffusion steps used during training. Pass if at least one of the norm_layers is
`AdaLayerNorm`. This is fixed during training since it is used to learn a number of embeddings that are
added to the hidden states. During inference, you can denoise for up to but not more steps than
`num_embeds_ada_norm`.
norm_type (`str`, *optional*, defaults to `"layer_norm"`):
The type of normalization to use. Options are `"layer_norm"` or `"ada_layer_norm"`.
norm_elementwise_affine (`bool`, *optional*, defaults to `True`):
Whether or not to use elementwise affine in normalization layers.
norm_eps (`float`, *optional*, defaults to 1e-5): The epsilon value to use in normalization layers.
caption_channels (`int`, *optional*):
The number of channels in the caption embeddings.
video_length (`int`, *optional*):
The number of frames in the video-like data.
"""
@register_to_config
def __init__(
self,
num_attention_heads: int = 16,
attention_head_dim: int = 88,
in_channels: Optional[int] = None,
out_channels: Optional[int] = None,
num_layers: int = 1,
dropout: float = 0.0,
cross_attention_dim: Optional[int] = None,
attention_bias: bool = False,
sample_size: int = 64,
patch_size: Optional[int] = None,
activation_fn: str = "geglu",
num_embeds_ada_norm: Optional[int] = None,
norm_type: str = "layer_norm",
norm_elementwise_affine: bool = True,
norm_eps: float = 1e-5,
caption_channels: int = None,
video_length: int = 16,
):
super().__init__()
inner_dim = num_attention_heads * attention_head_dim
# 1. Define input layers
self.height = sample_size
self.width = sample_size
interpolation_scale = self.config.sample_size // 64
interpolation_scale = max(interpolation_scale, 1)
self.pos_embed = PatchEmbed(
height=sample_size,
width=sample_size,
patch_size=patch_size,
in_channels=in_channels,
embed_dim=inner_dim,
interpolation_scale=interpolation_scale,
)
# 2. Define spatial transformers blocks
self.transformer_blocks = nn.ModuleList(
[
BasicTransformerBlock(
inner_dim,
num_attention_heads,
attention_head_dim,
dropout=dropout,
cross_attention_dim=cross_attention_dim,
activation_fn=activation_fn,
num_embeds_ada_norm=num_embeds_ada_norm,
attention_bias=attention_bias,
norm_type=norm_type,
norm_elementwise_affine=norm_elementwise_affine,
norm_eps=norm_eps,
)
for d in range(num_layers)
]
)
# 3. Define temporal transformers blocks
self.temporal_transformer_blocks = nn.ModuleList(
[
BasicTransformerBlock(
inner_dim,
num_attention_heads,
attention_head_dim,
dropout=dropout,
cross_attention_dim=None,
activation_fn=activation_fn,
num_embeds_ada_norm=num_embeds_ada_norm,
attention_bias=attention_bias,
norm_type=norm_type,
norm_elementwise_affine=norm_elementwise_affine,
norm_eps=norm_eps,
)
for d in range(num_layers)
]
)
# 4. Define output layers
self.out_channels = in_channels if out_channels is None else out_channels
self.norm_out = nn.LayerNorm(inner_dim, elementwise_affine=False, eps=1e-6)
self.scale_shift_table = nn.Parameter(torch.randn(2, inner_dim) / inner_dim**0.5)
self.proj_out = nn.Linear(inner_dim, patch_size * patch_size * self.out_channels)
# 5. Latte other blocks.
self.adaln_single = AdaLayerNormSingle(inner_dim, use_additional_conditions=False)
self.caption_projection = PixArtAlphaTextProjection(in_features=caption_channels, hidden_size=inner_dim)
# define temporal positional embedding
temp_pos_embed = get_1d_sincos_pos_embed_from_grid(
inner_dim, torch.arange(0, video_length).unsqueeze(1)
) # 1152 hidden size
self.register_buffer("temp_pos_embed", torch.from_numpy(temp_pos_embed).float().unsqueeze(0), persistent=False)
self.gradient_checkpointing = False
def _set_gradient_checkpointing(self, module, value=False):
self.gradient_checkpointing = value
def forward(
self,
hidden_states: torch.Tensor,
timestep: Optional[torch.LongTensor] = None,
encoder_hidden_states: Optional[torch.Tensor] = None,
encoder_attention_mask: Optional[torch.Tensor] = None,
enable_temporal_attentions: bool = True,
return_dict: bool = True,
):
"""
The [`LatteTransformer3DModel`] forward method.
Args:
hidden_states shape `(batch size, channel, num_frame, height, width)`:
Input `hidden_states`.
timestep ( `torch.LongTensor`, *optional*):
Used to indicate denoising step. Optional timestep to be applied as an embedding in `AdaLayerNorm`.
encoder_hidden_states ( `torch.FloatTensor` of shape `(batch size, sequence len, embed dims)`, *optional*):
Conditional embeddings for cross attention layer. If not given, cross-attention defaults to
self-attention.
encoder_attention_mask ( `torch.Tensor`, *optional*):
Cross-attention mask applied to `encoder_hidden_states`. Two formats supported:
* Mask `(batcheight, sequence_length)` True = keep, False = discard.
* Bias `(batcheight, 1, sequence_length)` 0 = keep, -10000 = discard.
If `ndim == 2`: will be interpreted as a mask, then converted into a bias consistent with the format
above. This bias will be added to the cross-attention scores.
enable_temporal_attentions:
(`bool`, *optional*, defaults to `True`): Whether to enable temporal attentions.
return_dict (`bool`, *optional*, defaults to `True`):
Whether or not to return a [`~models.unet_2d_condition.UNet2DConditionOutput`] instead of a plain
tuple.
Returns:
If `return_dict` is True, an [`~models.transformer_2d.Transformer2DModelOutput`] is returned, otherwise a
`tuple` where the first element is the sample tensor.
"""
# Reshape hidden states
batch_size, channels, num_frame, height, width = hidden_states.shape
# batch_size channels num_frame height width -> (batch_size * num_frame) channels height width
hidden_states = hidden_states.permute(0, 2, 1, 3, 4).reshape(-1, channels, height, width)
# Input
height, width = (
hidden_states.shape[-2] // self.config.patch_size,
hidden_states.shape[-1] // self.config.patch_size,
)
num_patches = height * width
hidden_states = self.pos_embed(hidden_states) # alrady add positional embeddings
added_cond_kwargs = {"resolution": None, "aspect_ratio": None}
timestep, embedded_timestep = self.adaln_single(
timestep, added_cond_kwargs=added_cond_kwargs, batch_size=batch_size, hidden_dtype=hidden_states.dtype
)
# Prepare text embeddings for spatial block
# batch_size num_tokens hidden_size -> (batch_size * num_frame) num_tokens hidden_size
encoder_hidden_states = self.caption_projection(encoder_hidden_states) # 3 120 1152
encoder_hidden_states_spatial = encoder_hidden_states.repeat_interleave(num_frame, dim=0).view(
-1, encoder_hidden_states.shape[-2], encoder_hidden_states.shape[-1]
)
# Prepare timesteps for spatial and temporal block
timestep_spatial = timestep.repeat_interleave(num_frame, dim=0).view(-1, timestep.shape[-1])
timestep_temp = timestep.repeat_interleave(num_patches, dim=0).view(-1, timestep.shape[-1])
# Spatial and temporal transformer blocks
for i, (spatial_block, temp_block) in enumerate(
zip(self.transformer_blocks, self.temporal_transformer_blocks)
):
if self.training and self.gradient_checkpointing:
hidden_states = torch.utils.checkpoint.checkpoint(
spatial_block,
hidden_states,
None, # attention_mask
encoder_hidden_states_spatial,
encoder_attention_mask,
timestep_spatial,
None, # cross_attention_kwargs
None, # class_labels
use_reentrant=False,
)
else:
hidden_states = spatial_block(
hidden_states,
None, # attention_mask
encoder_hidden_states_spatial,
encoder_attention_mask,
timestep_spatial,
None, # cross_attention_kwargs
None, # class_labels
)
if enable_temporal_attentions:
# (batch_size * num_frame) num_tokens hidden_size -> (batch_size * num_tokens) num_frame hidden_size
hidden_states = hidden_states.reshape(
batch_size, -1, hidden_states.shape[-2], hidden_states.shape[-1]
).permute(0, 2, 1, 3)
hidden_states = hidden_states.reshape(-1, hidden_states.shape[-2], hidden_states.shape[-1])
if i == 0 and num_frame > 1:
hidden_states = hidden_states + self.temp_pos_embed
if self.training and self.gradient_checkpointing:
hidden_states = torch.utils.checkpoint.checkpoint(
temp_block,
hidden_states,
None, # attention_mask
None, # encoder_hidden_states
None, # encoder_attention_mask
timestep_temp,
None, # cross_attention_kwargs
None, # class_labels
use_reentrant=False,
)
else:
hidden_states = temp_block(
hidden_states,
None, # attention_mask
None, # encoder_hidden_states
None, # encoder_attention_mask
timestep_temp,
None, # cross_attention_kwargs
None, # class_labels
)
# (batch_size * num_tokens) num_frame hidden_size -> (batch_size * num_frame) num_tokens hidden_size
hidden_states = hidden_states.reshape(
batch_size, -1, hidden_states.shape[-2], hidden_states.shape[-1]
).permute(0, 2, 1, 3)
hidden_states = hidden_states.reshape(-1, hidden_states.shape[-2], hidden_states.shape[-1])
embedded_timestep = embedded_timestep.repeat_interleave(num_frame, dim=0).view(-1, embedded_timestep.shape[-1])
shift, scale = (self.scale_shift_table[None] + embedded_timestep[:, None]).chunk(2, dim=1)
hidden_states = self.norm_out(hidden_states)
# Modulation
hidden_states = hidden_states * (1 + scale) + shift
hidden_states = self.proj_out(hidden_states)
# unpatchify
if self.adaln_single is None:
height = width = int(hidden_states.shape[1] ** 0.5)
hidden_states = hidden_states.reshape(
shape=(-1, height, width, self.config.patch_size, self.config.patch_size, self.out_channels)
)
hidden_states = torch.einsum("nhwpqc->nchpwq", hidden_states)
output = hidden_states.reshape(
shape=(-1, self.out_channels, height * self.config.patch_size, width * self.config.patch_size)
)
output = output.reshape(batch_size, -1, output.shape[-3], output.shape[-2], output.shape[-1]).permute(
0, 2, 1, 3, 4
)
if not return_dict:
return (output,)
return Transformer2DModelOutput(sample=output)
......@@ -209,6 +209,7 @@ else:
"LEditsPPPipelineStableDiffusionXL",
]
)
_import_structure["latte"] = ["LattePipeline"]
_import_structure["lumina"] = ["LuminaText2ImgPipeline"]
_import_structure["marigold"].extend(
[
......@@ -485,6 +486,7 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
LatentConsistencyModelPipeline,
)
from .latent_diffusion import LDMTextToImagePipeline
from .latte import LattePipeline
from .ledits_pp import (
LEditsPPDiffusionPipelineOutput,
LEditsPPInversionPipelineOutput,
......
from typing import TYPE_CHECKING
from ...utils import (
DIFFUSERS_SLOW_IMPORT,
OptionalDependencyNotAvailable,
_LazyModule,
get_objects_from_module,
is_torch_available,
is_transformers_available,
)
_dummy_objects = {}
_import_structure = {}
try:
if not (is_transformers_available() and is_torch_available()):
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
from ...utils import dummy_torch_and_transformers_objects # noqa F403
_dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
else:
_import_structure["pipeline_latte"] = ["LattePipeline"]
if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
try:
if not (is_transformers_available() and is_torch_available()):
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
from ...utils.dummy_torch_and_transformers_objects import *
else:
from .pipeline_latte import LattePipeline
else:
import sys
sys.modules[__name__] = _LazyModule(
__name__,
globals()["__file__"],
_import_structure,
module_spec=__spec__,
)
for name, value in _dummy_objects.items():
setattr(sys.modules[__name__], name, value)
This diff is collapsed.
......@@ -197,6 +197,21 @@ class Kandinsky3UNet(metaclass=DummyObject):
requires_backends(cls, ["torch"])
class LatteTransformer3DModel(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
@classmethod
def from_config(cls, *args, **kwargs):
requires_backends(cls, ["torch"])
@classmethod
def from_pretrained(cls, *args, **kwargs):
requires_backends(cls, ["torch"])
class LuminaNextDiT2DModel(metaclass=DummyObject):
_backends = ["torch"]
......
......@@ -677,6 +677,21 @@ class LatentConsistencyModelPipeline(metaclass=DummyObject):
requires_backends(cls, ["torch", "transformers"])
class LattePipeline(metaclass=DummyObject):
_backends = ["torch", "transformers"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch", "transformers"])
@classmethod
def from_config(cls, *args, **kwargs):
requires_backends(cls, ["torch", "transformers"])
@classmethod
def from_pretrained(cls, *args, **kwargs):
requires_backends(cls, ["torch", "transformers"])
class LDMTextToImagePipeline(metaclass=DummyObject):
_backends = ["torch", "transformers"]
......
# coding=utf-8
# Copyright 2024 Latte Team and HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import gc
import inspect
import tempfile
import unittest
import numpy as np
import torch
from transformers import AutoTokenizer, T5EncoderModel
from diffusers import (
AutoencoderKL,
DDIMScheduler,
LattePipeline,
LatteTransformer3DModel,
)
from diffusers.utils.testing_utils import (
enable_full_determinism,
numpy_cosine_similarity_distance,
require_torch_gpu,
slow,
torch_device,
)
from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS
from ..test_pipelines_common import PipelineTesterMixin, to_np
enable_full_determinism()
class LattePipelineFastTests(PipelineTesterMixin, unittest.TestCase):
pipeline_class = LattePipeline
params = TEXT_TO_IMAGE_PARAMS - {"cross_attention_kwargs"}
batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
image_params = TEXT_TO_IMAGE_IMAGE_PARAMS
image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS
required_optional_params = PipelineTesterMixin.required_optional_params
def get_dummy_components(self):
torch.manual_seed(0)
transformer = LatteTransformer3DModel(
sample_size=8,
num_layers=1,
patch_size=2,
attention_head_dim=8,
num_attention_heads=3,
caption_channels=32,
in_channels=4,
cross_attention_dim=24,
out_channels=8,
attention_bias=True,
activation_fn="gelu-approximate",
num_embeds_ada_norm=1000,
norm_type="ada_norm_single",
norm_elementwise_affine=False,
norm_eps=1e-6,
)
torch.manual_seed(0)
vae = AutoencoderKL()
scheduler = DDIMScheduler()
text_encoder = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")
tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5")
components = {
"transformer": transformer.eval(),
"vae": vae.eval(),
"scheduler": scheduler,
"text_encoder": text_encoder.eval(),
"tokenizer": tokenizer,
}
return components
def get_dummy_inputs(self, device, seed=0):
if str(device).startswith("mps"):
generator = torch.manual_seed(seed)
else:
generator = torch.Generator(device=device).manual_seed(seed)
inputs = {
"prompt": "A painting of a squirrel eating a burger",
"negative_prompt": "low quality",
"generator": generator,
"num_inference_steps": 2,
"guidance_scale": 5.0,
"height": 8,
"width": 8,
"video_length": 1,
"output_type": "pt",
"clean_caption": False,
}
return inputs
def test_inference(self):
device = "cpu"
components = self.get_dummy_components()
pipe = self.pipeline_class(**components)
pipe.to(device)
pipe.set_progress_bar_config(disable=None)
inputs = self.get_dummy_inputs(device)
video = pipe(**inputs).frames
generated_video = video[0]
self.assertEqual(generated_video.shape, (1, 3, 8, 8))
expected_video = torch.randn(1, 3, 8, 8)
max_diff = np.abs(generated_video - expected_video).max()
self.assertLessEqual(max_diff, 1e10)
def test_callback_inputs(self):
sig = inspect.signature(self.pipeline_class.__call__)
has_callback_tensor_inputs = "callback_on_step_end_tensor_inputs" in sig.parameters
has_callback_step_end = "callback_on_step_end" in sig.parameters
if not (has_callback_tensor_inputs and has_callback_step_end):
return
components = self.get_dummy_components()
pipe = self.pipeline_class(**components)
pipe = pipe.to(torch_device)
pipe.set_progress_bar_config(disable=None)
self.assertTrue(
hasattr(pipe, "_callback_tensor_inputs"),
f" {self.pipeline_class} should have `_callback_tensor_inputs` that defines a list of tensor variables its callback function can use as inputs",
)
def callback_inputs_subset(pipe, i, t, callback_kwargs):
# iterate over callback args
for tensor_name, tensor_value in callback_kwargs.items():
# check that we're only passing in allowed tensor inputs
assert tensor_name in pipe._callback_tensor_inputs
return callback_kwargs
def callback_inputs_all(pipe, i, t, callback_kwargs):
for tensor_name in pipe._callback_tensor_inputs:
assert tensor_name in callback_kwargs
# iterate over callback args
for tensor_name, tensor_value in callback_kwargs.items():
# check that we're only passing in allowed tensor inputs
assert tensor_name in pipe._callback_tensor_inputs
return callback_kwargs
inputs = self.get_dummy_inputs(torch_device)
# Test passing in a subset
inputs["callback_on_step_end"] = callback_inputs_subset
inputs["callback_on_step_end_tensor_inputs"] = ["latents"]
output = pipe(**inputs)[0]
# Test passing in a everything
inputs["callback_on_step_end"] = callback_inputs_all
inputs["callback_on_step_end_tensor_inputs"] = pipe._callback_tensor_inputs
output = pipe(**inputs)[0]
def callback_inputs_change_tensor(pipe, i, t, callback_kwargs):
is_last = i == (pipe.num_timesteps - 1)
if is_last:
callback_kwargs["latents"] = torch.zeros_like(callback_kwargs["latents"])
return callback_kwargs
inputs["callback_on_step_end"] = callback_inputs_change_tensor
inputs["callback_on_step_end_tensor_inputs"] = pipe._callback_tensor_inputs
output = pipe(**inputs)[0]
assert output.abs().sum() < 1e10
def test_inference_batch_single_identical(self):
self._test_inference_batch_single_identical(batch_size=3, expected_max_diff=1e-3)
def test_attention_slicing_forward_pass(self):
pass
def test_save_load_optional_components(self):
if not hasattr(self.pipeline_class, "_optional_components"):
return
components = self.get_dummy_components()
pipe = self.pipeline_class(**components)
for component in pipe.components.values():
if hasattr(component, "set_default_attn_processor"):
component.set_default_attn_processor()
pipe.to(torch_device)
pipe.set_progress_bar_config(disable=None)
inputs = self.get_dummy_inputs(torch_device)
prompt = inputs["prompt"]
generator = inputs["generator"]
(
prompt_embeds,
negative_prompt_embeds,
) = pipe.encode_prompt(prompt)
# inputs with prompt converted to embeddings
inputs = {
"prompt_embeds": prompt_embeds,
"negative_prompt": None,
"negative_prompt_embeds": negative_prompt_embeds,
"generator": generator,
"num_inference_steps": 2,
"guidance_scale": 5.0,
"height": 8,
"width": 8,
"video_length": 1,
"mask_feature": False,
"output_type": "pt",
"clean_caption": False,
}
# set all optional components to None
for optional_component in pipe._optional_components:
setattr(pipe, optional_component, None)
output = pipe(**inputs)[0]
with tempfile.TemporaryDirectory() as tmpdir:
pipe.save_pretrained(tmpdir, safe_serialization=False)
pipe_loaded = self.pipeline_class.from_pretrained(tmpdir)
pipe_loaded.to(torch_device)
for component in pipe_loaded.components.values():
if hasattr(component, "set_default_attn_processor"):
component.set_default_attn_processor()
pipe_loaded.set_progress_bar_config(disable=None)
for optional_component in pipe._optional_components:
self.assertTrue(
getattr(pipe_loaded, optional_component) is None,
f"`{optional_component}` did not stay set to None after loading.",
)
output_loaded = pipe_loaded(**inputs)[0]
max_diff = np.abs(to_np(output) - to_np(output_loaded)).max()
self.assertLess(max_diff, 1.0)
@slow
@require_torch_gpu
class LattePipelineIntegrationTests(unittest.TestCase):
prompt = "A painting of a squirrel eating a burger."
def setUp(self):
super().setUp()
gc.collect()
torch.cuda.empty_cache()
def tearDown(self):
super().tearDown()
gc.collect()
torch.cuda.empty_cache()
def test_latte(self):
generator = torch.Generator("cpu").manual_seed(0)
pipe = LattePipeline.from_pretrained("maxin-cn/Latte-1", torch_dtype=torch.float16)
pipe.enable_model_cpu_offload()
prompt = self.prompt
videos = pipe(
prompt=prompt,
height=512,
width=512,
generator=generator,
num_inference_steps=2,
clean_caption=False,
).frames
video = videos[0]
expected_video = torch.randn(1, 512, 512, 3).numpy()
max_diff = numpy_cosine_similarity_distance(video.flatten(), expected_video)
assert max_diff < 1e-3, f"Max diff is too high. got {video.flatten()}"
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment