Unverified Commit 405a1fac authored by Yuanhao Zhai, committed by GitHub

fix: enable unet_3d_condition to support time_cond_proj_dim (#7364)


Co-authored-by: Sayak Paul <spsayakpaul@gmail.com>
parent 3028089e
@@ -91,6 +91,8 @@ class UNet3DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin)
cross_attention_dim (`int`, *optional*, defaults to 1024): The dimension of the cross attention features.
attention_head_dim (`int`, *optional*, defaults to 64): The dimension of the attention heads.
num_attention_heads (`int`, *optional*): The number of attention heads.
time_cond_proj_dim (`int`, *optional*, defaults to `None`):
The dimension of `cond_proj` layer in the timestep embedding.
"""
_supports_gradient_checkpointing = False
@@ -123,6 +125,7 @@ class UNet3DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin)
cross_attention_dim: int = 1024,
attention_head_dim: Union[int, Tuple[int]] = 64,
num_attention_heads: Optional[Union[int, Tuple[int]]] = None,
time_cond_proj_dim: Optional[int] = None,
):
super().__init__()
@@ -174,6 +177,7 @@ class UNet3DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin)
timestep_input_dim,
time_embed_dim,
act_fn=act_fn,
cond_proj_dim=time_cond_proj_dim,
)
self.transformer_in = TransformerTemporalModel(
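For context, here is a minimal usage sketch (not part of this commit) of how the new `time_cond_proj_dim` argument could be exercised. The tiny model config, the `guidance_scale_embedding` helper, and the tensor shapes are illustrative assumptions; passing `timestep_cond` to `forward()` relies on the existing `UNet3DConditionModel` forward signature, which mirrors `UNet2DConditionModel`.

```python
# Illustrative sketch (not part of this commit): build a small UNet3DConditionModel
# with time_cond_proj_dim set, then pass a timestep conditioning embedding.
# The tiny config and the guidance-scale embedding helper are assumptions made
# for this example only.
import math

import torch
from diffusers import UNet3DConditionModel

unet = UNet3DConditionModel(
    sample_size=32,
    in_channels=4,
    out_channels=4,
    down_block_types=("CrossAttnDownBlock3D", "DownBlock3D"),
    up_block_types=("UpBlock3D", "CrossAttnUpBlock3D"),
    block_out_channels=(32, 64),
    layers_per_block=1,
    cross_attention_dim=32,
    attention_head_dim=8,
    time_cond_proj_dim=256,  # enables the cond_proj layer wired up by this change
)


def guidance_scale_embedding(w: torch.Tensor, embedding_dim: int) -> torch.Tensor:
    # Sinusoidal embedding of the guidance scale, in the style of guidance-embedded
    # (LCM-like) models; output shape is (batch, embedding_dim).
    half_dim = embedding_dim // 2
    freqs = torch.exp(torch.arange(half_dim, dtype=torch.float32) * -(math.log(10000.0) / half_dim))
    emb = w.float()[:, None] * freqs[None, :]
    return torch.cat([torch.sin(emb), torch.cos(emb)], dim=-1)


batch, frames = 1, 8
sample = torch.randn(batch, 4, frames, 32, 32)       # (B, C, F, H, W)
encoder_hidden_states = torch.randn(batch, 77, 32)   # matches cross_attention_dim
timestep_cond = guidance_scale_embedding(torch.tensor([7.5]), 256)

with torch.no_grad():
    out = unet(
        sample,
        timestep=10,
        encoder_hidden_states=encoder_hidden_states,
        timestep_cond=timestep_cond,  # projected by the new cond_proj layer
    ).sample

print(out.shape)  # expected: (1, 4, 8, 32, 32)
```

Before this change, setting `timestep_cond` had no effect for this model because `TimestepEmbedding` was built without a `cond_proj_dim`, so no `cond_proj` layer existed to consume it.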