renzhc / diffusers_dcu

Commit 15f6b224, authored Feb 06, 2024 by sayakpaul

add attention_head_dim

parent e6fd9ada
Showing 2 changed files with 9 additions and 1 deletion:

src/diffusers/models/attention.py (+1, -0)
src/diffusers/models/unets/unet_i2vgen_xl.py (+8, -1)
src/diffusers/models/attention.py

@@ -158,6 +158,7 @@ class BasicTransformerBlock(nn.Module):
         super().__init__()
         self.only_cross_attention = only_cross_attention

+        # We keep these boolean flags for backwards-compatibility.
         self.use_ada_layer_norm_zero = (num_embeds_ada_norm is not None) and norm_type == "ada_norm_zero"
         self.use_ada_layer_norm = (num_embeds_ada_norm is not None) and norm_type == "ada_norm"
         self.use_ada_layer_norm_single = norm_type == "ada_norm_single"
src/diffusers/models/unets/unet_i2vgen_xl.py

@@ -120,6 +120,7 @@ class I2VGenXLUNet(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin):
         norm_num_groups (`int`, *optional*, defaults to 32): The number of groups to use for the normalization.
             If `None`, normalization and activation layers is skipped in post-processing.
         cross_attention_dim (`int`, *optional*, defaults to 1280): The dimension of the cross attention features.
+        attention_head_dim (`int`, *optional*, defaults to 8): The dimension of the attention heads.
         num_attention_heads (`int`, *optional*): The number of attention heads.
     """
@@ -147,10 +148,16 @@ class I2VGenXLUNet(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin):
         layers_per_block: int = 2,
         norm_num_groups: Optional[int] = 32,
         cross_attention_dim: int = 1024,
+        attention_head_dim: Union[int, Tuple[int]] = None,
         num_attention_heads: Optional[Union[int, Tuple[int]]] = 64,
     ):
         super().__init__()

+        # We didn't define `attention_head_dim` when we first integrated this UNet. As a result,
+        # we had to use `num_attention_heads` to pass values for arguments that actually denote
+        # attention head dimension. This is why we correct it here.
+        attention_head_dim = num_attention_heads or attention_head_dim
+
         # Check inputs
         if len(down_block_types) != len(up_block_types):
             raise ValueError(
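A quick standalone illustration of the precedence implied by the `or` fallback above (an assumed simplification, not diffusers code): any value arriving through the legacy `num_attention_heads` argument still wins, so existing call sites that used it to carry head dimensions keep their behaviour, and `attention_head_dim` only takes effect when `num_attention_heads` is explicitly `None`.

def resolve_head_dim(num_attention_heads=64, attention_head_dim=None):
    # Same expression as in __init__ above; `or` falls through only when
    # num_attention_heads is falsy (None or 0).
    return num_attention_heads or attention_head_dim

print(resolve_head_dim())                                                 # 64 (old default behaviour)
print(resolve_head_dim(num_attention_heads=32))                           # 32 (legacy call sites unchanged)
print(resolve_head_dim(num_attention_heads=None, attention_head_dim=16))  # 16 (new argument takes effect)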
@@ -172,7 +179,7 @@ class I2VGenXLUNet(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin):
         self.transformer_in = TransformerTemporalModel(
             num_attention_heads=8,
-            attention_head_dim=num_attention_heads,
+            attention_head_dim=attention_head_dim,
             in_channels=block_out_channels[0],
             num_layers=1,
             norm_num_groups=norm_num_groups,
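Putting the two hunks together, a hedged end-to-end sketch of how the corrected value now reaches the temporal transformer. The helper name `build_transformer_in_kwargs` and the default values in its signature are assumptions for illustration only; the real code constructs `TransformerTemporalModel` directly, as shown above.

def build_transformer_in_kwargs(num_attention_heads=64, attention_head_dim=None,
                                block_out_channels=(320, 640, 1280, 1280), norm_num_groups=32):
    # Fallback from the diff: legacy `num_attention_heads` values still carry the head dimension.
    attention_head_dim = num_attention_heads or attention_head_dim
    return dict(
        num_attention_heads=8,                  # fixed head count, as in the diff
        attention_head_dim=attention_head_dim,  # now correctly named: head dimension, not head count
        in_channels=block_out_channels[0],
        num_layers=1,
        norm_num_groups=norm_num_groups,
    )

print(build_transformer_in_kwargs())
# {'num_attention_heads': 8, 'attention_head_dim': 64, 'in_channels': 320, 'num_layers': 1, 'norm_num_groups': 32}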