Unverified commit 04047032 authored by Aryan, committed by GitHub

[refactor] Remove additional Flux code (#10881)



* update

* apply review suggestions

---------
Co-authored-by: Dhruv Nair <dhruv.nair@gmail.com>
parent 13f20c7f
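
With this change, defaulting to `FluxAttnProcessor2_0_NPU` on NPU devices is deprecated (removal targeted for 0.34.0 per the diff below); attention processors should instead be set explicitly via `set_attn_processor`. A minimal sketch of that explicit opt-in follows; the checkpoint id, subfolder, and dtype are illustrative assumptions, not part of this commit:

```python
# Sketch of the explicit opt-in requested by the new deprecation message.
# The checkpoint id, subfolder, and dtype below are assumptions for illustration.
import torch

from diffusers import FluxTransformer2DModel
from diffusers.models.attention_processor import FluxAttnProcessor2_0_NPU
from diffusers.utils.import_utils import is_torch_npu_available

transformer = FluxTransformer2DModel.from_pretrained(
    "black-forest-labs/FLUX.1-dev", subfolder="transformer", torch_dtype=torch.bfloat16
)

if is_torch_npu_available():
    # Explicitly switch every attention layer to the NPU processor instead of
    # relying on the deprecated implicit default.
    transformer.set_attn_processor(FluxAttnProcessor2_0_NPU())
```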
@@ -18,7 +18,6 @@ from typing import Any, Dict, Optional, Tuple, Union
 import numpy as np
 import torch
 import torch.nn as nn
-import torch.nn.functional as F
 
 from ...configuration_utils import ConfigMixin, register_to_config
 from ...loaders import FluxTransformer2DLoadersMixin, FromOriginalModelMixin, PeftAdapterMixin
@@ -32,7 +31,7 @@ from ...models.attention_processor import (
 )
 from ...models.modeling_utils import ModelMixin
 from ...models.normalization import AdaLayerNormContinuous, AdaLayerNormZero, AdaLayerNormZeroSingle
-from ...utils import USE_PEFT_BACKEND, logging, scale_lora_layers, unscale_lora_layers
+from ...utils import USE_PEFT_BACKEND, deprecate, logging, scale_lora_layers, unscale_lora_layers
 from ...utils.import_utils import is_torch_npu_available
 from ...utils.torch_utils import maybe_allow_in_graph
 from ..cache_utils import CacheMixin
@@ -45,20 +44,7 @@ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
 
 @maybe_allow_in_graph
 class FluxSingleTransformerBlock(nn.Module):
-    r"""
-    A Transformer block following the MMDiT architecture, introduced in Stable Diffusion 3.
-
-    Reference: https://arxiv.org/abs/2403.03206
-
-    Parameters:
-        dim (`int`): The number of channels in the input and output.
-        num_attention_heads (`int`): The number of heads to use for multi-head attention.
-        attention_head_dim (`int`): The number of channels in each head.
-        context_pre_only (`bool`): Boolean to determine if we should add some blocks associated with the
-            processing of `context` conditions.
-    """
-
-    def __init__(self, dim, num_attention_heads, attention_head_dim, mlp_ratio=4.0):
+    def __init__(self, dim: int, num_attention_heads: int, attention_head_dim: int, mlp_ratio: float = 4.0):
         super().__init__()
         self.mlp_hidden_dim = int(dim * mlp_ratio)
@@ -68,9 +54,15 @@ class FluxSingleTransformerBlock(nn.Module):
         self.proj_out = nn.Linear(dim + self.mlp_hidden_dim, dim)
 
         if is_torch_npu_available():
+            deprecation_message = (
+                "Defaulting to FluxAttnProcessor2_0_NPU for NPU devices will be removed. Attention processors "
+                "should be set explicitly using the `set_attn_processor` method."
+            )
+            deprecate("npu_processor", "0.34.0", deprecation_message)
             processor = FluxAttnProcessor2_0_NPU()
         else:
             processor = FluxAttnProcessor2_0()
+
         self.attn = Attention(
             query_dim=dim,
             cross_attention_dim=None,
@@ -113,39 +105,14 @@
 
 @maybe_allow_in_graph
 class FluxTransformerBlock(nn.Module):
-    r"""
-    A Transformer block following the MMDiT architecture, introduced in Stable Diffusion 3.
-
-    Reference: https://arxiv.org/abs/2403.03206
-
-    Args:
-        dim (`int`):
-            The embedding dimension of the block.
-        num_attention_heads (`int`):
-            The number of attention heads to use.
-        attention_head_dim (`int`):
-            The number of dimensions to use for each attention head.
-        qk_norm (`str`, defaults to `"rms_norm"`):
-            The normalization to use for the query and key tensors.
-        eps (`float`, defaults to `1e-6`):
-            The epsilon value to use for the normalization.
-    """
-
     def __init__(
         self, dim: int, num_attention_heads: int, attention_head_dim: int, qk_norm: str = "rms_norm", eps: float = 1e-6
     ):
         super().__init__()
 
         self.norm1 = AdaLayerNormZero(dim)
         self.norm1_context = AdaLayerNormZero(dim)
 
-        if hasattr(F, "scaled_dot_product_attention"):
-            processor = FluxAttnProcessor2_0()
-        else:
-            raise ValueError(
-                "The current PyTorch version does not support the `scaled_dot_product_attention` function."
-            )
-
         self.attn = Attention(
             query_dim=dim,
             cross_attention_dim=None,
@@ -155,7 +122,7 @@ class FluxTransformerBlock(nn.Module):
             out_dim=dim,
             context_pre_only=False,
             bias=True,
-            processor=processor,
+            processor=FluxAttnProcessor2_0(),
             qk_norm=qk_norm,
             eps=eps,
         )
@@ -166,10 +133,6 @@ class FluxTransformerBlock(nn.Module):
         self.norm2_context = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
         self.ff_context = FeedForward(dim=dim, dim_out=dim, activation_fn="gelu-approximate")
 
-        # let chunk size default to None
-        self._chunk_size = None
-        self._chunk_dim = 0
-
     def forward(
         self,
         hidden_states: torch.Tensor,