modular_blocks.py

# Copyright 2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from ...utils import logging
from ..modular_pipeline import AutoPipelineBlocks, SequentialPipelineBlocks
from ..modular_pipeline_utils import InsertableDict
from .before_denoise import (
    FluxImg2ImgPrepareLatentsStep,
    FluxImg2ImgSetTimestepsStep,
    FluxKontextRoPEInputsStep,
    FluxPrepareLatentsStep,
    FluxRoPEInputsStep,
    FluxSetTimestepsStep,
)
from .decoders import FluxDecodeStep
from .denoise import FluxDenoiseStep, FluxKontextDenoiseStep
from .encoders import (
    FluxKontextProcessImagesInputStep,
    FluxProcessImagesInputStep,
    FluxTextEncoderStep,
    FluxVaeEncoderDynamicStep,
)
from .inputs import (
    FluxInputsDynamicStep,
    FluxKontextInputsDynamicStep,
    FluxKontextSetResolutionStep,
    FluxTextInputStep,
)


logger = logging.get_logger(__name__)  # pylint: disable=invalid-name


# vae encoder (run before before_denoise)
FluxImg2ImgVaeEncoderBlocks = InsertableDict(
    [("preprocess", FluxProcessImagesInputStep()), ("encode", FluxVaeEncoderDynamicStep())]
)


class FluxImg2ImgVaeEncoderStep(SequentialPipelineBlocks):
    model_name = "flux"

    block_classes = FluxImg2ImgVaeEncoderBlocks.values()
    block_names = FluxImg2ImgVaeEncoderBlocks.keys()

    @property
    def description(self) -> str:
        return "Vae encoder step that preprocess andencode the image inputs into their latent representations."


class FluxAutoVaeEncoderStep(AutoPipelineBlocks):
    block_classes = [FluxImg2ImgVaeEncoderStep]
    block_names = ["img2img"]
    block_trigger_inputs = ["image"]

    @property
    def description(self):
        return (
            "Vae encoder step that encode the image inputs into their latent representations.\n"
            + "This is an auto pipeline block that works for img2img tasks.\n"
            + " - `FluxImg2ImgVaeEncoderStep` (img2img) is used when only `image` is provided."
            + " - if `image` is not provided, step will be skipped."
        )


# Flux Kontext vae encoder (run before before_denoise)

FluxKontextVaeEncoderBlocks = InsertableDict(
    [("preprocess", FluxKontextProcessImagesInputStep()), ("encode", FluxVaeEncoderDynamicStep(sample_mode="argmax"))]
)


class FluxKontextVaeEncoderStep(SequentialPipelineBlocks):
    model_name = "flux-kontext"

    block_classes = FluxKontextVaeEncoderBlocks.values()
    block_names = FluxKontextVaeEncoderBlocks.keys()

    @property
    def description(self) -> str:
        return "Vae encoder step that preprocess andencode the image inputs into their latent representations."


class FluxKontextAutoVaeEncoderStep(AutoPipelineBlocks):
    block_classes = [FluxKontextVaeEncoderStep]
    block_names = ["img2img"]
    block_trigger_inputs = ["image"]

    @property
    def description(self):
        return (
            "Vae encoder step that encode the image inputs into their latent representations.\n"
            + "This is an auto pipeline block that works for img2img tasks.\n"
            + " - `FluxKontextVaeEncoderStep` (img2img) is used when only `image` is provided."
            + " - if `image` is not provided, step will be skipped."
        )


# before_denoise: text2img
FluxBeforeDenoiseBlocks = InsertableDict(
    [
        ("prepare_latents", FluxPrepareLatentsStep()),
        ("set_timesteps", FluxSetTimestepsStep()),
        ("prepare_rope_inputs", FluxRoPEInputsStep()),
    ]
)


class FluxBeforeDenoiseStep(SequentialPipelineBlocks):
    block_classes = FluxBeforeDenoiseBlocks.values()
    block_names = FluxBeforeDenoiseBlocks.keys()

    @property
    def description(self):
        return "Before denoise step that prepares the inputs for the denoise step in text-to-image generation."


# before_denoise: img2img
FluxImg2ImgBeforeDenoiseBlocks = InsertableDict(
    [
        ("prepare_latents", FluxPrepareLatentsStep()),
        ("set_timesteps", FluxImg2ImgSetTimestepsStep()),
        ("prepare_img2img_latents", FluxImg2ImgPrepareLatentsStep()),
        ("prepare_rope_inputs", FluxRoPEInputsStep()),
    ]
)


class FluxImg2ImgBeforeDenoiseStep(SequentialPipelineBlocks):
    block_classes = FluxImg2ImgBeforeDenoiseBlocks.values()
    block_names = FluxImg2ImgBeforeDenoiseBlocks.keys()

    @property
    def description(self):
        return "Before denoise step that prepare the inputs for the denoise step for img2img task."


# before_denoise: all task (text2img, img2img)
class FluxAutoBeforeDenoiseStep(AutoPipelineBlocks):
    model_name = "flux-kontext"
    block_classes = [FluxImg2ImgBeforeDenoiseStep, FluxBeforeDenoiseStep]
    block_names = ["img2img", "text2image"]
    block_trigger_inputs = ["image_latents", None]

    @property
    def description(self):
        return (
            "Before denoise step that prepare the inputs for the denoise step.\n"
            + "This is an auto pipeline block that works for text2image.\n"
            + " - `FluxBeforeDenoiseStep` (text2image) is used.\n"
            + " - `FluxImg2ImgBeforeDenoiseStep` (img2img) is used when only `image_latents` is provided.\n"
        )


# before_denoise: FluxKontext

FluxKontextBeforeDenoiseBlocks = InsertableDict(
    [
        ("prepare_latents", FluxPrepareLatentsStep()),
        ("set_timesteps", FluxSetTimestepsStep()),
        ("prepare_rope_inputs", FluxKontextRoPEInputsStep()),
    ]
)


class FluxKontextBeforeDenoiseStep(SequentialPipelineBlocks):
    block_classes = FluxKontextBeforeDenoiseBlocks.values()
    block_names = FluxKontextBeforeDenoiseBlocks.keys()

    @property
    def description(self):
        return (
            "Before denoise step that prepare the inputs for the denoise step\n"
            "for img2img/text2img task for Flux Kontext."
        )


class FluxKontextAutoBeforeDenoiseStep(AutoPipelineBlocks):
    block_classes = [FluxKontextBeforeDenoiseStep, FluxBeforeDenoiseStep]
    block_names = ["img2img", "text2image"]
    block_trigger_inputs = ["image_latents", None]

    @property
    def description(self):
        return (
            "Before denoise step that prepare the inputs for the denoise step.\n"
            + "This is an auto pipeline block that works for text2image.\n"
            + " - `FluxBeforeDenoiseStep` (text2image) is used.\n"
            + " - `FluxKontextBeforeDenoiseStep` (img2img) is used when only `image_latents` is provided.\n"
        )


# denoise: text2image
class FluxAutoDenoiseStep(AutoPipelineBlocks):
    block_classes = [FluxDenoiseStep]
    block_names = ["denoise"]
    block_trigger_inputs = [None]

    @property
    def description(self) -> str:
        return (
            "Denoise step that iteratively denoise the latents. "
            "This is a auto pipeline block that works for text2image and img2img tasks."
            " - `FluxDenoiseStep` (denoise) for text2image and img2img tasks."
        )


# denoise: Flux Kontext


class FluxKontextAutoDenoiseStep(AutoPipelineBlocks):
    block_classes = [FluxKontextDenoiseStep]
    block_names = ["denoise"]
    block_trigger_inputs = [None]

    @property
    def description(self) -> str:
        return (
            "Denoise step that iteratively denoise the latents for Flux Kontext. "
            "This is a auto pipeline block that works for text2image and img2img tasks."
            " - `FluxDenoiseStep` (denoise) for text2image and img2img tasks."
        )


# decode: all task (text2img, img2img)
class FluxAutoDecodeStep(AutoPipelineBlocks):
    block_classes = [FluxDecodeStep]
    block_names = ["non-inpaint"]
    block_trigger_inputs = [None]

    @property
    def description(self):
        return "Decode step that decode the denoised latents into image outputs.\n - `FluxDecodeStep`"


# inputs: text2image/img2img
FluxImg2ImgBlocks = InsertableDict(
    [("text_inputs", FluxTextInputStep()), ("additional_inputs", FluxInputsDynamicStep())]
)


class FluxImg2ImgInputStep(SequentialPipelineBlocks):
    model_name = "flux"
    block_classes = FluxImg2ImgBlocks.values()
    block_names = FluxImg2ImgBlocks.keys()

    @property
    def description(self):
        return "Input step that prepares the inputs for the img2img denoising step. It:\n"
        " - make sure the text embeddings have consistent batch size as well as the additional inputs (`image_latents`).\n"
        " - update height/width based `image_latents`, patchify `image_latents`."


class FluxAutoInputStep(AutoPipelineBlocks):
    block_classes = [FluxImg2ImgInputStep, FluxTextInputStep]
    block_names = ["img2img", "text2image"]
    block_trigger_inputs = ["image_latents", None]

    @property
    def description(self):
        return (
            "Input step that standardize the inputs for the denoising step, e.g. make sure inputs have consistent batch size, and patchified. \n"
            " This is an auto pipeline block that works for text2image/img2img tasks.\n"
            + " - `FluxImg2ImgInputStep` (img2img) is used when `image_latents` is provided.\n"
            + " - `FluxTextInputStep` (text2image) is used when `image_latents` are not provided.\n"
        )


# inputs: Flux Kontext

FluxKontextBlocks = InsertableDict(
    [
        ("set_resolution", FluxKontextSetResolutionStep()),
        ("text_inputs", FluxTextInputStep()),
        ("additional_inputs", FluxKontextInputsDynamicStep()),
    ]
)


class FluxKontextInputStep(SequentialPipelineBlocks):
    model_name = "flux-kontext"
    block_classes = FluxKontextBlocks.values()
    block_names = FluxKontextBlocks.keys()

    @property
    def description(self):
        return (
            "Input step that prepares the inputs for the both text2img and img2img denoising step. It:\n"
            " - make sure the text embeddings have consistent batch size as well as the additional inputs (`image_latents`).\n"
            " - update height/width based `image_latents`, patchify `image_latents`."
        )


class FluxKontextAutoInputStep(AutoPipelineBlocks):
    block_classes = [FluxKontextInputStep, FluxTextInputStep]
    # block_classes = [FluxKontextInputStep]
    block_names = ["img2img", "text2img"]
    # block_names = ["img2img"]
    block_trigger_inputs = ["image_latents", None]
    # block_trigger_inputs = ["image_latents"]

    @property
    def description(self):
        return (
            "Input step that standardize the inputs for the denoising step, e.g. make sure inputs have consistent batch size, and patchified. \n"
            " This is an auto pipeline block that works for text2image/img2img tasks.\n"
            + " - `FluxKontextInputStep` (img2img) is used when `image_latents` is provided.\n"
            + " - `FluxKontextInputStep` is also capable of handling text2image task when `image_latent` isn't present."
        )


class FluxCoreDenoiseStep(SequentialPipelineBlocks):
    model_name = "flux"
    block_classes = [FluxAutoInputStep, FluxAutoBeforeDenoiseStep, FluxAutoDenoiseStep]
    block_names = ["input", "before_denoise", "denoise"]

    @property
    def description(self):
        return (
            "Core step that performs the denoising process. \n"
            + " - `FluxAutoInputStep` (input) standardizes the inputs for the denoising step.\n"
            + " - `FluxAutoBeforeDenoiseStep` (before_denoise) prepares the inputs for the denoising step.\n"
            + " - `FluxAutoDenoiseStep` (denoise) iteratively denoises the latents.\n"
            + "This step supports text-to-image and image-to-image tasks for Flux:\n"
            + " - for image-to-image generation, you need to provide `image_latents`\n"
            + " - for text-to-image generation, all you need to provide is prompt embeddings."
        )


class FluxKontextCoreDenoiseStep(SequentialPipelineBlocks):
    model_name = "flux-kontext"
    block_classes = [FluxKontextAutoInputStep, FluxKontextAutoBeforeDenoiseStep, FluxKontextAutoDenoiseStep]
    block_names = ["input", "before_denoise", "denoise"]

    @property
    def description(self):
        return (
            "Core step that performs the denoising process. \n"
            + " - `FluxKontextAutoInputStep` (input) standardizes the inputs for the denoising step.\n"
            + " - `FluxKontextAutoBeforeDenoiseStep` (before_denoise) prepares the inputs for the denoising step.\n"
            + " - `FluxKontextAutoDenoiseStep` (denoise) iteratively denoises the latents.\n"
            + "This step supports text-to-image and image-to-image tasks for Flux:\n"
            + " - for image-to-image generation, you need to provide `image_latents`\n"
            + " - for text-to-image generation, all you need to provide is prompt embeddings."
        )


# Auto blocks (text2image and img2img)
AUTO_BLOCKS = InsertableDict(
    [
        ("text_encoder", FluxTextEncoderStep()),
        ("image_encoder", FluxAutoVaeEncoderStep()),
        ("denoise", FluxCoreDenoiseStep()),
        ("decode", FluxDecodeStep()),
    ]
)

AUTO_BLOCKS_KONTEXT = InsertableDict(
    [
        ("text_encoder", FluxTextEncoderStep()),
        ("image_encoder", FluxKontextAutoVaeEncoderStep()),
        ("denoise", FluxKontextCoreDenoiseStep()),
        ("decode", FluxDecodeStep()),
    ]
)


class FluxAutoBlocks(SequentialPipelineBlocks):
    model_name = "flux"

    block_classes = AUTO_BLOCKS.values()
    block_names = AUTO_BLOCKS.keys()

    @property
    def description(self):
        return (
            "Auto Modular pipeline for text-to-image and image-to-image using Flux.\n"
            + "- for text-to-image generation, all you need to provide is `prompt`\n"
            + "- for image-to-image generation, you need to provide either `image` or `image_latents`"
        )


class FluxKontextAutoBlocks(FluxAutoBlocks):
    model_name = "flux-kontext"

    block_classes = AUTO_BLOCKS_KONTEXT.values()
    block_names = AUTO_BLOCKS_KONTEXT.keys()


TEXT2IMAGE_BLOCKS = InsertableDict(
    [
        ("text_encoder", FluxTextEncoderStep()),
        ("input", FluxTextInputStep()),
        ("prepare_latents", FluxPrepareLatentsStep()),
        ("set_timesteps", FluxSetTimestepsStep()),
        ("prepare_rope_inputs", FluxRoPEInputsStep()),
        ("denoise", FluxDenoiseStep()),
        ("decode", FluxDecodeStep()),
    ]
)

IMAGE2IMAGE_BLOCKS = InsertableDict(
    [
        ("text_encoder", FluxTextEncoderStep()),
        ("vae_encoder", FluxVaeEncoderDynamicStep()),
        ("input", FluxImg2ImgInputStep()),
        ("prepare_latents", FluxPrepareLatentsStep()),
        ("set_timesteps", FluxImg2ImgSetTimestepsStep()),
        ("prepare_img2img_latents", FluxImg2ImgPrepareLatentsStep()),
        ("prepare_rope_inputs", FluxRoPEInputsStep()),
        ("denoise", FluxDenoiseStep()),
        ("decode", FluxDecodeStep()),
    ]
)

FLUX_KONTEXT_BLOCKS = InsertableDict(
    [
        ("text_encoder", FluxTextEncoderStep()),
        ("vae_encoder", FluxVaeEncoderDynamicStep(sample_mode="argmax")),
        ("input", FluxKontextInputStep()),
        ("prepare_latents", FluxPrepareLatentsStep()),
        ("set_timesteps", FluxSetTimestepsStep()),
        ("prepare_rope_inputs", FluxKontextRoPEInputsStep()),
        ("denoise", FluxKontextDenoiseStep()),
        ("decode", FluxDecodeStep()),
    ]
)

ALL_BLOCKS = {
    "text2image": TEXT2IMAGE_BLOCKS,
    "img2img": IMAGE2IMAGE_BLOCKS,
    "auto": AUTO_BLOCKS,
    "auto_kontext": AUTO_BLOCKS_KONTEXT,
    "kontext": FLUX_KONTEXT_BLOCKS,
}