# Copyright 2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from ...utils import logging
from ..modular_pipeline import AutoPipelineBlocks, SequentialPipelineBlocks
from ..modular_pipeline_utils import InsertableDict
18
19
20
21
from .before_denoise import (
    FluxImg2ImgPrepareLatentsStep,
    FluxImg2ImgSetTimestepsStep,
    FluxPrepareLatentsStep,
22
    FluxRoPEInputsStep,
23
24
    FluxSetTimestepsStep,
)
25
26
from .decoders import FluxDecodeStep
from .denoise import FluxDenoiseStep
27
28
from .encoders import FluxProcessImagesInputStep, FluxTextEncoderStep, FluxVaeEncoderDynamicStep
from .inputs import FluxInputsDynamicStep, FluxTextInputStep
29
30
31
32
33


logger = logging.get_logger(__name__)  # pylint: disable=invalid-name


# vae encoder (run before before_denoise)
# Ordered (name, block instance) pairs consumed by `FluxImg2ImgVaeEncoderStep`:
# the "preprocess" entry is declared before the "encode" entry.
FluxImg2ImgVaeEncoderBlocks = InsertableDict(
    [
        ("preprocess", FluxProcessImagesInputStep()),
        ("encode", FluxVaeEncoderDynamicStep()),
    ]
)


class FluxImg2ImgVaeEncoderStep(SequentialPipelineBlocks):
    """Sequential img2img VAE-encoder step assembled from `FluxImg2ImgVaeEncoderBlocks`."""

    model_name = "flux"

    # Block instances and their names, in the order declared in
    # `FluxImg2ImgVaeEncoderBlocks`.
    block_classes = FluxImg2ImgVaeEncoderBlocks.values()
    block_names = FluxImg2ImgVaeEncoderBlocks.keys()

    @property
    def description(self) -> str:
        """Return a one-line, human-readable summary of this step."""
        # The original text read "preprocess andencode" (missing space).
        return "Vae encoder step that preprocess and encode the image inputs into their latent representations."


54
class FluxAutoVaeEncoderStep(AutoPipelineBlocks):
    """Auto VAE-encoder step with a single candidate block.

    `FluxImg2ImgVaeEncoderStep` is registered under the name "img2img" with
    `image` as its trigger input.
    """

    block_classes = [FluxImg2ImgVaeEncoderStep]
    block_names = ["img2img"]
    block_trigger_inputs = ["image"]

    @property
    def description(self) -> str:
        """Return a multi-line summary with one bullet per dispatch case."""
        # Each bullet ends with "\n" so the two cases render on separate lines;
        # the original ran them together on a single line.
        return (
            "Vae encoder step that encode the image inputs into their latent representations.\n"
            + "This is an auto pipeline block that works for img2img tasks.\n"
            + " - `FluxImg2ImgVaeEncoderStep` (img2img) is used when only `image` is provided.\n"
            + " - if `image` is not provided, step will be skipped."
        )


# before_denoise: text2img
# Ordered (name, block instance) pairs consumed by `FluxBeforeDenoiseStep`.
# Only (name, block) tuples belong in this list; stray integer lines that had
# leaked into the literal were removed.
FluxBeforeDenoiseBlocks = InsertableDict(
    [
        ("prepare_latents", FluxPrepareLatentsStep()),
        ("set_timesteps", FluxSetTimestepsStep()),
        ("prepare_rope_inputs", FluxRoPEInputsStep()),
    ]
)


class FluxBeforeDenoiseStep(SequentialPipelineBlocks):
    """Sequential text-to-image before-denoise step assembled from `FluxBeforeDenoiseBlocks`."""

    # Block instances and their names, in the order declared in
    # `FluxBeforeDenoiseBlocks`.
    block_classes = FluxBeforeDenoiseBlocks.values()
    block_names = FluxBeforeDenoiseBlocks.keys()

    @property
    def description(self) -> str:
        """Return a one-line, human-readable summary of this step."""
        return "Before denoise step that prepares the inputs for the denoise step in text-to-image generation."
86
87


# before_denoise: img2img
# Ordered (name, block instance) pairs consumed by `FluxImg2ImgBeforeDenoiseStep`.
# Differs from the text2img list by using `FluxImg2ImgSetTimestepsStep` and by
# adding a "prepare_img2img_latents" entry after "set_timesteps".
FluxImg2ImgBeforeDenoiseBlocks = InsertableDict(
    [
        ("prepare_latents", FluxPrepareLatentsStep()),
        ("set_timesteps", FluxImg2ImgSetTimestepsStep()),
        ("prepare_img2img_latents", FluxImg2ImgPrepareLatentsStep()),
        ("prepare_rope_inputs", FluxRoPEInputsStep()),
    ]
)


99
class FluxImg2ImgBeforeDenoiseStep(SequentialPipelineBlocks):
    """Sequential img2img before-denoise step assembled from `FluxImg2ImgBeforeDenoiseBlocks`."""

    # Block instances and their names, in the order declared in
    # `FluxImg2ImgBeforeDenoiseBlocks`.
    block_classes = FluxImg2ImgBeforeDenoiseBlocks.values()
    block_names = FluxImg2ImgBeforeDenoiseBlocks.keys()

    @property
    def description(self) -> str:
        """Return a one-line, human-readable summary of this step."""
        return "Before denoise step that prepare the inputs for the denoise step for img2img task."
106
107
108


# before_denoise: all tasks (text2img, img2img)
class FluxAutoBeforeDenoiseStep(AutoPipelineBlocks):
    """Auto before-denoise step with an img2img and a text2image candidate.

    The three lists below are parallel: `FluxImg2ImgBeforeDenoiseStep` is
    named "img2img" and has `image_latents` as its trigger input, while
    `FluxBeforeDenoiseStep` is named "text2image" and has `None` as its
    trigger entry.
    """

    block_classes = [FluxImg2ImgBeforeDenoiseStep, FluxBeforeDenoiseStep]
    block_names = ["img2img", "text2image"]
    block_trigger_inputs = ["image_latents", None]

    @property
    def description(self) -> str:
        """Return a multi-line summary with one bullet per dispatch case."""
        # The summary names both registered branches; the original text claimed
        # the block only works for text2image although an img2img branch is
        # registered above.
        return (
            "Before denoise step that prepare the inputs for the denoise step.\n"
            + "This is an auto pipeline block that works for text2image and img2img tasks.\n"
            + " - `FluxBeforeDenoiseStep` (text2image) is used when `image_latents` is not provided.\n"
            + " - `FluxImg2ImgBeforeDenoiseStep` (img2img) is used when `image_latents` is provided.\n"
        )


# denoise: all tasks (text2img, img2img)
class FluxAutoDenoiseStep(AutoPipelineBlocks):
    """Auto denoise step with a single candidate, `FluxDenoiseStep`, and a `None` trigger entry."""

    block_classes = [FluxDenoiseStep]
    block_names = ["denoise"]
    block_trigger_inputs = [None]

    @property
    def description(self) -> str:
        """Return a multi-line summary of the denoise step."""
        # The adjacent literals are concatenated by the parser, so every line
        # carries its own "\n"; the original had none and the bullet ran
        # straight into the preceding sentence.
        return (
            "Denoise step that iteratively denoise the latents.\n"
            "This is an auto pipeline block that works for text2image and img2img tasks.\n"
            " - `FluxDenoiseStep` (denoise) for text2image and img2img tasks."
        )


# decode: all tasks (text2img, img2img)
class FluxAutoDecodeStep(AutoPipelineBlocks):
    """Auto decode step with a single candidate, `FluxDecodeStep`, and a `None` trigger entry."""

    block_classes = [FluxDecodeStep]
    block_names = ["non-inpaint"]
    block_trigger_inputs = [None]

    @property
    def description(self) -> str:
        """Return a short summary naming the wrapped decode block."""
        return "Decode step that decode the denoised latents into image outputs.\n - `FluxDecodeStep`"
148
149


# inputs: text2image/img2img
# Ordered (name, block instance) pairs consumed by `FluxImg2ImgInputStep`:
# the text input block first, then the additional (dynamic) inputs block.
FluxImg2ImgBlocks = InsertableDict(
    [
        ("text_inputs", FluxTextInputStep()),
        ("additional_inputs", FluxInputsDynamicStep()),
    ]
)


class FluxImg2ImgInputStep(SequentialPipelineBlocks):
    """Sequential img2img input step assembled from `FluxImg2ImgBlocks`."""

    model_name = "flux"

    # Block instances and their names, in the order declared in
    # `FluxImg2ImgBlocks`.
    block_classes = FluxImg2ImgBlocks.values()
    block_names = FluxImg2ImgBlocks.keys()

    @property
    def description(self) -> str:
        """Return the full multi-line summary of this step."""
        # The parentheses are required: without them `return` ends at the first
        # literal and the two bullet lines become unreachable expression
        # statements, so callers only ever saw the first line.
        return (
            "Input step that prepares the inputs for the img2img denoising step. It:\n"
            " - make sure the text embeddings have consistent batch size as well as the additional inputs (`image_latents`).\n"
            " - update height/width based `image_latents`, patchify `image_latents`."
        )


class FluxImageAutoInputStep(AutoPipelineBlocks):
    """Auto input step with an img2img and a text2image candidate.

    The three lists below are parallel: `FluxImg2ImgInputStep` is named
    "img2img" and has `image_latents` as its trigger input, while
    `FluxTextInputStep` is named "text2image" and has `None` as its trigger
    entry.
    """

    block_classes = [FluxImg2ImgInputStep, FluxTextInputStep]
    block_names = ["img2img", "text2image"]
    block_trigger_inputs = ["image_latents", None]

    @property
    def description(self):
        """Return a multi-line summary with one bullet per dispatch case."""
        summary_lines = (
            "Input step that standardize the inputs for the denoising step, e.g. make sure inputs have consistent batch size, and patchified. \n",
            " This is an auto pipeline block that works for text2image/img2img tasks.\n",
            " - `FluxImg2ImgInputStep` (img2img) is used when `image_latents` is provided.\n",
            " - `FluxTextInputStep` (text2image) is used when `image_latents` are not provided.\n",
        )
        return "".join(summary_lines)


183
class FluxCoreDenoiseStep(SequentialPipelineBlocks):
    """Sequential core step chaining the auto input, before-denoise and denoise blocks.

    Unlike the dict-backed steps in this module, `block_classes` here lists
    the block classes themselves rather than instances.
    """

    model_name = "flux"
    block_classes = [FluxImageAutoInputStep, FluxAutoBeforeDenoiseStep, FluxAutoDenoiseStep]
    block_names = ["input", "before_denoise", "denoise"]

    @property
    def description(self) -> str:
        """Return a multi-line summary listing the sub-blocks and supported tasks."""
        return (
            "Core step that performs the denoising process. \n"
            + " - `FluxImageAutoInputStep` (input) standardizes the inputs for the denoising step.\n"
            + " - `FluxAutoBeforeDenoiseStep` (before_denoise) prepares the inputs for the denoising step.\n"
            + " - `FluxAutoDenoiseStep` (denoise) iteratively denoises the latents.\n"
            + "This step supports text-to-image and image-to-image tasks for Flux:\n"
            + " - for image-to-image generation, you need to provide `image_latents`\n"
            + " - for text-to-image generation, all you need to provide is prompt embeddings."
        )


# Auto blocks (text2image and img2img)
# Ordered (name, block instance) pairs consumed by `FluxAutoBlocks` and exposed
# under the "auto" key of `ALL_BLOCKS`. Only (name, block) tuples belong in
# this list; stray integer lines that had leaked into the literal were removed.
AUTO_BLOCKS = InsertableDict(
    [
        ("text_encoder", FluxTextEncoderStep()),
        ("image_encoder", FluxAutoVaeEncoderStep()),
        ("denoise", FluxCoreDenoiseStep()),
        ("decode", FluxDecodeStep()),
    ]
)


class FluxAutoBlocks(SequentialPipelineBlocks):
    """Top-level sequential Flux pipeline assembled from `AUTO_BLOCKS`."""

    model_name = "flux"

    # Block instances and their names, in the order declared in `AUTO_BLOCKS`.
    block_classes = AUTO_BLOCKS.values()
    block_names = AUTO_BLOCKS.keys()

    @property
    def description(self) -> str:
        """Return a multi-line summary of the inputs each task requires."""
        return (
            "Auto Modular pipeline for text-to-image and image-to-image using Flux.\n"
            + "- for text-to-image generation, all you need to provide is `prompt`\n"
            + "- for image-to-image generation, you need to provide either `image` or `image_latents`"
        )


# Flat, ordered text-to-image preset exposed under the "text2image" key of
# `ALL_BLOCKS`. Only (name, block) tuples belong in this list; stray integer
# lines that had leaked into the literal were removed.
TEXT2IMAGE_BLOCKS = InsertableDict(
    [
        ("text_encoder", FluxTextEncoderStep()),
        ("input", FluxTextInputStep()),
        ("prepare_latents", FluxPrepareLatentsStep()),
        ("set_timesteps", FluxSetTimestepsStep()),
        ("prepare_rope_inputs", FluxRoPEInputsStep()),
        ("denoise", FluxDenoiseStep()),
        ("decode", FluxDecodeStep()),
    ]
)

239
240
# Flat, ordered image-to-image preset exposed under the "img2img" key of
# `ALL_BLOCKS`. Only (name, block) tuples belong in this list; stray integer
# lines that had leaked into the literal were removed.
# NOTE(review): unlike `FluxImg2ImgVaeEncoderBlocks`, this preset registers
# `FluxVaeEncoderDynamicStep` without a preceding `FluxProcessImagesInputStep`
# entry - confirm that skipping the preprocess step here is intended.
IMAGE2IMAGE_BLOCKS = InsertableDict(
    [
        ("text_encoder", FluxTextEncoderStep()),
        ("vae_encoder", FluxVaeEncoderDynamicStep()),
        ("input", FluxImg2ImgInputStep()),
        ("prepare_latents", FluxPrepareLatentsStep()),
        ("set_timesteps", FluxImg2ImgSetTimestepsStep()),
        ("prepare_img2img_latents", FluxImg2ImgPrepareLatentsStep()),
        ("prepare_rope_inputs", FluxRoPEInputsStep()),
        ("denoise", FluxDenoiseStep()),
        ("decode", FluxDecodeStep()),
    ]
)
252

253
# Registry mapping a preset name to its ordered block collection.
ALL_BLOCKS = {
    "text2image": TEXT2IMAGE_BLOCKS,
    "img2img": IMAGE2IMAGE_BLOCKS,
    "auto": AUTO_BLOCKS,
}