Unverified Commit 76d4e416 authored by YiYi Xu, committed by GitHub

[modular] some small fix (#12307)

* fix

* add mellon node registry

* style

* update docstring to include more info!

* support custom node mellon

* HTTPError -> HfHubHTTPError

* up

* Update src/diffusers/modular_pipelines/qwenimage/node_utils.py
parent c07fcf78
@@ -32,6 +32,8 @@ class FluxModularPipeline(ModularPipeline, FluxLoraLoaderMixin, TextualInversion
     </Tip>
     """

+    default_blocks_name = "FluxAutoBlocks"
+
     @property
     def default_height(self):
         return self.default_sample_size * self.vae_scale_factor
This diff is collapsed.
@@ -51,6 +51,7 @@ if is_accelerate_available():

 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name

+# map regular pipeline to modular pipeline class name
 MODULAR_PIPELINE_MAPPING = OrderedDict(
     [
         ("stable-diffusion-xl", "StableDiffusionXLModularPipeline"),
@@ -61,16 +62,6 @@ MODULAR_PIPELINE_MAPPING = OrderedDict(
     ]
 )

-MODULAR_PIPELINE_BLOCKS_MAPPING = OrderedDict(
-    [
-        ("StableDiffusionXLModularPipeline", "StableDiffusionXLAutoBlocks"),
-        ("WanModularPipeline", "WanAutoBlocks"),
-        ("FluxModularPipeline", "FluxAutoBlocks"),
-        ("QwenImageModularPipeline", "QwenImageAutoBlocks"),
-        ("QwenImageEditModularPipeline", "QwenImageEditAutoBlocks"),
-    ]
-)

 @dataclass
 class PipelineState:
@@ -423,7 +414,7 @@ class ModularPipelineBlocks(ConfigMixin, PushToHubMixin):
                 state.set(input_param.name, param, input_param.kwargs_type)
             elif input_param.kwargs_type:
-                # if it is a kwargs type, e.g. "guider_input_fields", it is likely to be a list of parameters
+                # if it is a kwargs type, e.g. "denoiser_input_fields", it is likely to be a list of parameters
                 # we need to first find out which inputs are and loop through them.
                 intermediate_kwargs = state.get_by_kwargs(input_param.kwargs_type)
                 for param_name, current_value in intermediate_kwargs.items():
@@ -1454,6 +1445,7 @@ class ModularPipeline(ConfigMixin, PushToHubMixin):

     config_name = "modular_model_index.json"
     hf_device_map = None
+    default_blocks_name = None

     # YiYi TODO: add warning for passing multiple ComponentSpec/ConfigSpec with the same name
     def __init__(
@@ -1514,7 +1506,7 @@ class ModularPipeline(ConfigMixin, PushToHubMixin):
             `_blocks_class_name` in the config dict
         """
         if blocks is None:
-            blocks_class_name = MODULAR_PIPELINE_BLOCKS_MAPPING.get(self.__class__.__name__)
+            blocks_class_name = self.default_blocks_name
             if blocks_class_name is not None:
                 diffusers_module = importlib.import_module("diffusers")
                 blocks_class = getattr(diffusers_module, blocks_class_name)
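The net effect of the two hunks above: block resolution moves from a hard-coded global registry to a per-class attribute, so any `ModularPipeline` subclass (including third-party ones) can declare its own default blocks. A minimal sketch of the new lookup; `MyPipeline` is a hypothetical stand-in, not part of the diff:

```python
# Sketch of the attribute-based resolution introduced above.
import importlib


class MyPipeline:  # stand-in for a ModularPipeline subclass
    default_blocks_name = "FluxAutoBlocks"

    def resolve_blocks(self, blocks=None):
        # With blocks=None, the class consults its own default_blocks_name
        # instead of the removed MODULAR_PIPELINE_BLOCKS_MAPPING.
        if blocks is None and self.default_blocks_name is not None:
            diffusers_module = importlib.import_module("diffusers")
            blocks_class = getattr(diffusers_module, self.default_blocks_name)
            blocks = blocks_class()
        return blocks
```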
This diff is collapsed.
@@ -577,9 +577,8 @@ class QwenImageEditRoPEInputsStep(ModularPipelineBlocks):
     def inputs(self) -> List[InputParam]:
         return [
             InputParam(name="batch_size", required=True),
-            InputParam(
-                name="resized_image", required=True, type_hint=torch.Tensor, description="The resized image input"
-            ),
+            InputParam(name="image_height", required=True),
+            InputParam(name="image_width", required=True),
             InputParam(name="height", required=True),
             InputParam(name="width", required=True),
             InputParam(name="prompt_embeds_mask"),
@@ -612,10 +611,6 @@ class QwenImageEditRoPEInputsStep(ModularPipelineBlocks):
         block_state = self.get_block_state(state)

         # for edit, image size can be different from the target size (height/width)
-        image = (
-            block_state.resized_image[0] if isinstance(block_state.resized_image, list) else block_state.resized_image
-        )
-        image_width, image_height = image.size

         block_state.img_shapes = [
             [
@@ -624,7 +619,11 @@ class QwenImageEditRoPEInputsStep(ModularPipelineBlocks):
                     block_state.height // components.vae_scale_factor // 2,
                     block_state.width // components.vae_scale_factor // 2,
                 ),
-                (1, image_height // components.vae_scale_factor // 2, image_width // components.vae_scale_factor // 2),
+                (
+                    1,
+                    block_state.image_height // components.vae_scale_factor // 2,
+                    block_state.image_width // components.vae_scale_factor // 2,
+                ),
             ]
         ] * block_state.batch_size
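The RoPE step now reads precomputed `image_height`/`image_width` from the pipeline state instead of re-measuring the resized PIL image. A worked example of the shape math, with `vae_scale_factor` and the sizes chosen purely for illustration (the extra `// 2` is the transformer patch size):

```python
# Illustrative values only; this mirrors the img_shapes computation above.
vae_scale_factor = 8
height, width = 1024, 1024             # target generation size
image_height, image_width = 800, 1328  # now read from block_state
batch_size = 2

img_shapes = [
    [
        (1, height // vae_scale_factor // 2, width // vae_scale_factor // 2),
        (1, image_height // vae_scale_factor // 2, image_width // vae_scale_factor // 2),
    ]
] * batch_size
print(img_shapes[0])  # [(1, 64, 64), (1, 50, 83)]
```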
@@ -496,7 +496,7 @@ class QwenImageEditTextEncoderStep(ModularPipelineBlocks):
             )

         if components.requires_unconditional_embeds:
-            negative_prompt = block_state.negative_prompt or ""
+            negative_prompt = block_state.negative_prompt or " "
             block_state.negative_prompt_embeds, block_state.negative_prompt_embeds_mask = get_qwen_prompt_embeds_edit(
                 components.text_encoder,
                 components.processor,
@@ -307,6 +307,13 @@ class QwenImageInputsDynamicStep(ModularPipelineBlocks):

         return inputs

+    @property
+    def intermediate_outputs(self) -> List[OutputParam]:
+        return [
+            OutputParam(name="image_height", type_hint=int, description="The height of the image latents"),
+            OutputParam(name="image_width", type_hint=int, description="The width of the image latents"),
+        ]
+
     @property
     def expected_components(self) -> List[ComponentSpec]:
         return [
@@ -327,6 +334,11 @@ class QwenImageInputsDynamicStep(ModularPipelineBlocks):
             block_state.height = block_state.height or height
             block_state.width = block_state.width or width

+            if not hasattr(block_state, "image_height"):
+                block_state.image_height = height
+            if not hasattr(block_state, "image_width"):
+                block_state.image_width = width
+
             # 2. Patchify the image latent tensor
             image_latent_tensor = components.pachifier.pack_latents(image_latent_tensor)
@@ -511,17 +511,42 @@ class QwenImageAutoDecodeStep(AutoPipelineBlocks):
     )


+class QwenImageCoreDenoiseStep(SequentialPipelineBlocks):
+    model_name = "qwenimage"
+    block_classes = [
+        QwenImageAutoInputStep,
+        QwenImageOptionalControlNetInputStep,
+        QwenImageAutoBeforeDenoiseStep,
+        QwenImageOptionalControlNetBeforeDenoiseStep,
+        QwenImageAutoDenoiseStep,
+    ]
+    block_names = ["input", "controlnet_input", "before_denoise", "controlnet_before_denoise", "denoise"]
+
+    @property
+    def description(self):
+        return (
+            "Core step that performs the denoising process.\n"
+            + " - `QwenImageAutoInputStep` (input) standardizes the inputs for the denoising step.\n"
+            + " - `QwenImageOptionalControlNetInputStep` (controlnet_input) prepares the controlnet input.\n"
+            + " - `QwenImageAutoBeforeDenoiseStep` (before_denoise) prepares the inputs for the denoising step.\n"
+            + " - `QwenImageOptionalControlNetBeforeDenoiseStep` (controlnet_before_denoise) prepares the controlnet input for the denoising step.\n"
+            + " - `QwenImageAutoDenoiseStep` (denoise) iteratively denoises the latents.\n\n"
+            + "This step supports text-to-image, image-to-image, inpainting, and controlnet tasks for QwenImage:\n"
+            + " - for image-to-image generation, you need to provide `image_latents`\n"
+            + " - for inpainting, you need to provide `processed_mask_image` and `image_latents`\n"
+            + " - to run the controlnet workflow, you need to provide `control_image_latents`\n"
+            + " - for text-to-image generation, all you need to provide is prompt embeddings"
+        )
+
+
 ## 1.10 QwenImage/auto block & presets

 AUTO_BLOCKS = InsertableDict(
     [
         ("text_encoder", QwenImageTextEncoderStep()),
         ("vae_encoder", QwenImageAutoVaeEncoderStep()),
         ("controlnet_vae_encoder", QwenImageOptionalControlNetVaeEncoderStep()),
-        ("input", QwenImageAutoInputStep()),
-        ("controlnet_input", QwenImageOptionalControlNetInputStep()),
-        ("before_denoise", QwenImageAutoBeforeDenoiseStep()),
-        ("controlnet_before_denoise", QwenImageOptionalControlNetBeforeDenoiseStep()),
-        ("denoise", QwenImageAutoDenoiseStep()),
+        ("denoise", QwenImageCoreDenoiseStep()),
         ("decode", QwenImageAutoDecodeStep()),
     ]
 )
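With the five denoise-related steps folded into `QwenImageCoreDenoiseStep`, the preset exposes just five top-level entries. A small inspection sketch, assuming `InsertableDict` iterates like an ordered dict:

```python
# Inspect the reorganized preset; this only reads objects defined above.
core = QwenImageCoreDenoiseStep()
print(core.block_names)
# ['input', 'controlnet_input', 'before_denoise', 'controlnet_before_denoise', 'denoise']

for name, block in AUTO_BLOCKS.items():
    print(f"{name}: {type(block).__name__}")
# text_encoder: QwenImageTextEncoderStep
# vae_encoder: QwenImageAutoVaeEncoderStep
# controlnet_vae_encoder: QwenImageOptionalControlNetVaeEncoderStep
# denoise: QwenImageCoreDenoiseStep
# decode: QwenImageAutoDecodeStep
```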
@@ -699,7 +724,7 @@ class QwenImageEditAutoVaeEncoderStep(AutoPipelineBlocks):

 class QwenImageEditAutoInputStep(AutoPipelineBlocks):
     block_classes = [QwenImageInpaintInputStep, QwenImageEditInputStep]
     block_names = ["edit_inpaint", "edit"]
-    block_trigger_inputs = ["processed_mask_image", "image"]
+    block_trigger_inputs = ["processed_mask_image", "image_latents"]

     @property
     def description(self):
@@ -800,13 +825,34 @@ class QwenImageEditAutoDenoiseStep(AutoPipelineBlocks):

 ## 2.7 QwenImage-Edit/auto blocks & presets

+class QwenImageEditCoreDenoiseStep(SequentialPipelineBlocks):
+    model_name = "qwenimage-edit"
+    block_classes = [
+        QwenImageEditAutoInputStep,
+        QwenImageEditAutoBeforeDenoiseStep,
+        QwenImageEditAutoDenoiseStep,
+    ]
+    block_names = ["input", "before_denoise", "denoise"]
+
+    @property
+    def description(self):
+        return (
+            "Core step that performs the denoising process.\n"
+            + " - `QwenImageEditAutoInputStep` (input) standardizes the inputs for the denoising step.\n"
+            + " - `QwenImageEditAutoBeforeDenoiseStep` (before_denoise) prepares the inputs for the denoising step.\n"
+            + " - `QwenImageEditAutoDenoiseStep` (denoise) iteratively denoises the latents.\n\n"
+            + "This step supports the edit (img2img) and edit inpainting workflows for QwenImage Edit:\n"
+            + " - When `processed_mask_image` is provided, it will be used for the edit inpainting task.\n"
+            + " - When `image_latents` is provided, it will be used for the edit (img2img) task.\n"
+        )
+
+
 EDIT_AUTO_BLOCKS = InsertableDict(
     [
         ("text_encoder", QwenImageEditVLEncoderStep()),
         ("vae_encoder", QwenImageEditAutoVaeEncoderStep()),
-        ("input", QwenImageEditAutoInputStep()),
-        ("before_denoise", QwenImageEditAutoBeforeDenoiseStep()),
-        ("denoise", QwenImageEditAutoDenoiseStep()),
+        ("denoise", QwenImageEditCoreDenoiseStep()),
         ("decode", QwenImageAutoDecodeStep()),
     ]
 )
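The trigger change above (`"image"` → `"image_latents"`) matters because the VAE encoder now runs before the core denoise step, so dispatch keys off the encoded latents rather than the raw image. A toy sketch of the first-match dispatch that `AutoPipelineBlocks` performs (simplified, not the real implementation):

```python
# Toy first-match dispatch: the first trigger input present in the state
# selects the sub-block; None values count as absent.
block_names = ["edit_inpaint", "edit"]
block_trigger_inputs = ["processed_mask_image", "image_latents"]


def select_block(state):
    for name, trigger in zip(block_names, block_trigger_inputs):
        if state.get(trigger) is not None:
            return name
    return None


print(select_block({"image_latents": "..."}))  # edit
print(select_block({"processed_mask_image": "...", "image_latents": "..."}))  # edit_inpaint
```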
@@ -104,6 +104,8 @@ class QwenImageModularPipeline(ModularPipeline, QwenImageLoraLoaderMixin):
     </Tip>
     """

+    default_blocks_name = "QwenImageAutoBlocks"
+
     @property
     def default_height(self):
         return self.default_sample_size * self.vae_scale_factor
@@ -158,6 +160,8 @@ class QwenImageEditModularPipeline(ModularPipeline, QwenImageLoraLoaderMixin):
     </Tip>
     """

+    default_blocks_name = "QwenImageEditAutoBlocks"
+
     # YiYi TODO: qwen edit should not provide default height/width, should be derived from the resized input image (after adjustment) produced by the resize step.
     @property
     def default_height(self):
# Copyright 2025 Qwen-Image Team and The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# mellon nodes
QwenImage_NODE_TYPES_PARAMS_MAP = {
    "controlnet": {
        "inputs": [
            "control_image",
            "controlnet_conditioning_scale",
            "control_guidance_start",
            "control_guidance_end",
            "height",
            "width",
        ],
        "model_inputs": [
            "controlnet",
            "vae",
        ],
        "outputs": [
            "controlnet_out",
        ],
        "block_names": ["controlnet_vae_encoder"],
    },
    "denoise": {
        "inputs": [
            "embeddings",
            "width",
            "height",
            "seed",
            "num_inference_steps",
            "guidance_scale",
            "image_latents",
            "strength",
            "controlnet",
        ],
        "model_inputs": [
            "unet",
            "guider",
            "scheduler",
        ],
        "outputs": [
            "latents",
            "latents_preview",
        ],
        "block_names": ["denoise"],
    },
    "vae_encoder": {
        "inputs": [
            "image",
            "width",
            "height",
        ],
        "model_inputs": [
            "vae",
        ],
        "outputs": [
            "image_latents",
        ],
    },
    "text_encoder": {
        "inputs": [
            "prompt",
            "negative_prompt",
        ],
        "model_inputs": [
            "text_encoders",
        ],
        "outputs": [
            "embeddings",
        ],
    },
    "decoder": {
        "inputs": [
            "latents",
        ],
        "model_inputs": [
            "vae",
        ],
        "outputs": [
            "images",
        ],
    },
}
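Since the registry is a plain dict, downstream UI code can derive node sockets directly from it. A small read-only consumer sketch (note the `vae_encoder` entry defines no `block_names`, hence the `.get`):

```python
# Read-only sketch over the registry defined above.
for node_type, params in QwenImage_NODE_TYPES_PARAMS_MAP.items():
    inputs = params["inputs"] + params["model_inputs"]
    blocks = params.get("block_names", [])
    print(f"{node_type}: {len(inputs)} inputs -> {params['outputs']} (blocks: {blocks})")
```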
@@ -262,37 +262,37 @@ class StableDiffusionXLInputStep(ModularPipelineBlocks):
             OutputParam(
                 "prompt_embeds",
                 type_hint=torch.Tensor,
-                kwargs_type="guider_input_fields",  # already in intermedites state but declare here again for guider_input_fields
+                kwargs_type="denoiser_input_fields",  # already in intermediates state but declare here again for denoiser_input_fields
                 description="text embeddings used to guide the image generation",
             ),
             OutputParam(
                 "negative_prompt_embeds",
                 type_hint=torch.Tensor,
-                kwargs_type="guider_input_fields",  # already in intermedites state but declare here again for guider_input_fields
+                kwargs_type="denoiser_input_fields",  # already in intermediates state but declare here again for denoiser_input_fields
                 description="negative text embeddings used to guide the image generation",
             ),
             OutputParam(
                 "pooled_prompt_embeds",
                 type_hint=torch.Tensor,
-                kwargs_type="guider_input_fields",  # already in intermedites state but declare here again for guider_input_fields
+                kwargs_type="denoiser_input_fields",  # already in intermediates state but declare here again for denoiser_input_fields
                 description="pooled text embeddings used to guide the image generation",
             ),
             OutputParam(
                 "negative_pooled_prompt_embeds",
                 type_hint=torch.Tensor,
-                kwargs_type="guider_input_fields",  # already in intermedites state but declare here again for guider_input_fields
+                kwargs_type="denoiser_input_fields",  # already in intermediates state but declare here again for denoiser_input_fields
                 description="negative pooled text embeddings used to guide the image generation",
             ),
             OutputParam(
                 "ip_adapter_embeds",
                 type_hint=List[torch.Tensor],
-                kwargs_type="guider_input_fields",  # already in intermedites state but declare here again for guider_input_fields
+                kwargs_type="denoiser_input_fields",  # already in intermediates state but declare here again for denoiser_input_fields
                 description="image embeddings for IP-Adapter",
             ),
             OutputParam(
                 "negative_ip_adapter_embeds",
                 type_hint=List[torch.Tensor],
-                kwargs_type="guider_input_fields",  # already in intermedites state but declare here again for guider_input_fields
+                kwargs_type="denoiser_input_fields",  # already in intermediates state but declare here again for denoiser_input_fields
                 description="negative image embeddings for IP-Adapter",
             ),
         ]
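The `guider_input_fields` → `denoiser_input_fields` rename is mechanical, but the mechanism is worth spelling out: every parameter tagged with the same `kwargs_type` can be pulled out of the pipeline state as one group via `get_by_kwargs` (see the `modular_pipeline.py` hunk earlier). A toy sketch of that grouping, with the state class heavily simplified:

```python
# Simplified stand-in for PipelineState, showing how kwargs_type tagging lets
# the denoiser fetch all of its conditioning fields in one call.
class ToyState:
    def __init__(self):
        self._values = {}
        self._kwargs_types = {}

    def set(self, name, value, kwargs_type=None):
        self._values[name] = value
        self._kwargs_types[name] = kwargs_type

    def get_by_kwargs(self, kwargs_type):
        return {
            name: value
            for name, value in self._values.items()
            if self._kwargs_types[name] == kwargs_type
        }


state = ToyState()
state.set("prompt_embeds", "<tensor>", kwargs_type="denoiser_input_fields")
state.set("add_time_ids", "<tensor>", kwargs_type="denoiser_input_fields")
state.set("latents", "<tensor>")  # untagged, not part of the group
print(state.get_by_kwargs("denoiser_input_fields"))
# {'prompt_embeds': '<tensor>', 'add_time_ids': '<tensor>'}
```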
@@ -1120,13 +1120,13 @@ class StableDiffusionXLImg2ImgPrepareAdditionalConditioningStep(ModularPipelineB
             OutputParam(
                 "add_time_ids",
                 type_hint=torch.Tensor,
-                kwargs_type="guider_input_fields",
+                kwargs_type="denoiser_input_fields",
                 description="The time ids to condition the denoising process",
             ),
             OutputParam(
                 "negative_add_time_ids",
                 type_hint=torch.Tensor,
-                kwargs_type="guider_input_fields",
+                kwargs_type="denoiser_input_fields",
                 description="The negative time ids to condition the denoising process",
             ),
             OutputParam("timestep_cond", type_hint=torch.Tensor, description="The timestep cond to use for LCM"),
@@ -1331,13 +1331,13 @@ class StableDiffusionXLPrepareAdditionalConditioningStep(ModularPipelineBlocks):
             OutputParam(
                 "add_time_ids",
                 type_hint=torch.Tensor,
-                kwargs_type="guider_input_fields",
+                kwargs_type="denoiser_input_fields",
                 description="The time ids to condition the denoising process",
             ),
             OutputParam(
                 "negative_add_time_ids",
                 type_hint=torch.Tensor,
-                kwargs_type="guider_input_fields",
+                kwargs_type="denoiser_input_fields",
                 description="The negative time ids to condition the denoising process",
             ),
             OutputParam("timestep_cond", type_hint=torch.Tensor, description="The timestep cond to use for LCM"),
@@ -183,14 +183,14 @@ class StableDiffusionXLLoopDenoiser(ModularPipelineBlocks):
                 description="The guidance scale embedding to use for Latent Consistency Models(LCMs). Can be generated in prepare_additional_conditioning step.",
             ),
             InputParam(
-                kwargs_type="guider_input_fields",
+                kwargs_type="denoiser_input_fields",
                 description=(
                     "All conditional model inputs that need to be prepared with guider. "
                     "It should contain prompt_embeds/negative_prompt_embeds, "
                     "add_time_ids/negative_add_time_ids, "
                     "pooled_prompt_embeds/negative_pooled_prompt_embeds, "
                     "and ip_adapter_embeds/negative_ip_adapter_embeds (optional)."
-                    "please add `kwargs_type=guider_input_fields` to their parameter spec (`OutputParam`) when they are created and added to the pipeline state"
+                    "please add `kwargs_type=denoiser_input_fields` to their parameter spec (`OutputParam`) when they are created and added to the pipeline state"
                 ),
             ),
         ]
@@ -307,14 +307,14 @@ class StableDiffusionXLControlNetLoopDenoiser(ModularPipelineBlocks):
                 description="The number of inference steps to use for the denoising process. Can be generated in set_timesteps step.",
             ),
             InputParam(
-                kwargs_type="guider_input_fields",
+                kwargs_type="denoiser_input_fields",
                 description=(
                     "All conditional model inputs that need to be prepared with guider. "
                     "It should contain prompt_embeds/negative_prompt_embeds, "
                     "add_time_ids/negative_add_time_ids, "
                     "pooled_prompt_embeds/negative_pooled_prompt_embeds, "
                     "and ip_adapter_embeds/negative_ip_adapter_embeds (optional)."
-                    "please add `kwargs_type=guider_input_fields` to their parameter spec (`OutputParam`) when they are created and added to the pipeline state"
+                    "please add `kwargs_type=denoiser_input_fields` to their parameter spec (`OutputParam`) when they are created and added to the pipeline state"
                 ),
             ),
             InputParam(
@@ -258,25 +258,25 @@ class StableDiffusionXLTextEncoderStep(ModularPipelineBlocks):
             OutputParam(
                 "prompt_embeds",
                 type_hint=torch.Tensor,
-                kwargs_type="guider_input_fields",
+                kwargs_type="denoiser_input_fields",
                 description="text embeddings used to guide the image generation",
             ),
             OutputParam(
                 "negative_prompt_embeds",
                 type_hint=torch.Tensor,
-                kwargs_type="guider_input_fields",
+                kwargs_type="denoiser_input_fields",
                 description="negative text embeddings used to guide the image generation",
             ),
             OutputParam(
                 "pooled_prompt_embeds",
                 type_hint=torch.Tensor,
-                kwargs_type="guider_input_fields",
+                kwargs_type="denoiser_input_fields",
                 description="pooled text embeddings used to guide the image generation",
             ),
             OutputParam(
                 "negative_pooled_prompt_embeds",
                 type_hint=torch.Tensor,
-                kwargs_type="guider_input_fields",
+                kwargs_type="denoiser_input_fields",
                 description="negative pooled text embeddings used to guide the image generation",
             ),
         ]
@@ -82,19 +82,17 @@ class StableDiffusionXLAutoIPAdapterStep(AutoPipelineBlocks):

 # before_denoise: text2img
 class StableDiffusionXLBeforeDenoiseStep(SequentialPipelineBlocks):
     block_classes = [
-        StableDiffusionXLInputStep,
         StableDiffusionXLSetTimestepsStep,
         StableDiffusionXLPrepareLatentsStep,
         StableDiffusionXLPrepareAdditionalConditioningStep,
     ]
-    block_names = ["input", "set_timesteps", "prepare_latents", "prepare_add_cond"]
+    block_names = ["set_timesteps", "prepare_latents", "prepare_add_cond"]

     @property
     def description(self):
         return (
             "Before denoise step that prepare the inputs for the denoise step.\n"
             + "This is a sequential pipeline blocks:\n"
-            + " - `StableDiffusionXLInputStep` is used to adjust the batch size of the model inputs\n"
             + " - `StableDiffusionXLSetTimestepsStep` is used to set the timesteps\n"
             + " - `StableDiffusionXLPrepareLatentsStep` is used to prepare the latents\n"
             + " - `StableDiffusionXLPrepareAdditionalConditioningStep` is used to prepare the additional conditioning\n"
@@ -104,19 +102,17 @@ class StableDiffusionXLBeforeDenoiseStep(SequentialPipelineBlocks):

 # before_denoise: img2img
 class StableDiffusionXLImg2ImgBeforeDenoiseStep(SequentialPipelineBlocks):
     block_classes = [
-        StableDiffusionXLInputStep,
         StableDiffusionXLImg2ImgSetTimestepsStep,
         StableDiffusionXLImg2ImgPrepareLatentsStep,
         StableDiffusionXLImg2ImgPrepareAdditionalConditioningStep,
     ]
-    block_names = ["input", "set_timesteps", "prepare_latents", "prepare_add_cond"]
+    block_names = ["set_timesteps", "prepare_latents", "prepare_add_cond"]

     @property
     def description(self):
         return (
             "Before denoise step that prepare the inputs for the denoise step for img2img task.\n"
             + "This is a sequential pipeline blocks:\n"
-            + " - `StableDiffusionXLInputStep` is used to adjust the batch size of the model inputs\n"
             + " - `StableDiffusionXLImg2ImgSetTimestepsStep` is used to set the timesteps\n"
             + " - `StableDiffusionXLImg2ImgPrepareLatentsStep` is used to prepare the latents\n"
             + " - `StableDiffusionXLImg2ImgPrepareAdditionalConditioningStep` is used to prepare the additional conditioning\n"
@@ -126,19 +122,17 @@ class StableDiffusionXLImg2ImgBeforeDenoiseStep(SequentialPipelineBlocks):

 # before_denoise: inpainting
 class StableDiffusionXLInpaintBeforeDenoiseStep(SequentialPipelineBlocks):
     block_classes = [
-        StableDiffusionXLInputStep,
         StableDiffusionXLImg2ImgSetTimestepsStep,
         StableDiffusionXLInpaintPrepareLatentsStep,
         StableDiffusionXLImg2ImgPrepareAdditionalConditioningStep,
     ]
-    block_names = ["input", "set_timesteps", "prepare_latents", "prepare_add_cond"]
+    block_names = ["set_timesteps", "prepare_latents", "prepare_add_cond"]

     @property
     def description(self):
         return (
             "Before denoise step that prepare the inputs for the denoise step for inpainting task.\n"
             + "This is a sequential pipeline blocks:\n"
-            + " - `StableDiffusionXLInputStep` is used to adjust the batch size of the model inputs\n"
             + " - `StableDiffusionXLImg2ImgSetTimestepsStep` is used to set the timesteps\n"
             + " - `StableDiffusionXLInpaintPrepareLatentsStep` is used to prepare the latents\n"
             + " - `StableDiffusionXLImg2ImgPrepareAdditionalConditioningStep` is used to prepare the additional conditioning\n"
@@ -255,25 +249,48 @@ class StableDiffusionXLAutoDecodeStep(AutoPipelineBlocks):
     )


+class StableDiffusionXLCoreDenoiseStep(SequentialPipelineBlocks):
+    block_classes = [
+        StableDiffusionXLInputStep,
+        StableDiffusionXLAutoBeforeDenoiseStep,
+        StableDiffusionXLAutoControlNetInputStep,
+        StableDiffusionXLAutoDenoiseStep,
+    ]
+    block_names = ["input", "before_denoise", "controlnet_input", "denoise"]
+
+    @property
+    def description(self):
+        return (
+            "Core step that performs the denoising process.\n"
+            + " - `StableDiffusionXLInputStep` (input) standardizes the inputs for the denoising step.\n"
+            + " - `StableDiffusionXLAutoBeforeDenoiseStep` (before_denoise) prepares the inputs for the denoising step.\n"
+            + " - `StableDiffusionXLAutoControlNetInputStep` (controlnet_input) prepares the controlnet input.\n"
+            + " - `StableDiffusionXLAutoDenoiseStep` (denoise) iteratively denoises the latents.\n\n"
+            + "This step supports text-to-image, image-to-image, and inpainting, with or without controlnet/controlnet_union/ip_adapter, for Stable Diffusion XL:\n"
+            + "- for image-to-image generation, you need to provide `image_latents`\n"
+            + "- for inpainting, you need to provide `mask_image` and `image_latents`\n"
+            + "- to run the controlnet workflow, you need to provide `control_image`\n"
+            + "- to run the controlnet_union workflow, you need to provide `control_image` and `control_mode`\n"
+            + "- to run the ip_adapter workflow, you need to load ip_adapter into your unet and provide `ip_adapter_embeds`\n"
+            + "- for text-to-image generation, all you need to provide is prompt embeddings\n"
+        )
+
+
 # ip-adapter, controlnet, text2img, img2img, inpainting
 class StableDiffusionXLAutoBlocks(SequentialPipelineBlocks):
     block_classes = [
         StableDiffusionXLTextEncoderStep,
         StableDiffusionXLAutoIPAdapterStep,
         StableDiffusionXLAutoVaeEncoderStep,
-        StableDiffusionXLAutoBeforeDenoiseStep,
-        StableDiffusionXLAutoControlNetInputStep,
-        StableDiffusionXLAutoDenoiseStep,
+        StableDiffusionXLCoreDenoiseStep,
         StableDiffusionXLAutoDecodeStep,
     ]
     block_names = [
         "text_encoder",
         "ip_adapter",
-        "image_encoder",
-        "before_denoise",
-        "controlnet_input",
+        "vae_encoder",
         "denoise",
-        "decoder",
+        "decode",
     ]

     @property
@@ -321,7 +338,7 @@ TEXT2IMAGE_BLOCKS = InsertableDict(
 IMAGE2IMAGE_BLOCKS = InsertableDict(
     [
         ("text_encoder", StableDiffusionXLTextEncoderStep),
-        ("image_encoder", StableDiffusionXLVaeEncoderStep),
+        ("vae_encoder", StableDiffusionXLVaeEncoderStep),
         ("input", StableDiffusionXLInputStep),
         ("set_timesteps", StableDiffusionXLImg2ImgSetTimestepsStep),
         ("prepare_latents", StableDiffusionXLImg2ImgPrepareLatentsStep),
@@ -334,7 +351,7 @@ IMAGE2IMAGE_BLOCKS = InsertableDict(
 INPAINT_BLOCKS = InsertableDict(
     [
         ("text_encoder", StableDiffusionXLTextEncoderStep),
-        ("image_encoder", StableDiffusionXLInpaintVaeEncoderStep),
+        ("vae_encoder", StableDiffusionXLInpaintVaeEncoderStep),
         ("input", StableDiffusionXLInputStep),
         ("set_timesteps", StableDiffusionXLImg2ImgSetTimestepsStep),
         ("prepare_latents", StableDiffusionXLInpaintPrepareLatentsStep),
@@ -361,10 +378,8 @@ AUTO_BLOCKS = InsertableDict(
     [
         ("text_encoder", StableDiffusionXLTextEncoderStep),
         ("ip_adapter", StableDiffusionXLAutoIPAdapterStep),
-        ("image_encoder", StableDiffusionXLAutoVaeEncoderStep),
-        ("before_denoise", StableDiffusionXLAutoBeforeDenoiseStep),
-        ("controlnet_input", StableDiffusionXLAutoControlNetInputStep),
-        ("denoise", StableDiffusionXLAutoDenoiseStep),
+        ("vae_encoder", StableDiffusionXLAutoVaeEncoderStep),
+        ("denoise", StableDiffusionXLCoreDenoiseStep),
         ("decode", StableDiffusionXLAutoDecodeStep),
     ]
 )
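Two of these preset keys are user-visible renames (`image_encoder` → `vae_encoder`, `decoder` → `decode`), so code that addresses blocks by name needs updating. A hedged usage sketch, assuming the `from_blocks_dict` helper and `sub_blocks` attribute as documented for diffusers' modular pipelines (adjust if your version differs):

```python
# Assumed API: SequentialPipelineBlocks.from_blocks_dict and .sub_blocks.
from diffusers.modular_pipelines import SequentialPipelineBlocks

blocks = SequentialPipelineBlocks.from_blocks_dict(AUTO_BLOCKS)
print(list(blocks.sub_blocks.keys()))
# ['text_encoder', 'ip_adapter', 'vae_encoder', 'denoise', 'decode']
# (the denoise entry is now the combined StableDiffusionXLCoreDenoiseStep)
```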
@@ -54,6 +54,8 @@ class StableDiffusionXLModularPipeline(
     </Tip>
     """

+    default_blocks_name = "StableDiffusionXLAutoBlocks"
+
     @property
     def default_height(self):
         return self.default_sample_size * self.vae_scale_factor
# Copyright 2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
SDXL_NODE_TYPES_PARAMS_MAP = {
    "controlnet": {
        "inputs": [
            "control_image",
            "controlnet_conditioning_scale",
            "control_guidance_start",
            "control_guidance_end",
            "height",
            "width",
        ],
        "model_inputs": [
            "controlnet",
        ],
        "outputs": [
            "controlnet_out",
        ],
        "block_names": [None],
    },
    "denoise": {
        "inputs": [
            "embeddings",
            "width",
            "height",
            "seed",
            "num_inference_steps",
            "guidance_scale",
            "image_latents",
            "strength",
            # custom adapters coming in as inputs
            "controlnet",
            # ip_adapter is optional and custom; include if available
            "ip_adapter",
        ],
        "model_inputs": [
            "unet",
            "guider",
            "scheduler",
        ],
        "outputs": [
            "latents",
            "latents_preview",
        ],
        "block_names": ["denoise"],
    },
    "vae_encoder": {
        "inputs": [
            "image",
            "width",
            "height",
        ],
        "model_inputs": [
            "vae",
        ],
        "outputs": [
            "image_latents",
        ],
        "block_names": ["vae_encoder"],
    },
    "text_encoder": {
        "inputs": [
            "prompt",
            "negative_prompt",
        ],
        "model_inputs": [
            "text_encoders",
        ],
        "outputs": [
            "embeddings",
        ],
        "block_names": ["text_encoder"],
    },
    "decoder": {
        "inputs": [
            "latents",
        ],
        "model_inputs": [
            "vae",
        ],
        "outputs": [
            "images",
        ],
        "block_names": ["decode"],
    },
}
@@ -146,13 +146,13 @@ class WanInputStep(ModularPipelineBlocks):
             OutputParam(
                 "prompt_embeds",
                 type_hint=torch.Tensor,
-                kwargs_type="guider_input_fields",  # already in intermedites state but declare here again for guider_input_fields
+                kwargs_type="denoiser_input_fields",  # already in intermediates state but declare here again for denoiser_input_fields
                 description="text embeddings used to guide the image generation",
             ),
             OutputParam(
                 "negative_prompt_embeds",
                 type_hint=torch.Tensor,
-                kwargs_type="guider_input_fields",  # already in intermedites state but declare here again for guider_input_fields
+                kwargs_type="denoiser_input_fields",  # already in intermediates state but declare here again for denoiser_input_fields
                 description="negative text embeddings used to guide the image generation",
             ),
         ]
@@ -79,11 +79,11 @@ class WanLoopDenoiser(ModularPipelineBlocks):
                 description="The number of inference steps to use for the denoising process. Can be generated in set_timesteps step.",
             ),
             InputParam(
-                kwargs_type="guider_input_fields",
+                kwargs_type="denoiser_input_fields",
                 description=(
                     "All conditional model inputs that need to be prepared with guider. "
                     "It should contain prompt_embeds/negative_prompt_embeds. "
-                    "Please add `kwargs_type=guider_input_fields` to their parameter spec (`OutputParam`) when they are created and added to the pipeline state"
+                    "Please add `kwargs_type=denoiser_input_fields` to their parameter spec (`OutputParam`) when they are created and added to the pipeline state"
                 ),
             ),
         ]
@@ -89,13 +89,13 @@ class WanTextEncoderStep(ModularPipelineBlocks):
             OutputParam(
                 "prompt_embeds",
                 type_hint=torch.Tensor,
-                kwargs_type="guider_input_fields",
+                kwargs_type="denoiser_input_fields",
                 description="text embeddings used to guide the image generation",
             ),
             OutputParam(
                 "negative_prompt_embeds",
                 type_hint=torch.Tensor,
-                kwargs_type="guider_input_fields",
+                kwargs_type="denoiser_input_fields",
                 description="negative text embeddings used to guide the image generation",
             ),
         ]
@@ -37,6 +37,8 @@ class WanModularPipeline(
     </Tip>
     """

+    default_blocks_name = "WanAutoBlocks"
+
     @property
     def default_height(self):
         return self.default_sample_height * self.vae_scale_factor_spatial