Unverified Commit a138d71e authored by YiYi Xu, committed by GitHub

HunyuanImage21 (#12333)



* add hunyuanimage2.1


---------
Co-authored-by: Sayak Paul <spsayakpaul@gmail.com>
parent bc403988
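A minimal usage sketch of the pipeline class this commit adds. The checkpoint id below is a placeholder assumption (no repo id appears in this diff), and the call arguments mirror the defaults exercised in the fast tests further down; treat it as an illustration, not the documented API.

# Hedged usage sketch for the new HunyuanImage pipeline added in this commit.
# The checkpoint id is a placeholder assumption, not taken from this diff.
import torch

from diffusers import HunyuanImagePipeline

pipe = HunyuanImagePipeline.from_pretrained(
    "<hunyuanimage-2.1-checkpoint>",  # placeholder repo id
    torch_dtype=torch.bfloat16,
)
pipe.to("cuda")

image = pipe(
    prompt="A painting of a squirrel eating a burger",
    num_inference_steps=50,
).images[0]
image.save("hunyuanimage.png")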
@@ -36,6 +36,8 @@ if is_torch_available():
     _import_structure["autoencoders.autoencoder_kl_cogvideox"] = ["AutoencoderKLCogVideoX"]
     _import_structure["autoencoders.autoencoder_kl_cosmos"] = ["AutoencoderKLCosmos"]
     _import_structure["autoencoders.autoencoder_kl_hunyuan_video"] = ["AutoencoderKLHunyuanVideo"]
+    _import_structure["autoencoders.autoencoder_kl_hunyuanimage"] = ["AutoencoderKLHunyuanImage"]
+    _import_structure["autoencoders.autoencoder_kl_hunyuanimage_refiner"] = ["AutoencoderKLHunyuanImageRefiner"]
     _import_structure["autoencoders.autoencoder_kl_ltx"] = ["AutoencoderKLLTXVideo"]
     _import_structure["autoencoders.autoencoder_kl_magvit"] = ["AutoencoderKLMagvit"]
     _import_structure["autoencoders.autoencoder_kl_mochi"] = ["AutoencoderKLMochi"]
@@ -91,6 +93,7 @@ if is_torch_available():
     _import_structure["transformers.transformer_hidream_image"] = ["HiDreamImageTransformer2DModel"]
     _import_structure["transformers.transformer_hunyuan_video"] = ["HunyuanVideoTransformer3DModel"]
     _import_structure["transformers.transformer_hunyuan_video_framepack"] = ["HunyuanVideoFramepackTransformer3DModel"]
+    _import_structure["transformers.transformer_hunyuanimage"] = ["HunyuanImageTransformer2DModel"]
     _import_structure["transformers.transformer_kandinsky"] = ["Kandinsky5Transformer3DModel"]
     _import_structure["transformers.transformer_ltx"] = ["LTXVideoTransformer3DModel"]
     _import_structure["transformers.transformer_lumina2"] = ["Lumina2Transformer2DModel"]
@@ -133,6 +136,8 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
         AutoencoderKLAllegro,
         AutoencoderKLCogVideoX,
         AutoencoderKLCosmos,
+        AutoencoderKLHunyuanImage,
+        AutoencoderKLHunyuanImageRefiner,
         AutoencoderKLHunyuanVideo,
         AutoencoderKLLTXVideo,
         AutoencoderKLMagvit,
@@ -182,6 +187,7 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
         FluxTransformer2DModel,
         HiDreamImageTransformer2DModel,
         HunyuanDiT2DModel,
+        HunyuanImageTransformer2DModel,
         HunyuanVideoFramepackTransformer3DModel,
         HunyuanVideoTransformer3DModel,
         Kandinsky5Transformer3DModel,
...
@@ -5,6 +5,8 @@ from .autoencoder_kl_allegro import AutoencoderKLAllegro
 from .autoencoder_kl_cogvideox import AutoencoderKLCogVideoX
 from .autoencoder_kl_cosmos import AutoencoderKLCosmos
 from .autoencoder_kl_hunyuan_video import AutoencoderKLHunyuanVideo
+from .autoencoder_kl_hunyuanimage import AutoencoderKLHunyuanImage
+from .autoencoder_kl_hunyuanimage_refiner import AutoencoderKLHunyuanImageRefiner
 from .autoencoder_kl_ltx import AutoencoderKLLTXVideo
 from .autoencoder_kl_magvit import AutoencoderKLMagvit
 from .autoencoder_kl_mochi import AutoencoderKLMochi
...
@@ -27,6 +27,7 @@ if is_torch_available():
     from .transformer_hidream_image import HiDreamImageTransformer2DModel
     from .transformer_hunyuan_video import HunyuanVideoTransformer3DModel
     from .transformer_hunyuan_video_framepack import HunyuanVideoFramepackTransformer3DModel
+    from .transformer_hunyuanimage import HunyuanImageTransformer2DModel
     from .transformer_kandinsky import Kandinsky5Transformer3DModel
     from .transformer_ltx import LTXVideoTransformer3DModel
     from .transformer_lumina2 import Lumina2Transformer2DModel
...
This diff is collapsed.
@@ -130,8 +130,14 @@ class PipelineState:
         Allow attribute access to intermediate values. If an attribute is not found in the object, look for it in the
         intermediates dict.
         """
-        if name in self.values:
-            return self.values[name]
+        # Use object.__getattribute__ to avoid infinite recursion during deepcopy
+        try:
+            values = object.__getattribute__(self, "values")
+        except AttributeError:
+            raise AttributeError(f"'{self.__class__.__name__}' object has no attribute '{name}'")
+        if name in values:
+            return values[name]
         raise AttributeError(f"'{self.__class__.__name__}' object has no attribute '{name}'")

     def __repr__(self):
@@ -2492,6 +2498,8 @@ class ModularPipeline(ConfigMixin, PushToHubMixin):
         """
         if state is None:
             state = PipelineState()
+        else:
+            state = deepcopy(state)

         # Make a copy of the input kwargs
         passed_kwargs = kwargs.copy()
...
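The `object.__getattribute__` change above matters because `copy.deepcopy` builds the copy via `__class__.__new__` and probes hooks such as `__setstate__` before the instance dict is populated; a `__getattr__` that reads `self.values` then re-enters itself forever. A standalone sketch of the pattern (a toy class, not the diffusers `PipelineState` itself):

# Toy class (not PipelineState) illustrating the recursion-safe __getattr__ pattern.
from copy import deepcopy


class State:
    def __init__(self):
        self.values = {"latents": "tensor"}

    def __getattr__(self, name):
        # Reading self.values here would call __getattr__ again on a half-built
        # instance; object.__getattribute__ fails with a plain AttributeError instead.
        try:
            values = object.__getattribute__(self, "values")
        except AttributeError:
            raise AttributeError(f"'{type(self).__name__}' object has no attribute '{name}'")
        if name in values:
            return values[name]
        raise AttributeError(f"'{type(self).__name__}' object has no attribute '{name}'")


copied = deepcopy(State())  # probes like __setstate__ now fail cleanly instead of recursing
assert copied.latents == "tensor"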
@@ -238,19 +238,27 @@ class QwenImageLoopDenoiser(ModularPipelineBlocks):
     @torch.no_grad()
     def __call__(self, components: QwenImageModularPipeline, block_state: BlockState, i: int, t: torch.Tensor):
-        guider_input_fields = {
-            "encoder_hidden_states": ("prompt_embeds", "negative_prompt_embeds"),
-            "encoder_hidden_states_mask": ("prompt_embeds_mask", "negative_prompt_embeds_mask"),
-            "txt_seq_lens": ("txt_seq_lens", "negative_txt_seq_lens"),
+        guider_inputs = {
+            "encoder_hidden_states": (
+                getattr(block_state, "prompt_embeds", None),
+                getattr(block_state, "negative_prompt_embeds", None),
+            ),
+            "encoder_hidden_states_mask": (
+                getattr(block_state, "prompt_embeds_mask", None),
+                getattr(block_state, "negative_prompt_embeds_mask", None),
+            ),
+            "txt_seq_lens": (
+                getattr(block_state, "txt_seq_lens", None),
+                getattr(block_state, "negative_txt_seq_lens", None),
+            ),
         }

         components.guider.set_state(step=i, num_inference_steps=block_state.num_inference_steps, timestep=t)
-        guider_state = components.guider.prepare_inputs(block_state, guider_input_fields)
+        guider_state = components.guider.prepare_inputs(guider_inputs)

         for guider_state_batch in guider_state:
             components.guider.prepare_models(components.transformer)
-            cond_kwargs = guider_state_batch.as_dict()
-            cond_kwargs = {k: v for k, v in cond_kwargs.items() if k in guider_input_fields}
+            cond_kwargs = {input_name: getattr(guider_state_batch, input_name) for input_name in guider_inputs.keys()}

             # YiYi TODO: add cache context
             guider_state_batch.noise_pred = components.transformer(
@@ -328,19 +336,27 @@ class QwenImageEditLoopDenoiser(ModularPipelineBlocks):
     @torch.no_grad()
     def __call__(self, components: QwenImageModularPipeline, block_state: BlockState, i: int, t: torch.Tensor):
-        guider_input_fields = {
-            "encoder_hidden_states": ("prompt_embeds", "negative_prompt_embeds"),
-            "encoder_hidden_states_mask": ("prompt_embeds_mask", "negative_prompt_embeds_mask"),
-            "txt_seq_lens": ("txt_seq_lens", "negative_txt_seq_lens"),
+        guider_inputs = {
+            "encoder_hidden_states": (
+                getattr(block_state, "prompt_embeds", None),
+                getattr(block_state, "negative_prompt_embeds", None),
+            ),
+            "encoder_hidden_states_mask": (
+                getattr(block_state, "prompt_embeds_mask", None),
+                getattr(block_state, "negative_prompt_embeds_mask", None),
+            ),
+            "txt_seq_lens": (
+                getattr(block_state, "txt_seq_lens", None),
+                getattr(block_state, "negative_txt_seq_lens", None),
+            ),
         }

         components.guider.set_state(step=i, num_inference_steps=block_state.num_inference_steps, timestep=t)
-        guider_state = components.guider.prepare_inputs(block_state, guider_input_fields)
+        guider_state = components.guider.prepare_inputs(guider_inputs)

         for guider_state_batch in guider_state:
             components.guider.prepare_models(components.transformer)
-            cond_kwargs = guider_state_batch.as_dict()
-            cond_kwargs = {k: v for k, v in cond_kwargs.items() if k in guider_input_fields}
+            cond_kwargs = {input_name: getattr(guider_state_batch, input_name) for input_name in guider_inputs.keys()}

             # YiYi TODO: add cache context
             guider_state_batch.noise_pred = components.transformer(
...
@@ -201,27 +201,41 @@ class StableDiffusionXLLoopDenoiser(ModularPipelineBlocks):
     ) -> PipelineState:
         # Map the keys we'll see on each `guider_state_batch` (e.g. guider_state_batch.prompt_embeds)
         # to the corresponding (cond, uncond) fields on block_state. (e.g. block_state.prompt_embeds, block_state.negative_prompt_embeds)
-        guider_input_fields = {
-            "prompt_embeds": ("prompt_embeds", "negative_prompt_embeds"),
-            "time_ids": ("add_time_ids", "negative_add_time_ids"),
-            "text_embeds": ("pooled_prompt_embeds", "negative_pooled_prompt_embeds"),
-            "image_embeds": ("ip_adapter_embeds", "negative_ip_adapter_embeds"),
+        guider_inputs = {
+            "prompt_embeds": (
+                getattr(block_state, "prompt_embeds", None),
+                getattr(block_state, "negative_prompt_embeds", None),
+            ),
+            "time_ids": (
+                getattr(block_state, "add_time_ids", None),
+                getattr(block_state, "negative_add_time_ids", None),
+            ),
+            "text_embeds": (
+                getattr(block_state, "pooled_prompt_embeds", None),
+                getattr(block_state, "negative_pooled_prompt_embeds", None),
+            ),
+            "image_embeds": (
+                getattr(block_state, "ip_adapter_embeds", None),
+                getattr(block_state, "negative_ip_adapter_embeds", None),
+            ),
         }

         components.guider.set_state(step=i, num_inference_steps=block_state.num_inference_steps, timestep=t)

-        # Prepare mini-batches according to guidance method and `guider_input_fields`
-        # Each guider_state_batch will have .prompt_embeds, .time_ids, text_embeds, image_embeds.
-        # e.g. for CFG, we prepare two batches: one for uncond, one for cond
-        # for first batch, guider_state_batch.prompt_embeds correspond to block_state.prompt_embeds
-        # for second batch, guider_state_batch.prompt_embeds correspond to block_state.negative_prompt_embeds
-        guider_state = components.guider.prepare_inputs(block_state, guider_input_fields)
+        # The guider splits model inputs into separate batches for conditional/unconditional predictions.
+        # For CFG with guider_inputs = {"encoder_hidden_states": (prompt_embeds, negative_prompt_embeds)}:
+        # you will get a guider_state with two batches:
+        # guider_state = [
+        #     {"encoder_hidden_states": prompt_embeds, "__guidance_identifier__": "pred_cond"},  # conditional batch
+        #     {"encoder_hidden_states": negative_prompt_embeds, "__guidance_identifier__": "pred_uncond"},  # unconditional batch
+        # ]
+        # Other guidance methods may return 1 batch (no guidance) or 3+ batches (e.g., PAG, APG).
+        guider_state = components.guider.prepare_inputs(guider_inputs)

         # run the denoiser for each guidance batch
         for guider_state_batch in guider_state:
             components.guider.prepare_models(components.unet)
-            cond_kwargs = guider_state_batch.as_dict()
-            cond_kwargs = {k: v for k, v in cond_kwargs.items() if k in guider_input_fields}
+            cond_kwargs = {input_name: getattr(guider_state_batch, input_name) for input_name in guider_inputs.keys()}
             prompt_embeds = cond_kwargs.pop("prompt_embeds")

             # Predict the noise residual
@@ -344,11 +358,23 @@ class StableDiffusionXLControlNetLoopDenoiser(ModularPipelineBlocks):
         # Map the keys we'll see on each `guider_state_batch` (e.g. guider_state_batch.prompt_embeds)
         # to the corresponding (cond, uncond) fields on block_state. (e.g. block_state.prompt_embeds, block_state.negative_prompt_embeds)
-        guider_input_fields = {
-            "prompt_embeds": ("prompt_embeds", "negative_prompt_embeds"),
-            "time_ids": ("add_time_ids", "negative_add_time_ids"),
-            "text_embeds": ("pooled_prompt_embeds", "negative_pooled_prompt_embeds"),
-            "image_embeds": ("ip_adapter_embeds", "negative_ip_adapter_embeds"),
+        guider_inputs = {
+            "prompt_embeds": (
+                getattr(block_state, "prompt_embeds", None),
+                getattr(block_state, "negative_prompt_embeds", None),
+            ),
+            "time_ids": (
+                getattr(block_state, "add_time_ids", None),
+                getattr(block_state, "negative_add_time_ids", None),
+            ),
+            "text_embeds": (
+                getattr(block_state, "pooled_prompt_embeds", None),
+                getattr(block_state, "negative_pooled_prompt_embeds", None),
+            ),
+            "image_embeds": (
+                getattr(block_state, "ip_adapter_embeds", None),
+                getattr(block_state, "negative_ip_adapter_embeds", None),
+            ),
         }

         # cond_scale for the timestep (controlnet input)
@@ -369,12 +395,15 @@
         # guided denoiser step
         components.guider.set_state(step=i, num_inference_steps=block_state.num_inference_steps, timestep=t)

-        # Prepare mini-batches according to guidance method and `guider_input_fields`
-        # Each guider_state_batch will have .prompt_embeds, .time_ids, text_embeds, image_embeds.
-        # e.g. for CFG, we prepare two batches: one for uncond, one for cond
-        # for first batch, guider_state_batch.prompt_embeds correspond to block_state.prompt_embeds
-        # for second batch, guider_state_batch.prompt_embeds correspond to block_state.negative_prompt_embeds
-        guider_state = components.guider.prepare_inputs(block_state, guider_input_fields)
+        # The guider splits model inputs into separate batches for conditional/unconditional predictions.
+        # For CFG with guider_inputs = {"encoder_hidden_states": (prompt_embeds, negative_prompt_embeds)}:
+        # you will get a guider_state with two batches:
+        # guider_state = [
+        #     {"encoder_hidden_states": prompt_embeds, "__guidance_identifier__": "pred_cond"},  # conditional batch
+        #     {"encoder_hidden_states": negative_prompt_embeds, "__guidance_identifier__": "pred_uncond"},  # unconditional batch
+        # ]
+        # Other guidance methods may return 1 batch (no guidance) or 3+ batches (e.g., PAG, APG).
+        guider_state = components.guider.prepare_inputs(guider_inputs)

         # run the denoiser for each guidance batch
         for guider_state_batch in guider_state:
...
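To make the batch-splitting comments above concrete, here is a toy, dict-based imitation of what a CFG-style guider does with `guider_inputs`. It is not the real `ClassifierFreeGuidance.prepare_inputs` (the actual guider returns batch objects with attribute access such as `guider_state_batch.prompt_embeds`); it only illustrates the fan-out of each (cond, uncond) pair.

# Toy illustration only: fan each (cond, uncond) pair out into per-prediction batches.
def toy_cfg_prepare_inputs(guider_inputs):
    cond_batch = {name: pair[0] for name, pair in guider_inputs.items()}
    uncond_batch = {name: pair[1] for name, pair in guider_inputs.items()}
    cond_batch["__guidance_identifier__"] = "pred_cond"
    uncond_batch["__guidance_identifier__"] = "pred_uncond"
    return [cond_batch, uncond_batch]


guider_inputs = {"encoder_hidden_states": ("prompt_embeds", "negative_prompt_embeds")}
for batch in toy_cfg_prepare_inputs(guider_inputs):
    print(batch["__guidance_identifier__"], batch["encoder_hidden_states"])
# pred_cond prompt_embeds
# pred_uncond negative_prompt_embeds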
@@ -94,25 +94,30 @@ class WanLoopDenoiser(ModularPipelineBlocks):
     ) -> PipelineState:
         # Map the keys we'll see on each `guider_state_batch` (e.g. guider_state_batch.prompt_embeds)
         # to the corresponding (cond, uncond) fields on block_state. (e.g. block_state.prompt_embeds, block_state.negative_prompt_embeds)
-        guider_input_fields = {
-            "prompt_embeds": ("prompt_embeds", "negative_prompt_embeds"),
+        guider_inputs = {
+            "prompt_embeds": (
+                getattr(block_state, "prompt_embeds", None),
+                getattr(block_state, "negative_prompt_embeds", None),
+            ),
         }

         transformer_dtype = components.transformer.dtype
         components.guider.set_state(step=i, num_inference_steps=block_state.num_inference_steps, timestep=t)

-        # Prepare mini-batches according to guidance method and `guider_input_fields`
-        # Each guider_state_batch will have .prompt_embeds, .time_ids, text_embeds, image_embeds.
-        # e.g. for CFG, we prepare two batches: one for uncond, one for cond
-        # for first batch, guider_state_batch.prompt_embeds correspond to block_state.prompt_embeds
-        # for second batch, guider_state_batch.prompt_embeds correspond to block_state.negative_prompt_embeds
-        guider_state = components.guider.prepare_inputs(block_state, guider_input_fields)
+        # The guider splits model inputs into separate batches for conditional/unconditional predictions.
+        # For CFG with guider_inputs = {"encoder_hidden_states": (prompt_embeds, negative_prompt_embeds)}:
+        # you will get a guider_state with two batches:
+        # guider_state = [
+        #     {"encoder_hidden_states": prompt_embeds, "__guidance_identifier__": "pred_cond"},  # conditional batch
+        #     {"encoder_hidden_states": negative_prompt_embeds, "__guidance_identifier__": "pred_uncond"},  # unconditional batch
+        # ]
+        # Other guidance methods may return 1 batch (no guidance) or 3+ batches (e.g., PAG, APG).
+        guider_state = components.guider.prepare_inputs(guider_inputs)

         # run the denoiser for each guidance batch
         for guider_state_batch in guider_state:
             components.guider.prepare_models(components.transformer)
-            cond_kwargs = guider_state_batch.as_dict()
-            cond_kwargs = {k: v for k, v in cond_kwargs.items() if k in guider_input_fields}
+            cond_kwargs = {input_name: getattr(guider_state_batch, input_name) for input_name in guider_inputs.keys()}
             prompt_embeds = cond_kwargs.pop("prompt_embeds")

             # Predict the noise residual
...
@@ -241,6 +241,7 @@ else:
         "HunyuanVideoImageToVideoPipeline",
         "HunyuanVideoFramepackPipeline",
     ]
+    _import_structure["hunyuan_image"] = ["HunyuanImagePipeline", "HunyuanImageRefinerPipeline"]
     _import_structure["kandinsky"] = [
         "KandinskyCombinedPipeline",
         "KandinskyImg2ImgCombinedPipeline",
@@ -640,6 +641,7 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
             ReduxImageEncoder,
         )
         from .hidream_image import HiDreamImagePipeline
+        from .hunyuan_image import HunyuanImagePipeline, HunyuanImageRefinerPipeline
         from .hunyuan_video import (
             HunyuanSkyreelsImageToVideoPipeline,
             HunyuanVideoFramepackPipeline,
...
from typing import TYPE_CHECKING

from ...utils import (
    DIFFUSERS_SLOW_IMPORT,
    OptionalDependencyNotAvailable,
    _LazyModule,
    get_objects_from_module,
    is_torch_available,
    is_transformers_available,
)


_dummy_objects = {}
_import_structure = {}

try:
    if not (is_transformers_available() and is_torch_available()):
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    from ...utils import dummy_torch_and_transformers_objects  # noqa F403

    _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
else:
    _import_structure["pipeline_hunyuanimage"] = ["HunyuanImagePipeline"]
    _import_structure["pipeline_hunyuanimage_refiner"] = ["HunyuanImageRefinerPipeline"]

if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
    try:
        if not (is_transformers_available() and is_torch_available()):
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        from ...utils.dummy_torch_and_transformers_objects import *
    else:
        from .pipeline_hunyuanimage import HunyuanImagePipeline
        from .pipeline_hunyuanimage_refiner import HunyuanImageRefinerPipeline

else:
    import sys

    sys.modules[__name__] = _LazyModule(
        __name__,
        globals()["__file__"],
        _import_structure,
        module_spec=__spec__,
    )

    for name, value in _dummy_objects.items():
        setattr(sys.modules[__name__], name, value)
This diff is collapsed.
from dataclasses import dataclass
from typing import List, Union

import numpy as np
import PIL.Image

from ...utils import BaseOutput


@dataclass
class HunyuanImagePipelineOutput(BaseOutput):
    """
    Output class for HunyuanImage pipelines.

    Args:
        images (`List[PIL.Image.Image]` or `np.ndarray`)
            List of denoised PIL images of length `batch_size` or numpy array of shape `(batch_size, height, width,
            num_channels)`. PIL images or numpy array present the denoised images of the diffusion pipeline.
    """

    images: Union[List[PIL.Image.Image], np.ndarray]
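Because `HunyuanImagePipelineOutput` subclasses `BaseOutput`, callers can read it by attribute or by index, or skip it entirely with `return_dict=False`. A small consumption sketch, assuming an already-loaded `pipe` object (not defined in this file) and the standard diffusers return conventions:

# Consumption sketch; `pipe` is assumed to be an already-loaded HunyuanImagePipeline.
result = pipe(prompt="A painting of a squirrel eating a burger")  # HunyuanImagePipelineOutput
images = result.images        # attribute access
images_again = result[0]      # index access provided by BaseOutput
# Assuming the usual diffusers convention, return_dict=False yields a plain tuple instead.
images_tuple = pipe(prompt="A painting of a squirrel eating a burger", return_dict=False)[0]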
@@ -76,6 +76,7 @@ LOADABLE_CLASSES = {
         "SchedulerMixin": ["save_pretrained", "from_pretrained"],
         "DiffusionPipeline": ["save_pretrained", "from_pretrained"],
         "OnnxRuntimeModel": ["save_pretrained", "from_pretrained"],
+        "BaseGuidance": ["save_pretrained", "from_pretrained"],
     },
     "transformers": {
         "PreTrainedTokenizer": ["save_pretrained", "from_pretrained"],
...
@@ -17,6 +17,21 @@ class AdaptiveProjectedGuidance(metaclass=DummyObject):
         requires_backends(cls, ["torch"])


+class AdaptiveProjectedMixGuidance(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
 class AutoGuidance(metaclass=DummyObject):
     _backends = ["torch"]
@@ -32,6 +47,21 @@ class AutoGuidance(metaclass=DummyObject):
         requires_backends(cls, ["torch"])


+class BaseGuidance(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
 class ClassifierFreeGuidance(metaclass=DummyObject):
     _backends = ["torch"]
@@ -378,6 +408,36 @@ class AutoencoderKLCosmos(metaclass=DummyObject):
         requires_backends(cls, ["torch"])


+class AutoencoderKLHunyuanImage(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
+class AutoencoderKLHunyuanImageRefiner(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
 class AutoencoderKLHunyuanVideo(metaclass=DummyObject):
     _backends = ["torch"]
@@ -858,6 +918,21 @@ class HunyuanDiT2DMultiControlNetModel(metaclass=DummyObject):
         requires_backends(cls, ["torch"])


+class HunyuanImageTransformer2DModel(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
 class HunyuanVideoFramepackTransformer3DModel(metaclass=DummyObject):
     _backends = ["torch"]
...
@@ -1037,6 +1037,36 @@ class HunyuanDiTPipeline(metaclass=DummyObject):
         requires_backends(cls, ["torch", "transformers"])


+class HunyuanImagePipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class HunyuanImageRefinerPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
 class HunyuanSkyreelsImageToVideoPipeline(metaclass=DummyObject):
     _backends = ["torch", "transformers"]
...
# Copyright 2025 The HuggingFace Team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
import torch
from transformers import (
ByT5Tokenizer,
Qwen2_5_VLConfig,
Qwen2_5_VLForConditionalGeneration,
Qwen2Tokenizer,
T5Config,
T5EncoderModel,
)
from diffusers import (
AdaptiveProjectedMixGuidance,
AutoencoderKLHunyuanImage,
FlowMatchEulerDiscreteScheduler,
HunyuanImagePipeline,
HunyuanImageTransformer2DModel,
)
from ...testing_utils import enable_full_determinism
from ..test_pipelines_common import (
FirstBlockCacheTesterMixin,
PipelineTesterMixin,
to_np,
)
enable_full_determinism()
class HunyuanImagePipelineFastTests(
    PipelineTesterMixin,
    FirstBlockCacheTesterMixin,
    unittest.TestCase,
):
    pipeline_class = HunyuanImagePipeline
    params = frozenset(["prompt", "height", "width"])
    batch_params = frozenset(["prompt", "negative_prompt"])
    required_optional_params = frozenset(
        [
            "num_inference_steps",
            "generator",
            "latents",
            "return_dict",
            "callback_on_step_end",
            "callback_on_step_end_tensor_inputs",
        ]
    )

    test_xformers_attention = False
    test_layerwise_casting = True
    test_group_offloading = True
    test_attention_slicing = False
    supports_dduf = False

    def get_dummy_components(self, num_layers: int = 1, num_single_layers: int = 1, guidance_embeds: bool = False):
        torch.manual_seed(0)
        transformer = HunyuanImageTransformer2DModel(
            in_channels=4,
            out_channels=4,
            num_attention_heads=4,
            attention_head_dim=8,
            num_layers=num_layers,
            num_single_layers=num_single_layers,
            num_refiner_layers=1,
            patch_size=(1, 1),
            guidance_embeds=guidance_embeds,
            text_embed_dim=32,
            text_embed_2_dim=32,
            rope_axes_dim=(4, 4),
        )

        torch.manual_seed(0)
        vae = AutoencoderKLHunyuanImage(
            in_channels=3,
            out_channels=3,
            latent_channels=4,
            block_out_channels=(32, 64, 64, 64),
            layers_per_block=1,
            scaling_factor=0.476986,
            spatial_compression_ratio=8,
            sample_size=128,
        )

        torch.manual_seed(0)
        scheduler = FlowMatchEulerDiscreteScheduler(shift=7.0)

        if not guidance_embeds:
            torch.manual_seed(0)
            guider = AdaptiveProjectedMixGuidance(adaptive_projected_guidance_start_step=2)
            ocr_guider = AdaptiveProjectedMixGuidance(adaptive_projected_guidance_start_step=3)
        else:
            guider = None
            ocr_guider = None

        torch.manual_seed(0)
        config = Qwen2_5_VLConfig(
            text_config={
                "hidden_size": 32,
                "intermediate_size": 32,
                "num_hidden_layers": 2,
                "num_attention_heads": 2,
                "num_key_value_heads": 2,
                "rope_scaling": {
                    "mrope_section": [2, 2, 4],
                    "rope_type": "default",
                    "type": "default",
                },
                "rope_theta": 1000000.0,
            },
            vision_config={
                "depth": 2,
                "hidden_size": 32,
                "intermediate_size": 32,
                "num_heads": 2,
                "out_hidden_size": 32,
            },
            hidden_size=32,
            vocab_size=152064,
            vision_end_token_id=151653,
            vision_start_token_id=151652,
            vision_token_id=151654,
        )
        text_encoder = Qwen2_5_VLForConditionalGeneration(config)
        tokenizer = Qwen2Tokenizer.from_pretrained("hf-internal-testing/tiny-random-Qwen2VLForConditionalGeneration")

        torch.manual_seed(0)
        t5_config = T5Config(
            d_model=32,
            d_kv=4,
            d_ff=16,
            num_layers=2,
            num_heads=2,
            relative_attention_num_buckets=8,
            relative_attention_max_distance=32,
            vocab_size=256,
            feed_forward_proj="gated-gelu",
            dense_act_fn="gelu_new",
            is_encoder_decoder=False,
            use_cache=False,
            tie_word_embeddings=False,
        )
        text_encoder_2 = T5EncoderModel(t5_config)
        tokenizer_2 = ByT5Tokenizer()

        components = {
            "transformer": transformer,
            "vae": vae,
            "scheduler": scheduler,
            "text_encoder": text_encoder,
            "text_encoder_2": text_encoder_2,
            "tokenizer": tokenizer,
            "tokenizer_2": tokenizer_2,
            "guider": guider,
            "ocr_guider": ocr_guider,
        }
        return components

    def get_dummy_inputs(self, device, seed=0):
        if str(device).startswith("mps"):
            generator = torch.manual_seed(seed)
        else:
            generator = torch.Generator(device=device).manual_seed(seed)

        inputs = {
            "prompt": "A painting of a squirrel eating a burger",
            "generator": generator,
            "num_inference_steps": 5,
            "height": 16,
            "width": 16,
            "output_type": "pt",
        }

        return inputs

    def test_inference(self):
        device = "cpu"

        components = self.get_dummy_components()
        pipe = self.pipeline_class(**components)
        pipe.to(device)
        pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(device)
        image = pipe(**inputs).images
        generated_image = image[0]
        self.assertEqual(generated_image.shape, (3, 16, 16))

        expected_slice_np = np.array(
            [0.6252659, 0.51482046, 0.60799813, 0.59267783, 0.488082, 0.5857634, 0.523781, 0.58028054, 0.5674121]
        )

        output_slice = generated_image[0, -3:, -3:].flatten().cpu().numpy()
        self.assertTrue(
            np.abs(output_slice - expected_slice_np).max() < 1e-3,
            f"output_slice: {output_slice}, expected_slice_np: {expected_slice_np}",
        )

    def test_inference_guider(self):
        device = "cpu"

        components = self.get_dummy_components()
        pipe = self.pipeline_class(**components)
        pipe.to(device)
        pipe.set_progress_bar_config(disable=None)

        pipe.guider = pipe.guider.new(guidance_scale=1000)
        pipe.ocr_guider = pipe.ocr_guider.new(guidance_scale=1000)

        inputs = self.get_dummy_inputs(device)
        image = pipe(**inputs).images
        generated_image = image[0]
        self.assertEqual(generated_image.shape, (3, 16, 16))

        expected_slice_np = np.array(
            [0.61494756, 0.49616697, 0.60327923, 0.6115793, 0.49047345, 0.56977504, 0.53066164, 0.58880305, 0.5570612]
        )

        output_slice = generated_image[0, -3:, -3:].flatten().cpu().numpy()
        self.assertTrue(
            np.abs(output_slice - expected_slice_np).max() < 1e-3,
            f"output_slice: {output_slice}, expected_slice_np: {expected_slice_np}",
        )

    def test_inference_with_distilled_guidance(self):
        device = "cpu"

        components = self.get_dummy_components(guidance_embeds=True)
        pipe = self.pipeline_class(**components)
        pipe.to(device)
        pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(device)
        inputs["distilled_guidance_scale"] = 3.5
        image = pipe(**inputs).images
        generated_image = image[0]
        self.assertEqual(generated_image.shape, (3, 16, 16))

        expected_slice_np = np.array(
            [0.63667065, 0.5187377, 0.66757566, 0.6320319, 0.4913387, 0.54813194, 0.5335031, 0.5736143, 0.5461346]
        )

        output_slice = generated_image[0, -3:, -3:].flatten().cpu().numpy()
        self.assertTrue(
            np.abs(output_slice - expected_slice_np).max() < 1e-3,
            f"output_slice: {output_slice}, expected_slice_np: {expected_slice_np}",
        )

    def test_vae_tiling(self, expected_diff_max: float = 0.2):
        generator_device = "cpu"
        components = self.get_dummy_components()

        pipe = self.pipeline_class(**components)
        pipe.to("cpu")
        pipe.set_progress_bar_config(disable=None)

        # Without tiling
        inputs = self.get_dummy_inputs(generator_device)
        inputs["height"] = inputs["width"] = 128
        output_without_tiling = pipe(**inputs)[0]

        # With tiling
        pipe.vae.enable_tiling(tile_sample_min_size=96)
        inputs = self.get_dummy_inputs(generator_device)
        inputs["height"] = inputs["width"] = 128
        output_with_tiling = pipe(**inputs)[0]

        self.assertLess(
            (to_np(output_without_tiling) - to_np(output_with_tiling)).max(),
            expected_diff_max,
            "VAE tiling should not affect the inference results",
        )

    @unittest.skip("TODO: Test not supported for now because needs to be adjusted to work with guiders.")
    def test_encode_prompt_works_in_isolation(self):
        pass