renzhc / diffusers_dcu

Commit 671149e0 (unverified)
Authored Dec 07, 2025 by YiYi Xu, committed by GitHub on Dec 07, 2025
Parent: f67639b0

[HunyuanVideo1.5] support step-distilled (#12802)

* support step-distilled
* style
Showing 3 changed files with 55 additions and 5 deletions (+55 −5):

  scripts/convert_hunyuan_video1_5_to_diffusers.py                                    +27 −2
  src/diffusers/models/transformers/transformer_hunyuan_video15.py                    +18 −3
  src/diffusers/pipelines/hunyuan_video1_5/pipeline_hunyuan_video1_5_image2video.py   +10 −0
scripts/convert_hunyuan_video1_5_to_diffusers.py
@@ -69,6 +69,11 @@ TRANSFORMER_CONFIGS = {
         "target_size": 960,
         "task_type": "i2v",
     },
+    "480p_i2v_step_distilled": {
+        "target_size": 640,
+        "task_type": "i2v",
+        "use_meanflow": True,
+    },
 }

 SCHEDULER_CONFIGS = {
@@ -93,6 +98,9 @@ SCHEDULER_CONFIGS = {
     "720p_i2v_distilled": {
         "shift": 7.0,
     },
+    "480p_i2v_step_distilled": {
+        "shift": 7.0,
+    },
 }

 GUIDANCE_CONFIGS = {
@@ -117,6 +125,9 @@ GUIDANCE_CONFIGS = {
     "720p_i2v_distilled": {
         "guidance_scale": 1.0,
     },
+    "480p_i2v_step_distilled": {
+        "guidance_scale": 1.0,
+    },
 }
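The three config tables are keyed by the same variant string, so the new "480p_i2v_step_distilled" entry only takes effect because it is added to all of them: the transformer config turns on use_meanflow, the scheduler keeps shift at 7.0, and guidance_scale of 1.0 effectively disables classifier-free guidance for the distilled model. A minimal sketch of that shared-key lookup; the resolve_variant helper is illustrative and not part of the script:

# Hypothetical helper mirroring how the conversion script resolves all
# per-variant settings from a single transformer_type value.
TRANSFORMER_CONFIGS = {"480p_i2v_step_distilled": {"target_size": 640, "task_type": "i2v", "use_meanflow": True}}
SCHEDULER_CONFIGS = {"480p_i2v_step_distilled": {"shift": 7.0}}
GUIDANCE_CONFIGS = {"480p_i2v_step_distilled": {"guidance_scale": 1.0}}


def resolve_variant(transformer_type: str) -> dict:
    # Every table must contain the same key, otherwise the variant is unsupported.
    return {
        "transformer": TRANSFORMER_CONFIGS[transformer_type],
        "scheduler": SCHEDULER_CONFIGS[transformer_type],
        "guidance": GUIDANCE_CONFIGS[transformer_type],
    }


print(resolve_variant("480p_i2v_step_distilled"))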
@@ -126,7 +137,7 @@ def swap_scale_shift(weight):
     return new_weight


-def convert_hyvideo15_transformer_to_diffusers(original_state_dict):
+def convert_hyvideo15_transformer_to_diffusers(original_state_dict, config=None):
     """
     Convert HunyuanVideo 1.5 original checkpoint to Diffusers format.
     """
@@ -142,6 +153,20 @@ def convert_hyvideo15_transformer_to_diffusers(original_state_dict):
     )
     converted_state_dict["time_embed.timestep_embedder.linear_2.bias"] = original_state_dict.pop("time_in.mlp.2.bias")

+    if config.use_meanflow:
+        converted_state_dict["time_embed.timestep_embedder_r.linear_1.weight"] = original_state_dict.pop(
+            "time_r_in.mlp.0.weight"
+        )
+        converted_state_dict["time_embed.timestep_embedder_r.linear_1.bias"] = original_state_dict.pop(
+            "time_r_in.mlp.0.bias"
+        )
+        converted_state_dict["time_embed.timestep_embedder_r.linear_2.weight"] = original_state_dict.pop(
+            "time_r_in.mlp.2.weight"
+        )
+        converted_state_dict["time_embed.timestep_embedder_r.linear_2.bias"] = original_state_dict.pop(
+            "time_r_in.mlp.2.bias"
+        )
+
     # 2. context_embedder.time_text_embed.timestep_embedder <- txt_in.t_embedder
     converted_state_dict["context_embedder.time_text_embed.timestep_embedder.linear_1.weight"] = (
         original_state_dict.pop("txt_in.t_embedder.mlp.0.weight")
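The only checkpoint-format difference for the step-distilled variant is this extra group of weights: the original time_r_in MLP is renamed to the timestep_embedder_r submodule in diffusers naming. The pop() pattern consumes keys as they are renamed, so any leftover keys point at an unexpected checkpoint layout. A self-contained sketch of the renaming with placeholder tensors (shapes are made up, not the real ones):

import torch

# Dummy original checkpoint containing only the meanflow time-embedding weights.
original_state_dict = {
    "time_r_in.mlp.0.weight": torch.zeros(8, 4),
    "time_r_in.mlp.0.bias": torch.zeros(8),
    "time_r_in.mlp.2.weight": torch.zeros(8, 8),
    "time_r_in.mlp.2.bias": torch.zeros(8),
}

# Original name -> diffusers name, exactly as in the hunk above.
RENAME = {
    "time_r_in.mlp.0.weight": "time_embed.timestep_embedder_r.linear_1.weight",
    "time_r_in.mlp.0.bias": "time_embed.timestep_embedder_r.linear_1.bias",
    "time_r_in.mlp.2.weight": "time_embed.timestep_embedder_r.linear_2.weight",
    "time_r_in.mlp.2.bias": "time_embed.timestep_embedder_r.linear_2.bias",
}

converted_state_dict = {new: original_state_dict.pop(old) for old, new in RENAME.items()}
print(sorted(converted_state_dict))  # the four timestep_embedder_r keys
print(original_state_dict)           # {} -- pop() removed the consumed keys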
@@ -627,7 +652,7 @@ def convert_transformer(args):
     config = TRANSFORMER_CONFIGS[args.transformer_type]

     with init_empty_weights():
         transformer = HunyuanVideo15Transformer3DModel(**config)

-    state_dict = convert_hyvideo15_transformer_to_diffusers(original_state_dict)
+    state_dict = convert_hyvideo15_transformer_to_diffusers(original_state_dict, config=transformer.config)
     transformer.load_state_dict(state_dict, strict=True, assign=True)

     return transformer
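Passing config=transformer.config (rather than the raw dict) hands the converter the fully registered model config, including the use_meanflow default for variants that never set it. The surrounding pattern, meta-device construction with accelerate followed by load_state_dict(..., assign=True), avoids allocating the weights twice. A generic sketch of that pattern, assuming accelerate is installed and PyTorch is recent enough (2.1+) for assign=True; the small Linear module stands in for the transformer:

import torch
from accelerate import init_empty_weights

# Build the module skeleton on the meta device: no real weight memory is allocated.
with init_empty_weights():
    model = torch.nn.Linear(4, 4)

# Converted weights (fresh tensors standing in for the checkpoint).
state_dict = {"weight": torch.randn(4, 4), "bias": torch.zeros(4)}

# assign=True re-binds the real tensors into the meta-initialized module instead of
# copying into non-existent storage; strict=True catches missing/unexpected keys.
model.load_state_dict(state_dict, strict=True, assign=True)
print(model.weight.device)  # cpu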
src/diffusers/models/transformers/transformer_hunyuan_video15.py
@@ -184,19 +184,32 @@ class HunyuanVideo15TimeEmbedding(nn.Module):
         The dimension of the output embedding.
     """

-    def __init__(self, embedding_dim: int):
+    def __init__(self, embedding_dim: int, use_meanflow: bool = False):
         super().__init__()

         self.time_proj = Timesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=0)
         self.timestep_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=embedding_dim)

+        self.use_meanflow = use_meanflow
+        self.time_proj_r = None
+        self.timestep_embedder_r = None
+        if use_meanflow:
+            self.time_proj_r = Timesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=0)
+            self.timestep_embedder_r = TimestepEmbedding(in_channels=256, time_embed_dim=embedding_dim)
+
     def forward(
         self,
         timestep: torch.Tensor,
+        timestep_r: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
         timesteps_proj = self.time_proj(timestep)
         timesteps_emb = self.timestep_embedder(timesteps_proj.to(dtype=timestep.dtype))
+
+        if timestep_r is not None:
+            timesteps_proj_r = self.time_proj_r(timestep_r)
+            timesteps_emb_r = self.timestep_embedder_r(timesteps_proj_r.to(dtype=timestep.dtype))
+            timesteps_emb = timesteps_emb + timesteps_emb_r
+
         return timesteps_emb
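The meanflow (step-distilled) variant conditions each denoising step on two timesteps: the current one and a reference timestep r, each with its own sinusoidal projection and MLP, summed into a single embedding. Because timestep_r defaults to None, non-distilled checkpoints are unaffected. A self-contained toy module illustrating the same pattern; it is not the diffusers class, and the tiny linear embedders stand in for Timesteps + TimestepEmbedding:

from typing import Optional

import torch
import torch.nn as nn


class ToyTwoTimestepEmbedding(nn.Module):
    def __init__(self, embedding_dim: int, use_meanflow: bool = False):
        super().__init__()
        self.embed_t = nn.Sequential(nn.Linear(1, embedding_dim), nn.SiLU(), nn.Linear(embedding_dim, embedding_dim))
        self.embed_r = None
        if use_meanflow:
            # A second, independent branch for the reference timestep r.
            self.embed_r = nn.Sequential(
                nn.Linear(1, embedding_dim), nn.SiLU(), nn.Linear(embedding_dim, embedding_dim)
            )

    def forward(self, timestep: torch.Tensor, timestep_r: Optional[torch.Tensor] = None) -> torch.Tensor:
        emb = self.embed_t(timestep.float()[:, None])
        if timestep_r is not None:
            # Same summation as in HunyuanVideo15TimeEmbedding: emb(t) + emb_r(r).
            emb = emb + self.embed_r(timestep_r.float()[:, None])
        return emb


embed = ToyTwoTimestepEmbedding(embedding_dim=8, use_meanflow=True)
print(embed(torch.tensor([900.0])).shape)                         # torch.Size([1, 8])
print(embed(torch.tensor([900.0]), torch.tensor([800.0])).shape)  # torch.Size([1, 8])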
@@ -567,6 +580,7 @@ class HunyuanVideo15Transformer3DModel(
         # YiYi Notes: config based on target_size_config https://github.com/yiyixuxu/hy15/blob/main/hyvideo/pipelines/hunyuan_video_pipeline.py#L205
         target_size: int = 640,  # did not name sample_size since it is in pixel spaces
         task_type: str = "i2v",
+        use_meanflow: bool = False,
     ) -> None:
         super().__init__()
@@ -582,7 +596,7 @@ class HunyuanVideo15Transformer3DModel(
         )
         self.context_embedder_2 = HunyuanVideo15ByT5TextProjection(text_embed_2_dim, 2048, inner_dim)

-        self.time_embed = HunyuanVideo15TimeEmbedding(inner_dim)
+        self.time_embed = HunyuanVideo15TimeEmbedding(inner_dim, use_meanflow=use_meanflow)
         self.cond_type_embed = nn.Embedding(3, inner_dim)
@@ -612,6 +626,7 @@ class HunyuanVideo15Transformer3DModel(
         timestep: torch.LongTensor,
         encoder_hidden_states: torch.Tensor,
         encoder_attention_mask: torch.Tensor,
+        timestep_r: Optional[torch.LongTensor] = None,
         encoder_hidden_states_2: Optional[torch.Tensor] = None,
         encoder_attention_mask_2: Optional[torch.Tensor] = None,
         image_embeds: Optional[torch.Tensor] = None,
@@ -643,7 +658,7 @@ class HunyuanVideo15Transformer3DModel(
         image_rotary_emb = self.rope(hidden_states)

         # 2. Conditional embeddings
-        temb = self.time_embed(timestep)
+        temb = self.time_embed(timestep, timestep_r=timestep_r)
         hidden_states = self.x_embedder(hidden_states)
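Because the new use_meanflow argument goes through the model's config-registered __init__, it is recorded in the model config and becomes readable as transformer.config.use_meanflow, which is exactly what the pipeline checks below and what convert_transformer passes back into the weight converter. A minimal sketch of that registration mechanism using diffusers' ConfigMixin directly; the ToyModel class is illustrative only:

from diffusers.configuration_utils import ConfigMixin, register_to_config


class ToyModel(ConfigMixin):
    config_name = "config.json"

    @register_to_config
    def __init__(self, target_size: int = 640, use_meanflow: bool = False):
        # All decorated __init__ kwargs are stored on self.config automatically.
        self.target_size = target_size
        self.use_meanflow = use_meanflow


model = ToyModel(use_meanflow=True)
print(model.config.use_meanflow)  # True
print(model.config.target_size)   # 640 (defaults are recorded too)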
src/diffusers/pipelines/hunyuan_video1_5/pipeline_hunyuan_video1_5_image2video.py
@@ -852,6 +852,15 @@ class HunyuanVideo15ImageToVideoPipeline(DiffusionPipeline):
                 # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
                 timestep = t.expand(latent_model_input.shape[0]).to(latent_model_input.dtype)

+                if self.transformer.config.use_meanflow:
+                    if i == len(timesteps) - 1:
+                        timestep_r = torch.tensor([0.0], device=device)
+                    else:
+                        timestep_r = timesteps[i + 1]
+                    timestep_r = timestep_r.expand(latents.shape[0]).to(latents.dtype)
+                else:
+                    timestep_r = None
+
                 # Step 1: Collect model inputs needed for the guidance method
                 # conditional inputs should always be first element in the tuple
                 guider_inputs = {
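For the step-distilled model, timestep_r at step i is simply the next timestep in the schedule, falling back to 0 on the final step, so each forward pass sees both ends of the interval it is asked to jump across. A standalone sketch of that selection over a short, made-up 4-step schedule:

import torch

timesteps = torch.tensor([1000.0, 750.0, 500.0, 250.0])  # made-up 4-step schedule

for i, t in enumerate(timesteps):
    if i == len(timesteps) - 1:
        timestep_r = torch.tensor([0.0])  # last step jumps all the way to 0
    else:
        timestep_r = timesteps[i + 1]     # otherwise: the next scheduled timestep
    print(f"step {i}: t={float(t):.0f}, t_r={float(timestep_r):.0f}")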
@@ -893,6 +902,7 @@ class HunyuanVideo15ImageToVideoPipeline(DiffusionPipeline):
                     hidden_states=latent_model_input,
                     image_embeds=image_embeds,
                     timestep=timestep,
+                    timestep_r=timestep_r,
                     attention_kwargs=self.attention_kwargs,
                     return_dict=False,
                     **cond_kwargs,
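Nothing else in the pipeline API changes: with a converted step-distilled checkpoint, inference looks like any other HunyuanVideo15ImageToVideoPipeline run, just with few steps and guidance_scale=1.0 (matching GUIDANCE_CONFIGS above). The sketch below is a rough usage outline rather than a tested recipe: the checkpoint path is a placeholder, and the call arguments follow common diffusers image-to-video conventions, so check the pipeline docstring for the exact names.

import torch
from diffusers import HunyuanVideo15ImageToVideoPipeline
from diffusers.utils import export_to_video, load_image

# Placeholder path: point this at a checkpoint converted with the script above
# for the 480p_i2v_step_distilled transformer type.
pipe = HunyuanVideo15ImageToVideoPipeline.from_pretrained(
    "<path-to-converted-480p_i2v_step_distilled-checkpoint>", torch_dtype=torch.bfloat16
)
pipe.to("cuda")

image = load_image("input.png")
# Argument names below are assumptions based on other diffusers video pipelines.
video = pipe(
    image=image,
    prompt="a cat running through tall grass",
    num_inference_steps=8,  # step-distilled: few sampling steps
    guidance_scale=1.0,     # CFG effectively disabled, per GUIDANCE_CONFIGS
).frames[0]
export_to_video(video, "output.mp4", fps=24)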