"src/git@developer.sourcefind.cn:renzhc/diffusers_dcu.git" did not exist on "ff182ad6694ada3c01b3514eeae03392b2761b92"
Unverified commit 59f1b7b1, authored by Dhruv Nair, committed by GitHub

Hunyuan I2V fast tests fix (#11341)

* update

* update
parent ce1063ac
@@ -344,7 +344,7 @@ class HunyuanVideoImageToVideoPipeline(DiffusionPipeline, HunyuanVideoLoraLoader
         )
         prompt_embeds = self.text_encoder(
             **expanded_inputs,
-            pixel_value=image_embeds,
+            pixel_values=image_embeds,
             output_hidden_states=True,
         ).hidden_states[-(num_hidden_layers_to_skip + 1)]
         prompt_embeds = prompt_embeds.to(dtype=dtype)
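A note for context, not part of the patch: the pipeline forwards this keyword to a LLaVA-style text encoder, and only the plural spelling matches a declared parameter of its `forward`, so with the old spelling the image embeddings never reached the model. A minimal sanity check, assuming an installed `transformers` with LLaVA support (the remaining hunks below are from the corresponding fast-test module):

```python
# Illustrative check (not from the diff): `pixel_values` is a declared
# parameter of the LLaVA encoder's forward; the misspelled `pixel_value`
# is not.
import inspect

from transformers import LlavaForConditionalGeneration

params = inspect.signature(LlavaForConditionalGeneration.forward).parameters
assert "pixel_values" in params
assert "pixel_value" not in params
```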
@@ -24,9 +24,11 @@ from transformers import (
     CLIPTextModel,
     CLIPTokenizer,
     LlamaConfig,
-    LlamaModel,
-    LlamaTokenizer,
+    LlamaTokenizerFast,
+    LlavaConfig,
+    LlavaForConditionalGeneration,
 )
+from transformers.models.clip import CLIPVisionConfig

 from diffusers import (
     AutoencoderKLHunyuanVideo,
@@ -116,7 +118,7 @@ class HunyuanVideoImageToVideoPipelineFastTests(
         torch.manual_seed(0)
         scheduler = FlowMatchEulerDiscreteScheduler(shift=7.0)

-        llama_text_encoder_config = LlamaConfig(
+        text_config = LlamaConfig(
             bos_token_id=0,
             eos_token_id=2,
             hidden_size=16,
@@ -124,11 +126,21 @@ class HunyuanVideoImageToVideoPipelineFastTests(
             layer_norm_eps=1e-05,
             num_attention_heads=4,
             num_hidden_layers=2,
-            pad_token_id=1,
+            pad_token_id=100,
             vocab_size=1000,
             hidden_act="gelu",
             projection_dim=32,
         )
+        vision_config = CLIPVisionConfig(
+            hidden_size=8,
+            intermediate_size=37,
+            projection_dim=32,
+            num_attention_heads=4,
+            num_hidden_layers=2,
+            image_size=224,
+        )
+        llava_text_encoder_config = LlavaConfig(vision_config, text_config, pad_token_id=100, image_token_index=101)
+
         clip_text_encoder_config = CLIPTextConfig(
             bos_token_id=0,
             eos_token_id=2,
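To make the new test setup concrete, here is a hedged, self-contained sketch of the tiny LLaVA encoder these hunks build. The `intermediate_size=37` value and the input shapes are assumptions for illustration (the diff elides some config lines), and the forward pass assumes a recent `transformers` release that accepts pre-expanded image placeholder tokens:

```python
import torch
from transformers import (
    CLIPVisionConfig,
    LlamaConfig,
    LlavaConfig,
    LlavaForConditionalGeneration,
)

# Tiny text tower, mirroring the test config above.
text_config = LlamaConfig(
    bos_token_id=0,
    eos_token_id=2,
    hidden_size=16,
    intermediate_size=37,  # assumption: this line is elided in the diff
    num_attention_heads=4,
    num_hidden_layers=2,
    pad_token_id=100,
    vocab_size=1000,
)
# Tiny vision tower: 224px images with CLIP's default 32px patches
# yield (224 // 32) ** 2 = 49 patch embeddings per image.
vision_config = CLIPVisionConfig(
    hidden_size=8,
    intermediate_size=37,
    projection_dim=32,
    num_attention_heads=4,
    num_hidden_layers=2,
    image_size=224,
)
config = LlavaConfig(vision_config, text_config, pad_token_id=100, image_token_index=101)
model = LlavaForConditionalGeneration(config)

# 49 image placeholder tokens at positions 5..53 of a 64-token prompt,
# matching the test's image_emb_start=5 / image_emb_end=54 and
# max_sequence_length=64 further down.
input_ids = torch.ones(1, 64, dtype=torch.long)
input_ids[0, 5:54] = config.image_token_index
pixel_values = torch.randn(1, 3, 224, 224)

out = model(input_ids=input_ids, pixel_values=pixel_values, output_hidden_states=True)
print(out.hidden_states[-1].shape)  # torch.Size([1, 64, 16])
```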
@@ -144,8 +156,8 @@ class HunyuanVideoImageToVideoPipelineFastTests(
         )

         torch.manual_seed(0)
-        text_encoder = LlamaModel(llama_text_encoder_config)
-        tokenizer = LlamaTokenizer.from_pretrained("finetrainers/dummy-hunyaunvideo", subfolder="tokenizer")
+        text_encoder = LlavaForConditionalGeneration(llava_text_encoder_config)
+        tokenizer = LlamaTokenizerFast.from_pretrained("finetrainers/dummy-hunyaunvideo", subfolder="tokenizer")

         torch.manual_seed(0)
         text_encoder_2 = CLIPTextModel(clip_text_encoder_config)
@@ -153,14 +165,14 @@ class HunyuanVideoImageToVideoPipelineFastTests(

         torch.manual_seed(0)
         image_processor = CLIPImageProcessor(
-            crop_size=336,
+            crop_size=224,
             do_center_crop=True,
             do_normalize=True,
             do_resize=True,
             image_mean=[0.48145466, 0.4578275, 0.40821073],
             image_std=[0.26862954, 0.26130258, 0.27577711],
             resample=3,
-            size=336,
+            size=224,
         )

         components = {
@@ -190,6 +202,10 @@ class HunyuanVideoImageToVideoPipelineFastTests(
             "prompt_template": {
                 "template": "{}",
                 "crop_start": 0,
+                "image_emb_len": 49,
+                "image_emb_start": 5,
+                "image_emb_end": 54,
+                "double_return_token_id": 0,
             },
             "generator": generator,
             "num_inference_steps": 2,
@@ -197,7 +213,7 @@ class HunyuanVideoImageToVideoPipelineFastTests(
             "height": image_height,
             "width": image_width,
             "num_frames": 9,
-            "max_sequence_length": 16,
+            "max_sequence_length": 64,
             "output_type": "pt",
         }
         return inputs
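These numbers are consistent with the 336 → 224 image-processor change above: a 224px crop under CLIP's default 32px patch size yields 49 image embeddings, which is what `image_emb_len` and the `image_emb_start`/`image_emb_end` span encode, and `max_sequence_length` grows from 16 to 64 so those placeholders plus the text prompt still fit. A quick check of that arithmetic (the patch size is an assumption taken from the CLIP default, not stated in the diff):

```python
# Hedged arithmetic check; patch_size=32 is CLIP's default.
image_size, patch_size = 224, 32
num_patches = (image_size // patch_size) ** 2
assert num_patches == 49      # image_emb_len
assert 54 - 5 == num_patches  # image_emb_end - image_emb_start
assert 54 <= 64               # the placeholder span fits in max_sequence_length
```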