Unverified Commit b2ca39c8 authored by Sayak Paul, committed by GitHub

[tests] test `encode_prompt()` in isolation (#10438)

* poc encode_prompt() tests

* fix

* updates.

* fixes

* fixes

* updates

* updates

* updates

* revert

* updates

* updates

* updates

* updates

* remove SDXLOptionalComponentsTesterMixin.

* remove tests that directly leveraged encode_prompt() in one way or another.

* fix imports.

* remove _save_load

* fixes

* fixes

* fixes

* fixes
parent 53217126
@@ -279,6 +279,10 @@ class StableDiffusionLatentUpscalePipelineFastTests(
def test_float16_inference(self):
super().test_float16_inference(expected_max_diff=5e-1)
@unittest.skip("Test not supported for a weird use of `text_input_ids`.")
def test_encode_prompt_works_in_isolation(self):
pass
@require_torch_gpu
@slow
...
@@ -156,39 +156,6 @@ class StableDiffusion3PipelineFastTests(unittest.TestCase, PipelineTesterMixin):
# Outputs should be different here
assert max_diff > 1e-2
def test_stable_diffusion_3_prompt_embeds(self):
pipe = self.pipeline_class(**self.get_dummy_components()).to(torch_device)
inputs = self.get_dummy_inputs(torch_device)
output_with_prompt = pipe(**inputs).images[0]
inputs = self.get_dummy_inputs(torch_device)
prompt = inputs.pop("prompt")
do_classifier_free_guidance = inputs["guidance_scale"] > 1
(
prompt_embeds,
negative_prompt_embeds,
pooled_prompt_embeds,
negative_pooled_prompt_embeds,
) = pipe.encode_prompt(
prompt,
prompt_2=None,
prompt_3=None,
do_classifier_free_guidance=do_classifier_free_guidance,
device=torch_device,
)
output_with_embeds = pipe(
prompt_embeds=prompt_embeds,
negative_prompt_embeds=negative_prompt_embeds,
pooled_prompt_embeds=pooled_prompt_embeds,
negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
**inputs,
).images[0]
max_diff = np.abs(output_with_prompt - output_with_embeds).max()
assert max_diff < 1e-4
def test_fused_qkv_projections(self):
device = "cpu"  # ensure determinism for the device-dependent torch.Generator
components = self.get_dummy_components()
...
@@ -159,39 +159,7 @@ class StableDiffusion3Img2ImgPipelineFastTests(PipelineLatentTesterMixin, unitte
# Outputs should be different here
assert max_diff > 1e-2
def test_stable_diffusion_3_img2img_prompt_embeds(self):
pipe = self.pipeline_class(**self.get_dummy_components()).to(torch_device)
inputs = self.get_dummy_inputs(torch_device)
output_with_prompt = pipe(**inputs).images[0]
inputs = self.get_dummy_inputs(torch_device)
prompt = inputs.pop("prompt")
do_classifier_free_guidance = inputs["guidance_scale"] > 1
(
prompt_embeds,
negative_prompt_embeds,
pooled_prompt_embeds,
negative_pooled_prompt_embeds,
) = pipe.encode_prompt(
prompt,
prompt_2=None,
prompt_3=None,
do_classifier_free_guidance=do_classifier_free_guidance,
device=torch_device,
)
output_with_embeds = pipe(
prompt_embeds=prompt_embeds,
negative_prompt_embeds=negative_prompt_embeds,
pooled_prompt_embeds=pooled_prompt_embeds,
negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
**inputs,
).images[0]
max_diff = np.abs(output_with_prompt - output_with_embeds).max()
assert max_diff < 1e-4
@unittest.skip("Skip for now.")
def test_multi_vae(self):
pass
...
@@ -164,38 +164,5 @@ class StableDiffusion3InpaintPipelineFastTests(PipelineLatentTesterMixin, unitte
# Outputs should be different here
assert max_diff > 1e-2
def test_stable_diffusion_3_inpaint_prompt_embeds(self):
pipe = self.pipeline_class(**self.get_dummy_components()).to(torch_device)
inputs = self.get_dummy_inputs(torch_device)
output_with_prompt = pipe(**inputs).images[0]
inputs = self.get_dummy_inputs(torch_device)
prompt = inputs.pop("prompt")
do_classifier_free_guidance = inputs["guidance_scale"] > 1
(
prompt_embeds,
negative_prompt_embeds,
pooled_prompt_embeds,
negative_pooled_prompt_embeds,
) = pipe.encode_prompt(
prompt,
prompt_2=None,
prompt_3=None,
do_classifier_free_guidance=do_classifier_free_guidance,
device=torch_device,
)
output_with_embeds = pipe(
prompt_embeds=prompt_embeds,
negative_prompt_embeds=negative_prompt_embeds,
pooled_prompt_embeds=pooled_prompt_embeds,
negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
**inputs,
).images[0]
max_diff = np.abs(output_with_prompt - output_with_embeds).max()
assert max_diff < 1e-4
def test_multi_vae(self):
pass
@@ -336,6 +336,13 @@ class AdapterTests:
assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
def test_encode_prompt_works_in_isolation(self):
extra_required_param_value_dict = {
"device": torch.device(torch_device).type,
"do_classifier_free_guidance": self.get_dummy_inputs(device=torch_device).get("guidance_scale", 1.0) > 1.0,
}
return super().test_encode_prompt_works_in_isolation(extra_required_param_value_dict)
class StableDiffusionFullAdapterPipelineFastTests(
AdapterTests, PipelineTesterMixin, PipelineFromPipeTesterMixin, unittest.TestCase
...
@@ -169,3 +169,7 @@ class GligenPipelineFastTests(
def test_inference_batch_single_identical(self):
super().test_inference_batch_single_identical(batch_size=3, expected_max_diff=3e-3)
@unittest.skip("Test not supported as tokenizer is used for parsing bounding boxes.")
def test_encode_prompt_works_in_isolation(self):
pass
@@ -207,3 +207,9 @@ class GligenTextImagePipelineFastTests(
def test_inference_batch_single_identical(self):
super().test_inference_batch_single_identical(batch_size=3, expected_max_diff=3e-3)
@unittest.skip(
"Test not supported because of the use of `text_encoder` in `get_cross_attention_kwargs_with_grounded()`."
)
def test_encode_prompt_works_in_isolation(self):
pass
@@ -258,6 +258,13 @@ class StableDiffusionPanoramaPipelineFastTests(
assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
def test_encode_prompt_works_in_isolation(self):
extra_required_param_value_dict = {
"device": torch.device(torch_device).type,
"do_classifier_free_guidance": self.get_dummy_inputs(device=torch_device).get("guidance_scale", 1.0) > 1.0,
}
return super().test_encode_prompt_works_in_isolation(extra_required_param_value_dict)
@nightly
@require_torch_gpu
...
@@ -153,6 +153,13 @@ class StableDiffusionSAGPipelineFastTests(
# Karras schedulers are not supported
image = pipeline(**inputs).images[0]
def test_encode_prompt_works_in_isolation(self):
extra_required_param_value_dict = {
"device": torch.device(torch_device).type,
"do_classifier_free_guidance": self.get_dummy_inputs(device=torch_device).get("guidance_scale", 1.0) > 1.0,
}
return super().test_encode_prompt_works_in_isolation(extra_required_param_value_dict)
@nightly
@require_torch_gpu
...
@@ -54,7 +54,6 @@ from ..test_pipelines_common import (
PipelineLatentTesterMixin,
PipelineTesterMixin,
SDFunctionTesterMixin,
SDXLOptionalComponentsTesterMixin,
)
@@ -66,7 +65,6 @@ class StableDiffusionXLPipelineFastTests(
IPAdapterTesterMixin,
PipelineLatentTesterMixin,
PipelineTesterMixin,
SDXLOptionalComponentsTesterMixin,
unittest.TestCase,
):
pipeline_class = StableDiffusionXLPipeline
@@ -254,84 +252,6 @@ class StableDiffusionXLPipelineFastTests(
np.abs(output.flatten() - output_sigmas.flatten()).max() > 1e-3
), "use ays sigmas should have different outputs"
def test_stable_diffusion_xl_prompt_embeds(self):
components = self.get_dummy_components()
sd_pipe = StableDiffusionXLPipeline(**components)
sd_pipe = sd_pipe.to(torch_device)
sd_pipe = sd_pipe.to(torch_device)
sd_pipe.set_progress_bar_config(disable=None)
# forward without prompt embeds
inputs = self.get_dummy_inputs(torch_device)
inputs["prompt"] = 2 * [inputs["prompt"]]
inputs["num_images_per_prompt"] = 2
output = sd_pipe(**inputs)
image_slice_1 = output.images[0, -3:, -3:, -1]
# forward with prompt embeds
inputs = self.get_dummy_inputs(torch_device)
prompt = 2 * [inputs.pop("prompt")]
(
prompt_embeds,
negative_prompt_embeds,
pooled_prompt_embeds,
negative_pooled_prompt_embeds,
) = sd_pipe.encode_prompt(prompt)
output = sd_pipe(
**inputs,
prompt_embeds=prompt_embeds,
negative_prompt_embeds=negative_prompt_embeds,
pooled_prompt_embeds=pooled_prompt_embeds,
negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
)
image_slice_2 = output.images[0, -3:, -3:, -1]
# make sure that it's equal
assert np.abs(image_slice_1.flatten() - image_slice_2.flatten()).max() < 1e-4
def test_stable_diffusion_xl_negative_prompt_embeds(self):
components = self.get_dummy_components()
sd_pipe = StableDiffusionXLPipeline(**components)
sd_pipe = sd_pipe.to(torch_device)
sd_pipe = sd_pipe.to(torch_device)
sd_pipe.set_progress_bar_config(disable=None)
# forward without prompt embeds
inputs = self.get_dummy_inputs(torch_device)
negative_prompt = 3 * ["this is a negative prompt"]
inputs["negative_prompt"] = negative_prompt
inputs["prompt"] = 3 * [inputs["prompt"]]
output = sd_pipe(**inputs)
image_slice_1 = output.images[0, -3:, -3:, -1]
# forward with prompt embeds
inputs = self.get_dummy_inputs(torch_device)
negative_prompt = 3 * ["this is a negative prompt"]
prompt = 3 * [inputs.pop("prompt")]
(
prompt_embeds,
negative_prompt_embeds,
pooled_prompt_embeds,
negative_pooled_prompt_embeds,
) = sd_pipe.encode_prompt(prompt, negative_prompt=negative_prompt)
output = sd_pipe(
**inputs,
prompt_embeds=prompt_embeds,
negative_prompt_embeds=negative_prompt_embeds,
pooled_prompt_embeds=pooled_prompt_embeds,
negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
)
image_slice_2 = output.images[0, -3:, -3:, -1]
# make sure that it's equal
assert np.abs(image_slice_1.flatten() - image_slice_2.flatten()).max() < 1e-4
def test_ip_adapter(self):
expected_pipe_slice = None
if torch_device == "cpu":
@@ -345,9 +265,6 @@ class StableDiffusionXLPipelineFastTests(
def test_inference_batch_single_identical(self):
super().test_inference_batch_single_identical(expected_max_diff=3e-3)
def test_save_load_optional_components(self):
self._test_save_load_optional_components()
@require_torch_gpu
def test_stable_diffusion_xl_offloads(self):
pipes = []
@@ -377,41 +294,9 @@ class StableDiffusionXLPipelineFastTests(
assert np.abs(image_slices[0] - image_slices[1]).max() < 1e-3
assert np.abs(image_slices[0] - image_slices[2]).max() < 1e-3
def test_stable_diffusion_xl_img2img_prompt_embeds_only(self):
components = self.get_dummy_components()
sd_pipe = StableDiffusionXLPipeline(**components)
sd_pipe = sd_pipe.to(torch_device)
sd_pipe.set_progress_bar_config(disable=None)
# forward without prompt embeds
generator_device = "cpu"
inputs = self.get_dummy_inputs(generator_device)
inputs["prompt"] = 3 * [inputs["prompt"]]
output = sd_pipe(**inputs)
image_slice_1 = output.images[0, -3:, -3:, -1]
# forward with prompt embeds
generator_device = "cpu"
inputs = self.get_dummy_inputs(generator_device)
prompt = 3 * [inputs.pop("prompt")]
(
prompt_embeds,
_,
pooled_prompt_embeds,
_,
) = sd_pipe.encode_prompt(prompt)
output = sd_pipe(
**inputs,
prompt_embeds=prompt_embeds,
pooled_prompt_embeds=pooled_prompt_embeds,
)
image_slice_2 = output.images[0, -3:, -3:, -1]
# make sure that it's equal
assert np.abs(image_slice_1.flatten() - image_slice_2.flatten()).max() < 1e-4
@unittest.skip("We test this functionality elsewhere already.")
def test_save_load_optional_components(self):
pass

def test_stable_diffusion_two_xl_mixture_of_denoiser_fast(self):
components = self.get_dummy_components()
...
@@ -42,7 +42,6 @@ from ..pipeline_params import TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, TEXT_GUI
from ..test_pipelines_common import (
IPAdapterTesterMixin,
PipelineTesterMixin,
SDXLOptionalComponentsTesterMixin,
assert_mean_pixel_difference,
)
@@ -50,9 +49,7 @@ from ..test_pipelines_common import (
enable_full_determinism()
class StableDiffusionXLAdapterPipelineFastTests(
IPAdapterTesterMixin, PipelineTesterMixin, SDXLOptionalComponentsTesterMixin, unittest.TestCase
):
class StableDiffusionXLAdapterPipelineFastTests(IPAdapterTesterMixin, PipelineTesterMixin, unittest.TestCase):
pipeline_class = StableDiffusionXLAdapterPipeline
params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS
batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS
@@ -300,6 +297,10 @@ class StableDiffusionXLAdapterPipelineFastTests(
return super().test_ip_adapter(expected_pipe_slice=expected_pipe_slice)
@unittest.skip("We test this functionality elsewhere already.")
def test_save_load_optional_components(self):
pass
def test_stable_diffusion_adapter_default_case(self):
device = "cpu"  # ensure determinism for the device-dependent torch.Generator
components = self.get_dummy_components()
@@ -373,9 +374,6 @@ class StableDiffusionXLAdapterPipelineFastTests(
expected_out_image_size,
)
def test_save_load_optional_components(self):
return self._test_save_load_optional_components()
def test_adapter_sdxl_lcm(self):
device = "cpu"  # ensure determinism for the device-dependent torch.Generator
@@ -515,6 +513,10 @@ class StableDiffusionXLMultiAdapterPipelineFastTests(
logger.setLevel(level=diffusers.logging.WARNING)
@unittest.skip("We test this functionality elsewhere already.")
def test_save_load_optional_components(self):
pass
def test_num_images_per_prompt(self):
components = self.get_dummy_components()
pipe = self.pipeline_class(**components)
...
@@ -57,7 +57,6 @@ from ..test_pipelines_common import (
IPAdapterTesterMixin,
PipelineLatentTesterMixin,
PipelineTesterMixin,
SDXLOptionalComponentsTesterMixin,
)
@@ -266,52 +265,10 @@ class StableDiffusionXLImg2ImgPipelineFastTests(
def test_inference_batch_single_identical(self):
super().test_inference_batch_single_identical(expected_max_diff=3e-3)
# TODO(Patrick, Sayak) - skip for now as this requires more refiner tests
@unittest.skip("Skip for now.")
def test_save_load_optional_components(self):
pass
def test_stable_diffusion_xl_img2img_negative_prompt_embeds(self):
components = self.get_dummy_components()
sd_pipe = StableDiffusionXLImg2ImgPipeline(**components)
sd_pipe = sd_pipe.to(torch_device)
sd_pipe = sd_pipe.to(torch_device)
sd_pipe.set_progress_bar_config(disable=None)
# forward without prompt embeds
generator_device = "cpu"
inputs = self.get_dummy_inputs(generator_device)
negative_prompt = 3 * ["this is a negative prompt"]
inputs["negative_prompt"] = negative_prompt
inputs["prompt"] = 3 * [inputs["prompt"]]
output = sd_pipe(**inputs)
image_slice_1 = output.images[0, -3:, -3:, -1]
# forward with prompt embeds
generator_device = "cpu"
inputs = self.get_dummy_inputs(generator_device)
negative_prompt = 3 * ["this is a negative prompt"]
prompt = 3 * [inputs.pop("prompt")]
(
prompt_embeds,
negative_prompt_embeds,
pooled_prompt_embeds,
negative_pooled_prompt_embeds,
) = sd_pipe.encode_prompt(prompt, negative_prompt=negative_prompt)
output = sd_pipe(
**inputs,
prompt_embeds=prompt_embeds,
negative_prompt_embeds=negative_prompt_embeds,
pooled_prompt_embeds=pooled_prompt_embeds,
negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
)
image_slice_2 = output.images[0, -3:, -3:, -1]
# make sure that it's equal
assert np.abs(image_slice_1.flatten() - image_slice_2.flatten()).max() < 1e-4
def test_ip_adapter(self):
expected_pipe_slice = None
if torch_device == "cpu":
@@ -519,7 +476,7 @@ class StableDiffusionXLImg2ImgPipelineFastTests(
class StableDiffusionXLImg2ImgRefinerOnlyPipelineFastTests(
PipelineLatentTesterMixin, PipelineTesterMixin, SDXLOptionalComponentsTesterMixin, unittest.TestCase
PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase
):
pipeline_class = StableDiffusionXLImg2ImgPipeline
params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS - {"height", "width"}
@@ -697,92 +654,15 @@ class StableDiffusionXLImg2ImgRefinerOnlyPipelineFastTests(
> 1e-4
)
def test_stable_diffusion_xl_img2img_negative_prompt_embeds(self):
components = self.get_dummy_components()
sd_pipe = StableDiffusionXLImg2ImgPipeline(**components)
sd_pipe = sd_pipe.to(torch_device)
sd_pipe = sd_pipe.to(torch_device)
sd_pipe.set_progress_bar_config(disable=None)
# forward without prompt embeds
generator_device = "cpu"
inputs = self.get_dummy_inputs(generator_device)
negative_prompt = 3 * ["this is a negative prompt"]
inputs["negative_prompt"] = negative_prompt
inputs["prompt"] = 3 * [inputs["prompt"]]
output = sd_pipe(**inputs)
image_slice_1 = output.images[0, -3:, -3:, -1]
# forward with prompt embeds
generator_device = "cpu"
inputs = self.get_dummy_inputs(generator_device)
negative_prompt = 3 * ["this is a negative prompt"]
prompt = 3 * [inputs.pop("prompt")]
(
prompt_embeds,
negative_prompt_embeds,
pooled_prompt_embeds,
negative_pooled_prompt_embeds,
) = sd_pipe.encode_prompt(prompt, negative_prompt=negative_prompt)
output = sd_pipe(
**inputs,
prompt_embeds=prompt_embeds,
negative_prompt_embeds=negative_prompt_embeds,
pooled_prompt_embeds=pooled_prompt_embeds,
negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
)
image_slice_2 = output.images[0, -3:, -3:, -1]
# make sure that it's equal
assert np.abs(image_slice_1.flatten() - image_slice_2.flatten()).max() < 1e-4
def test_stable_diffusion_xl_img2img_prompt_embeds_only(self):
components = self.get_dummy_components()
sd_pipe = StableDiffusionXLImg2ImgPipeline(**components)
sd_pipe = sd_pipe.to(torch_device)
sd_pipe.set_progress_bar_config(disable=None)
# forward without prompt embeds
generator_device = "cpu"
inputs = self.get_dummy_inputs(generator_device)
inputs["prompt"] = 3 * [inputs["prompt"]]
output = sd_pipe(**inputs)
image_slice_1 = output.images[0, -3:, -3:, -1]
# forward with prompt embeds
generator_device = "cpu"
inputs = self.get_dummy_inputs(generator_device)
prompt = 3 * [inputs.pop("prompt")]
(
prompt_embeds,
_,
pooled_prompt_embeds,
_,
) = sd_pipe.encode_prompt(prompt)
output = sd_pipe(
**inputs,
prompt_embeds=prompt_embeds,
pooled_prompt_embeds=pooled_prompt_embeds,
)
image_slice_2 = output.images[0, -3:, -3:, -1]
# make sure that it's equal
assert np.abs(image_slice_1.flatten() - image_slice_2.flatten()).max() < 1e-4
def test_attention_slicing_forward_pass(self):
super().test_attention_slicing_forward_pass(expected_max_diff=3e-3)
def test_inference_batch_single_identical(self):
super().test_inference_batch_single_identical(expected_max_diff=3e-3)
@unittest.skip("We test this functionality elsewhere already.")
def test_save_load_optional_components(self): def test_save_load_optional_components(self):
self._test_save_load_optional_components() pass
@slow @slow
......
@@ -301,50 +301,10 @@ class StableDiffusionXLInpaintPipelineFastTests(
def test_inference_batch_single_identical(self):
super().test_inference_batch_single_identical(expected_max_diff=3e-3)
# TODO(Patrick, Sayak) - skip for now as this requires more refiner tests
@unittest.skip("Skip for now.")
def test_save_load_optional_components(self):
pass
def test_stable_diffusion_xl_inpaint_negative_prompt_embeds(self):
components = self.get_dummy_components()
sd_pipe = StableDiffusionXLInpaintPipeline(**components)
sd_pipe = sd_pipe.to(torch_device)
sd_pipe = sd_pipe.to(torch_device)
sd_pipe.set_progress_bar_config(disable=None)
# forward without prompt embeds
inputs = self.get_dummy_inputs(torch_device)
negative_prompt = 3 * ["this is a negative prompt"]
inputs["negative_prompt"] = negative_prompt
inputs["prompt"] = 3 * [inputs["prompt"]]
output = sd_pipe(**inputs)
image_slice_1 = output.images[0, -3:, -3:, -1]
# forward with prompt embeds
inputs = self.get_dummy_inputs(torch_device)
negative_prompt = 3 * ["this is a negative prompt"]
prompt = 3 * [inputs.pop("prompt")]
(
prompt_embeds,
negative_prompt_embeds,
pooled_prompt_embeds,
negative_pooled_prompt_embeds,
) = sd_pipe.encode_prompt(prompt, negative_prompt=negative_prompt)
output = sd_pipe(
**inputs,
prompt_embeds=prompt_embeds,
negative_prompt_embeds=negative_prompt_embeds,
pooled_prompt_embeds=pooled_prompt_embeds,
negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
)
image_slice_2 = output.images[0, -3:, -3:, -1]
# make sure that it's equal
assert np.abs(image_slice_1.flatten() - image_slice_2.flatten()).max() < 1e-4
@require_torch_gpu
def test_stable_diffusion_xl_offloads(self):
pipes = []
...
@@ -40,7 +40,6 @@ from ..test_pipelines_common import (
PipelineKarrasSchedulerTesterMixin,
PipelineLatentTesterMixin,
PipelineTesterMixin,
SDXLOptionalComponentsTesterMixin,
)
@@ -51,7 +50,6 @@ class StableDiffusionXLInstructPix2PixPipelineFastTests(
PipelineLatentTesterMixin,
PipelineKarrasSchedulerTesterMixin,
PipelineTesterMixin,
SDXLOptionalComponentsTesterMixin,
unittest.TestCase,
):
pipeline_class = StableDiffusionXLInstructPix2PixPipeline
@@ -182,8 +180,10 @@ class StableDiffusionXLInstructPix2PixPipelineFastTests(
max_diff = np.abs(out - out_latents_inputs).max()
self.assertLess(max_diff, 1e-4, "passing latents as image input generate different result from passing image")
@unittest.skip("Test not supported at the moment.")
def test_cfg(self): def test_cfg(self):
pass pass
@unittest.skip("Functionality is tested elsewhere.")
def test_save_load_optional_components(self): def test_save_load_optional_components(self):
self._test_save_load_optional_components() pass
@@ -184,6 +184,10 @@ class StableUnCLIPPipelineFastTests(
def test_inference_batch_single_identical(self):
self._test_inference_batch_single_identical(expected_max_diff=1e-3)
@unittest.skip("Test not supported because of the use of `_encode_prior_prompt()`.")
def test_encode_prompt_works_in_isolation(self):
pass
@nightly
@require_torch_gpu
...
@@ -207,6 +207,10 @@ class StableUnCLIPImg2ImgPipelineFastTests(
def test_xformers_attention_forwardGenerator_pass(self):
self._test_xformers_attention_forwardGenerator_pass(test_max_difference=False)
@unittest.skip("Test not supported at the moment.")
def test_encode_prompt_works_in_isolation(self):
pass
@nightly
@require_torch_gpu
...
@@ -42,6 +42,7 @@ from diffusers.pipelines.pipeline_utils import StableDiffusionMixin
from diffusers.schedulers import KarrasDiffusionSchedulers
from diffusers.utils import logging
from diffusers.utils.import_utils import is_xformers_available
from diffusers.utils.source_code_parsing_utils import ReturnNameVisitor
from diffusers.utils.testing_utils import (
CaptureLogger,
require_accelerate_version_greater,
@@ -1984,6 +1985,118 @@ class PipelineTesterMixin:
assert f"You are trying to load the model files of the `variant={variant}`" in str(error.exception)
def test_encode_prompt_works_in_isolation(self, extra_required_param_value_dict=None, atol=1e-4, rtol=1e-4):
if not hasattr(self.pipeline_class, "encode_prompt"):
return
components = self.get_dummy_components()
# We initialize the pipeline with only text encoders and tokenizers,
# mimicking a real-world scenario.
components_with_text_encoders = {}
for k in components:
if "text" in k or "tokenizer" in k:
components_with_text_encoders[k] = components[k]
else:
components_with_text_encoders[k] = None
pipe_with_just_text_encoder = self.pipeline_class(**components_with_text_encoders)
pipe_with_just_text_encoder = pipe_with_just_text_encoder.to(torch_device)
# Get inputs and also the args of `encode_prompt()`.
inputs = self.get_dummy_inputs(torch_device)
encode_prompt_signature = inspect.signature(pipe_with_just_text_encoder.encode_prompt)
encode_prompt_parameters = list(encode_prompt_signature.parameters.values())
# Required args of `encode_prompt()`, i.e., those without a default value.
required_params = []
for param in encode_prompt_parameters:
if param.name == "self" or param.name == "kwargs":
continue
if param.default is inspect.Parameter.empty:
required_params.append(param.name)
# Craft inputs for the `encode_prompt()` method to run in isolation.
encode_prompt_param_names = [p.name for p in encode_prompt_parameters if p.name != "self"]
input_keys = list(inputs.keys())
encode_prompt_inputs = {k: inputs.pop(k) for k in input_keys if k in encode_prompt_param_names}
pipe_call_signature = inspect.signature(pipe_with_just_text_encoder.__call__)
pipe_call_parameters = pipe_call_signature.parameters
# For each required arg in encode_prompt, check if it's missing
# in encode_prompt_inputs. If so, see if __call__ has a default
# for that arg and use it if available.
for required_param_name in required_params:
if required_param_name not in encode_prompt_inputs:
pipe_call_param = pipe_call_parameters.get(required_param_name, None)
if pipe_call_param is not None and pipe_call_param.default is not inspect.Parameter.empty:
# Use the default from pipe.__call__
encode_prompt_inputs[required_param_name] = pipe_call_param.default
elif extra_required_param_value_dict is not None and isinstance(extra_required_param_value_dict, dict):
encode_prompt_inputs[required_param_name] = extra_required_param_value_dict[required_param_name]
else:
raise ValueError(
f"Required parameter '{required_param_name}' in "
f"encode_prompt has no default in either encode_prompt or __call__."
)
# Compute `encode_prompt()`.
with torch.no_grad():
encoded_prompt_outputs = pipe_with_just_text_encoder.encode_prompt(**encode_prompt_inputs)
# Programmatically determine the return names of `encode_prompt()`.
ast_visitor = ReturnNameVisitor()
encode_prompt_tree = ast_visitor.get_ast_tree(cls=self.pipeline_class)
ast_visitor.visit(encode_prompt_tree)
prompt_embed_kwargs = ast_visitor.return_names
prompt_embeds_kwargs = dict(zip(prompt_embed_kwargs, encoded_prompt_outputs))
# Pack the outputs of `encode_prompt`.
adapted_prompt_embeds_kwargs = {
k: prompt_embeds_kwargs.pop(k) for k in list(prompt_embeds_kwargs.keys()) if k in pipe_call_parameters
}
# now initialize a pipeline without text encoders and compute outputs with the
# `encode_prompt()` outputs and other relevant inputs.
components_with_text_encoders = {}
for k in components:
if "text" in k or "tokenizer" in k:
components_with_text_encoders[k] = None
else:
components_with_text_encoders[k] = components[k]
pipe_without_text_encoders = self.pipeline_class(**components_with_text_encoders).to(torch_device)
# Set `negative_prompt` to None if it was present in `inputs`, since its embeds
# have already been computed. Otherwise a non-None default `negative_prompt`
# (PixArt, for example) would wrongly interfere with the precomputed embeddings.
pipe_without_tes_inputs = {**inputs, **adapted_prompt_embeds_kwargs}
if (
pipe_call_parameters.get("negative_prompt", None) is not None
and pipe_call_parameters.get("negative_prompt").default is not None
):
pipe_without_tes_inputs.update({"negative_prompt": None})
# Pipelines like attend and excite have `prompt` as a required argument.
if (
pipe_call_parameters.get("prompt", None) is not None
and pipe_call_parameters.get("prompt").default is inspect.Parameter.empty
and pipe_call_parameters.get("prompt_embeds", None) is not None
and pipe_call_parameters.get("prompt_embeds").default is None
):
pipe_without_tes_inputs.update({"prompt": None})
pipe_out = pipe_without_text_encoders(**pipe_without_tes_inputs)[0]
# Compare against regular pipeline outputs.
full_pipe = self.pipeline_class(**components).to(torch_device)
inputs = self.get_dummy_inputs(torch_device)
pipe_out_2 = full_pipe(**inputs)[0]
if isinstance(pipe_out, np.ndarray) and isinstance(pipe_out_2, np.ndarray):
self.assertTrue(np.allclose(pipe_out, pipe_out_2, atol=atol, rtol=rtol))
elif isinstance(pipe_out, torch.Tensor) and isinstance(pipe_out_2, torch.Tensor):
self.assertTrue(torch.allclose(pipe_out, pipe_out_2, atol=atol, rtol=rtol))
def test_StableDiffusionMixin_component(self):
"""Any pipeline that have LDMFuncMixin should have vae and unet components."""
if not issubclass(self.pipeline_class, StableDiffusionMixin):
@@ -2256,150 +2369,6 @@ class PipelinePushToHubTester(unittest.TestCase):
delete_repo(self.repo_id, token=TOKEN)
# For SDXL and its derivative pipelines (such as ControlNet), we have the text encoders
# and the tokenizers as optional components. So, we need to override the `test_save_load_optional_components()`
# test for all such pipelines. This requires us to use a custom `encode_prompt()` function.
class SDXLOptionalComponentsTesterMixin:
def encode_prompt(
self, tokenizers, text_encoders, prompt: str, num_images_per_prompt: int = 1, negative_prompt: str = None
):
device = text_encoders[0].device
if isinstance(prompt, str):
prompt = [prompt]
batch_size = len(prompt)
prompt_embeds_list = []
for tokenizer, text_encoder in zip(tokenizers, text_encoders):
text_inputs = tokenizer(
prompt,
padding="max_length",
max_length=tokenizer.model_max_length,
truncation=True,
return_tensors="pt",
)
text_input_ids = text_inputs.input_ids
prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=True)
pooled_prompt_embeds = prompt_embeds[0]
prompt_embeds = prompt_embeds.hidden_states[-2]
prompt_embeds_list.append(prompt_embeds)
prompt_embeds = torch.concat(prompt_embeds_list, dim=-1)
if negative_prompt is None:
negative_prompt_embeds = torch.zeros_like(prompt_embeds)
negative_pooled_prompt_embeds = torch.zeros_like(pooled_prompt_embeds)
else:
negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt
negative_prompt_embeds_list = []
for tokenizer, text_encoder in zip(tokenizers, text_encoders):
uncond_input = tokenizer(
negative_prompt,
padding="max_length",
max_length=tokenizer.model_max_length,
truncation=True,
return_tensors="pt",
)
negative_prompt_embeds = text_encoder(uncond_input.input_ids.to(device), output_hidden_states=True)
negative_pooled_prompt_embeds = negative_prompt_embeds[0]
negative_prompt_embeds = negative_prompt_embeds.hidden_states[-2]
negative_prompt_embeds_list.append(negative_prompt_embeds)
negative_prompt_embeds = torch.concat(negative_prompt_embeds_list, dim=-1)
bs_embed, seq_len, _ = prompt_embeds.shape
# duplicate text embeddings for each generation per prompt, using mps friendly method
prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
# for classifier-free guidance
# duplicate unconditional embeddings for each generation per prompt, using mps friendly method
seq_len = negative_prompt_embeds.shape[1]
negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, num_images_per_prompt).view(
bs_embed * num_images_per_prompt, -1
)
# for classifier-free guidance
negative_pooled_prompt_embeds = negative_pooled_prompt_embeds.repeat(1, num_images_per_prompt).view(
bs_embed * num_images_per_prompt, -1
)
return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds
def _test_save_load_optional_components(self, expected_max_difference=1e-4):
components = self.get_dummy_components()
pipe = self.pipeline_class(**components)
for optional_component in pipe._optional_components:
setattr(pipe, optional_component, None)
for component in pipe.components.values():
if hasattr(component, "set_default_attn_processor"):
component.set_default_attn_processor()
pipe.to(torch_device)
pipe.set_progress_bar_config(disable=None)
generator_device = "cpu"
inputs = self.get_dummy_inputs(generator_device)
tokenizer = components.pop("tokenizer")
tokenizer_2 = components.pop("tokenizer_2")
text_encoder = components.pop("text_encoder")
text_encoder_2 = components.pop("text_encoder_2")
tokenizers = [tokenizer, tokenizer_2] if tokenizer is not None else [tokenizer_2]
text_encoders = [text_encoder, text_encoder_2] if text_encoder is not None else [text_encoder_2]
prompt = inputs.pop("prompt")
(
prompt_embeds,
negative_prompt_embeds,
pooled_prompt_embeds,
negative_pooled_prompt_embeds,
) = self.encode_prompt(tokenizers, text_encoders, prompt)
inputs["prompt_embeds"] = prompt_embeds
inputs["negative_prompt_embeds"] = negative_prompt_embeds
inputs["pooled_prompt_embeds"] = pooled_prompt_embeds
inputs["negative_pooled_prompt_embeds"] = negative_pooled_prompt_embeds
output = pipe(**inputs)[0]
with tempfile.TemporaryDirectory() as tmpdir:
pipe.save_pretrained(tmpdir)
pipe_loaded = self.pipeline_class.from_pretrained(tmpdir)
for component in pipe_loaded.components.values():
if hasattr(component, "set_default_attn_processor"):
component.set_default_attn_processor()
pipe_loaded.to(torch_device)
pipe_loaded.set_progress_bar_config(disable=None)
for optional_component in pipe._optional_components:
self.assertTrue(
getattr(pipe_loaded, optional_component) is None,
f"`{optional_component}` did not stay set to None after loading.",
)
inputs = self.get_dummy_inputs(generator_device)
_ = inputs.pop("prompt")
inputs["prompt_embeds"] = prompt_embeds
inputs["negative_prompt_embeds"] = negative_prompt_embeds
inputs["pooled_prompt_embeds"] = pooled_prompt_embeds
inputs["negative_pooled_prompt_embeds"] = negative_pooled_prompt_embeds
output_loaded = pipe_loaded(**inputs)[0]
max_diff = np.abs(to_np(output) - to_np(output_loaded)).max()
self.assertLess(max_diff, expected_max_difference)
class PyramidAttentionBroadcastTesterMixin:
pab_config = PyramidAttentionBroadcastConfig(
spatial_attention_block_skip_range=2,
...
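The new test_encode_prompt_works_in_isolation above relies on ReturnNameVisitor (imported from diffusers.utils.source_code_parsing_utils) to discover, from the pipeline's source, which names encode_prompt() returns, so the returned tuple can be zipped into prompt_embeds-style keyword arguments for __call__. The helper's implementation is not shown in this diff; below is a minimal sketch of what such an AST visitor could look like, an illustrative stand-in rather than the actual diffusers code:

import ast
import inspect


class ReturnNameVisitorSketch(ast.NodeVisitor):
    """Illustrative stand-in for diffusers' ReturnNameVisitor: collects the variable
    names returned by a class's `encode_prompt()` method."""

    def __init__(self):
        self.return_names = []

    def visit_FunctionDef(self, node):
        if node.name == "encode_prompt":
            for child in ast.walk(node):
                if isinstance(child, ast.Return) and isinstance(child.value, ast.Tuple):
                    # e.g. `return prompt_embeds, negative_prompt_embeds, ...`
                    self.return_names = [elt.id for elt in child.value.elts if isinstance(elt, ast.Name)]
        self.generic_visit(node)

    def get_ast_tree(self, cls):
        # Parse the module that defines the pipeline class.
        return ast.parse(inspect.getsource(inspect.getmodule(cls)))

With those names, the test packs the outputs of encode_prompt() into keyword arguments roughly via dict(zip(visitor.return_names, outputs)), which is how the embeddings are forwarded to the text-encoder-free pipeline in the test above.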
@@ -173,6 +173,14 @@ class TextToVideoSDPipelineFastTests(PipelineTesterMixin, SDFunctionTesterMixin,
def test_num_images_per_prompt(self):
pass
def test_encode_prompt_works_in_isolation(self):
extra_required_param_value_dict = {
"device": torch.device(torch_device).type,
"num_images_per_prompt": 1,
"do_classifier_free_guidance": self.get_dummy_inputs(device=torch_device).get("guidance_scale", 1.0) > 1.0,
}
return super().test_encode_prompt_works_in_isolation(extra_required_param_value_dict)
@slow
@skip_mps
...
@@ -197,6 +197,14 @@ class VideoToVideoSDPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
def test_num_images_per_prompt(self):
pass
def test_encode_prompt_works_in_isolation(self):
extra_required_param_value_dict = {
"device": torch.device(torch_device).type,
"num_images_per_prompt": 1,
"do_classifier_free_guidance": self.get_dummy_inputs(device=torch_device).get("guidance_scale", 1.0) > 1.0,
}
return super().test_encode_prompt_works_in_isolation(extra_required_param_value_dict)
@nightly
@skip_mps
...
@@ -578,6 +578,12 @@ class UniDiffuserPipelineFastTests(
expected_text_prefix = '" This This'
assert text[0][: len(expected_text_prefix)] == expected_text_prefix
@unittest.skip(
"Test not supported because it has a bunch of direct configs at init and also because this pipeline isn't used that much now."
)
def test_encode_prompt_works_in_isolation(self):
pass
@nightly
@require_torch_gpu
...
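In outline, the shared test added in test_pipelines_common.py does the following for every pipeline that keeps it enabled: build the pipeline with only its tokenizers and text encoders, call encode_prompt(), then build the pipeline again without any text encoders, feed it the precomputed embeddings, and compare the result against a regular text-prompt run. A condensed sketch of that flow, assuming a Stable-Diffusion-style encode_prompt() signature (the helper name and signature below are illustrative, not part of the diff):

import numpy as np
import torch


def check_encode_prompt_in_isolation(pipeline_cls, components, get_dummy_inputs, atol=1e-4, rtol=1e-4):
    inputs = get_dummy_inputs()
    prompt = inputs.pop("prompt")

    # 1) Pipeline with only tokenizers/text encoders; every other component is None.
    text_only = {k: (v if "text" in k or "tokenizer" in k else None) for k, v in components.items()}
    with torch.no_grad():
        prompt_embeds, negative_prompt_embeds = pipeline_cls(**text_only).encode_prompt(
            prompt,
            device="cpu",
            num_images_per_prompt=1,
            do_classifier_free_guidance=inputs.get("guidance_scale", 1.0) > 1.0,
        )

    # 2) Pipeline without any text encoders, fed the precomputed embeddings.
    no_text = {k: (None if "text" in k or "tokenizer" in k else v) for k, v in components.items()}
    out_from_embeds = pipeline_cls(**no_text)(
        prompt_embeds=prompt_embeds, negative_prompt_embeds=negative_prompt_embeds, **inputs
    )[0]

    # 3) Full pipeline run from the plain text prompt as the reference (fresh inputs,
    #    so any torch.Generator state matches the run above).
    out_reference = pipeline_cls(**components)(**get_dummy_inputs())[0]
    assert np.allclose(out_from_embeds, out_reference, atol=atol, rtol=rtol)

Pipelines whose encode_prompt() requires arguments with no default in either encode_prompt() or __call__ pass them through extra_required_param_value_dict, as the overrides above do; pipelines whose prompt encoding cannot run in isolation skip the test with an explanatory reason.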