Unverified commit 8c661ea5, authored by Yao Matrix, committed by GitHub

enable lora cases on XPU (#11506)



* enable lora cases on XPU
Signed-off-by: Yao Matrix <matrix.yao@intel.com>

* remove hunyuanvideo xpu expectation
Signed-off-by: Yao Matrix <matrix.yao@intel.com>

---------
Signed-off-by: Yao Matrix <matrix.yao@intel.com>
parent d7ffe601
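Every hunk below follows one pattern: a CUDA-only call, decorator, or marker is swapped for a backend-agnostic counterpart from `diffusers.utils.testing_utils`, so the same tests run on CUDA and XPU. As a minimal sketch of the dispatch such a helper performs (the real `backend_empty_cache` ships in `diffusers.utils.testing_utils`; its exact dispatch table may differ):

```python
import torch

def backend_empty_cache(device: str) -> None:
    """Sketch only: free cached allocator memory on the active backend."""
    if device == "cuda":
        torch.cuda.empty_cache()
    elif device == "xpu":
        # torch.xpu is available in recent PyTorch builds with XPU support.
        torch.xpu.empty_cache()
    elif device == "mps":
        torch.mps.empty_cache()
    # CPU has no allocator cache, so nothing to do.
```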
@@ -31,13 +31,14 @@ from diffusers import FlowMatchEulerDiscreteScheduler, FluxControlPipeline, Flux
 from diffusers.utils import load_image, logging
 from diffusers.utils.testing_utils import (
     CaptureLogger,
+    backend_empty_cache,
     floats_tensor,
     is_peft_available,
     nightly,
     numpy_cosine_similarity_distance,
-    require_big_gpu_with_torch_cuda,
+    require_big_accelerator,
     require_peft_backend,
-    require_torch_gpu,
+    require_torch_accelerator,
     slow,
     torch_device,
 )
@@ -809,10 +810,10 @@ class FluxControlLoRATests(unittest.TestCase, PeftLoraLoaderMixinTests):
 @slow
 @nightly
-@require_torch_gpu
+@require_torch_accelerator
 @require_peft_backend
-@require_big_gpu_with_torch_cuda
+@require_big_accelerator
-@pytest.mark.big_gpu_with_torch_cuda
+@pytest.mark.big_accelerator
 class FluxLoRAIntegrationTests(unittest.TestCase):
     """internal note: The integration slices were obtained on audace.
@@ -827,7 +828,7 @@ class FluxLoRAIntegrationTests(unittest.TestCase):
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

         self.pipeline = FluxPipeline.from_pretrained("black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16)
@@ -836,13 +837,13 @@ class FluxLoRAIntegrationTests(unittest.TestCase):
         del self.pipeline
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def test_flux_the_last_ben(self):
         self.pipeline.load_lora_weights("TheLastBen/Jon_Snow_Flux_LoRA", weight_name="jon_snow.safetensors")
         self.pipeline.fuse_lora()
         self.pipeline.unload_lora_weights()
-        # Instead of calling `enable_model_cpu_offload()`, we do a cuda placement here because the CI
+        # Instead of calling `enable_model_cpu_offload()`, we do an accelerator placement here because the CI
         # run supports it. We have about 34GB RAM in the CI runner which kills the test when run with
         # `enable_model_cpu_offload()`. We repeat this for the other tests, too.
         self.pipeline = self.pipeline.to(torch_device)
@@ -956,10 +957,10 @@ class FluxLoRAIntegrationTests(unittest.TestCase):
 @nightly
-@require_torch_gpu
+@require_torch_accelerator
 @require_peft_backend
-@require_big_gpu_with_torch_cuda
+@require_big_accelerator
-@pytest.mark.big_gpu_with_torch_cuda
+@pytest.mark.big_accelerator
 class FluxControlLoRAIntegrationTests(unittest.TestCase):
     num_inference_steps = 10
     seed = 0
@@ -969,17 +970,17 @@ class FluxControlLoRAIntegrationTests(unittest.TestCase):
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

         self.pipeline = FluxControlPipeline.from_pretrained(
             "black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16
-        ).to("cuda")
+        ).to(torch_device)

     def tearDown(self):
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     @parameterized.expand(["black-forest-labs/FLUX.1-Canny-dev-lora", "black-forest-labs/FLUX.1-Depth-dev-lora"])
     def test_lora(self, lora_ckpt_id):
...
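The `.to("cuda")` to `.to(torch_device)` changes above rely on `torch_device` resolving to whichever accelerator is present. A hedged sketch of that resolution; `diffusers.utils.testing_utils` computes the actual constant, and the `DIFFUSERS_TEST_DEVICE` override named here is an assumption rather than a documented contract:

```python
import os
import torch

# Sketch of how a torch_device constant can be derived.
if "DIFFUSERS_TEST_DEVICE" in os.environ:
    torch_device = os.environ["DIFFUSERS_TEST_DEVICE"]  # assumed override hook
elif torch.cuda.is_available():
    torch_device = "cuda"
elif hasattr(torch, "xpu") and torch.xpu.is_available():
    torch_device = "xpu"
else:
    torch_device = "cpu"

# With that in place, pipeline placement is backend-agnostic:
# pipeline = pipeline.to(torch_device)
```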
@@ -28,13 +28,16 @@ from diffusers import (
     HunyuanVideoTransformer3DModel,
 )
 from diffusers.utils.testing_utils import (
+    Expectations,
+    backend_empty_cache,
     floats_tensor,
     nightly,
     numpy_cosine_similarity_distance,
-    require_big_gpu_with_torch_cuda,
+    require_big_accelerator,
     require_peft_backend,
-    require_torch_gpu,
+    require_torch_accelerator,
     skip_mps,
+    torch_device,
 )
@@ -192,10 +195,10 @@ class HunyuanVideoLoRATests(unittest.TestCase, PeftLoraLoaderMixinTests):
 @nightly
-@require_torch_gpu
+@require_torch_accelerator
 @require_peft_backend
-@require_big_gpu_with_torch_cuda
+@require_big_accelerator
-@pytest.mark.big_gpu_with_torch_cuda
+@pytest.mark.big_accelerator
 class HunyuanVideoLoRAIntegrationTests(unittest.TestCase):
     """internal note: The integration slices were obtained on DGX.
@@ -210,7 +213,7 @@ class HunyuanVideoLoRAIntegrationTests(unittest.TestCase):
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

         model_id = "hunyuanvideo-community/HunyuanVideo"
         transformer = HunyuanVideoTransformer3DModel.from_pretrained(
@@ -218,13 +221,13 @@ class HunyuanVideoLoRAIntegrationTests(unittest.TestCase):
         )
         self.pipeline = HunyuanVideoPipeline.from_pretrained(
             model_id, transformer=transformer, torch_dtype=torch.float16
-        ).to("cuda")
+        ).to(torch_device)

     def tearDown(self):
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def test_original_format_cseti(self):
         self.pipeline.load_lora_weights(
@@ -249,8 +252,13 @@ class HunyuanVideoLoRAIntegrationTests(unittest.TestCase):
         out_slice = np.concatenate((out[:8], out[-8:]))

         # fmt: off
-        expected_slice = np.array([0.1013, 0.1924, 0.0078, 0.1021, 0.1929, 0.0078, 0.1023, 0.1919, 0.7402, 0.104, 0.4482, 0.7354, 0.0925, 0.4382, 0.7275, 0.0815])
+        expected_slices = Expectations(
+            {
+                ("cuda", 7): np.array([0.1013, 0.1924, 0.0078, 0.1021, 0.1929, 0.0078, 0.1023, 0.1919, 0.7402, 0.104, 0.4482, 0.7354, 0.0925, 0.4382, 0.7275, 0.0815]),
+            }
+        )
         # fmt: on
+        expected_slice = expected_slices.get_expectation()

         max_diff = numpy_cosine_similarity_distance(expected_slice.flatten(), out_slice)
...
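The `Expectations` container introduced above keys device-specific golden slices by a (device type, major version) tuple, and `get_expectation` picks the entry for the running device; per the commit message, the XPU entry was deliberately dropped pending fresh slices. A toy sketch of the lookup, under the assumption that the real class in `diffusers.utils.testing_utils` detects the device itself rather than taking it as arguments:

```python
import numpy as np

class Expectations(dict):
    """Toy stand-in for diffusers.utils.testing_utils.Expectations."""

    def get_expectation(self, device_type: str = "cuda", major: int = 7):
        # Assumed key shape: (device_type, major_version).
        return self[(device_type, major)]

expected_slices = Expectations({("cuda", 7): np.array([0.1013, 0.1924, 0.0078])})
expected_slice = expected_slices.get_expectation()  # -> the CUDA golden slice
```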
@@ -93,12 +93,12 @@ class StableDiffusionLoRATests(PeftLoraLoaderMixinTests, unittest.TestCase):
     def setUp(self):
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def tearDown(self):
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     # Keeping this test here makes sense because it doesn't look any integration
     # (value assertions on logits).
...
@@ -34,7 +34,7 @@ from diffusers.utils.testing_utils import (
     is_flaky,
     nightly,
     numpy_cosine_similarity_distance,
-    require_big_gpu_with_torch_cuda,
+    require_big_accelerator,
     require_peft_backend,
     require_torch_accelerator,
     torch_device,
@@ -138,8 +138,8 @@ class SD3LoRATests(unittest.TestCase, PeftLoraLoaderMixinTests):
 @nightly
 @require_torch_accelerator
 @require_peft_backend
-@require_big_gpu_with_torch_cuda
+@require_big_accelerator
-@pytest.mark.big_gpu_with_torch_cuda
+@pytest.mark.big_accelerator
 class SD3LoraIntegrationTests(unittest.TestCase):
     pipeline_class = StableDiffusion3Img2ImgPipeline
     repo_id = "stabilityai/stable-diffusion-3-medium-diffusers"
...
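The decorator swaps in this file (`require_big_gpu_with_torch_cuda` to `require_big_accelerator`, plus the matching pytest marker) generalize the "needs a large device" gate beyond CUDA. A hypothetical sketch of such a gate; the real decorator in `diffusers.utils.testing_utils` presumably also enforces a memory threshold that is not shown here:

```python
import unittest

import torch

def require_big_accelerator(test_case):
    # Hypothetical gate: run only if a supported large accelerator is present.
    # The real decorator likely also checks total device memory.
    available = torch.cuda.is_available() or (
        hasattr(torch, "xpu") and torch.xpu.is_available()
    )
    return unittest.skipUnless(available, "test requires a big accelerator")(test_case)
```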
@@ -37,12 +37,13 @@ from diffusers.utils import logging
 from diffusers.utils.import_utils import is_accelerate_available
 from diffusers.utils.testing_utils import (
     CaptureLogger,
+    backend_empty_cache,
     is_flaky,
     load_image,
     nightly,
     numpy_cosine_similarity_distance,
     require_peft_backend,
-    require_torch_gpu,
+    require_torch_accelerator,
     slow,
     torch_device,
 )
@@ -105,12 +106,12 @@ class StableDiffusionXLLoRATests(PeftLoraLoaderMixinTests, unittest.TestCase):
     def setUp(self):
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def tearDown(self):
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     @is_flaky
     def test_multiple_wrong_adapter_name_raises_error(self):
@@ -119,18 +120,18 @@ class StableDiffusionXLLoRATests(PeftLoraLoaderMixinTests, unittest.TestCase):
 @slow
 @nightly
-@require_torch_gpu
+@require_torch_accelerator
 @require_peft_backend
 class LoraSDXLIntegrationTests(unittest.TestCase):
     def setUp(self):
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def tearDown(self):
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def test_sdxl_1_0_lora(self):
         generator = torch.Generator("cpu").manual_seed(0)
...
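Every integration class touched by this commit ends up with the same device-agnostic fixture pair. A condensed sketch of the resulting pattern, with an illustrative class name:

```python
import gc
import unittest

from diffusers.utils.testing_utils import backend_empty_cache, torch_device

class LoraIntegrationFixtureExample(unittest.TestCase):
    # Illustrative class: this setUp/tearDown pair now appears in each LoRA
    # integration test class, replacing the CUDA-only torch.cuda.empty_cache().
    def setUp(self):
        super().setUp()
        gc.collect()
        backend_empty_cache(torch_device)

    def tearDown(self):
        super().tearDown()
        gc.collect()
        backend_empty_cache(torch_device)
```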