Unverified Commit 2d380895 authored by Yao Matrix, committed by GitHub

enable 7 cases on XPU (#11503)

* enable 7 cases on XPU

Signed-off-by: Yao Matrix <matrix.yao@intel.com>

* calibrate A100 expectations

Signed-off-by: YAO Matrix <matrix.yao@intel.com>

---------

Signed-off-by: Yao Matrix <matrix.yao@intel.com>
Signed-off-by: YAO Matrix <matrix.yao@intel.com>
parent 0c47c954
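The change applied across all seven test files below swaps the CUDA-only `torch.cuda.empty_cache()` for `backend_empty_cache(torch_device)` from `diffusers.utils.testing_utils`, and `require_torch_gpu` for `require_torch_accelerator`, so the same integration tests can run on XPU. A minimal sketch of the dispatch pattern a helper like `backend_empty_cache` follows, assuming a plain string match on the device name (the actual implementation in diffusers may differ):

import torch

def backend_empty_cache_sketch(device: str) -> None:
    # Release cached allocator memory on whichever backend the tests run on.
    if device == "cuda":
        torch.cuda.empty_cache()
    elif device == "xpu":
        torch.xpu.empty_cache()  # Intel GPU backend
    elif device == "mps":
        torch.mps.empty_cache()  # Apple Silicon backend
    # On CPU there is no device-side cache to clear, so this is a no-op.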
@@ -24,9 +24,10 @@ from transformers import AutoTokenizer, T5EncoderModel
 from diffusers import AutoencoderKLCogVideoX, ConsisIDPipeline, ConsisIDTransformer3DModel, DDIMScheduler
 from diffusers.utils import load_image
 from diffusers.utils.testing_utils import (
+    backend_empty_cache,
     enable_full_determinism,
     numpy_cosine_similarity_distance,
-    require_torch_gpu,
+    require_torch_accelerator,
     slow,
     torch_device,
 )
@@ -316,19 +317,19 @@ class ConsisIDPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
 @slow
-@require_torch_gpu
+@require_torch_accelerator
 class ConsisIDPipelineIntegrationTests(unittest.TestCase):
     prompt = "A painting of a squirrel eating a burger."

     def setUp(self):
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def tearDown(self):
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def test_consisid(self):
         generator = torch.Generator("cpu").manual_seed(0)
@@ -338,8 +339,8 @@ class ConsisIDPipelineIntegrationTests(unittest.TestCase):
         prompt = self.prompt
         image = load_image("https://github.com/PKU-YuanGroup/ConsisID/blob/main/asserts/example_images/2.png?raw=true")

-        id_vit_hidden = [torch.ones([1, 2, 2])] * 1
-        id_cond = torch.ones(1, 2)
+        id_vit_hidden = [torch.ones([1, 577, 1024])] * 5
+        id_cond = torch.ones(1, 1280)

         videos = pipe(
             image=image,
@@ -357,5 +358,5 @@ class ConsisIDPipelineIntegrationTests(unittest.TestCase):
         video = videos[0]
         expected_video = torch.randn(1, 16, 480, 720, 3).numpy()

-        max_diff = numpy_cosine_similarity_distance(video, expected_video)
+        max_diff = numpy_cosine_similarity_distance(video.cpu(), expected_video)
         assert max_diff < 1e-3, f"Max diff is too high. got {video}"
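The new `.cpu()` call matters because a tensor living on an XPU or CUDA device cannot be handed to NumPy directly; it has to be copied to host memory first. For reference, `numpy_cosine_similarity_distance` computes one minus the cosine similarity of its two inputs; a minimal sketch, assuming flattened float arrays:

import numpy as np

def numpy_cosine_similarity_distance_sketch(a, b) -> float:
    # 0.0 means the two arrays point in exactly the same direction.
    a, b = np.asarray(a).ravel(), np.asarray(b).ravel()
    similarity = np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
    return float(1.0 - similarity)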
@@ -27,9 +27,10 @@ from diffusers import (
     FlowMatchEulerDiscreteScheduler,
 )
 from diffusers.utils.testing_utils import (
+    backend_empty_cache,
     enable_full_determinism,
     numpy_cosine_similarity_distance,
-    require_torch_gpu,
+    require_torch_accelerator,
     slow,
     torch_device,
 )
@@ -256,19 +257,19 @@ class EasyAnimatePipelineFastTests(PipelineTesterMixin, unittest.TestCase):
 @slow
-@require_torch_gpu
+@require_torch_accelerator
 class EasyAnimatePipelineIntegrationTests(unittest.TestCase):
     prompt = "A painting of a squirrel eating a burger."

     def setUp(self):
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def tearDown(self):
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def test_EasyAnimate(self):
         generator = torch.Generator("cpu").manual_seed(0)
@@ -27,8 +27,8 @@ from diffusers.utils.testing_utils import (
     enable_full_determinism,
     nightly,
     numpy_cosine_similarity_distance,
-    require_big_gpu_with_torch_cuda,
-    require_torch_gpu,
+    require_big_accelerator,
+    require_torch_accelerator,
     torch_device,
 )
@@ -266,9 +266,9 @@ class MochiPipelineFastTests(PipelineTesterMixin, FasterCacheTesterMixin, unittest.TestCase):
 @nightly
-@require_torch_gpu
-@require_big_gpu_with_torch_cuda
-@pytest.mark.big_gpu_with_torch_cuda
+@require_torch_accelerator
+@require_big_accelerator
+@pytest.mark.big_accelerator
 class MochiPipelineIntegrationTests(unittest.TestCase):
     prompt = "A painting of a squirrel eating a burger."
@@ -302,5 +302,5 @@ class MochiPipelineIntegrationTests(unittest.TestCase):
         video = videos[0]
         expected_video = torch.randn(1, 19, 480, 848, 3).numpy()

-        max_diff = numpy_cosine_similarity_distance(video, expected_video)
+        max_diff = numpy_cosine_similarity_distance(video.cpu(), expected_video)
         assert max_diff < 1e-3, f"Max diff is too high. got {video}"
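Renaming the pytest marker from `big_gpu_with_torch_cuda` to `big_accelerator` also assumes the new marker is registered, so runs with `--strict-markers` keep passing. A sketch of that registration via a conftest.py hook; where the repository actually declares its markers may differ:

# conftest.py (sketch): register the custom marker so pytest does not warn
# about, or reject, unknown markers.
def pytest_configure(config):
    config.addinivalue_line(
        "markers", "big_accelerator: test needs a large-memory accelerator"
    )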
@@ -7,8 +7,10 @@ from transformers import AutoTokenizer
 from diffusers import AutoencoderKL, FlowMatchEulerDiscreteScheduler, OmniGenPipeline, OmniGenTransformer2DModel
 from diffusers.utils.testing_utils import (
+    Expectations,
+    backend_empty_cache,
     numpy_cosine_similarity_distance,
-    require_torch_gpu,
+    require_torch_accelerator,
     slow,
     torch_device,
 )
@@ -87,7 +89,7 @@ class OmniGenPipelineFastTests(unittest.TestCase, PipelineTesterMixin):
 @slow
-@require_torch_gpu
+@require_torch_accelerator
 class OmniGenPipelineSlowTests(unittest.TestCase):
     pipeline_class = OmniGenPipeline
     repo_id = "shitao/OmniGen-v1-diffusers"
@@ -95,12 +97,12 @@ class OmniGenPipelineSlowTests(unittest.TestCase):
     def setUp(self):
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def tearDown(self):
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def get_inputs(self, device, seed=0):
         if str(device).startswith("mps"):
@@ -125,21 +127,56 @@ class OmniGenPipelineSlowTests(unittest.TestCase):
         image = pipe(**inputs).images[0]

         image_slice = image[0, :10, :10]

-        expected_slice = np.array(
-            [
-                [0.1783447, 0.16772744, 0.14339337],
-                [0.17066911, 0.15521264, 0.13757327],
-                [0.17072496, 0.15531206, 0.13524258],
-                [0.16746324, 0.1564025, 0.13794944],
-                [0.16490817, 0.15258026, 0.13697758],
-                [0.16971767, 0.15826806, 0.13928896],
-                [0.16782972, 0.15547255, 0.13783783],
-                [0.16464645, 0.15281534, 0.13522372],
-                [0.16535294, 0.15301755, 0.13526791],
-                [0.16365296, 0.15092957, 0.13443318],
-            ],
-            dtype=np.float32,
-        )
+        expected_slices = Expectations(
+            {
+                ("xpu", 3): np.array(
+                    [
+                        [0.05859375, 0.05859375, 0.04492188],
+                        [0.04882812, 0.04101562, 0.03320312],
+                        [0.04882812, 0.04296875, 0.03125],
+                        [0.04296875, 0.0390625, 0.03320312],
+                        [0.04296875, 0.03710938, 0.03125],
+                        [0.04492188, 0.0390625, 0.03320312],
+                        [0.04296875, 0.03710938, 0.03125],
+                        [0.04101562, 0.03710938, 0.02734375],
+                        [0.04101562, 0.03515625, 0.02734375],
+                        [0.04101562, 0.03515625, 0.02929688],
+                    ],
+                    dtype=np.float32,
+                ),
+                ("cuda", 7): np.array(
+                    [
+                        [0.1783447, 0.16772744, 0.14339337],
+                        [0.17066911, 0.15521264, 0.13757327],
+                        [0.17072496, 0.15531206, 0.13524258],
+                        [0.16746324, 0.1564025, 0.13794944],
+                        [0.16490817, 0.15258026, 0.13697758],
+                        [0.16971767, 0.15826806, 0.13928896],
+                        [0.16782972, 0.15547255, 0.13783783],
+                        [0.16464645, 0.15281534, 0.13522372],
+                        [0.16535294, 0.15301755, 0.13526791],
+                        [0.16365296, 0.15092957, 0.13443318],
+                    ],
+                    dtype=np.float32,
+                ),
+                ("cuda", 8): np.array(
+                    [
+                        [0.0546875, 0.05664062, 0.04296875],
+                        [0.046875, 0.04101562, 0.03320312],
+                        [0.05078125, 0.04296875, 0.03125],
+                        [0.04296875, 0.04101562, 0.03320312],
+                        [0.0390625, 0.03710938, 0.02929688],
+                        [0.04296875, 0.03710938, 0.03125],
+                        [0.0390625, 0.03710938, 0.02929688],
+                        [0.0390625, 0.03710938, 0.02734375],
+                        [0.0390625, 0.03320312, 0.02734375],
+                        [0.0390625, 0.03320312, 0.02734375],
+                    ],
+                    dtype=np.float32,
+                ),
+            }
+        )
+        expected_slice = expected_slices.get_expectation()

         max_diff = numpy_cosine_similarity_distance(expected_slice.flatten(), image_slice.flatten())
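`Expectations` maps `(device_type, major_version)` keys, e.g. `("cuda", 8)`, to per-backend golden outputs, and `get_expectation()` returns the entry matching the machine the test runs on. A hypothetical stand-in showing the lookup idea, assuming exact-match-then-same-device fallback (the real class is more involved and detects the running device itself):

import numpy as np

class ExpectationsSketch:
    """Illustrative stand-in for Expectations; not the real API."""

    def __init__(self, expectations: dict):
        self.expectations = expectations

    def get_expectation(self, device_type: str, major_version: int):
        # Prefer an exact (device, version) match, then any entry for the device.
        key = (device_type, major_version)
        if key in self.expectations:
            return self.expectations[key]
        for (dev, _), value in self.expectations.items():
            if dev == device_type:
                return value
        raise KeyError(f"no expectation recorded for {key}")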
@@ -25,11 +25,12 @@ from transformers import CLIPImageProcessor, CLIPVisionConfig
 from diffusers import AutoencoderKL, PaintByExamplePipeline, PNDMScheduler, UNet2DConditionModel
 from diffusers.pipelines.paint_by_example import PaintByExampleImageEncoder
 from diffusers.utils.testing_utils import (
+    backend_empty_cache,
     enable_full_determinism,
     floats_tensor,
     load_image,
     nightly,
-    require_torch_gpu,
+    require_torch_accelerator,
     torch_device,
 )
@@ -174,19 +175,19 @@ class PaintByExamplePipelineFastTests(PipelineTesterMixin, unittest.TestCase):
 @nightly
-@require_torch_gpu
+@require_torch_accelerator
 class PaintByExamplePipelineIntegrationTests(unittest.TestCase):
     def setUp(self):
         # clean up the VRAM before each test
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def tearDown(self):
         # clean up the VRAM after each test
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def test_paint_by_example(self):
         # make sure here that pndm scheduler skips prk
@@ -32,7 +32,14 @@ from diffusers import (
     StableAudioProjectionModel,
 )
 from diffusers.utils import is_xformers_available
-from diffusers.utils.testing_utils import enable_full_determinism, nightly, require_torch_gpu, torch_device
+from diffusers.utils.testing_utils import (
+    Expectations,
+    backend_empty_cache,
+    enable_full_determinism,
+    nightly,
+    require_torch_accelerator,
+    torch_device,
+)

 from ..pipeline_params import TEXT_TO_AUDIO_BATCH_PARAMS
 from ..test_pipelines_common import PipelineTesterMixin
@@ -419,17 +426,17 @@ class StableAudioPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
 @nightly
-@require_torch_gpu
+@require_torch_accelerator
 class StableAudioPipelineIntegrationTests(unittest.TestCase):
     def setUp(self):
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def tearDown(self):
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0):
         generator = torch.Generator(device=generator_device).manual_seed(seed)
@@ -459,9 +466,15 @@ class StableAudioPipelineIntegrationTests(unittest.TestCase):
         # check the portion of the generated audio with the largest dynamic range (reduces flakiness)
         audio_slice = audio[0, 447590:447600]
         # fmt: off
-        expected_slice = np.array(
-            [-0.0278, 0.1096, 0.1877, 0.3178, 0.5329, 0.6990, 0.6972, 0.6186, 0.5608, 0.5060]
-        )
+        expected_slices = Expectations(
+            {
+                ("xpu", 3): np.array([-0.0285, 0.1083, 0.1863, 0.3165, 0.5312, 0.6971, 0.6958, 0.6177, 0.5598, 0.5048]),
+                ("cuda", 7): np.array([-0.0278, 0.1096, 0.1877, 0.3178, 0.5329, 0.6990, 0.6972, 0.6186, 0.5608, 0.5060]),
+                ("cuda", 8): np.array([-0.0285, 0.1082, 0.1862, 0.3163, 0.5306, 0.6964, 0.6953, 0.6172, 0.5593, 0.5044]),
+            }
+        )
         # fmt: on
+        expected_slice = expected_slices.get_expectation()

         max_diff = np.abs(expected_slice - audio_slice.detach().cpu().numpy()).max()
         assert max_diff < 1.5e-3
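The same keying handles these 1-D audio slices: floating-point kernels differ slightly across backends and across CUDA compute-capability majors (7 versus 8 above), so each combination gets its own golden values. A usage sketch against the illustrative ExpectationsSketch class above, reusing two of the StableAudio values:

import numpy as np

# Hypothetical usage of the sketch class; not the real Expectations call signature.
slices = ExpectationsSketch(
    {
        ("xpu", 3): np.array([-0.0285, 0.1083]),
        ("cuda", 8): np.array([-0.0285, 0.1082]),
    }
)
print(slices.get_expectation("cuda", 8))  # -> [-0.0285  0.1082]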
@@ -389,7 +389,7 @@ class BnB4BitBasicTests(Base4bitTests):
 class BnB4BitTrainingTests(Base4bitTests):
     def setUp(self):
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

         nf4_config = BitsAndBytesConfig(
             load_in_4bit=True,
@@ -657,7 +657,7 @@ class SlowBnb4BitTests(Base4bitTests):
 class SlowBnb4BitFluxTests(Base4bitTests):
     def setUp(self) -> None:
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

         model_id = "hf-internal-testing/flux.1-dev-nf4-pkg"
         t5_4bit = T5EncoderModel.from_pretrained(model_id, subfolder="text_encoder_2")
@@ -674,7 +674,7 @@ class SlowBnb4BitFluxTests(Base4bitTests):
         del self.pipeline_4bit

         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def test_quality(self):
         # keep the resolution and max tokens to a lower number for faster execution.
@@ -722,7 +722,7 @@ class SlowBnb4BitFluxControlWithLoraTests(Base4bitTests):
 class SlowBnb4BitFluxControlWithLoraTests(Base4bitTests):
     def setUp(self) -> None:
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

         self.pipeline_4bit = FluxControlPipeline.from_pretrained("eramth/flux-4bit", torch_dtype=torch.float16)
         self.pipeline_4bit.enable_model_cpu_offload()
@@ -731,7 +731,7 @@ class SlowBnb4BitFluxControlWithLoraTests(Base4bitTests):
         del self.pipeline_4bit

         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def test_lora_loading(self):
         self.pipeline_4bit.load_lora_weights("black-forest-labs/FLUX.1-Canny-dev-lora")