Unverified commit a6f043a8 authored by Sayak Paul, committed by GitHub

[LoRA] allow big CUDA tests to run properly for LoRA (and others) (#9845)



* allow big lora tests to run on the CI.

* print

* print.

* print

* print

* print

* print

* more

* print

* remove print.

* remove print

* directly place on cuda.

* remove pipeline.

* remove

* fix

* fix

* spaces

* quality

* updates

* directly place flux controlnet pipeline on cuda.

* torch_device instead of cuda.

* style

* device placement.

* fixes

* add big gpu marker for mochi; rename test correctly

* address feedback

* fix

---------
Co-authored-by: Aryan <aryan@huggingface.co>
parent 12fbe3f7
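The diff below touches five test files, all following one pattern: integration suites that were previously skipped outright (or gated on @slow) are now gated at runtime on a large CUDA GPU, and pipelines are placed directly on the device instead of using CPU offload. A minimal sketch of that pattern, assuming diffusers' testing utilities; the class name and checkpoint id here are illustrative, not from the diff:

    import unittest

    import pytest
    import torch

    from diffusers import FluxPipeline
    from diffusers.utils.testing_utils import (
        nightly,
        require_big_gpu_with_torch_cuda,
        require_torch_gpu,
        torch_device,
    )


    @nightly  # collected only in the nightly CI run
    @require_torch_gpu  # skip unless a CUDA device is available
    @require_big_gpu_with_torch_cuda  # skip unless the GPU has enough VRAM
    @pytest.mark.big_gpu_with_torch_cuda  # lets CI select these with -m big_gpu_with_torch_cuda
    class ExampleBigGPUIntegrationTests(unittest.TestCase):  # hypothetical class, for illustration
        def test_inference(self):
            pipe = FluxPipeline.from_pretrained(
                "black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16  # illustrative checkpoint
            )
            # Direct placement instead of enable_model_cpu_offload(); see the comment
            # in the first hunk below for why.
            pipe = pipe.to(torch_device)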
@@ -796,8 +796,8 @@ class FluxControlLoRATests(unittest.TestCase, PeftLoraLoaderMixinTests):
 @nightly
 @require_torch_gpu
 @require_peft_backend
-@unittest.skip("We cannot run inference on this model with the current CI hardware")
-# TODO (DN6, sayakpaul): move these tests to a beefier GPU
+@require_big_gpu_with_torch_cuda
+@pytest.mark.big_gpu_with_torch_cuda
 class FluxLoRAIntegrationTests(unittest.TestCase):
     """internal note: The integration slices were obtained on audace.
@@ -819,6 +819,7 @@ class FluxLoRAIntegrationTests(unittest.TestCase):
     def tearDown(self):
         super().tearDown()
+        del self.pipeline
         gc.collect()
         torch.cuda.empty_cache()
@@ -826,7 +827,10 @@ class FluxLoRAIntegrationTests(unittest.TestCase):
         self.pipeline.load_lora_weights("TheLastBen/Jon_Snow_Flux_LoRA", weight_name="jon_snow.safetensors")
         self.pipeline.fuse_lora()
         self.pipeline.unload_lora_weights()
-        self.pipeline.enable_model_cpu_offload()
+        # Instead of calling `enable_model_cpu_offload()`, we place the pipeline directly on CUDA,
+        # since the CI runner supports it. The runner has only about 34GB of system RAM, so the test
+        # gets killed when run with `enable_model_cpu_offload()`. We repeat this for the other tests, too.
+        self.pipeline = self.pipeline.to(torch_device)

         prompt = "jon snow eating pizza with ketchup"
@@ -848,7 +852,7 @@ class FluxLoRAIntegrationTests(unittest.TestCase):
         self.pipeline.load_lora_weights("Norod78/brain-slug-flux")
         self.pipeline.fuse_lora()
         self.pipeline.unload_lora_weights()
-        self.pipeline.enable_model_cpu_offload()
+        self.pipeline = self.pipeline.to(torch_device)

         prompt = "The cat with a brain slug earring"
         out = self.pipeline(
@@ -870,7 +874,7 @@ class FluxLoRAIntegrationTests(unittest.TestCase):
         self.pipeline.load_lora_weights("cocktailpeanut/optimus", weight_name="optimus.safetensors")
         self.pipeline.fuse_lora()
         self.pipeline.unload_lora_weights()
-        self.pipeline.enable_model_cpu_offload()
+        self.pipeline = self.pipeline.to(torch_device)

         prompt = "optimus is cleaning the house with broomstick"
         out = self.pipeline(
@@ -892,7 +896,7 @@ class FluxLoRAIntegrationTests(unittest.TestCase):
         self.pipeline.load_lora_weights("XLabs-AI/flux-lora-collection", weight_name="disney_lora.safetensors")
         self.pipeline.fuse_lora()
         self.pipeline.unload_lora_weights()
-        self.pipeline.enable_model_cpu_offload()
+        self.pipeline = self.pipeline.to(torch_device)

         prompt = "A blue jay standing on a large basket of rainbow macarons, disney style"
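The tradeoff behind that comment: enable_model_cpu_offload() keeps all weights in host RAM and shuttles submodules to the GPU one at a time, so it is cheap on VRAM but heavy on system memory, while .to(torch_device) holds everything in VRAM and frees host RAM after loading. A sketch contrasting the two strategies; the checkpoint id is illustrative, and offload requires the accelerate package:

    import torch

    from diffusers import FluxPipeline

    pipe = FluxPipeline.from_pretrained(
        "black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16  # illustrative checkpoint
    )

    use_offload = False  # what this commit effectively flips for the CI tests
    if use_offload:
        # Weights stay in host RAM; each submodule is moved to the GPU just before use.
        # Low VRAM usage, but on a ~34GB-RAM runner the process gets OOM-killed.
        pipe.enable_model_cpu_offload()
    else:
        # The whole pipeline lives in VRAM; this is what @require_big_gpu_with_torch_cuda guards.
        pipe = pipe.to("cuda")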
@@ -17,6 +17,7 @@ import sys
 import unittest

 import numpy as np
+import pytest
 import torch
 from transformers import AutoTokenizer, CLIPTextModelWithProjection, CLIPTokenizer, T5EncoderModel
@@ -31,9 +32,9 @@ from diffusers.utils.import_utils import is_accelerate_available
 from diffusers.utils.testing_utils import (
     nightly,
     numpy_cosine_similarity_distance,
+    require_big_gpu_with_torch_cuda,
     require_peft_backend,
     require_torch_gpu,
-    slow,
     torch_device,
 )
@@ -128,11 +129,12 @@ class SD3LoRATests(unittest.TestCase, PeftLoraLoaderMixinTests):
     pass

-@slow
 @nightly
 @require_torch_gpu
 @require_peft_backend
-class LoraSD3IntegrationTests(unittest.TestCase):
+@require_big_gpu_with_torch_cuda
+@pytest.mark.big_gpu_with_torch_cuda
+class SD3LoraIntegrationTests(unittest.TestCase):
     pipeline_class = StableDiffusion3Img2ImgPipeline
     repo_id = "stabilityai/stable-diffusion-3-medium-diffusers"
@@ -166,14 +168,17 @@ class LoraSD3IntegrationTests(unittest.TestCase):
     def test_sd3_img2img_lora(self):
         pipe = self.pipeline_class.from_pretrained(self.repo_id, torch_dtype=torch.float16)
-        pipe.load_lora_weights("zwloong/sd3-lora-training-rank16-v2", weight_name="pytorch_lora_weights.safetensors")
-        pipe.enable_sequential_cpu_offload()
+        pipe.load_lora_weights("zwloong/sd3-lora-training-rank16-v2")
+        pipe.fuse_lora()
+        pipe.unload_lora_weights()
+        pipe = pipe.to(torch_device)

         inputs = self.get_inputs(torch_device)
         image = pipe(**inputs).images[0]
         image_slice = image[0, -3:, -3:]
         expected_slice = np.array([0.5396, 0.5776, 0.7432, 0.5151, 0.5586, 0.7383, 0.5537, 0.5933, 0.7153])

         max_diff = numpy_cosine_similarity_distance(expected_slice.flatten(), image_slice.flatten())
         assert max_diff < 1e-4, f"Outputs are not close enough, got {max_diff}"
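The rewritten SD3 test now mirrors the Flux tests' load, fuse, unload sequence: fuse_lora() folds the low-rank update into the base weights, after which unload_lora_weights() can drop the adapter modules and inference runs on plain fused weights. A sketch of the sequence under those assumptions, using the ids from the hunk above (generation arguments omitted):

    import torch

    from diffusers import StableDiffusion3Img2ImgPipeline

    pipe = StableDiffusion3Img2ImgPipeline.from_pretrained(
        "stabilityai/stable-diffusion-3-medium-diffusers", torch_dtype=torch.float16
    )
    pipe.load_lora_weights("zwloong/sd3-lora-training-rank16-v2")  # attach the adapter (needs peft)
    pipe.fuse_lora()            # fold the LoRA deltas into the base weight matrices
    pipe.unload_lora_weights()  # drop the adapter layers; the fused weights remain
    pipe = pipe.to("cuda")      # direct placement, consistent with the rest of this commit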
@@ -32,9 +32,9 @@ from diffusers.models import FluxControlNetModel
 from diffusers.utils import load_image
 from diffusers.utils.testing_utils import (
     enable_full_determinism,
+    nightly,
     numpy_cosine_similarity_distance,
     require_big_gpu_with_torch_cuda,
-    slow,
     torch_device,
 )
 from diffusers.utils.torch_utils import randn_tensor
@@ -204,7 +204,7 @@ class FluxControlNetPipelineFastTests(unittest.TestCase, PipelineTesterMixin):
         assert (output_height, output_width) == (expected_height, expected_width)

-@slow
+@nightly
 @require_big_gpu_with_torch_cuda
 @pytest.mark.big_gpu_with_torch_cuda
 class FluxControlNetPipelineSlowTests(unittest.TestCase):
@@ -230,8 +230,7 @@ class FluxControlNetPipelineSlowTests(unittest.TestCase):
             text_encoder_2=None,
             controlnet=controlnet,
             torch_dtype=torch.bfloat16,
-        )
-        pipe.enable_model_cpu_offload()
+        ).to(torch_device)
         pipe.set_progress_bar_config(disable=None)

         generator = torch.Generator(device="cpu").manual_seed(0)
@@ -241,12 +240,12 @@ class FluxControlNetPipelineSlowTests(unittest.TestCase):
         prompt_embeds = torch.load(
             hf_hub_download(repo_id="diffusers/test-slices", repo_type="dataset", filename="flux/prompt_embeds.pt")
-        )
+        ).to(torch_device)
         pooled_prompt_embeds = torch.load(
             hf_hub_download(
                 repo_id="diffusers/test-slices", repo_type="dataset", filename="flux/pooled_prompt_embeds.pt"
             )
-        )
+        ).to(torch_device)

         output = pipe(
             prompt_embeds=prompt_embeds,
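These slow tests build the pipeline with text_encoder=None and text_encoder_2=None and feed precomputed embeddings, so the heavy T5/CLIP encoders never load; after this change the tensors are moved to the GPU explicitly because the pipeline is no longer offloading for you. A sketch of that loading step; map_location is an assumption added for safety when the tensors are loaded on a CPU-only process:

    import torch
    from huggingface_hub import hf_hub_download

    prompt_embeds = torch.load(
        hf_hub_download(repo_id="diffusers/test-slices", repo_type="dataset", filename="flux/prompt_embeds.pt"),
        map_location="cpu",  # assumption: load to CPU first, then move explicitly
    ).to("cuda")
    pooled_prompt_embeds = torch.load(
        hf_hub_download(
            repo_id="diffusers/test-slices", repo_type="dataset", filename="flux/pooled_prompt_embeds.pt"
        ),
        map_location="cpu",
    ).to("cuda")
    # The pipeline call then takes prompt_embeds=/pooled_prompt_embeds= instead of a prompt string.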
@@ -9,6 +9,7 @@ from transformers import AutoTokenizer, CLIPTextConfig, CLIPTextModel, CLIPToken
 from diffusers import AutoencoderKL, FlowMatchEulerDiscreteScheduler, FluxPipeline, FluxTransformer2DModel
 from diffusers.utils.testing_utils import (
+    nightly,
     numpy_cosine_similarity_distance,
     require_big_gpu_with_torch_cuda,
     slow,
@@ -209,7 +210,7 @@ class FluxPipelineFastTests(unittest.TestCase, PipelineTesterMixin, FluxIPAdapte
         assert (output_height, output_width) == (expected_height, expected_width)

-@slow
+@nightly
 @require_big_gpu_with_torch_cuda
 @pytest.mark.big_gpu_with_torch_cuda
 class FluxPipelineSlowTests(unittest.TestCase):
@@ -227,19 +228,16 @@ class FluxPipelineSlowTests(unittest.TestCase):
         torch.cuda.empty_cache()

     def get_inputs(self, device, seed=0):
-        if str(device).startswith("mps"):
-            generator = torch.manual_seed(seed)
-        else:
-            generator = torch.Generator(device="cpu").manual_seed(seed)
+        generator = torch.Generator(device="cpu").manual_seed(seed)

         prompt_embeds = torch.load(
             hf_hub_download(repo_id="diffusers/test-slices", repo_type="dataset", filename="flux/prompt_embeds.pt")
-        )
+        ).to(torch_device)
         pooled_prompt_embeds = torch.load(
             hf_hub_download(
                 repo_id="diffusers/test-slices", repo_type="dataset", filename="flux/pooled_prompt_embeds.pt"
             )
-        )
+        ).to(torch_device)

         return {
             "prompt_embeds": prompt_embeds,
             "pooled_prompt_embeds": pooled_prompt_embeds,
@@ -253,8 +251,7 @@ class FluxPipelineSlowTests(unittest.TestCase):
     def test_flux_inference(self):
         pipe = self.pipeline_class.from_pretrained(
             self.repo_id, torch_dtype=torch.bfloat16, text_encoder=None, text_encoder_2=None
-        )
-        pipe.enable_model_cpu_offload()
+        ).to(torch_device)

         inputs = self.get_inputs(torch_device)
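get_inputs also drops its mps special case: a torch.Generator seeded on the CPU produces the same random stream regardless of where the model runs (for a given torch version), so expected slices recorded on one machine stay valid on another. A small sketch of why that matters:

    import torch

    # Same seed, same generator device -> identical noise on every backend.
    generator = torch.Generator(device="cpu").manual_seed(0)
    latents = torch.randn(1, 16, 64, 64, generator=generator)  # shape is illustrative
    # Passing `generator=` into a pipeline call draws the initial latents from this
    # reproducible stream, which is what the expected-slice assertions rely on.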
@@ -17,15 +17,17 @@ import inspect
 import unittest

 import numpy as np
+import pytest
 import torch
 from transformers import AutoTokenizer, T5EncoderModel

 from diffusers import AutoencoderKLMochi, FlowMatchEulerDiscreteScheduler, MochiPipeline, MochiTransformer3DModel
 from diffusers.utils.testing_utils import (
     enable_full_determinism,
+    nightly,
     numpy_cosine_similarity_distance,
+    require_big_gpu_with_torch_cuda,
     require_torch_gpu,
-    slow,
     torch_device,
 )
@@ -260,8 +262,10 @@ class MochiPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
     )

-@slow
+@nightly
 @require_torch_gpu
+@require_big_gpu_with_torch_cuda
+@pytest.mark.big_gpu_with_torch_cuda
 class MochiPipelineIntegrationTests(unittest.TestCase):
     prompt = "A painting of a squirrel eating a burger."
@@ -293,7 +297,7 @@ class MochiPipelineIntegrationTests(unittest.TestCase):
         ).frames

         video = videos[0]
-        expected_video = torch.randn(1, 16, 480, 848, 3).numpy()
+        expected_video = torch.randn(1, 19, 480, 848, 3).numpy()

         max_diff = numpy_cosine_similarity_distance(video, expected_video)
         assert max_diff < 1e-3, f"Max diff is too high. got {video}"
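All of these assertions go through numpy_cosine_similarity_distance rather than element-wise closeness; it measures one minus the cosine similarity of the flattened arrays, so it tolerates small uniform shifts while catching structural differences. A hedged reimplementation to show the idea; the real helper lives in diffusers.utils.testing_utils and may differ in details:

    import numpy as np

    def cosine_similarity_distance(a: np.ndarray, b: np.ndarray) -> float:
        # Assumed behavior of the testing helper: 1 - cosine similarity of the flattened arrays.
        a = a.flatten().astype(np.float64)
        b = b.flatten().astype(np.float64)
        return float(1.0 - np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

    expected = np.array([0.5396, 0.5776, 0.7432, 0.5151, 0.5586, 0.7383, 0.5537, 0.5933, 0.7153])
    observed = expected + 1e-4  # stand-in for a freshly generated slice
    assert cosine_similarity_distance(expected, observed) < 1e-4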