# coding=utf-8
# Copyright 2024 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import gc
import tempfile
import time
import traceback
import unittest

import numpy as np
import torch
from huggingface_hub import hf_hub_download
from transformers import (
    CLIPTextConfig,
    CLIPTextModel,
    CLIPTokenizer,
)

from diffusers import (
    AutoencoderKL,
    DDIMScheduler,
    DPMSolverMultistepScheduler,
    EulerAncestralDiscreteScheduler,
    EulerDiscreteScheduler,
    LCMScheduler,
    LMSDiscreteScheduler,
    PNDMScheduler,
    StableDiffusionPipeline,
    UNet2DConditionModel,
    logging,
)
from diffusers.utils.testing_utils import (
    CaptureLogger,
    enable_full_determinism,
    is_torch_compile,
    load_image,
    load_numpy,
    nightly,
    numpy_cosine_similarity_distance,
    require_accelerate_version_greater,
    require_torch_2,
    require_torch_gpu,
    require_torch_multi_gpu,
    run_test_in_subprocess,
    skip_mps,
    slow,
    torch_device,
)

from ..pipeline_params import (
    TEXT_TO_IMAGE_BATCH_PARAMS,
    TEXT_TO_IMAGE_CALLBACK_CFG_PARAMS,
    TEXT_TO_IMAGE_IMAGE_PARAMS,
    TEXT_TO_IMAGE_PARAMS,
)
from ..test_pipelines_common import (
    IPAdapterTesterMixin,
    PipelineKarrasSchedulerTesterMixin,
    PipelineLatentTesterMixin,
    PipelineTesterMixin,
)


enable_full_determinism()


# Will be run via run_test_in_subprocess
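# (torch.compile/dynamo state is process-global, so compiling in a throwaway subprocess
# keeps the compilation cache from leaking into other tests)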
def _test_stable_diffusion_compile(in_queue, out_queue, timeout):
    error = None
    try:
        inputs = in_queue.get(timeout=timeout)
        torch_device = inputs.pop("torch_device")
        seed = inputs.pop("seed")
        inputs["generator"] = torch.Generator(device=torch_device).manual_seed(seed)

        sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", safety_checker=None)
        sd_pipe.scheduler = DDIMScheduler.from_config(sd_pipe.scheduler.config)
        sd_pipe = sd_pipe.to(torch_device)

        sd_pipe.unet.to(memory_format=torch.channels_last)
        sd_pipe.unet = torch.compile(sd_pipe.unet, mode="reduce-overhead", fullgraph=True)
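        # channels_last usually speeds up the UNet's convolutions, and "reduce-overhead" mode
        # additionally captures CUDA graphs to cut per-step kernel launch overhead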

        sd_pipe.set_progress_bar_config(disable=None)

        image = sd_pipe(**inputs).images
        image_slice = image[0, -3:, -3:, -1].flatten()

        assert image.shape == (1, 512, 512, 3)
        expected_slice = np.array([0.38019, 0.28647, 0.27321, 0.40377, 0.38290, 0.35446, 0.39218, 0.38165, 0.42239])

        assert np.abs(image_slice - expected_slice).max() < 5e-3
    except Exception:
        error = f"{traceback.format_exc()}"

    results = {"error": error}
    out_queue.put(results, timeout=timeout)
    out_queue.join()


class StableDiffusionPipelineFastTests(
    IPAdapterTesterMixin,
    PipelineLatentTesterMixin,
    PipelineKarrasSchedulerTesterMixin,
    PipelineTesterMixin,
    unittest.TestCase,
):
    pipeline_class = StableDiffusionPipeline
    params = TEXT_TO_IMAGE_PARAMS
    batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
    image_params = TEXT_TO_IMAGE_IMAGE_PARAMS
    image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS
    callback_cfg_params = TEXT_TO_IMAGE_CALLBACK_CFG_PARAMS
    test_layerwise_casting = True
    test_group_offloading = True

    def get_dummy_components(self, time_cond_proj_dim=None):
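        # deliberately tiny UNet/VAE/CLIP components so the fast tests can run on CPU in seconds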
        cross_attention_dim = 8

        torch.manual_seed(0)
        unet = UNet2DConditionModel(
            block_out_channels=(4, 8),
            layers_per_block=1,
            sample_size=32,
            time_cond_proj_dim=time_cond_proj_dim,
            in_channels=4,
            out_channels=4,
            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
            cross_attention_dim=cross_attention_dim,
            norm_num_groups=2,
        )
        scheduler = DDIMScheduler(
            beta_start=0.00085,
            beta_end=0.012,
            beta_schedule="scaled_linear",
            clip_sample=False,
            set_alpha_to_one=False,
        )
        torch.manual_seed(0)
        vae = AutoencoderKL(
            block_out_channels=[4, 8],
            in_channels=3,
            out_channels=3,
            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
            latent_channels=4,
            norm_num_groups=2,
        )
        torch.manual_seed(0)
        text_encoder_config = CLIPTextConfig(
            bos_token_id=0,
            eos_token_id=2,
            hidden_size=cross_attention_dim,
            intermediate_size=16,
            layer_norm_eps=1e-05,
            num_attention_heads=2,
            num_hidden_layers=2,
            pad_token_id=1,
            vocab_size=1000,
        )
        text_encoder = CLIPTextModel(text_encoder_config)
        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")

        components = {
            "unet": unet,
            "scheduler": scheduler,
            "vae": vae,
            "text_encoder": text_encoder,
            "tokenizer": tokenizer,
            "safety_checker": None,
            "feature_extractor": None,
            "image_encoder": None,
        }
        return components

    def get_dummy_inputs(self, device, seed=0):
        if str(device).startswith("mps"):
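            # torch.Generator is not supported as a device-local object on MPS, so fall back
            # to seeding the global RNG instead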
            generator = torch.manual_seed(seed)
        else:
            generator = torch.Generator(device=device).manual_seed(seed)
        inputs = {
            "prompt": "A painting of a squirrel eating a burger",
            "generator": generator,
            "num_inference_steps": 2,
            "guidance_scale": 6.0,
            "output_type": "np",
        }
        return inputs

    def test_stable_diffusion_ddim(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator

        components = self.get_dummy_components()
        sd_pipe = StableDiffusionPipeline(**components)
        sd_pipe = sd_pipe.to(torch_device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(device)
        output = sd_pipe(**inputs)
        image = output.images

        image_slice = image[0, -3:, -3:, -1]

        assert image.shape == (1, 64, 64, 3)
        expected_slice = np.array([0.1763, 0.4776, 0.4986, 0.2566, 0.3802, 0.4596, 0.5363, 0.3277, 0.3949])

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

    def test_stable_diffusion_lcm(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator

        components = self.get_dummy_components(time_cond_proj_dim=256)
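        # time_cond_proj_dim gives the dummy UNet the guidance-embedding input that LCM-distilled models use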
        sd_pipe = StableDiffusionPipeline(**components)
        sd_pipe.scheduler = LCMScheduler.from_config(sd_pipe.scheduler.config)
        sd_pipe = sd_pipe.to(torch_device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(device)
        output = sd_pipe(**inputs)
        image = output.images

        image_slice = image[0, -3:, -3:, -1]

        assert image.shape == (1, 64, 64, 3)
        expected_slice = np.array([0.2368, 0.4900, 0.5019, 0.2723, 0.4473, 0.4578, 0.4551, 0.3532, 0.4133])

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

    def test_stable_diffusion_lcm_custom_timesteps(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator

        components = self.get_dummy_components(time_cond_proj_dim=256)
        sd_pipe = StableDiffusionPipeline(**components)
        sd_pipe.scheduler = LCMScheduler.from_config(sd_pipe.scheduler.config)
        sd_pipe = sd_pipe.to(torch_device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(device)
        del inputs["num_inference_steps"]
        inputs["timesteps"] = [999, 499]
        output = sd_pipe(**inputs)
        image = output.images

        image_slice = image[0, -3:, -3:, -1]

        assert image.shape == (1, 64, 64, 3)
        expected_slice = np.array([0.2368, 0.4900, 0.5019, 0.2723, 0.4473, 0.4578, 0.4551, 0.3532, 0.4133])

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

    def test_stable_diffusion_ays(self):
        from diffusers.schedulers import AysSchedules

        timestep_schedule = AysSchedules["StableDiffusionTimesteps"]
        sigma_schedule = AysSchedules["StableDiffusionSigmas"]
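        # AYS ("Align Your Steps") provides pre-optimized timestep/sigma schedules for Stable Diffusion,
        # shipped with diffusers; they can be passed either as timesteps or as sigmas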

        device = "cpu"  # ensure determinism for the device-dependent torch.Generator

        components = self.get_dummy_components(time_cond_proj_dim=256)
        sd_pipe = StableDiffusionPipeline(**components)
        sd_pipe.scheduler = EulerDiscreteScheduler.from_config(sd_pipe.scheduler.config)
        sd_pipe = sd_pipe.to(torch_device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(device)
        inputs["num_inference_steps"] = 10
        output = sd_pipe(**inputs).images

        inputs = self.get_dummy_inputs(device)
        inputs["num_inference_steps"] = None
        inputs["timesteps"] = timestep_schedule
        output_ts = sd_pipe(**inputs).images

        inputs = self.get_dummy_inputs(device)
        inputs["num_inference_steps"] = None
        inputs["sigmas"] = sigma_schedule
        output_sigmas = sd_pipe(**inputs).images

        assert (
            np.abs(output_sigmas.flatten() - output_ts.flatten()).max() < 1e-3
        ), "ays timesteps and ays sigmas should have the same outputs"
        assert (
            np.abs(output.flatten() - output_ts.flatten()).max() > 1e-3
        ), "using ays timesteps should yield different outputs"
        assert (
            np.abs(output.flatten() - output_sigmas.flatten()).max() > 1e-3
        ), "using ays sigmas should yield different outputs"

    def test_stable_diffusion_prompt_embeds(self):
        components = self.get_dummy_components()
        sd_pipe = StableDiffusionPipeline(**components)
        sd_pipe = sd_pipe.to(torch_device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(torch_device)
        inputs["prompt"] = 3 * [inputs["prompt"]]

        # forward
        output = sd_pipe(**inputs)
        image_slice_1 = output.images[0, -3:, -3:, -1]

        inputs = self.get_dummy_inputs(torch_device)
        prompt = 3 * [inputs.pop("prompt")]

        text_inputs = sd_pipe.tokenizer(
            prompt,
            padding="max_length",
            max_length=sd_pipe.tokenizer.model_max_length,
            truncation=True,
            return_tensors="pt",
        )
        text_inputs = text_inputs["input_ids"].to(torch_device)

        prompt_embeds = sd_pipe.text_encoder(text_inputs)[0]

        inputs["prompt_embeds"] = prompt_embeds

        # forward
        output = sd_pipe(**inputs)
        image_slice_2 = output.images[0, -3:, -3:, -1]

        assert np.abs(image_slice_1.flatten() - image_slice_2.flatten()).max() < 1e-4

    def test_stable_diffusion_negative_prompt_embeds(self):
        components = self.get_dummy_components()
        sd_pipe = StableDiffusionPipeline(**components)
        sd_pipe = sd_pipe.to(torch_device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(torch_device)
        negative_prompt = 3 * ["this is a negative prompt"]
        inputs["negative_prompt"] = negative_prompt
        inputs["prompt"] = 3 * [inputs["prompt"]]

        # forward
        output = sd_pipe(**inputs)
        image_slice_1 = output.images[0, -3:, -3:, -1]

        inputs = self.get_dummy_inputs(torch_device)
        prompt = 3 * [inputs.pop("prompt")]

        embeds = []
        for p in [prompt, negative_prompt]:
            text_inputs = sd_pipe.tokenizer(
                p,
                padding="max_length",
                max_length=sd_pipe.tokenizer.model_max_length,
                truncation=True,
                return_tensors="pt",
            )
            text_inputs = text_inputs["input_ids"].to(torch_device)

            embeds.append(sd_pipe.text_encoder(text_inputs)[0])

        inputs["prompt_embeds"], inputs["negative_prompt_embeds"] = embeds

        # forward
        output = sd_pipe(**inputs)
        image_slice_2 = output.images[0, -3:, -3:, -1]

        assert np.abs(image_slice_1.flatten() - image_slice_2.flatten()).max() < 1e-4

    def test_stable_diffusion_prompt_embeds_no_text_encoder_or_tokenizer(self):
        components = self.get_dummy_components()
        sd_pipe = StableDiffusionPipeline(**components)
        sd_pipe = sd_pipe.to(torch_device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(torch_device)
        inputs["negative_prompt"] = "this is a negative prompt"

        # forward
        output = sd_pipe(**inputs)
        image_slice_1 = output.images[0, -3:, -3:, -1]

        inputs = self.get_dummy_inputs(torch_device)
        prompt = inputs.pop("prompt")
        negative_prompt = "this is a negative prompt"

        prompt_embeds, negative_prompt_embeds = sd_pipe.encode_prompt(
            prompt,
            torch_device,
            1,
            True,
            negative_prompt=negative_prompt,
            prompt_embeds=None,
            negative_prompt_embeds=None,
        )

        inputs["prompt_embeds"] = prompt_embeds
        inputs["negative_prompt_embeds"] = negative_prompt_embeds

        sd_pipe.text_encoder = None
        sd_pipe.tokenizer = None

        # forward
        output = sd_pipe(**inputs)
        image_slice_2 = output.images[0, -3:, -3:, -1]

        assert np.abs(image_slice_1.flatten() - image_slice_2.flatten()).max() < 1e-4

    def test_stable_diffusion_prompt_embeds_with_plain_negative_prompt_list(self):
        components = self.get_dummy_components()
        sd_pipe = StableDiffusionPipeline(**components)
        sd_pipe = sd_pipe.to(torch_device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(torch_device)
        negative_prompt = 3 * ["this is a negative prompt"]
        inputs["negative_prompt"] = negative_prompt
        inputs["prompt"] = 3 * [inputs["prompt"]]

        # forward
        output = sd_pipe(**inputs)
        image_slice_1 = output.images[0, -3:, -3:, -1]

        inputs = self.get_dummy_inputs(torch_device)
        inputs["negative_prompt"] = negative_prompt
        prompt = 3 * [inputs.pop("prompt")]

        text_inputs = sd_pipe.tokenizer(
            prompt,
            padding="max_length",
            max_length=sd_pipe.tokenizer.model_max_length,
            truncation=True,
            return_tensors="pt",
        )
        text_inputs = text_inputs["input_ids"].to(torch_device)

        prompt_embeds = sd_pipe.text_encoder(text_inputs)[0]

        inputs["prompt_embeds"] = prompt_embeds

        # forward
        output = sd_pipe(**inputs)
        image_slice_2 = output.images[0, -3:, -3:, -1]

        assert np.abs(image_slice_1.flatten() - image_slice_2.flatten()).max() < 1e-4

    def test_stable_diffusion_ddim_factor_8(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator

        components = self.get_dummy_components()
        sd_pipe = StableDiffusionPipeline(**components)
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(device)
        output = sd_pipe(**inputs, height=136, width=136)
        image = output.images

        image_slice = image[0, -3:, -3:, -1]

        assert image.shape == (1, 136, 136, 3)
        expected_slice = np.array([0.4720, 0.5426, 0.5160, 0.3961, 0.4696, 0.4296, 0.5738, 0.5888, 0.5481])

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

    def test_stable_diffusion_pndm(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        components = self.get_dummy_components()
        sd_pipe = StableDiffusionPipeline(**components)
        sd_pipe.scheduler = PNDMScheduler(skip_prk_steps=True)
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(device)
        output = sd_pipe(**inputs)
        image = output.images
        image_slice = image[0, -3:, -3:, -1]

        assert image.shape == (1, 64, 64, 3)
        expected_slice = np.array([0.1941, 0.4748, 0.4880, 0.2222, 0.4221, 0.4545, 0.5604, 0.3488, 0.3902])

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

    def test_stable_diffusion_no_safety_checker(self):
        pipe = StableDiffusionPipeline.from_pretrained(
            "hf-internal-testing/tiny-stable-diffusion-lms-pipe", safety_checker=None
        )
        assert isinstance(pipe, StableDiffusionPipeline)
        assert isinstance(pipe.scheduler, LMSDiscreteScheduler)
        assert pipe.safety_checker is None

        image = pipe("example prompt", num_inference_steps=2).images[0]
        assert image is not None

        # check that there's no error when saving a pipeline with one of the models being None
        with tempfile.TemporaryDirectory() as tmpdirname:
            pipe.save_pretrained(tmpdirname)
            pipe = StableDiffusionPipeline.from_pretrained(tmpdirname)

        # sanity check that the pipeline still works
        assert pipe.safety_checker is None
        image = pipe("example prompt", num_inference_steps=2).images[0]
        assert image is not None

    def test_stable_diffusion_k_lms(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator

        components = self.get_dummy_components()
        sd_pipe = StableDiffusionPipeline(**components)
        sd_pipe.scheduler = LMSDiscreteScheduler.from_config(sd_pipe.scheduler.config)
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(device)
        output = sd_pipe(**inputs)
        image = output.images
        image_slice = image[0, -3:, -3:, -1]

        assert image.shape == (1, 64, 64, 3)
        expected_slice = np.array([0.2681, 0.4785, 0.4857, 0.2426, 0.4473, 0.4481, 0.5610, 0.3676, 0.3855])

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

    def test_stable_diffusion_k_euler_ancestral(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator

        components = self.get_dummy_components()
        sd_pipe = StableDiffusionPipeline(**components)
        sd_pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(sd_pipe.scheduler.config)
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(device)
        output = sd_pipe(**inputs)
        image = output.images
        image_slice = image[0, -3:, -3:, -1]

        assert image.shape == (1, 64, 64, 3)
        expected_slice = np.array([0.2682, 0.4782, 0.4855, 0.2424, 0.4472, 0.4479, 0.5612, 0.3676, 0.3854])

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

    def test_stable_diffusion_k_euler(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator

        components = self.get_dummy_components()
        sd_pipe = StableDiffusionPipeline(**components)
        sd_pipe.scheduler = EulerDiscreteScheduler.from_config(sd_pipe.scheduler.config)
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(device)
        output = sd_pipe(**inputs)
        image = output.images
        image_slice = image[0, -3:, -3:, -1]

        assert image.shape == (1, 64, 64, 3)
        expected_slice = np.array([0.2681, 0.4785, 0.4857, 0.2426, 0.4473, 0.4481, 0.5610, 0.3676, 0.3855])

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

    def test_stable_diffusion_vae_slicing(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        components = self.get_dummy_components()
        components["scheduler"] = LMSDiscreteScheduler.from_config(components["scheduler"].config)
        sd_pipe = StableDiffusionPipeline(**components)
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        image_count = 4

        inputs = self.get_dummy_inputs(device)
        inputs["prompt"] = [inputs["prompt"]] * image_count
        output_1 = sd_pipe(**inputs)

        # make sure sliced vae decode yields the same result
        sd_pipe.enable_vae_slicing()
        inputs = self.get_dummy_inputs(device)
        inputs["prompt"] = [inputs["prompt"]] * image_count
        output_2 = sd_pipe(**inputs)

        # there is a small discrepancy at image borders vs. full batch decode
        assert np.abs(output_2.images.flatten() - output_1.images.flatten()).max() < 3e-3

    def test_stable_diffusion_vae_tiling(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        components = self.get_dummy_components()

        # the safety checker is not needed for this test
        components["safety_checker"] = None
        sd_pipe = StableDiffusionPipeline(**components)
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        prompt = "A painting of a squirrel eating a burger"

        # Test that tiled decode at 512x512 yields the same result as the non-tiled decode
        generator = torch.Generator(device=device).manual_seed(0)
        output_1 = sd_pipe([prompt], generator=generator, guidance_scale=6.0, num_inference_steps=2, output_type="np")

        # make sure tiled vae decode yields the same result
        sd_pipe.enable_vae_tiling()
        generator = torch.Generator(device=device).manual_seed(0)
        output_2 = sd_pipe([prompt], generator=generator, guidance_scale=6.0, num_inference_steps=2, output_type="np")

        assert np.abs(output_2.images.flatten() - output_1.images.flatten()).max() < 5e-1

        # test that tiled decode works with various shapes
        shapes = [(1, 4, 73, 97), (1, 4, 97, 73), (1, 4, 49, 65), (1, 4, 65, 49)]
        for shape in shapes:
            zeros = torch.zeros(shape).to(device)
            sd_pipe.vae.decode(zeros)

    def test_stable_diffusion_negative_prompt(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        components = self.get_dummy_components()
        components["scheduler"] = PNDMScheduler(skip_prk_steps=True)
        sd_pipe = StableDiffusionPipeline(**components)
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(device)
        negative_prompt = "french fries"
        output = sd_pipe(**inputs, negative_prompt=negative_prompt)

        image = output.images
        image_slice = image[0, -3:, -3:, -1]

        assert image.shape == (1, 64, 64, 3)
        expected_slice = np.array([0.1907, 0.4709, 0.4858, 0.2224, 0.4223, 0.4539, 0.5606, 0.3489, 0.3900])

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

    def test_stable_diffusion_long_prompt(self):
        components = self.get_dummy_components()
        components["scheduler"] = LMSDiscreteScheduler.from_config(components["scheduler"].config)
        sd_pipe = StableDiffusionPipeline(**components)
        sd_pipe = sd_pipe.to(torch_device)
        sd_pipe.set_progress_bar_config(disable=None)

        do_classifier_free_guidance = True
        negative_prompt = None
        num_images_per_prompt = 1
        logger = logging.get_logger("diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion")
        logger.setLevel(logging.WARNING)

        prompt = 100 * "@"
        with CaptureLogger(logger) as cap_logger:
            negative_text_embeddings, text_embeddings = sd_pipe.encode_prompt(
                prompt, torch_device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt
            )
            if negative_text_embeddings is not None:
                text_embeddings = torch.cat([negative_text_embeddings, text_embeddings])

        # 100 - 77 + 1 (BOS token) + 1 (EOS token) = 25
        assert cap_logger.out.count("@") == 25

        negative_prompt = "Hello"
        with CaptureLogger(logger) as cap_logger_2:
            negative_text_embeddings_2, text_embeddings_2 = sd_pipe.encode_prompt(
                prompt, torch_device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt
            )
            if negative_text_embeddings_2 is not None:
                text_embeddings_2 = torch.cat([negative_text_embeddings_2, text_embeddings_2])

        assert cap_logger.out == cap_logger_2.out

        prompt = 25 * "@"
        with CaptureLogger(logger) as cap_logger_3:
            negative_text_embeddings_3, text_embeddings_3 = sd_pipe.encode_prompt(
                prompt, torch_device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt
            )
            if negative_text_embeddings_3 is not None:
                text_embeddings_3 = torch.cat([negative_text_embeddings_3, text_embeddings_3])

        assert text_embeddings_3.shape == text_embeddings_2.shape == text_embeddings.shape
        assert text_embeddings.shape[1] == 77
        assert cap_logger_3.out == ""

    def test_stable_diffusion_height_width_opt(self):
        components = self.get_dummy_components()
        components["scheduler"] = LMSDiscreteScheduler.from_config(components["scheduler"].config)
        sd_pipe = StableDiffusionPipeline(**components)
        sd_pipe = sd_pipe.to(torch_device)
        sd_pipe.set_progress_bar_config(disable=None)

        prompt = "hey"

        output = sd_pipe(prompt, num_inference_steps=1, output_type="np")
        image_shape = output.images[0].shape[:2]
        assert image_shape == (64, 64)

        output = sd_pipe(prompt, num_inference_steps=1, height=96, width=96, output_type="np")
        image_shape = output.images[0].shape[:2]
        assert image_shape == (96, 96)

        config = dict(sd_pipe.unet.config)
        config["sample_size"] = 96
        sd_pipe.unet = UNet2DConditionModel.from_config(config).to(torch_device)
        output = sd_pipe(prompt, num_inference_steps=1, output_type="np")
        image_shape = output.images[0].shape[:2]
        assert image_shape == (192, 192)

    def test_attention_slicing_forward_pass(self):
        super().test_attention_slicing_forward_pass(expected_max_diff=3e-3)

    def test_inference_batch_single_identical(self):
        super().test_inference_batch_single_identical(expected_max_diff=3e-3)

    # MPS currently doesn't support ComplexFloats, which are required for freeU - see https://github.com/huggingface/diffusers/issues/7569.
    @skip_mps
    def test_freeu_enabled(self):
        components = self.get_dummy_components()
        sd_pipe = StableDiffusionPipeline(**components)
        sd_pipe = sd_pipe.to(torch_device)
        sd_pipe.set_progress_bar_config(disable=None)

        prompt = "hey"
        output = sd_pipe(prompt, num_inference_steps=1, output_type="np", generator=torch.manual_seed(0)).images

        sd_pipe.enable_freeu(s1=0.9, s2=0.2, b1=1.2, b2=1.4)
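        # FreeU is training-free: b1/b2 rescale the UNet's backbone features and s1/s2 its skip
        # connections at inference time, so the output should visibly change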
        output_freeu = sd_pipe(prompt, num_inference_steps=1, output_type="np", generator=torch.manual_seed(0)).images

        assert not np.allclose(
            output[0, -3:, -3:, -1], output_freeu[0, -3:, -3:, -1]
        ), "Enabling of FreeU should lead to different results."

    def test_freeu_disabled(self):
        components = self.get_dummy_components()
        sd_pipe = StableDiffusionPipeline(**components)
        sd_pipe = sd_pipe.to(torch_device)
        sd_pipe.set_progress_bar_config(disable=None)

        prompt = "hey"
        output = sd_pipe(prompt, num_inference_steps=1, output_type="np", generator=torch.manual_seed(0)).images

        sd_pipe.enable_freeu(s1=0.9, s2=0.2, b1=1.2, b2=1.4)
        sd_pipe.disable_freeu()

        freeu_keys = {"s1", "s2", "b1", "b2"}
        for upsample_block in sd_pipe.unet.up_blocks:
            for key in freeu_keys:
                assert getattr(upsample_block, key) is None, f"Disabling of FreeU should have set {key} to None."

        output_no_freeu = sd_pipe(
            prompt, num_inference_steps=1, output_type="np", generator=torch.manual_seed(0)
        ).images

        assert np.allclose(
            output[0, -3:, -3:, -1], output_no_freeu[0, -3:, -3:, -1]
        ), "Disabling of FreeU should lead to results similar to the default pipeline results."

    def test_fused_qkv_projections(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        components = self.get_dummy_components()
        sd_pipe = StableDiffusionPipeline(**components)
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(device)
        image = sd_pipe(**inputs).images
        original_image_slice = image[0, -3:, -3:, -1]

        sd_pipe.fuse_qkv_projections()
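        # fusing merges the separate Q/K/V linear projections into a single matmul, so the
        # outputs should agree with the unfused pipeline up to floating-point error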
        inputs = self.get_dummy_inputs(device)
        image = sd_pipe(**inputs).images
        image_slice_fused = image[0, -3:, -3:, -1]

        sd_pipe.unfuse_qkv_projections()
        inputs = self.get_dummy_inputs(device)
        image = sd_pipe(**inputs).images
        image_slice_disabled = image[0, -3:, -3:, -1]

        assert np.allclose(
            original_image_slice, image_slice_fused, atol=1e-2, rtol=1e-2
        ), "Fusion of QKV projections shouldn't affect the outputs."
        assert np.allclose(
            image_slice_fused, image_slice_disabled, atol=1e-2, rtol=1e-2
        ), "Outputs, with QKV projection fusion enabled, shouldn't change when fused QKV projections are disabled."
        assert np.allclose(
            original_image_slice, image_slice_disabled, atol=1e-2, rtol=1e-2
        ), "Original outputs should match when fused QKV projections are disabled."

    def test_pipeline_interrupt(self):
        components = self.get_dummy_components()
        sd_pipe = StableDiffusionPipeline(**components)
        sd_pipe = sd_pipe.to(torch_device)
        sd_pipe.set_progress_bar_config(disable=None)

        prompt = "hey"
        num_inference_steps = 3

        # store intermediate latents from the generation process
        class PipelineState:
            def __init__(self):
                self.state = []

            def apply(self, pipe, i, t, callback_kwargs):
                self.state.append(callback_kwargs["latents"])
                return callback_kwargs

        pipe_state = PipelineState()
        sd_pipe(
            prompt,
            num_inference_steps=num_inference_steps,
            output_type="np",
            generator=torch.Generator("cpu").manual_seed(0),
            callback_on_step_end=pipe_state.apply,
        ).images

        # interrupt generation at step index
        interrupt_step_idx = 1

        def callback_on_step_end(pipe, i, t, callback_kwargs):
            if i == interrupt_step_idx:
                pipe._interrupt = True

            return callback_kwargs

        output_interrupted = sd_pipe(
            prompt,
            num_inference_steps=num_inference_steps,
            output_type="latent",
            generator=torch.Generator("cpu").manual_seed(0),
            callback_on_step_end=callback_on_step_end,
        ).images

        # fetch intermediate latents at the interrupted step
        # from the completed generation process
        intermediate_latent = pipe_state.state[interrupt_step_idx]

        # compare the intermediate latent to the output of the interrupted process
        # they should be the same
        assert torch.allclose(intermediate_latent, output_interrupted, atol=1e-4)

    def test_pipeline_accept_tuple_type_unet_sample_size(self):
        # the purpose of this test is to verify that the pipeline accepts a UNet whose sample size is tuple-typed
        sd_repo_id = "stable-diffusion-v1-5/stable-diffusion-v1-5"
        sample_size = [60, 80]
        customised_unet = UNet2DConditionModel(sample_size=sample_size)
        pipe = StableDiffusionPipeline.from_pretrained(sd_repo_id, unet=customised_unet)
        assert pipe.unet.config.sample_size == sample_size


@slow
@require_torch_gpu
class StableDiffusionPipelineSlowTests(unittest.TestCase):
    def setUp(self):
        gc.collect()
        torch.cuda.empty_cache()

    def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0):
        generator = torch.Generator(device=generator_device).manual_seed(seed)
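        # draw the latents with numpy so the starting noise is identical regardless of device or dtype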
        latents = np.random.RandomState(seed).standard_normal((1, 4, 64, 64))
        latents = torch.from_numpy(latents).to(device=device, dtype=dtype)
        inputs = {
            "prompt": "a photograph of an astronaut riding a horse",
            "latents": latents,
            "generator": generator,
            "num_inference_steps": 3,
            "guidance_scale": 7.5,
            "output_type": "np",
        }
        return inputs

    def test_stable_diffusion_1_1_pndm(self):
        sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-1")
        sd_pipe = sd_pipe.to(torch_device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_inputs(torch_device)
        image = sd_pipe(**inputs).images
        image_slice = image[0, -3:, -3:, -1].flatten()

        assert image.shape == (1, 512, 512, 3)
        expected_slice = np.array([0.4363, 0.4355, 0.3667, 0.4066, 0.3970, 0.3866, 0.4394, 0.4356, 0.4059])
        assert np.abs(image_slice - expected_slice).max() < 3e-3

    def test_stable_diffusion_v1_4_with_freeu(self):
        sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4").to(torch_device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_inputs(torch_device)
        inputs["num_inference_steps"] = 25

        sd_pipe.enable_freeu(s1=0.9, s2=0.2, b1=1.2, b2=1.4)
        image = sd_pipe(**inputs).images
        image = image[0, -3:, -3:, -1].flatten()
        expected_image = [0.0721, 0.0588, 0.0268, 0.0384, 0.0636, 0.0, 0.0429, 0.0344, 0.0309]
        max_diff = np.abs(expected_image - image).max()
        assert max_diff < 1e-3

    def test_stable_diffusion_1_4_pndm(self):
        sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4")
        sd_pipe = sd_pipe.to(torch_device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_inputs(torch_device)
        image = sd_pipe(**inputs).images
        image_slice = image[0, -3:, -3:, -1].flatten()

        assert image.shape == (1, 512, 512, 3)
        expected_slice = np.array([0.5740, 0.4784, 0.3162, 0.6358, 0.5831, 0.5505, 0.5082, 0.5631, 0.5575])
        assert np.abs(image_slice - expected_slice).max() < 3e-3

    def test_stable_diffusion_ddim(self):
        sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", safety_checker=None)
        sd_pipe.scheduler = DDIMScheduler.from_config(sd_pipe.scheduler.config)
        sd_pipe = sd_pipe.to(torch_device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_inputs(torch_device)
        image = sd_pipe(**inputs).images
        image_slice = image[0, -3:, -3:, -1].flatten()

        assert image.shape == (1, 512, 512, 3)
        expected_slice = np.array([0.38019, 0.28647, 0.27321, 0.40377, 0.38290, 0.35446, 0.39218, 0.38165, 0.42239])
        assert np.abs(image_slice - expected_slice).max() < 1e-4

    def test_stable_diffusion_lms(self):
        sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", safety_checker=None)
        sd_pipe.scheduler = LMSDiscreteScheduler.from_config(sd_pipe.scheduler.config)
        sd_pipe = sd_pipe.to(torch_device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_inputs(torch_device)
        image = sd_pipe(**inputs).images
        image_slice = image[0, -3:, -3:, -1].flatten()

        assert image.shape == (1, 512, 512, 3)
        expected_slice = np.array([0.10542, 0.09620, 0.07332, 0.09015, 0.09382, 0.07597, 0.08496, 0.07806, 0.06455])
        assert np.abs(image_slice - expected_slice).max() < 3e-3

    def test_stable_diffusion_dpm(self):
        sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", safety_checker=None)
        sd_pipe.scheduler = DPMSolverMultistepScheduler.from_config(
            sd_pipe.scheduler.config,
            final_sigmas_type="sigma_min",
        )
        sd_pipe = sd_pipe.to(torch_device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_inputs(torch_device)
        image = sd_pipe(**inputs).images
        image_slice = image[0, -3:, -3:, -1].flatten()

        assert image.shape == (1, 512, 512, 3)
        expected_slice = np.array([0.03503, 0.03494, 0.01087, 0.03128, 0.02552, 0.00803, 0.00742, 0.00372, 0.00000])
        assert np.abs(image_slice - expected_slice).max() < 3e-3

    def test_stable_diffusion_attention_slicing(self):
        torch.cuda.reset_peak_memory_stats()
        pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16)
        pipe.unet.set_default_attn_processor()
        pipe = pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)

        # enable attention slicing
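        # (the attention matmul is computed in slices, trading some speed for a lower peak memory footprint)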
        pipe.enable_attention_slicing()
        inputs = self.get_inputs(torch_device, dtype=torch.float16)
        image_sliced = pipe(**inputs).images

        mem_bytes = torch.cuda.max_memory_allocated()
        torch.cuda.reset_peak_memory_stats()
        # make sure that less than 3.75 GB is allocated
        assert mem_bytes < 3.75 * 10**9

        # disable slicing
        pipe.disable_attention_slicing()
        pipe.unet.set_default_attn_processor()
        inputs = self.get_inputs(torch_device, dtype=torch.float16)
        image = pipe(**inputs).images

        # make sure that more than 3.75 GB is allocated
        mem_bytes = torch.cuda.max_memory_allocated()
        assert mem_bytes > 3.75 * 10**9
        max_diff = numpy_cosine_similarity_distance(image_sliced.flatten(), image.flatten())
        assert max_diff < 1e-3

    def test_stable_diffusion_vae_slicing(self):
        torch.cuda.reset_peak_memory_stats()
        pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16)
        pipe = pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)
        pipe.enable_attention_slicing()

        # enable vae slicing
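        # (the batch of latents is decoded one image at a time, lowering the VAE decoder's peak memory)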
        pipe.enable_vae_slicing()
        inputs = self.get_inputs(torch_device, dtype=torch.float16)
        inputs["prompt"] = [inputs["prompt"]] * 4
        inputs["latents"] = torch.cat([inputs["latents"]] * 4)
        image_sliced = pipe(**inputs).images

        mem_bytes = torch.cuda.max_memory_allocated()
        torch.cuda.reset_peak_memory_stats()
        # make sure that less than 4 GB is allocated
        assert mem_bytes < 4e9

        # disable vae slicing
        pipe.disable_vae_slicing()
        inputs = self.get_inputs(torch_device, dtype=torch.float16)
        inputs["prompt"] = [inputs["prompt"]] * 4
        inputs["latents"] = torch.cat([inputs["latents"]] * 4)
        image = pipe(**inputs).images

        # make sure that more than 4 GB is allocated
        mem_bytes = torch.cuda.max_memory_allocated()
        assert mem_bytes > 4e9
        # There is a small discrepancy at the image borders vs. a fully batched version.
        max_diff = numpy_cosine_similarity_distance(image_sliced.flatten(), image.flatten())
        assert max_diff < 1e-2

    def test_stable_diffusion_vae_tiling(self):
        torch.cuda.reset_peak_memory_stats()
        model_id = "CompVis/stable-diffusion-v1-4"
        pipe = StableDiffusionPipeline.from_pretrained(
            model_id, variant="fp16", torch_dtype=torch.float16, safety_checker=None
        )
        pipe.set_progress_bar_config(disable=None)
        pipe.enable_attention_slicing()
        pipe.unet = pipe.unet.to(memory_format=torch.channels_last)
        pipe.vae = pipe.vae.to(memory_format=torch.channels_last)

        prompt = "a photograph of an astronaut riding a horse"

        # enable vae tiling
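        # (the large latent is decoded in overlapping tiles whose seams are blended, hence the looser tolerance below)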
        pipe.enable_vae_tiling()
        pipe.enable_model_cpu_offload()
        generator = torch.Generator(device="cpu").manual_seed(0)
        output_chunked = pipe(
            [prompt],
            width=1024,
            height=1024,
            generator=generator,
            guidance_scale=7.5,
            num_inference_steps=2,
            output_type="np",
        )
        image_chunked = output_chunked.images

        mem_bytes = torch.cuda.max_memory_allocated()

        # disable vae tiling
        pipe.disable_vae_tiling()
        generator = torch.Generator(device="cpu").manual_seed(0)
        output = pipe(
            [prompt],
            width=1024,
            height=1024,
            generator=generator,
            guidance_scale=7.5,
            num_inference_steps=2,
            output_type="np",
        )
        image = output.images

        assert mem_bytes < 1e10
        max_diff = numpy_cosine_similarity_distance(image_chunked.flatten(), image.flatten())
        assert max_diff < 1e-2

    def test_stable_diffusion_fp16_vs_autocast(self):
        # this test makes sure that the original model with autocast
        # and the new model with fp16 yield the same result
        pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16)
        pipe = pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)

        inputs = self.get_inputs(torch_device, dtype=torch.float16)
        image_fp16 = pipe(**inputs).images

        with torch.autocast(torch_device):
            inputs = self.get_inputs(torch_device)
            image_autocast = pipe(**inputs).images

        # Make sure results are close enough
        diff = np.abs(image_fp16.flatten() - image_autocast.flatten())
        # They ARE different since ops are not always run at the same precision;
        # however, they should be extremely close.
        assert diff.mean() < 2e-2

    def test_stable_diffusion_intermediate_state(self):
        number_of_steps = 0

        def callback_fn(step: int, timestep: int, latents: torch.Tensor) -> None:
            callback_fn.has_been_called = True
            nonlocal number_of_steps
            number_of_steps += 1
            if step == 1:
                latents = latents.detach().cpu().numpy()
                assert latents.shape == (1, 4, 64, 64)
                latents_slice = latents[0, -3:, -3:, -1]
                expected_slice = np.array(
                    [-0.5693, -0.3018, -0.9746, 0.0518, -0.8770, 0.7559, -1.7402, 0.1022, 1.1582]
                )

                assert np.abs(latents_slice.flatten() - expected_slice).max() < 5e-2
            elif step == 2:
                latents = latents.detach().cpu().numpy()
                assert latents.shape == (1, 4, 64, 64)
                latents_slice = latents[0, -3:, -3:, -1]
                expected_slice = np.array(
                    [-0.1958, -0.2993, -1.0166, -0.5005, -0.4810, 0.6162, -0.9492, 0.6621, 1.4492]
                )

                assert np.abs(latents_slice.flatten() - expected_slice).max() < 5e-2

        callback_fn.has_been_called = False

        pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16)
        pipe = pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)
        pipe.enable_attention_slicing()

        inputs = self.get_inputs(torch_device, dtype=torch.float16)
        pipe(**inputs, callback=callback_fn, callback_steps=1)
        assert callback_fn.has_been_called
        assert number_of_steps == inputs["num_inference_steps"]

    def test_stable_diffusion_low_cpu_mem_usage(self):
        pipeline_id = "CompVis/stable-diffusion-v1-4"

        start_time = time.time()
        pipeline_low_cpu_mem_usage = StableDiffusionPipeline.from_pretrained(pipeline_id, torch_dtype=torch.float16)
        pipeline_low_cpu_mem_usage.to(torch_device)
        low_cpu_mem_usage_time = time.time() - start_time

        start_time = time.time()
        _ = StableDiffusionPipeline.from_pretrained(pipeline_id, torch_dtype=torch.float16, low_cpu_mem_usage=False)
        normal_load_time = time.time() - start_time

        assert 2 * low_cpu_mem_usage_time < normal_load_time

    def test_stable_diffusion_pipeline_with_sequential_cpu_offloading(self):
        torch.cuda.empty_cache()
        torch.cuda.reset_max_memory_allocated()
        torch.cuda.reset_peak_memory_stats()

        pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16)
        pipe.set_progress_bar_config(disable=None)
        pipe.enable_attention_slicing(1)
        pipe.enable_sequential_cpu_offload()
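        # sequential offload moves each submodule to the GPU only for its forward pass,
        # minimizing VRAM usage at a substantial speed cost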

        inputs = self.get_inputs(torch_device, dtype=torch.float16)
        _ = pipe(**inputs)

        mem_bytes = torch.cuda.max_memory_allocated()
        # make sure that less than 2.8 GB is allocated
        assert mem_bytes < 2.8 * 10**9

    def test_stable_diffusion_pipeline_with_model_offloading(self):
        torch.cuda.empty_cache()
        torch.cuda.reset_max_memory_allocated()
        torch.cuda.reset_peak_memory_stats()

        inputs = self.get_inputs(torch_device, dtype=torch.float16)

        # Normal inference

        pipe = StableDiffusionPipeline.from_pretrained(
            "CompVis/stable-diffusion-v1-4",
            torch_dtype=torch.float16,
        )
        pipe.unet.set_default_attn_processor()
        pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)
        outputs = pipe(**inputs)
        mem_bytes = torch.cuda.max_memory_allocated()

        # With model offloading

        # Reload but don't move to cuda
        pipe = StableDiffusionPipeline.from_pretrained(
            "CompVis/stable-diffusion-v1-4",
            torch_dtype=torch.float16,
        )
        pipe.unet.set_default_attn_processor()

        torch.cuda.empty_cache()
        torch.cuda.reset_max_memory_allocated()
        torch.cuda.reset_peak_memory_stats()

        pipe.enable_model_cpu_offload()
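        # model offload keeps one whole component (text encoder, UNet, or VAE) on the GPU at a time:
        # faster than sequential offload, with smaller memory savings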
        pipe.set_progress_bar_config(disable=None)
        inputs = self.get_inputs(torch_device, dtype=torch.float16)

        outputs_offloaded = pipe(**inputs)
        mem_bytes_offloaded = torch.cuda.max_memory_allocated()

        images = outputs.images
        offloaded_images = outputs_offloaded.images

        max_diff = numpy_cosine_similarity_distance(images.flatten(), offloaded_images.flatten())
        assert max_diff < 1e-3
        assert mem_bytes_offloaded < mem_bytes
        assert mem_bytes_offloaded < 3.5 * 10**9
        for module in pipe.text_encoder, pipe.unet, pipe.vae:
            assert module.device == torch.device("cpu")

        # With attention slicing
        torch.cuda.empty_cache()
        torch.cuda.reset_max_memory_allocated()
        torch.cuda.reset_peak_memory_stats()

        pipe.enable_attention_slicing()
        _ = pipe(**inputs)
        mem_bytes_slicing = torch.cuda.max_memory_allocated()

        assert mem_bytes_slicing < mem_bytes_offloaded
        assert mem_bytes_slicing < 3 * 10**9

    def test_stable_diffusion_textual_inversion(self):
        pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4")
        pipe.load_textual_inversion("sd-concepts-library/low-poly-hd-logos-icons")

        a111_file = hf_hub_download("hf-internal-testing/text_inv_embedding_a1111_format", "winter_style.pt")
        a111_file_neg = hf_hub_download(
            "hf-internal-testing/text_inv_embedding_a1111_format", "winter_style_negative.pt"
        )
        pipe.load_textual_inversion(a111_file)
        pipe.load_textual_inversion(a111_file_neg)
        pipe.to("cuda")

        generator = torch.Generator(device="cpu").manual_seed(1)

        prompt = "An logo of a turtle in strong Style-Winter with <low-poly-hd-logos-icons>"
        neg_prompt = "Style-Winter-neg"

        image = pipe(prompt=prompt, negative_prompt=neg_prompt, generator=generator, output_type="np").images[0]
        expected_image = load_numpy(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/text_inv/winter_logo_style.npy"
        )

        max_diff = np.abs(expected_image - image).max()
        assert max_diff < 8e-1

    def test_stable_diffusion_textual_inversion_with_model_cpu_offload(self):
        pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4")
        pipe.enable_model_cpu_offload()
        pipe.load_textual_inversion("sd-concepts-library/low-poly-hd-logos-icons")

        a111_file = hf_hub_download("hf-internal-testing/text_inv_embedding_a1111_format", "winter_style.pt")
        a111_file_neg = hf_hub_download(
            "hf-internal-testing/text_inv_embedding_a1111_format", "winter_style_negative.pt"
        )
        pipe.load_textual_inversion(a111_file)
        pipe.load_textual_inversion(a111_file_neg)

        generator = torch.Generator(device="cpu").manual_seed(1)

        prompt = "An logo of a turtle in strong Style-Winter with <low-poly-hd-logos-icons>"
        neg_prompt = "Style-Winter-neg"

        image = pipe(prompt=prompt, negative_prompt=neg_prompt, generator=generator, output_type="np").images[0]
        expected_image = load_numpy(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/text_inv/winter_logo_style.npy"
        )

        max_diff = np.abs(expected_image - image).max()
        assert max_diff < 8e-1

    def test_stable_diffusion_textual_inversion_with_sequential_cpu_offload(self):
        pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4")
        pipe.enable_sequential_cpu_offload()
        pipe.load_textual_inversion("sd-concepts-library/low-poly-hd-logos-icons")

        a111_file = hf_hub_download("hf-internal-testing/text_inv_embedding_a1111_format", "winter_style.pt")
        a111_file_neg = hf_hub_download(
            "hf-internal-testing/text_inv_embedding_a1111_format", "winter_style_negative.pt"
        )
        pipe.load_textual_inversion(a111_file)
        pipe.load_textual_inversion(a111_file_neg)

        generator = torch.Generator(device="cpu").manual_seed(1)

        prompt = "An logo of a turtle in strong Style-Winter with <low-poly-hd-logos-icons>"
        neg_prompt = "Style-Winter-neg"

        image = pipe(prompt=prompt, negative_prompt=neg_prompt, generator=generator, output_type="np").images[0]
        expected_image = load_numpy(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/text_inv/winter_logo_style.npy"
        )

        max_diff = np.abs(expected_image - image).max()
        assert max_diff < 8e-1

    @is_torch_compile
    @require_torch_2
    def test_stable_diffusion_compile(self):
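        # Compilation runs in a subprocess (_test_stable_diffusion_compile above) so that
        # torch.compile state does not leak into other tests.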
        seed = 0
        inputs = self.get_inputs(torch_device, seed=seed)
        # Can't pickle a Generator object
        del inputs["generator"]
        inputs["torch_device"] = torch_device
        inputs["seed"] = seed
        run_test_in_subprocess(test_case=self, target_func=_test_stable_diffusion_compile, inputs=inputs)

    def test_stable_diffusion_lcm(self):
        unet = UNet2DConditionModel.from_pretrained("SimianLuo/LCM_Dreamshaper_v7", subfolder="unet")
        sd_pipe = StableDiffusionPipeline.from_pretrained("Lykon/dreamshaper-7", unet=unet).to(torch_device)
        sd_pipe.scheduler = LCMScheduler.from_config(sd_pipe.scheduler.config)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_inputs(torch_device)
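        # LCM is distilled for few-step sampling, so six steps suffice here.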
        inputs["num_inference_steps"] = 6
        inputs["output_type"] = "pil"

        image = sd_pipe(**inputs).images[0]

        expected_image = load_image(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/lcm_full/stable_diffusion_lcm.png"
        )

        image = sd_pipe.image_processor.pil_to_numpy(image)
        expected_image = sd_pipe.image_processor.pil_to_numpy(expected_image)

        max_diff = numpy_cosine_similarity_distance(image.flatten(), expected_image.flatten())

        assert max_diff < 1e-2


@slow
@require_torch_gpu
class StableDiffusionPipelineCkptTests(unittest.TestCase):
    def setUp(self):
        super().setUp()
        gc.collect()
        torch.cuda.empty_cache()

    def tearDown(self):
        super().tearDown()
        gc.collect()
        torch.cuda.empty_cache()

    def test_download_from_hub(self):
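        # `from_single_file` should handle direct Hub URLs to single-file checkpoints.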
        ckpt_paths = [
            "https://huggingface.co/stable-diffusion-v1-5/stable-diffusion-v1-5/blob/main/v1-5-pruned-emaonly.safetensors",
            "https://huggingface.co/WarriorMama777/OrangeMixs/blob/main/Models/AbyssOrangeMix/AbyssOrangeMix.safetensors",
        ]

        for ckpt_path in ckpt_paths:
            pipe = StableDiffusionPipeline.from_single_file(ckpt_path, torch_dtype=torch.float16)
            pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
            pipe.to("cuda")

            image_out = pipe("test", num_inference_steps=1, output_type="np").images[0]

            assert image_out.shape == (512, 512, 3)

    def test_download_local(self):
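        # `from_single_file` should also work with a local checkpoint plus its original v1 YAML config.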
        ckpt_filename = hf_hub_download(
            "stable-diffusion-v1-5/stable-diffusion-v1-5", filename="v1-5-pruned-emaonly.safetensors"
        )
        config_filename = hf_hub_download("stable-diffusion-v1-5/stable-diffusion-v1-5", filename="v1-inference.yaml")

        pipe = StableDiffusionPipeline.from_single_file(
            ckpt_filename, config_files={"v1": config_filename}, torch_dtype=torch.float16
        )
        pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
        pipe.to("cuda")

        image_out = pipe("test", num_inference_steps=1, output_type="np").images[0]

        assert image_out.shape == (512, 512, 3)


@nightly
@require_torch_gpu
class StableDiffusionPipelineNightlyTests(unittest.TestCase):
    def setUp(self):
        super().setUp()
        gc.collect()
        torch.cuda.empty_cache()

    def tearDown(self):
        super().tearDown()
        gc.collect()
        torch.cuda.empty_cache()

    def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0):
        generator = torch.Generator(device=generator_device).manual_seed(seed)
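        # Fixed, seeded latents keep nightly outputs comparable to the stored reference arrays.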
        latents = np.random.RandomState(seed).standard_normal((1, 4, 64, 64))
        latents = torch.from_numpy(latents).to(device=device, dtype=dtype)
        inputs = {
            "prompt": "a photograph of an astronaut riding a horse",
            "latents": latents,
            "generator": generator,
            "num_inference_steps": 50,
            "guidance_scale": 7.5,
            "output_type": "np",
        }
        return inputs

    def test_stable_diffusion_1_4_pndm(self):
        sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4").to(torch_device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_inputs(torch_device)
        image = sd_pipe(**inputs).images[0]

        expected_image = load_numpy(
            "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main"
            "/stable_diffusion_text2img/stable_diffusion_1_4_pndm.npy"
        )
        max_diff = np.abs(expected_image - image).max()
        assert max_diff < 1e-3

    def test_stable_diffusion_1_5_pndm(self):
        sd_pipe = StableDiffusionPipeline.from_pretrained("stable-diffusion-v1-5/stable-diffusion-v1-5").to(
            torch_device
        )
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_inputs(torch_device)
        image = sd_pipe(**inputs).images[0]

        expected_image = load_numpy(
            "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main"
            "/stable_diffusion_text2img/stable_diffusion_1_5_pndm.npy"
        )
        max_diff = np.abs(expected_image - image).max()
        assert max_diff < 1e-3

    def test_stable_diffusion_ddim(self):
        sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4").to(torch_device)
        sd_pipe.scheduler = DDIMScheduler.from_config(sd_pipe.scheduler.config)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_inputs(torch_device)
        image = sd_pipe(**inputs).images[0]

        expected_image = load_numpy(
            "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main"
            "/stable_diffusion_text2img/stable_diffusion_1_4_ddim.npy"
        )
        max_diff = np.abs(expected_image - image).max()
        assert max_diff < 3e-3

    def test_stable_diffusion_lms(self):
        sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4").to(torch_device)
        sd_pipe.scheduler = LMSDiscreteScheduler.from_config(sd_pipe.scheduler.config)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_inputs(torch_device)
        image = sd_pipe(**inputs).images[0]

        expected_image = load_numpy(
            "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main"
            "/stable_diffusion_text2img/stable_diffusion_1_4_lms.npy"
        )
        max_diff = np.abs(expected_image - image).max()
        assert max_diff < 1e-3

    def test_stable_diffusion_euler(self):
        sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4").to(torch_device)
        sd_pipe.scheduler = EulerDiscreteScheduler.from_config(sd_pipe.scheduler.config)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_inputs(torch_device)
        image = sd_pipe(**inputs).images[0]

        expected_image = load_numpy(
            "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main"
            "/stable_diffusion_text2img/stable_diffusion_1_4_euler.npy"
        )
        max_diff = np.abs(expected_image - image).max()
        assert max_diff < 1e-3


# (sayakpaul): This test suite was run on a DGX with two GPUs (1, 2).
@slow
@require_torch_multi_gpu
@require_accelerate_version_greater("0.27.0")
class StableDiffusionPipelineDeviceMapTests(unittest.TestCase):
    def tearDown(self):
        super().tearDown()
        gc.collect()
        torch.cuda.empty_cache()

    def get_inputs(self, generator_device="cpu", seed=0):
        generator = torch.Generator(device=generator_device).manual_seed(seed)
        inputs = {
            "prompt": "a photograph of an astronaut riding a horse",
            "generator": generator,
            "num_inference_steps": 50,
            "guidance_scale": 7.5,
            "output_type": "np",
        }
        return inputs

    def get_pipeline_output_without_device_map(self):
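        # Baseline: load the whole pipeline on a single device (no device_map) for comparison.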
        sd_pipe = StableDiffusionPipeline.from_pretrained(
            "stable-diffusion-v1-5/stable-diffusion-v1-5", torch_dtype=torch.float16
        ).to(torch_device)
        sd_pipe.set_progress_bar_config(disable=True)
        inputs = self.get_inputs()
        no_device_map_image = sd_pipe(**inputs).images

        del sd_pipe

        return no_device_map_image

    def test_forward_pass_balanced_device_map(self):
        no_device_map_image = self.get_pipeline_output_without_device_map()

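        # A "balanced" device map spreads components across the available GPUs; the output
        # should still match the single-device baseline.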
        sd_pipe_with_device_map = StableDiffusionPipeline.from_pretrained(
            "stable-diffusion-v1-5/stable-diffusion-v1-5", device_map="balanced", torch_dtype=torch.float16
        )
        sd_pipe_with_device_map.set_progress_bar_config(disable=True)
        inputs = self.get_inputs()
        device_map_image = sd_pipe_with_device_map(**inputs).images

        max_diff = np.abs(device_map_image - no_device_map_image).max()
        assert max_diff < 1e-3

    def test_components_put_in_right_devices(self):
        sd_pipe_with_device_map = StableDiffusionPipeline.from_pretrained(
            "stable-diffusion-v1-5/stable-diffusion-v1-5", device_map="balanced", torch_dtype=torch.float16
        )

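        # With a balanced map on multiple GPUs, the components should span at least two devices.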
        assert len(set(sd_pipe_with_device_map.hf_device_map.values())) >= 2

    def test_max_memory(self):
        no_device_map_image = self.get_pipeline_output_without_device_map()

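        # `max_memory` caps what the balanced device map may allocate on each GPU.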
        sd_pipe_with_device_map = StableDiffusionPipeline.from_pretrained(
            "stable-diffusion-v1-5/stable-diffusion-v1-5",
            device_map="balanced",
            max_memory={0: "1GB", 1: "1GB"},
            torch_dtype=torch.float16,
        )
        sd_pipe_with_device_map.set_progress_bar_config(disable=True)
        inputs = self.get_inputs()
        device_map_image = sd_pipe_with_device_map(**inputs).images

        max_diff = np.abs(device_map_image - no_device_map_image).max()
        assert max_diff < 1e-3

    def test_reset_device_map(self):
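        # `reset_device_map()` should clear the device map and return all modules to the CPU.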
        sd_pipe_with_device_map = StableDiffusionPipeline.from_pretrained(
            "stable-diffusion-v1-5/stable-diffusion-v1-5", device_map="balanced", torch_dtype=torch.float16
        )
        sd_pipe_with_device_map.reset_device_map()

        assert sd_pipe_with_device_map.hf_device_map is None

        for name, component in sd_pipe_with_device_map.components.items():
            if isinstance(component, torch.nn.Module):
                assert component.device.type == "cpu"

    def test_reset_device_map_to(self):
        sd_pipe_with_device_map = StableDiffusionPipeline.from_pretrained(
            "stable-diffusion-v1-5/stable-diffusion-v1-5", device_map="balanced", torch_dtype=torch.float16
        )
        sd_pipe_with_device_map.reset_device_map()

        assert sd_pipe_with_device_map.hf_device_map is None

        # Make sure `to()` can be used and the pipeline can be called.
        pipe = sd_pipe_with_device_map.to("cuda")
        _ = pipe("hello", num_inference_steps=2)

    def test_reset_device_map_enable_model_cpu_offload(self):
        sd_pipe_with_device_map = StableDiffusionPipeline.from_pretrained(
            "stable-diffusion-v1-5/stable-diffusion-v1-5", device_map="balanced", torch_dtype=torch.float16
        )
        sd_pipe_with_device_map.reset_device_map()

        assert sd_pipe_with_device_map.hf_device_map is None

        # Make sure `enable_model_cpu_offload()` can be used and the pipeline can be called.
        sd_pipe_with_device_map.enable_model_cpu_offload()
        _ = sd_pipe_with_device_map("hello", num_inference_steps=2)

    def test_reset_device_map_enable_sequential_cpu_offload(self):
        sd_pipe_with_device_map = StableDiffusionPipeline.from_pretrained(
            "stable-diffusion-v1-5/stable-diffusion-v1-5", device_map="balanced", torch_dtype=torch.float16
        )
        sd_pipe_with_device_map.reset_device_map()

        assert sd_pipe_with_device_map.hf_device_map is None

        # Make sure `enable_sequential_cpu_offload()` can be used and the pipeline can be called.
        sd_pipe_with_device_map.enable_sequential_cpu_offload()
        _ = sd_pipe_with_device_map("hello", num_inference_steps=2)