# coding=utf-8
# Copyright 2024 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import gc
import tempfile
import time
import traceback
import unittest

import numpy as np
import torch
from huggingface_hub import hf_hub_download
from transformers import (
    CLIPTextConfig,
    CLIPTextModel,
    CLIPTokenizer,
)

from diffusers import (
    AutoencoderKL,
    DDIMScheduler,
    DPMSolverMultistepScheduler,
    EulerAncestralDiscreteScheduler,
    EulerDiscreteScheduler,
    LCMScheduler,
    LMSDiscreteScheduler,
    PNDMScheduler,
    StableDiffusionPipeline,
    UNet2DConditionModel,
    logging,
)
from diffusers.utils.testing_utils import (
    CaptureLogger,
    enable_full_determinism,
    is_torch_compile,
    load_image,
    load_numpy,
    nightly,
    numpy_cosine_similarity_distance,
    require_accelerate_version_greater,
    require_torch_2,
    require_torch_gpu,
    require_torch_multi_gpu,
    run_test_in_subprocess,
    skip_mps,
    slow,
    torch_device,
)

from ..pipeline_params import (
    TEXT_TO_IMAGE_BATCH_PARAMS,
    TEXT_TO_IMAGE_CALLBACK_CFG_PARAMS,
    TEXT_TO_IMAGE_IMAGE_PARAMS,
    TEXT_TO_IMAGE_PARAMS,
)
from ..test_pipelines_common import (
    IPAdapterTesterMixin,
    PipelineKarrasSchedulerTesterMixin,
    PipelineLatentTesterMixin,
    PipelineTesterMixin,
)


enable_full_determinism()


# Will be run via run_test_in_subprocess
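# (run in a child process so torch.compile/dynamo state cannot leak into other
# tests; a torch.Generator is not picklable, so the seed and device are passed
# through the queue and the generator is rebuilt inside the subprocess)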
def _test_stable_diffusion_compile(in_queue, out_queue, timeout):
    error = None
    try:
        inputs = in_queue.get(timeout=timeout)
        torch_device = inputs.pop("torch_device")
        seed = inputs.pop("seed")
        inputs["generator"] = torch.Generator(device=torch_device).manual_seed(seed)

        sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", safety_checker=None)
        sd_pipe.scheduler = DDIMScheduler.from_config(sd_pipe.scheduler.config)
        sd_pipe = sd_pipe.to(torch_device)

        sd_pipe.unet.to(memory_format=torch.channels_last)
        sd_pipe.unet = torch.compile(sd_pipe.unet, mode="reduce-overhead", fullgraph=True)

        sd_pipe.set_progress_bar_config(disable=None)

        image = sd_pipe(**inputs).images
        image_slice = image[0, -3:, -3:, -1].flatten()

        assert image.shape == (1, 512, 512, 3)
        expected_slice = np.array([0.38019, 0.28647, 0.27321, 0.40377, 0.38290, 0.35446, 0.39218, 0.38165, 0.42239])

        assert np.abs(image_slice - expected_slice).max() < 5e-3
    except Exception:
        error = f"{traceback.format_exc()}"

    results = {"error": error}
    out_queue.put(results, timeout=timeout)
    out_queue.join()


class StableDiffusionPipelineFastTests(
    IPAdapterTesterMixin,
    PipelineLatentTesterMixin,
    PipelineKarrasSchedulerTesterMixin,
    PipelineTesterMixin,
    unittest.TestCase,
119
):
    pipeline_class = StableDiffusionPipeline
    params = TEXT_TO_IMAGE_PARAMS
    batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
    image_params = TEXT_TO_IMAGE_IMAGE_PARAMS
    image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS
    callback_cfg_params = TEXT_TO_IMAGE_CALLBACK_CFG_PARAMS
    test_layerwise_casting = True
    test_group_offloading = True

    def get_dummy_components(self, time_cond_proj_dim=None):
        cross_attention_dim = 8

        torch.manual_seed(0)
        unet = UNet2DConditionModel(
            block_out_channels=(4, 8),
            layers_per_block=1,
            sample_size=32,
            time_cond_proj_dim=time_cond_proj_dim,
            in_channels=4,
            out_channels=4,
            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
            cross_attention_dim=cross_attention_dim,
            norm_num_groups=2,
        )
        scheduler = DDIMScheduler(
            beta_start=0.00085,
            beta_end=0.012,
            beta_schedule="scaled_linear",
            clip_sample=False,
            set_alpha_to_one=False,
        )
        torch.manual_seed(0)
        vae = AutoencoderKL(
            block_out_channels=[4, 8],
            in_channels=3,
            out_channels=3,
            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
            latent_channels=4,
            norm_num_groups=2,
        )
        torch.manual_seed(0)
        text_encoder_config = CLIPTextConfig(
            bos_token_id=0,
            eos_token_id=2,
            hidden_size=cross_attention_dim,
            intermediate_size=16,
            layer_norm_eps=1e-05,
            num_attention_heads=2,
            num_hidden_layers=2,
            pad_token_id=1,
            vocab_size=1000,
        )
        text_encoder = CLIPTextModel(text_encoder_config)
        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")

        components = {
            "unet": unet,
            "scheduler": scheduler,
            "vae": vae,
            "text_encoder": text_encoder,
            "tokenizer": tokenizer,
            "safety_checker": None,
            "feature_extractor": None,
            "image_encoder": None,
        }
        return components
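    # A minimal sketch of how the dummy components above are consumed by the fast
    # tests below (nothing new is assumed here, the names mirror the fixtures in
    # this class):
    #
    #     pipe = StableDiffusionPipeline(**self.get_dummy_components()).to("cpu")
    #     image = pipe(**self.get_dummy_inputs("cpu")).images  # shape (1, 64, 64, 3)
    #
    # Every sub-model is intentionally tiny (2-layer CLIP text encoder, 8-dim
    # cross-attention width) so the whole pipeline runs on CPU in seconds.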

    def get_dummy_inputs(self, device, seed=0):
        if str(device).startswith("mps"):
            generator = torch.manual_seed(seed)
        else:
            generator = torch.Generator(device=device).manual_seed(seed)
        inputs = {
            "prompt": "A painting of a squirrel eating a burger",
            "generator": generator,
            "num_inference_steps": 2,
            "guidance_scale": 6.0,
            "output_type": "np",
        }
        return inputs

    def test_stable_diffusion_ddim(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator

        components = self.get_dummy_components()
        sd_pipe = StableDiffusionPipeline(**components)
        sd_pipe = sd_pipe.to(torch_device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(device)
        output = sd_pipe(**inputs)
        image = output.images

        image_slice = image[0, -3:, -3:, -1]

        assert image.shape == (1, 64, 64, 3)
        expected_slice = np.array([0.1763, 0.4776, 0.4986, 0.2566, 0.3802, 0.4596, 0.5363, 0.3277, 0.3949])

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

    def test_stable_diffusion_lcm(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator

        components = self.get_dummy_components(time_cond_proj_dim=256)
        sd_pipe = StableDiffusionPipeline(**components)
        sd_pipe.scheduler = LCMScheduler.from_config(sd_pipe.scheduler.config)
        sd_pipe = sd_pipe.to(torch_device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(device)
        output = sd_pipe(**inputs)
        image = output.images

        image_slice = image[0, -3:, -3:, -1]

        assert image.shape == (1, 64, 64, 3)
        expected_slice = np.array([0.2368, 0.4900, 0.5019, 0.2723, 0.4473, 0.4578, 0.4551, 0.3532, 0.4133])

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

    def test_stable_diffusion_lcm_custom_timesteps(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator

        components = self.get_dummy_components(time_cond_proj_dim=256)
        sd_pipe = StableDiffusionPipeline(**components)
        sd_pipe.scheduler = LCMScheduler.from_config(sd_pipe.scheduler.config)
        sd_pipe = sd_pipe.to(torch_device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(device)
        del inputs["num_inference_steps"]
        inputs["timesteps"] = [999, 499]
        output = sd_pipe(**inputs)
        image = output.images

        image_slice = image[0, -3:, -3:, -1]

        assert image.shape == (1, 64, 64, 3)
        expected_slice = np.array([0.2368, 0.4900, 0.5019, 0.2723, 0.4473, 0.4578, 0.4551, 0.3532, 0.4133])

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

    def test_stable_diffusion_ays(self):
        from diffusers.schedulers import AysSchedules

        timestep_schedule = AysSchedules["StableDiffusionTimesteps"]
        sigma_schedule = AysSchedules["StableDiffusionSigmas"]

        device = "cpu"  # ensure determinism for the device-dependent torch.Generator

        components = self.get_dummy_components(time_cond_proj_dim=256)
        sd_pipe = StableDiffusionPipeline(**components)
        sd_pipe.scheduler = EulerDiscreteScheduler.from_config(sd_pipe.scheduler.config)
        sd_pipe = sd_pipe.to(torch_device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(device)
        inputs["num_inference_steps"] = 10
        output = sd_pipe(**inputs).images

        inputs = self.get_dummy_inputs(device)
        inputs["num_inference_steps"] = None
        inputs["timesteps"] = timestep_schedule
        output_ts = sd_pipe(**inputs).images

        inputs = self.get_dummy_inputs(device)
        inputs["num_inference_steps"] = None
        inputs["sigmas"] = sigma_schedule
        output_sigmas = sd_pipe(**inputs).images

        assert (
            np.abs(output_sigmas.flatten() - output_ts.flatten()).max() < 1e-3
        ), "ays timesteps and ays sigmas should have the same outputs"
        assert (
            np.abs(output.flatten() - output_ts.flatten()).max() > 1e-3
        ), "use ays timesteps should have different outputs"
        assert (
            np.abs(output.flatten() - output_sigmas.flatten()).max() > 1e-3
        ), "use ays sigmas should have different outputs"

    def test_stable_diffusion_prompt_embeds(self):
        components = self.get_dummy_components()
        sd_pipe = StableDiffusionPipeline(**components)
        sd_pipe = sd_pipe.to(torch_device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(torch_device)
        inputs["prompt"] = 3 * [inputs["prompt"]]

        # forward
        output = sd_pipe(**inputs)
        image_slice_1 = output.images[0, -3:, -3:, -1]

        inputs = self.get_dummy_inputs(torch_device)
        prompt = 3 * [inputs.pop("prompt")]

        text_inputs = sd_pipe.tokenizer(
            prompt,
            padding="max_length",
            max_length=sd_pipe.tokenizer.model_max_length,
            truncation=True,
            return_tensors="pt",
        )
        text_inputs = text_inputs["input_ids"].to(torch_device)

        prompt_embeds = sd_pipe.text_encoder(text_inputs)[0]

        inputs["prompt_embeds"] = prompt_embeds

        # forward
        output = sd_pipe(**inputs)
        image_slice_2 = output.images[0, -3:, -3:, -1]

        assert np.abs(image_slice_1.flatten() - image_slice_2.flatten()).max() < 1e-4

    def test_stable_diffusion_negative_prompt_embeds(self):
        components = self.get_dummy_components()
        sd_pipe = StableDiffusionPipeline(**components)
        sd_pipe = sd_pipe.to(torch_device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(torch_device)
        negative_prompt = 3 * ["this is a negative prompt"]
        inputs["negative_prompt"] = negative_prompt
        inputs["prompt"] = 3 * [inputs["prompt"]]

        # forward
        output = sd_pipe(**inputs)
        image_slice_1 = output.images[0, -3:, -3:, -1]

        inputs = self.get_dummy_inputs(torch_device)
        prompt = 3 * [inputs.pop("prompt")]

        embeds = []
        for p in [prompt, negative_prompt]:
            text_inputs = sd_pipe.tokenizer(
                p,
                padding="max_length",
                max_length=sd_pipe.tokenizer.model_max_length,
                truncation=True,
                return_tensors="pt",
            )
            text_inputs = text_inputs["input_ids"].to(torch_device)

            embeds.append(sd_pipe.text_encoder(text_inputs)[0])

        inputs["prompt_embeds"], inputs["negative_prompt_embeds"] = embeds

        # forward
        output = sd_pipe(**inputs)
        image_slice_2 = output.images[0, -3:, -3:, -1]

        assert np.abs(image_slice_1.flatten() - image_slice_2.flatten()).max() < 1e-4

    def test_stable_diffusion_ddim_factor_8(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator

        components = self.get_dummy_components()
        sd_pipe = StableDiffusionPipeline(**components)
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(device)
        output = sd_pipe(**inputs, height=136, width=136)
        image = output.images

        image_slice = image[0, -3:, -3:, -1]

        assert image.shape == (1, 136, 136, 3)
        expected_slice = np.array([0.4720, 0.5426, 0.5160, 0.3961, 0.4696, 0.4296, 0.5738, 0.5888, 0.5481])

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

    def test_stable_diffusion_pndm(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        components = self.get_dummy_components()
        sd_pipe = StableDiffusionPipeline(**components)
        sd_pipe.scheduler = PNDMScheduler(skip_prk_steps=True)
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(device)
        output = sd_pipe(**inputs)
        image = output.images
        image_slice = image[0, -3:, -3:, -1]

        assert image.shape == (1, 64, 64, 3)
        expected_slice = np.array([0.1941, 0.4748, 0.4880, 0.2222, 0.4221, 0.4545, 0.5604, 0.3488, 0.3902])

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

    def test_stable_diffusion_no_safety_checker(self):
        pipe = StableDiffusionPipeline.from_pretrained(
            "hf-internal-testing/tiny-stable-diffusion-lms-pipe", safety_checker=None
        )
        assert isinstance(pipe, StableDiffusionPipeline)
        assert isinstance(pipe.scheduler, LMSDiscreteScheduler)
        assert pipe.safety_checker is None

        image = pipe("example prompt", num_inference_steps=2).images[0]
        assert image is not None

        # check that there's no error when saving a pipeline with one of the models being None
        with tempfile.TemporaryDirectory() as tmpdirname:
            pipe.save_pretrained(tmpdirname)
            pipe = StableDiffusionPipeline.from_pretrained(tmpdirname)

        # sanity check that the pipeline still works
        assert pipe.safety_checker is None
        image = pipe("example prompt", num_inference_steps=2).images[0]
        assert image is not None

    def test_stable_diffusion_k_lms(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator

        components = self.get_dummy_components()
        sd_pipe = StableDiffusionPipeline(**components)
        sd_pipe.scheduler = LMSDiscreteScheduler.from_config(sd_pipe.scheduler.config)
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(device)
        output = sd_pipe(**inputs)
        image = output.images
        image_slice = image[0, -3:, -3:, -1]

        assert image.shape == (1, 64, 64, 3)
        expected_slice = np.array([0.2681, 0.4785, 0.4857, 0.2426, 0.4473, 0.4481, 0.5610, 0.3676, 0.3855])

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

    def test_stable_diffusion_k_euler_ancestral(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator

        components = self.get_dummy_components()
        sd_pipe = StableDiffusionPipeline(**components)
        sd_pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(sd_pipe.scheduler.config)
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(device)
        output = sd_pipe(**inputs)
        image = output.images
        image_slice = image[0, -3:, -3:, -1]

        assert image.shape == (1, 64, 64, 3)
        expected_slice = np.array([0.2682, 0.4782, 0.4855, 0.2424, 0.4472, 0.4479, 0.5612, 0.3676, 0.3854])

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

    def test_stable_diffusion_k_euler(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator

        components = self.get_dummy_components()
        sd_pipe = StableDiffusionPipeline(**components)
        sd_pipe.scheduler = EulerDiscreteScheduler.from_config(sd_pipe.scheduler.config)
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(device)
        output = sd_pipe(**inputs)
        image = output.images
        image_slice = image[0, -3:, -3:, -1]

        assert image.shape == (1, 64, 64, 3)
        expected_slice = np.array([0.2681, 0.4785, 0.4857, 0.2426, 0.4473, 0.4481, 0.5610, 0.3676, 0.3855])

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

    def test_stable_diffusion_vae_slicing(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        components = self.get_dummy_components()
        components["scheduler"] = LMSDiscreteScheduler.from_config(components["scheduler"].config)
        sd_pipe = StableDiffusionPipeline(**components)
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        image_count = 4

        inputs = self.get_dummy_inputs(device)
        inputs["prompt"] = [inputs["prompt"]] * image_count
        output_1 = sd_pipe(**inputs)

        # make sure sliced vae decode yields the same result
        sd_pipe.enable_vae_slicing()
        inputs = self.get_dummy_inputs(device)
        inputs["prompt"] = [inputs["prompt"]] * image_count
        output_2 = sd_pipe(**inputs)

        # there is a small discrepancy at image borders vs. full batch decode
        assert np.abs(output_2.images.flatten() - output_1.images.flatten()).max() < 3e-3

    def test_stable_diffusion_vae_tiling(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        components = self.get_dummy_components()

        # run without the safety checker since it is not needed for this test
        components["safety_checker"] = None
        sd_pipe = StableDiffusionPipeline(**components)
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        prompt = "A painting of a squirrel eating a burger"

        # Test that tiled decode at 512x512 yields the same result as the non-tiled decode
        generator = torch.Generator(device=device).manual_seed(0)
        output_1 = sd_pipe([prompt], generator=generator, guidance_scale=6.0, num_inference_steps=2, output_type="np")

        # make sure tiled vae decode yields the same result
        sd_pipe.enable_vae_tiling()
        generator = torch.Generator(device=device).manual_seed(0)
        output_2 = sd_pipe([prompt], generator=generator, guidance_scale=6.0, num_inference_steps=2, output_type="np")

        assert np.abs(output_2.images.flatten() - output_1.images.flatten()).max() < 5e-1

        # test that tiled decode works with various shapes
        shapes = [(1, 4, 73, 97), (1, 4, 97, 73), (1, 4, 49, 65), (1, 4, 65, 49)]
        for shape in shapes:
            zeros = torch.zeros(shape).to(device)
            sd_pipe.vae.decode(zeros)
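        # the irregular latent sizes above (not multiples of the tile size) are
        # presumably there to exercise partial edge tiles in the tiled decoder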

    def test_stable_diffusion_negative_prompt(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        components = self.get_dummy_components()
        components["scheduler"] = PNDMScheduler(skip_prk_steps=True)
        sd_pipe = StableDiffusionPipeline(**components)
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(device)
        negative_prompt = "french fries"
        output = sd_pipe(**inputs, negative_prompt=negative_prompt)

        image = output.images
        image_slice = image[0, -3:, -3:, -1]

        assert image.shape == (1, 64, 64, 3)
        expected_slice = np.array([0.1907, 0.4709, 0.4858, 0.2224, 0.4223, 0.4539, 0.5606, 0.3489, 0.3900])

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

    def test_stable_diffusion_long_prompt(self):
        components = self.get_dummy_components()
        components["scheduler"] = LMSDiscreteScheduler.from_config(components["scheduler"].config)
        sd_pipe = StableDiffusionPipeline(**components)
        sd_pipe = sd_pipe.to(torch_device)
        sd_pipe.set_progress_bar_config(disable=None)

        do_classifier_free_guidance = True
        negative_prompt = None
        num_images_per_prompt = 1
        logger = logging.get_logger("diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion")
        logger.setLevel(logging.WARNING)

        prompt = 100 * "@"
        with CaptureLogger(logger) as cap_logger:
            negative_text_embeddings, text_embeddings = sd_pipe.encode_prompt(
                prompt, torch_device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt
            )
            if negative_text_embeddings is not None:
                text_embeddings = torch.cat([negative_text_embeddings, text_embeddings])

        # 100 - 77 + 1 (BOS token) + 1 (EOS token) = 25
        assert cap_logger.out.count("@") == 25
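        # (each "@" is a single CLIP token, and the 77-token window keeps only 75
        # content tokens once BOS/EOS are accounted for, so 25 characters end up
        # echoed in the truncation warning)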

        negative_prompt = "Hello"
        with CaptureLogger(logger) as cap_logger_2:
            negative_text_embeddings_2, text_embeddings_2 = sd_pipe.encode_prompt(
                prompt, torch_device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt
            )
            if negative_text_embeddings_2 is not None:
                text_embeddings_2 = torch.cat([negative_text_embeddings_2, text_embeddings_2])

        assert cap_logger.out == cap_logger_2.out

        prompt = 25 * "@"
        with CaptureLogger(logger) as cap_logger_3:
            negative_text_embeddings_3, text_embeddings_3 = sd_pipe.encode_prompt(
                prompt, torch_device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt
            )
            if negative_text_embeddings_3 is not None:
                text_embeddings_3 = torch.cat([negative_text_embeddings_3, text_embeddings_3])

        assert text_embeddings_3.shape == text_embeddings_2.shape == text_embeddings.shape
        assert text_embeddings.shape[1] == 77
        assert cap_logger_3.out == ""

    def test_stable_diffusion_height_width_opt(self):
        components = self.get_dummy_components()
        components["scheduler"] = LMSDiscreteScheduler.from_config(components["scheduler"].config)
        sd_pipe = StableDiffusionPipeline(**components)
        sd_pipe = sd_pipe.to(torch_device)
        sd_pipe.set_progress_bar_config(disable=None)

        prompt = "hey"

        output = sd_pipe(prompt, num_inference_steps=1, output_type="np")
        image_shape = output.images[0].shape[:2]
        assert image_shape == (64, 64)

        output = sd_pipe(prompt, num_inference_steps=1, height=96, width=96, output_type="np")
        image_shape = output.images[0].shape[:2]
        assert image_shape == (96, 96)

        config = dict(sd_pipe.unet.config)
        config["sample_size"] = 96
        sd_pipe.unet = UNet2DConditionModel.from_config(config).to(torch_device)
        output = sd_pipe(prompt, num_inference_steps=1, output_type="np")
        image_shape = output.images[0].shape[:2]
        assert image_shape == (192, 192)

    def test_attention_slicing_forward_pass(self):
        super().test_attention_slicing_forward_pass(expected_max_diff=3e-3)

    def test_inference_batch_single_identical(self):
        super().test_inference_batch_single_identical(expected_max_diff=3e-3)

    # MPS currently doesn't support ComplexFloats, which are required for freeU - see https://github.com/huggingface/diffusers/issues/7569.
    @skip_mps
    def test_freeu_enabled(self):
        components = self.get_dummy_components()
        sd_pipe = StableDiffusionPipeline(**components)
        sd_pipe = sd_pipe.to(torch_device)
        sd_pipe.set_progress_bar_config(disable=None)

        prompt = "hey"
        output = sd_pipe(prompt, num_inference_steps=1, output_type="np", generator=torch.manual_seed(0)).images

        sd_pipe.enable_freeu(s1=0.9, s2=0.2, b1=1.2, b2=1.4)
        output_freeu = sd_pipe(prompt, num_inference_steps=1, output_type="np", generator=torch.manual_seed(0)).images

        assert not np.allclose(
            output[0, -3:, -3:, -1], output_freeu[0, -3:, -3:, -1]
        ), "Enabling of FreeU should lead to different results."

    def test_freeu_disabled(self):
        components = self.get_dummy_components()
        sd_pipe = StableDiffusionPipeline(**components)
        sd_pipe = sd_pipe.to(torch_device)
        sd_pipe.set_progress_bar_config(disable=None)

        prompt = "hey"
        output = sd_pipe(prompt, num_inference_steps=1, output_type="np", generator=torch.manual_seed(0)).images

        sd_pipe.enable_freeu(s1=0.9, s2=0.2, b1=1.2, b2=1.4)
        sd_pipe.disable_freeu()

        freeu_keys = {"s1", "s2", "b1", "b2"}
        for upsample_block in sd_pipe.unet.up_blocks:
            for key in freeu_keys:
                assert getattr(upsample_block, key) is None, f"Disabling of FreeU should have set {key} to None."

        output_no_freeu = sd_pipe(
            prompt, num_inference_steps=1, output_type="np", generator=torch.manual_seed(0)
        ).images

        assert np.allclose(
            output[0, -3:, -3:, -1], output_no_freeu[0, -3:, -3:, -1]
        ), "Disabling of FreeU should lead to results similar to the default pipeline results."

    def test_fused_qkv_projections(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        components = self.get_dummy_components()
        sd_pipe = StableDiffusionPipeline(**components)
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(device)
        image = sd_pipe(**inputs).images
        original_image_slice = image[0, -3:, -3:, -1]

        sd_pipe.fuse_qkv_projections()
        inputs = self.get_dummy_inputs(device)
        image = sd_pipe(**inputs).images
        image_slice_fused = image[0, -3:, -3:, -1]

        sd_pipe.unfuse_qkv_projections()
        inputs = self.get_dummy_inputs(device)
        image = sd_pipe(**inputs).images
        image_slice_disabled = image[0, -3:, -3:, -1]

        assert np.allclose(
            original_image_slice, image_slice_fused, atol=1e-2, rtol=1e-2
        ), "Fusion of QKV projections shouldn't affect the outputs."
        assert np.allclose(
            image_slice_fused, image_slice_disabled, atol=1e-2, rtol=1e-2
        ), "Outputs, with QKV projection fusion enabled, shouldn't change when fused QKV projections are disabled."
        assert np.allclose(
            original_image_slice, image_slice_disabled, atol=1e-2, rtol=1e-2
        ), "Original outputs should match when fused QKV projections are disabled."

    def test_pipeline_interrupt(self):
        components = self.get_dummy_components()
        sd_pipe = StableDiffusionPipeline(**components)
        sd_pipe = sd_pipe.to(torch_device)
        sd_pipe.set_progress_bar_config(disable=None)

        prompt = "hey"
        num_inference_steps = 3

        # store intermediate latents from the generation process
        class PipelineState:
            def __init__(self):
                self.state = []

            def apply(self, pipe, i, t, callback_kwargs):
                self.state.append(callback_kwargs["latents"])
                return callback_kwargs

        pipe_state = PipelineState()
        sd_pipe(
            prompt,
            num_inference_steps=num_inference_steps,
            output_type="np",
            generator=torch.Generator("cpu").manual_seed(0),
            callback_on_step_end=pipe_state.apply,
        ).images

        # interrupt generation at step index
        interrupt_step_idx = 1

        def callback_on_step_end(pipe, i, t, callback_kwargs):
            if i == interrupt_step_idx:
                pipe._interrupt = True

            return callback_kwargs

        output_interrupted = sd_pipe(
            prompt,
            num_inference_steps=num_inference_steps,
            output_type="latent",
            generator=torch.Generator("cpu").manual_seed(0),
            callback_on_step_end=callback_on_step_end,
        ).images

        # fetch intermediate latents at the interrupted step
        # from the completed generation process
        intermediate_latent = pipe_state.state[interrupt_step_idx]

        # compare the intermediate latent to the output of the interrupted process
        # they should be the same
        assert torch.allclose(intermediate_latent, output_interrupted, atol=1e-4)

    def test_pipeline_accept_tuple_type_unet_sample_size(self):
        # the purpose of this test is to see whether the pipeline would accept a unet with the tuple-typed sample size
        sd_repo_id = "stable-diffusion-v1-5/stable-diffusion-v1-5"
        sample_size = [60, 80]
        customised_unet = UNet2DConditionModel(sample_size=sample_size)
        pipe = StableDiffusionPipeline.from_pretrained(sd_repo_id, unet=customised_unet)
        assert pipe.unet.config.sample_size == sample_size

    def test_encode_prompt_works_in_isolation(self):
        extra_required_param_value_dict = {
            "device": torch.device(torch_device).type,
            "do_classifier_free_guidance": self.get_dummy_inputs(device=torch_device).get("guidance_scale", 1.0) > 1.0,
        }
        return super().test_encode_prompt_works_in_isolation(extra_required_param_value_dict)


@slow
@require_torch_gpu
class StableDiffusionPipelineSlowTests(unittest.TestCase):
    def setUp(self):
        gc.collect()
        torch.cuda.empty_cache()

    def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0):
        generator = torch.Generator(device=generator_device).manual_seed(seed)
        latents = np.random.RandomState(seed).standard_normal((1, 4, 64, 64))
        latents = torch.from_numpy(latents).to(device=device, dtype=dtype)
        inputs = {
            "prompt": "a photograph of an astronaut riding a horse",
            "latents": latents,
            "generator": generator,
            "num_inference_steps": 3,
            "guidance_scale": 7.5,
            "output_type": "np",
        }
        return inputs
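    # Latents are pre-generated with NumPy and a fixed seed (rather than drawn on
    # the target device) so the slow tests below start from the same noise
    # regardless of which GPU/driver produced the run.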

    def test_stable_diffusion_1_1_pndm(self):
        sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-1")
        sd_pipe = sd_pipe.to(torch_device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_inputs(torch_device)
        image = sd_pipe(**inputs).images
        image_slice = image[0, -3:, -3:, -1].flatten()

        assert image.shape == (1, 512, 512, 3)
        expected_slice = np.array([0.4363, 0.4355, 0.3667, 0.4066, 0.3970, 0.3866, 0.4394, 0.4356, 0.4059])
        assert np.abs(image_slice - expected_slice).max() < 3e-3

    def test_stable_diffusion_v1_4_with_freeu(self):
        sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4").to(torch_device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_inputs(torch_device)
        inputs["num_inference_steps"] = 25

        sd_pipe.enable_freeu(s1=0.9, s2=0.2, b1=1.2, b2=1.4)
        image = sd_pipe(**inputs).images
        image = image[0, -3:, -3:, -1].flatten()
        expected_image = [0.0721, 0.0588, 0.0268, 0.0384, 0.0636, 0.0, 0.0429, 0.0344, 0.0309]
        max_diff = np.abs(expected_image - image).max()
        assert max_diff < 1e-3

    def test_stable_diffusion_1_4_pndm(self):
        sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4")
        sd_pipe = sd_pipe.to(torch_device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_inputs(torch_device)
        image = sd_pipe(**inputs).images
        image_slice = image[0, -3:, -3:, -1].flatten()

        assert image.shape == (1, 512, 512, 3)
        expected_slice = np.array([0.5740, 0.4784, 0.3162, 0.6358, 0.5831, 0.5505, 0.5082, 0.5631, 0.5575])
        assert np.abs(image_slice - expected_slice).max() < 3e-3

    def test_stable_diffusion_ddim(self):
        sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", safety_checker=None)
        sd_pipe.scheduler = DDIMScheduler.from_config(sd_pipe.scheduler.config)
        sd_pipe = sd_pipe.to(torch_device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_inputs(torch_device)
        image = sd_pipe(**inputs).images
        image_slice = image[0, -3:, -3:, -1].flatten()

        assert image.shape == (1, 512, 512, 3)
        expected_slice = np.array([0.38019, 0.28647, 0.27321, 0.40377, 0.38290, 0.35446, 0.39218, 0.38165, 0.42239])
        assert np.abs(image_slice - expected_slice).max() < 1e-4

    def test_stable_diffusion_lms(self):
        sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", safety_checker=None)
        sd_pipe.scheduler = LMSDiscreteScheduler.from_config(sd_pipe.scheduler.config)
        sd_pipe = sd_pipe.to(torch_device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_inputs(torch_device)
        image = sd_pipe(**inputs).images
        image_slice = image[0, -3:, -3:, -1].flatten()

        assert image.shape == (1, 512, 512, 3)
        expected_slice = np.array([0.10542, 0.09620, 0.07332, 0.09015, 0.09382, 0.07597, 0.08496, 0.07806, 0.06455])
        assert np.abs(image_slice - expected_slice).max() < 3e-3

    def test_stable_diffusion_dpm(self):
        sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", safety_checker=None)
        sd_pipe.scheduler = DPMSolverMultistepScheduler.from_config(
            sd_pipe.scheduler.config,
            final_sigmas_type="sigma_min",
        )
        sd_pipe = sd_pipe.to(torch_device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_inputs(torch_device)
        image = sd_pipe(**inputs).images
        image_slice = image[0, -3:, -3:, -1].flatten()

        assert image.shape == (1, 512, 512, 3)
        expected_slice = np.array([0.03503, 0.03494, 0.01087, 0.03128, 0.02552, 0.00803, 0.00742, 0.00372, 0.00000])
        assert np.abs(image_slice - expected_slice).max() < 3e-3

    def test_stable_diffusion_attention_slicing(self):
        torch.cuda.reset_peak_memory_stats()
        pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16)
        pipe.unet.set_default_attn_processor()
        pipe = pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)

        # enable attention slicing
        pipe.enable_attention_slicing()
        inputs = self.get_inputs(torch_device, dtype=torch.float16)
        image_sliced = pipe(**inputs).images

        mem_bytes = torch.cuda.max_memory_allocated()
        torch.cuda.reset_peak_memory_stats()
        # make sure that less than 3.75 GB is allocated
        assert mem_bytes < 3.75 * 10**9

        # disable slicing
        pipe.disable_attention_slicing()
        pipe.unet.set_default_attn_processor()
        inputs = self.get_inputs(torch_device, dtype=torch.float16)
        image = pipe(**inputs).images

        # make sure that more than 3.75 GB is allocated
        mem_bytes = torch.cuda.max_memory_allocated()
        assert mem_bytes > 3.75 * 10**9
        max_diff = numpy_cosine_similarity_distance(image_sliced.flatten(), image.flatten())
        assert max_diff < 1e-3
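    # Attention slicing trades speed for memory: the attention maps are computed in
    # sequential slices instead of one large batch, which is why the sliced run is
    # expected to stay under ~3.75 GB while the unsliced run exceeds it, with only a
    # tiny cosine distance between the two outputs.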

    def test_stable_diffusion_vae_slicing(self):
        torch.cuda.reset_peak_memory_stats()
        pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16)
        pipe = pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)
        pipe.enable_attention_slicing()

        # enable vae slicing
        pipe.enable_vae_slicing()
        inputs = self.get_inputs(torch_device, dtype=torch.float16)
        inputs["prompt"] = [inputs["prompt"]] * 4
        inputs["latents"] = torch.cat([inputs["latents"]] * 4)
        image_sliced = pipe(**inputs).images

        mem_bytes = torch.cuda.max_memory_allocated()
        torch.cuda.reset_peak_memory_stats()
        # make sure that less than 4 GB is allocated
        assert mem_bytes < 4e9

        # disable vae slicing
        pipe.disable_vae_slicing()
        inputs = self.get_inputs(torch_device, dtype=torch.float16)
        inputs["prompt"] = [inputs["prompt"]] * 4
        inputs["latents"] = torch.cat([inputs["latents"]] * 4)
        image = pipe(**inputs).images

        # make sure that more than 4 GB is allocated
        mem_bytes = torch.cuda.max_memory_allocated()
        assert mem_bytes > 4e9
        # There is a small discrepancy at the image borders vs. a fully batched version.
        max_diff = numpy_cosine_similarity_distance(image_sliced.flatten(), image.flatten())
        assert max_diff < 1e-2

    def test_stable_diffusion_vae_tiling(self):
        torch.cuda.reset_peak_memory_stats()
        model_id = "CompVis/stable-diffusion-v1-4"
        pipe = StableDiffusionPipeline.from_pretrained(
            model_id, variant="fp16", torch_dtype=torch.float16, safety_checker=None
        )
        pipe.set_progress_bar_config(disable=None)
        pipe.enable_attention_slicing()
        pipe.unet = pipe.unet.to(memory_format=torch.channels_last)
        pipe.vae = pipe.vae.to(memory_format=torch.channels_last)

        prompt = "a photograph of an astronaut riding a horse"

        # enable vae tiling
        pipe.enable_vae_tiling()
        pipe.enable_model_cpu_offload()
        generator = torch.Generator(device="cpu").manual_seed(0)
        output_chunked = pipe(
            [prompt],
            width=1024,
            height=1024,
            generator=generator,
            guidance_scale=7.5,
            num_inference_steps=2,
            output_type="np",
        )
        image_chunked = output_chunked.images

        mem_bytes = torch.cuda.max_memory_allocated()

        # disable vae tiling
        pipe.disable_vae_tiling()
        generator = torch.Generator(device="cpu").manual_seed(0)
        output = pipe(
            [prompt],
            width=1024,
            height=1024,
            generator=generator,
            guidance_scale=7.5,
            num_inference_steps=2,
            output_type="np",
        )
        image = output.images

        assert mem_bytes < 1e10
        max_diff = numpy_cosine_similarity_distance(image_chunked.flatten(), image.flatten())
        assert max_diff < 1e-2

    def test_stable_diffusion_fp16_vs_autocast(self):
        # this test makes sure that the original model with autocast
        # and the new model with fp16 yield the same result
        pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16)
        pipe = pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)

        inputs = self.get_inputs(torch_device, dtype=torch.float16)
        image_fp16 = pipe(**inputs).images

        with torch.autocast(torch_device):
            inputs = self.get_inputs(torch_device)
            image_autocast = pipe(**inputs).images

        # Make sure results are close enough
        diff = np.abs(image_fp16.flatten() - image_autocast.flatten())
        # They ARE different since ops are not run always at the same precision
        # however, they should be extremely close.
        assert diff.mean() < 2e-2

    def test_stable_diffusion_intermediate_state(self):
        number_of_steps = 0

        def callback_fn(step: int, timestep: int, latents: torch.Tensor) -> None:
            callback_fn.has_been_called = True
            nonlocal number_of_steps
            number_of_steps += 1
            if step == 1:
                latents = latents.detach().cpu().numpy()
                assert latents.shape == (1, 4, 64, 64)
                latents_slice = latents[0, -3:, -3:, -1]
                expected_slice = np.array(
                    [-0.5693, -0.3018, -0.9746, 0.0518, -0.8770, 0.7559, -1.7402, 0.1022, 1.1582]
                )

                assert np.abs(latents_slice.flatten() - expected_slice).max() < 5e-2
            elif step == 2:
                latents = latents.detach().cpu().numpy()
                assert latents.shape == (1, 4, 64, 64)
                latents_slice = latents[0, -3:, -3:, -1]
                expected_slice = np.array(
                    [-0.1958, -0.2993, -1.0166, -0.5005, -0.4810, 0.6162, -0.9492, 0.6621, 1.4492]
                )

                assert np.abs(latents_slice.flatten() - expected_slice).max() < 5e-2

        callback_fn.has_been_called = False

        pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16)
        pipe = pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)
        pipe.enable_attention_slicing()

        inputs = self.get_inputs(torch_device, dtype=torch.float16)
        pipe(**inputs, callback=callback_fn, callback_steps=1)
        assert callback_fn.has_been_called
        assert number_of_steps == inputs["num_inference_steps"]

    def test_stable_diffusion_low_cpu_mem_usage(self):
        pipeline_id = "CompVis/stable-diffusion-v1-4"

        start_time = time.time()
        pipeline_low_cpu_mem_usage = StableDiffusionPipeline.from_pretrained(pipeline_id, torch_dtype=torch.float16)
        pipeline_low_cpu_mem_usage.to(torch_device)
        low_cpu_mem_usage_time = time.time() - start_time

        start_time = time.time()
        _ = StableDiffusionPipeline.from_pretrained(pipeline_id, torch_dtype=torch.float16, low_cpu_mem_usage=False)
        normal_load_time = time.time() - start_time

        assert 2 * low_cpu_mem_usage_time < normal_load_time

    def test_stable_diffusion_pipeline_with_sequential_cpu_offloading(self):
        torch.cuda.empty_cache()
        torch.cuda.reset_max_memory_allocated()
        torch.cuda.reset_peak_memory_stats()

        pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16)
        pipe.set_progress_bar_config(disable=None)
        pipe.enable_attention_slicing(1)
        pipe.enable_sequential_cpu_offload()

        inputs = self.get_inputs(torch_device, dtype=torch.float16)
        _ = pipe(**inputs)

        mem_bytes = torch.cuda.max_memory_allocated()
        # make sure that less than 2.8 GB is allocated
        assert mem_bytes < 2.8 * 10**9
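    # enable_sequential_cpu_offload() moves individual submodule weights onto the GPU
    # only for their forward pass, which is slow but keeps peak VRAM very low (under
    # 2.8 GB here); enable_model_cpu_offload(), used in the next test, keeps whole
    # sub-models (text encoder, UNet, VAE) on the GPU one at a time and sits between
    # full GPU placement and sequential offloading on the memory/speed trade-off.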

    def test_stable_diffusion_pipeline_with_model_offloading(self):
        torch.cuda.empty_cache()
        torch.cuda.reset_max_memory_allocated()
        torch.cuda.reset_peak_memory_stats()

        inputs = self.get_inputs(torch_device, dtype=torch.float16)

        # Normal inference

        pipe = StableDiffusionPipeline.from_pretrained(
            "CompVis/stable-diffusion-v1-4",
            torch_dtype=torch.float16,
        )
        pipe.unet.set_default_attn_processor()
        pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)
        outputs = pipe(**inputs)
        mem_bytes = torch.cuda.max_memory_allocated()

        # With model offloading

        # Reload but don't move to cuda
        pipe = StableDiffusionPipeline.from_pretrained(
            "CompVis/stable-diffusion-v1-4",
            torch_dtype=torch.float16,
        )
        pipe.unet.set_default_attn_processor()

        torch.cuda.empty_cache()
        torch.cuda.reset_max_memory_allocated()
        torch.cuda.reset_peak_memory_stats()

        pipe.enable_model_cpu_offload()
        pipe.set_progress_bar_config(disable=None)
        inputs = self.get_inputs(torch_device, dtype=torch.float16)

        outputs_offloaded = pipe(**inputs)
        mem_bytes_offloaded = torch.cuda.max_memory_allocated()

        images = outputs.images
        offloaded_images = outputs_offloaded.images

        max_diff = numpy_cosine_similarity_distance(images.flatten(), offloaded_images.flatten())
        assert max_diff < 1e-3
        assert mem_bytes_offloaded < mem_bytes
        assert mem_bytes_offloaded < 3.5 * 10**9
        for module in pipe.text_encoder, pipe.unet, pipe.vae:
            assert module.device == torch.device("cpu")

        # With attention slicing
        torch.cuda.empty_cache()
        torch.cuda.reset_max_memory_allocated()
        torch.cuda.reset_peak_memory_stats()

        pipe.enable_attention_slicing()
        _ = pipe(**inputs)
        mem_bytes_slicing = torch.cuda.max_memory_allocated()

        assert mem_bytes_slicing < mem_bytes_offloaded
        assert mem_bytes_slicing < 3 * 10**9

    def test_stable_diffusion_textual_inversion(self):
        pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4")
        pipe.load_textual_inversion("sd-concepts-library/low-poly-hd-logos-icons")

        a111_file = hf_hub_download("hf-internal-testing/text_inv_embedding_a1111_format", "winter_style.pt")
        a111_file_neg = hf_hub_download(
            "hf-internal-testing/text_inv_embedding_a1111_format", "winter_style_negative.pt"
        )
        pipe.load_textual_inversion(a111_file)
        pipe.load_textual_inversion(a111_file_neg)
        pipe.to("cuda")

        generator = torch.Generator(device="cpu").manual_seed(1)

        prompt = "An logo of a turtle in strong Style-Winter with <low-poly-hd-logos-icons>"
        neg_prompt = "Style-Winter-neg"

        image = pipe(prompt=prompt, negative_prompt=neg_prompt, generator=generator, output_type="np").images[0]
        expected_image = load_numpy(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/text_inv/winter_logo_style.npy"
        )

        max_diff = np.abs(expected_image - image).max()
        assert max_diff < 8e-1

    def test_stable_diffusion_textual_inversion_with_model_cpu_offload(self):
        pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4")
        pipe.enable_model_cpu_offload()
        pipe.load_textual_inversion("sd-concepts-library/low-poly-hd-logos-icons")

        a111_file = hf_hub_download("hf-internal-testing/text_inv_embedding_a1111_format", "winter_style.pt")
        a111_file_neg = hf_hub_download(
            "hf-internal-testing/text_inv_embedding_a1111_format", "winter_style_negative.pt"
        )
        pipe.load_textual_inversion(a111_file)
        pipe.load_textual_inversion(a111_file_neg)

        generator = torch.Generator(device="cpu").manual_seed(1)

        prompt = "An logo of a turtle in strong Style-Winter with <low-poly-hd-logos-icons>"
        neg_prompt = "Style-Winter-neg"

        image = pipe(prompt=prompt, negative_prompt=neg_prompt, generator=generator, output_type="np").images[0]
        expected_image = load_numpy(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/text_inv/winter_logo_style.npy"
        )

        max_diff = np.abs(expected_image - image).max()
        assert max_diff < 8e-1

    def test_stable_diffusion_textual_inversion_with_sequential_cpu_offload(self):
        pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4")
        pipe.enable_sequential_cpu_offload()
        pipe.load_textual_inversion("sd-concepts-library/low-poly-hd-logos-icons")

        a111_file = hf_hub_download("hf-internal-testing/text_inv_embedding_a1111_format", "winter_style.pt")
        a111_file_neg = hf_hub_download(
            "hf-internal-testing/text_inv_embedding_a1111_format", "winter_style_negative.pt"
        )
        pipe.load_textual_inversion(a111_file)
        pipe.load_textual_inversion(a111_file_neg)

        generator = torch.Generator(device="cpu").manual_seed(1)

        prompt = "An logo of a turtle in strong Style-Winter with <low-poly-hd-logos-icons>"
        neg_prompt = "Style-Winter-neg"

        image = pipe(prompt=prompt, negative_prompt=neg_prompt, generator=generator, output_type="np").images[0]
        expected_image = load_numpy(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/text_inv/winter_logo_style.npy"
        )

        max_diff = np.abs(expected_image - image).max()
        assert max_diff < 8e-1

    @is_torch_compile
    @require_torch_2
    def test_stable_diffusion_compile(self):
        seed = 0
        inputs = self.get_inputs(torch_device, seed=seed)
        # Can't pickle a Generator object
        del inputs["generator"]
        inputs["torch_device"] = torch_device
        inputs["seed"] = seed
        run_test_in_subprocess(test_case=self, target_func=_test_stable_diffusion_compile, inputs=inputs)

    def test_stable_diffusion_lcm(self):
        unet = UNet2DConditionModel.from_pretrained("SimianLuo/LCM_Dreamshaper_v7", subfolder="unet")
        sd_pipe = StableDiffusionPipeline.from_pretrained("Lykon/dreamshaper-7", unet=unet).to(torch_device)
        sd_pipe.scheduler = LCMScheduler.from_config(sd_pipe.scheduler.config)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_inputs(torch_device)
        inputs["num_inference_steps"] = 6
        inputs["output_type"] = "pil"

        image = sd_pipe(**inputs).images[0]

        expected_image = load_image(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/lcm_full/stable_diffusion_lcm.png"
        )

        image = sd_pipe.image_processor.pil_to_numpy(image)
        expected_image = sd_pipe.image_processor.pil_to_numpy(expected_image)

        max_diff = numpy_cosine_similarity_distance(image.flatten(), expected_image.flatten())

        assert max_diff < 1e-2

@slow
@require_torch_gpu
class StableDiffusionPipelineCkptTests(unittest.TestCase):
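    # Loading original single-file Stable Diffusion checkpoints via `from_single_file`.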
    def setUp(self):
        super().setUp()
        gc.collect()
        torch.cuda.empty_cache()

    def tearDown(self):
        super().tearDown()
        gc.collect()
        torch.cuda.empty_cache()

    def test_download_from_hub(self):
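        # `from_single_file` should accept direct Hub URLs to `.safetensors` checkpoint files.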
        ckpt_paths = [
            "https://huggingface.co/stable-diffusion-v1-5/stable-diffusion-v1-5/blob/main/v1-5-pruned-emaonly.safetensors",
            "https://huggingface.co/WarriorMama777/OrangeMixs/blob/main/Models/AbyssOrangeMix/AbyssOrangeMix.safetensors",
        ]

        for ckpt_path in ckpt_paths:
            pipe = StableDiffusionPipeline.from_single_file(ckpt_path, torch_dtype=torch.float16)
            pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
            pipe.to("cuda")
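        # Only the most recently loaded pipeline is run below; loading every checkpoint without error
        # is the main point of the loop.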

        image_out = pipe("test", num_inference_steps=1, output_type="np").images[0]

        assert image_out.shape == (512, 512, 3)

    def test_download_local(self):
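        # `from_single_file` should also work from a local checkpoint path, here paired with an explicit
        # original `v1-inference.yaml` config via `config_files`.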
        ckpt_filename = hf_hub_download(
            "stable-diffusion-v1-5/stable-diffusion-v1-5", filename="v1-5-pruned-emaonly.safetensors"
        )
        config_filename = hf_hub_download("stable-diffusion-v1-5/stable-diffusion-v1-5", filename="v1-inference.yaml")

        pipe = StableDiffusionPipeline.from_single_file(
            ckpt_filename, config_files={"v1": config_filename}, torch_dtype=torch.float16
        )
        pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
        pipe.to("cuda")

        image_out = pipe("test", num_inference_steps=1, output_type="np").images[0]

        assert image_out.shape == (512, 512, 3)


@nightly
@require_torch_gpu
class StableDiffusionPipelineNightlyTests(unittest.TestCase):
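    # Nightly regression tests: full 50-step generations are compared against reference arrays from the
    # `diffusers/test-arrays` dataset for several schedulers.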
    def setUp(self):
        super().setUp()
        gc.collect()
        torch.cuda.empty_cache()

    def tearDown(self):
        super().tearDown()
        gc.collect()
        torch.cuda.empty_cache()

    def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0):
        generator = torch.Generator(device=generator_device).manual_seed(seed)
        latents = np.random.RandomState(seed).standard_normal((1, 4, 64, 64))
        latents = torch.from_numpy(latents).to(device=device, dtype=dtype)
        inputs = {
            "prompt": "a photograph of an astronaut riding a horse",
            "latents": latents,
            "generator": generator,
            "num_inference_steps": 50,
            "guidance_scale": 7.5,
            "output_type": "np",
        }
        return inputs

    def test_stable_diffusion_1_4_pndm(self):
        sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4").to(torch_device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_inputs(torch_device)
        image = sd_pipe(**inputs).images[0]

        expected_image = load_numpy(
            "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main"
            "/stable_diffusion_text2img/stable_diffusion_1_4_pndm.npy"
        )
        max_diff = np.abs(expected_image - image).max()
        assert max_diff < 1e-3

    def test_stable_diffusion_1_5_pndm(self):
        sd_pipe = StableDiffusionPipeline.from_pretrained("stable-diffusion-v1-5/stable-diffusion-v1-5").to(
            torch_device
        )
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_inputs(torch_device)
        image = sd_pipe(**inputs).images[0]

        expected_image = load_numpy(
            "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main"
            "/stable_diffusion_text2img/stable_diffusion_1_5_pndm.npy"
        )
        max_diff = np.abs(expected_image - image).max()
        assert max_diff < 1e-3

    def test_stable_diffusion_ddim(self):
        sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4").to(torch_device)
        sd_pipe.scheduler = DDIMScheduler.from_config(sd_pipe.scheduler.config)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_inputs(torch_device)
        image = sd_pipe(**inputs).images[0]

        expected_image = load_numpy(
            "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main"
            "/stable_diffusion_text2img/stable_diffusion_1_4_ddim.npy"
        )
        max_diff = np.abs(expected_image - image).max()
        assert max_diff < 3e-3

    def test_stable_diffusion_lms(self):
        sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4").to(torch_device)
        sd_pipe.scheduler = LMSDiscreteScheduler.from_config(sd_pipe.scheduler.config)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_inputs(torch_device)
        image = sd_pipe(**inputs).images[0]

        expected_image = load_numpy(
            "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main"
            "/stable_diffusion_text2img/stable_diffusion_1_4_lms.npy"
        )
        max_diff = np.abs(expected_image - image).max()
        assert max_diff < 1e-3

    def test_stable_diffusion_euler(self):
        sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4").to(torch_device)
        sd_pipe.scheduler = EulerDiscreteScheduler.from_config(sd_pipe.scheduler.config)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_inputs(torch_device)
        image = sd_pipe(**inputs).images[0]

        expected_image = load_numpy(
            "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main"
            "/stable_diffusion_text2img/stable_diffusion_1_4_euler.npy"
        )
        max_diff = np.abs(expected_image - image).max()
        assert max_diff < 1e-3


# (sayakpaul): This test suite was run on a DGX machine with two GPUs (1, 2).
@slow
@require_torch_multi_gpu
@require_accelerate_version_greater("0.27.0")
class StableDiffusionPipelineDeviceMapTests(unittest.TestCase):
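    # Checks that `device_map="balanced"` shards the pipeline across the available GPUs while producing
    # the same images as a single-device run.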
    def tearDown(self):
        super().tearDown()
        gc.collect()
        torch.cuda.empty_cache()

    def get_inputs(self, generator_device="cpu", seed=0):
        generator = torch.Generator(device=generator_device).manual_seed(seed)
        inputs = {
            "prompt": "a photograph of an astronaut riding a horse",
            "generator": generator,
            "num_inference_steps": 50,
            "guidance_scale": 7.5,
            "output_type": "np",
        }
        return inputs

    def get_pipeline_output_without_device_map(self):
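        # Baseline image generated on a single device, used for comparison against device-mapped runs.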
        sd_pipe = StableDiffusionPipeline.from_pretrained(
            "stable-diffusion-v1-5/stable-diffusion-v1-5", torch_dtype=torch.float16
        ).to(torch_device)
        sd_pipe.set_progress_bar_config(disable=True)
        inputs = self.get_inputs()
        no_device_map_image = sd_pipe(**inputs).images

        del sd_pipe

        return no_device_map_image

    def test_forward_pass_balanced_device_map(self):
        no_device_map_image = self.get_pipeline_output_without_device_map()

        sd_pipe_with_device_map = StableDiffusionPipeline.from_pretrained(
            "stable-diffusion-v1-5/stable-diffusion-v1-5", device_map="balanced", torch_dtype=torch.float16
        )
        sd_pipe_with_device_map.set_progress_bar_config(disable=True)
        inputs = self.get_inputs()
        device_map_image = sd_pipe_with_device_map(**inputs).images

        max_diff = np.abs(device_map_image - no_device_map_image).max()
        assert max_diff < 1e-3

    def test_components_put_in_right_devices(self):
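        # With a balanced device map, pipeline components should end up spread over at least two devices.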
        sd_pipe_with_device_map = StableDiffusionPipeline.from_pretrained(
            "stable-diffusion-v1-5/stable-diffusion-v1-5", device_map="balanced", torch_dtype=torch.float16
        )

        assert len(set(sd_pipe_with_device_map.hf_device_map.values())) >= 2

    def test_max_memory(self):
        no_device_map_image = self.get_pipeline_output_without_device_map()

        sd_pipe_with_device_map = StableDiffusionPipeline.from_pretrained(
            "stable-diffusion-v1-5/stable-diffusion-v1-5",
            device_map="balanced",
            max_memory={0: "1GB", 1: "1GB"},
            torch_dtype=torch.float16,
        )
        sd_pipe_with_device_map.set_progress_bar_config(disable=True)
        inputs = self.get_inputs()
        device_map_image = sd_pipe_with_device_map(**inputs).images

        max_diff = np.abs(device_map_image - no_device_map_image).max()
        assert max_diff < 1e-3

    def test_reset_device_map(self):
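        # `reset_device_map()` should clear `hf_device_map` and move all torch modules back to the CPU.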
        sd_pipe_with_device_map = StableDiffusionPipeline.from_pretrained(
            "stable-diffusion-v1-5/stable-diffusion-v1-5", device_map="balanced", torch_dtype=torch.float16
        )
        sd_pipe_with_device_map.reset_device_map()

        assert sd_pipe_with_device_map.hf_device_map is None

        for name, component in sd_pipe_with_device_map.components.items():
            if isinstance(component, torch.nn.Module):
                assert component.device.type == "cpu"

    def test_reset_device_map_to(self):
        sd_pipe_with_device_map = StableDiffusionPipeline.from_pretrained(
            "stable-diffusion-v1-5/stable-diffusion-v1-5", device_map="balanced", torch_dtype=torch.float16
        )
        sd_pipe_with_device_map.reset_device_map()

        assert sd_pipe_with_device_map.hf_device_map is None

        # Make sure `to()` can be used and the pipeline can be called.
        pipe = sd_pipe_with_device_map.to("cuda")
        _ = pipe("hello", num_inference_steps=2)

    def test_reset_device_map_enable_model_cpu_offload(self):
        sd_pipe_with_device_map = StableDiffusionPipeline.from_pretrained(
            "stable-diffusion-v1-5/stable-diffusion-v1-5", device_map="balanced", torch_dtype=torch.float16
        )
        sd_pipe_with_device_map.reset_device_map()

        assert sd_pipe_with_device_map.hf_device_map is None

        # Make sure `enable_model_cpu_offload()` can be used and the pipeline can be called.
        sd_pipe_with_device_map.enable_model_cpu_offload()
        _ = sd_pipe_with_device_map("hello", num_inference_steps=2)

    def test_reset_device_map_enable_sequential_cpu_offload(self):
        sd_pipe_with_device_map = StableDiffusionPipeline.from_pretrained(
            "stable-diffusion-v1-5/stable-diffusion-v1-5", device_map="balanced", torch_dtype=torch.float16
        )
        sd_pipe_with_device_map.reset_device_map()

        assert sd_pipe_with_device_map.hf_device_map is None

        # Make sure `enable_sequential_cpu_offload()` can be used and the pipeline can be called.
        sd_pipe_with_device_map.enable_sequential_cpu_offload()
        _ = sd_pipe_with_device_map("hello", num_inference_steps=2)