# coding=utf-8
# Copyright 2023 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import gc
import random
import traceback
import unittest

import numpy as np
import torch

from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer

from diffusers import (
    AutoencoderKL,
    DDIMScheduler,
    DPMSolverMultistepScheduler,
    HeunDiscreteScheduler,
    LMSDiscreteScheduler,
    PNDMScheduler,
    StableDiffusionImg2ImgPipeline,
    UNet2DConditionModel,
)
from diffusers.utils.testing_utils import (
    enable_full_determinism,
    floats_tensor,
    load_image,
    load_numpy,
    nightly,
    require_python39_or_higher,
    require_torch_2,
    require_torch_gpu,
    run_test_in_subprocess,
    skip_mps,
    slow,
    torch_device,
)

from ..pipeline_params import (
    IMAGE_TO_IMAGE_IMAGE_PARAMS,
    TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS,
    TEXT_GUIDED_IMAGE_VARIATION_PARAMS,
)
from ..test_pipelines_common import PipelineKarrasSchedulerTesterMixin, PipelineLatentTesterMixin, PipelineTesterMixin


enable_full_determinism()


# Will be run via run_test_in_subprocess
def _test_img2img_compile(in_queue, out_queue, timeout):
    error = None
    try:
        inputs = in_queue.get(timeout=timeout)
        torch_device = inputs.pop("torch_device")
        seed = inputs.pop("seed")
        inputs["generator"] = torch.Generator(device=torch_device).manual_seed(seed)

        pipe = StableDiffusionImg2ImgPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", safety_checker=None)
        pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
        pipe.unet.set_default_attn_processor()
        pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)
        pipe.unet.to(memory_format=torch.channels_last)
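        # "reduce-overhead" mode captures CUDA graphs, and fullgraph=True makes any graph break in the UNet fail loudly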
        pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)

        image = pipe(**inputs).images
        image_slice = image[0, -3:, -3:, -1].flatten()

        assert image.shape == (1, 512, 768, 3)
        expected_slice = np.array([0.0606, 0.0570, 0.0805, 0.0579, 0.0628, 0.0623, 0.0843, 0.1115, 0.0806])

        assert np.abs(expected_slice - image_slice).max() < 1e-3
    except Exception:
        error = f"{traceback.format_exc()}"

    results = {"error": error}
    out_queue.put(results, timeout=timeout)
    out_queue.join()


class StableDiffusionImg2ImgPipelineFastTests(
    PipelineLatentTesterMixin, PipelineKarrasSchedulerTesterMixin, PipelineTesterMixin, unittest.TestCase
):
    pipeline_class = StableDiffusionImg2ImgPipeline
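    # img2img takes its output resolution from the init image, so height/width are not user-facing params here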
    params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS - {"height", "width"}
    required_optional_params = PipelineTesterMixin.required_optional_params - {"latents"}
    batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS
    image_params = IMAGE_TO_IMAGE_IMAGE_PARAMS
    image_latents_params = IMAGE_TO_IMAGE_IMAGE_PARAMS

    def get_dummy_components(self):
        torch.manual_seed(0)
        unet = UNet2DConditionModel(
            block_out_channels=(32, 64),
            layers_per_block=2,
            sample_size=32,
            in_channels=4,
            out_channels=4,
            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
            cross_attention_dim=32,
        )
        scheduler = PNDMScheduler(skip_prk_steps=True)
        torch.manual_seed(0)
        vae = AutoencoderKL(
            block_out_channels=[32, 64],
            in_channels=3,
            out_channels=3,
            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
            latent_channels=4,
        )
        torch.manual_seed(0)
        text_encoder_config = CLIPTextConfig(
            bos_token_id=0,
            eos_token_id=2,
            hidden_size=32,
            intermediate_size=37,
            layer_norm_eps=1e-05,
            num_attention_heads=4,
            num_hidden_layers=5,
            pad_token_id=1,
            vocab_size=1000,
        )
        text_encoder = CLIPTextModel(text_encoder_config)
        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")

        components = {
            "unet": unet,
            "scheduler": scheduler,
            "vae": vae,
            "text_encoder": text_encoder,
            "tokenizer": tokenizer,
            "safety_checker": None,
            "feature_extractor": None,
        }
        return components

    def get_dummy_inputs(self, device, seed=0):
        image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device)
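        # floats_tensor samples uniformly in [0, 1); the shift below keeps the fake image in [0.5, 1.0)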
        image = image / 2 + 0.5
        if str(device).startswith("mps"):
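            # device-local torch.Generator objects are not supported on MPS, so seed the global RNG instead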
            generator = torch.manual_seed(seed)
        else:
            generator = torch.Generator(device=device).manual_seed(seed)
        inputs = {
            "prompt": "A painting of a squirrel eating a burger",
            "image": image,
            "generator": generator,
            "num_inference_steps": 2,
            "guidance_scale": 6.0,
            "output_type": "numpy",
        }
        return inputs

    def test_stable_diffusion_img2img_default_case(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        components = self.get_dummy_components()
        sd_pipe = StableDiffusionImg2ImgPipeline(**components)
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(device)
        image = sd_pipe(**inputs).images
        image_slice = image[0, -3:, -3:, -1]

        assert image.shape == (1, 32, 32, 3)
        expected_slice = np.array([0.4555, 0.3216, 0.4049, 0.4620, 0.4618, 0.4126, 0.4122, 0.4629, 0.4579])

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3

    def test_stable_diffusion_img2img_negative_prompt(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        components = self.get_dummy_components()
        sd_pipe = StableDiffusionImg2ImgPipeline(**components)
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(device)
        negative_prompt = "french fries"
        output = sd_pipe(**inputs, negative_prompt=negative_prompt)
        image = output.images
        image_slice = image[0, -3:, -3:, -1]

        assert image.shape == (1, 32, 32, 3)
        expected_slice = np.array([0.4593, 0.3408, 0.4232, 0.4749, 0.4476, 0.4115, 0.4357, 0.4733, 0.4663])

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3

    def test_stable_diffusion_img2img_multiple_init_images(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        components = self.get_dummy_components()
        sd_pipe = StableDiffusionImg2ImgPipeline(**components)
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(device)
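        # batch the prompt and the init image along dim 0 so the pipeline returns two images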
        inputs["prompt"] = [inputs["prompt"]] * 2
        inputs["image"] = inputs["image"].repeat(2, 1, 1, 1)
        image = sd_pipe(**inputs).images
        image_slice = image[-1, -3:, -3:, -1]

        assert image.shape == (2, 32, 32, 3)
        expected_slice = np.array([0.4241, 0.5576, 0.5711, 0.4792, 0.4311, 0.5952, 0.5827, 0.5138, 0.5109])

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3

    def test_stable_diffusion_img2img_k_lms(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        components = self.get_dummy_components()
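        # swap the default PNDM scheduler from get_dummy_components for k-LMS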
        components["scheduler"] = LMSDiscreteScheduler(
            beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear"
        )
        sd_pipe = StableDiffusionImg2ImgPipeline(**components)
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(device)
        image = sd_pipe(**inputs).images
        image_slice = image[0, -3:, -3:, -1]

        assert image.shape == (1, 32, 32, 3)
        expected_slice = np.array([0.4398, 0.4949, 0.4337, 0.6580, 0.5555, 0.4338, 0.5769, 0.5955, 0.5175])

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3

    @skip_mps
    def test_save_load_local(self):
        return super().test_save_load_local()

    @skip_mps
    def test_dict_tuple_outputs_equivalent(self):
        return super().test_dict_tuple_outputs_equivalent()

    @skip_mps
    def test_save_load_optional_components(self):
        return super().test_save_load_optional_components()

    @skip_mps
    def test_attention_slicing_forward_pass(self):
        return super().test_attention_slicing_forward_pass(expected_max_diff=5e-3)

    def test_inference_batch_single_identical(self):
        super().test_inference_batch_single_identical(expected_max_diff=3e-3)

    def test_float16_inference(self):
        super().test_float16_inference(expected_max_diff=5e-1)


@slow
@require_torch_gpu
class StableDiffusionImg2ImgPipelineSlowTests(unittest.TestCase):
    def tearDown(self):
        super().tearDown()
        gc.collect()
        torch.cuda.empty_cache()

    def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0):
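        # seed a CPU generator by default so the expected slices stay reproducible across GPU types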
        generator = torch.Generator(device=generator_device).manual_seed(seed)
        init_image = load_image(
            "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main"
            "/stable_diffusion_img2img/sketch-mountains-input.png"
        )
        inputs = {
            "prompt": "a fantasy landscape, concept art, high resolution",
            "image": init_image,
            "generator": generator,
            "num_inference_steps": 3,
            "strength": 0.75,
            "guidance_scale": 7.5,
            "output_type": "np",
        }
        return inputs

    def test_stable_diffusion_img2img_default(self):
        pipe = StableDiffusionImg2ImgPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", safety_checker=None)
        pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)
        pipe.enable_attention_slicing()

        inputs = self.get_inputs(torch_device)
        image = pipe(**inputs).images
        image_slice = image[0, -3:, -3:, -1].flatten()

        assert image.shape == (1, 512, 768, 3)
        expected_slice = np.array([0.4300, 0.4662, 0.4930, 0.3990, 0.4307, 0.4525, 0.3719, 0.4064, 0.3923])

        assert np.abs(expected_slice - image_slice).max() < 1e-3

    def test_stable_diffusion_img2img_k_lms(self):
        pipe = StableDiffusionImg2ImgPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", safety_checker=None)
        pipe.scheduler = LMSDiscreteScheduler.from_config(pipe.scheduler.config)
        pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)
        pipe.enable_attention_slicing()

        inputs = self.get_inputs(torch_device)
        image = pipe(**inputs).images
        image_slice = image[0, -3:, -3:, -1].flatten()

        assert image.shape == (1, 512, 768, 3)
        expected_slice = np.array([0.0389, 0.0346, 0.0415, 0.0290, 0.0218, 0.0210, 0.0408, 0.0567, 0.0271])

        assert np.abs(expected_slice - image_slice).max() < 1e-3

    def test_stable_diffusion_img2img_ddim(self):
        pipe = StableDiffusionImg2ImgPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", safety_checker=None)
        pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
        pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)
        pipe.enable_attention_slicing()

        inputs = self.get_inputs(torch_device)
        image = pipe(**inputs).images
        image_slice = image[0, -3:, -3:, -1].flatten()

        assert image.shape == (1, 512, 768, 3)
        expected_slice = np.array([0.0593, 0.0607, 0.0851, 0.0582, 0.0636, 0.0721, 0.0751, 0.0981, 0.0781])

        assert np.abs(expected_slice - image_slice).max() < 1e-3

    def test_stable_diffusion_img2img_intermediate_state(self):
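        # with callback_steps=1 the callback fires once per scheduler step; spot-check the fp16 latents at steps 1 and 2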
        number_of_steps = 0

        def callback_fn(step: int, timestep: int, latents: torch.FloatTensor) -> None:
            callback_fn.has_been_called = True
            nonlocal number_of_steps
            number_of_steps += 1
            if step == 1:
                latents = latents.detach().cpu().numpy()
                assert latents.shape == (1, 4, 64, 96)
                latents_slice = latents[0, -3:, -3:, -1]
                expected_slice = np.array([-0.4958, 0.5107, 1.1045, 2.7539, 4.6680, 3.8320, 1.5049, 1.8633, 2.6523])

                assert np.abs(latents_slice.flatten() - expected_slice).max() < 5e-2
            elif step == 2:
                latents = latents.detach().cpu().numpy()
                assert latents.shape == (1, 4, 64, 96)
                latents_slice = latents[0, -3:, -3:, -1]
                expected_slice = np.array([-0.4956, 0.5078, 1.0918, 2.7520, 4.6484, 3.8125, 1.5146, 1.8633, 2.6367])

                assert np.abs(latents_slice.flatten() - expected_slice).max() < 5e-2

        callback_fn.has_been_called = False

        pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
            "CompVis/stable-diffusion-v1-4", safety_checker=None, torch_dtype=torch.float16
        )
        pipe = pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)
        pipe.enable_attention_slicing()

        inputs = self.get_inputs(torch_device, dtype=torch.float16)
        pipe(**inputs, callback=callback_fn, callback_steps=1)
        assert callback_fn.has_been_called
        assert number_of_steps == 2

    def test_stable_diffusion_pipeline_with_sequential_cpu_offloading(self):
        torch.cuda.empty_cache()
        torch.cuda.reset_max_memory_allocated()
        torch.cuda.reset_peak_memory_stats()

        pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
            "CompVis/stable-diffusion-v1-4", safety_checker=None, torch_dtype=torch.float16
        )
        pipe = pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)
        pipe.enable_attention_slicing(1)
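        # sequential offload keeps weights on the CPU and streams each submodule to the GPU only for its forward pass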
        pipe.enable_sequential_cpu_offload()

        inputs = self.get_inputs(torch_device, dtype=torch.float16)
        _ = pipe(**inputs)

        mem_bytes = torch.cuda.max_memory_allocated()
        # make sure that less than 2.2 GB is allocated
        assert mem_bytes < 2.2 * 10**9

    def test_stable_diffusion_pipeline_with_model_offloading(self):
        torch.cuda.empty_cache()
        torch.cuda.reset_max_memory_allocated()
        torch.cuda.reset_peak_memory_stats()

        inputs = self.get_inputs(torch_device, dtype=torch.float16)

        # Normal inference

        pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
            "CompVis/stable-diffusion-v1-4",
            safety_checker=None,
            torch_dtype=torch.float16,
        )
        pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)
        pipe(**inputs)
        mem_bytes = torch.cuda.max_memory_allocated()

        # With model offloading

        # Reload but don't move to cuda
        pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
            "CompVis/stable-diffusion-v1-4",
            safety_checker=None,
            torch_dtype=torch.float16,
        )

        torch.cuda.empty_cache()
        torch.cuda.reset_max_memory_allocated()
        torch.cuda.reset_peak_memory_stats()

        pipe.enable_model_cpu_offload()
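        # model offload moves whole components (text encoder, UNet, VAE) to the GPU one at a time: coarser than sequential offload, but faster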
        pipe.set_progress_bar_config(disable=None)
        _ = pipe(**inputs)
        mem_bytes_offloaded = torch.cuda.max_memory_allocated()

        assert mem_bytes_offloaded < mem_bytes
        for module in pipe.text_encoder, pipe.unet, pipe.vae:
            assert module.device == torch.device("cpu")

    def test_img2img_2nd_order(self):
        sd_pipe = StableDiffusionImg2ImgPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
        sd_pipe.scheduler = HeunDiscreteScheduler.from_config(sd_pipe.scheduler.config)
        sd_pipe.to(torch_device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_inputs(torch_device)
        inputs["num_inference_steps"] = 10
        inputs["strength"] = 0.75
        image = sd_pipe(**inputs).images[0]

        expected_image = load_numpy(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/img2img/img2img_heun.npy"
        )
        max_diff = np.abs(expected_image - image).max()
        assert max_diff < 5e-2

        inputs = self.get_inputs(torch_device)
        inputs["num_inference_steps"] = 11
        inputs["strength"] = 0.75
        image_other = sd_pipe(**inputs).images[0]

        mean_diff = np.abs(image - image_other).mean()

        # images should be very similar
        assert mean_diff < 5e-2

    def test_stable_diffusion_img2img_pipeline_multiple_of_8(self):
        init_image = load_image(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
            "/img2img/sketch-mountains-input.jpg"
        )
        # resize to a resolution that is divisible by 8 but not by 16 or 32
        init_image = init_image.resize((760, 504))

        model_id = "CompVis/stable-diffusion-v1-4"
        pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
            model_id,
            safety_checker=None,
        )
        pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)
        pipe.enable_attention_slicing()

        prompt = "A fantasy landscape, trending on artstation"

        generator = torch.manual_seed(0)
        output = pipe(
            prompt=prompt,
            image=init_image,
            strength=0.75,
            guidance_scale=7.5,
            generator=generator,
            output_type="np",
        )
        image = output.images[0]

        image_slice = image[255:258, 383:386, -1]

        assert image.shape == (504, 760, 3)
        expected_slice = np.array([0.9393, 0.9500, 0.9399, 0.9438, 0.9458, 0.9400, 0.9455, 0.9414, 0.9423])

        assert np.abs(image_slice.flatten() - expected_slice).max() < 5e-3

    def test_img2img_safety_checker_works(self):
        sd_pipe = StableDiffusionImg2ImgPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
        sd_pipe.to(torch_device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_inputs(torch_device)
        inputs["num_inference_steps"] = 20
        # make sure the safety checker is activated
        inputs["prompt"] = "naked, sex, porn"
        out = sd_pipe(**inputs)

        assert out.nsfw_content_detected[0], f"Safety checker should work for prompt: {inputs['prompt']}"
        assert np.abs(out.images[0]).sum() < 1e-5  # should be all zeros

    @require_python39_or_higher
    @require_torch_2
    def test_img2img_compile(self):
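        # compilation runs in a fresh subprocess (_test_img2img_compile above) so torch.compile state stays isolated from other tests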
        seed = 0
        inputs = self.get_inputs(torch_device, seed=seed)
        # Can't pickle a Generator object
        del inputs["generator"]
        inputs["torch_device"] = torch_device
        inputs["seed"] = seed
        run_test_in_subprocess(test_case=self, target_func=_test_img2img_compile, inputs=inputs)


@nightly
@require_torch_gpu
class StableDiffusionImg2ImgPipelineNightlyTests(unittest.TestCase):
    def tearDown(self):
        super().tearDown()
        gc.collect()
        torch.cuda.empty_cache()

    def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0):
        generator = torch.Generator(device=generator_device).manual_seed(seed)
        init_image = load_image(
            "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main"
            "/stable_diffusion_img2img/sketch-mountains-input.png"
        )
        inputs = {
            "prompt": "a fantasy landscape, concept art, high resolution",
            "image": init_image,
            "generator": generator,
            "num_inference_steps": 50,
            "strength": 0.75,
            "guidance_scale": 7.5,
            "output_type": "np",
        }
        return inputs

    def test_img2img_pndm(self):
        sd_pipe = StableDiffusionImg2ImgPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
        sd_pipe.to(torch_device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_inputs(torch_device)
        image = sd_pipe(**inputs).images[0]

        expected_image = load_numpy(
            "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main"
            "/stable_diffusion_img2img/stable_diffusion_1_5_pndm.npy"
        )
        max_diff = np.abs(expected_image - image).max()
        assert max_diff < 1e-3

    def test_img2img_ddim(self):
        sd_pipe = StableDiffusionImg2ImgPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
        sd_pipe.scheduler = DDIMScheduler.from_config(sd_pipe.scheduler.config)
        sd_pipe.to(torch_device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_inputs(torch_device)
        image = sd_pipe(**inputs).images[0]

        expected_image = load_numpy(
            "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main"
            "/stable_diffusion_img2img/stable_diffusion_1_5_ddim.npy"
        )
        max_diff = np.abs(expected_image - image).max()
        assert max_diff < 1e-3

    def test_img2img_lms(self):
        sd_pipe = StableDiffusionImg2ImgPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
        sd_pipe.scheduler = LMSDiscreteScheduler.from_config(sd_pipe.scheduler.config)
        sd_pipe.to(torch_device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_inputs(torch_device)
        image = sd_pipe(**inputs).images[0]

        expected_image = load_numpy(
            "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main"
            "/stable_diffusion_img2img/stable_diffusion_1_5_lms.npy"
        )
        max_diff = np.abs(expected_image - image).max()
        assert max_diff < 1e-3

    def test_img2img_dpm(self):
        sd_pipe = StableDiffusionImg2ImgPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
        sd_pipe.scheduler = DPMSolverMultistepScheduler.from_config(sd_pipe.scheduler.config)
        sd_pipe.to(torch_device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_inputs(torch_device)
        inputs["num_inference_steps"] = 30
        image = sd_pipe(**inputs).images[0]

        expected_image = load_numpy(
            "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main"
            "/stable_diffusion_img2img/stable_diffusion_1_5_dpm.npy"
        )
        max_diff = np.abs(expected_image - image).max()
        assert max_diff < 1e-3