Unverified Commit 9b638548 authored by Patrick von Platen's avatar Patrick von Platen Committed by GitHub
Browse files

Improve reproducibility 2/3 (#1906)

* [Repro] Correct reproducibility

* up

* up

* uP

* up

* need better image

* allow conversion from no state dict checkpoints

* up

* up

* up

* up

* check tensors

* check tensors

* check tensors

* check tensors

* next try

* up

* up

* better name

* up

* up

* Apply suggestions from code review

* correct more

* up

* replace all torch randn

* fix

* correct

* correct

* finish

* fix more

* up
parent 67e2f95c
...@@ -21,7 +21,7 @@ from typing import Optional, Tuple, Union ...@@ -21,7 +21,7 @@ from typing import Optional, Tuple, Union
import torch import torch
from ..configuration_utils import ConfigMixin, register_to_config from ..configuration_utils import ConfigMixin, register_to_config
from ..utils import BaseOutput from ..utils import BaseOutput, randn_tensor
from .scheduling_utils import SchedulerMixin, SchedulerOutput from .scheduling_utils import SchedulerMixin, SchedulerOutput
...@@ -201,7 +201,9 @@ class ScoreSdeVeScheduler(SchedulerMixin, ConfigMixin): ...@@ -201,7 +201,9 @@ class ScoreSdeVeScheduler(SchedulerMixin, ConfigMixin):
drift = drift - diffusion**2 * model_output drift = drift - diffusion**2 * model_output
# equation 6: sample noise for the diffusion term of # equation 6: sample noise for the diffusion term of
noise = torch.randn(sample.shape, layout=sample.layout, generator=generator).to(sample.device) noise = randn_tensor(
sample.shape, layout=sample.layout, generator=generator, device=sample.device, dtype=sample.dtype
)
prev_sample_mean = sample - drift # subtract because `dt` is a small negative timestep prev_sample_mean = sample - drift # subtract because `dt` is a small negative timestep
# TODO is the variable diffusion the correct scaling term for the noise? # TODO is the variable diffusion the correct scaling term for the noise?
prev_sample = prev_sample_mean + diffusion * noise # add impact of diffusion field g prev_sample = prev_sample_mean + diffusion * noise # add impact of diffusion field g
...@@ -241,7 +243,7 @@ class ScoreSdeVeScheduler(SchedulerMixin, ConfigMixin): ...@@ -241,7 +243,7 @@ class ScoreSdeVeScheduler(SchedulerMixin, ConfigMixin):
# For small batch sizes, the paper "suggest replacing norm(z) with sqrt(d), where d is the dim. of z" # For small batch sizes, the paper "suggest replacing norm(z) with sqrt(d), where d is the dim. of z"
# sample noise for correction # sample noise for correction
noise = torch.randn(sample.shape, layout=sample.layout, generator=generator).to(sample.device) noise = randn_tensor(sample.shape, layout=sample.layout, generator=generator).to(sample.device)
# compute step size from the model_output, the noise, and the snr # compute step size from the model_output, the noise, and the snr
grad_norm = torch.norm(model_output.reshape(model_output.shape[0], -1), dim=-1).mean() grad_norm = torch.norm(model_output.reshape(model_output.shape[0], -1), dim=-1).mean()
......
...@@ -20,6 +20,7 @@ from typing import Union ...@@ -20,6 +20,7 @@ from typing import Union
import torch import torch
from ..configuration_utils import ConfigMixin, register_to_config from ..configuration_utils import ConfigMixin, register_to_config
from ..utils import randn_tensor
from .scheduling_utils import SchedulerMixin from .scheduling_utils import SchedulerMixin
...@@ -80,7 +81,7 @@ class ScoreSdeVpScheduler(SchedulerMixin, ConfigMixin): ...@@ -80,7 +81,7 @@ class ScoreSdeVpScheduler(SchedulerMixin, ConfigMixin):
x_mean = x + drift * dt x_mean = x + drift * dt
# add noise # add noise
noise = torch.randn(x.shape, layout=x.layout, generator=generator).to(x.device) noise = randn_tensor(x.shape, layout=x.layout, generator=generator, device=x.device, dtype=x.dtype)
x = x_mean + diffusion * math.sqrt(-dt) * noise x = x_mean + diffusion * math.sqrt(-dt) * noise
return x, x_mean return x, x_mean
......
...@@ -20,7 +20,7 @@ import numpy as np ...@@ -20,7 +20,7 @@ import numpy as np
import torch import torch
from ..configuration_utils import ConfigMixin, register_to_config from ..configuration_utils import ConfigMixin, register_to_config
from ..utils import BaseOutput, torch_randn from ..utils import BaseOutput, randn_tensor
from .scheduling_utils import SchedulerMixin from .scheduling_utils import SchedulerMixin
...@@ -273,7 +273,7 @@ class UnCLIPScheduler(SchedulerMixin, ConfigMixin): ...@@ -273,7 +273,7 @@ class UnCLIPScheduler(SchedulerMixin, ConfigMixin):
# 6. Add noise # 6. Add noise
variance = 0 variance = 0
if t > 0: if t > 0:
variance_noise = torch_randn( variance_noise = randn_tensor(
model_output.shape, dtype=model_output.dtype, generator=generator, device=model_output.device model_output.shape, dtype=model_output.dtype, generator=generator, device=model_output.device
) )
......
...@@ -64,7 +64,7 @@ from .import_utils import ( ...@@ -64,7 +64,7 @@ from .import_utils import (
from .logging import get_logger from .logging import get_logger
from .outputs import BaseOutput from .outputs import BaseOutput
from .pil_utils import PIL_INTERPOLATION from .pil_utils import PIL_INTERPOLATION
from .torch_utils import torch_randn from .torch_utils import randn_tensor
if is_torch_available(): if is_torch_available():
......
...@@ -26,11 +26,12 @@ if is_torch_available(): ...@@ -26,11 +26,12 @@ if is_torch_available():
logger = logging.get_logger(__name__) # pylint: disable=invalid-name logger = logging.get_logger(__name__) # pylint: disable=invalid-name
def torch_randn( def randn_tensor(
shape: Union[Tuple, List], shape: Union[Tuple, List],
generator: Optional[Union[List["torch.Generator"], "torch.Generator"]] = None, generator: Optional[Union[List["torch.Generator"], "torch.Generator"]] = None,
device: Optional["torch.device"] = None, device: Optional["torch.device"] = None,
dtype: Optional["torch.dtype"] = None, dtype: Optional["torch.dtype"] = None,
layout: Optional["torch.layout"] = None,
): ):
"""This is a helper function that allows to create random tensors on the desired `device` with the desired `dtype`. When """This is a helper function that allows to create random tensors on the desired `device` with the desired `dtype`. When
passing a list of generators one can seed each batched size individually. If CPU generators are passed the tensor passing a list of generators one can seed each batched size individually. If CPU generators are passed the tensor
...@@ -40,8 +41,12 @@ def torch_randn( ...@@ -40,8 +41,12 @@ def torch_randn(
rand_device = device rand_device = device
batch_size = shape[0] batch_size = shape[0]
layout = layout or torch.strided
device = device or torch.device("cpu")
if generator is not None: if generator is not None:
if generator.device != device and generator.device.type == "cpu": gen_device_type = generator.device.type if not isinstance(generator, list) else generator[0].device.type
if gen_device_type != device.type and gen_device_type == "cpu":
rand_device = "cpu" rand_device = "cpu"
if device != "mps": if device != "mps":
logger.info( logger.info(
...@@ -49,16 +54,17 @@ def torch_randn( ...@@ -49,16 +54,17 @@ def torch_randn(
f" Tensors will be created on 'cpu' and then moved to {device}. Note that one can probably" f" Tensors will be created on 'cpu' and then moved to {device}. Note that one can probably"
f" slighly speed up this function by passing a generator that was created on the {device} device." f" slighly speed up this function by passing a generator that was created on the {device} device."
) )
elif generator.device.type != device.type and generator.device.type == "cuda": elif gen_device_type != device.type and gen_device_type == "cuda":
raise ValueError(f"Cannot generate a {device} tensor from a generator of type {generator.device.type}.") raise ValueError(f"Cannot generate a {device} tensor from a generator of type {gen_device_type}.")
if isinstance(generator, list): if isinstance(generator, list):
shape = (1,) + shape[1:] shape = (1,) + shape[1:]
latents = [ latents = [
torch.randn(shape, generator=generator[i], device=rand_device, dtype=dtype) for i in range(batch_size) torch.randn(shape, generator=generator[i], device=rand_device, dtype=dtype, layout=layout)
for i in range(batch_size)
] ]
latents = torch.cat(latents, dim=0).to(device) latents = torch.cat(latents, dim=0).to(device)
else: else:
latents = torch.randn(shape, generator=generator, device=rand_device, dtype=dtype).to(device) latents = torch.randn(shape, generator=generator, device=rand_device, dtype=dtype, layout=layout).to(device)
return latents return latents
...@@ -25,44 +25,6 @@ from diffusers.utils.testing_utils import require_torch, slow, torch_device ...@@ -25,44 +25,6 @@ from diffusers.utils.testing_utils import require_torch, slow, torch_device
torch.backends.cuda.matmul.allow_tf32 = False torch.backends.cuda.matmul.allow_tf32 = False
class KarrasVePipelineFastTests(unittest.TestCase):
    """Fast smoke tests for ``KarrasVePipeline`` using a tiny randomly-initialized UNet."""

    @property
    def dummy_uncond_unet(self):
        # Small unconditional UNet; seeding first makes the random weight init
        # deterministic, so the expected pixel slice below is stable.
        torch.manual_seed(0)
        model = UNet2DModel(
            block_out_channels=(32, 64),
            layers_per_block=2,
            sample_size=32,
            in_channels=3,
            out_channels=3,
            down_block_types=("DownBlock2D", "AttnDownBlock2D"),
            up_block_types=("AttnUpBlock2D", "UpBlock2D"),
        )
        return model

    def test_inference(self):
        # Assemble the pipeline from the dummy UNet and a default Karras VE scheduler.
        unet = self.dummy_uncond_unet
        scheduler = KarrasVeScheduler()
        pipe = KarrasVePipeline(unet=unet, scheduler=scheduler)
        pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)

        # Re-seed before each call so the dict-style and tuple-style outputs
        # are generated from identical noise and can be compared.
        generator = torch.manual_seed(0)
        image = pipe(num_inference_steps=2, generator=generator, output_type="numpy").images

        generator = torch.manual_seed(0)
        image_from_tuple = pipe(num_inference_steps=2, generator=generator, output_type="numpy", return_dict=False)[0]

        # Compare only the bottom-right 3x3 patch of the last channel.
        image_slice = image[0, -3:, -3:, -1]
        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]

        assert image.shape == (1, 32, 32, 3)
        expected_slice = np.array([0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0])

        # Both output forms must match the reference slice within tolerance.
        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
@slow @slow
@require_torch @require_torch
class KarrasVePipelineIntegrationTests(unittest.TestCase): class KarrasVePipelineIntegrationTests(unittest.TestCase):
......
...@@ -132,7 +132,7 @@ class PaintByExamplePipelineFastTests(PipelineTesterMixin, unittest.TestCase): ...@@ -132,7 +132,7 @@ class PaintByExamplePipelineFastTests(PipelineTesterMixin, unittest.TestCase):
image_slice = image[0, -3:, -3:, -1] image_slice = image[0, -3:, -3:, -1]
assert image.shape == (1, 64, 64, 3) assert image.shape == (1, 64, 64, 3)
expected_slice = np.array([0.4397, 0.5553, 0.3802, 0.5222, 0.5811, 0.4342, 0.494, 0.4577, 0.4428]) expected_slice = np.array([0.4701, 0.5555, 0.3994, 0.5107, 0.5691, 0.4517, 0.5125, 0.4769, 0.4539])
assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
......
...@@ -21,7 +21,7 @@ import torch ...@@ -21,7 +21,7 @@ import torch
from diffusers import PriorTransformer, UnCLIPPipeline, UnCLIPScheduler, UNet2DConditionModel, UNet2DModel from diffusers import PriorTransformer, UnCLIPPipeline, UnCLIPScheduler, UNet2DConditionModel, UNet2DModel
from diffusers.pipelines.unclip.text_proj import UnCLIPTextProjModel from diffusers.pipelines.unclip.text_proj import UnCLIPTextProjModel
from diffusers.utils import load_numpy, slow, torch_device from diffusers.utils import load_numpy, nightly, slow, torch_device
from diffusers.utils.testing_utils import require_torch_gpu from diffusers.utils.testing_utils import require_torch_gpu
from transformers import CLIPTextConfig, CLIPTextModelWithProjection, CLIPTokenizer from transformers import CLIPTextConfig, CLIPTextModelWithProjection, CLIPTokenizer
...@@ -363,6 +363,37 @@ class UnCLIPPipelineFastTests(unittest.TestCase): ...@@ -363,6 +363,37 @@ class UnCLIPPipelineFastTests(unittest.TestCase):
assert np.abs(image - image_from_text).max() < 1e-4 assert np.abs(image - image_from_text).max() < 1e-4
@nightly
class UnCLIPPipelineCPUIntegrationTests(unittest.TestCase):
    """Nightly integration test: full Karlo unCLIP text-to-image pipeline on CPU (fp32)."""

    def tearDown(self):
        # clean up the VRAM after each test
        super().tearDown()
        gc.collect()
        torch.cuda.empty_cache()

    def test_unclip_karlo_cpu_fp32(self):
        # Reference image generated with the same seed, hosted on the HF Hub.
        expected_image = load_numpy(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
            "/unclip/karlo_v1_alpha_horse_cpu.npy"
        )

        pipeline = UnCLIPPipeline.from_pretrained("kakaobrain/karlo-v1-alpha")
        pipeline.set_progress_bar_config(disable=None)

        # Seed the default (CPU) generator for reproducibility.
        generator = torch.manual_seed(0)
        output = pipeline(
            "horse",
            num_images_per_prompt=1,
            generator=generator,
            output_type="np",
        )

        image = output.images[0]

        assert image.shape == (256, 256, 3)
        # Loose tolerance (0.1 in [0, 1] pixel space) — Karlo outputs can
        # deviate slightly depending on hardware/BLAS backend.
        assert np.abs(expected_image - image).max() < 1e-1
@slow @slow
@require_torch_gpu @require_torch_gpu
class UnCLIPPipelineIntegrationTests(unittest.TestCase): class UnCLIPPipelineIntegrationTests(unittest.TestCase):
...@@ -385,15 +416,19 @@ class UnCLIPPipelineIntegrationTests(unittest.TestCase): ...@@ -385,15 +416,19 @@ class UnCLIPPipelineIntegrationTests(unittest.TestCase):
generator = torch.Generator(device="cpu").manual_seed(0) generator = torch.Generator(device="cpu").manual_seed(0)
output = pipeline( output = pipeline(
"horse", "horse",
num_images_per_prompt=1,
generator=generator, generator=generator,
output_type="np", output_type="np",
) )
image = output.images[0] image = np.asarray(pipeline.numpy_to_pil(output.images)[0], dtype=np.float32)
expected_image = np.asarray(pipeline.numpy_to_pil(expected_image)[0], dtype=np.float32)
# Karlo is extremely likely to strongly deviate depending on which hardware is used
# Here we just check that the image doesn't deviate more than 10 pixels from the reference image on average
avg_diff = np.abs(image - expected_image).mean()
assert avg_diff < 10, f"Error image deviates {avg_diff} pixels on average"
assert image.shape == (256, 256, 3) assert image.shape == (256, 256, 3)
assert np.abs(expected_image - image).max() < 1e-2
def test_unclip_pipeline_with_sequential_cpu_offloading(self): def test_unclip_pipeline_with_sequential_cpu_offloading(self):
torch.cuda.empty_cache() torch.cuda.empty_cache()
......
...@@ -475,20 +475,25 @@ class UnCLIPImageVariationPipelineIntegrationTests(unittest.TestCase): ...@@ -475,20 +475,25 @@ class UnCLIPImageVariationPipelineIntegrationTests(unittest.TestCase):
"/unclip/karlo_v1_alpha_cat_variation_fp16.npy" "/unclip/karlo_v1_alpha_cat_variation_fp16.npy"
) )
pipeline = UnCLIPImageVariationPipeline.from_pretrained("fusing/karlo-image-variations-diffusers") pipeline = UnCLIPImageVariationPipeline.from_pretrained(
"fusing/karlo-image-variations-diffusers", torch_dtype=torch.float16
)
pipeline = pipeline.to(torch_device) pipeline = pipeline.to(torch_device)
pipeline.set_progress_bar_config(disable=None) pipeline.set_progress_bar_config(disable=None)
pipeline.enable_sequential_cpu_offload()
generator = torch.Generator(device="cpu").manual_seed(0) generator = torch.Generator(device="cpu").manual_seed(0)
output = pipeline( output = pipeline(
input_image, input_image,
num_images_per_prompt=1,
generator=generator, generator=generator,
output_type="np", output_type="np",
) )
image = output.images[0] image = np.asarray(pipeline.numpy_to_pil(output.images)[0], dtype=np.float32)
expected_image = np.asarray(pipeline.numpy_to_pil(expected_image)[0], dtype=np.float32)
# Karlo is extremely likely to strongly deviate depending on which hardware is used
# Here we just check that the image doesn't deviate more than 10 pixels from the reference image on average
avg_diff = np.abs(image - expected_image).mean()
assert avg_diff < 10, f"Error image deviates {avg_diff} pixels on average"
assert image.shape == (256, 256, 3) assert image.shape == (256, 256, 3)
assert np.abs(expected_image - image).max() < 5e-2
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment