Merge remote-tracking branch 'origin/main'

# Conflicts: # tests/test_modeling_utils.py

Merge remote-tracking branch 'origin/main'
# Conflicts: # tests/test_modeling_utils.py
0e13d329 · anton-l · 3f9e3d8a · e13ee8b5 · 0e13d329 · 0e13d329
Commit 0e13d329 authored Jun 27, 2022 by anton-l
12 changed files
--- a/src/diffusers/pipelines/pipeline_grad_tts.py
+++ b/src/diffusers/pipelines/pipeline_grad_tts.py
@@ -420,7 +420,7 @@ class TextEncoder(ModelMixin, ConfigMixin):
        return mu, logw, x_mask


-class GradTTS(DiffusionPipeline):
+class GradTTSPipeline(DiffusionPipeline):
    def __init__(self, unet, text_encoder, noise_scheduler, tokenizer):
        super().__init__()
        noise_scheduler = noise_scheduler.set_format("pt")
@@ -430,7 +430,14 @@ class GradTTS(DiffusionPipeline):

    @torch.no_grad()
    def __call__(
-        self, text, num_inference_steps=50, temperature=1.3, length_scale=0.91, speaker_id=15, torch_device=None
+        self,
+        text,
+        num_inference_steps=50,
+        temperature=1.3,
+        length_scale=0.91,
+        speaker_id=15,
+        torch_device=None,
+        generator=None,
    ):
        if torch_device is None:
            torch_device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
@@ -464,17 +471,19 @@ class GradTTS(DiffusionPipeline):
        mu_y = mu_y.transpose(1, 2)

        # Sample latent representation from terminal distribution N(mu_y, I)
-        z = mu_y + torch.randn_like(mu_y, device=mu_y.device) / temperature
+        z = mu_y + torch.randn(mu_y.shape, generator=generator).to(mu_y.device)

        xt = z * y_mask
        h = 1.0 / num_inference_steps
+        # (Patrick: TODO)
        for t in tqdm.tqdm(range(num_inference_steps), total=num_inference_steps):
+            t_new = num_inference_steps - t - 1
            t = (1.0 - (t + 0.5) * h) * torch.ones(z.shape[0], dtype=z.dtype, device=z.device)
-            time = t.unsqueeze(-1).unsqueeze(-1)

            residual = self.unet(xt, t, mu_y, y_mask, speaker_id)

-            xt = self.noise_scheduler.step(xt, residual, mu_y, h, time)
+            scheduler_residual = residual - mu_y + xt
+            xt = self.noise_scheduler.step(scheduler_residual, xt, t_new, num_inference_steps)
            xt = xt * y_mask

        return xt[:, :, :y_max_length]
--- a/src/diffusers/pipelines/pipeline_latent_diffusion.py
+++ b/src/diffusers/pipelines/pipeline_latent_diffusion.py
--- a/src/diffusers/pipelines/pipeline_pndm.py
+++ b/src/diffusers/pipelines/pipeline_pndm.py
@@ -21,7 +21,7 @@ import tqdm
 from ..pipeline_utils import DiffusionPipeline


-class PNDM(DiffusionPipeline):
+class PNDMPipeline(DiffusionPipeline):
    def __init__(self, unet, noise_scheduler):
        super().__init__()
        noise_scheduler = noise_scheduler.set_format("pt")

--- a/src/diffusers/pipelines/pipeline_score_sde_ve.py
+++ b/src/diffusers/pipelines/pipeline_score_sde_ve.py
+#!/usr/bin/env python3
+import torch
+
+from diffusers import DiffusionPipeline
+
+
+# TODO(Patrick, Anton, Suraj) - rename `x` to better variable names
+class ScoreSdeVePipeline(DiffusionPipeline):
+    def __init__(self, model, scheduler):
+        super().__init__()
+        self.register_modules(model=model, scheduler=scheduler)
+
+    def __call__(self, num_inference_steps=2000, generator=None):
+        device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
+
+        img_size = self.model.config.image_size
+        channels = self.model.config.num_channels
+        shape = (1, channels, img_size, img_size)
+
+        model = self.model.to(device)
+
+        # TODO(Patrick) move to scheduler config
+        n_steps = 1
+
+        x = torch.randn(*shape) * self.scheduler.config.sigma_max
+        x = x.to(device)
+
+        self.scheduler.set_timesteps(num_inference_steps)
+        self.scheduler.set_sigmas(num_inference_steps)
+
+        for i, t in enumerate(self.scheduler.timesteps):
+            sigma_t = self.scheduler.sigmas[i] * torch.ones(shape[0], device=device)
+
+            for _ in range(n_steps):
+                with torch.no_grad():
+                    result = self.model(x, sigma_t)
+                x = self.scheduler.step_correct(result, x)
+
+            with torch.no_grad():
+                result = model(x, sigma_t)
+
+            x, x_mean = self.scheduler.step_pred(result, x, t)
+
+        return x_mean
--- a/src/diffusers/pipelines/pipeline_score_sde_vp.py
+++ b/src/diffusers/pipelines/pipeline_score_sde_vp.py
+#!/usr/bin/env python3
+import torch
+
+from diffusers import DiffusionPipeline
+
+
+# TODO(Patrick, Anton, Suraj) - rename `x` to better variable names
+class ScoreSdeVpPipeline(DiffusionPipeline):
+    def __init__(self, model, scheduler):
+        super().__init__()
+        self.register_modules(model=model, scheduler=scheduler)
+
+    def __call__(self, num_inference_steps=1000, generator=None):
+        device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
+
+        img_size = self.model.config.image_size
+        channels = self.model.config.num_channels
+        shape = (1, channels, img_size, img_size)
+
+        model = self.model.to(device)
+
+        x = torch.randn(*shape).to(device)
+
+        self.scheduler.set_timesteps(num_inference_steps)
+
+        for t in self.scheduler.timesteps:
+            t = t * torch.ones(shape[0], device=device)
+            scaled_t = t * (num_inference_steps - 1)
+
+            with torch.no_grad():
+                result = model(x, scaled_t)
+
+            x, x_mean = self.scheduler.step_pred(result, x, t)
+
+        x_mean = (x_mean + 1.0) / 2.0
+
+        return x_mean
--- a/src/diffusers/schedulers/__init__.py
+++ b/src/diffusers/schedulers/__init__.py
@@ -20,4 +20,6 @@ from .scheduling_ddim import DDIMScheduler
 from .scheduling_ddpm import DDPMScheduler
 from .scheduling_grad_tts import GradTTSScheduler
 from .scheduling_pndm import PNDMScheduler
+from .scheduling_sde_ve import ScoreSdeVeScheduler
+from .scheduling_sde_vp import ScoreSdeVpScheduler
 from .scheduling_utils import SchedulerMixin
--- a/src/diffusers/schedulers/scheduling_ddpm.py
+++ b/src/diffusers/schedulers/scheduling_ddpm.py
@@ -92,9 +92,9 @@ class DDPMScheduler(SchedulerMixin, ConfigMixin):
        alpha_prod_t = self.alphas_cumprod[t]
        alpha_prod_t_prev = self.alphas_cumprod[t - 1] if t > 0 else self.one

-        # For t > 0, compute predicted variance βt (see formala (6) and (7) from https://arxiv.org/pdf/2006.11239.pdf)
+        # For t > 0, compute predicted variance βt (see formula (6) and (7) from https://arxiv.org/pdf/2006.11239.pdf)
        # and sample from it to get previous sample
-        # x_{t-1} ~ N(pred_prev_sample, variance) == add variane to pred_sample
+        # x_{t-1} ~ N(pred_prev_sample, variance) == add variance to pred_sample
        variance = (1 - alpha_prod_t_prev) / (1 - alpha_prod_t) * self.betas[t]

        if variance_type is None:

--- a/src/diffusers/schedulers/scheduling_grad_tts.py
+++ b/src/diffusers/schedulers/scheduling_grad_tts.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+import numpy as np
+
 from ..configuration_utils import ConfigMixin
 from .scheduling_utils import SchedulerMixin

@@ -19,29 +21,34 @@ from .scheduling_utils import SchedulerMixin
 class GradTTSScheduler(SchedulerMixin, ConfigMixin):
    def __init__(
        self,
-        timesteps=1000,
-        beta_start=0.0001,
-        beta_end=0.02,
+        beta_start=0.05,
+        beta_end=20,
        tensor_format="np",
    ):
        super().__init__()
        self.register_to_config(
-            timesteps=timesteps,
            beta_start=beta_start,
            beta_end=beta_end,
        )
        self.set_format(tensor_format=tensor_format)
+        self.betas = None
+
+    def get_timesteps(self, num_inference_steps):
+        return np.array([(t + 0.5) / num_inference_steps for t in range(num_inference_steps)])
+
+    def set_betas(self, num_inference_steps):
+        timesteps = self.get_timesteps(num_inference_steps)
+        self.betas = np.array([self.beta_start + (self.beta_end - self.beta_start) * t for t in timesteps])
+
+    def step(self, residual, sample, t, num_inference_steps):
+        # This is a VE scheduler from https://arxiv.org/pdf/2011.13456.pdf (see Algorithm 2 in Appendix)
+        if self.betas is None:
+            self.set_betas(num_inference_steps)

-    def sample_noise(self, timestep):
-        noise = self.beta_start + (self.beta_end - self.beta_start) * timestep
-        return noise
+        beta_t = self.betas[t]
+        beta_t_deriv = beta_t / num_inference_steps

-    def step(self, xt, residual, mu, h, timestep):
-        noise_t = self.sample_noise(timestep)
-        dxt = 0.5 * (mu - xt - residual)
-        dxt = dxt * noise_t * h
-        xt = xt - dxt
-        return xt
+        sample_deriv = residual * beta_t_deriv / 2

-    def __len__(self):
-        return len(self.config.timesteps)
+        sample = sample + sample_deriv
+        return sample
--- a/src/diffusers/schedulers/scheduling_sde_ve.py
+++ b/src/diffusers/schedulers/scheduling_sde_ve.py
+# Copyright 2022 Google Brain and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# DISCLAIMER: This file is strongly influenced by https://github.com/yang-song/score_sde_pytorch
+
+# TODO(Patrick, Anton, Suraj) - make scheduler framework indepedent and clean-up a bit
+
+import numpy as np
+import torch
+
+from ..configuration_utils import ConfigMixin
+from .scheduling_utils import SchedulerMixin
+
+
+class ScoreSdeVeScheduler(SchedulerMixin, ConfigMixin):
+    def __init__(self, snr=0.15, sigma_min=0.01, sigma_max=1348, sampling_eps=1e-5, tensor_format="np"):
+        super().__init__()
+        self.register_to_config(
+            snr=snr,
+            sigma_min=sigma_min,
+            sigma_max=sigma_max,
+            sampling_eps=sampling_eps,
+        )
+
+        self.sigmas = None
+        self.discrete_sigmas = None
+        self.timesteps = None
+
+    def set_timesteps(self, num_inference_steps):
+        self.timesteps = torch.linspace(1, self.config.sampling_eps, num_inference_steps)
+
+    def set_sigmas(self, num_inference_steps):
+        if self.timesteps is None:
+            self.set_timesteps(num_inference_steps)
+
+        self.discrete_sigmas = torch.exp(
+            torch.linspace(np.log(self.config.sigma_min), np.log(self.config.sigma_max), num_inference_steps)
+        )
+        self.sigmas = torch.tensor(
+            [self.config.sigma_min * (self.config.sigma_max / self.sigma_min) ** t for t in self.timesteps]
+        )
+
+    def step_pred(self, result, x, t):
+        # TODO(Patrick) better comments + non-PyTorch
+        t = t * torch.ones(x.shape[0], device=x.device)
+        timestep = (t * (len(self.timesteps) - 1)).long()
+
+        sigma = self.discrete_sigmas.to(t.device)[timestep]
+        adjacent_sigma = torch.where(
+            timestep == 0, torch.zeros_like(t), self.discrete_sigmas[timestep - 1].to(timestep.device)
+        )
+        f = torch.zeros_like(x)
+        G = torch.sqrt(sigma**2 - adjacent_sigma**2)
+
+        f = f - G[:, None, None, None] ** 2 * result
+
+        z = torch.randn_like(x)
+        x_mean = x - f
+        x = x_mean + G[:, None, None, None] * z
+        return x, x_mean
+
+    def step_correct(self, result, x):
+        # TODO(Patrick) better comments + non-PyTorch
+        noise = torch.randn_like(x)
+        grad_norm = torch.norm(result.reshape(result.shape[0], -1), dim=-1).mean()
+        noise_norm = torch.norm(noise.reshape(noise.shape[0], -1), dim=-1).mean()
+        step_size = (self.config.snr * noise_norm / grad_norm) ** 2 * 2
+        step_size = step_size * torch.ones(x.shape[0], device=x.device)
+        x_mean = x + step_size[:, None, None, None] * result
+
+        x = x_mean + torch.sqrt(step_size * 2)[:, None, None, None] * noise
+
+        return x
--- a/src/diffusers/schedulers/scheduling_sde_vp.py
+++ b/src/diffusers/schedulers/scheduling_sde_vp.py
+# Copyright 2022 Google Brain and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# DISCLAIMER: This file is strongly influenced by https://github.com/yang-song/score_sde_pytorch
+
+# TODO(Patrick, Anton, Suraj) - make scheduler framework indepedent and clean-up a bit
+
+import numpy as np
+import torch
+
+from ..configuration_utils import ConfigMixin
+from .scheduling_utils import SchedulerMixin
+
+
+class ScoreSdeVpScheduler(SchedulerMixin, ConfigMixin):
+    def __init__(self, beta_min=0.1, beta_max=20, sampling_eps=1e-3, tensor_format="np"):
+        super().__init__()
+        self.register_to_config(
+            beta_min=beta_min,
+            beta_max=beta_max,
+            sampling_eps=sampling_eps,
+        )
+
+        self.sigmas = None
+        self.discrete_sigmas = None
+        self.timesteps = None
+
+    def set_timesteps(self, num_inference_steps):
+        self.timesteps = torch.linspace(1, self.config.sampling_eps, num_inference_steps)
+
+    def step_pred(self, result, x, t):
+        # TODO(Patrick) better comments + non-PyTorch
+        # postprocess model result
+        log_mean_coeff = (
+            -0.25 * t**2 * (self.config.beta_max - self.config.beta_min) - 0.5 * t * self.config.beta_min
+        )
+        std = torch.sqrt(1.0 - torch.exp(2.0 * log_mean_coeff))
+        result = -result / std[:, None, None, None]
+
+        # compute
+        dt = -1.0 / len(self.timesteps)
+
+        beta_t = self.config.beta_min + t * (self.config.beta_max - self.config.beta_min)
+        drift = -0.5 * beta_t[:, None, None, None] * x
+        diffusion = torch.sqrt(beta_t)
+        drift = drift - diffusion[:, None, None, None] ** 2 * result
+        x_mean = x + drift * dt
+
+        # add noise
+        z = torch.randn_like(x)
+        x = x_mean + diffusion[:, None, None, None] * np.sqrt(-dt) * z
+
+        return x, x_mean
--- a/tests/test_layers_utils.py
+++ b/tests/test_layers_utils.py
+# coding=utf-8
+# Copyright 2022 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import inspect
+import tempfile
+import unittest
+
+import numpy as np
+import torch
+
+from diffusers.models.embeddings import get_timestep_embedding
+from diffusers.testing_utils import floats_tensor, slow, torch_device
+
+
+torch.backends.cuda.matmul.allow_tf32 = False
+
+
+class EmbeddingsTests(unittest.TestCase):
+    def test_timestep_embeddings(self):
+        embedding_dim = 256
+        timesteps = torch.arange(16)
+
+        t1 = get_timestep_embedding(timesteps, embedding_dim)
+
+        # first vector should always be composed only of 0's and 1's
+        assert (t1[0, : embedding_dim // 2] - 0).abs().sum() < 1e-5
+        assert (t1[0, embedding_dim // 2 :] - 1).abs().sum() < 1e-5
+
+        # last element of each vector should be one
+        assert (t1[:, -1] - 1).abs().sum() < 1e-5
+
+        # For large embeddings (e.g. 128) the frequency of every vector is higher
+        # than the previous one which means that the gradients of later vectors are
+        # ALWAYS higher than the previous ones
+        grad_mean = np.abs(np.gradient(t1, axis=-1)).mean(axis=1)
+
+        prev_grad = 0.0
+        for grad in grad_mean:
+            assert grad > prev_grad
+            prev_grad = grad
+
+    def test_timestep_defaults(self):
+        embedding_dim = 16
+        timesteps = torch.arange(10)
+
+        t1 = get_timestep_embedding(timesteps, embedding_dim)
+        t2 = get_timestep_embedding(
+            timesteps, embedding_dim, flip_sin_to_cos=False, downscale_freq_shift=1, max_period=10_000
+        )
+
+        assert torch.allclose(t1.cpu(), t2.cpu(), 1e-3)
+
+    def test_timestep_flip_sin_cos(self):
+        embedding_dim = 16
+        timesteps = torch.arange(10)
+
+        t1 = get_timestep_embedding(timesteps, embedding_dim, flip_sin_to_cos=True)
+        t1 = torch.cat([t1[:, embedding_dim // 2 :], t1[:, : embedding_dim // 2]], dim=-1)
+
+        t2 = get_timestep_embedding(timesteps, embedding_dim, flip_sin_to_cos=False)
+
+        assert torch.allclose(t1.cpu(), t2.cpu(), 1e-3)
+
+    def test_timestep_downscale_freq_shift(self):
+        embedding_dim = 16
+        timesteps = torch.arange(10)
+
+        t1 = get_timestep_embedding(timesteps, embedding_dim, downscale_freq_shift=0)
+        t2 = get_timestep_embedding(timesteps, embedding_dim, downscale_freq_shift=1)
+
+        # get cosine half (vectors that are wrapped into cosine)
+        cosine_half = (t1 - t2)[:, embedding_dim // 2 :]
+
+        # cosine needs to be negative
+        assert (np.abs((cosine_half <= 0).numpy()) - 1).sum() < 1e-5
+
+    def test_sinoid_embeddings_hardcoded(self):
+        embedding_dim = 64
+        timesteps = torch.arange(128)
+
+        # standard unet, score_vde
+        t1 = get_timestep_embedding(timesteps, embedding_dim, downscale_freq_shift=1, flip_sin_to_cos=False)
+        # glide, ldm
+        t2 = get_timestep_embedding(timesteps, embedding_dim, downscale_freq_shift=0, flip_sin_to_cos=True)
+        # grad-tts
+        t3 = get_timestep_embedding(timesteps, embedding_dim, scale=1000)
+
+        assert torch.allclose(
+            t1[23:26, 47:50].flatten().cpu(),
+            torch.tensor([0.9646, 0.9804, 0.9892, 0.9615, 0.9787, 0.9882, 0.9582, 0.9769, 0.9872]),
+            1e-3,
+        )
+        assert torch.allclose(
+            t2[23:26, 47:50].flatten().cpu(),
+            torch.tensor([0.3019, 0.2280, 0.1716, 0.3146, 0.2377, 0.1790, 0.3272, 0.2474, 0.1864]),
+            1e-3,
+        )
+        assert torch.allclose(
+            t3[23:26, 47:50].flatten().cpu(),
+            torch.tensor([-0.9801, -0.9464, -0.9349, -0.3952, 0.8887, -0.9709, 0.5299, -0.2853, -0.9927]),
+            1e-3,
+        )
--- a/tests/test_modeling_utils.py
+++ b/tests/test_modeling_utils.py
@@ -22,18 +22,24 @@ import numpy as np
 import torch

 from diffusers import (
-    BDDM,
-    DDIM,
-    DDPM,
-    PNDM,
+    BDDMPipeline,
+    DDIMPipeline,
    DDIMScheduler,
+    DDPMPipeline,
    DDPMScheduler,
-    Glide,
+    GlidePipeline,
    GlideSuperResUNetModel,
    GlideTextToImageUNetModel,
-    GradTTS,
-    LatentDiffusion,
+    GradTTSPipeline,
+    GradTTSScheduler,
+    LatentDiffusionPipeline,
+    NCSNpp,
+    PNDMPipeline,
    PNDMScheduler,
+    ScoreSdeVePipeline,
+    ScoreSdeVeScheduler,
+    ScoreSdeVpPipeline,
+    ScoreSdeVpScheduler,
    UNetGradTTSModel,
    UNetLDMModel,
    UNetModel,
@@ -107,7 +113,7 @@ class ModelTesterMixin:
            new_image = new_model(**inputs_dict)

        max_diff = (image - new_image).abs().sum().item()
-        self.assertLessEqual(max_diff, 1e-5, "Models give different forward passes")
+        self.assertLessEqual(max_diff, 5e-5, "Models give different forward passes")

    def test_determinism(self):
        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
@@ -425,11 +431,12 @@ class GlideTextToImageUNetModelTests(ModelTesterMixin, unittest.TestCase):
        emb = torch.randn((1, 16, model.config.transformer_dim)).to(torch_device)
        time_step = torch.tensor([10] * noise.shape[0], device=torch_device)

+        model.to(torch_device)
        with torch.no_grad():
            output = model(noise, time_step, emb)

        output, _ = torch.split(output, 3, dim=1)
-        output_slice = output[0, -1, -3:, -3:].flatten()
+        output_slice = output[0, -1, -3:, -3:].cpu().flatten()
        # fmt: off
        expected_output_slice = torch.tensor([2.7766, -10.3558, -14.9149, -0.9376, -14.9175, -17.7679, -5.5565, -12.9521, -12.9845])
        # fmt: on
@@ -583,11 +590,11 @@ class PipelineTesterMixin(unittest.TestCase):
        model = UNetModel(ch=32, ch_mult=(1, 2), num_res_blocks=2, attn_resolutions=(16,), resolution=32)
        schedular = DDPMScheduler(timesteps=10)

-        ddpm = DDPM(model, schedular)
+        ddpm = DDPMPipeline(model, schedular)

        with tempfile.TemporaryDirectory() as tmpdirname:
            ddpm.save_pretrained(tmpdirname)
-            new_ddpm = DDPM.from_pretrained(tmpdirname)
+            new_ddpm = DDPMPipeline.from_pretrained(tmpdirname)

        generator = torch.manual_seed(0)

@@ -601,7 +608,7 @@ class PipelineTesterMixin(unittest.TestCase):
    def test_from_pretrained_hub(self):
        model_path = "fusing/ddpm-cifar10"

-        ddpm = DDPM.from_pretrained(model_path)
+        ddpm = DDPMPipeline.from_pretrained(model_path)
        ddpm_from_hub = DiffusionPipeline.from_pretrained(model_path)

        ddpm.noise_scheduler.num_timesteps = 10
@@ -624,7 +631,7 @@ class PipelineTesterMixin(unittest.TestCase):
        noise_scheduler = DDPMScheduler.from_config(model_id)
        noise_scheduler = noise_scheduler.set_format("pt")

-        ddpm = DDPM(unet=unet, noise_scheduler=noise_scheduler)
+        ddpm = DDPMPipeline(unet=unet, noise_scheduler=noise_scheduler)
        image = ddpm(generator=generator)

        image_slice = image[0, -1, -3:, -3:].cpu()
@@ -641,7 +648,7 @@ class PipelineTesterMixin(unittest.TestCase):
        unet = UNetModel.from_pretrained(model_id)
        noise_scheduler = DDIMScheduler(tensor_format="pt")

-        ddim = DDIM(unet=unet, noise_scheduler=noise_scheduler)
+        ddim = DDIMPipeline(unet=unet, noise_scheduler=noise_scheduler)
        image = ddim(generator=generator, eta=0.0)

        image_slice = image[0, -1, -3:, -3:].cpu()
@@ -660,7 +667,7 @@ class PipelineTesterMixin(unittest.TestCase):
        unet = UNetModel.from_pretrained(model_id)
        noise_scheduler = PNDMScheduler(tensor_format="pt")

-        pndm = PNDM(unet=unet, noise_scheduler=noise_scheduler)
+        pndm = PNDMPipeline(unet=unet, noise_scheduler=noise_scheduler)
        image = pndm(generator=generator)

        image_slice = image[0, -1, -3:, -3:].cpu()
@@ -672,9 +679,10 @@ class PipelineTesterMixin(unittest.TestCase):
        assert (image_slice.flatten() - expected_slice).abs().max() < 1e-2

    @slow
+    @unittest.skip("Skipping for now as it takes too long")
    def test_ldm_text2img(self):
        model_id = "fusing/latent-diffusion-text2im-large"
-        ldm = LatentDiffusion.from_pretrained(model_id)
+        ldm = LatentDiffusionPipeline.from_pretrained(model_id)

        prompt = "A painting of a squirrel eating a burger"
        generator = torch.manual_seed(0)
@@ -686,10 +694,25 @@ class PipelineTesterMixin(unittest.TestCase):
        expected_slice = torch.tensor([0.7295, 0.7358, 0.7256, 0.7435, 0.7095, 0.6884, 0.7325, 0.6921, 0.6458])
        assert (image_slice.flatten() - expected_slice).abs().max() < 1e-2

+    @slow
+    def test_ldm_text2img_fast(self):
+        model_id = "fusing/latent-diffusion-text2im-large"
+        ldm = LatentDiffusionPipeline.from_pretrained(model_id)
+
+        prompt = "A painting of a squirrel eating a burger"
+        generator = torch.manual_seed(0)
+        image = ldm([prompt], generator=generator, num_inference_steps=1)
+
+        image_slice = image[0, -1, -3:, -3:].cpu()
+
+        assert image.shape == (1, 3, 256, 256)
+        expected_slice = torch.tensor([0.3163, 0.8670, 0.6465, 0.1865, 0.6291, 0.5139, 0.2824, 0.3723, 0.4344])
+        assert (image_slice.flatten() - expected_slice).abs().max() < 1e-2
+
    @slow
    def test_glide_text2img(self):
        model_id = "fusing/glide-base"
-        glide = Glide.from_pretrained(model_id)
+        glide = GlidePipeline.from_pretrained(model_id)

        prompt = "a pencil sketch of a corgi"
        generator = torch.manual_seed(0)
@@ -704,22 +727,61 @@ class PipelineTesterMixin(unittest.TestCase):
    @slow
    def test_grad_tts(self):
        model_id = "fusing/grad-tts-libri-tts"
-        grad_tts = GradTTS.from_pretrained(model_id)
+        grad_tts = GradTTSPipeline.from_pretrained(model_id)
+        noise_scheduler = GradTTSScheduler()
+        grad_tts.noise_scheduler = noise_scheduler

        text = "Hello world, I missed you so much."
+        generator = torch.manual_seed(0)

        # generate mel spectograms using text
-        mel_spec = grad_tts(text)
+        mel_spec = grad_tts(text, generator=generator)

-        assert mel_spec.shape == (1, 256, 256, 3)
-        expected_slice = torch.tensor([0.7119, 0.7073, 0.6460, 0.7780, 0.7423, 0.6926, 0.7378, 0.7189, 0.7784])
-        assert (mel_spec.flatten() - expected_slice).abs().max() < 1e-2
+        assert mel_spec.shape == (1, 80, 143)
+        expected_slice = torch.tensor(
+            [-6.7584, -6.8347, -6.3293, -6.6437, -6.7233, -6.4684, -6.1187, -6.3172, -6.6890]
+        )
+        assert (mel_spec[0, :3, :3].cpu().flatten() - expected_slice).abs().max() < 1e-2
+
+    @slow
+    def test_score_sde_ve_pipeline(self):
+        torch.manual_seed(0)
+
+        model = NCSNpp.from_pretrained("fusing/ffhq_ncsnpp")
+        scheduler = ScoreSdeVeScheduler.from_config("fusing/ffhq_ncsnpp")
+
+        sde_ve = ScoreSdeVePipeline(model=model, scheduler=scheduler)
+
+        image = sde_ve(num_inference_steps=2)
+
+        expected_image_sum = 3382810112.0
+        expected_image_mean = 1075.366455078125
+
+        assert (image.abs().sum() - expected_image_sum).abs().cpu().item() < 1e-2
+        assert (image.abs().mean() - expected_image_mean).abs().cpu().item() < 1e-4
+
+    @slow
+    def test_score_sde_vp_pipeline(self):
+
+        model = NCSNpp.from_pretrained("fusing/cifar10-ddpmpp-vp")
+        scheduler = ScoreSdeVpScheduler.from_config("fusing/cifar10-ddpmpp-vp")
+
+        sde_vp = ScoreSdeVpPipeline(model=model, scheduler=scheduler)
+
+        torch.manual_seed(0)
+        image = sde_vp(num_inference_steps=10)
+
+        expected_image_sum = 4183.2012
+        expected_image_mean = 1.3617
+
+        assert (image.abs().sum() - expected_image_sum).abs().cpu().item() < 1e-2
+        assert (image.abs().mean() - expected_image_mean).abs().cpu().item() < 1e-4

    def test_module_from_pipeline(self):
        model = DiffWave(num_res_layers=4)
        noise_scheduler = DDPMScheduler(timesteps=12)

-        bddm = BDDM(model, noise_scheduler)
+        bddm = BDDMPipeline(model, noise_scheduler)

        # check if the library name for the diffwave moduel is set to pipeline module
        self.assertTrue(bddm.config["diffwave"][0] == "pipeline_bddm")
@@ -727,6 +789,6 @@ class PipelineTesterMixin(unittest.TestCase):
        # check if we can save and load the pipeline
        with tempfile.TemporaryDirectory() as tmpdirname:
            bddm.save_pretrained(tmpdirname)
-            _ = BDDM.from_pretrained(tmpdirname)
+            _ = BDDMPipeline.from_pretrained(tmpdirname)
            # check if the same works using the DifusionPipeline class
            _ = DiffusionPipeline.from_pretrained(tmpdirname)