The big purge -> remove everything except vision for now

2a69c0b7 · Patrick von Platen · c8c0c0e8 · c8c0c0e8 · 2a69c0b7 · 2a69c0b7
Commit 2a69c0b7 authored Jul 13, 2022 by Patrick von Platen
14 changed files
--- a/scripts/conversion_bddm.py
+++ b/scripts/conversion_bddm.py
-import argparse
-import torch
-from diffusers.pipelines.bddm import DiffWave, BDDMPipeline
-from diffusers import DDPMScheduler
-def convert_bddm_orginal(checkpoint_path, noise_scheduler_checkpoint_path, output_path):
-    sd = torch.load(checkpoint_path, map_location="cpu")["model_state_dict"]
-    noise_scheduler_sd = torch.load(noise_scheduler_checkpoint_path, map_location="cpu")
-    model = DiffWave()
-    model.load_state_dict(sd, strict=False)
-    ts, _, betas, _ = noise_scheduler_sd
-    ts, betas = list(ts.numpy().tolist()), list(betas.numpy().tolist())
-    noise_scheduler = DDPMScheduler(
-        timesteps=12,
-        trained_betas=betas,
-        timestep_values=ts,
-        clip_sample=False,
-        tensor_format="np",
-    )
-    pipeline = BDDMPipeline(model, noise_scheduler)
-    pipeline.save_pretrained(output_path)
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--checkpoint_path", type=str, required=True)
-    parser.add_argument("--noise_scheduler_checkpoint_path", type=str, required=True)
-    parser.add_argument("--output_path", type=str, required=True)
-    args = parser.parse_args()
-    convert_bddm_orginal(args.checkpoint_path, args.noise_scheduler_checkpoint_path, args.output_path)
--- a/src/diffusers/__init__.py
+++ b/src/diffusers/__init__.py
@@ -7,10 +7,9 @@ from .utils import is_inflect_available, is_transformers_available, is_unidecode
 __version__ = "0.0.4"
 from .modeling_utils import ModelMixin
-from .models import AutoencoderKL, NCSNpp, TemporalUNet, UNetLDMModel, UNetModel, UNetUnconditionalModel, VQModel
+from .models import AutoencoderKL, NCSNpp, UNetLDMModel, UNetModel, UNetUnconditionalModel, VQModel
 from .pipeline_utils import DiffusionPipeline
 from .pipelines import (
-    BDDMPipeline,
    DDIMPipeline,
    DDPMPipeline,
    LatentDiffusionUncondPipeline,
@@ -21,7 +20,6 @@ from .pipelines import (
 from .schedulers import (
    DDIMScheduler,
    DDPMScheduler,
-    GradTTSScheduler,
    PNDMScheduler,
    SchedulerMixin,
    ScoreSdeVeScheduler,
@@ -31,13 +29,6 @@ from .schedulers import (
 if is_transformers_available():
    from .models.unet_glide import GlideSuperResUNetModel, GlideTextToImageUNetModel, GlideUNetModel
-    from .models.unet_grad_tts import UNetGradTTSModel
    from .pipelines import GlidePipeline, LatentDiffusionPipeline
 else:
    from .utils.dummy_transformers_objects import *
-if is_transformers_available() and is_inflect_available() and is_unidecode_available():
-    from .pipelines import GradTTSPipeline
-else:
-    from .utils.dummy_transformers_and_inflect_and_unidecode_objects import *
--- a/src/diffusers/models/__init__.py
+++ b/src/diffusers/models/__init__.py
@@ -18,9 +18,7 @@
 from .unet import UNetModel
 from .unet_glide import GlideSuperResUNetModel, GlideTextToImageUNetModel, GlideUNetModel
-from .unet_grad_tts import UNetGradTTSModel
 from .unet_ldm import UNetLDMModel
-from .unet_rl import TemporalUNet
 from .unet_sde_score_estimation import NCSNpp
 from .unet_unconditional import UNetUnconditionalModel
 from .vae import AutoencoderKL, VQModel
--- a/src/diffusers/models/unet_grad_tts.py
+++ b/src/diffusers/models/unet_grad_tts.py
-import torch
-from ..configuration_utils import ConfigMixin
-from ..modeling_utils import ModelMixin
-from .attention import LinearAttention
-from .embeddings import get_timestep_embedding
-from .resnet import Downsample2D, ResnetBlock2D, Upsample2D
-from .unet_new import UNetMidBlock2D
-class Mish(torch.nn.Module):
-    def forward(self, x):
-        return x * torch.tanh(torch.nn.functional.softplus(x))
-class Rezero(torch.nn.Module):
-    def __init__(self, fn):
-        super(Rezero, self).__init__()
-        self.fn = fn
-        self.g = torch.nn.Parameter(torch.zeros(1))
-    def forward(self, x, encoder_out=None):
-        return self.fn(x, encoder_out) * self.g
-class Block(torch.nn.Module):
-    def __init__(self, dim, dim_out, groups=8):
-        super(Block, self).__init__()
-        self.block = torch.nn.Sequential(
-            torch.nn.Conv2d(dim, dim_out, 3, padding=1), torch.nn.GroupNorm(groups, dim_out), Mish()
-        )
-    def forward(self, x, mask):
-        output = self.block(x * mask)
-        return output * mask
-class Residual(torch.nn.Module):
-    def __init__(self, fn):
-        super(Residual, self).__init__()
-        self.fn = fn
-    def forward(self, x, *args, **kwargs):
-        output = self.fn(x, *args, **kwargs) + x
-        return output
-class UNetGradTTSModel(ModelMixin, ConfigMixin):
-    def __init__(self, dim, dim_mults=(1, 2, 4), groups=8, n_spks=None, spk_emb_dim=64, n_feats=80, pe_scale=1000):
-        super(UNetGradTTSModel, self).__init__()
-        self.register_to_config(
-            dim=dim,
-            dim_mults=dim_mults,
-            groups=groups,
-            n_spks=n_spks,
-            spk_emb_dim=spk_emb_dim,
-            n_feats=n_feats,
-            pe_scale=pe_scale,
-        )
-        self.dim = dim
-        self.dim_mults = dim_mults
-        self.groups = groups
-        self.n_spks = n_spks if not isinstance(n_spks, type(None)) else 1
-        self.spk_emb_dim = spk_emb_dim
-        self.pe_scale = pe_scale
-        if n_spks > 1:
-            self.spk_emb = torch.nn.Embedding(n_spks, spk_emb_dim)
-            self.spk_mlp = torch.nn.Sequential(
-                torch.nn.Linear(spk_emb_dim, spk_emb_dim * 4), Mish(), torch.nn.Linear(spk_emb_dim * 4, n_feats)
-            )
-        self.mlp = torch.nn.Sequential(torch.nn.Linear(dim, dim * 4), Mish(), torch.nn.Linear(dim * 4, dim))
-        dims = [2 + (1 if n_spks > 1 else 0), *map(lambda m: dim * m, dim_mults)]
-        in_out = list(zip(dims[:-1], dims[1:]))
-        self.downs = torch.nn.ModuleList([])
-        self.ups = torch.nn.ModuleList([])
-        num_resolutions = len(in_out)
-        for ind, (dim_in, dim_out) in enumerate(in_out):
-            is_last = ind >= (num_resolutions - 1)
-            self.downs.append(
-                torch.nn.ModuleList(
-                    [
-                        ResnetBlock2D(
-                            in_channels=dim_in,
-                            out_channels=dim_out,
-                            temb_channels=dim,
-                            groups=8,
-                            pre_norm=False,
-                            eps=1e-5,
-                            non_linearity="mish",
-                            overwrite_for_grad_tts=True,
-                        ),
-                        ResnetBlock2D(
-                            in_channels=dim_out,
-                            out_channels=dim_out,
-                            temb_channels=dim,
-                            groups=8,
-                            pre_norm=False,
-                            eps=1e-5,
-                            non_linearity="mish",
-                            overwrite_for_grad_tts=True,
-                        ),
-                        Residual(Rezero(LinearAttention(dim_out))),
-                        Downsample2D(dim_out, use_conv=True, padding=1) if not is_last else torch.nn.Identity(),
-                    ]
-                )
-            )
-        mid_dim = dims[-1]
-        self.mid = UNetMidBlock2D(
-            in_channels=mid_dim,
-            temb_channels=dim,
-            resnet_groups=8,
-            resnet_pre_norm=False,
-            resnet_eps=1e-5,
-            resnet_act_fn="mish",
-            attention_layer_type="linear",
-        )
-        self.mid_block1 = ResnetBlock2D(
-            in_channels=mid_dim,
-            out_channels=mid_dim,
-            temb_channels=dim,
-            groups=8,
-            pre_norm=False,
-            eps=1e-5,
-            non_linearity="mish",
-            overwrite_for_grad_tts=True,
-        )
-        self.mid_attn = Residual(Rezero(LinearAttention(mid_dim)))
-        self.mid_block2 = ResnetBlock2D(
-            in_channels=mid_dim,
-            out_channels=mid_dim,
-            temb_channels=dim,
-            groups=8,
-            pre_norm=False,
-            eps=1e-5,
-            non_linearity="mish",
-            overwrite_for_grad_tts=True,
-        )
-        self.mid.resnets[0] = self.mid_block1
-        self.mid.attentions[0] = self.mid_attn
-        self.mid.resnets[1] = self.mid_block2
-        for ind, (dim_in, dim_out) in enumerate(reversed(in_out[1:])):
-            self.ups.append(
-                torch.nn.ModuleList(
-                    [
-                        ResnetBlock2D(
-                            in_channels=dim_out * 2,
-                            out_channels=dim_in,
-                            temb_channels=dim,
-                            groups=8,
-                            pre_norm=False,
-                            eps=1e-5,
-                            non_linearity="mish",
-                            overwrite_for_grad_tts=True,
-                        ),
-                        ResnetBlock2D(
-                            in_channels=dim_in,
-                            out_channels=dim_in,
-                            temb_channels=dim,
-                            groups=8,
-                            pre_norm=False,
-                            eps=1e-5,
-                            non_linearity="mish",
-                            overwrite_for_grad_tts=True,
-                        ),
-                        Residual(Rezero(LinearAttention(dim_in))),
-                        Upsample2D(dim_in, use_conv_transpose=True),
-                    ]
-                )
-            )
-        self.final_block = Block(dim, dim)
-        self.final_conv = torch.nn.Conv2d(dim, 1, 1)
-    def forward(self, sample, timesteps, mu, mask, spk=None):
-        x = sample
-        if self.n_spks > 1:
-            # Get speaker embedding
-            spk = self.spk_emb(spk)
-        if not isinstance(spk, type(None)):
-            s = self.spk_mlp(spk)
-        t = get_timestep_embedding(timesteps, self.dim, scale=self.pe_scale)
-        t = self.mlp(t)
-        if self.n_spks < 2:
-            x = torch.stack([mu, x], 1)
-        else:
-            s = s.unsqueeze(-1).repeat(1, 1, x.shape[-1])
-            x = torch.stack([mu, x, s], 1)
-        mask = mask.unsqueeze(1)
-        hiddens = []
-        masks = [mask]
-        for resnet1, resnet2, attn, downsample in self.downs:
-            mask_down = masks[-1]
-            x = resnet1(x, t, mask_down)
-            x = resnet2(x, t, mask_down)
-            x = attn(x)
-            hiddens.append(x)
-            x = downsample(x * mask_down)
-            masks.append(mask_down[:, :, :, ::2])
-        masks = masks[:-1]
-        mask_mid = masks[-1]
-        x = self.mid(x, t, mask=mask_mid)
-        for resnet1, resnet2, attn, upsample in self.ups:
-            mask_up = masks.pop()
-            x = torch.cat((x, hiddens.pop()), dim=1)
-            x = resnet1(x, t, mask_up)
-            x = resnet2(x, t, mask_up)
-            x = attn(x)
-            x = upsample(x * mask_up)
-        x = self.final_block(x, mask)
-        output = self.final_conv(x * mask)
-        return (output * mask).squeeze(1)
--- a/src/diffusers/models/unet_rl.py
+++ b/src/diffusers/models/unet_rl.py
-# model adapted from diffuser https://github.com/jannerm/diffuser/blob/main/diffuser/models/temporal.py
-import torch
-import torch.nn as nn
-from ..configuration_utils import ConfigMixin
-from ..modeling_utils import ModelMixin
-from .embeddings import get_timestep_embedding
-from .resnet import Downsample1D, ResidualTemporalBlock, Upsample1D
-class SinusoidalPosEmb(nn.Module):
-    def __init__(self, dim):
-        super().__init__()
-        self.dim = dim
-    def forward(self, x):
-        return get_timestep_embedding(x, self.dim)
-class RearrangeDim(nn.Module):
-    def __init__(self):
-        super().__init__()
-    def forward(self, tensor):
-        if len(tensor.shape) == 2:
-            return tensor[:, :, None]
-        if len(tensor.shape) == 3:
-            return tensor[:, :, None, :]
-        elif len(tensor.shape) == 4:
-            return tensor[:, :, 0, :]
-        else:
-            raise ValueError(f"`len(tensor)`: {len(tensor)} has to be 2, 3 or 4.")
-class Conv1dBlock(nn.Module):
-    """
-    Conv1d --> GroupNorm --> Mish
-    """
-    def __init__(self, inp_channels, out_channels, kernel_size, n_groups=8):
-        super().__init__()
-        self.block = nn.Sequential(
-            nn.Conv1d(inp_channels, out_channels, kernel_size, padding=kernel_size // 2),
-            RearrangeDim(),
-            #            Rearrange("batch channels horizon -> batch channels 1 horizon"),
-            nn.GroupNorm(n_groups, out_channels),
-            RearrangeDim(),
-            #            Rearrange("batch channels 1 horizon -> batch channels horizon"),
-            nn.Mish(),
-        )
-    def forward(self, x):
-        return self.block(x)
-class TemporalUNet(ModelMixin, ConfigMixin):  # (nn.Module):
-    def __init__(
-        self,
-        training_horizon=128,
-        transition_dim=14,
-        cond_dim=3,
-        predict_epsilon=False,
-        clip_denoised=True,
-        dim=32,
-        dim_mults=(1, 4, 8),
-    ):
-        super().__init__()
-        self.transition_dim = transition_dim
-        self.cond_dim = cond_dim
-        self.predict_epsilon = predict_epsilon
-        self.clip_denoised = clip_denoised
-        dims = [transition_dim, *map(lambda m: dim * m, dim_mults)]
-        in_out = list(zip(dims[:-1], dims[1:]))
-        time_dim = dim
-        self.time_mlp = nn.Sequential(
-            SinusoidalPosEmb(dim),
-            nn.Linear(dim, dim * 4),
-            nn.Mish(),
-            nn.Linear(dim * 4, dim),
-        )
-        self.downs = nn.ModuleList([])
-        self.ups = nn.ModuleList([])
-        num_resolutions = len(in_out)
-        for ind, (dim_in, dim_out) in enumerate(in_out):
-            is_last = ind >= (num_resolutions - 1)
-            self.downs.append(
-                nn.ModuleList(
-                    [
-                        ResidualTemporalBlock(dim_in, dim_out, embed_dim=time_dim, horizon=training_horizon),
-                        ResidualTemporalBlock(dim_out, dim_out, embed_dim=time_dim, horizon=training_horizon),
-                        Downsample1D(dim_out, use_conv=True) if not is_last else nn.Identity(),
-                    ]
-                )
-            )
-            if not is_last:
-                training_horizon = training_horizon // 2
-        mid_dim = dims[-1]
-        self.mid_block1 = ResidualTemporalBlock(mid_dim, mid_dim, embed_dim=time_dim, horizon=training_horizon)
-        self.mid_block2 = ResidualTemporalBlock(mid_dim, mid_dim, embed_dim=time_dim, horizon=training_horizon)
-        for ind, (dim_in, dim_out) in enumerate(reversed(in_out[1:])):
-            is_last = ind >= (num_resolutions - 1)
-            self.ups.append(
-                nn.ModuleList(
-                    [
-                        ResidualTemporalBlock(dim_out * 2, dim_in, embed_dim=time_dim, horizon=training_horizon),
-                        ResidualTemporalBlock(dim_in, dim_in, embed_dim=time_dim, horizon=training_horizon),
-                        Upsample1D(dim_in, use_conv_transpose=True) if not is_last else nn.Identity(),
-                    ]
-                )
-            )
-            if not is_last:
-                training_horizon = training_horizon * 2
-        self.final_conv = nn.Sequential(
-            Conv1dBlock(dim, dim, kernel_size=5),
-            nn.Conv1d(dim, transition_dim, 1),
-        )
-    def forward(self, sample, timesteps):
-        """
-        x : [ batch x horizon x transition ]
-        """
-        x = sample
-        x = x.permute(0, 2, 1)
-        t = self.time_mlp(timesteps)
-        h = []
-        for resnet, resnet2, downsample in self.downs:
-            x = resnet(x, t)
-            x = resnet2(x, t)
-            h.append(x)
-            x = downsample(x)
-        x = self.mid_block1(x, t)
-        x = self.mid_block2(x, t)
-        for resnet, resnet2, upsample in self.ups:
-            x = torch.cat((x, h.pop()), dim=1)
-            x = resnet(x, t)
-            x = resnet2(x, t)
-            x = upsample(x)
-        x = self.final_conv(x)
-        x = x.permute(0, 2, 1)
-        return x
-class TemporalValue(nn.Module):
-    def __init__(
-        self,
-        horizon,
-        transition_dim,
-        cond_dim,
-        dim=32,
-        time_dim=None,
-        out_dim=1,
-        dim_mults=(1, 2, 4, 8),
-    ):
-        super().__init__()
-        dims = [transition_dim, *map(lambda m: dim * m, dim_mults)]
-        in_out = list(zip(dims[:-1], dims[1:]))
-        time_dim = time_dim or dim
-        self.time_mlp = nn.Sequential(
-            SinusoidalPosEmb(dim),
-            nn.Linear(dim, dim * 4),
-            nn.Mish(),
-            nn.Linear(dim * 4, dim),
-        )
-        self.blocks = nn.ModuleList([])
-        print(in_out)
-        for dim_in, dim_out in in_out:
-            self.blocks.append(
-                nn.ModuleList(
-                    [
-                        ResidualTemporalBlock(dim_in, dim_out, kernel_size=5, embed_dim=time_dim, horizon=horizon),
-                        ResidualTemporalBlock(dim_out, dim_out, kernel_size=5, embed_dim=time_dim, horizon=horizon),
-                        Downsample1d(dim_out),
-                    ]
-                )
-            )
-            horizon = horizon // 2
-        fc_dim = dims[-1] * max(horizon, 1)
-        self.final_block = nn.Sequential(
-            nn.Linear(fc_dim + time_dim, fc_dim // 2),
-            nn.Mish(),
-            nn.Linear(fc_dim // 2, out_dim),
-        )
-    def forward(self, x, cond, time, *args):
-        """
-        x : [ batch x horizon x transition ]
-        """
-        x = x.permute(0, 2, 1)
-        t = self.time_mlp(time)
-        for resnet, resnet2, downsample in self.blocks:
-            x = resnet(x, t)
-            x = resnet2(x, t)
-            x = downsample(x)
-        x = x.view(len(x), -1)
-        out = self.final_block(torch.cat([x, t], dim=-1))
-        return out
--- a/src/diffusers/pipelines/__init__.py
+++ b/src/diffusers/pipelines/__init__.py
 from ..utils import is_inflect_available, is_transformers_available, is_unidecode_available
-from .bddm import BDDMPipeline
 from .ddim import DDIMPipeline
 from .ddpm import DDPMPipeline
 from .latent_diffusion_uncond import LatentDiffusionUncondPipeline
@@ -11,7 +10,3 @@ from .score_sde_vp import ScoreSdeVpPipeline
 if is_transformers_available():
    from .glide import GlidePipeline
    from .latent_diffusion import LatentDiffusionPipeline
-if is_transformers_available() and is_unidecode_available() and is_inflect_available():
-    from .grad_tts import GradTTSPipeline
--- a/src/diffusers/pipelines/bddm/__init__.py
+++ b/src/diffusers/pipelines/bddm/__init__.py
-from .pipeline_bddm import BDDMPipeline, DiffWave
--- a/src/diffusers/pipelines/bddm/pipeline_bddm.py
+++ b/src/diffusers/pipelines/bddm/pipeline_bddm.py
-#!/bin/env python
-# -*- coding: utf-8 -*-
-########################################################################
-#
-#  DiffWave: A Versatile Diffusion Model for Audio Synthesis
-#  (https://arxiv.org/abs/2009.09761)
-#  Modified from https://github.com/philsyn/DiffWave-Vocoder
-#
-#  Author: Max W. Y. Lam (maxwylam@tencent.com)
-#  Copyright (c) 2021Tencent. All Rights Reserved
-#
-########################################################################
-import math
-import numpy as np
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-import tqdm
-from ...configuration_utils import ConfigMixin
-from ...modeling_utils import ModelMixin
-from ...pipeline_utils import DiffusionPipeline
-def calc_diffusion_step_embedding(diffusion_steps, diffusion_step_embed_dim_in):
-    """
-    Embed a diffusion step $t$ into a higher dimensional space
-        E.g. the embedding vector in the 128-dimensional space is [sin(t * 10^(0*4/63)), ... , sin(t * 10^(63*4/63)),
-         cos(t * 10^(0*4/63)), ... , cos(t * 10^(63*4/63))]
-    Parameters:
-        diffusion_steps (torch.long tensor, shape=(batchsize, 1)):
-                                    diffusion steps for batch data
-        diffusion_step_embed_dim_in (int, default=128):
-                                    dimensionality of the embedding space for discrete diffusion steps
-    Returns:
-        the embedding vectors (torch.tensor, shape=(batchsize, diffusion_step_embed_dim_in)):
-    """
-    assert diffusion_step_embed_dim_in % 2 == 0
-    half_dim = diffusion_step_embed_dim_in // 2
-    _embed = np.log(10000) / (half_dim - 1)
-    _embed = torch.exp(torch.arange(half_dim) * -_embed).cuda()
-    _embed = diffusion_steps * _embed
-    diffusion_step_embed = torch.cat((torch.sin(_embed), torch.cos(_embed)), 1)
-    return diffusion_step_embed
-"""
-Below scripts were borrowed from https://github.com/philsyn/DiffWave-Vocoder/blob/master/WaveNet.py
-"""
-def swish(x):
-    return x * torch.sigmoid(x)
-# dilated conv layer with kaiming_normal initialization
-# from https://github.com/ksw0306/FloWaveNet/blob/master/modules.py
-class Conv(nn.Module):
-    def __init__(self, in_channels, out_channels, kernel_size=3, dilation=1):
-        super().__init__()
-        self.padding = dilation * (kernel_size - 1) // 2
-        self.conv = nn.Conv1d(in_channels, out_channels, kernel_size, dilation=dilation, padding=self.padding)
-        self.conv = nn.utils.weight_norm(self.conv)
-        nn.init.kaiming_normal_(self.conv.weight)
-    def forward(self, x):
-        out = self.conv(x)
-        return out
-# conv1x1 layer with zero initialization
-# from https://github.com/ksw0306/FloWaveNet/blob/master/modules.py but the scale parameter is removed
-class ZeroConv1d(nn.Module):
-    def __init__(self, in_channel, out_channel):
-        super().__init__()
-        self.conv = nn.Conv1d(in_channel, out_channel, kernel_size=1, padding=0)
-        self.conv.weight.data.zero_()
-        self.conv.bias.data.zero_()
-    def forward(self, x):
-        out = self.conv(x)
-        return out
-# every residual block (named residual layer in paper)
-# contains one noncausal dilated conv
-class ResidualBlock(nn.Module):
-    def __init__(self, res_channels, skip_channels, dilation, diffusion_step_embed_dim_out):
-        super().__init__()
-        self.res_channels = res_channels
-        # Use a FC layer for diffusion step embedding
-        self.fc_t = nn.Linear(diffusion_step_embed_dim_out, self.res_channels)
-        # Dilated conv layer
-        self.dilated_conv_layer = Conv(self.res_channels, 2 * self.res_channels, kernel_size=3, dilation=dilation)
-        # Add mel spectrogram upsampler and conditioner conv1x1 layer
-        self.upsample_conv2d = nn.ModuleList()
-        for s in [16, 16]:
-            conv_trans2d = nn.ConvTranspose2d(1, 1, (3, 2 * s), padding=(1, s // 2), stride=(1, s))
-            conv_trans2d = nn.utils.weight_norm(conv_trans2d)
-            nn.init.kaiming_normal_(conv_trans2d.weight)
-            self.upsample_conv2d.append(conv_trans2d)
-        # 80 is mel bands
-        self.mel_conv = Conv(80, 2 * self.res_channels, kernel_size=1)
-        # Residual conv1x1 layer, connect to next residual layer
-        self.res_conv = nn.Conv1d(res_channels, res_channels, kernel_size=1)
-        self.res_conv = nn.utils.weight_norm(self.res_conv)
-        nn.init.kaiming_normal_(self.res_conv.weight)
-        # Skip conv1x1 layer, add to all skip outputs through skip connections
-        self.skip_conv = nn.Conv1d(res_channels, skip_channels, kernel_size=1)
-        self.skip_conv = nn.utils.weight_norm(self.skip_conv)
-        nn.init.kaiming_normal_(self.skip_conv.weight)
-    def forward(self, input_data):
-        x, mel_spec, diffusion_step_embed = input_data
-        h = x
-        batch_size, n_channels, seq_len = x.shape
-        assert n_channels == self.res_channels
-        # Add in diffusion step embedding
-        part_t = self.fc_t(diffusion_step_embed)
-        part_t = part_t.view([batch_size, self.res_channels, 1])
-        h += part_t
-        # Dilated conv layer
-        h = self.dilated_conv_layer(h)
-        # Upsample2D spectrogram to size of audio
-        mel_spec = torch.unsqueeze(mel_spec, dim=1)
-        mel_spec = F.leaky_relu(self.upsample_conv2d[0](mel_spec), 0.4, inplace=False)
-        mel_spec = F.leaky_relu(self.upsample_conv2d[1](mel_spec), 0.4, inplace=False)
-        mel_spec = torch.squeeze(mel_spec, dim=1)
-        assert mel_spec.size(2) >= seq_len
-        if mel_spec.size(2) > seq_len:
-            mel_spec = mel_spec[:, :, :seq_len]
-        mel_spec = self.mel_conv(mel_spec)
-        h += mel_spec
-        # Gated-tanh nonlinearity
-        out = torch.tanh(h[:, : self.res_channels, :]) * torch.sigmoid(h[:, self.res_channels :, :])
-        # Residual and skip outputs
-        res = self.res_conv(out)
-        assert x.shape == res.shape
-        skip = self.skip_conv(out)
-        # Normalize for training stability
-        return (x + res) * math.sqrt(0.5), skip
-class ResidualGroup(nn.Module):
-    def __init__(
-        self,
-        res_channels,
-        skip_channels,
-        num_res_layers,
-        dilation_cycle,
-        diffusion_step_embed_dim_in,
-        diffusion_step_embed_dim_mid,
-        diffusion_step_embed_dim_out,
-    ):
-        super().__init__()
-        self.num_res_layers = num_res_layers
-        self.diffusion_step_embed_dim_in = diffusion_step_embed_dim_in
-        # Use the shared two FC layers for diffusion step embedding
-        self.fc_t1 = nn.Linear(diffusion_step_embed_dim_in, diffusion_step_embed_dim_mid)
-        self.fc_t2 = nn.Linear(diffusion_step_embed_dim_mid, diffusion_step_embed_dim_out)
-        # Stack all residual blocks with dilations 1, 2, ... , 512, ... , 1, 2, ..., 512
-        self.residual_blocks = nn.ModuleList()
-        for n in range(self.num_res_layers):
-            self.residual_blocks.append(
-                ResidualBlock(
-                    res_channels,
-                    skip_channels,
-                    dilation=2 ** (n % dilation_cycle),
-                    diffusion_step_embed_dim_out=diffusion_step_embed_dim_out,
-                )
-            )
-    def forward(self, input_data):
-        x, mel_spectrogram, diffusion_steps = input_data
-        # Embed diffusion step t
-        diffusion_step_embed = calc_diffusion_step_embedding(diffusion_steps, self.diffusion_step_embed_dim_in)
-        diffusion_step_embed = swish(self.fc_t1(diffusion_step_embed))
-        diffusion_step_embed = swish(self.fc_t2(diffusion_step_embed))
-        # Pass all residual layers
-        h = x
-        skip = 0
-        for n in range(self.num_res_layers):
-            # Use the output from last residual layer
-            h, skip_n = self.residual_blocks[n]((h, mel_spectrogram, diffusion_step_embed))
-            # Accumulate all skip outputs
-            skip += skip_n
-        # Normalize for training stability
-        return skip * math.sqrt(1.0 / self.num_res_layers)
-class DiffWave(ModelMixin, ConfigMixin):
-    def __init__(
-        self,
-        in_channels=1,
-        res_channels=128,
-        skip_channels=128,
-        out_channels=1,
-        num_res_layers=30,
-        dilation_cycle=10,
-        diffusion_step_embed_dim_in=128,
-        diffusion_step_embed_dim_mid=512,
-        diffusion_step_embed_dim_out=512,
-    ):
-        super().__init__()
-        # register all init arguments with self.register
-        self.register_to_config(
-            in_channels=in_channels,
-            res_channels=res_channels,
-            skip_channels=skip_channels,
-            out_channels=out_channels,
-            num_res_layers=num_res_layers,
-            dilation_cycle=dilation_cycle,
-            diffusion_step_embed_dim_in=diffusion_step_embed_dim_in,
-            diffusion_step_embed_dim_mid=diffusion_step_embed_dim_mid,
-            diffusion_step_embed_dim_out=diffusion_step_embed_dim_out,
-        )
-        # Initial conv1x1 with relu
-        self.init_conv = nn.Sequential(Conv(in_channels, res_channels, kernel_size=1), nn.ReLU(inplace=False))
-        # All residual layers
-        self.residual_layer = ResidualGroup(
-            res_channels,
-            skip_channels,
-            num_res_layers,
-            dilation_cycle,
-            diffusion_step_embed_dim_in,
-            diffusion_step_embed_dim_mid,
-            diffusion_step_embed_dim_out,
-        )
-        # Final conv1x1 -> relu -> zeroconv1x1
-        self.final_conv = nn.Sequential(
-            Conv(skip_channels, skip_channels, kernel_size=1),
-            nn.ReLU(inplace=False),
-            ZeroConv1d(skip_channels, out_channels),
-        )
-    def forward(self, input_data):
-        audio, mel_spectrogram, diffusion_steps = input_data
-        x = audio
-        x = self.init_conv(x).clone()
-        x = self.residual_layer((x, mel_spectrogram, diffusion_steps))
-        return self.final_conv(x)
-class BDDMPipeline(DiffusionPipeline):
-    def __init__(self, diffwave, noise_scheduler):
-        super().__init__()
-        noise_scheduler = noise_scheduler.set_format("pt")
-        self.register_modules(diffwave=diffwave, noise_scheduler=noise_scheduler)
-    @torch.no_grad()
-    def __call__(self, mel_spectrogram, generator, torch_device=None):
-        if torch_device is None:
-            torch_device = "cuda" if torch.cuda.is_available() else "cpu"
-        self.diffwave.to(torch_device)
-        mel_spectrogram = mel_spectrogram.to(torch_device)
-        audio_length = mel_spectrogram.size(-1) * 256
-        audio_size = (1, 1, audio_length)
-        # Sample gaussian noise to begin loop
-        audio = torch.normal(0, 1, size=audio_size, generator=generator).to(torch_device)
-        timestep_values = self.noise_scheduler.config.timestep_values
-        num_prediction_steps = len(self.noise_scheduler)
-        for t in tqdm.tqdm(reversed(range(num_prediction_steps)), total=num_prediction_steps):
-            # 1. predict noise residual
-            ts = (torch.tensor(timestep_values[t]) * torch.ones((1, 1))).to(torch_device)
-            residual = self.diffwave((audio, mel_spectrogram, ts))
-            # 2. predict previous mean of audio x_t-1
-            pred_prev_audio = self.noise_scheduler.step(residual, audio, t)
-            # 3. optionally sample variance
-            variance = 0
-            if t > 0:
-                noise = torch.normal(0, 1, size=audio_size, generator=generator).to(torch_device)
-                variance = self.noise_scheduler.get_variance(t).sqrt() * noise
-            # 4. set current audio to prev_audio: x_t -> x_t-1
-            audio = pred_prev_audio + variance
-        return audio
--- a/src/diffusers/pipelines/grad_tts/__init__.py
+++ b/src/diffusers/pipelines/grad_tts/__init__.py
-from ...utils import is_inflect_available, is_transformers_available, is_unidecode_available
-if is_transformers_available() and is_unidecode_available() and is_inflect_available():
-    from .grad_tts_utils import GradTTSTokenizer
-    from .pipeline_grad_tts import GradTTSPipeline, TextEncoder
--- a/src/diffusers/pipelines/grad_tts/grad_tts_utils.py
+++ b/src/diffusers/pipelines/grad_tts/grad_tts_utils.py
-# tokenizer
-import os
-import re
-from shutil import copyfile
-import torch
-import inflect
-from transformers import PreTrainedTokenizer
-from unidecode import unidecode
-valid_symbols = [
-    "AA",
-    "AA0",
-    "AA1",
-    "AA2",
-    "AE",
-    "AE0",
-    "AE1",
-    "AE2",
-    "AH",
-    "AH0",
-    "AH1",
-    "AH2",
-    "AO",
-    "AO0",
-    "AO1",
-    "AO2",
-    "AW",
-    "AW0",
-    "AW1",
-    "AW2",
-    "AY",
-    "AY0",
-    "AY1",
-    "AY2",
-    "B",
-    "CH",
-    "D",
-    "DH",
-    "EH",
-    "EH0",
-    "EH1",
-    "EH2",
-    "ER",
-    "ER0",
-    "ER1",
-    "ER2",
-    "EY",
-    "EY0",
-    "EY1",
-    "EY2",
-    "F",
-    "G",
-    "HH",
-    "IH",
-    "IH0",
-    "IH1",
-    "IH2",
-    "IY",
-    "IY0",
-    "IY1",
-    "IY2",
-    "JH",
-    "K",
-    "L",
-    "M",
-    "N",
-    "NG",
-    "OW",
-    "OW0",
-    "OW1",
-    "OW2",
-    "OY",
-    "OY0",
-    "OY1",
-    "OY2",
-    "P",
-    "R",
-    "S",
-    "SH",
-    "T",
-    "TH",
-    "UH",
-    "UH0",
-    "UH1",
-    "UH2",
-    "UW",
-    "UW0",
-    "UW1",
-    "UW2",
-    "V",
-    "W",
-    "Y",
-    "Z",
-    "ZH",
-]
-_valid_symbol_set = set(valid_symbols)
-def intersperse(lst, item):
-    # Adds blank symbol
-    result = [item] * (len(lst) * 2 + 1)
-    result[1::2] = lst
-    return result
-class CMUDict:
-    def __init__(self, file_or_path, keep_ambiguous=True):
-        if isinstance(file_or_path, str):
-            with open(file_or_path, encoding="latin-1") as f:
-                entries = _parse_cmudict(f)
-        else:
-            entries = _parse_cmudict(file_or_path)
-        if not keep_ambiguous:
-            entries = {word: pron for word, pron in entries.items() if len(pron) == 1}
-        self._entries = entries
-    def __len__(self):
-        return len(self._entries)
-    def lookup(self, word):
-        return self._entries.get(word.upper())
-_alt_re = re.compile(r"\([0-9]+\)")
-def _parse_cmudict(file):
-    cmudict = {}
-    for line in file:
-        if len(line) and (line[0] >= "A" and line[0] <= "Z" or line[0] == "'"):
-            parts = line.split("  ")
-            word = re.sub(_alt_re, "", parts[0])
-            pronunciation = _get_pronunciation(parts[1])
-            if pronunciation:
-                if word in cmudict:
-                    cmudict[word].append(pronunciation)
-                else:
-                    cmudict[word] = [pronunciation]
-    return cmudict
-def _get_pronunciation(s):
-    parts = s.strip().split(" ")
-    for part in parts:
-        if part not in _valid_symbol_set:
-            return None
-    return " ".join(parts)
-_whitespace_re = re.compile(r"\s+")
-_abbreviations = [
-    (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
-    for x in [
-        ("mrs", "misess"),
-        ("mr", "mister"),
-        ("dr", "doctor"),
-        ("st", "saint"),
-        ("co", "company"),
-        ("jr", "junior"),
-        ("maj", "major"),
-        ("gen", "general"),
-        ("drs", "doctors"),
-        ("rev", "reverend"),
-        ("lt", "lieutenant"),
-        ("hon", "honorable"),
-        ("sgt", "sergeant"),
-        ("capt", "captain"),
-        ("esq", "esquire"),
-        ("ltd", "limited"),
-        ("col", "colonel"),
-        ("ft", "fort"),
-    ]
-]
-def expand_abbreviations(text):
-    for regex, replacement in _abbreviations:
-        text = re.sub(regex, replacement, text)
-    return text
-def expand_numbers(text):
-    return normalize_numbers(text)
-def lowercase(text):
-    return text.lower()
-def collapse_whitespace(text):
-    return re.sub(_whitespace_re, " ", text)
-def convert_to_ascii(text):
-    return unidecode(text)
-def basic_cleaners(text):
-    text = lowercase(text)
-    text = collapse_whitespace(text)
-    return text
-def transliteration_cleaners(text):
-    text = convert_to_ascii(text)
-    text = lowercase(text)
-    text = collapse_whitespace(text)
-    return text
-def english_cleaners(text):
-    text = convert_to_ascii(text)
-    text = lowercase(text)
-    text = expand_numbers(text)
-    text = expand_abbreviations(text)
-    text = collapse_whitespace(text)
-    return text
-_inflect = inflect.engine()
-_comma_number_re = re.compile(r"([0-9][0-9\,]+[0-9])")
-_decimal_number_re = re.compile(r"([0-9]+\.[0-9]+)")
-_pounds_re = re.compile(r"£([0-9\,]*[0-9]+)")
-_dollars_re = re.compile(r"\$([0-9\.\,]*[0-9]+)")
-_ordinal_re = re.compile(r"[0-9]+(st|nd|rd|th)")
-_number_re = re.compile(r"[0-9]+")
-def _remove_commas(m):
-    return m.group(1).replace(",", "")
-def _expand_decimal_point(m):
-    return m.group(1).replace(".", " point ")
-def _expand_dollars(m):
-    match = m.group(1)
-    parts = match.split(".")
-    if len(parts) > 2:
-        return match + " dollars"
-    dollars = int(parts[0]) if parts[0] else 0
-    cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0
-    if dollars and cents:
-        dollar_unit = "dollar" if dollars == 1 else "dollars"
-        cent_unit = "cent" if cents == 1 else "cents"
-        return "%s %s, %s %s" % (dollars, dollar_unit, cents, cent_unit)
-    elif dollars:
-        dollar_unit = "dollar" if dollars == 1 else "dollars"
-        return "%s %s" % (dollars, dollar_unit)
-    elif cents:
-        cent_unit = "cent" if cents == 1 else "cents"
-        return "%s %s" % (cents, cent_unit)
-    else:
-        return "zero dollars"
-def _expand_ordinal(m):
-    return _inflect.number_to_words(m.group(0))
-def _expand_number(m):
-    num = int(m.group(0))
-    if num > 1000 and num < 3000:
-        if num == 2000:
-            return "two thousand"
-        elif num > 2000 and num < 2010:
-            return "two thousand " + _inflect.number_to_words(num % 100)
-        elif num % 100 == 0:
-            return _inflect.number_to_words(num // 100) + " hundred"
-        else:
-            return _inflect.number_to_words(num, andword="", zero="oh", group=2).replace(", ", " ")
-    else:
-        return _inflect.number_to_words(num, andword="")
-def normalize_numbers(text):
-    text = re.sub(_comma_number_re, _remove_commas, text)
-    text = re.sub(_pounds_re, r"\1 pounds", text)
-    text = re.sub(_dollars_re, _expand_dollars, text)
-    text = re.sub(_decimal_number_re, _expand_decimal_point, text)
-    text = re.sub(_ordinal_re, _expand_ordinal, text)
-    text = re.sub(_number_re, _expand_number, text)
-    return text
-""" from https://github.com/keithito/tacotron"""
-_pad = "_"
-_punctuation = "!'(),.:;? "
-_special = "-"
-_letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
-# Prepend "@" to ARPAbet symbols to ensure uniqueness:
-_arpabet = ["@" + s for s in valid_symbols]
-# Export all symbols:
-symbols = [_pad] + list(_special) + list(_punctuation) + list(_letters) + _arpabet
-_symbol_to_id = {s: i for i, s in enumerate(symbols)}
-_id_to_symbol = {i: s for i, s in enumerate(symbols)}
-_curly_re = re.compile(r"(.*?)\{(.+?)\}(.*)")
-def get_arpabet(word, dictionary):
-    word_arpabet = dictionary.lookup(word)
-    if word_arpabet is not None:
-        return "{" + word_arpabet[0] + "}"
-    else:
-        return word
-def text_to_sequence(text, cleaner_names=[english_cleaners], dictionary=None):
-    """Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
-    The text can optionally have ARPAbet sequences enclosed in curly braces embedded in it. For example, "Turn left on
-    {HH AW1 S S T AH0 N} Street."
-    Args:
-      text: string to convert to a sequence
-      cleaner_names: names of the cleaner functions to run the text through
-      dictionary: arpabet class with arpabet dictionary
-    Returns:
-      List of integers corresponding to the symbols in the text
-    """
-    sequence = []
-    space = _symbols_to_sequence(" ")
-    # Check for curly braces and treat their contents as ARPAbet:
-    while len(text):
-        m = _curly_re.match(text)
-        if not m:
-            clean_text = _clean_text(text, cleaner_names)
-            if dictionary is not None:
-                clean_text = [get_arpabet(w, dictionary) for w in clean_text.split(" ")]
-                for i in range(len(clean_text)):
-                    t = clean_text[i]
-                    if t.startswith("{"):
-                        sequence += _arpabet_to_sequence(t[1:-1])
-                    else:
-                        sequence += _symbols_to_sequence(t)
-                    sequence += space
-            else:
-                sequence += _symbols_to_sequence(clean_text)
-            break
-        sequence += _symbols_to_sequence(_clean_text(m.group(1), cleaner_names))
-        sequence += _arpabet_to_sequence(m.group(2))
-        text = m.group(3)
-    # remove trailing space
-    if dictionary is not None:
-        sequence = sequence[:-1] if sequence[-1] == space[0] else sequence
-    return sequence
-def sequence_to_text(sequence):
-    """Converts a sequence of IDs back to a string"""
-    result = ""
-    for symbol_id in sequence:
-        if symbol_id in _id_to_symbol:
-            s = _id_to_symbol[symbol_id]
-            # Enclose ARPAbet back in curly braces:
-            if len(s) > 1 and s[0] == "@":
-                s = "{%s}" % s[1:]
-            result += s
-    return result.replace("}{", " ")
-def _clean_text(text, cleaner_names):
-    for cleaner in cleaner_names:
-        text = cleaner(text)
-    return text
-def _symbols_to_sequence(symbols):
-    return [_symbol_to_id[s] for s in symbols if _should_keep_symbol(s)]
-def _arpabet_to_sequence(text):
-    return _symbols_to_sequence(["@" + s for s in text.split()])
-def _should_keep_symbol(s):
-    return s in _symbol_to_id and s != "_" and s != "~"
-VOCAB_FILES_NAMES = {
-    "dict_file": "dict_file.txt",
-}
-class GradTTSTokenizer(PreTrainedTokenizer):
-    vocab_files_names = VOCAB_FILES_NAMES
-    def __init__(self, dict_file, **kwargs):
-        super().__init__(**kwargs)
-        self.cmu = CMUDict(dict_file)
-        self.dict_file = dict_file
-    def __call__(self, text):
-        x = torch.LongTensor(intersperse(text_to_sequence(text, dictionary=self.cmu), len(symbols)))[None]
-        x_lengths = torch.LongTensor([x.shape[-1]])
-        return x, x_lengths
-    def save_vocabulary(self, save_directory: str, filename_prefix=None):
-        dict_file = os.path.join(
-            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["dict_file"]
-        )
-        copyfile(self.dict_file, dict_file)
-        return (dict_file,)
--- a/src/diffusers/pipelines/grad_tts/pipeline_grad_tts.py
+++ b/src/diffusers/pipelines/grad_tts/pipeline_grad_tts.py
-""" from https://github.com/jaywalnut310/glow-tts"""
-import math
-import torch
-from torch import nn
-import tqdm
-from ...configuration_utils import ConfigMixin
-from ...modeling_utils import ModelMixin
-from ...pipeline_utils import DiffusionPipeline
-from .grad_tts_utils import GradTTSTokenizer  # flake8: noqa
-def sequence_mask(length, max_length=None):
-    if max_length is None:
-        max_length = length.max()
-    x = torch.arange(int(max_length), dtype=length.dtype, device=length.device)
-    return x.unsqueeze(0) < length.unsqueeze(1)
-def fix_len_compatibility(length, num_downsamplings_in_unet=2):
-    while True:
-        if length % (2**num_downsamplings_in_unet) == 0:
-            return length
-        length += 1
-def convert_pad_shape(pad_shape):
-    l = pad_shape[::-1]
-    pad_shape = [item for sublist in l for item in sublist]
-    return pad_shape
-def generate_path(duration, mask):
-    device = duration.device
-    b, t_x, t_y = mask.shape
-    cum_duration = torch.cumsum(duration, 1)
-    path = torch.zeros(b, t_x, t_y, dtype=mask.dtype).to(device=device)
-    cum_duration_flat = cum_duration.view(b * t_x)
-    path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype)
-    path = path.view(b, t_x, t_y)
-    path = path - torch.nn.functional.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1]
-    path = path * mask
-    return path
-def duration_loss(logw, logw_, lengths):
-    loss = torch.sum((logw - logw_) ** 2) / torch.sum(lengths)
-    return loss
-class LayerNorm(nn.Module):
-    def __init__(self, channels, eps=1e-4):
-        super(LayerNorm, self).__init__()
-        self.channels = channels
-        self.eps = eps
-        self.gamma = torch.nn.Parameter(torch.ones(channels))
-        self.beta = torch.nn.Parameter(torch.zeros(channels))
-    def forward(self, x):
-        n_dims = len(x.shape)
-        mean = torch.mean(x, 1, keepdim=True)
-        variance = torch.mean((x - mean) ** 2, 1, keepdim=True)
-        x = (x - mean) * torch.rsqrt(variance + self.eps)
-        shape = [1, -1] + [1] * (n_dims - 2)
-        x = x * self.gamma.view(*shape) + self.beta.view(*shape)
-        return x
-class ConvReluNorm(nn.Module):
-    def __init__(self, in_channels, hidden_channels, out_channels, kernel_size, n_layers, p_dropout):
-        super(ConvReluNorm, self).__init__()
-        self.in_channels = in_channels
-        self.hidden_channels = hidden_channels
-        self.out_channels = out_channels
-        self.kernel_size = kernel_size
-        self.n_layers = n_layers
-        self.p_dropout = p_dropout
-        self.conv_layers = torch.nn.ModuleList()
-        self.norm_layers = torch.nn.ModuleList()
-        self.conv_layers.append(torch.nn.Conv1d(in_channels, hidden_channels, kernel_size, padding=kernel_size // 2))
-        self.norm_layers.append(LayerNorm(hidden_channels))
-        self.relu_drop = torch.nn.Sequential(torch.nn.ReLU(), torch.nn.Dropout(p_dropout))
-        for _ in range(n_layers - 1):
-            self.conv_layers.append(
-                torch.nn.Conv1d(hidden_channels, hidden_channels, kernel_size, padding=kernel_size // 2)
-            )
-            self.norm_layers.append(LayerNorm(hidden_channels))
-        self.proj = torch.nn.Conv1d(hidden_channels, out_channels, 1)
-        self.proj.weight.data.zero_()
-        self.proj.bias.data.zero_()
-    def forward(self, x, x_mask):
-        x_org = x
-        for i in range(self.n_layers):
-            x = self.conv_layers[i](x * x_mask)
-            x = self.norm_layers[i](x)
-            x = self.relu_drop(x)
-        x = x_org + self.proj(x)
-        return x * x_mask
-class DurationPredictor(nn.Module):
-    def __init__(self, in_channels, filter_channels, kernel_size, p_dropout):
-        super(DurationPredictor, self).__init__()
-        self.in_channels = in_channels
-        self.filter_channels = filter_channels
-        self.p_dropout = p_dropout
-        self.drop = torch.nn.Dropout(p_dropout)
-        self.conv_1 = torch.nn.Conv1d(in_channels, filter_channels, kernel_size, padding=kernel_size // 2)
-        self.norm_1 = LayerNorm(filter_channels)
-        self.conv_2 = torch.nn.Conv1d(filter_channels, filter_channels, kernel_size, padding=kernel_size // 2)
-        self.norm_2 = LayerNorm(filter_channels)
-        self.proj = torch.nn.Conv1d(filter_channels, 1, 1)
-    def forward(self, x, x_mask):
-        x = self.conv_1(x * x_mask)
-        x = torch.relu(x)
-        x = self.norm_1(x)
-        x = self.drop(x)
-        x = self.conv_2(x * x_mask)
-        x = torch.relu(x)
-        x = self.norm_2(x)
-        x = self.drop(x)
-        x = self.proj(x * x_mask)
-        return x * x_mask
-class MultiHeadAttention(nn.Module):
-    def __init__(
-        self,
-        channels,
-        out_channels,
-        n_heads,
-        window_size=None,
-        heads_share=True,
-        p_dropout=0.0,
-        proximal_bias=False,
-        proximal_init=False,
-    ):
-        super(MultiHeadAttention, self).__init__()
-        assert channels % n_heads == 0
-        self.channels = channels
-        self.out_channels = out_channels
-        self.n_heads = n_heads
-        self.window_size = window_size
-        self.heads_share = heads_share
-        self.proximal_bias = proximal_bias
-        self.p_dropout = p_dropout
-        self.attn = None
-        self.k_channels = channels // n_heads
-        self.conv_q = torch.nn.Conv1d(channels, channels, 1)
-        self.conv_k = torch.nn.Conv1d(channels, channels, 1)
-        self.conv_v = torch.nn.Conv1d(channels, channels, 1)
-        if window_size is not None:
-            n_heads_rel = 1 if heads_share else n_heads
-            rel_stddev = self.k_channels**-0.5
-            self.emb_rel_k = torch.nn.Parameter(
-                torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev
-            )
-            self.emb_rel_v = torch.nn.Parameter(
-                torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev
-            )
-        self.conv_o = torch.nn.Conv1d(channels, out_channels, 1)
-        self.drop = torch.nn.Dropout(p_dropout)
-        torch.nn.init.xavier_uniform_(self.conv_q.weight)
-        torch.nn.init.xavier_uniform_(self.conv_k.weight)
-        if proximal_init:
-            self.conv_k.weight.data.copy_(self.conv_q.weight.data)
-            self.conv_k.bias.data.copy_(self.conv_q.bias.data)
-        torch.nn.init.xavier_uniform_(self.conv_v.weight)
-    def forward(self, x, c, attn_mask=None):
-        q = self.conv_q(x)
-        k = self.conv_k(c)
-        v = self.conv_v(c)
-        x, self.attn = self.attention(q, k, v, mask=attn_mask)
-        x = self.conv_o(x)
-        return x
-    def attention(self, query, key, value, mask=None):
-        b, d, t_s, t_t = (*key.size(), query.size(2))
-        query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3)
-        key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
-        value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
-        scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(self.k_channels)
-        if self.window_size is not None:
-            assert t_s == t_t, "Relative attention is only available for self-attention."
-            key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s)
-            rel_logits = self._matmul_with_relative_keys(query, key_relative_embeddings)
-            rel_logits = self._relative_position_to_absolute_position(rel_logits)
-            scores_local = rel_logits / math.sqrt(self.k_channels)
-            scores = scores + scores_local
-        if self.proximal_bias:
-            assert t_s == t_t, "Proximal bias is only available for self-attention."
-            scores = scores + self._attention_bias_proximal(t_s).to(device=scores.device, dtype=scores.dtype)
-        if mask is not None:
-            scores = scores.masked_fill(mask == 0, -1e4)
-        p_attn = torch.nn.functional.softmax(scores, dim=-1)
-        p_attn = self.drop(p_attn)
-        output = torch.matmul(p_attn, value)
-        if self.window_size is not None:
-            relative_weights = self._absolute_position_to_relative_position(p_attn)
-            value_relative_embeddings = self._get_relative_embeddings(self.emb_rel_v, t_s)
-            output = output + self._matmul_with_relative_values(relative_weights, value_relative_embeddings)
-        output = output.transpose(2, 3).contiguous().view(b, d, t_t)
-        return output, p_attn
-    def _matmul_with_relative_values(self, x, y):
-        ret = torch.matmul(x, y.unsqueeze(0))
-        return ret
-    def _matmul_with_relative_keys(self, x, y):
-        ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1))
-        return ret
-    def _get_relative_embeddings(self, relative_embeddings, length):
-        pad_length = max(length - (self.window_size + 1), 0)
-        slice_start_position = max((self.window_size + 1) - length, 0)
-        slice_end_position = slice_start_position + 2 * length - 1
-        if pad_length > 0:
-            padded_relative_embeddings = torch.nn.functional.pad(
-                relative_embeddings, convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]])
-            )
-        else:
-            padded_relative_embeddings = relative_embeddings
-        used_relative_embeddings = padded_relative_embeddings[:, slice_start_position:slice_end_position]
-        return used_relative_embeddings
-    def _relative_position_to_absolute_position(self, x):
-        batch, heads, length, _ = x.size()
-        x = torch.nn.functional.pad(x, convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, 1]]))
-        x_flat = x.view([batch, heads, length * 2 * length])
-        x_flat = torch.nn.functional.pad(x_flat, convert_pad_shape([[0, 0], [0, 0], [0, length - 1]]))
-        x_final = x_flat.view([batch, heads, length + 1, 2 * length - 1])[:, :, :length, length - 1 :]
-        return x_final
-    def _absolute_position_to_relative_position(self, x):
-        batch, heads, length, _ = x.size()
-        x = torch.nn.functional.pad(x, convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]]))
-        x_flat = x.view([batch, heads, length**2 + length * (length - 1)])
-        x_flat = torch.nn.functional.pad(x_flat, convert_pad_shape([[0, 0], [0, 0], [length, 0]]))
-        x_final = x_flat.view([batch, heads, length, 2 * length])[:, :, :, 1:]
-        return x_final
-    def _attention_bias_proximal(self, length):
-        r = torch.arange(length, dtype=torch.float32)
-        diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1)
-        return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0)
-class FFN(nn.Module):
-    def __init__(self, in_channels, out_channels, filter_channels, kernel_size, p_dropout=0.0):
-        super(FFN, self).__init__()
-        self.in_channels = in_channels
-        self.out_channels = out_channels
-        self.filter_channels = filter_channels
-        self.kernel_size = kernel_size
-        self.p_dropout = p_dropout
-        self.conv_1 = torch.nn.Conv1d(in_channels, filter_channels, kernel_size, padding=kernel_size // 2)
-        self.conv_2 = torch.nn.Conv1d(filter_channels, out_channels, kernel_size, padding=kernel_size // 2)
-        self.drop = torch.nn.Dropout(p_dropout)
-    def forward(self, x, x_mask):
-        x = self.conv_1(x * x_mask)
-        x = torch.relu(x)
-        x = self.drop(x)
-        x = self.conv_2(x * x_mask)
-        return x * x_mask
-class Encoder(nn.Module):
-    def __init__(
-        self,
-        hidden_channels,
-        filter_channels,
-        n_heads,
-        n_layers,
-        kernel_size=1,
-        p_dropout=0.0,
-        window_size=None,
-        **kwargs,
-    ):
-        super(Encoder, self).__init__()
-        self.hidden_channels = hidden_channels
-        self.filter_channels = filter_channels
-        self.n_heads = n_heads
-        self.n_layers = n_layers
-        self.kernel_size = kernel_size
-        self.p_dropout = p_dropout
-        self.window_size = window_size
-        self.drop = torch.nn.Dropout(p_dropout)
-        self.attn_layers = torch.nn.ModuleList()
-        self.norm_layers_1 = torch.nn.ModuleList()
-        self.ffn_layers = torch.nn.ModuleList()
-        self.norm_layers_2 = torch.nn.ModuleList()
-        for _ in range(self.n_layers):
-            self.attn_layers.append(
-                MultiHeadAttention(
-                    hidden_channels, hidden_channels, n_heads, window_size=window_size, p_dropout=p_dropout
-                )
-            )
-            self.norm_layers_1.append(LayerNorm(hidden_channels))
-            self.ffn_layers.append(
-                FFN(hidden_channels, hidden_channels, filter_channels, kernel_size, p_dropout=p_dropout)
-            )
-            self.norm_layers_2.append(LayerNorm(hidden_channels))
-    def forward(self, x, x_mask):
-        attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
-        for i in range(self.n_layers):
-            x = x * x_mask
-            y = self.attn_layers[i](x, x, attn_mask)
-            y = self.drop(y)
-            x = self.norm_layers_1[i](x + y)
-            y = self.ffn_layers[i](x, x_mask)
-            y = self.drop(y)
-            x = self.norm_layers_2[i](x + y)
-        x = x * x_mask
-        return x
-class TextEncoder(ModelMixin, ConfigMixin):
-    def __init__(
-        self,
-        n_vocab,
-        n_feats,
-        n_channels,
-        filter_channels,
-        filter_channels_dp,
-        n_heads,
-        n_layers,
-        kernel_size,
-        p_dropout,
-        window_size=None,
-        spk_emb_dim=64,
-        n_spks=1,
-    ):
-        super(TextEncoder, self).__init__()
-        self.register_to_config(
-            n_vocab=n_vocab,
-            n_feats=n_feats,
-            n_channels=n_channels,
-            filter_channels=filter_channels,
-            filter_channels_dp=filter_channels_dp,
-            n_heads=n_heads,
-            n_layers=n_layers,
-            kernel_size=kernel_size,
-            p_dropout=p_dropout,
-            window_size=window_size,
-            spk_emb_dim=spk_emb_dim,
-            n_spks=n_spks,
-        )
-        self.n_vocab = n_vocab
-        self.n_feats = n_feats
-        self.n_channels = n_channels
-        self.filter_channels = filter_channels
-        self.filter_channels_dp = filter_channels_dp
-        self.n_heads = n_heads
-        self.n_layers = n_layers
-        self.kernel_size = kernel_size
-        self.p_dropout = p_dropout
-        self.window_size = window_size
-        self.spk_emb_dim = spk_emb_dim
-        self.n_spks = n_spks
-        self.emb = torch.nn.Embedding(n_vocab, n_channels)
-        torch.nn.init.normal_(self.emb.weight, 0.0, n_channels**-0.5)
-        self.prenet = ConvReluNorm(n_channels, n_channels, n_channels, kernel_size=5, n_layers=3, p_dropout=0.5)
-        self.encoder = Encoder(
-            n_channels + (spk_emb_dim if n_spks > 1 else 0),
-            filter_channels,
-            n_heads,
-            n_layers,
-            kernel_size,
-            p_dropout,
-            window_size=window_size,
-        )
-        self.proj_m = torch.nn.Conv1d(n_channels + (spk_emb_dim if n_spks > 1 else 0), n_feats, 1)
-        self.proj_w = DurationPredictor(
-            n_channels + (spk_emb_dim if n_spks > 1 else 0), filter_channels_dp, kernel_size, p_dropout
-        )
-    def forward(self, x, x_lengths, spk=None):
-        x = self.emb(x) * math.sqrt(self.n_channels)
-        x = torch.transpose(x, 1, -1)
-        x_mask = torch.unsqueeze(sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype)
-        x = self.prenet(x, x_mask)
-        if self.n_spks > 1:
-            x = torch.cat([x, spk.unsqueeze(-1).repeat(1, 1, x.shape[-1])], dim=1)
-        x = self.encoder(x, x_mask)
-        mu = self.proj_m(x) * x_mask
-        x_dp = torch.detach(x)
-        logw = self.proj_w(x_dp, x_mask)
-        return mu, logw, x_mask
-class GradTTSPipeline(DiffusionPipeline):
-    def __init__(self, unet, text_encoder, noise_scheduler, tokenizer):
-        super().__init__()
-        noise_scheduler = noise_scheduler.set_format("pt")
-        self.register_modules(
-            unet=unet, text_encoder=text_encoder, noise_scheduler=noise_scheduler, tokenizer=tokenizer
-        )
-    @torch.no_grad()
-    def __call__(
-        self,
-        text,
-        num_inference_steps=50,
-        temperature=1.3,
-        length_scale=0.91,
-        speaker_id=15,
-        torch_device=None,
-        generator=None,
-    ):
-        if torch_device is None:
-            torch_device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
-        self.unet.to(torch_device)
-        self.text_encoder.to(torch_device)
-        x, x_lengths = self.tokenizer(text)
-        x = x.to(torch_device)
-        x_lengths = x_lengths.to(torch_device)
-        if speaker_id is not None:
-            speaker_id = torch.LongTensor([speaker_id]).to(torch_device)
-        # Get encoder_outputs `mu_x` and log-scaled token durations `logw`
-        mu_x, logw, x_mask = self.text_encoder(x, x_lengths)
-        w = torch.exp(logw) * x_mask
-        w_ceil = torch.ceil(w) * length_scale
-        y_lengths = torch.clamp_min(torch.sum(w_ceil, [1, 2]), 1).long()
-        y_max_length = int(y_lengths.max())
-        y_max_length_ = fix_len_compatibility(y_max_length)
-        # Using obtained durations `w` construct alignment map `attn`
-        y_mask = sequence_mask(y_lengths, y_max_length_).unsqueeze(1).to(x_mask.dtype)
-        attn_mask = x_mask.unsqueeze(-1) * y_mask.unsqueeze(2)
-        attn = generate_path(w_ceil.squeeze(1), attn_mask.squeeze(1)).unsqueeze(1)
-        # Align encoded text and get mu_y
-        mu_y = torch.matmul(attn.squeeze(1).transpose(1, 2), mu_x.transpose(1, 2))
-        mu_y = mu_y.transpose(1, 2)
-        # Sample latent representation from terminal distribution N(mu_y, I)
-        z = mu_y + torch.randn(mu_y.shape, generator=generator).to(mu_y.device)
-        xt = z * y_mask
-        h = 1.0 / num_inference_steps
-        # (Patrick: TODO)
-        for t in tqdm.tqdm(range(num_inference_steps), total=num_inference_steps):
-            t_new = num_inference_steps - t - 1
-            t = (1.0 - (t + 0.5) * h) * torch.ones(z.shape[0], dtype=z.dtype, device=z.device)
-            residual = self.unet(xt, t, mu_y, y_mask, speaker_id)
-            scheduler_residual = residual - mu_y + xt
-            xt = self.noise_scheduler.step(scheduler_residual, xt, t_new, num_inference_steps)
-            xt = xt * y_mask
-        return xt[:, :, :y_max_length]
--- a/src/diffusers/schedulers/__init__.py
+++ b/src/diffusers/schedulers/__init__.py
@@ -18,7 +18,6 @@
 from .scheduling_ddim import DDIMScheduler
 from .scheduling_ddpm import DDPMScheduler
-from .scheduling_grad_tts import GradTTSScheduler
 from .scheduling_pndm import PNDMScheduler
 from .scheduling_sde_ve import ScoreSdeVeScheduler
 from .scheduling_sde_vp import ScoreSdeVpScheduler

--- a/src/diffusers/schedulers/scheduling_grad_tts.py
+++ b/src/diffusers/schedulers/scheduling_grad_tts.py
-# Copyright 2022 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import numpy as np
-from ..configuration_utils import ConfigMixin
-from .scheduling_utils import SchedulerMixin
-class GradTTSScheduler(SchedulerMixin, ConfigMixin):
-    def __init__(
-        self,
-        beta_start=0.05,
-        beta_end=20,
-        tensor_format="np",
-    ):
-        super().__init__()
-        self.register_to_config(
-            beta_start=beta_start,
-            beta_end=beta_end,
-        )
-        self.set_format(tensor_format=tensor_format)
-        self.betas = None
-    def get_timesteps(self, num_inference_steps):
-        return np.array([(t + 0.5) / num_inference_steps for t in range(num_inference_steps)])
-    def set_betas(self, num_inference_steps):
-        timesteps = self.get_timesteps(num_inference_steps)
-        self.betas = np.array([self.beta_start + (self.beta_end - self.beta_start) * t for t in timesteps])
-    def step(self, residual, sample, t, num_inference_steps):
-        # This is a VE scheduler from https://arxiv.org/pdf/2011.13456.pdf (see Algorithm 2 in Appendix)
-        if self.betas is None:
-            self.set_betas(num_inference_steps)
-        beta_t = self.betas[t]
-        beta_t_deriv = beta_t / num_inference_steps
-        sample_deriv = residual * beta_t_deriv / 2
-        sample = sample + sample_deriv
-        return sample
--- a/tests/test_modeling_utils.py
+++ b/tests/test_modeling_utils.py
@@ -23,7 +23,6 @@ import torch
 from diffusers import (
    AutoencoderKL,
-    BDDMPipeline,
    DDIMPipeline,
    DDIMScheduler,
    DDPMPipeline,
@@ -31,8 +30,6 @@ from diffusers import (
    GlidePipeline,
    GlideSuperResUNetModel,
    GlideTextToImageUNetModel,
-    GradTTSPipeline,
-    GradTTSScheduler,
    LatentDiffusionPipeline,
    LatentDiffusionUncondPipeline,
    NCSNpp,
@@ -42,8 +39,6 @@ from diffusers import (
    ScoreSdeVeScheduler,
    ScoreSdeVpPipeline,
    ScoreSdeVpScheduler,
-    TemporalUNet,
-    UNetGradTTSModel,
    UNetLDMModel,
    UNetModel,
    UNetUnconditionalModel,
@@ -51,7 +46,6 @@ from diffusers import (
 )
 from diffusers.configuration_utils import ConfigMixin
 from diffusers.pipeline_utils import DiffusionPipeline
-from diffusers.pipelines.bddm.pipeline_bddm import DiffWave
 from diffusers.testing_utils import floats_tensor, slow, torch_device
 from diffusers.training_utils import EMAModel
@@ -556,149 +550,6 @@ class UNetLDMModelTests(ModelTesterMixin, unittest.TestCase):
        self.assertTrue(torch.allclose(output_slice, expected_output_slice, atol=1e-3))
-class UNetGradTTSModelTests(ModelTesterMixin, unittest.TestCase):
-    model_class = UNetGradTTSModel
-    @property
-    def dummy_input(self):
-        batch_size = 4
-        num_features = 32
-        seq_len = 16
-        noise = floats_tensor((batch_size, num_features, seq_len)).to(torch_device)
-        condition = floats_tensor((batch_size, num_features, seq_len)).to(torch_device)
-        mask = floats_tensor((batch_size, 1, seq_len)).to(torch_device)
-        time_step = torch.tensor([10] * batch_size).to(torch_device)
-        return {"sample": noise, "timesteps": time_step, "mu": condition, "mask": mask}
-    @property
-    def input_shape(self):
-        return (4, 32, 16)
-    @property
-    def output_shape(self):
-        return (4, 32, 16)
-    def prepare_init_args_and_inputs_for_common(self):
-        init_dict = {
-            "dim": 64,
-            "groups": 4,
-            "dim_mults": (1, 2),
-            "n_feats": 32,
-            "pe_scale": 1000,
-            "n_spks": 1,
-        }
-        inputs_dict = self.dummy_input
-        return init_dict, inputs_dict
-    def test_from_pretrained_hub(self):
-        model, loading_info = UNetGradTTSModel.from_pretrained("fusing/unet-grad-tts-dummy", output_loading_info=True)
-        self.assertIsNotNone(model)
-        self.assertEqual(len(loading_info["missing_keys"]), 0)
-        model.to(torch_device)
-        image = model(**self.dummy_input)
-        assert image is not None, "Make sure output is not None"
-    def test_output_pretrained(self):
-        model = UNetGradTTSModel.from_pretrained("fusing/unet-grad-tts-dummy")
-        model.eval()
-        torch.manual_seed(0)
-        if torch.cuda.is_available():
-            torch.cuda.manual_seed_all(0)
-        num_features = model.config.n_feats
-        seq_len = 16
-        noise = torch.randn((1, num_features, seq_len))
-        condition = torch.randn((1, num_features, seq_len))
-        mask = torch.randn((1, 1, seq_len))
-        time_step = torch.tensor([10])
-        with torch.no_grad():
-            output = model(noise, time_step, condition, mask)
-        output_slice = output[0, -3:, -3:].flatten()
-        # fmt: off
-        expected_output_slice = torch.tensor([-0.0690, -0.0531, 0.0633, -0.0660, -0.0541, 0.0650, -0.0656, -0.0555, 0.0617])
-        # fmt: on
-        self.assertTrue(torch.allclose(output_slice, expected_output_slice, rtol=1e-3))
-class TemporalUNetModelTests(ModelTesterMixin, unittest.TestCase):
-    model_class = TemporalUNet
-    @property
-    def dummy_input(self):
-        batch_size = 4
-        num_features = 14
-        seq_len = 16
-        noise = floats_tensor((batch_size, seq_len, num_features)).to(torch_device)
-        time_step = torch.tensor([10] * batch_size).to(torch_device)
-        return {"sample": noise, "timesteps": time_step}
-    @property
-    def input_shape(self):
-        return (4, 16, 14)
-    @property
-    def output_shape(self):
-        return (4, 16, 14)
-    def prepare_init_args_and_inputs_for_common(self):
-        init_dict = {
-            "training_horizon": 128,
-            "dim": 32,
-            "dim_mults": [1, 4, 8],
-            "predict_epsilon": False,
-            "clip_denoised": True,
-            "transition_dim": 14,
-            "cond_dim": 3,
-        }
-        inputs_dict = self.dummy_input
-        return init_dict, inputs_dict
-    def test_from_pretrained_hub(self):
-        model, loading_info = TemporalUNet.from_pretrained(
-            "fusing/ddpm-unet-rl-hopper-hor128", output_loading_info=True
-        )
-        self.assertIsNotNone(model)
-        self.assertEqual(len(loading_info["missing_keys"]), 0)
-        model.to(torch_device)
-        image = model(**self.dummy_input)
-        assert image is not None, "Make sure output is not None"
-    def test_output_pretrained(self):
-        model = TemporalUNet.from_pretrained("fusing/ddpm-unet-rl-hopper-hor128")
-        model.eval()
-        torch.manual_seed(0)
-        if torch.cuda.is_available():
-            torch.cuda.manual_seed_all(0)
-        num_features = model.transition_dim
-        seq_len = 16
-        noise = torch.randn((1, seq_len, num_features))
-        time_step = torch.full((num_features,), 0)
-        with torch.no_grad():
-            output = model(noise, time_step)
-        output_slice = output[0, -3:, -3:].flatten()
-        # fmt: off
-        expected_output_slice = torch.tensor([-0.2714, 0.1042, -0.0794, -0.2820, 0.0803, -0.0811, -0.2345, 0.0580, -0.0584])
-        # fmt: on
-        self.assertTrue(torch.allclose(output_slice, expected_output_slice, rtol=1e-3))
 class NCSNppModelTests(ModelTesterMixin, unittest.TestCase):
    model_class = NCSNpp
@@ -1116,25 +967,6 @@ class PipelineTesterMixin(unittest.TestCase):
        expected_slice = torch.tensor([0.7119, 0.7073, 0.6460, 0.7780, 0.7423, 0.6926, 0.7378, 0.7189, 0.7784])
        assert (image_slice.flatten() - expected_slice).abs().max() < 1e-2
-    @slow
-    def test_grad_tts(self):
-        model_id = "fusing/grad-tts-libri-tts"
-        grad_tts = GradTTSPipeline.from_pretrained(model_id)
-        noise_scheduler = GradTTSScheduler()
-        grad_tts.noise_scheduler = noise_scheduler
-        text = "Hello world, I missed you so much."
-        generator = torch.manual_seed(0)
-        # generate mel spectograms using text
-        mel_spec = grad_tts(text, generator=generator)
-        assert mel_spec.shape == (1, 80, 143)
-        expected_slice = torch.tensor(
-            [-6.7584, -6.8347, -6.3293, -6.6437, -6.7233, -6.4684, -6.1187, -6.3172, -6.6890]
-        )
-        assert (mel_spec[0, :3, :3].cpu().flatten() - expected_slice).abs().max() < 1e-2
    @slow
    def test_score_sde_ve_pipeline(self):
        model = NCSNpp.from_pretrained("fusing/ffhq_ncsnpp")
@@ -1181,21 +1013,3 @@ class PipelineTesterMixin(unittest.TestCase):
            [-0.1202, -0.1005, -0.0635, -0.0520, -0.1282, -0.0838, -0.0981, -0.1318, -0.1106]
        )
        assert (image_slice.flatten() - expected_slice).abs().max() < 1e-2
-    def test_module_from_pipeline(self):
-        model = DiffWave(num_res_layers=4)
-        noise_scheduler = DDPMScheduler(timesteps=12)
-        bddm = BDDMPipeline(model, noise_scheduler)
-        # check if the library name for the diffwave moduel is set to pipeline module
-        self.assertTrue(bddm.config["diffwave"][0] == "bddm")
-        # check if we can save and load the pipeline
-        with tempfile.TemporaryDirectory() as tmpdirname:
-            bddm.save_pretrained(tmpdirname)
-            _ = BDDMPipeline.from_pretrained(tmpdirname)
-            # check if the same works using the DifusionPipeline class
-            bddm = DiffusionPipeline.from_pretrained(tmpdirname)
-        self.assertTrue(bddm.config["diffwave"][0] == "bddm")