"torchvision/git@developer.sourcefind.cn:OpenDAS/vision.git" did not exist on "a2093007a5a8ebcf7ea3d45cb8802d71658e720b"
Commit 08e7f4b0 authored by Patrick von Platen

correct merge

parents acb948bd a020285e
@@ -31,7 +31,7 @@ cd diffusers && pip install -e .
It could become a central place for all kinds of models, schedulers, training utilities, and processors that one can mix and match for one's own use case.
Both models and schedulers should be loadable and saveable from the Hub.
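As a rough sketch of that round trip (the repo id is reused from the DDPM example below, the local path is hypothetical, and `save_pretrained`/`from_config` are used as exercised by the tests further down this diff):

```python
from diffusers import GaussianDDPMScheduler

# load a scheduler configuration from the Hub ...
noise_scheduler = GaussianDDPMScheduler.from_config("fusing/ddpm-lsun-church")

# ... save it to a local directory ...
noise_scheduler.save_pretrained("./ddpm-lsun-church-scheduler")

# ... and restore an equivalent scheduler from disk.
restored_scheduler = GaussianDDPMScheduler.from_config("./ddpm-lsun-church-scheduler")
```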
#### **Example for [DDPM](https://arxiv.org/abs/2006.11239):**

```python
import torch
@@ -45,29 +45,29 @@ torch_device = "cuda" if torch.cuda.is_available() else "cpu"
# 1. Load models
noise_scheduler = GaussianDDPMScheduler.from_config("fusing/ddpm-lsun-church")
unet = UNetModel.from_pretrained("fusing/ddpm-lsun-church").to(torch_device)

# 2. Sample gaussian noise
image = noise_scheduler.sample_noise((1, unet.in_channels, unet.resolution, unet.resolution), device=torch_device, generator=generator)

# 3. Denoise
num_prediction_steps = len(noise_scheduler)
for t in tqdm.tqdm(reversed(range(num_prediction_steps)), total=num_prediction_steps):
    # predict noise residual
    with torch.no_grad():
        residual = unet(image, t)

    # predict previous mean of image x_t-1
    pred_prev_image = noise_scheduler.step(residual, image, t)

    # optionally sample variance
    variance = 0
    if t > 0:
        noise = noise_scheduler.sample_noise(image.shape, device=image.device, generator=generator)
        variance = noise_scheduler.get_variance(t).sqrt() * noise

    # set current image to prev_image: x_t -> x_t-1
    image = pred_prev_image + variance

# 4. process image to PIL
image_processed = image.cpu().permute(0, 2, 3, 1)
@@ -79,7 +79,7 @@ image_pil = PIL.Image.fromarray(image_processed[0])
image_pil.save("test.png")
```
#### **Example for [DDIM](https://arxiv.org/abs/2010.02502):**

```python
import torch
@@ -93,31 +93,32 @@ torch_device = "cuda" if torch.cuda.is_available() else "cpu"
# 1. Load models
noise_scheduler = DDIMScheduler.from_config("fusing/ddpm-celeba-hq")
unet = UNetModel.from_pretrained("fusing/ddpm-celeba-hq").to(torch_device)

# 2. Sample gaussian noise
image = noise_scheduler.sample_noise((1, unet.in_channels, unet.resolution, unet.resolution), device=torch_device, generator=generator)

# 3. Denoise
num_inference_steps = 50
eta = 0.0  # <- deterministic sampling
for t in tqdm.tqdm(reversed(range(num_inference_steps)), total=num_inference_steps):
    # 1. predict noise residual
    orig_t = noise_scheduler.get_orig_t(t, num_inference_steps)
    with torch.no_grad():
        residual = unet(image, orig_t)

    # 2. predict previous mean of image x_t-1
    pred_prev_image = noise_scheduler.step(residual, image, t, num_inference_steps, eta)

    # 3. optionally sample variance
    variance = 0
    if eta > 0:
        noise = noise_scheduler.sample_noise(image.shape, device=image.device, generator=generator)
        variance = noise_scheduler.get_variance(t).sqrt() * eta * noise

    # 4. set current image to prev_image: x_t -> x_t-1
    image = pred_prev_image + variance

# 5. process image to PIL
image_processed = image.cpu().permute(0, 2, 3, 1)
@@ -132,7 +133,7 @@ image_pil.save("test.png")
```
### 2. `diffusers` as a collection of the most important diffusion systems (GLIDE, DALL-E, ...)
The `models` directory in the repository hosts the complete code necessary to run a diffusion system as well as to train it. The `DiffusionPipeline` class makes it easy to run a diffusion model for inference:

#### **Example image generation with DDPM**

```python
from diffusers import DiffusionPipeline
@@ -155,6 +156,28 @@ image_pil = PIL.Image.fromarray(image_processed[0])
image_pil.save("test.png")
```
#### **Text-to-Image generation with Latent Diffusion**

```python
import torch
import numpy as np
import PIL.Image

from diffusers import DiffusionPipeline

ldm = DiffusionPipeline.from_pretrained("fusing/latent-diffusion-text2im-large")

generator = torch.Generator()
generator = generator.manual_seed(6694729458485568)

prompt = "A painting of a squirrel eating a burger"
image = ldm([prompt], generator=generator, eta=0.3, guidance_scale=6.0, num_inference_steps=50)

# process image to PIL
image_processed = image.cpu().permute(0, 2, 3, 1)
image_processed = image_processed * 255.
image_processed = image_processed.numpy().astype(np.uint8)
image_pil = PIL.Image.fromarray(image_processed[0])

# save image
image_pil.save("test.png")
```
## Library structure:

```
...
```
# coding=utf-8
# Copyright 2022 The Fairseq Authors and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" LDMBERT model configuration"""
from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging
logger = logging.get_logger(__name__)
LDMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"ldm-bert": "https://huggingface.co/ldm-bert/resolve/main/config.json",
}
class LDMBertConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`LDMBertModel`]. It is used to instantiate an
    LDMBERT model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the LDMBERT
    [facebook/ldmbert-large](https://huggingface.co/facebook/ldmbert-large) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 30522):
            Vocabulary size of the LDMBERT model. Defines the number of different tokens that can be represented by
            the `inputs_ids` passed when calling [`LDMBertModel`].
        d_model (`int`, *optional*, defaults to 1280):
            Dimensionality of the layers and the pooler layer.
        encoder_layers (`int`, *optional*, defaults to 32):
            Number of encoder layers.
        encoder_attention_heads (`int`, *optional*, defaults to 8):
            Number of attention heads for each attention layer in the Transformer encoder.
        encoder_ffn_dim (`int`, *optional*, defaults to 5120):
            Dimensionality of the "intermediate" (often named feed-forward) layer in the encoder.
        head_dim (`int`, *optional*, defaults to 64):
            Dimensionality of each attention head.
        activation_function (`str` or `function`, *optional*, defaults to `"gelu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
            `"relu"`, `"silu"` and `"gelu_new"` are supported.
        dropout (`float`, *optional*, defaults to 0.1):
            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        activation_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for activations inside the fully connected layer.
        classifier_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the classifier.
        max_position_embeddings (`int`, *optional*, defaults to 77):
            The maximum sequence length that this model might ever be used with.
        init_std (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        encoder_layerdrop (`float`, *optional*, defaults to 0.0):
            The LayerDrop probability for the encoder. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556)
            for more details.
        scale_embedding (`bool`, *optional*, defaults to `False`):
            Scale embeddings by dividing by sqrt(d_model).
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models).

    Example:

    ```python
    >>> from transformers import LDMBertModel, LDMBertConfig

    >>> # Initializing an LDMBERT facebook/ldmbert-large style configuration
    >>> configuration = LDMBertConfig()

    >>> # Initializing a model from the facebook/ldmbert-large style configuration
    >>> model = LDMBertModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "ldmbert"
    keys_to_ignore_at_inference = ["past_key_values"]
    attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"}

    def __init__(
        self,
        vocab_size=30522,
        max_position_embeddings=77,
        encoder_layers=32,
        encoder_ffn_dim=5120,
        encoder_attention_heads=8,
        head_dim=64,
        encoder_layerdrop=0.0,
        activation_function="gelu",
        d_model=1280,
        dropout=0.1,
        attention_dropout=0.0,
        activation_dropout=0.0,
        init_std=0.02,
        classifier_dropout=0.0,
        scale_embedding=False,
        use_cache=True,
        pad_token_id=0,
        **kwargs
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.d_model = d_model
        self.encoder_ffn_dim = encoder_ffn_dim
        self.encoder_layers = encoder_layers
        self.encoder_attention_heads = encoder_attention_heads
        self.head_dim = head_dim
        self.dropout = dropout
        self.attention_dropout = attention_dropout
        self.activation_dropout = activation_dropout
        self.activation_function = activation_function
        self.init_std = init_std
        self.encoder_layerdrop = encoder_layerdrop
        self.classifier_dropout = classifier_dropout
        self.use_cache = use_cache
        self.num_hidden_layers = encoder_layers
        self.scale_embedding = scale_embedding  # scale factor will be sqrt(d_model) if True

        super().__init__(pad_token_id=pad_token_id, **kwargs)
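As a quick illustration of the `attribute_map` above, a minimal sketch (the aliasing itself is standard `PretrainedConfig` behavior in `transformers`):

```python
config = LDMBertConfig()

# `attribute_map` aliases the generic transformers attribute names onto the
# LDMBERT-specific ones, so both spellings read the same underlying values:
assert config.num_attention_heads == config.encoder_attention_heads  # 8
assert config.hidden_size == config.d_model  # 1280
```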
@@ -45,7 +45,7 @@ class DDPM(DiffusionPipeline):
            residual = self.unet(image, t)

            # 2. predict previous mean of image x_t-1
            pred_prev_image = self.noise_scheduler.step(residual, image, t)

            # 3. optionally sample variance
            variance = 0
...
@@ -105,7 +105,7 @@ class DDIMScheduler(nn.Module, ConfigMixin):
        return variance

    def step(self, residual, image, t, num_inference_steps, eta, output_pred_x_0=False):
        # See formulas (12) and (16) of the DDIM paper https://arxiv.org/pdf/2010.02502.pdf
        # Ideally, read the DDIM paper for an in-depth understanding
...
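For orientation, the update implemented by `step` corresponds to equation (12) of the DDIM paper; a sketch in standard notation (not copied from this diff):

```latex
% DDIM update, eq. (12) of https://arxiv.org/abs/2010.02502, where
% \bar\alpha_t is the cumulative product of the \alpha_s and
% \epsilon_\theta(x_t) is the predicted noise residual:
x_{t-1} = \sqrt{\bar\alpha_{t-1}}
          \left( \frac{x_t - \sqrt{1 - \bar\alpha_t}\,\epsilon_\theta(x_t)}{\sqrt{\bar\alpha_t}} \right)
        + \sqrt{1 - \bar\alpha_{t-1} - \sigma_t^2}\;\epsilon_\theta(x_t)
        + \sigma_t\,\varepsilon, \qquad \varepsilon \sim \mathcal{N}(0, I)
```

With `eta = 0` the noise scale sigma_t vanishes (equation (16) scales sigma_t by eta), recovering the deterministic sampling used in the README example above.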
@@ -24,7 +24,6 @@ SAMPLING_CONFIG_NAME = "scheduler_config.json"
class GaussianDDPMScheduler(nn.Module, ConfigMixin):
    config_name = SAMPLING_CONFIG_NAME

    def __init__(
@@ -108,7 +107,7 @@ class GaussianDDPMScheduler(nn.Module, ConfigMixin):
        return variance

    def step(self, residual, image, t, output_pred_x_0=False):
        # 1. compute alphas, betas
        alpha_prod_t = self.get_alpha_prod(t)
        alpha_prod_t_prev = self.get_alpha_prod(t - 1)
...
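The DDPM counterpart computes the posterior mean from the predicted residual; a sketch of the formula it implements, in standard notation (equation (11) of the DDPM paper):

```latex
% DDPM posterior mean, eq. (11) of https://arxiv.org/abs/2006.11239,
% with \beta_t = 1 - \alpha_t and \bar\alpha_t = \prod_{s=1}^{t} \alpha_s:
\mu_\theta(x_t, t) = \frac{1}{\sqrt{\alpha_t}}
    \left( x_t - \frac{\beta_t}{\sqrt{1 - \bar\alpha_t}}\,\epsilon_\theta(x_t, t) \right)
```

The `variance` term added in the sampling loops above plays the role of \sigma_t z with z drawn from a standard normal.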
# coding=utf-8
# Copyright 2022 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import random
import tempfile
import unittest
import numpy as np
from distutils.util import strtobool
import torch
from diffusers import GaussianDDPMScheduler, UNetModel, DDIMScheduler
from diffusers.configuration_utils import ConfigMixin
from diffusers.pipeline_utils import DiffusionPipeline
from models.vision.ddim.modeling_ddim import DDIM
from models.vision.ddpm.modeling_ddpm import DDPM
from models.vision.latent_diffusion.modeling_latent_diffusion import LatentDiffusion
global_rng = random.Random()
torch_device = "cuda" if torch.cuda.is_available() else "cpu"
torch.backends.cuda.matmul.allow_tf32 = False
def parse_flag_from_env(key, default=False):
    try:
        value = os.environ[key]
    except KeyError:
        # KEY isn't set, default to `default`.
        _value = default
    else:
        # KEY is set, convert it to True or False.
        try:
            _value = strtobool(value)
        except ValueError:
            # More values are supported, but let's keep the message simple.
            raise ValueError(f"If set, {key} must be yes or no.")
    return _value
_run_slow_tests = parse_flag_from_env("RUN_SLOW", default=False)
def slow(test_case):
    """
    Decorator marking a test as slow.

    Slow tests are skipped by default. Set the RUN_SLOW environment variable to a truthy value to run them.
    """
    return unittest.skipUnless(_run_slow_tests, "test is slow")(test_case)
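For instance, a hypothetical test opting into the slow suite (illustration only, not part of this diff):

```python
@slow
def test_end_to_end_generation(self):
    # Skipped unless RUN_SLOW is set to a truthy value, e.g.:
    #   RUN_SLOW=1 python -m pytest tests/
    ...
```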
def floats_tensor(shape, scale=1.0, rng=None, name=None):
    """Creates a random float32 tensor"""
    if rng is None:
        rng = global_rng

    total_dims = 1
    for dim in shape:
        total_dims *= dim

    values = []
    for _ in range(total_dims):
        values.append(rng.random() * scale)

    return torch.tensor(data=values, dtype=torch.float).view(shape).contiguous()
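A quick usage sketch of the helper above (shape and scale chosen arbitrarily):

```python
# 2x3 float32 tensor with values drawn uniformly from [0, 0.5)
noise = floats_tensor((2, 3), scale=0.5)
assert noise.shape == (2, 3)
assert noise.dtype == torch.float32
```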
class SchedulerCommonTest(unittest.TestCase):
    scheduler_class = None

    @property
    def dummy_image(self):
        batch_size = 4
        num_channels = 3
        height = 8
        width = 8

        image = np.random.rand(batch_size, num_channels, height, width)
        return image

    def get_scheduler_config(self):
        raise NotImplementedError

    def dummy_model(self):
        def model(image, residual, t, *args):
            return (image + residual) * t / (t + 1)

        return model

    def test_from_pretrained_save_pretrained(self):
        image = self.dummy_image
        residual = 0.1 * image

        scheduler_config = self.get_scheduler_config()
        scheduler = self.scheduler_class(**scheduler_config)

        with tempfile.TemporaryDirectory() as tmpdirname:
            scheduler.save_pretrained(tmpdirname)
            new_scheduler = self.scheduler_class.from_config(tmpdirname)

        output = scheduler(residual, image, 1)
        new_output = new_scheduler(residual, image, 1)

        # the reloaded scheduler should behave identically to the original
        assert np.abs(output - new_output).max() < 1e-5
@@ -27,7 +27,7 @@ from diffusers.configuration_utils import ConfigMixin
from diffusers.pipeline_utils import DiffusionPipeline
from models.vision.ddim.modeling_ddim import DDIM
from models.vision.ddpm.modeling_ddpm import DDPM
from models.vision.latent_diffusion.modeling_latent_diffusion import LatentDiffusion

global_rng = random.Random()
torch_device = "cuda" if torch.cuda.is_available() else "cpu"
@@ -334,3 +334,19 @@ class PipelineTesterMixin(unittest.TestCase):
            [-0.7383, -0.7385, -0.7298, -0.7364, -0.7414, -0.7239, -0.6737, -0.6813, -0.7068]
        )
        assert (image_slice.flatten() - expected_slice).abs().max() < 1e-2
    @slow
    def test_ldm_text2img(self):
        model_id = "fusing/latent-diffusion-text2im-large"
        ldm = LatentDiffusion.from_pretrained(model_id)

        prompt = "A painting of a squirrel eating a burger"
        generator = torch.manual_seed(0)
        image = ldm([prompt], generator=generator, num_inference_steps=20)

        image_slice = image[0, -1, -3:, -3:].cpu()

        assert image.shape == (1, 3, 256, 256)
        expected_slice = torch.tensor([0.7295, 0.7358, 0.7256, 0.7435, 0.7095, 0.6884, 0.7325, 0.6921, 0.6458])
        assert (image_slice.flatten() - expected_slice).abs().max() < 1e-2