"torchvision/git@developer.sourcefind.cn:OpenDAS/vision.git" did not exist on "c5533a2c2b549f0e070e7e3e45396992f39e1388"
Unverified commit f106ab40, authored by Simon Kirsten, committed by GitHub

[Flax] Stateless schedulers, fixes and refactors (#1661)



* [Flax] Stateless schedulers, fixes and refactors

* Remove scheduling_common_flax and some renames

* Update src/diffusers/schedulers/scheduling_pndm_flax.py
Co-authored-by: Pedro Cuenca <pedro@huggingface.co>
parent d87cc159
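
The core of the change: Flax schedulers no longer keep mutable schedule arrays (betas, alphas_cumprod, timesteps) on `self`; they are created once via `create_state()` and threaded explicitly through every call. A minimal usage sketch, assuming a diffusers build that includes this commit (shapes and values are illustrative only):

```python
import jax
import jax.numpy as jnp
from diffusers import FlaxDDPMScheduler

scheduler = FlaxDDPMScheduler(
    beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", num_train_timesteps=1000
)
# All schedule arrays now live in an immutable flax.struct.dataclass (a JAX pytree),
# not on the scheduler object itself.
scheduler_state = scheduler.create_state()

def add_noise_fn(latents, noise, timesteps):
    # The state is passed explicitly, so the function stays pure and jit/pmap friendly.
    return scheduler.add_noise(scheduler_state, latents, noise, timesteps)

noisy = jax.jit(add_noise_fn)(
    jnp.zeros((1, 4, 8, 8)), jnp.ones((1, 4, 8, 8)), jnp.array([10])
)
```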
@@ -475,6 +475,7 @@ def main():
     noise_scheduler = FlaxDDPMScheduler(
         beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", num_train_timesteps=1000
     )
+    noise_scheduler_state = noise_scheduler.create_state()

     # Initialize our training
     train_rngs = jax.random.split(rng, jax.local_device_count())
@@ -511,7 +512,7 @@ def main():
             # Add noise to the latents according to the noise magnitude at each timestep
             # (this is the forward diffusion process)
-            noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)
+            noisy_latents = noise_scheduler.add_noise(noise_scheduler_state, latents, noise, timesteps)

             # Get the text embedding for conditioning
             if args.train_text_encoder:
...
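
The same pattern recurs in all three training scripts below: the state created above is threaded into `add_noise` inside the loss function. A minimal, self-contained sketch of that forward-diffusion step (the `forward_diffusion` helper is hypothetical and not part of the scripts):

```python
import jax
import jax.numpy as jnp
from diffusers import FlaxDDPMScheduler

def forward_diffusion(rng, latents, noise_scheduler, noise_scheduler_state):
    # Sample one timestep per example and add the matching amount of noise,
    # mirroring what the training scripts do inside their loss functions.
    noise_rng, timestep_rng = jax.random.split(rng)
    noise = jax.random.normal(noise_rng, latents.shape)
    timesteps = jax.random.randint(
        timestep_rng, (latents.shape[0],), 0, noise_scheduler.config.num_train_timesteps
    )
    noisy_latents = noise_scheduler.add_noise(noise_scheduler_state, latents, noise, timesteps)
    return noisy_latents, noise, timesteps

# toy usage with an all-zeros latent batch
scheduler = FlaxDDPMScheduler(beta_schedule="scaled_linear", num_train_timesteps=1000)
state = scheduler.create_state()
noisy, noise, t = forward_diffusion(jax.random.PRNGKey(0), jnp.zeros((2, 4, 8, 8)), scheduler, state)
```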
@@ -417,6 +417,7 @@ def main():
     noise_scheduler = FlaxDDPMScheduler(
         beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", num_train_timesteps=1000
     )
+    noise_scheduler_state = noise_scheduler.create_state()

     # Initialize our training
     rng = jax.random.PRNGKey(args.seed)
@@ -449,7 +450,7 @@ def main():
             # Add noise to the latents according to the noise magnitude at each timestep
             # (this is the forward diffusion process)
-            noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)
+            noisy_latents = noise_scheduler.add_noise(noise_scheduler_state, latents, noise, timesteps)

             # Get the text embedding for conditioning
             encoder_hidden_states = text_encoder(
...
@@ -505,6 +505,7 @@ def main():
     noise_scheduler = FlaxDDPMScheduler(
         beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", num_train_timesteps=1000
     )
+    noise_scheduler_state = noise_scheduler.create_state()

     # Initialize our training
     train_rngs = jax.random.split(rng, jax.local_device_count())
@@ -531,7 +532,7 @@ def main():
                 0,
                 noise_scheduler.config.num_train_timesteps,
             )
-            noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)
+            noisy_latents = noise_scheduler.add_noise(noise_scheduler_state, latents, noise, timesteps)
             encoder_hidden_states = state.apply_fn(
                 batch["input_ids"], params=params, dropout_rng=dropout_rng, train=True
            )[0]
...
@@ -261,7 +261,8 @@ class FlaxStableDiffusionPipeline(FlaxDiffusionPipeline):
             )

         # scale the initial noise by the standard deviation required by the scheduler
-        latents = latents * self.scheduler.init_noise_sigma
+        latents = latents * params["scheduler"].init_noise_sigma

         if DEBUG:
             # run with python for loop
             for i in range(num_inference_steps):
...
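
With the pipeline change above, the scheduler state is read from the `params` dictionary rather than from `self.scheduler`. A hedged sketch of what that looks like from the caller's side; the `"scheduler"` key is taken from the changed line, everything else is illustrative:

```python
import jax.numpy as jnp
from diffusers import FlaxDDIMScheduler

scheduler = FlaxDDIMScheduler(num_train_timesteps=1000)
# The pipeline is assumed to receive the scheduler state alongside the model weights.
params = {"scheduler": scheduler.create_state()}

latents = jnp.ones((1, 4, 64, 64))
# Same pattern as the changed pipeline line above.
latents = latents * params["scheduler"].init_noise_sigma
```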
@@ -15,7 +15,6 @@
 # DISCLAIMER: This code is strongly influenced by https://github.com/pesser/pytorch_diffusion
 # and https://github.com/hojonathanho/diffusion

-import math
 from dataclasses import dataclass
 from typing import Optional, Tuple, Union
@@ -26,51 +25,37 @@
 from ..configuration_utils import ConfigMixin, register_to_config
 from ..utils import deprecate
 from .scheduling_utils_flax import (
     _FLAX_COMPATIBLE_STABLE_DIFFUSION_SCHEDULERS,
+    CommonSchedulerState,
     FlaxSchedulerMixin,
     FlaxSchedulerOutput,
-    broadcast_to_shape_from_left,
+    add_noise_common,
 )


-def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999) -> jnp.ndarray:
-    """
-    Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
-    (1-beta) over time from t = [0,1].
-
-    Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
-    to that part of the diffusion process.
-
-    Args:
-        num_diffusion_timesteps (`int`): the number of betas to produce.
-        max_beta (`float`): the maximum beta to use; use values lower than 1 to
-                     prevent singularities.
-
-    Returns:
-        betas (`jnp.ndarray`): the betas used by the scheduler to step the model outputs
-    """
-
-    def alpha_bar(time_step):
-        return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2
-
-    betas = []
-    for i in range(num_diffusion_timesteps):
-        t1 = i / num_diffusion_timesteps
-        t2 = (i + 1) / num_diffusion_timesteps
-        betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta))
-    return jnp.array(betas, dtype=jnp.float32)
 @flax.struct.dataclass
 class DDIMSchedulerState:
+    common: CommonSchedulerState
+    final_alpha_cumprod: jnp.ndarray

     # setable values
+    init_noise_sigma: jnp.ndarray
     timesteps: jnp.ndarray
-    alphas_cumprod: jnp.ndarray
     num_inference_steps: Optional[int] = None

     @classmethod
-    def create(cls, num_train_timesteps: int, alphas_cumprod: jnp.ndarray):
-        return cls(timesteps=jnp.arange(0, num_train_timesteps)[::-1], alphas_cumprod=alphas_cumprod)
+    def create(
+        cls,
+        common: CommonSchedulerState,
+        final_alpha_cumprod: jnp.ndarray,
+        init_noise_sigma: jnp.ndarray,
+        timesteps: jnp.ndarray,
+    ):
+        return cls(
+            common=common,
+            final_alpha_cumprod=final_alpha_cumprod,
+            init_noise_sigma=init_noise_sigma,
+            timesteps=timesteps,
+        )


 @dataclass
@@ -112,12 +97,15 @@ class FlaxDDIMScheduler(FlaxSchedulerMixin, ConfigMixin):
         prediction_type (`str`, default `epsilon`):
             indicates whether the model predicts the noise (epsilon), or the samples. One of `epsilon`, `sample`.
             `v-prediction` is not supported for this scheduler.
+        dtype (`jnp.dtype`, *optional*, defaults to `jnp.float32`):
+            the `dtype` used for params and computation.
     """

     _compatibles = _FLAX_COMPATIBLE_STABLE_DIFFUSION_SCHEDULERS.copy()
     _deprecated_kwargs = ["predict_epsilon"]

+    dtype: jnp.dtype

     @property
     def has_state(self):
         return True
@@ -129,43 +117,46 @@ class FlaxDDIMScheduler(FlaxSchedulerMixin, ConfigMixin):
         beta_start: float = 0.0001,
         beta_end: float = 0.02,
         beta_schedule: str = "linear",
+        trained_betas: Optional[jnp.ndarray] = None,
         set_alpha_to_one: bool = True,
         steps_offset: int = 0,
         prediction_type: str = "epsilon",
+        dtype: jnp.dtype = jnp.float32,
         **kwargs,
     ):
         message = (
             "Please make sure to instantiate your scheduler with `prediction_type` instead. E.g. `scheduler ="
-            " FlaxDDIMScheduler.from_pretrained(<model_id>, prediction_type='epsilon')`."
+            f" {self.__class__.__name__}.from_pretrained(<model_id>, prediction_type='epsilon')`."
         )
         predict_epsilon = deprecate("predict_epsilon", "0.13.0", message, take_from=kwargs)
         if predict_epsilon is not None:
             self.register_to_config(prediction_type="epsilon" if predict_epsilon else "sample")

-        if beta_schedule == "linear":
-            self.betas = jnp.linspace(beta_start, beta_end, num_train_timesteps, dtype=jnp.float32)
-        elif beta_schedule == "scaled_linear":
-            # this schedule is very specific to the latent diffusion model.
-            self.betas = jnp.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=jnp.float32) ** 2
-        elif beta_schedule == "squaredcos_cap_v2":
-            # Glide cosine schedule
-            self.betas = betas_for_alpha_bar(num_train_timesteps)
-        else:
-            raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}")
-
-        self.alphas = 1.0 - self.betas
-
-        # HACK for now - clean up later (PVP)
-        self._alphas_cumprod = jnp.cumprod(self.alphas, axis=0)
+        self.dtype = dtype
+
+    def create_state(self, common: Optional[CommonSchedulerState] = None) -> DDIMSchedulerState:
+        if common is None:
+            common = CommonSchedulerState.create(self)

         # At every step in ddim, we are looking into the previous alphas_cumprod
         # For the final step, there is no previous alphas_cumprod because we are already at 0
         # `set_alpha_to_one` decides whether we set this parameter simply to one or
         # whether we use the final alpha of the "non-previous" one.
-        self.final_alpha_cumprod = jnp.array(1.0) if set_alpha_to_one else float(self._alphas_cumprod[0])
+        final_alpha_cumprod = (
+            jnp.array(1.0, dtype=self.dtype) if self.config.set_alpha_to_one else common.alphas_cumprod[0]
+        )

         # standard deviation of the initial noise distribution
-        self.init_noise_sigma = 1.0
+        init_noise_sigma = jnp.array(1.0, dtype=self.dtype)
+
+        timesteps = jnp.arange(0, self.config.num_train_timesteps).round()[::-1]
+
+        return DDIMSchedulerState.create(
+            common=common,
+            final_alpha_cumprod=final_alpha_cumprod,
+            init_noise_sigma=init_noise_sigma,
+            timesteps=timesteps,
+        )

     def scale_model_input(
         self, state: DDIMSchedulerState, sample: jnp.ndarray, timestep: Optional[int] = None
@@ -181,21 +172,6 @@ class FlaxDDIMScheduler(FlaxSchedulerMixin, ConfigMixin):
         """
         return sample

-    def create_state(self):
-        return DDIMSchedulerState.create(
-            num_train_timesteps=self.config.num_train_timesteps, alphas_cumprod=self._alphas_cumprod
-        )
-
-    def _get_variance(self, timestep, prev_timestep, alphas_cumprod):
-        alpha_prod_t = alphas_cumprod[timestep]
-        alpha_prod_t_prev = jnp.where(prev_timestep >= 0, alphas_cumprod[prev_timestep], self.final_alpha_cumprod)
-        beta_prod_t = 1 - alpha_prod_t
-        beta_prod_t_prev = 1 - alpha_prod_t_prev
-        variance = (beta_prod_t_prev / beta_prod_t) * (1 - alpha_prod_t / alpha_prod_t_prev)
-
-        return variance
-
     def set_timesteps(
         self, state: DDIMSchedulerState, num_inference_steps: int, shape: Tuple = ()
     ) -> DDIMSchedulerState:
@@ -208,15 +184,27 @@ class FlaxDDIMScheduler(FlaxSchedulerMixin, ConfigMixin):
             num_inference_steps (`int`):
                 the number of diffusion steps used when generating samples with a pre-trained model.
         """
-        offset = self.config.steps_offset
-
         step_ratio = self.config.num_train_timesteps // num_inference_steps
         # creates integer timesteps by multiplying by ratio
-        # casting to int to avoid issues when num_inference_step is power of 3
-        timesteps = (jnp.arange(0, num_inference_steps) * step_ratio).round()[::-1]
-        timesteps = timesteps + offset
+        # rounding to avoid issues when num_inference_step is power of 3
+        timesteps = (jnp.arange(0, num_inference_steps) * step_ratio).round()[::-1] + self.config.steps_offset
+
+        return state.replace(
+            num_inference_steps=num_inference_steps,
+            timesteps=timesteps,
+        )
+
+    def _get_variance(self, state: DDIMSchedulerState, timestep, prev_timestep):
+        alpha_prod_t = state.common.alphas_cumprod[timestep]
+        alpha_prod_t_prev = jnp.where(
+            prev_timestep >= 0, state.common.alphas_cumprod[prev_timestep], state.final_alpha_cumprod
+        )
+        beta_prod_t = 1 - alpha_prod_t
+        beta_prod_t_prev = 1 - alpha_prod_t_prev
+        variance = (beta_prod_t_prev / beta_prod_t) * (1 - alpha_prod_t / alpha_prod_t_prev)

-        return state.replace(num_inference_steps=num_inference_steps, timesteps=timesteps)
+        return variance

     def step(
         self,
@@ -224,6 +212,7 @@ class FlaxDDIMScheduler(FlaxSchedulerMixin, ConfigMixin):
         model_output: jnp.ndarray,
         timestep: int,
         sample: jnp.ndarray,
+        eta: float = 0.0,
         return_dict: bool = True,
     ) -> Union[FlaxDDIMSchedulerOutput, Tuple]:
         """
@@ -259,17 +248,15 @@ class FlaxDDIMScheduler(FlaxSchedulerMixin, ConfigMixin):
         # - pred_sample_direction -> "direction pointing to x_t"
         # - pred_prev_sample -> "x_t-1"

-        # TODO(Patrick) - eta is always 0.0 for now, allow to be set in step function
-        eta = 0.0
-
         # 1. get previous step value (=t-1)
         prev_timestep = timestep - self.config.num_train_timesteps // state.num_inference_steps

-        alphas_cumprod = state.alphas_cumprod
+        alphas_cumprod = state.common.alphas_cumprod
+        final_alpha_cumprod = state.final_alpha_cumprod

         # 2. compute alphas, betas
         alpha_prod_t = alphas_cumprod[timestep]
-        alpha_prod_t_prev = jnp.where(prev_timestep >= 0, alphas_cumprod[prev_timestep], self.final_alpha_cumprod)
+        alpha_prod_t_prev = jnp.where(prev_timestep >= 0, alphas_cumprod[prev_timestep], final_alpha_cumprod)

         beta_prod_t = 1 - alpha_prod_t
@@ -291,7 +278,7 @@ class FlaxDDIMScheduler(FlaxSchedulerMixin, ConfigMixin):
         # 4. compute variance: "sigma_t(η)" -> see formula (16)
         # σ_t = sqrt((1 − α_t−1)/(1 − α_t)) * sqrt(1 − α_t/α_t−1)
-        variance = self._get_variance(timestep, prev_timestep, alphas_cumprod)
+        variance = self._get_variance(state, timestep, prev_timestep)
         std_dev_t = eta * variance ** (0.5)

         # 5. compute "direction pointing to x_t" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf
@@ -307,20 +294,12 @@ class FlaxDDIMScheduler(FlaxSchedulerMixin, ConfigMixin):
     def add_noise(
         self,
+        state: DDIMSchedulerState,
         original_samples: jnp.ndarray,
         noise: jnp.ndarray,
         timesteps: jnp.ndarray,
     ) -> jnp.ndarray:
-        sqrt_alpha_prod = self.alphas_cumprod[timesteps] ** 0.5
-        sqrt_alpha_prod = sqrt_alpha_prod.flatten()
-        sqrt_alpha_prod = broadcast_to_shape_from_left(sqrt_alpha_prod, original_samples.shape)
-
-        sqrt_one_minus_alpha_prod = (1 - self.alphas_cumprod[timesteps]) ** 0.5
-        sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten()
-        sqrt_one_minus_alpha_prod = broadcast_to_shape_from_left(sqrt_one_minus_alpha_prod, original_samples.shape)
-
-        noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise
-        return noisy_samples
+        return add_noise_common(state.common, original_samples, noise, timesteps)

     def __len__(self):
         return self.config.num_train_timesteps
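
Putting the DDIM changes together, the stateless call sequence is roughly: `create_state`, then `set_timesteps`, then `step` with the state passed each time. A toy sketch assuming this commit is installed (identity model output, small shapes; not the pipeline's real denoising loop):

```python
import jax.numpy as jnp
from diffusers import FlaxDDIMScheduler

scheduler = FlaxDDIMScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear")
state = scheduler.create_state()
state = scheduler.set_timesteps(state, num_inference_steps=10, shape=(1, 4, 8, 8))

sample = jnp.ones((1, 4, 8, 8)) * state.init_noise_sigma
for t in state.timesteps:
    model_output = sample  # stand-in for the UNet's noise prediction
    # eta is now an argument of step() instead of being hard-coded to 0.0
    sample = scheduler.step(state, model_output, t, sample, eta=0.0).prev_sample
```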
@@ -14,62 +14,36 @@
 # DISCLAIMER: This file is strongly influenced by https://github.com/ermongroup/ddim

-import math
 from dataclasses import dataclass
 from typing import Optional, Tuple, Union

 import flax
+import jax
 import jax.numpy as jnp
-from jax import random

-from ..configuration_utils import ConfigMixin, FrozenDict, register_to_config
+from ..configuration_utils import ConfigMixin, register_to_config
 from ..utils import deprecate
 from .scheduling_utils_flax import (
     _FLAX_COMPATIBLE_STABLE_DIFFUSION_SCHEDULERS,
+    CommonSchedulerState,
     FlaxSchedulerMixin,
     FlaxSchedulerOutput,
-    broadcast_to_shape_from_left,
+    add_noise_common,
 )


-def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999) -> jnp.ndarray:
-    """
-    Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
-    (1-beta) over time from t = [0,1].
-
-    Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
-    to that part of the diffusion process.
-
-    Args:
-        num_diffusion_timesteps (`int`): the number of betas to produce.
-        max_beta (`float`): the maximum beta to use; use values lower than 1 to
-                     prevent singularities.
-
-    Returns:
-        betas (`jnp.ndarray`): the betas used by the scheduler to step the model outputs
-    """
-
-    def alpha_bar(time_step):
-        return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2
-
-    betas = []
-    for i in range(num_diffusion_timesteps):
-        t1 = i / num_diffusion_timesteps
-        t2 = (i + 1) / num_diffusion_timesteps
-        betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta))
-    return jnp.array(betas, dtype=jnp.float32)


 @flax.struct.dataclass
 class DDPMSchedulerState:
+    common: CommonSchedulerState

     # setable values
+    init_noise_sigma: jnp.ndarray
     timesteps: jnp.ndarray
     num_inference_steps: Optional[int] = None

     @classmethod
-    def create(cls, num_train_timesteps: int):
-        return cls(timesteps=jnp.arange(0, num_train_timesteps)[::-1])
+    def create(cls, common: CommonSchedulerState, init_noise_sigma: jnp.ndarray, timesteps: jnp.ndarray):
+        return cls(common=common, init_noise_sigma=init_noise_sigma, timesteps=timesteps)


 @dataclass
@@ -106,11 +80,15 @@ class FlaxDDPMScheduler(FlaxSchedulerMixin, ConfigMixin):
         prediction_type (`str`, default `epsilon`):
             indicates whether the model predicts the noise (epsilon), or the samples. One of `epsilon`, `sample`.
             `v-prediction` is not supported for this scheduler.
+        dtype (`jnp.dtype`, *optional*, defaults to `jnp.float32`):
+            the `dtype` used for params and computation.
     """

     _compatibles = _FLAX_COMPATIBLE_STABLE_DIFFUSION_SCHEDULERS.copy()
     _deprecated_kwargs = ["predict_epsilon"]

+    dtype: jnp.dtype

     @property
     def has_state(self):
         return True
@@ -126,35 +104,47 @@ class FlaxDDPMScheduler(FlaxSchedulerMixin, ConfigMixin):
         variance_type: str = "fixed_small",
         clip_sample: bool = True,
         prediction_type: str = "epsilon",
+        dtype: jnp.dtype = jnp.float32,
         **kwargs,
     ):
         message = (
             "Please make sure to instantiate your scheduler with `prediction_type` instead. E.g. `scheduler ="
-            " FlaxDDPMScheduler.from_pretrained(<model_id>, prediction_type='epsilon')`."
+            f" {self.__class__.__name__}.from_pretrained(<model_id>, prediction_type='epsilon')`."
         )
         predict_epsilon = deprecate("predict_epsilon", "0.13.0", message, take_from=kwargs)
         if predict_epsilon is not None:
             self.register_to_config(prediction_type="epsilon" if predict_epsilon else "sample")

-        if trained_betas is not None:
-            self.betas = jnp.asarray(trained_betas)
-        elif beta_schedule == "linear":
-            self.betas = jnp.linspace(beta_start, beta_end, num_train_timesteps, dtype=jnp.float32)
-        elif beta_schedule == "scaled_linear":
-            # this schedule is very specific to the latent diffusion model.
-            self.betas = jnp.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=jnp.float32) ** 2
-        elif beta_schedule == "squaredcos_cap_v2":
-            # Glide cosine schedule
-            self.betas = betas_for_alpha_bar(num_train_timesteps)
-        else:
-            raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}")
-
-        self.alphas = 1.0 - self.betas
-        self.alphas_cumprod = jnp.cumprod(self.alphas, axis=0)
-        self.one = jnp.array(1.0)
-
-    def create_state(self):
-        return DDPMSchedulerState.create(num_train_timesteps=self.config.num_train_timesteps)
+        self.dtype = dtype
+
+    def create_state(self, common: Optional[CommonSchedulerState] = None) -> DDPMSchedulerState:
+        if common is None:
+            common = CommonSchedulerState.create(self)
+
+        # standard deviation of the initial noise distribution
+        init_noise_sigma = jnp.array(1.0, dtype=self.dtype)
+
+        timesteps = jnp.arange(0, self.config.num_train_timesteps).round()[::-1]
+
+        return DDPMSchedulerState.create(
+            common=common,
+            init_noise_sigma=init_noise_sigma,
+            timesteps=timesteps,
+        )
+
+    def scale_model_input(
+        self, state: DDPMSchedulerState, sample: jnp.ndarray, timestep: Optional[int] = None
+    ) -> jnp.ndarray:
+        """
+        Args:
+            state (`PNDMSchedulerState`): the `FlaxPNDMScheduler` state data class instance.
+            sample (`jnp.ndarray`): input sample
+            timestep (`int`, optional): current timestep
+
+        Returns:
+            `jnp.ndarray`: scaled input sample
+        """
+        return sample

     def set_timesteps(
         self, state: DDPMSchedulerState, num_inference_steps: int, shape: Tuple = ()
@@ -168,20 +158,25 @@ class FlaxDDPMScheduler(FlaxSchedulerMixin, ConfigMixin):
             num_inference_steps (`int`):
                 the number of diffusion steps used when generating samples with a pre-trained model.
         """
-        num_inference_steps = min(self.config.num_train_timesteps, num_inference_steps)
-        timesteps = jnp.arange(
-            0, self.config.num_train_timesteps, self.config.num_train_timesteps // num_inference_steps
-        )[::-1]
-        return state.replace(num_inference_steps=num_inference_steps, timesteps=timesteps)

-    def _get_variance(self, t, predicted_variance=None, variance_type=None):
-        alpha_prod_t = self.alphas_cumprod[t]
-        alpha_prod_t_prev = self.alphas_cumprod[t - 1] if t > 0 else self.one
+        step_ratio = self.config.num_train_timesteps // num_inference_steps
+        # creates integer timesteps by multiplying by ratio
+        # rounding to avoid issues when num_inference_step is power of 3
+        timesteps = (jnp.arange(0, num_inference_steps) * step_ratio).round()[::-1]
+
+        return state.replace(
+            num_inference_steps=num_inference_steps,
+            timesteps=timesteps,
+        )
+
+    def _get_variance(self, state: DDPMSchedulerState, t, predicted_variance=None, variance_type=None):
+        alpha_prod_t = state.common.alphas_cumprod[t]
+        alpha_prod_t_prev = jnp.where(t > 0, state.common.alphas_cumprod[t - 1], jnp.array(1.0, dtype=self.dtype))

         # For t > 0, compute predicted variance βt (see formula (6) and (7) from https://arxiv.org/pdf/2006.11239.pdf)
         # and sample from it to get previous sample
         # x_{t-1} ~ N(pred_prev_sample, variance) == add variance to pred_sample
-        variance = (1 - alpha_prod_t_prev) / (1 - alpha_prod_t) * self.betas[t]
+        variance = (1 - alpha_prod_t_prev) / (1 - alpha_prod_t) * state.common.betas[t]

         if variance_type is None:
             variance_type = self.config.variance_type
@@ -193,15 +188,15 @@ class FlaxDDPMScheduler(FlaxSchedulerMixin, ConfigMixin):
         elif variance_type == "fixed_small_log":
             variance = jnp.log(jnp.clip(variance, a_min=1e-20))
         elif variance_type == "fixed_large":
-            variance = self.betas[t]
+            variance = state.common.betas[t]
         elif variance_type == "fixed_large_log":
             # Glide max_log
-            variance = jnp.log(self.betas[t])
+            variance = jnp.log(state.common.betas[t])
         elif variance_type == "learned":
             return predicted_variance
         elif variance_type == "learned_range":
             min_log = variance
-            max_log = self.betas[t]
+            max_log = state.common.betas[t]
             frac = (predicted_variance + 1) / 2
             variance = frac * max_log + (1 - frac) * min_log
@@ -213,9 +208,8 @@ class FlaxDDPMScheduler(FlaxSchedulerMixin, ConfigMixin):
         model_output: jnp.ndarray,
         timestep: int,
         sample: jnp.ndarray,
-        key: random.KeyArray,
+        key: jax.random.KeyArray = jax.random.PRNGKey(0),
         return_dict: bool = True,
-        **kwargs,
     ) -> Union[FlaxDDPMSchedulerOutput, Tuple]:
         """
         Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion
@@ -227,7 +221,7 @@ class FlaxDDPMScheduler(FlaxSchedulerMixin, ConfigMixin):
             timestep (`int`): current discrete timestep in the diffusion chain.
             sample (`jnp.ndarray`):
                 current instance of sample being created by diffusion process.
-            key (`random.KeyArray`): a PRNG key.
+            key (`jax.random.KeyArray`): a PRNG key.
             return_dict (`bool`): option for returning tuple rather than FlaxDDPMSchedulerOutput class

         Returns:
@@ -235,16 +229,6 @@ class FlaxDDPMScheduler(FlaxSchedulerMixin, ConfigMixin):
             `tuple`. When returning a tuple, the first element is the sample tensor.

         """
-        message = (
-            "Please make sure to instantiate your scheduler with `prediction_type` instead. E.g. `scheduler ="
-            " FlaxDDPMScheduler.from_pretrained(<model_id>, prediction_type='epsilon')`."
-        )
-        predict_epsilon = deprecate("predict_epsilon", "0.13.0", message, take_from=kwargs)
-        if predict_epsilon is not None:
-            new_config = dict(self.config)
-            new_config["prediction_type"] = "epsilon" if predict_epsilon else "sample"
-            self._internal_dict = FrozenDict(new_config)
-
         t = timestep

         if model_output.shape[1] == sample.shape[1] * 2 and self.config.variance_type in ["learned", "learned_range"]:
@@ -253,8 +237,8 @@ class FlaxDDPMScheduler(FlaxSchedulerMixin, ConfigMixin):
             predicted_variance = None

         # 1. compute alphas, betas
-        alpha_prod_t = self.alphas_cumprod[t]
-        alpha_prod_t_prev = self.alphas_cumprod[t - 1] if t > 0 else self.one
+        alpha_prod_t = state.common.alphas_cumprod[t]
+        alpha_prod_t_prev = jnp.where(t > 0, state.common.alphas_cumprod[t - 1], jnp.array(1.0, dtype=self.dtype))
         beta_prod_t = 1 - alpha_prod_t
         beta_prod_t_prev = 1 - alpha_prod_t_prev
@@ -264,6 +248,8 @@ class FlaxDDPMScheduler(FlaxSchedulerMixin, ConfigMixin):
             pred_original_sample = (sample - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5)
         elif self.config.prediction_type == "sample":
             pred_original_sample = model_output
+        elif self.config.prediction_type == "v_prediction":
+            pred_original_sample = (alpha_prod_t**0.5) * sample - (beta_prod_t**0.5) * model_output
         else:
             raise ValueError(
                 f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample` "
@@ -276,19 +262,20 @@ class FlaxDDPMScheduler(FlaxSchedulerMixin, ConfigMixin):
         # 4. Compute coefficients for pred_original_sample x_0 and current sample x_t
         # See formula (7) from https://arxiv.org/pdf/2006.11239.pdf
-        pred_original_sample_coeff = (alpha_prod_t_prev ** (0.5) * self.betas[t]) / beta_prod_t
-        current_sample_coeff = self.alphas[t] ** (0.5) * beta_prod_t_prev / beta_prod_t
+        pred_original_sample_coeff = (alpha_prod_t_prev ** (0.5) * state.common.betas[t]) / beta_prod_t
+        current_sample_coeff = state.common.alphas[t] ** (0.5) * beta_prod_t_prev / beta_prod_t

         # 5. Compute predicted previous sample µ_t
         # See formula (7) from https://arxiv.org/pdf/2006.11239.pdf
         pred_prev_sample = pred_original_sample_coeff * pred_original_sample + current_sample_coeff * sample

         # 6. Add noise
-        variance = 0
-        if t > 0:
-            key = random.split(key, num=1)
-            noise = random.normal(key=key, shape=model_output.shape)
-            variance = (self._get_variance(t, predicted_variance=predicted_variance) ** 0.5) * noise
+        def random_variance():
+            split_key = jax.random.split(key, num=1)
+            noise = jax.random.normal(split_key, shape=model_output.shape, dtype=self.dtype)
+            return (self._get_variance(state, t, predicted_variance=predicted_variance) ** 0.5) * noise
+
+        variance = jnp.where(t > 0, random_variance(), jnp.zeros(model_output.shape, dtype=self.dtype))

         pred_prev_sample = pred_prev_sample + variance
@@ -299,20 +286,12 @@ class FlaxDDPMScheduler(FlaxSchedulerMixin, ConfigMixin):
     def add_noise(
         self,
+        state: DDPMSchedulerState,
         original_samples: jnp.ndarray,
         noise: jnp.ndarray,
         timesteps: jnp.ndarray,
     ) -> jnp.ndarray:
-        sqrt_alpha_prod = self.alphas_cumprod[timesteps] ** 0.5
-        sqrt_alpha_prod = sqrt_alpha_prod.flatten()
-        sqrt_alpha_prod = broadcast_to_shape_from_left(sqrt_alpha_prod, original_samples.shape)
-
-        sqrt_one_minus_alpha_prod = (1 - self.alphas_cumprod[timesteps]) ** 0.5
-        sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten()
-        sqrt_one_minus_alpha_prod = broadcast_to_shape_from_left(sqrt_one_minus_alpha_prod, original_samples.shape)
-
-        noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise
-        return noisy_samples
+        return add_noise_common(state.common, original_samples, noise, timesteps)

     def __len__(self):
         return self.config.num_train_timesteps
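
A note on the `jnp.where` rewrites in this file: under `jax.jit`/`jax.pmap` the timestep is a tracer, so Python-level branches like `if t > 0:` cannot be used; both branches are computed and `jnp.where` selects between them. A self-contained toy illustration of the pattern:

```python
import jax
import jax.numpy as jnp

alphas_cumprod = jnp.linspace(0.999, 0.01, 1000)

def prev_alpha_cumprod(t):
    # branchless equivalent of: alphas_cumprod[t - 1] if t > 0 else 1.0
    return jnp.where(t > 0, alphas_cumprod[t - 1], jnp.array(1.0))

print(jax.jit(prev_alpha_cumprod)(jnp.array(0)))    # 1.0
print(jax.jit(prev_alpha_cumprod)(jnp.array(500)))  # alphas_cumprod[499]
```

The same reasoning applies to the noise term in `step()`: the random branch is always evaluated and then zeroed out by `jnp.where` when `t == 0`.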
@@ -233,5 +233,5 @@ class FlaxKarrasVeScheduler(FlaxSchedulerMixin, ConfigMixin):
         return FlaxKarrasVeOutput(prev_sample=sample_prev, derivative=derivative, state=state)

-    def add_noise(self, original_samples, noise, timesteps):
+    def add_noise(self, state: KarrasVeSchedulerState, original_samples, noise, timesteps):
         raise NotImplementedError()
@@ -22,6 +22,7 @@ from scipy import integrate
 from ..configuration_utils import ConfigMixin, register_to_config
 from .scheduling_utils_flax import (
     _FLAX_COMPATIBLE_STABLE_DIFFUSION_SCHEDULERS,
+    CommonSchedulerState,
     FlaxSchedulerMixin,
     FlaxSchedulerOutput,
     broadcast_to_shape_from_left,
@@ -30,15 +31,22 @@

 @flax.struct.dataclass
 class LMSDiscreteSchedulerState:
+    common: CommonSchedulerState

     # setable values
+    init_noise_sigma: jnp.ndarray
+    timesteps: jnp.ndarray
+    sigmas: jnp.ndarray
     num_inference_steps: Optional[int] = None
-    timesteps: Optional[jnp.ndarray] = None
-    sigmas: Optional[jnp.ndarray] = None
-    derivatives: jnp.ndarray = jnp.array([])
+
+    # running values
+    derivatives: Optional[jnp.ndarray] = None

     @classmethod
-    def create(cls, num_train_timesteps: int, sigmas: jnp.ndarray):
-        return cls(timesteps=jnp.arange(0, num_train_timesteps)[::-1], sigmas=sigmas)
+    def create(
+        cls, common: CommonSchedulerState, init_noise_sigma: jnp.ndarray, timesteps: jnp.ndarray, sigmas: jnp.ndarray
+    ):
+        return cls(common=common, init_noise_sigma=init_noise_sigma, timesteps=timesteps, sigmas=sigmas)


 @dataclass
@@ -66,10 +74,18 @@ class FlaxLMSDiscreteScheduler(FlaxSchedulerMixin, ConfigMixin):
             `linear` or `scaled_linear`.
         trained_betas (`jnp.ndarray`, optional):
             option to pass an array of betas directly to the constructor to bypass `beta_start`, `beta_end` etc.
+        prediction_type (`str`, default `epsilon`, optional):
+            prediction type of the scheduler function, one of `epsilon` (predicting the noise of the diffusion
+            process), `sample` (directly predicting the noisy sample`) or `v_prediction` (see section 2.4
+            https://imagen.research.google/video/paper.pdf)
+        dtype (`jnp.dtype`, *optional*, defaults to `jnp.float32`):
+            the `dtype` used for params and computation.
     """

     _compatibles = _FLAX_COMPATIBLE_STABLE_DIFFUSION_SCHEDULERS.copy()

+    dtype: jnp.dtype

     @property
     def has_state(self):
         return True
@@ -82,24 +98,26 @@ class FlaxLMSDiscreteScheduler(FlaxSchedulerMixin, ConfigMixin):
         beta_end: float = 0.02,
         beta_schedule: str = "linear",
         trained_betas: Optional[jnp.ndarray] = None,
+        prediction_type: str = "epsilon",
+        dtype: jnp.dtype = jnp.float32,
     ):
-        if trained_betas is not None:
-            self.betas = jnp.asarray(trained_betas)
-        elif beta_schedule == "linear":
-            self.betas = jnp.linspace(beta_start, beta_end, num_train_timesteps, dtype=jnp.float32)
-        elif beta_schedule == "scaled_linear":
-            # this schedule is very specific to the latent diffusion model.
-            self.betas = jnp.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=jnp.float32) ** 2
-        else:
-            raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}")
-
-        self.alphas = 1.0 - self.betas
-        self.alphas_cumprod = jnp.cumprod(self.alphas, axis=0)
-
-    def create_state(self):
-        self.state = LMSDiscreteSchedulerState.create(
-            num_train_timesteps=self.config.num_train_timesteps,
-            sigmas=((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5,
+        self.dtype = dtype
+
+    def create_state(self, common: Optional[CommonSchedulerState] = None) -> LMSDiscreteSchedulerState:
+        if common is None:
+            common = CommonSchedulerState.create(self)
+
+        timesteps = jnp.arange(0, self.config.num_train_timesteps).round()[::-1]
+        sigmas = ((1 - common.alphas_cumprod) / common.alphas_cumprod) ** 0.5
+
+        # standard deviation of the initial noise distribution
+        init_noise_sigma = sigmas.max()
+
+        return LMSDiscreteSchedulerState.create(
+            common=common,
+            init_noise_sigma=init_noise_sigma,
+            timesteps=timesteps,
+            sigmas=sigmas,
         )

     def scale_model_input(self, state: LMSDiscreteSchedulerState, sample: jnp.ndarray, timestep: int) -> jnp.ndarray:
@@ -118,11 +136,13 @@ class FlaxLMSDiscreteScheduler(FlaxSchedulerMixin, ConfigMixin):
             `jnp.ndarray`: scaled input sample
         """
         (step_index,) = jnp.where(state.timesteps == timestep, size=1)
+        step_index = step_index[0]
+
         sigma = state.sigmas[step_index]
         sample = sample / ((sigma**2 + 1) ** 0.5)
         return sample

-    def get_lms_coefficient(self, state, order, t, current_order):
+    def get_lms_coefficient(self, state: LMSDiscreteSchedulerState, order, t, current_order):
         """
         Compute a linear multistep coefficient.
@@ -156,20 +176,28 @@ class FlaxLMSDiscreteScheduler(FlaxSchedulerMixin, ConfigMixin):
             num_inference_steps (`int`):
                 the number of diffusion steps used when generating samples with a pre-trained model.
         """
-        timesteps = jnp.linspace(self.config.num_train_timesteps - 1, 0, num_inference_steps, dtype=jnp.float32)
-        low_idx = jnp.floor(timesteps).astype(int)
-        high_idx = jnp.ceil(timesteps).astype(int)
+
+        timesteps = jnp.linspace(self.config.num_train_timesteps - 1, 0, num_inference_steps, dtype=self.dtype)
+
+        low_idx = jnp.floor(timesteps).astype(jnp.int32)
+        high_idx = jnp.ceil(timesteps).astype(jnp.int32)
+
         frac = jnp.mod(timesteps, 1.0)
-        sigmas = jnp.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5)
+
+        sigmas = ((1 - state.common.alphas_cumprod) / state.common.alphas_cumprod) ** 0.5
         sigmas = (1 - frac) * sigmas[low_idx] + frac * sigmas[high_idx]
-        sigmas = jnp.concatenate([sigmas, jnp.array([0.0])]).astype(jnp.float32)
+        sigmas = jnp.concatenate([sigmas, jnp.array([0.0], dtype=self.dtype)])
+
+        timesteps = timesteps.astype(jnp.int32)
+
+        # initial running values
+        derivatives = jnp.zeros((0,) + shape, dtype=self.dtype)

         return state.replace(
-            num_inference_steps=num_inference_steps,
-            timesteps=timesteps.astype(int),
-            derivatives=jnp.array([]),
+            timesteps=timesteps,
             sigmas=sigmas,
+            num_inference_steps=num_inference_steps,
+            derivatives=derivatives,
         )

     def step(
@@ -199,10 +227,23 @@ class FlaxLMSDiscreteScheduler(FlaxSchedulerMixin, ConfigMixin):
                 `tuple`. When returning a tuple, the first element is the sample tensor.

         """
+        if state.num_inference_steps is None:
+            raise ValueError(
+                "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler"
+            )
+
         sigma = state.sigmas[timestep]

         # 1. compute predicted original sample (x_0) from sigma-scaled predicted noise
-        pred_original_sample = sample - sigma * model_output
+        if self.config.prediction_type == "epsilon":
+            pred_original_sample = sample - sigma * model_output
+        elif self.config.prediction_type == "v_prediction":
+            # * c_out + input * c_skip
+            pred_original_sample = model_output * (-sigma / (sigma**2 + 1) ** 0.5) + (sample / (sigma**2 + 1))
+        else:
+            raise ValueError(
+                f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, or `v_prediction`"
+            )

         # 2. Convert to an ODE derivative
         derivative = (sample - pred_original_sample) / sigma
...
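
For the LMS scheduler, `set_timesteps` now also receives the sample `shape` so that the running `derivatives` buffer can be allocated inside the state (hence the new `ValueError` guard in `step` when `set_timesteps` was never called). A minimal sketch, assuming this commit is installed:

```python
import jax.numpy as jnp
from diffusers import FlaxLMSDiscreteScheduler

scheduler = FlaxLMSDiscreteScheduler(beta_schedule="scaled_linear")
state = scheduler.create_state()
state = scheduler.set_timesteps(state, num_inference_steps=10, shape=(1, 4, 8, 8))

print(state.init_noise_sigma)   # standard deviation used to scale the initial noise
print(state.derivatives.shape)  # (0, 1, 4, 8, 8): empty history, filled by step()
```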
@@ -12,10 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import importlib
+import math
 import os
 from dataclasses import dataclass
 from typing import Any, Dict, Optional, Tuple, Union

+import flax
 import jax.numpy as jnp

 from ..utils import _COMPATIBLE_STABLE_DIFFUSION_SCHEDULERS, BaseOutput
@@ -50,6 +52,7 @@ class FlaxSchedulerMixin:
     """

     config_name = SCHEDULER_CONFIG_NAME
+    ignore_for_config = ["dtype"]
     _compatibles = []
     has_compatibles = True
@@ -167,3 +170,90 @@ class FlaxSchedulerMixin:
 def broadcast_to_shape_from_left(x: jnp.ndarray, shape: Tuple[int]) -> jnp.ndarray:
     assert len(shape) >= x.ndim
     return jnp.broadcast_to(x.reshape(x.shape + (1,) * (len(shape) - x.ndim)), shape)
+
+
+def betas_for_alpha_bar(num_diffusion_timesteps: int, max_beta=0.999, dtype=jnp.float32) -> jnp.ndarray:
+    """
+    Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
+    (1-beta) over time from t = [0,1].
+
+    Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
+    to that part of the diffusion process.
+
+    Args:
+        num_diffusion_timesteps (`int`): the number of betas to produce.
+        max_beta (`float`): the maximum beta to use; use values lower than 1 to
+                     prevent singularities.
+
+    Returns:
+        betas (`jnp.ndarray`): the betas used by the scheduler to step the model outputs
+    """
+
+    def alpha_bar(time_step):
+        return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2
+
+    betas = []
+    for i in range(num_diffusion_timesteps):
+        t1 = i / num_diffusion_timesteps
+        t2 = (i + 1) / num_diffusion_timesteps
+        betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta))
+    return jnp.array(betas, dtype=dtype)
+
+
+@flax.struct.dataclass
+class CommonSchedulerState:
+    alphas: jnp.ndarray
+    betas: jnp.ndarray
+    alphas_cumprod: jnp.ndarray
+
+    @classmethod
+    def create(cls, scheduler):
+        config = scheduler.config
+
+        if config.trained_betas is not None:
+            betas = jnp.asarray(config.trained_betas, dtype=scheduler.dtype)
+        elif config.beta_schedule == "linear":
+            betas = jnp.linspace(config.beta_start, config.beta_end, config.num_train_timesteps, dtype=scheduler.dtype)
+        elif config.beta_schedule == "scaled_linear":
+            # this schedule is very specific to the latent diffusion model.
+            betas = (
+                jnp.linspace(
+                    config.beta_start**0.5, config.beta_end**0.5, config.num_train_timesteps, dtype=scheduler.dtype
+                )
+                ** 2
+            )
+        elif config.beta_schedule == "squaredcos_cap_v2":
+            # Glide cosine schedule
+            betas = betas_for_alpha_bar(config.num_train_timesteps, dtype=scheduler.dtype)
+        else:
+            raise NotImplementedError(
+                f"beta_schedule {config.beta_schedule} is not implemented for scheduler {scheduler.__class__.__name__}"
+            )
+
+        alphas = 1.0 - betas
+
+        alphas_cumprod = jnp.cumprod(alphas, axis=0)
+
+        return cls(
+            alphas=alphas,
+            betas=betas,
+            alphas_cumprod=alphas_cumprod,
+        )
+
+
+def add_noise_common(
+    state: CommonSchedulerState, original_samples: jnp.ndarray, noise: jnp.ndarray, timesteps: jnp.ndarray
+):
+    alphas_cumprod = state.alphas_cumprod
+
+    sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5
+    sqrt_alpha_prod = sqrt_alpha_prod.flatten()
+    sqrt_alpha_prod = broadcast_to_shape_from_left(sqrt_alpha_prod, original_samples.shape)
+
+    sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5
+    sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten()
+    sqrt_one_minus_alpha_prod = broadcast_to_shape_from_left(sqrt_one_minus_alpha_prod, original_samples.shape)
+
+    noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise
+    return noisy_samples
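
The new shared helpers centralize what every scheduler used to duplicate: `CommonSchedulerState.create` derives betas/alphas/alphas_cumprod from a scheduler's config, and `add_noise_common` implements the forward-diffusion q(x_t | x_0) once. A small sketch of using them directly (the import path follows this module's location in the package):

```python
import jax.numpy as jnp
from diffusers import FlaxDDPMScheduler
from diffusers.schedulers.scheduling_utils_flax import CommonSchedulerState, add_noise_common

scheduler = FlaxDDPMScheduler(num_train_timesteps=1000, beta_schedule="linear")
common = CommonSchedulerState.create(scheduler)  # betas, alphas, alphas_cumprod as jnp arrays

x0 = jnp.zeros((2, 4, 8, 8))
noise = jnp.ones_like(x0)
timesteps = jnp.array([10, 500])
# noisy = sqrt(alpha_bar_t) * x0 + sqrt(1 - alpha_bar_t) * noise, broadcast over the batch
noisy = add_noise_common(common, x0, noise, timesteps)
```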
@@ -296,10 +296,11 @@ class FlaxDDPMSchedulerTest(FlaxSchedulerCommonTest):
         scheduler_class = self.scheduler_classes[0]
         scheduler_config = self.get_scheduler_config()
         scheduler = scheduler_class(**scheduler_config)
+        state = scheduler.create_state()

-        assert jnp.sum(jnp.abs(scheduler._get_variance(0) - 0.0)) < 1e-5
-        assert jnp.sum(jnp.abs(scheduler._get_variance(487) - 0.00979)) < 1e-5
-        assert jnp.sum(jnp.abs(scheduler._get_variance(999) - 0.02)) < 1e-5
+        assert jnp.sum(jnp.abs(scheduler._get_variance(state, 0) - 0.0)) < 1e-5
+        assert jnp.sum(jnp.abs(scheduler._get_variance(state, 487) - 0.00979)) < 1e-5
+        assert jnp.sum(jnp.abs(scheduler._get_variance(state, 999) - 0.02)) < 1e-5

     def test_full_loop_no_noise(self):
         scheduler_class = self.scheduler_classes[0]
@@ -577,12 +578,12 @@ class FlaxDDIMSchedulerTest(FlaxSchedulerCommonTest):
         scheduler = scheduler_class(**scheduler_config)
         state = scheduler.create_state()

-        assert jnp.sum(jnp.abs(scheduler._get_variance(0, 0, state.alphas_cumprod) - 0.0)) < 1e-5
-        assert jnp.sum(jnp.abs(scheduler._get_variance(420, 400, state.alphas_cumprod) - 0.14771)) < 1e-5
-        assert jnp.sum(jnp.abs(scheduler._get_variance(980, 960, state.alphas_cumprod) - 0.32460)) < 1e-5
-        assert jnp.sum(jnp.abs(scheduler._get_variance(0, 0, state.alphas_cumprod) - 0.0)) < 1e-5
-        assert jnp.sum(jnp.abs(scheduler._get_variance(487, 486, state.alphas_cumprod) - 0.00979)) < 1e-5
-        assert jnp.sum(jnp.abs(scheduler._get_variance(999, 998, state.alphas_cumprod) - 0.02)) < 1e-5
+        assert jnp.sum(jnp.abs(scheduler._get_variance(state, 0, 0) - 0.0)) < 1e-5
+        assert jnp.sum(jnp.abs(scheduler._get_variance(state, 420, 400) - 0.14771)) < 1e-5
+        assert jnp.sum(jnp.abs(scheduler._get_variance(state, 980, 960) - 0.32460)) < 1e-5
+        assert jnp.sum(jnp.abs(scheduler._get_variance(state, 0, 0) - 0.0)) < 1e-5
+        assert jnp.sum(jnp.abs(scheduler._get_variance(state, 487, 486) - 0.00979)) < 1e-5
+        assert jnp.sum(jnp.abs(scheduler._get_variance(state, 999, 998) - 0.02)) < 1e-5

     def test_full_loop_no_noise(self):
         sample = self.full_loop()
...