Unverified Commit f106ab40 authored by Simon Kirsten, committed by GitHub

[Flax] Stateless schedulers, fixes and refactors (#1661)



* [Flax] Stateless schedulers, fixes and refactors

* Remove scheduling_common_flax and some renames

* Update src/diffusers/schedulers/scheduling_pndm_flax.py
Co-authored-by: Pedro Cuenca <pedro@huggingface.co>

Co-authored-by: Pedro Cuenca <pedro@huggingface.co>
parent d87cc159
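The diff below makes the Flax schedulers stateless: schedule tensors no longer live on `self` but in a `flax.struct.dataclass` state created once by `create_state()` and passed explicitly to `add_noise`, `set_timesteps` and `step`. A minimal sketch of the resulting training-side pattern; the surrounding shapes and variables are illustrative, only the scheduler calls come from the hunks below:

```python
import jax
import jax.numpy as jnp
from diffusers import FlaxDDPMScheduler

# Build the scheduler once and materialize its immutable state.
noise_scheduler = FlaxDDPMScheduler(
    beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", num_train_timesteps=1000
)
noise_scheduler_state = noise_scheduler.create_state()

# Illustrative latents/noise/timesteps; in the training scripts these come from the VAE and the RNG.
rng = jax.random.PRNGKey(0)
latents = jax.random.normal(rng, (4, 4, 64, 64))
noise = jax.random.normal(rng, latents.shape)
timesteps = jax.random.randint(rng, (latents.shape[0],), 0, noise_scheduler.config.num_train_timesteps)

# The state is now the first argument of add_noise (see the training-script hunks below).
noisy_latents = noise_scheduler.add_noise(noise_scheduler_state, latents, noise, timesteps)
```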
@@ -475,6 +475,7 @@ def main():
     noise_scheduler = FlaxDDPMScheduler(
         beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", num_train_timesteps=1000
     )
+    noise_scheduler_state = noise_scheduler.create_state()

     # Initialize our training
     train_rngs = jax.random.split(rng, jax.local_device_count())
@@ -511,7 +512,7 @@ def main():
            # Add noise to the latents according to the noise magnitude at each timestep
            # (this is the forward diffusion process)
-           noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)
+           noisy_latents = noise_scheduler.add_noise(noise_scheduler_state, latents, noise, timesteps)

            # Get the text embedding for conditioning
            if args.train_text_encoder:
......
@@ -417,6 +417,7 @@ def main():
     noise_scheduler = FlaxDDPMScheduler(
         beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", num_train_timesteps=1000
     )
+    noise_scheduler_state = noise_scheduler.create_state()

     # Initialize our training
     rng = jax.random.PRNGKey(args.seed)
@@ -449,7 +450,7 @@ def main():
            # Add noise to the latents according to the noise magnitude at each timestep
            # (this is the forward diffusion process)
-           noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)
+           noisy_latents = noise_scheduler.add_noise(noise_scheduler_state, latents, noise, timesteps)

            # Get the text embedding for conditioning
            encoder_hidden_states = text_encoder(
......
@@ -505,6 +505,7 @@ def main():
     noise_scheduler = FlaxDDPMScheduler(
         beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", num_train_timesteps=1000
     )
+    noise_scheduler_state = noise_scheduler.create_state()

     # Initialize our training
     train_rngs = jax.random.split(rng, jax.local_device_count())
@@ -531,7 +532,7 @@ def main():
                0,
                noise_scheduler.config.num_train_timesteps,
            )
-           noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)
+           noisy_latents = noise_scheduler.add_noise(noise_scheduler_state, latents, noise, timesteps)

            encoder_hidden_states = state.apply_fn(
                batch["input_ids"], params=params, dropout_rng=dropout_rng, train=True
            )[0]
......
@@ -261,7 +261,8 @@ class FlaxStableDiffusionPipeline(FlaxDiffusionPipeline):
            )

        # scale the initial noise by the standard deviation required by the scheduler
-        latents = latents * self.scheduler.init_noise_sigma
+        latents = latents * params["scheduler"].init_noise_sigma

        if DEBUG:
            # run with python for loop
            for i in range(num_inference_steps):
......
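The pipeline now reads the scheduler state from the `params` pytree (hence `params["scheduler"].init_noise_sigma` above), and each scheduler file below follows the same structure: an immutable `flax.struct.dataclass` state plus methods that take and return it. A tiny, self-contained sketch of that pattern, unrelated to any specific scheduler, with made-up field names:

```python
from typing import Optional

import flax
import jax.numpy as jnp

# Illustration only: a flax.struct.dataclass is an immutable pytree, so the scheduler
# state can be carried through jit/pmap next to the model params, and `.replace()`
# returns an updated copy instead of mutating in place.
@flax.struct.dataclass
class ToySchedulerState:
    timesteps: jnp.ndarray
    num_inference_steps: Optional[int] = None

state = ToySchedulerState(timesteps=jnp.arange(0, 1000)[::-1])
state = state.replace(num_inference_steps=50)  # functional update, as in set_timesteps below
```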
@@ -15,7 +15,6 @@
# DISCLAIMER: This code is strongly influenced by https://github.com/pesser/pytorch_diffusion
# and https://github.com/hojonathanho/diffusion

-import math
from dataclasses import dataclass
from typing import Optional, Tuple, Union
@@ -26,51 +25,37 @@ from ..configuration_utils import ConfigMixin, register_to_config
from ..utils import deprecate
from .scheduling_utils_flax import (
    _FLAX_COMPATIBLE_STABLE_DIFFUSION_SCHEDULERS,
+    CommonSchedulerState,
    FlaxSchedulerMixin,
    FlaxSchedulerOutput,
-    broadcast_to_shape_from_left,
+    add_noise_common,
)


-def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999) -> jnp.ndarray:
-    """
-    Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
-    (1-beta) over time from t = [0,1].
-
-    Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
-    to that part of the diffusion process.
-
-    Args:
-        num_diffusion_timesteps (`int`): the number of betas to produce.
-        max_beta (`float`): the maximum beta to use; use values lower than 1 to
-            prevent singularities.
-
-    Returns:
-        betas (`jnp.ndarray`): the betas used by the scheduler to step the model outputs
-    """
-
-    def alpha_bar(time_step):
-        return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2
-
-    betas = []
-    for i in range(num_diffusion_timesteps):
-        t1 = i / num_diffusion_timesteps
-        t2 = (i + 1) / num_diffusion_timesteps
-        betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta))
-    return jnp.array(betas, dtype=jnp.float32)
-
-
@flax.struct.dataclass
class DDIMSchedulerState:
+    common: CommonSchedulerState
+    final_alpha_cumprod: jnp.ndarray

    # setable values
+    init_noise_sigma: jnp.ndarray
    timesteps: jnp.ndarray
-    alphas_cumprod: jnp.ndarray
    num_inference_steps: Optional[int] = None

    @classmethod
-    def create(cls, num_train_timesteps: int, alphas_cumprod: jnp.ndarray):
-        return cls(timesteps=jnp.arange(0, num_train_timesteps)[::-1], alphas_cumprod=alphas_cumprod)
+    def create(
+        cls,
+        common: CommonSchedulerState,
+        final_alpha_cumprod: jnp.ndarray,
+        init_noise_sigma: jnp.ndarray,
+        timesteps: jnp.ndarray,
+    ):
+        return cls(
+            common=common,
+            final_alpha_cumprod=final_alpha_cumprod,
+            init_noise_sigma=init_noise_sigma,
+            timesteps=timesteps,
+        )


@dataclass
@@ -112,12 +97,15 @@ class FlaxDDIMScheduler(FlaxSchedulerMixin, ConfigMixin):
        prediction_type (`str`, default `epsilon`):
            indicates whether the model predicts the noise (epsilon), or the samples. One of `epsilon`, `sample`.
            `v-prediction` is not supported for this scheduler.
+        dtype (`jnp.dtype`, *optional*, defaults to `jnp.float32`):
+            the `dtype` used for params and computation.
    """

    _compatibles = _FLAX_COMPATIBLE_STABLE_DIFFUSION_SCHEDULERS.copy()
    _deprecated_kwargs = ["predict_epsilon"]

+    dtype: jnp.dtype

    @property
    def has_state(self):
        return True
@@ -129,43 +117,46 @@ class FlaxDDIMScheduler(FlaxSchedulerMixin, ConfigMixin):
        beta_start: float = 0.0001,
        beta_end: float = 0.02,
        beta_schedule: str = "linear",
+        trained_betas: Optional[jnp.ndarray] = None,
        set_alpha_to_one: bool = True,
        steps_offset: int = 0,
        prediction_type: str = "epsilon",
+        dtype: jnp.dtype = jnp.float32,
        **kwargs,
    ):
        message = (
            "Please make sure to instantiate your scheduler with `prediction_type` instead. E.g. `scheduler ="
-            " FlaxDDIMScheduler.from_pretrained(<model_id>, prediction_type='epsilon')`."
+            f" {self.__class__.__name__}.from_pretrained(<model_id>, prediction_type='epsilon')`."
        )
        predict_epsilon = deprecate("predict_epsilon", "0.13.0", message, take_from=kwargs)
        if predict_epsilon is not None:
            self.register_to_config(prediction_type="epsilon" if predict_epsilon else "sample")

-        if beta_schedule == "linear":
-            self.betas = jnp.linspace(beta_start, beta_end, num_train_timesteps, dtype=jnp.float32)
-        elif beta_schedule == "scaled_linear":
-            # this schedule is very specific to the latent diffusion model.
-            self.betas = jnp.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=jnp.float32) ** 2
-        elif beta_schedule == "squaredcos_cap_v2":
-            # Glide cosine schedule
-            self.betas = betas_for_alpha_bar(num_train_timesteps)
-        else:
-            raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}")
-
-        self.alphas = 1.0 - self.betas
-
-        # HACK for now - clean up later (PVP)
-        self._alphas_cumprod = jnp.cumprod(self.alphas, axis=0)
+        self.dtype = dtype
+
+    def create_state(self, common: Optional[CommonSchedulerState] = None) -> DDIMSchedulerState:
+        if common is None:
+            common = CommonSchedulerState.create(self)

        # At every step in ddim, we are looking into the previous alphas_cumprod
        # For the final step, there is no previous alphas_cumprod because we are already at 0
        # `set_alpha_to_one` decides whether we set this parameter simply to one or
        # whether we use the final alpha of the "non-previous" one.
-        self.final_alpha_cumprod = jnp.array(1.0) if set_alpha_to_one else float(self._alphas_cumprod[0])
+        final_alpha_cumprod = (
+            jnp.array(1.0, dtype=self.dtype) if self.config.set_alpha_to_one else common.alphas_cumprod[0]
+        )

        # standard deviation of the initial noise distribution
-        self.init_noise_sigma = 1.0
+        init_noise_sigma = jnp.array(1.0, dtype=self.dtype)
+
+        timesteps = jnp.arange(0, self.config.num_train_timesteps).round()[::-1]
+
+        return DDIMSchedulerState.create(
+            common=common,
+            final_alpha_cumprod=final_alpha_cumprod,
+            init_noise_sigma=init_noise_sigma,
+            timesteps=timesteps,
+        )
    def scale_model_input(
        self, state: DDIMSchedulerState, sample: jnp.ndarray, timestep: Optional[int] = None
@@ -181,21 +172,6 @@ class FlaxDDIMScheduler(FlaxSchedulerMixin, ConfigMixin):
        """
        return sample

-    def create_state(self):
-        return DDIMSchedulerState.create(
-            num_train_timesteps=self.config.num_train_timesteps, alphas_cumprod=self._alphas_cumprod
-        )
-
-    def _get_variance(self, timestep, prev_timestep, alphas_cumprod):
-        alpha_prod_t = alphas_cumprod[timestep]
-        alpha_prod_t_prev = jnp.where(prev_timestep >= 0, alphas_cumprod[prev_timestep], self.final_alpha_cumprod)
-
-        beta_prod_t = 1 - alpha_prod_t
-        beta_prod_t_prev = 1 - alpha_prod_t_prev
-
-        variance = (beta_prod_t_prev / beta_prod_t) * (1 - alpha_prod_t / alpha_prod_t_prev)
-
-        return variance
-
    def set_timesteps(
        self, state: DDIMSchedulerState, num_inference_steps: int, shape: Tuple = ()
    ) -> DDIMSchedulerState:
@@ -208,15 +184,27 @@
            num_inference_steps (`int`):
                the number of diffusion steps used when generating samples with a pre-trained model.
        """
-        offset = self.config.steps_offset

        step_ratio = self.config.num_train_timesteps // num_inference_steps
        # creates integer timesteps by multiplying by ratio
-        # casting to int to avoid issues when num_inference_step is power of 3
-        timesteps = (jnp.arange(0, num_inference_steps) * step_ratio).round()[::-1]
-        timesteps = timesteps + offset
-
-        return state.replace(num_inference_steps=num_inference_steps, timesteps=timesteps)
+        # rounding to avoid issues when num_inference_step is power of 3
+        timesteps = (jnp.arange(0, num_inference_steps) * step_ratio).round()[::-1] + self.config.steps_offset
+
+        return state.replace(
+            num_inference_steps=num_inference_steps,
+            timesteps=timesteps,
+        )
+
+    def _get_variance(self, state: DDIMSchedulerState, timestep, prev_timestep):
+        alpha_prod_t = state.common.alphas_cumprod[timestep]
+        alpha_prod_t_prev = jnp.where(
+            prev_timestep >= 0, state.common.alphas_cumprod[prev_timestep], state.final_alpha_cumprod
+        )
+        beta_prod_t = 1 - alpha_prod_t
+        beta_prod_t_prev = 1 - alpha_prod_t_prev
+
+        variance = (beta_prod_t_prev / beta_prod_t) * (1 - alpha_prod_t / alpha_prod_t_prev)
+
+        return variance
    def step(
        self,
@@ -224,6 +212,7 @@
        model_output: jnp.ndarray,
        timestep: int,
        sample: jnp.ndarray,
+        eta: float = 0.0,
        return_dict: bool = True,
    ) -> Union[FlaxDDIMSchedulerOutput, Tuple]:
        """
@@ -259,17 +248,15 @@
        # - pred_sample_direction -> "direction pointing to x_t"
        # - pred_prev_sample -> "x_t-1"

-        # TODO(Patrick) - eta is always 0.0 for now, allow to be set in step function
-        eta = 0.0

        # 1. get previous step value (=t-1)
        prev_timestep = timestep - self.config.num_train_timesteps // state.num_inference_steps

-        alphas_cumprod = state.alphas_cumprod
+        alphas_cumprod = state.common.alphas_cumprod
+        final_alpha_cumprod = state.final_alpha_cumprod

        # 2. compute alphas, betas
        alpha_prod_t = alphas_cumprod[timestep]
-        alpha_prod_t_prev = jnp.where(prev_timestep >= 0, alphas_cumprod[prev_timestep], self.final_alpha_cumprod)
+        alpha_prod_t_prev = jnp.where(prev_timestep >= 0, alphas_cumprod[prev_timestep], final_alpha_cumprod)

        beta_prod_t = 1 - alpha_prod_t
@@ -291,7 +278,7 @@
        # 4. compute variance: "sigma_t(η)" -> see formula (16)
        # σ_t = sqrt((1 − α_t−1)/(1 − α_t)) * sqrt(1 − α_t/α_t−1)
-        variance = self._get_variance(timestep, prev_timestep, alphas_cumprod)
+        variance = self._get_variance(state, timestep, prev_timestep)
        std_dev_t = eta * variance ** (0.5)

        # 5. compute "direction pointing to x_t" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf
@@ -307,20 +294,12 @@
    def add_noise(
        self,
+        state: DDIMSchedulerState,
        original_samples: jnp.ndarray,
        noise: jnp.ndarray,
        timesteps: jnp.ndarray,
    ) -> jnp.ndarray:
-        sqrt_alpha_prod = self.alphas_cumprod[timesteps] ** 0.5
-        sqrt_alpha_prod = sqrt_alpha_prod.flatten()
-        sqrt_alpha_prod = broadcast_to_shape_from_left(sqrt_alpha_prod, original_samples.shape)
-
-        sqrt_one_minus_alpha_prod = (1 - self.alphas_cumprod[timesteps]) ** 0.5
-        sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten()
-        sqrt_one_minus_alpha_prod = broadcast_to_shape_from_left(sqrt_one_minus_alpha_prod, original_samples.shape)
-
-        noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise
-        return noisy_samples
+        return add_noise_common(state.common, original_samples, noise, timesteps)

    def __len__(self):
        return self.config.num_train_timesteps
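For `FlaxDDIMScheduler`, the net effect of the hunks above is that sampling threads the state explicitly: `set_timesteps` returns an updated state and `step` takes it as its first argument (and now accepts `eta`). A rough usage sketch, assuming the usual `FlaxDDIMSchedulerOutput` fields (`prev_sample`, `state`); `apply_unet` is a stand-in for the real model call. The same treatment is applied to `FlaxDDPMScheduler` in the next file.

```python
import jax
import jax.numpy as jnp
from diffusers import FlaxDDIMScheduler

def apply_unet(latents, t):
    # Stand-in for the UNet forward pass; not part of this PR.
    return jnp.zeros_like(latents)

scheduler = FlaxDDIMScheduler(num_train_timesteps=1000)
state = scheduler.create_state()

shape = (1, 4, 64, 64)
state = scheduler.set_timesteps(state, num_inference_steps=50, shape=shape)

rng = jax.random.PRNGKey(0)
latents = jax.random.normal(rng, shape) * state.init_noise_sigma

for t in state.timesteps:
    noise_pred = apply_unet(latents, t)
    out = scheduler.step(state, noise_pred, t, latents, eta=0.0)
    latents, state = out.prev_sample, out.state
```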
@@ -14,62 +14,36 @@
# DISCLAIMER: This file is strongly influenced by https://github.com/ermongroup/ddim

-import math
from dataclasses import dataclass
from typing import Optional, Tuple, Union

import flax
+import jax
import jax.numpy as jnp
-from jax import random

-from ..configuration_utils import ConfigMixin, FrozenDict, register_to_config
+from ..configuration_utils import ConfigMixin, register_to_config
from ..utils import deprecate
from .scheduling_utils_flax import (
    _FLAX_COMPATIBLE_STABLE_DIFFUSION_SCHEDULERS,
+    CommonSchedulerState,
    FlaxSchedulerMixin,
    FlaxSchedulerOutput,
-    broadcast_to_shape_from_left,
+    add_noise_common,
)


-def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999) -> jnp.ndarray:
-    """
-    Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
-    (1-beta) over time from t = [0,1].
-
-    Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
-    to that part of the diffusion process.
-
-    Args:
-        num_diffusion_timesteps (`int`): the number of betas to produce.
-        max_beta (`float`): the maximum beta to use; use values lower than 1 to
-            prevent singularities.
-
-    Returns:
-        betas (`jnp.ndarray`): the betas used by the scheduler to step the model outputs
-    """
-
-    def alpha_bar(time_step):
-        return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2
-
-    betas = []
-    for i in range(num_diffusion_timesteps):
-        t1 = i / num_diffusion_timesteps
-        t2 = (i + 1) / num_diffusion_timesteps
-        betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta))
-    return jnp.array(betas, dtype=jnp.float32)
-
-
@flax.struct.dataclass
class DDPMSchedulerState:
+    common: CommonSchedulerState

    # setable values
+    init_noise_sigma: jnp.ndarray
    timesteps: jnp.ndarray
    num_inference_steps: Optional[int] = None

    @classmethod
-    def create(cls, num_train_timesteps: int):
-        return cls(timesteps=jnp.arange(0, num_train_timesteps)[::-1])
+    def create(cls, common: CommonSchedulerState, init_noise_sigma: jnp.ndarray, timesteps: jnp.ndarray):
+        return cls(common=common, init_noise_sigma=init_noise_sigma, timesteps=timesteps)


@dataclass
@@ -106,11 +80,15 @@ class FlaxDDPMScheduler(FlaxSchedulerMixin, ConfigMixin):
        prediction_type (`str`, default `epsilon`):
            indicates whether the model predicts the noise (epsilon), or the samples. One of `epsilon`, `sample`.
            `v-prediction` is not supported for this scheduler.
+        dtype (`jnp.dtype`, *optional*, defaults to `jnp.float32`):
+            the `dtype` used for params and computation.
    """

    _compatibles = _FLAX_COMPATIBLE_STABLE_DIFFUSION_SCHEDULERS.copy()
    _deprecated_kwargs = ["predict_epsilon"]

+    dtype: jnp.dtype

    @property
    def has_state(self):
        return True
@@ -126,35 +104,47 @@ class FlaxDDPMScheduler(FlaxSchedulerMixin, ConfigMixin):
        variance_type: str = "fixed_small",
        clip_sample: bool = True,
        prediction_type: str = "epsilon",
+        dtype: jnp.dtype = jnp.float32,
        **kwargs,
    ):
        message = (
            "Please make sure to instantiate your scheduler with `prediction_type` instead. E.g. `scheduler ="
-            " FlaxDDPMScheduler.from_pretrained(<model_id>, prediction_type='epsilon')`."
+            f" {self.__class__.__name__}.from_pretrained(<model_id>, prediction_type='epsilon')`."
        )
        predict_epsilon = deprecate("predict_epsilon", "0.13.0", message, take_from=kwargs)
        if predict_epsilon is not None:
            self.register_to_config(prediction_type="epsilon" if predict_epsilon else "sample")

-        if trained_betas is not None:
-            self.betas = jnp.asarray(trained_betas)
-        elif beta_schedule == "linear":
-            self.betas = jnp.linspace(beta_start, beta_end, num_train_timesteps, dtype=jnp.float32)
-        elif beta_schedule == "scaled_linear":
-            # this schedule is very specific to the latent diffusion model.
-            self.betas = jnp.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=jnp.float32) ** 2
-        elif beta_schedule == "squaredcos_cap_v2":
-            # Glide cosine schedule
-            self.betas = betas_for_alpha_bar(num_train_timesteps)
-        else:
-            raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}")
-
-        self.alphas = 1.0 - self.betas
-        self.alphas_cumprod = jnp.cumprod(self.alphas, axis=0)
-        self.one = jnp.array(1.0)
-
-    def create_state(self):
-        return DDPMSchedulerState.create(num_train_timesteps=self.config.num_train_timesteps)
+        self.dtype = dtype
+
+    def create_state(self, common: Optional[CommonSchedulerState] = None) -> DDPMSchedulerState:
+        if common is None:
+            common = CommonSchedulerState.create(self)
+
+        # standard deviation of the initial noise distribution
+        init_noise_sigma = jnp.array(1.0, dtype=self.dtype)
+
+        timesteps = jnp.arange(0, self.config.num_train_timesteps).round()[::-1]
+
+        return DDPMSchedulerState.create(
+            common=common,
+            init_noise_sigma=init_noise_sigma,
+            timesteps=timesteps,
+        )
+
+    def scale_model_input(
+        self, state: DDPMSchedulerState, sample: jnp.ndarray, timestep: Optional[int] = None
+    ) -> jnp.ndarray:
+        """
+        Args:
+            state (`PNDMSchedulerState`): the `FlaxPNDMScheduler` state data class instance.
+            sample (`jnp.ndarray`): input sample
+            timestep (`int`, optional): current timestep
+
+        Returns:
+            `jnp.ndarray`: scaled input sample
+        """
+        return sample
    def set_timesteps(
        self, state: DDPMSchedulerState, num_inference_steps: int, shape: Tuple = ()
@@ -168,20 +158,25 @@ class FlaxDDPMScheduler(FlaxSchedulerMixin, ConfigMixin):
            num_inference_steps (`int`):
                the number of diffusion steps used when generating samples with a pre-trained model.
        """
-        num_inference_steps = min(self.config.num_train_timesteps, num_inference_steps)
-        timesteps = jnp.arange(
-            0, self.config.num_train_timesteps, self.config.num_train_timesteps // num_inference_steps
-        )[::-1]
-        return state.replace(num_inference_steps=num_inference_steps, timesteps=timesteps)
-
-    def _get_variance(self, t, predicted_variance=None, variance_type=None):
-        alpha_prod_t = self.alphas_cumprod[t]
-        alpha_prod_t_prev = self.alphas_cumprod[t - 1] if t > 0 else self.one
+        step_ratio = self.config.num_train_timesteps // num_inference_steps
+        # creates integer timesteps by multiplying by ratio
+        # rounding to avoid issues when num_inference_step is power of 3
+        timesteps = (jnp.arange(0, num_inference_steps) * step_ratio).round()[::-1]
+
+        return state.replace(
+            num_inference_steps=num_inference_steps,
+            timesteps=timesteps,
+        )
+
+    def _get_variance(self, state: DDPMSchedulerState, t, predicted_variance=None, variance_type=None):
+        alpha_prod_t = state.common.alphas_cumprod[t]
+        alpha_prod_t_prev = jnp.where(t > 0, state.common.alphas_cumprod[t - 1], jnp.array(1.0, dtype=self.dtype))

        # For t > 0, compute predicted variance βt (see formula (6) and (7) from https://arxiv.org/pdf/2006.11239.pdf)
        # and sample from it to get previous sample
        # x_{t-1} ~ N(pred_prev_sample, variance) == add variance to pred_sample
-        variance = (1 - alpha_prod_t_prev) / (1 - alpha_prod_t) * self.betas[t]
+        variance = (1 - alpha_prod_t_prev) / (1 - alpha_prod_t) * state.common.betas[t]

        if variance_type is None:
            variance_type = self.config.variance_type
@@ -193,15 +188,15 @@ class FlaxDDPMScheduler(FlaxSchedulerMixin, ConfigMixin):
        elif variance_type == "fixed_small_log":
            variance = jnp.log(jnp.clip(variance, a_min=1e-20))
        elif variance_type == "fixed_large":
-            variance = self.betas[t]
+            variance = state.common.betas[t]
        elif variance_type == "fixed_large_log":
            # Glide max_log
-            variance = jnp.log(self.betas[t])
+            variance = jnp.log(state.common.betas[t])
        elif variance_type == "learned":
            return predicted_variance
        elif variance_type == "learned_range":
            min_log = variance
-            max_log = self.betas[t]
+            max_log = state.common.betas[t]
            frac = (predicted_variance + 1) / 2
            variance = frac * max_log + (1 - frac) * min_log
@@ -213,9 +208,8 @@ class FlaxDDPMScheduler(FlaxSchedulerMixin, ConfigMixin):
        model_output: jnp.ndarray,
        timestep: int,
        sample: jnp.ndarray,
-        key: random.KeyArray,
+        key: jax.random.KeyArray = jax.random.PRNGKey(0),
        return_dict: bool = True,
-        **kwargs,
    ) -> Union[FlaxDDPMSchedulerOutput, Tuple]:
        """
        Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion
@@ -227,7 +221,7 @@ class FlaxDDPMScheduler(FlaxSchedulerMixin, ConfigMixin):
            timestep (`int`): current discrete timestep in the diffusion chain.
            sample (`jnp.ndarray`):
                current instance of sample being created by diffusion process.
-            key (`random.KeyArray`): a PRNG key.
+            key (`jax.random.KeyArray`): a PRNG key.
            return_dict (`bool`): option for returning tuple rather than FlaxDDPMSchedulerOutput class

        Returns:
@@ -235,16 +229,6 @@ class FlaxDDPMScheduler(FlaxSchedulerMixin, ConfigMixin):
            `tuple`. When returning a tuple, the first element is the sample tensor.

        """
-        message = (
-            "Please make sure to instantiate your scheduler with `prediction_type` instead. E.g. `scheduler ="
-            " FlaxDDPMScheduler.from_pretrained(<model_id>, prediction_type='epsilon')`."
-        )
-        predict_epsilon = deprecate("predict_epsilon", "0.13.0", message, take_from=kwargs)
-        if predict_epsilon is not None:
-            new_config = dict(self.config)
-            new_config["prediction_type"] = "epsilon" if predict_epsilon else "sample"
-            self._internal_dict = FrozenDict(new_config)
-
        t = timestep

        if model_output.shape[1] == sample.shape[1] * 2 and self.config.variance_type in ["learned", "learned_range"]:
@@ -253,8 +237,8 @@ class FlaxDDPMScheduler(FlaxSchedulerMixin, ConfigMixin):
            predicted_variance = None

        # 1. compute alphas, betas
-        alpha_prod_t = self.alphas_cumprod[t]
-        alpha_prod_t_prev = self.alphas_cumprod[t - 1] if t > 0 else self.one
+        alpha_prod_t = state.common.alphas_cumprod[t]
+        alpha_prod_t_prev = jnp.where(t > 0, state.common.alphas_cumprod[t - 1], jnp.array(1.0, dtype=self.dtype))
        beta_prod_t = 1 - alpha_prod_t
        beta_prod_t_prev = 1 - alpha_prod_t_prev
@@ -264,6 +248,8 @@ class FlaxDDPMScheduler(FlaxSchedulerMixin, ConfigMixin):
            pred_original_sample = (sample - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5)
        elif self.config.prediction_type == "sample":
            pred_original_sample = model_output
+        elif self.config.prediction_type == "v_prediction":
+            pred_original_sample = (alpha_prod_t**0.5) * sample - (beta_prod_t**0.5) * model_output
        else:
            raise ValueError(
                f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample` "
@@ -276,19 +262,20 @@ class FlaxDDPMScheduler(FlaxSchedulerMixin, ConfigMixin):
        # 4. Compute coefficients for pred_original_sample x_0 and current sample x_t
        # See formula (7) from https://arxiv.org/pdf/2006.11239.pdf
-        pred_original_sample_coeff = (alpha_prod_t_prev ** (0.5) * self.betas[t]) / beta_prod_t
-        current_sample_coeff = self.alphas[t] ** (0.5) * beta_prod_t_prev / beta_prod_t
+        pred_original_sample_coeff = (alpha_prod_t_prev ** (0.5) * state.common.betas[t]) / beta_prod_t
+        current_sample_coeff = state.common.alphas[t] ** (0.5) * beta_prod_t_prev / beta_prod_t

        # 5. Compute predicted previous sample µ_t
        # See formula (7) from https://arxiv.org/pdf/2006.11239.pdf
        pred_prev_sample = pred_original_sample_coeff * pred_original_sample + current_sample_coeff * sample

        # 6. Add noise
-        variance = 0
-        if t > 0:
-            key = random.split(key, num=1)
-            noise = random.normal(key=key, shape=model_output.shape)
-            variance = (self._get_variance(t, predicted_variance=predicted_variance) ** 0.5) * noise
+        def random_variance():
+            split_key = jax.random.split(key, num=1)
+            noise = jax.random.normal(split_key, shape=model_output.shape, dtype=self.dtype)
+            return (self._get_variance(state, t, predicted_variance=predicted_variance) ** 0.5) * noise
+
+        variance = jnp.where(t > 0, random_variance(), jnp.zeros(model_output.shape, dtype=self.dtype))

        pred_prev_sample = pred_prev_sample + variance
@@ -299,20 +286,12 @@ class FlaxDDPMScheduler(FlaxSchedulerMixin, ConfigMixin):
    def add_noise(
        self,
+        state: DDPMSchedulerState,
        original_samples: jnp.ndarray,
        noise: jnp.ndarray,
        timesteps: jnp.ndarray,
    ) -> jnp.ndarray:
-        sqrt_alpha_prod = self.alphas_cumprod[timesteps] ** 0.5
-        sqrt_alpha_prod = sqrt_alpha_prod.flatten()
-        sqrt_alpha_prod = broadcast_to_shape_from_left(sqrt_alpha_prod, original_samples.shape)
-
-        sqrt_one_minus_alpha_prod = (1 - self.alphas_cumprod[timesteps]) ** 0.5
-        sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten()
-        sqrt_one_minus_alpha_prod = broadcast_to_shape_from_left(sqrt_one_minus_alpha_prod, original_samples.shape)
-
-        noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise
-        return noisy_samples
+        return add_noise_common(state.common, original_samples, noise, timesteps)

    def __len__(self):
        return self.config.num_train_timesteps
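The DDPM `step` above also becomes branch-free: the Python `if t > 0:` is replaced by computing the noise term unconditionally and selecting with `jnp.where`, so the step can run under `jax.jit`/`pmap`, where `t` is a traced value. A small generic sketch of that pattern (an illustration, not the library code itself); the same state-threading is applied to `FlaxDPMSolverMultistepScheduler` in the next file:

```python
import jax
import jax.numpy as jnp

def add_variance(key, t, pred_prev_sample, variance):
    # Compute the "t > 0" branch unconditionally, then select the result.
    # A Python `if t > 0:` would fail once `t` is a tracer under jit; jnp.where does not.
    noise = jax.random.normal(key, pred_prev_sample.shape)
    noisy = pred_prev_sample + variance**0.5 * noise
    return jnp.where(t > 0, noisy, pred_prev_sample)

sample = jnp.zeros((2, 3))
out = jax.jit(add_variance)(jax.random.PRNGKey(0), jnp.int32(10), sample, jnp.float32(0.1))
```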
@@ -14,7 +14,6 @@
# DISCLAIMER: This file is strongly influenced by https://github.com/LuChengTHU/dpm-solver

-import math
from dataclasses import dataclass
from typing import List, Optional, Tuple, Union
@@ -26,57 +25,49 @@ from ..configuration_utils import ConfigMixin, register_to_config
from ..utils import deprecate
from .scheduling_utils_flax import (
    _FLAX_COMPATIBLE_STABLE_DIFFUSION_SCHEDULERS,
+    CommonSchedulerState,
    FlaxSchedulerMixin,
    FlaxSchedulerOutput,
-    broadcast_to_shape_from_left,
+    add_noise_common,
)


-def betas_for_alpha_bar(num_diffusion_timesteps: int, max_beta=0.999) -> jnp.ndarray:
-    """
-    Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
-    (1-beta) over time from t = [0,1].
-
-    Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
-    to that part of the diffusion process.
-
-    Args:
-        num_diffusion_timesteps (`int`): the number of betas to produce.
-        max_beta (`float`): the maximum beta to use; use values lower than 1 to
-            prevent singularities.
-
-    Returns:
-        betas (`jnp.ndarray`): the betas used by the scheduler to step the model outputs
-    """
-
-    def alpha_bar(time_step):
-        return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2
-
-    betas = []
-    for i in range(num_diffusion_timesteps):
-        t1 = i / num_diffusion_timesteps
-        t2 = (i + 1) / num_diffusion_timesteps
-        betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta))
-    return jnp.array(betas, dtype=jnp.float32)
-
-
@flax.struct.dataclass
class DPMSolverMultistepSchedulerState:
+    common: CommonSchedulerState
+    alpha_t: jnp.ndarray
+    sigma_t: jnp.ndarray
+    lambda_t: jnp.ndarray

    # setable values
+    init_noise_sigma: jnp.ndarray
+    timesteps: jnp.ndarray
    num_inference_steps: Optional[int] = None
-    timesteps: Optional[jnp.ndarray] = None

    # running values
    model_outputs: Optional[jnp.ndarray] = None
-    lower_order_nums: Optional[int] = None
-    step_index: Optional[int] = None
-    prev_timestep: Optional[int] = None
+    lower_order_nums: Optional[jnp.int32] = None
+    prev_timestep: Optional[jnp.int32] = None
    cur_sample: Optional[jnp.ndarray] = None

    @classmethod
-    def create(cls, num_train_timesteps: int):
-        return cls(timesteps=jnp.arange(0, num_train_timesteps)[::-1])
+    def create(
+        cls,
+        common: CommonSchedulerState,
+        alpha_t: jnp.ndarray,
+        sigma_t: jnp.ndarray,
+        lambda_t: jnp.ndarray,
+        init_noise_sigma: jnp.ndarray,
+        timesteps: jnp.ndarray,
+    ):
+        return cls(
+            common=common,
+            alpha_t=alpha_t,
+            sigma_t=sigma_t,
+            lambda_t=lambda_t,
+            init_noise_sigma=init_noise_sigma,
+            timesteps=timesteps,
+        )


@dataclass
@@ -145,12 +136,15 @@ class FlaxDPMSolverMultistepScheduler(FlaxSchedulerMixin, ConfigMixin):
        lower_order_final (`bool`, default `True`):
            whether to use lower-order solvers in the final steps. Only valid for < 15 inference steps. We empirically
            find this trick can stabilize the sampling of DPM-Solver for steps < 15, especially for steps <= 10.
+        dtype (`jnp.dtype`, *optional*, defaults to `jnp.float32`):
+            the `dtype` used for params and computation.
    """

    _compatibles = _FLAX_COMPATIBLE_STABLE_DIFFUSION_SCHEDULERS.copy()
    _deprecated_kwargs = ["predict_epsilon"]

+    dtype: jnp.dtype

    @property
    def has_state(self):
        return True
@@ -171,47 +165,47 @@ class FlaxDPMSolverMultistepScheduler(FlaxSchedulerMixin, ConfigMixin):
        algorithm_type: str = "dpmsolver++",
        solver_type: str = "midpoint",
        lower_order_final: bool = True,
+        dtype: jnp.dtype = jnp.float32,
        **kwargs,
    ):
        message = (
            "Please make sure to instantiate your scheduler with `prediction_type` instead. E.g. `scheduler ="
-            " FlaxDPMSolverMultistepScheduler.from_pretrained(<model_id>, prediction_type='epsilon')`."
+            f" {self.__class__.__name__}.from_pretrained(<model_id>, prediction_type='epsilon')`."
        )
        predict_epsilon = deprecate("predict_epsilon", "0.13.0", message, take_from=kwargs)
        if predict_epsilon is not None:
            self.register_to_config(prediction_type="epsilon" if predict_epsilon else "sample")

-        if trained_betas is not None:
-            self.betas = jnp.asarray(trained_betas)
-        elif beta_schedule == "linear":
-            self.betas = jnp.linspace(beta_start, beta_end, num_train_timesteps, dtype=jnp.float32)
-        elif beta_schedule == "scaled_linear":
-            # this schedule is very specific to the latent diffusion model.
-            self.betas = jnp.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=jnp.float32) ** 2
-        elif beta_schedule == "squaredcos_cap_v2":
-            # Glide cosine schedule
-            self.betas = betas_for_alpha_bar(num_train_timesteps)
-        else:
-            raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}")
-
-        self.alphas = 1.0 - self.betas
-        self.alphas_cumprod = jnp.cumprod(self.alphas, axis=0)
+        self.dtype = dtype
+
+    def create_state(self, common: Optional[CommonSchedulerState] = None) -> DPMSolverMultistepSchedulerState:
+        if common is None:
+            common = CommonSchedulerState.create(self)

        # Currently we only support VP-type noise schedule
-        self.alpha_t = jnp.sqrt(self.alphas_cumprod)
-        self.sigma_t = jnp.sqrt(1 - self.alphas_cumprod)
-        self.lambda_t = jnp.log(self.alpha_t) - jnp.log(self.sigma_t)
+        alpha_t = jnp.sqrt(common.alphas_cumprod)
+        sigma_t = jnp.sqrt(1 - common.alphas_cumprod)
+        lambda_t = jnp.log(alpha_t) - jnp.log(sigma_t)

        # settings for DPM-Solver
-        if algorithm_type not in ["dpmsolver", "dpmsolver++"]:
-            raise NotImplementedError(f"{algorithm_type} does is not implemented for {self.__class__}")
-        if solver_type not in ["midpoint", "heun"]:
-            raise NotImplementedError(f"{solver_type} does is not implemented for {self.__class__}")
+        if self.config.algorithm_type not in ["dpmsolver", "dpmsolver++"]:
+            raise NotImplementedError(f"{self.config.algorithm_type} does is not implemented for {self.__class__}")
+        if self.config.solver_type not in ["midpoint", "heun"]:
+            raise NotImplementedError(f"{self.config.solver_type} does is not implemented for {self.__class__}")

        # standard deviation of the initial noise distribution
-        self.init_noise_sigma = 1.0
+        init_noise_sigma = jnp.array(1.0, dtype=self.dtype)

-    def create_state(self):
-        return DPMSolverMultistepSchedulerState.create(num_train_timesteps=self.config.num_train_timesteps)
+        timesteps = jnp.arange(0, self.config.num_train_timesteps).round()[::-1]
+
+        return DPMSolverMultistepSchedulerState.create(
+            common=common,
+            alpha_t=alpha_t,
+            sigma_t=sigma_t,
+            lambda_t=lambda_t,
+            init_noise_sigma=init_noise_sigma,
+            timesteps=timesteps,
+        )
    def set_timesteps(
        self, state: DPMSolverMultistepSchedulerState, num_inference_steps: int, shape: Tuple
@@ -227,24 +221,32 @@ class FlaxDPMSolverMultistepScheduler(FlaxSchedulerMixin, ConfigMixin):
            shape (`Tuple`):
                the shape of the samples to be generated.
        """
        timesteps = (
            jnp.linspace(0, self.config.num_train_timesteps - 1, num_inference_steps + 1)
            .round()[::-1][:-1]
            .astype(jnp.int32)
        )

+        # initial running values
+        model_outputs = jnp.zeros((self.config.solver_order,) + shape, dtype=self.dtype)
+        lower_order_nums = jnp.int32(0)
+        prev_timestep = jnp.int32(-1)
+        cur_sample = jnp.zeros(shape, dtype=self.dtype)
+
        return state.replace(
            num_inference_steps=num_inference_steps,
            timesteps=timesteps,
-            model_outputs=jnp.zeros((self.config.solver_order,) + shape),
-            lower_order_nums=0,
-            step_index=0,
-            prev_timestep=-1,
-            cur_sample=jnp.zeros(shape),
+            model_outputs=model_outputs,
+            lower_order_nums=lower_order_nums,
+            prev_timestep=prev_timestep,
+            cur_sample=cur_sample,
        )
    def convert_model_output(
        self,
+        state: DPMSolverMultistepSchedulerState,
        model_output: jnp.ndarray,
        timestep: int,
        sample: jnp.ndarray,
@@ -271,12 +273,12 @@ class FlaxDPMSolverMultistepScheduler(FlaxSchedulerMixin, ConfigMixin):
        # DPM-Solver++ needs to solve an integral of the data prediction model.
        if self.config.algorithm_type == "dpmsolver++":
            if self.config.prediction_type == "epsilon":
-                alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[timestep]
+                alpha_t, sigma_t = state.alpha_t[timestep], state.sigma_t[timestep]
                x0_pred = (sample - sigma_t * model_output) / alpha_t
            elif self.config.prediction_type == "sample":
                x0_pred = model_output
            elif self.config.prediction_type == "v_prediction":
-                alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[timestep]
+                alpha_t, sigma_t = state.alpha_t[timestep], state.sigma_t[timestep]
                x0_pred = alpha_t * sample - sigma_t * model_output
            else:
                raise ValueError(
@@ -299,11 +301,11 @@ class FlaxDPMSolverMultistepScheduler(FlaxSchedulerMixin, ConfigMixin):
            if self.config.prediction_type == "epsilon":
                return model_output
            elif self.config.prediction_type == "sample":
-                alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[timestep]
+                alpha_t, sigma_t = state.alpha_t[timestep], state.sigma_t[timestep]
                epsilon = (sample - alpha_t * model_output) / sigma_t
                return epsilon
            elif self.config.prediction_type == "v_prediction":
-                alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[timestep]
+                alpha_t, sigma_t = state.alpha_t[timestep], state.sigma_t[timestep]
                epsilon = alpha_t * model_output + sigma_t * sample
                return epsilon
            else:
@@ -313,7 +315,12 @@ class FlaxDPMSolverMultistepScheduler(FlaxSchedulerMixin, ConfigMixin):
            )
    def dpm_solver_first_order_update(
-        self, model_output: jnp.ndarray, timestep: int, prev_timestep: int, sample: jnp.ndarray
+        self,
+        state: DPMSolverMultistepSchedulerState,
+        model_output: jnp.ndarray,
+        timestep: int,
+        prev_timestep: int,
+        sample: jnp.ndarray,
    ) -> jnp.ndarray:
        """
        One step for the first-order DPM-Solver (equivalent to DDIM).
@@ -332,9 +339,9 @@ class FlaxDPMSolverMultistepScheduler(FlaxSchedulerMixin, ConfigMixin):
        """
        t, s0 = prev_timestep, timestep
        m0 = model_output
-        lambda_t, lambda_s = self.lambda_t[t], self.lambda_t[s0]
-        alpha_t, alpha_s = self.alpha_t[t], self.alpha_t[s0]
-        sigma_t, sigma_s = self.sigma_t[t], self.sigma_t[s0]
+        lambda_t, lambda_s = state.lambda_t[t], state.lambda_t[s0]
+        alpha_t, alpha_s = state.alpha_t[t], state.alpha_t[s0]
+        sigma_t, sigma_s = state.sigma_t[t], state.sigma_t[s0]
        h = lambda_t - lambda_s
        if self.config.algorithm_type == "dpmsolver++":
            x_t = (sigma_t / sigma_s) * sample - (alpha_t * (jnp.exp(-h) - 1.0)) * m0
@@ -344,6 +351,7 @@ class FlaxDPMSolverMultistepScheduler(FlaxSchedulerMixin, ConfigMixin):
    def multistep_dpm_solver_second_order_update(
        self,
+        state: DPMSolverMultistepSchedulerState,
        model_output_list: jnp.ndarray,
        timestep_list: List[int],
        prev_timestep: int,
@@ -365,9 +373,9 @@ class FlaxDPMSolverMultistepScheduler(FlaxSchedulerMixin, ConfigMixin):
        """
        t, s0, s1 = prev_timestep, timestep_list[-1], timestep_list[-2]
        m0, m1 = model_output_list[-1], model_output_list[-2]
-        lambda_t, lambda_s0, lambda_s1 = self.lambda_t[t], self.lambda_t[s0], self.lambda_t[s1]
-        alpha_t, alpha_s0 = self.alpha_t[t], self.alpha_t[s0]
-        sigma_t, sigma_s0 = self.sigma_t[t], self.sigma_t[s0]
+        lambda_t, lambda_s0, lambda_s1 = state.lambda_t[t], state.lambda_t[s0], state.lambda_t[s1]
+        alpha_t, alpha_s0 = state.alpha_t[t], state.alpha_t[s0]
+        sigma_t, sigma_s0 = state.sigma_t[t], state.sigma_t[s0]
        h, h_0 = lambda_t - lambda_s0, lambda_s0 - lambda_s1
        r0 = h_0 / h
        D0, D1 = m0, (1.0 / r0) * (m0 - m1)
@@ -403,6 +411,7 @@ class FlaxDPMSolverMultistepScheduler(FlaxSchedulerMixin, ConfigMixin):
    def multistep_dpm_solver_third_order_update(
        self,
+        state: DPMSolverMultistepSchedulerState,
        model_output_list: jnp.ndarray,
        timestep_list: List[int],
        prev_timestep: int,
@@ -425,13 +434,13 @@ class FlaxDPMSolverMultistepScheduler(FlaxSchedulerMixin, ConfigMixin):
        t, s0, s1, s2 = prev_timestep, timestep_list[-1], timestep_list[-2], timestep_list[-3]
        m0, m1, m2 = model_output_list[-1], model_output_list[-2], model_output_list[-3]
        lambda_t, lambda_s0, lambda_s1, lambda_s2 = (
-            self.lambda_t[t],
-            self.lambda_t[s0],
-            self.lambda_t[s1],
-            self.lambda_t[s2],
+            state.lambda_t[t],
+            state.lambda_t[s0],
+            state.lambda_t[s1],
+            state.lambda_t[s2],
        )
-        alpha_t, alpha_s0 = self.alpha_t[t], self.alpha_t[s0]
-        sigma_t, sigma_s0 = self.sigma_t[t], self.sigma_t[s0]
+        alpha_t, alpha_s0 = state.alpha_t[t], state.alpha_t[s0]
+        sigma_t, sigma_s0 = state.sigma_t[t], state.sigma_t[s0]
        h, h_0, h_1 = lambda_t - lambda_s0, lambda_s0 - lambda_s1, lambda_s1 - lambda_s2
        r0, r1 = h_0 / h, h_1 / h
        D0 = m0
@@ -482,14 +491,17 @@ class FlaxDPMSolverMultistepScheduler(FlaxSchedulerMixin, ConfigMixin):
            `return_dict` is True, otherwise a `tuple`. When returning a tuple, the first element is the sample tensor.

        """
-        prev_timestep = jax.lax.cond(
-            state.step_index == len(state.timesteps) - 1,
-            lambda _: 0,
-            lambda _: state.timesteps[state.step_index + 1],
-            (),
-        )
+        if state.num_inference_steps is None:
+            raise ValueError(
+                "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler"
+            )
+
+        (step_index,) = jnp.where(state.timesteps == timestep, size=1)
+        step_index = step_index[0]

-        model_output = self.convert_model_output(model_output, timestep, sample)
+        prev_timestep = jax.lax.select(step_index == len(state.timesteps) - 1, 0, state.timesteps[step_index + 1])
+
+        model_output = self.convert_model_output(state, model_output, timestep, sample)

        model_outputs_new = jnp.roll(state.model_outputs, -1, axis=0)
        model_outputs_new = model_outputs_new.at[-1].set(model_output)
...@@ -501,16 +513,18 @@ class FlaxDPMSolverMultistepScheduler(FlaxSchedulerMixin, ConfigMixin): ...@@ -501,16 +513,18 @@ class FlaxDPMSolverMultistepScheduler(FlaxSchedulerMixin, ConfigMixin):
def step_1(state: DPMSolverMultistepSchedulerState) -> jnp.ndarray: def step_1(state: DPMSolverMultistepSchedulerState) -> jnp.ndarray:
return self.dpm_solver_first_order_update( return self.dpm_solver_first_order_update(
state,
state.model_outputs[-1], state.model_outputs[-1],
state.timesteps[state.step_index], state.timesteps[step_index],
state.prev_timestep, state.prev_timestep,
state.cur_sample, state.cur_sample,
) )
def step_23(state: DPMSolverMultistepSchedulerState) -> jnp.ndarray: def step_23(state: DPMSolverMultistepSchedulerState) -> jnp.ndarray:
def step_2(state: DPMSolverMultistepSchedulerState) -> jnp.ndarray: def step_2(state: DPMSolverMultistepSchedulerState) -> jnp.ndarray:
timestep_list = jnp.array([state.timesteps[state.step_index - 1], state.timesteps[state.step_index]]) timestep_list = jnp.array([state.timesteps[step_index - 1], state.timesteps[step_index]])
return self.multistep_dpm_solver_second_order_update( return self.multistep_dpm_solver_second_order_update(
state,
state.model_outputs, state.model_outputs,
timestep_list, timestep_list,
state.prev_timestep, state.prev_timestep,
...@@ -520,65 +534,67 @@ class FlaxDPMSolverMultistepScheduler(FlaxSchedulerMixin, ConfigMixin): ...@@ -520,65 +534,67 @@ class FlaxDPMSolverMultistepScheduler(FlaxSchedulerMixin, ConfigMixin):
def step_3(state: DPMSolverMultistepSchedulerState) -> jnp.ndarray: def step_3(state: DPMSolverMultistepSchedulerState) -> jnp.ndarray:
timestep_list = jnp.array( timestep_list = jnp.array(
[ [
state.timesteps[state.step_index - 2], state.timesteps[step_index - 2],
state.timesteps[state.step_index - 1], state.timesteps[step_index - 1],
state.timesteps[state.step_index], state.timesteps[step_index],
] ]
) )
return self.multistep_dpm_solver_third_order_update( return self.multistep_dpm_solver_third_order_update(
state,
state.model_outputs, state.model_outputs,
timestep_list, timestep_list,
state.prev_timestep, state.prev_timestep,
state.cur_sample, state.cur_sample,
) )
step_2_output = step_2(state)
step_3_output = step_3(state)
if self.config.solver_order == 2: if self.config.solver_order == 2:
return step_2(state) return step_2_output
elif self.config.lower_order_final and len(state.timesteps) < 15: elif self.config.lower_order_final and len(state.timesteps) < 15:
return jax.lax.cond( return jax.lax.select(
state.lower_order_nums < 2, state.lower_order_nums < 2,
step_2, step_2_output,
lambda state: jax.lax.cond( jax.lax.select(
state.step_index == len(state.timesteps) - 2, step_index == len(state.timesteps) - 2,
step_2, step_2_output,
step_3, step_3_output,
state,
), ),
state,
) )
else: else:
return jax.lax.cond( return jax.lax.select(
state.lower_order_nums < 2, state.lower_order_nums < 2,
step_2, step_2_output,
step_3, step_3_output,
state,
) )
step_1_output = step_1(state)
step_23_output = step_23(state)
if self.config.solver_order == 1: if self.config.solver_order == 1:
prev_sample = step_1(state) prev_sample = step_1_output
elif self.config.lower_order_final and len(state.timesteps) < 15: elif self.config.lower_order_final and len(state.timesteps) < 15:
prev_sample = jax.lax.cond( prev_sample = jax.lax.select(
state.lower_order_nums < 1, state.lower_order_nums < 1,
step_1, step_1_output,
lambda state: jax.lax.cond( jax.lax.select(
state.step_index == len(state.timesteps) - 1, step_index == len(state.timesteps) - 1,
step_1, step_1_output,
step_23, step_23_output,
state,
), ),
state,
) )
else: else:
prev_sample = jax.lax.cond( prev_sample = jax.lax.select(
state.lower_order_nums < 1, state.lower_order_nums < 1,
step_1, step_1_output,
step_23, step_23_output,
state,
) )
state = state.replace( state = state.replace(
lower_order_nums=jnp.minimum(state.lower_order_nums + 1, self.config.solver_order), lower_order_nums=jnp.minimum(state.lower_order_nums + 1, self.config.solver_order),
step_index=(state.step_index + 1),
) )
if not return_dict: if not return_dict:
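The branch refactor above trades jax.lax.cond (which takes branch callables and an explicit operand) for jax.lax.select, which picks between two arrays that have both already been computed. Every step now evaluates both updates, but the control flow becomes a pure array op that composes cleanly with jit, vmap and scan. A minimal sketch of the two styles, using made-up update functions rather than the scheduler's:

import jax
import jax.numpy as jnp

def first_order(x):
    return x * 0.5

def higher_order(x):
    return x - 1.0

x = jnp.ones((4,))
use_first_order = jnp.sum(x) > 2.0  # traced scalar boolean

# lax.cond: branches are callables, only one is executed at runtime
y_cond = jax.lax.cond(use_first_order, first_order, higher_order, x)

# lax.select: both results are precomputed, the predicate just picks one,
# mirroring how the step_1 / step_23 outputs are selected above
y_select = jax.lax.select(use_first_order, first_order(x), higher_order(x))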
...@@ -606,20 +622,12 @@ class FlaxDPMSolverMultistepScheduler(FlaxSchedulerMixin, ConfigMixin): ...@@ -606,20 +622,12 @@ class FlaxDPMSolverMultistepScheduler(FlaxSchedulerMixin, ConfigMixin):
def add_noise( def add_noise(
self, self,
state: DPMSolverMultistepSchedulerState,
original_samples: jnp.ndarray, original_samples: jnp.ndarray,
noise: jnp.ndarray, noise: jnp.ndarray,
timesteps: jnp.ndarray, timesteps: jnp.ndarray,
) -> jnp.ndarray: ) -> jnp.ndarray:
sqrt_alpha_prod = self.alphas_cumprod[timesteps] ** 0.5 return add_noise_common(state.common, original_samples, noise, timesteps)
sqrt_alpha_prod = sqrt_alpha_prod.flatten()
sqrt_alpha_prod = broadcast_to_shape_from_left(sqrt_alpha_prod, original_samples.shape)
sqrt_one_minus_alpha_prod = (1 - self.alphas_cumprod[timesteps]) ** 0.5
sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten()
sqrt_one_minus_alpha_prod = broadcast_to_shape_from_left(sqrt_one_minus_alpha_prod, original_samples.shape)
noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise
return noisy_samples
def __len__(self): def __len__(self):
return self.config.num_train_timesteps return self.config.num_train_timesteps
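With the state passed explicitly, add_noise no longer reads arrays off the scheduler instance. A minimal usage sketch of the new call pattern, assuming the top-level FlaxDPMSolverMultistepScheduler export and an illustrative latent shape:

import jax
import jax.numpy as jnp
from diffusers import FlaxDPMSolverMultistepScheduler

scheduler = FlaxDPMSolverMultistepScheduler(num_train_timesteps=1000)
state = scheduler.create_state()  # pure pytree of arrays, safe to pass through jit

latents = jnp.zeros((1, 4, 64, 64))          # illustrative shape
noise = jax.random.normal(jax.random.PRNGKey(0), latents.shape)
timesteps = jnp.array([10])

noisy = scheduler.add_noise(state, latents, noise, timesteps)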
...@@ -233,5 +233,5 @@ class FlaxKarrasVeScheduler(FlaxSchedulerMixin, ConfigMixin): ...@@ -233,5 +233,5 @@ class FlaxKarrasVeScheduler(FlaxSchedulerMixin, ConfigMixin):
return FlaxKarrasVeOutput(prev_sample=sample_prev, derivative=derivative, state=state) return FlaxKarrasVeOutput(prev_sample=sample_prev, derivative=derivative, state=state)
def add_noise(self, original_samples, noise, timesteps): def add_noise(self, state: KarrasVeSchedulerState, original_samples, noise, timesteps):
raise NotImplementedError() raise NotImplementedError()
...@@ -22,6 +22,7 @@ from scipy import integrate ...@@ -22,6 +22,7 @@ from scipy import integrate
from ..configuration_utils import ConfigMixin, register_to_config from ..configuration_utils import ConfigMixin, register_to_config
from .scheduling_utils_flax import ( from .scheduling_utils_flax import (
_FLAX_COMPATIBLE_STABLE_DIFFUSION_SCHEDULERS, _FLAX_COMPATIBLE_STABLE_DIFFUSION_SCHEDULERS,
CommonSchedulerState,
FlaxSchedulerMixin, FlaxSchedulerMixin,
FlaxSchedulerOutput, FlaxSchedulerOutput,
broadcast_to_shape_from_left, broadcast_to_shape_from_left,
...@@ -30,15 +31,22 @@ from .scheduling_utils_flax import ( ...@@ -30,15 +31,22 @@ from .scheduling_utils_flax import (
@flax.struct.dataclass @flax.struct.dataclass
class LMSDiscreteSchedulerState: class LMSDiscreteSchedulerState:
common: CommonSchedulerState
# setable values # setable values
init_noise_sigma: jnp.ndarray
timesteps: jnp.ndarray
sigmas: jnp.ndarray
num_inference_steps: Optional[int] = None num_inference_steps: Optional[int] = None
timesteps: Optional[jnp.ndarray] = None
sigmas: Optional[jnp.ndarray] = None # running values
derivatives: jnp.ndarray = jnp.array([]) derivatives: Optional[jnp.ndarray] = None
@classmethod @classmethod
def create(cls, num_train_timesteps: int, sigmas: jnp.ndarray): def create(
return cls(timesteps=jnp.arange(0, num_train_timesteps)[::-1], sigmas=sigmas) cls, common: CommonSchedulerState, init_noise_sigma: jnp.ndarray, timesteps: jnp.ndarray, sigmas: jnp.ndarray
):
return cls(common=common, init_noise_sigma=init_noise_sigma, timesteps=timesteps, sigmas=sigmas)
@dataclass @dataclass
...@@ -66,10 +74,18 @@ class FlaxLMSDiscreteScheduler(FlaxSchedulerMixin, ConfigMixin): ...@@ -66,10 +74,18 @@ class FlaxLMSDiscreteScheduler(FlaxSchedulerMixin, ConfigMixin):
`linear` or `scaled_linear`. `linear` or `scaled_linear`.
trained_betas (`jnp.ndarray`, optional): trained_betas (`jnp.ndarray`, optional):
option to pass an array of betas directly to the constructor to bypass `beta_start`, `beta_end` etc. option to pass an array of betas directly to the constructor to bypass `beta_start`, `beta_end` etc.
prediction_type (`str`, default `epsilon`, optional):
prediction type of the scheduler function, one of `epsilon` (predicting the noise of the diffusion
process), `sample` (directly predicting the noisy sample) or `v_prediction` (see section 2.4
https://imagen.research.google/video/paper.pdf)
dtype (`jnp.dtype`, *optional*, defaults to `jnp.float32`):
the `dtype` used for params and computation.
""" """
_compatibles = _FLAX_COMPATIBLE_STABLE_DIFFUSION_SCHEDULERS.copy() _compatibles = _FLAX_COMPATIBLE_STABLE_DIFFUSION_SCHEDULERS.copy()
dtype: jnp.dtype
@property @property
def has_state(self): def has_state(self):
return True return True
...@@ -82,24 +98,26 @@ class FlaxLMSDiscreteScheduler(FlaxSchedulerMixin, ConfigMixin): ...@@ -82,24 +98,26 @@ class FlaxLMSDiscreteScheduler(FlaxSchedulerMixin, ConfigMixin):
beta_end: float = 0.02, beta_end: float = 0.02,
beta_schedule: str = "linear", beta_schedule: str = "linear",
trained_betas: Optional[jnp.ndarray] = None, trained_betas: Optional[jnp.ndarray] = None,
prediction_type: str = "epsilon",
dtype: jnp.dtype = jnp.float32,
): ):
if trained_betas is not None: self.dtype = dtype
self.betas = jnp.asarray(trained_betas)
elif beta_schedule == "linear":
self.betas = jnp.linspace(beta_start, beta_end, num_train_timesteps, dtype=jnp.float32)
elif beta_schedule == "scaled_linear":
# this schedule is very specific to the latent diffusion model.
self.betas = jnp.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=jnp.float32) ** 2
else:
raise NotImplementedError(f"{beta_schedule} is not implemented for {self.__class__}")
self.alphas = 1.0 - self.betas def create_state(self, common: Optional[CommonSchedulerState] = None) -> LMSDiscreteSchedulerState:
self.alphas_cumprod = jnp.cumprod(self.alphas, axis=0) if common is None:
common = CommonSchedulerState.create(self)
def create_state(self): timesteps = jnp.arange(0, self.config.num_train_timesteps).round()[::-1]
self.state = LMSDiscreteSchedulerState.create( sigmas = ((1 - common.alphas_cumprod) / common.alphas_cumprod) ** 0.5
num_train_timesteps=self.config.num_train_timesteps,
sigmas=((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5, # standard deviation of the initial noise distribution
init_noise_sigma = sigmas.max()
return LMSDiscreteSchedulerState.create(
common=common,
init_noise_sigma=init_noise_sigma,
timesteps=timesteps,
sigmas=sigmas,
) )
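create_state now derives the LMS sigmas directly from the shared alphas_cumprod, with sigma_t = ((1 - alphas_cumprod_t) / alphas_cumprod_t) ** 0.5 and the initial noise sigma being the largest (noisiest) of them. A toy recomputation with made-up betas, just to illustrate the relationship:

import jax.numpy as jnp

betas = jnp.linspace(1e-4, 2e-2, 10)        # toy schedule, not the library defaults
alphas_cumprod = jnp.cumprod(1.0 - betas)

sigmas = ((1 - alphas_cumprod) / alphas_cumprod) ** 0.5
init_noise_sigma = sigmas.max()             # sigma of the most heavily noised timestep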
def scale_model_input(self, state: LMSDiscreteSchedulerState, sample: jnp.ndarray, timestep: int) -> jnp.ndarray: def scale_model_input(self, state: LMSDiscreteSchedulerState, sample: jnp.ndarray, timestep: int) -> jnp.ndarray:
...@@ -118,11 +136,13 @@ class FlaxLMSDiscreteScheduler(FlaxSchedulerMixin, ConfigMixin): ...@@ -118,11 +136,13 @@ class FlaxLMSDiscreteScheduler(FlaxSchedulerMixin, ConfigMixin):
`jnp.ndarray`: scaled input sample `jnp.ndarray`: scaled input sample
""" """
(step_index,) = jnp.where(state.timesteps == timestep, size=1) (step_index,) = jnp.where(state.timesteps == timestep, size=1)
step_index = step_index[0]
sigma = state.sigmas[step_index] sigma = state.sigmas[step_index]
sample = sample / ((sigma**2 + 1) ** 0.5) sample = sample / ((sigma**2 + 1) ** 0.5)
return sample return sample
def get_lms_coefficient(self, state, order, t, current_order): def get_lms_coefficient(self, state: LMSDiscreteSchedulerState, order, t, current_order):
""" """
Compute a linear multistep coefficient. Compute a linear multistep coefficient.
...@@ -156,20 +176,28 @@ class FlaxLMSDiscreteScheduler(FlaxSchedulerMixin, ConfigMixin): ...@@ -156,20 +176,28 @@ class FlaxLMSDiscreteScheduler(FlaxSchedulerMixin, ConfigMixin):
num_inference_steps (`int`): num_inference_steps (`int`):
the number of diffusion steps used when generating samples with a pre-trained model. the number of diffusion steps used when generating samples with a pre-trained model.
""" """
timesteps = jnp.linspace(self.config.num_train_timesteps - 1, 0, num_inference_steps, dtype=jnp.float32)
low_idx = jnp.floor(timesteps).astype(int) timesteps = jnp.linspace(self.config.num_train_timesteps - 1, 0, num_inference_steps, dtype=self.dtype)
high_idx = jnp.ceil(timesteps).astype(int)
low_idx = jnp.floor(timesteps).astype(jnp.int32)
high_idx = jnp.ceil(timesteps).astype(jnp.int32)
frac = jnp.mod(timesteps, 1.0) frac = jnp.mod(timesteps, 1.0)
sigmas = jnp.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5)
sigmas = ((1 - state.common.alphas_cumprod) / state.common.alphas_cumprod) ** 0.5
sigmas = (1 - frac) * sigmas[low_idx] + frac * sigmas[high_idx] sigmas = (1 - frac) * sigmas[low_idx] + frac * sigmas[high_idx]
sigmas = jnp.concatenate([sigmas, jnp.array([0.0])]).astype(jnp.float32) sigmas = jnp.concatenate([sigmas, jnp.array([0.0], dtype=self.dtype)])
timesteps = timesteps.astype(jnp.int32)
# initial running values
derivatives = jnp.zeros((0,) + shape, dtype=self.dtype)
return state.replace( return state.replace(
num_inference_steps=num_inference_steps, timesteps=timesteps,
timesteps=timesteps.astype(int),
derivatives=jnp.array([]),
sigmas=sigmas, sigmas=sigmas,
num_inference_steps=num_inference_steps,
derivatives=derivatives,
) )
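The inference sigmas are built by linearly interpolating the training sigmas at fractional timesteps: low_idx and high_idx bracket each fractional timestep, frac blends between them, and a final sigma of 0 is appended. A standalone sketch of that interpolation with placeholder training sigmas:

import jax.numpy as jnp

num_train_timesteps = 1000
num_inference_steps = 50
train_sigmas = jnp.linspace(0.01, 15.0, num_train_timesteps)    # placeholder values

timesteps = jnp.linspace(num_train_timesteps - 1, 0, num_inference_steps)
low_idx = jnp.floor(timesteps).astype(jnp.int32)
high_idx = jnp.ceil(timesteps).astype(jnp.int32)
frac = jnp.mod(timesteps, 1.0)

sigmas = (1 - frac) * train_sigmas[low_idx] + frac * train_sigmas[high_idx]
sigmas = jnp.concatenate([sigmas, jnp.array([0.0])])            # trailing sigma of 0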
def step( def step(
...@@ -199,10 +227,23 @@ class FlaxLMSDiscreteScheduler(FlaxSchedulerMixin, ConfigMixin): ...@@ -199,10 +227,23 @@ class FlaxLMSDiscreteScheduler(FlaxSchedulerMixin, ConfigMixin):
`tuple`. When returning a tuple, the first element is the sample tensor. `tuple`. When returning a tuple, the first element is the sample tensor.
""" """
if state.num_inference_steps is None:
raise ValueError(
"Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler"
)
sigma = state.sigmas[timestep] sigma = state.sigmas[timestep]
# 1. compute predicted original sample (x_0) from sigma-scaled predicted noise # 1. compute predicted original sample (x_0) from sigma-scaled predicted noise
pred_original_sample = sample - sigma * model_output if self.config.prediction_type == "epsilon":
pred_original_sample = sample - sigma * model_output
elif self.config.prediction_type == "v_prediction":
# * c_out + input * c_skip
pred_original_sample = model_output * (-sigma / (sigma**2 + 1) ** 0.5) + (sample / (sigma**2 + 1))
else:
raise ValueError(
f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, or `v_prediction`"
)
# 2. Convert to an ODE derivative # 2. Convert to an ODE derivative
derivative = (sample - pred_original_sample) / sigma derivative = (sample - pred_original_sample) / sigma
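For v_prediction the model output is mapped back to a prediction of x_0 with the scalings c_skip = 1 / (sigma**2 + 1) and c_out = -sigma / (sigma**2 + 1) ** 0.5, which is exactly the branch added above. A small sketch of both prediction types with stand-in values:

import jax.numpy as jnp

sigma = jnp.float32(2.0)
sample = jnp.ones((4,))
model_output = jnp.full((4,), 0.1)

# epsilon-prediction: the model predicts the noise directly
pred_x0_eps = sample - sigma * model_output

# v-prediction: recover x_0 via the c_out / c_skip scalings
c_out = -sigma / (sigma**2 + 1) ** 0.5
c_skip = 1 / (sigma**2 + 1)
pred_x0_v = model_output * c_out + sample * c_skip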
......
...@@ -14,7 +14,6 @@ ...@@ -14,7 +14,6 @@
# DISCLAIMER: This file is strongly influenced by https://github.com/ermongroup/ddim # DISCLAIMER: This file is strongly influenced by https://github.com/ermongroup/ddim
import math
from dataclasses import dataclass from dataclasses import dataclass
from typing import Optional, Tuple, Union from typing import Optional, Tuple, Union
...@@ -25,59 +24,45 @@ import jax.numpy as jnp ...@@ -25,59 +24,45 @@ import jax.numpy as jnp
from ..configuration_utils import ConfigMixin, register_to_config from ..configuration_utils import ConfigMixin, register_to_config
from .scheduling_utils_flax import ( from .scheduling_utils_flax import (
_FLAX_COMPATIBLE_STABLE_DIFFUSION_SCHEDULERS, _FLAX_COMPATIBLE_STABLE_DIFFUSION_SCHEDULERS,
CommonSchedulerState,
FlaxSchedulerMixin, FlaxSchedulerMixin,
FlaxSchedulerOutput, FlaxSchedulerOutput,
broadcast_to_shape_from_left, add_noise_common,
) )
def betas_for_alpha_bar(num_diffusion_timesteps: int, max_beta=0.999) -> jnp.ndarray:
"""
Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
(1-beta) over time from t = [0,1].
Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
to that part of the diffusion process.
Args:
num_diffusion_timesteps (`int`): the number of betas to produce.
max_beta (`float`): the maximum beta to use; use values lower than 1 to
prevent singularities.
Returns:
betas (`jnp.ndarray`): the betas used by the scheduler to step the model outputs
"""
def alpha_bar(time_step):
return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2
betas = []
for i in range(num_diffusion_timesteps):
t1 = i / num_diffusion_timesteps
t2 = (i + 1) / num_diffusion_timesteps
betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta))
return jnp.array(betas, dtype=jnp.float32)
@flax.struct.dataclass @flax.struct.dataclass
class PNDMSchedulerState: class PNDMSchedulerState:
common: CommonSchedulerState
final_alpha_cumprod: jnp.ndarray
# setable values # setable values
_timesteps: jnp.ndarray init_noise_sigma: jnp.ndarray
timesteps: jnp.ndarray
num_inference_steps: Optional[int] = None num_inference_steps: Optional[int] = None
prk_timesteps: Optional[jnp.ndarray] = None prk_timesteps: Optional[jnp.ndarray] = None
plms_timesteps: Optional[jnp.ndarray] = None plms_timesteps: Optional[jnp.ndarray] = None
timesteps: Optional[jnp.ndarray] = None
# running values # running values
cur_model_output: Optional[jnp.ndarray] = None cur_model_output: Optional[jnp.ndarray] = None
counter: int = 0 counter: Optional[jnp.int32] = None
cur_sample: Optional[jnp.ndarray] = None cur_sample: Optional[jnp.ndarray] = None
ets: jnp.ndarray = jnp.array([]) ets: Optional[jnp.ndarray] = None
@classmethod @classmethod
def create(cls, num_train_timesteps: int): def create(
return cls(_timesteps=jnp.arange(0, num_train_timesteps)[::-1]) cls,
common: CommonSchedulerState,
final_alpha_cumprod: jnp.ndarray,
init_noise_sigma: jnp.ndarray,
timesteps: jnp.ndarray,
):
return cls(
common=common,
final_alpha_cumprod=final_alpha_cumprod,
init_noise_sigma=init_noise_sigma,
timesteps=timesteps,
)
@dataclass @dataclass
...@@ -117,10 +102,19 @@ class FlaxPNDMScheduler(FlaxSchedulerMixin, ConfigMixin): ...@@ -117,10 +102,19 @@ class FlaxPNDMScheduler(FlaxSchedulerMixin, ConfigMixin):
an offset added to the inference steps. You can use a combination of `offset=1` and an offset added to the inference steps. You can use a combination of `offset=1` and
`set_alpha_to_one=False`, to make the last step use step 0 for the previous alpha product, as done in `set_alpha_to_one=False`, to make the last step use step 0 for the previous alpha product, as done in
stable diffusion. stable diffusion.
prediction_type (`str`, default `epsilon`, optional):
prediction type of the scheduler function, one of `epsilon` (predicting the noise of the diffusion
process), `sample` (directly predicting the noisy sample) or `v_prediction` (see section 2.4
https://imagen.research.google/video/paper.pdf)
dtype (`jnp.dtype`, *optional*, defaults to `jnp.float32`):
the `dtype` used for params and computation.
""" """
_compatibles = _FLAX_COMPATIBLE_STABLE_DIFFUSION_SCHEDULERS.copy() _compatibles = _FLAX_COMPATIBLE_STABLE_DIFFUSION_SCHEDULERS.copy()
dtype: jnp.dtype
pndm_order: int
@property @property
def has_state(self): def has_state(self):
return True return True
...@@ -136,35 +130,39 @@ class FlaxPNDMScheduler(FlaxSchedulerMixin, ConfigMixin): ...@@ -136,35 +130,39 @@ class FlaxPNDMScheduler(FlaxSchedulerMixin, ConfigMixin):
skip_prk_steps: bool = False, skip_prk_steps: bool = False,
set_alpha_to_one: bool = False, set_alpha_to_one: bool = False,
steps_offset: int = 0, steps_offset: int = 0,
prediction_type: str = "epsilon",
dtype: jnp.dtype = jnp.float32,
): ):
if trained_betas is not None: self.dtype = dtype
self.betas = jnp.asarray(trained_betas)
elif beta_schedule == "linear":
self.betas = jnp.linspace(beta_start, beta_end, num_train_timesteps, dtype=jnp.float32)
elif beta_schedule == "scaled_linear":
# this schedule is very specific to the latent diffusion model.
self.betas = jnp.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=jnp.float32) ** 2
elif beta_schedule == "squaredcos_cap_v2":
# Glide cosine schedule
self.betas = betas_for_alpha_bar(num_train_timesteps)
else:
raise NotImplementedError(f"{beta_schedule} is not implemented for {self.__class__}")
self.alphas = 1.0 - self.betas
self.alphas_cumprod = jnp.cumprod(self.alphas, axis=0)
self.final_alpha_cumprod = jnp.array(1.0) if set_alpha_to_one else self.alphas_cumprod[0]
# For now we only support F-PNDM, i.e. the runge-kutta method # For now we only support F-PNDM, i.e. the runge-kutta method
# For more information on the algorithm please take a look at the paper: https://arxiv.org/pdf/2202.09778.pdf # For more information on the algorithm please take a look at the paper: https://arxiv.org/pdf/2202.09778.pdf
# mainly at formula (9), (12), (13) and the Algorithm 2. # mainly at formula (9), (12), (13) and the Algorithm 2.
self.pndm_order = 4 self.pndm_order = 4
def create_state(self, common: Optional[CommonSchedulerState] = None) -> PNDMSchedulerState:
if common is None:
common = CommonSchedulerState.create(self)
# At every step in ddim, we are looking into the previous alphas_cumprod
# For the final step, there is no previous alphas_cumprod because we are already at 0
# `set_alpha_to_one` decides whether we set this parameter simply to one or
# whether we use the final alpha of the "non-previous" one.
final_alpha_cumprod = (
jnp.array(1.0, dtype=self.dtype) if self.config.set_alpha_to_one else common.alphas_cumprod[0]
)
# standard deviation of the initial noise distribution # standard deviation of the initial noise distribution
self.init_noise_sigma = 1.0 init_noise_sigma = jnp.array(1.0, dtype=self.dtype)
def create_state(self): timesteps = jnp.arange(0, self.config.num_train_timesteps).round()[::-1]
return PNDMSchedulerState.create(num_train_timesteps=self.config.num_train_timesteps)
return PNDMSchedulerState.create(
common=common,
final_alpha_cumprod=final_alpha_cumprod,
init_noise_sigma=init_noise_sigma,
timesteps=timesteps,
)
def set_timesteps(self, state: PNDMSchedulerState, num_inference_steps: int, shape: Tuple) -> PNDMSchedulerState: def set_timesteps(self, state: PNDMSchedulerState, num_inference_steps: int, shape: Tuple) -> PNDMSchedulerState:
""" """
...@@ -178,42 +176,47 @@ class FlaxPNDMScheduler(FlaxSchedulerMixin, ConfigMixin): ...@@ -178,42 +176,47 @@ class FlaxPNDMScheduler(FlaxSchedulerMixin, ConfigMixin):
shape (`Tuple`): shape (`Tuple`):
the shape of the samples to be generated. the shape of the samples to be generated.
""" """
offset = self.config.steps_offset
step_ratio = self.config.num_train_timesteps // num_inference_steps step_ratio = self.config.num_train_timesteps // num_inference_steps
# creates integer timesteps by multiplying by ratio # creates integer timesteps by multiplying by ratio
# rounding to avoid issues when num_inference_step is power of 3 # rounding to avoid issues when num_inference_step is power of 3
_timesteps = (jnp.arange(0, num_inference_steps) * step_ratio).round() + offset _timesteps = (jnp.arange(0, num_inference_steps) * step_ratio).round() + self.config.steps_offset
state = state.replace(num_inference_steps=num_inference_steps, _timesteps=_timesteps)
if self.config.skip_prk_steps: if self.config.skip_prk_steps:
# for some models like stable diffusion the prk steps can/should be skipped to # for some models like stable diffusion the prk steps can/should be skipped to
# produce better results. When using PNDM with `self.config.skip_prk_steps` the implementation # produce better results. When using PNDM with `self.config.skip_prk_steps` the implementation
# is based on crowsonkb's PLMS sampler implementation: https://github.com/CompVis/latent-diffusion/pull/51 # is based on crowsonkb's PLMS sampler implementation: https://github.com/CompVis/latent-diffusion/pull/51
state = state.replace(
prk_timesteps=jnp.array([]), prk_timesteps = jnp.array([], dtype=jnp.int32)
plms_timesteps=jnp.concatenate( plms_timesteps = jnp.concatenate([_timesteps[:-1], _timesteps[-2:-1], _timesteps[-1:]])[::-1]
[state._timesteps[:-1], state._timesteps[-2:-1], state._timesteps[-1:]]
)[::-1],
)
else: else:
prk_timesteps = jnp.array(state._timesteps[-self.pndm_order :]).repeat(2) + jnp.tile( prk_timesteps = _timesteps[-self.pndm_order :].repeat(2) + jnp.tile(
jnp.array([0, self.config.num_train_timesteps // num_inference_steps // 2]), self.pndm_order jnp.array([0, self.config.num_train_timesteps // num_inference_steps // 2], dtype=jnp.int32),
self.pndm_order,
) )
state = state.replace( prk_timesteps = (prk_timesteps[:-1].repeat(2)[1:-1])[::-1]
prk_timesteps=(prk_timesteps[:-1].repeat(2)[1:-1])[::-1], plms_timesteps = _timesteps[:-3][::-1]
plms_timesteps=state._timesteps[:-3][::-1],
) timesteps = jnp.concatenate([prk_timesteps, plms_timesteps])
# initial running values
cur_model_output = jnp.zeros(shape, dtype=self.dtype)
counter = jnp.int32(0)
cur_sample = jnp.zeros(shape, dtype=self.dtype)
ets = jnp.zeros((4,) + shape, dtype=self.dtype)
return state.replace( return state.replace(
timesteps=jnp.concatenate([state.prk_timesteps, state.plms_timesteps]).astype(jnp.int32), timesteps=timesteps,
counter=0, num_inference_steps=num_inference_steps,
# Reserve space for the state variables prk_timesteps=prk_timesteps,
cur_model_output=jnp.zeros(shape), plms_timesteps=plms_timesteps,
cur_sample=jnp.zeros(shape), cur_model_output=cur_model_output,
ets=jnp.zeros((4,) + shape), counter=counter,
cur_sample=cur_sample,
ets=ets,
) )
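Everything in the state now has a static shape so the whole pytree can cross a jit boundary: ets is pre-allocated as a (4,) + shape buffer, the counter is a traced int32, and the running samples are zero-filled instead of growing lists. A hedged usage sketch (the latent shape is illustrative only):

import jax.numpy as jnp
from diffusers import FlaxPNDMScheduler

scheduler = FlaxPNDMScheduler(num_train_timesteps=1000, skip_prk_steps=True)
state = scheduler.create_state()

sample_shape = (1, 4, 64, 64)
state = scheduler.set_timesteps(state, num_inference_steps=50, shape=sample_shape)

# running values are pre-allocated with fixed shapes
assert state.ets.shape == (4,) + sample_shape
assert state.cur_model_output.shape == sample_shape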
def scale_model_input( def scale_model_input(
...@@ -260,19 +263,27 @@ class FlaxPNDMScheduler(FlaxSchedulerMixin, ConfigMixin): ...@@ -260,19 +263,27 @@ class FlaxPNDMScheduler(FlaxSchedulerMixin, ConfigMixin):
`tuple`. When returning a tuple, the first element is the sample tensor. `tuple`. When returning a tuple, the first element is the sample tensor.
""" """
if self.config.skip_prk_steps:
prev_sample, state = self.step_plms( if state.num_inference_steps is None:
state=state, model_output=model_output, timestep=timestep, sample=sample raise ValueError(
"Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler"
) )
if self.config.skip_prk_steps:
prev_sample, state = self.step_plms(state, model_output, timestep, sample)
else: else:
prev_sample, state = jax.lax.switch( prk_prev_sample, prk_state = self.step_prk(state, model_output, timestep, sample)
jnp.where(state.counter < len(state.prk_timesteps), 0, 1), plms_prev_sample, plms_state = self.step_plms(state, model_output, timestep, sample)
(self.step_prk, self.step_plms),
# Args to either branch cond = state.counter < len(state.prk_timesteps)
state,
model_output, prev_sample = jax.lax.select(cond, prk_prev_sample, plms_prev_sample)
timestep,
sample, state = state.replace(
cur_model_output=jax.lax.select(cond, prk_state.cur_model_output, plms_state.cur_model_output),
ets=jax.lax.select(cond, prk_state.ets, plms_state.ets),
cur_sample=jax.lax.select(cond, prk_state.cur_sample, plms_state.cur_sample),
counter=jax.lax.select(cond, prk_state.counter, plms_state.counter),
) )
if not return_dict: if not return_dict:
...@@ -304,6 +315,7 @@ class FlaxPNDMScheduler(FlaxSchedulerMixin, ConfigMixin): ...@@ -304,6 +315,7 @@ class FlaxPNDMScheduler(FlaxSchedulerMixin, ConfigMixin):
`tuple`. When returning a tuple, the first element is the sample tensor. `tuple`. When returning a tuple, the first element is the sample tensor.
""" """
if state.num_inference_steps is None: if state.num_inference_steps is None:
raise ValueError( raise ValueError(
"Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler" "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler"
...@@ -315,37 +327,34 @@ class FlaxPNDMScheduler(FlaxSchedulerMixin, ConfigMixin): ...@@ -315,37 +327,34 @@ class FlaxPNDMScheduler(FlaxSchedulerMixin, ConfigMixin):
prev_timestep = timestep - diff_to_prev prev_timestep = timestep - diff_to_prev
timestep = state.prk_timesteps[state.counter // 4 * 4] timestep = state.prk_timesteps[state.counter // 4 * 4]
def remainder_0(state: PNDMSchedulerState, model_output: jnp.ndarray, ets_at: int): model_output = jax.lax.select(
return ( (state.counter % 4) != 3,
state.replace( model_output, # remainder 0, 1, 2
cur_model_output=state.cur_model_output + 1 / 6 * model_output, state.cur_model_output + 1 / 6 * model_output, # remainder 3
ets=state.ets.at[ets_at].set(model_output), )
cur_sample=sample,
),
model_output,
)
def remainder_1(state: PNDMSchedulerState, model_output: jnp.ndarray, ets_at: int):
return state.replace(cur_model_output=state.cur_model_output + 1 / 3 * model_output), model_output
def remainder_2(state: PNDMSchedulerState, model_output: jnp.ndarray, ets_at: int):
return state.replace(cur_model_output=state.cur_model_output + 1 / 3 * model_output), model_output
def remainder_3(state: PNDMSchedulerState, model_output: jnp.ndarray, ets_at: int):
model_output = state.cur_model_output + 1 / 6 * model_output
return state.replace(cur_model_output=jnp.zeros_like(state.cur_model_output)), model_output
state, model_output = jax.lax.switch( state = state.replace(
state.counter % 4, cur_model_output=jax.lax.select_n(
(remainder_0, remainder_1, remainder_2, remainder_3), state.counter % 4,
# Args to either branch state.cur_model_output + 1 / 6 * model_output, # remainder 0
state, state.cur_model_output + 1 / 3 * model_output, # remainder 1
model_output, state.cur_model_output + 1 / 3 * model_output, # remainder 2
state.counter // 4, jnp.zeros_like(state.cur_model_output), # remainder 3
),
ets=jax.lax.select(
(state.counter % 4) == 0,
state.ets.at[0:3].set(state.ets[1:4]).at[3].set(model_output), # remainder 0
state.ets, # remainder 1, 2, 3
),
cur_sample=jax.lax.select(
(state.counter % 4) == 0,
sample, # remainder 0
state.cur_sample, # remainder 1, 2, 3
),
) )
cur_sample = state.cur_sample cur_sample = state.cur_sample
prev_sample = self._get_prev_sample(cur_sample, timestep, prev_timestep, model_output) prev_sample = self._get_prev_sample(state, cur_sample, timestep, prev_timestep, model_output)
state = state.replace(counter=state.counter + 1) state = state.replace(counter=state.counter + 1)
return (prev_sample, state) return (prev_sample, state)
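jax.lax.select_n generalizes lax.select to more than two pre-computed cases: it takes an integer selector and returns the matching array, which is how the counter-modulo branching above avoids lax.switch callbacks. A minimal sketch:

import jax
import jax.numpy as jnp

x = jnp.ones((3,))
which = jnp.int32(2)  # e.g. state.counter % 4

result = jax.lax.select_n(
    which,
    x * 1.0,  # case 0
    x * 2.0,  # case 1
    x * 3.0,  # case 2
    x * 4.0,  # case 3
)
# result == x * 3.0; all cases are evaluated, but the op stays jit/vmap friendly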
...@@ -374,18 +383,13 @@ class FlaxPNDMScheduler(FlaxSchedulerMixin, ConfigMixin): ...@@ -374,18 +383,13 @@ class FlaxPNDMScheduler(FlaxSchedulerMixin, ConfigMixin):
`tuple`. When returning a tuple, the first element is the sample tensor. `tuple`. When returning a tuple, the first element is the sample tensor.
""" """
if state.num_inference_steps is None: if state.num_inference_steps is None:
raise ValueError( raise ValueError(
"Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler" "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler"
) )
if not self.config.skip_prk_steps and len(state.ets) < 3: # NOTE: There is no way to check in the jitted runtime if the prk mode was run before
raise ValueError(
f"{self.__class__} can only be run AFTER scheduler has been run "
"in 'prk' mode for at least 12 iterations "
"See: https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/pipeline_pndm.py "
"for more information."
)
prev_timestep = timestep - self.config.num_train_timesteps // state.num_inference_steps prev_timestep = timestep - self.config.num_train_timesteps // state.num_inference_steps
prev_timestep = jnp.where(prev_timestep > 0, prev_timestep, 0) prev_timestep = jnp.where(prev_timestep > 0, prev_timestep, 0)
...@@ -417,64 +421,39 @@ class FlaxPNDMScheduler(FlaxSchedulerMixin, ConfigMixin): ...@@ -417,64 +421,39 @@ class FlaxPNDMScheduler(FlaxSchedulerMixin, ConfigMixin):
# else: # else:
# model_output = (1 / 24) * (55 * state.ets[-1] - 59 * state.ets[-2] + 37 * state.ets[-3] - 9 * state.ets[-4]) # model_output = (1 / 24) * (55 * state.ets[-1] - 59 * state.ets[-2] + 37 * state.ets[-3] - 9 * state.ets[-4])
def counter_0(state: PNDMSchedulerState): state = state.replace(
ets = state.ets.at[0].set(model_output) ets=jax.lax.select(
return state.replace( state.counter != 1,
ets=ets, state.ets.at[0:3].set(state.ets[1:4]).at[3].set(model_output), # counter != 1
cur_sample=sample, state.ets, # counter 1
cur_model_output=jnp.array(model_output, dtype=jnp.float32), ),
) cur_sample=jax.lax.select(
state.counter != 1,
def counter_1(state: PNDMSchedulerState): sample, # counter != 1
return state.replace( state.cur_sample, # counter 1
cur_model_output=(model_output + state.ets[0]) / 2, ),
) )
def counter_2(state: PNDMSchedulerState):
ets = state.ets.at[1].set(model_output)
return state.replace(
ets=ets,
cur_model_output=(3 * ets[1] - ets[0]) / 2,
cur_sample=sample,
)
def counter_3(state: PNDMSchedulerState):
ets = state.ets.at[2].set(model_output)
return state.replace(
ets=ets,
cur_model_output=(23 * ets[2] - 16 * ets[1] + 5 * ets[0]) / 12,
cur_sample=sample,
)
def counter_other(state: PNDMSchedulerState):
ets = state.ets.at[3].set(model_output)
next_model_output = (1 / 24) * (55 * ets[3] - 59 * ets[2] + 37 * ets[1] - 9 * ets[0])
ets = ets.at[0].set(ets[1])
ets = ets.at[1].set(ets[2])
ets = ets.at[2].set(ets[3])
return state.replace(
ets=ets,
cur_model_output=next_model_output,
cur_sample=sample,
)
counter = jnp.clip(state.counter, 0, 4) state = state.replace(
state = jax.lax.switch( cur_model_output=jax.lax.select_n(
counter, jnp.clip(state.counter, 0, 4),
[counter_0, counter_1, counter_2, counter_3, counter_other], model_output, # counter 0
state, (model_output + state.ets[-1]) / 2, # counter 1
(3 * state.ets[-1] - state.ets[-2]) / 2, # counter 2
(23 * state.ets[-1] - 16 * state.ets[-2] + 5 * state.ets[-3]) / 12, # counter 3
(1 / 24)
* (55 * state.ets[-1] - 59 * state.ets[-2] + 37 * state.ets[-3] - 9 * state.ets[-4]), # counter >= 4
),
) )
sample = state.cur_sample sample = state.cur_sample
model_output = state.cur_model_output model_output = state.cur_model_output
prev_sample = self._get_prev_sample(sample, timestep, prev_timestep, model_output) prev_sample = self._get_prev_sample(state, sample, timestep, prev_timestep, model_output)
state = state.replace(counter=state.counter + 1) state = state.replace(counter=state.counter + 1)
return (prev_sample, state) return (prev_sample, state)
def _get_prev_sample(self, sample, timestep, prev_timestep, model_output): def _get_prev_sample(self, state: PNDMSchedulerState, sample, timestep, prev_timestep, model_output):
# See formula (9) of PNDM paper https://arxiv.org/pdf/2202.09778.pdf # See formula (9) of PNDM paper https://arxiv.org/pdf/2202.09778.pdf
# this function computes x_(t−δ) using the formula of (9) # this function computes x_(t−δ) using the formula of (9)
# Note that x_t needs to be added to both sides of the equation # Note that x_t needs to be added to both sides of the equation
...@@ -487,11 +466,20 @@ class FlaxPNDMScheduler(FlaxSchedulerMixin, ConfigMixin): ...@@ -487,11 +466,20 @@ class FlaxPNDMScheduler(FlaxSchedulerMixin, ConfigMixin):
# sample -> x_t # sample -> x_t
# model_output -> e_θ(x_t, t) # model_output -> e_θ(x_t, t)
# prev_sample -> x_(t−δ) # prev_sample -> x_(t−δ)
alpha_prod_t = self.alphas_cumprod[timestep] alpha_prod_t = state.common.alphas_cumprod[timestep]
alpha_prod_t_prev = jnp.where(prev_timestep >= 0, self.alphas_cumprod[prev_timestep], self.final_alpha_cumprod) alpha_prod_t_prev = jnp.where(
prev_timestep >= 0, state.common.alphas_cumprod[prev_timestep], state.final_alpha_cumprod
)
beta_prod_t = 1 - alpha_prod_t beta_prod_t = 1 - alpha_prod_t
beta_prod_t_prev = 1 - alpha_prod_t_prev beta_prod_t_prev = 1 - alpha_prod_t_prev
if self.config.prediction_type == "v_prediction":
model_output = (alpha_prod_t**0.5) * model_output + (beta_prod_t**0.5) * sample
elif self.config.prediction_type != "epsilon":
raise ValueError(
f"prediction_type given as {self.config.prediction_type} must be one of `epsilon` or `v_prediction`"
)
# corresponds to (α_(t−δ) - α_t) divided by # corresponds to (α_(t−δ) - α_t) divided by
# denominator of x_t in formula (9) and plus 1 # denominator of x_t in formula (9) and plus 1
# Note: (α_(t−δ) - α_t) / (sqrt(α_t) * (sqrt(α_(t−δ)) + sqr(α_t))) = # Note: (α_(t−δ) - α_t) / (sqrt(α_t) * (sqrt(α_(t−δ)) + sqr(α_t))) =
...@@ -512,20 +500,12 @@ class FlaxPNDMScheduler(FlaxSchedulerMixin, ConfigMixin): ...@@ -512,20 +500,12 @@ class FlaxPNDMScheduler(FlaxSchedulerMixin, ConfigMixin):
def add_noise( def add_noise(
self, self,
state: PNDMSchedulerState,
original_samples: jnp.ndarray, original_samples: jnp.ndarray,
noise: jnp.ndarray, noise: jnp.ndarray,
timesteps: jnp.ndarray, timesteps: jnp.ndarray,
) -> jnp.ndarray: ) -> jnp.ndarray:
sqrt_alpha_prod = self.alphas_cumprod[timesteps] ** 0.5 return add_noise_common(state.common, original_samples, noise, timesteps)
sqrt_alpha_prod = sqrt_alpha_prod.flatten()
sqrt_alpha_prod = broadcast_to_shape_from_left(sqrt_alpha_prod, original_samples.shape)
sqrt_one_minus_alpha_prod = (1 - self.alphas_cumprod[timesteps]) ** 0.5
sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten()
sqrt_one_minus_alpha_prod = broadcast_to_shape_from_left(sqrt_one_minus_alpha_prod, original_samples.shape)
noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise
return noisy_samples
def __len__(self): def __len__(self):
return self.config.num_train_timesteps return self.config.num_train_timesteps
...@@ -12,10 +12,12 @@ ...@@ -12,10 +12,12 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import importlib import importlib
import math
import os import os
from dataclasses import dataclass from dataclasses import dataclass
from typing import Any, Dict, Optional, Tuple, Union from typing import Any, Dict, Optional, Tuple, Union
import flax
import jax.numpy as jnp import jax.numpy as jnp
from ..utils import _COMPATIBLE_STABLE_DIFFUSION_SCHEDULERS, BaseOutput from ..utils import _COMPATIBLE_STABLE_DIFFUSION_SCHEDULERS, BaseOutput
...@@ -50,6 +52,7 @@ class FlaxSchedulerMixin: ...@@ -50,6 +52,7 @@ class FlaxSchedulerMixin:
""" """
config_name = SCHEDULER_CONFIG_NAME config_name = SCHEDULER_CONFIG_NAME
ignore_for_config = ["dtype"]
_compatibles = [] _compatibles = []
has_compatibles = True has_compatibles = True
...@@ -167,3 +170,90 @@ class FlaxSchedulerMixin: ...@@ -167,3 +170,90 @@ class FlaxSchedulerMixin:
def broadcast_to_shape_from_left(x: jnp.ndarray, shape: Tuple[int]) -> jnp.ndarray: def broadcast_to_shape_from_left(x: jnp.ndarray, shape: Tuple[int]) -> jnp.ndarray:
assert len(shape) >= x.ndim assert len(shape) >= x.ndim
return jnp.broadcast_to(x.reshape(x.shape + (1,) * (len(shape) - x.ndim)), shape) return jnp.broadcast_to(x.reshape(x.shape + (1,) * (len(shape) - x.ndim)), shape)
def betas_for_alpha_bar(num_diffusion_timesteps: int, max_beta=0.999, dtype=jnp.float32) -> jnp.ndarray:
"""
Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
(1-beta) over time from t = [0,1].
Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
to that part of the diffusion process.
Args:
num_diffusion_timesteps (`int`): the number of betas to produce.
max_beta (`float`): the maximum beta to use; use values lower than 1 to
prevent singularities.
Returns:
betas (`jnp.ndarray`): the betas used by the scheduler to step the model outputs
"""
def alpha_bar(time_step):
return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2
betas = []
for i in range(num_diffusion_timesteps):
t1 = i / num_diffusion_timesteps
t2 = (i + 1) / num_diffusion_timesteps
betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta))
return jnp.array(betas, dtype=dtype)
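The cosine ("squaredcos_cap_v2") schedule discretizes alpha_bar(t) = cos((t + 0.008) / 1.008 * pi / 2) ** 2 into betas via beta_i = 1 - alpha_bar(t_{i+1}) / alpha_bar(t_i), clipped at max_beta. A quick sanity sketch calling the helper defined above, assuming the module path shown in this file:

import jax.numpy as jnp
from diffusers.schedulers.scheduling_utils_flax import betas_for_alpha_bar

betas = betas_for_alpha_bar(num_diffusion_timesteps=1000)
alphas_cumprod = jnp.cumprod(1.0 - betas)

# alphas_cumprod starts near 1 and decays smoothly towards 0
print(float(alphas_cumprod[0]), float(alphas_cumprod[-1]))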
@flax.struct.dataclass
class CommonSchedulerState:
alphas: jnp.ndarray
betas: jnp.ndarray
alphas_cumprod: jnp.ndarray
@classmethod
def create(cls, scheduler):
config = scheduler.config
if config.trained_betas is not None:
betas = jnp.asarray(config.trained_betas, dtype=scheduler.dtype)
elif config.beta_schedule == "linear":
betas = jnp.linspace(config.beta_start, config.beta_end, config.num_train_timesteps, dtype=scheduler.dtype)
elif config.beta_schedule == "scaled_linear":
# this schedule is very specific to the latent diffusion model.
betas = (
jnp.linspace(
config.beta_start**0.5, config.beta_end**0.5, config.num_train_timesteps, dtype=scheduler.dtype
)
** 2
)
elif config.beta_schedule == "squaredcos_cap_v2":
# Glide cosine schedule
betas = betas_for_alpha_bar(config.num_train_timesteps, dtype=scheduler.dtype)
else:
raise NotImplementedError(
f"beta_schedule {config.beta_schedule} is not implemented for scheduler {scheduler.__class__.__name__}"
)
alphas = 1.0 - betas
alphas_cumprod = jnp.cumprod(alphas, axis=0)
return cls(
alphas=alphas,
betas=betas,
alphas_cumprod=alphas_cumprod,
)
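CommonSchedulerState is the shared pytree home for betas, alphas and alphas_cumprod; each scheduler's create_state builds one from its own config so the scheduler instance itself stays free of arrays. A hedged sketch of constructing it directly, assuming the FlaxDDIMScheduler export and the module path shown in this file:

from diffusers import FlaxDDIMScheduler
from diffusers.schedulers.scheduling_utils_flax import CommonSchedulerState

scheduler = FlaxDDIMScheduler(num_train_timesteps=1000)
common = CommonSchedulerState.create(scheduler)

print(common.betas.shape)           # (1000,)
print(common.alphas_cumprod.shape)  # (1000,)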
def add_noise_common(
state: CommonSchedulerState, original_samples: jnp.ndarray, noise: jnp.ndarray, timesteps: jnp.ndarray
):
alphas_cumprod = state.alphas_cumprod
sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5
sqrt_alpha_prod = sqrt_alpha_prod.flatten()
sqrt_alpha_prod = broadcast_to_shape_from_left(sqrt_alpha_prod, original_samples.shape)
sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5
sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten()
sqrt_one_minus_alpha_prod = broadcast_to_shape_from_left(sqrt_one_minus_alpha_prod, original_samples.shape)
noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise
return noisy_samples
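add_noise_common is the closed-form forward process x_t = sqrt(alpha_bar_t) * x_0 + sqrt(1 - alpha_bar_t) * noise; broadcast_to_shape_from_left reshapes the per-sample scalars so they can multiply image-shaped tensors. A small sketch of just the broadcasting step, with made-up values:

import jax.numpy as jnp
from diffusers.schedulers.scheduling_utils_flax import broadcast_to_shape_from_left

alphas_cumprod = jnp.linspace(0.9999, 0.01, 1000)     # made-up values
timesteps = jnp.array([10, 500])
original_samples = jnp.ones((2, 4, 8, 8))

sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5    # shape (2,)
sqrt_alpha_prod = broadcast_to_shape_from_left(sqrt_alpha_prod, original_samples.shape)
print(sqrt_alpha_prod.shape)                          # (2, 4, 8, 8)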
...@@ -296,10 +296,11 @@ class FlaxDDPMSchedulerTest(FlaxSchedulerCommonTest): ...@@ -296,10 +296,11 @@ class FlaxDDPMSchedulerTest(FlaxSchedulerCommonTest):
scheduler_class = self.scheduler_classes[0] scheduler_class = self.scheduler_classes[0]
scheduler_config = self.get_scheduler_config() scheduler_config = self.get_scheduler_config()
scheduler = scheduler_class(**scheduler_config) scheduler = scheduler_class(**scheduler_config)
state = scheduler.create_state()
assert jnp.sum(jnp.abs(scheduler._get_variance(0) - 0.0)) < 1e-5 assert jnp.sum(jnp.abs(scheduler._get_variance(state, 0) - 0.0)) < 1e-5
assert jnp.sum(jnp.abs(scheduler._get_variance(487) - 0.00979)) < 1e-5 assert jnp.sum(jnp.abs(scheduler._get_variance(state, 487) - 0.00979)) < 1e-5
assert jnp.sum(jnp.abs(scheduler._get_variance(999) - 0.02)) < 1e-5 assert jnp.sum(jnp.abs(scheduler._get_variance(state, 999) - 0.02)) < 1e-5
def test_full_loop_no_noise(self): def test_full_loop_no_noise(self):
scheduler_class = self.scheduler_classes[0] scheduler_class = self.scheduler_classes[0]
...@@ -577,12 +578,12 @@ class FlaxDDIMSchedulerTest(FlaxSchedulerCommonTest): ...@@ -577,12 +578,12 @@ class FlaxDDIMSchedulerTest(FlaxSchedulerCommonTest):
scheduler = scheduler_class(**scheduler_config) scheduler = scheduler_class(**scheduler_config)
state = scheduler.create_state() state = scheduler.create_state()
assert jnp.sum(jnp.abs(scheduler._get_variance(0, 0, state.alphas_cumprod) - 0.0)) < 1e-5 assert jnp.sum(jnp.abs(scheduler._get_variance(state, 0, 0) - 0.0)) < 1e-5
assert jnp.sum(jnp.abs(scheduler._get_variance(420, 400, state.alphas_cumprod) - 0.14771)) < 1e-5 assert jnp.sum(jnp.abs(scheduler._get_variance(state, 420, 400) - 0.14771)) < 1e-5
assert jnp.sum(jnp.abs(scheduler._get_variance(980, 960, state.alphas_cumprod) - 0.32460)) < 1e-5 assert jnp.sum(jnp.abs(scheduler._get_variance(state, 980, 960) - 0.32460)) < 1e-5
assert jnp.sum(jnp.abs(scheduler._get_variance(0, 0, state.alphas_cumprod) - 0.0)) < 1e-5 assert jnp.sum(jnp.abs(scheduler._get_variance(state, 0, 0) - 0.0)) < 1e-5
assert jnp.sum(jnp.abs(scheduler._get_variance(487, 486, state.alphas_cumprod) - 0.00979)) < 1e-5 assert jnp.sum(jnp.abs(scheduler._get_variance(state, 487, 486) - 0.00979)) < 1e-5
assert jnp.sum(jnp.abs(scheduler._get_variance(999, 998, state.alphas_cumprod) - 0.02)) < 1e-5 assert jnp.sum(jnp.abs(scheduler._get_variance(state, 999, 998) - 0.02)) < 1e-5
def test_full_loop_no_noise(self): def test_full_loop_no_noise(self):
sample = self.full_loop() sample = self.full_loop()
......