# Copyright 2023 Zhejiang University Team and The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# DISCLAIMER: This file is strongly influenced by https://github.com/ermongroup/ddim

from dataclasses import dataclass
from typing import Optional, Tuple, Union

import flax
import jax
import jax.numpy as jnp

from ..configuration_utils import ConfigMixin, register_to_config
from .scheduling_utils_flax import (
    CommonSchedulerState,
    FlaxKarrasDiffusionSchedulers,
    FlaxSchedulerMixin,
    FlaxSchedulerOutput,
    add_noise_common,
)


@flax.struct.dataclass
class PNDMSchedulerState:
    """
    Immutable state carried between calls of [`FlaxPNDMScheduler`].

    The scheduler never mutates this object; every method that changes something returns a new instance produced
    with `state.replace(...)`.
    """

    # shared scheduler state; `common.alphas_cumprod` is indexed by timestep when computing the previous sample
    common: CommonSchedulerState
    # alpha product used for the "previous" step when the previous timestep is negative
    final_alpha_cumprod: jnp.ndarray

    # settable values
    # standard deviation of the initial noise distribution
    init_noise_sigma: jnp.ndarray
    # full timestep schedule (PRK timesteps followed by PLMS timesteps once `set_timesteps` has run)
    timesteps: jnp.ndarray
    # stays `None` until `FlaxPNDMScheduler.set_timesteps` is called; the step functions check for this
    num_inference_steps: Optional[int] = None
    prk_timesteps: Optional[jnp.ndarray] = None
    plms_timesteps: Optional[jnp.ndarray] = None

    # running values
    # accumulated / combined model output used by the current integration step
    cur_model_output: Optional[jnp.ndarray] = None
    # number of scheduler steps already taken
    counter: Optional[jnp.int32] = None
    # sample stored at the start of a multi-pass step
    cur_sample: Optional[jnp.ndarray] = None
    # rolling buffer of the last four model outputs (newest at index 3)
    ets: Optional[jnp.ndarray] = None

    @classmethod
    def create(
        cls,
        common: CommonSchedulerState,
        final_alpha_cumprod: jnp.ndarray,
        init_noise_sigma: jnp.ndarray,
        timesteps: jnp.ndarray,
    ):
        """
        Build a fresh state from the required fields only.

        All optional fields (`num_inference_steps`, `prk_timesteps`, `plms_timesteps` and the running values) keep
        their `None` defaults.

        Args:
            common (`CommonSchedulerState`): shared scheduler state.
            final_alpha_cumprod (`jnp.ndarray`): alpha product to use when there is no previous timestep.
            init_noise_sigma (`jnp.ndarray`): standard deviation of the initial noise distribution.
            timesteps (`jnp.ndarray`): initial timestep schedule.

        Returns:
            `PNDMSchedulerState`: the newly created state.
        """
        return cls(
            common=common,
            final_alpha_cumprod=final_alpha_cumprod,
            init_noise_sigma=init_noise_sigma,
            timesteps=timesteps,
        )
66
67
68


@dataclass
class FlaxPNDMSchedulerOutput(FlaxSchedulerOutput):
    """
    Output of [`FlaxPNDMScheduler.step`] when `return_dict=True`.

    Extends [`FlaxSchedulerOutput`] with the updated scheduler state; `step` constructs it with the keyword
    arguments `prev_sample` and `state`.
    """

    # scheduler state after the step; pass it to the next `step` call
    state: PNDMSchedulerState


73
class FlaxPNDMScheduler(FlaxSchedulerMixin, ConfigMixin):
    """
    Pseudo numerical methods for diffusion models (PNDM) proposes using more advanced ODE integration techniques,
    namely Runge-Kutta method and a linear multi-step method.

    [`~ConfigMixin`] takes care of storing all config attributes that are passed in the scheduler's `__init__`
    function, such as `num_train_timesteps`. They can be accessed via `scheduler.config.num_train_timesteps`.
    [`SchedulerMixin`] provides general loading and saving functionality via the [`SchedulerMixin.save_pretrained`] and
    [`~SchedulerMixin.from_pretrained`] functions.

    For more details, see the original paper: https://arxiv.org/abs/2202.09778

    Args:
        num_train_timesteps (`int`): number of diffusion steps used to train the model.
        beta_start (`float`): the starting `beta` value of inference.
        beta_end (`float`): the final `beta` value.
        beta_schedule (`str`):
            the beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from
            `linear`, `scaled_linear`, or `squaredcos_cap_v2`.
        trained_betas (`jnp.ndarray`, optional):
            option to pass an array of betas directly to the constructor to bypass `beta_start`, `beta_end` etc.
        skip_prk_steps (`bool`):
            allows the scheduler to skip the Runge-Kutta steps that are defined in the original paper as being required
            before plms steps; defaults to `False`.
        set_alpha_to_one (`bool`, default `False`):
            each diffusion step uses the value of alphas product at that step and at the previous one. For the final
            step there is no previous alpha. When this option is `True` the previous alpha product is fixed to `1`,
            otherwise it uses the value of alpha at step 0.
        steps_offset (`int`, default `0`):
            an offset added to the inference steps. You can use a combination of `offset=1` and
            `set_alpha_to_one=False`, to make the last step use step 0 for the previous alpha product, as done in
            stable diffusion.
        prediction_type (`str`, default `epsilon`, optional):
            prediction type of the scheduler function, one of `epsilon` (predicting the noise of the diffusion
            process) or `v_prediction` (see section 2.4 https://imagen.research.google/video/paper.pdf). Any other
            value makes the step functions raise a `ValueError`.
        dtype (`jnp.dtype`, *optional*, defaults to `jnp.float32`):
            the `dtype` used for params and computation.
    """

    _compatibles = [e.name for e in FlaxKarrasDiffusionSchedulers]

    dtype: jnp.dtype
    pndm_order: int

    @property
    def has_state(self):
        """`True`: this scheduler keeps its running values in an explicit `PNDMSchedulerState`."""
        return True

    @register_to_config
    def __init__(
        self,
        num_train_timesteps: int = 1000,
        beta_start: float = 0.0001,
        beta_end: float = 0.02,
        beta_schedule: str = "linear",
        trained_betas: Optional[jnp.ndarray] = None,
        skip_prk_steps: bool = False,
        set_alpha_to_one: bool = False,
        steps_offset: int = 0,
        prediction_type: str = "epsilon",
        dtype: jnp.dtype = jnp.float32,
    ):
        # The remaining arguments are not used here directly; they are read back through `self.config`
        # elsewhere in this class (presumably stored by `register_to_config` - see `ConfigMixin`).
        self.dtype = dtype

        # For now we only support F-PNDM, i.e. the runge-kutta method
        # For more information on the algorithm please take a look at the paper: https://arxiv.org/pdf/2202.09778.pdf
        # mainly at formula (9), (12), (13) and the Algorithm 2.
        self.pndm_order = 4

    def create_state(self, common: Optional[CommonSchedulerState] = None) -> PNDMSchedulerState:
        """
        Create the initial scheduler state.

        Args:
            common (`CommonSchedulerState`, optional):
                shared scheduler state; when `None` it is built with `CommonSchedulerState.create(self)`.

        Returns:
            `PNDMSchedulerState`: a state whose `timesteps` run from `num_train_timesteps - 1` down to `0` and whose
            inference-specific fields are still unset (call `set_timesteps` before stepping).
        """
        if common is None:
            common = CommonSchedulerState.create(self)

        # At every step in ddim, we are looking into the previous alphas_cumprod
        # For the final step, there is no previous alphas_cumprod because we are already at 0
        # `set_alpha_to_one` decides whether we set this parameter simply to one or
        # whether we use the final alpha of the "non-previous" one.
        final_alpha_cumprod = (
            jnp.array(1.0, dtype=self.dtype) if self.config.set_alpha_to_one else common.alphas_cumprod[0]
        )

        # standard deviation of the initial noise distribution
        init_noise_sigma = jnp.array(1.0, dtype=self.dtype)

        timesteps = jnp.arange(0, self.config.num_train_timesteps).round()[::-1]

        return PNDMSchedulerState.create(
            common=common,
            final_alpha_cumprod=final_alpha_cumprod,
            init_noise_sigma=init_noise_sigma,
            timesteps=timesteps,
        )

    def set_timesteps(self, state: PNDMSchedulerState, num_inference_steps: int, shape: Tuple) -> PNDMSchedulerState:
        """
        Sets the discrete timesteps used for the diffusion chain. Supporting function to be run before inference.

        Args:
            state (`PNDMSchedulerState`):
                the `FlaxPNDMScheduler` state data class instance.
            num_inference_steps (`int`):
                the number of diffusion steps used when generating samples with a pre-trained model.
            shape (`Tuple`):
                the shape of the samples to be generated.

        Returns:
            `PNDMSchedulerState`: a copy of `state` with the timestep schedule set and all running values
            (`cur_model_output`, `counter`, `cur_sample`, `ets`) reset to zeros.
        """

        step_ratio = self.config.num_train_timesteps // num_inference_steps
        # creates integer timesteps by multiplying by ratio
        # rounding to avoid issues when num_inference_step is power of 3
        _timesteps = (jnp.arange(0, num_inference_steps) * step_ratio).round() + self.config.steps_offset

        if self.config.skip_prk_steps:
            # for some models like stable diffusion the prk steps can/should be skipped to
            # produce better results. When using PNDM with `self.config.skip_prk_steps` the implementation
            # is based on crowsonkb's PLMS sampler implementation: https://github.com/CompVis/latent-diffusion/pull/51

            prk_timesteps = jnp.array([], dtype=jnp.int32)
            # the second-to-last timestep appears twice in the PLMS schedule
            plms_timesteps = jnp.concatenate([_timesteps[:-1], _timesteps[-2:-1], _timesteps[-1:]])[::-1]

        else:
            # each of the last `pndm_order` timesteps is paired with a copy shifted by half a step
            prk_timesteps = _timesteps[-self.pndm_order :].repeat(2) + jnp.tile(
                jnp.array([0, step_ratio // 2], dtype=jnp.int32),
                self.pndm_order,
            )

            prk_timesteps = (prk_timesteps[:-1].repeat(2)[1:-1])[::-1]
            plms_timesteps = _timesteps[:-3][::-1]

        timesteps = jnp.concatenate([prk_timesteps, plms_timesteps])

        # initial running values

        cur_model_output = jnp.zeros(shape, dtype=self.dtype)
        counter = jnp.int32(0)
        cur_sample = jnp.zeros(shape, dtype=self.dtype)
        # fixed-size buffer for the last four model outputs (the step functions index slots 0..3)
        ets = jnp.zeros((4,) + shape, dtype=self.dtype)

        return state.replace(
            timesteps=timesteps,
            num_inference_steps=num_inference_steps,
            prk_timesteps=prk_timesteps,
            plms_timesteps=plms_timesteps,
            cur_model_output=cur_model_output,
            counter=counter,
            cur_sample=cur_sample,
            ets=ets,
        )

    def scale_model_input(
        self, state: PNDMSchedulerState, sample: jnp.ndarray, timestep: Optional[int] = None
    ) -> jnp.ndarray:
        """
        Ensures interchangeability with schedulers that need to scale the denoising model input depending on the
        current timestep.

        Args:
            state (`PNDMSchedulerState`): the `FlaxPNDMScheduler` state data class instance.
            sample (`jnp.ndarray`): input sample
            timestep (`int`, optional): current timestep

        Returns:
            `jnp.ndarray`: scaled input sample
        """
        # PNDM applies no input scaling: the sample is returned unchanged.
        return sample

    def step(
        self,
        state: PNDMSchedulerState,
        model_output: jnp.ndarray,
        timestep: int,
        sample: jnp.ndarray,
        return_dict: bool = True,
    ) -> Union[FlaxPNDMSchedulerOutput, Tuple]:
        """
        Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion
        process from the learned model outputs (most often the predicted noise).

        This function calls `step_prk()` or `step_plms()` depending on the internal variable `counter`.

        Args:
            state (`PNDMSchedulerState`): the `FlaxPNDMScheduler` state data class instance.
            model_output (`jnp.ndarray`): direct output from learned diffusion model.
            timestep (`int`): current discrete timestep in the diffusion chain.
            sample (`jnp.ndarray`):
                current instance of sample being created by diffusion process.
            return_dict (`bool`): option for returning tuple rather than FlaxPNDMSchedulerOutput class

        Returns:
            [`FlaxPNDMSchedulerOutput`] or `tuple`: [`FlaxPNDMSchedulerOutput`] if `return_dict` is True, otherwise a
            `tuple` `(prev_sample, state)`. When returning a tuple, the first element is the sample tensor.

        Raises:
            `ValueError`: if `set_timesteps` has not been called on `state` (`state.num_inference_steps` is `None`).
        """

        if state.num_inference_steps is None:
            raise ValueError(
                "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler"
            )

        if self.config.skip_prk_steps:
            prev_sample, state = self.step_plms(state, model_output, timestep, sample)
        else:
            # Both candidate updates are computed and the applicable one is picked with `jax.lax.select`,
            # instead of branching in Python on the value of `state.counter`.
            prk_prev_sample, prk_state = self.step_prk(state, model_output, timestep, sample)
            plms_prev_sample, plms_state = self.step_plms(state, model_output, timestep, sample)

            # PRK while the counter is still inside the PRK part of the schedule, PLMS afterwards
            cond = state.counter < len(state.prk_timesteps)

            prev_sample = jax.lax.select(cond, prk_prev_sample, plms_prev_sample)

            state = state.replace(
                cur_model_output=jax.lax.select(cond, prk_state.cur_model_output, plms_state.cur_model_output),
                ets=jax.lax.select(cond, prk_state.ets, plms_state.ets),
                cur_sample=jax.lax.select(cond, prk_state.cur_sample, plms_state.cur_sample),
                counter=jax.lax.select(cond, prk_state.counter, plms_state.counter),
            )

        if not return_dict:
            return (prev_sample, state)

        return FlaxPNDMSchedulerOutput(prev_sample=prev_sample, state=state)

    def step_prk(
        self,
        state: PNDMSchedulerState,
        model_output: jnp.ndarray,
        timestep: int,
        sample: jnp.ndarray,
    ) -> Tuple[jnp.ndarray, PNDMSchedulerState]:
        """
        Step function propagating the sample with the Runge-Kutta method. RK takes 4 forward passes to approximate the
        solution to the differential equation.

        Args:
            state (`PNDMSchedulerState`): the `FlaxPNDMScheduler` state data class instance.
            model_output (`jnp.ndarray`): direct output from learned diffusion model.
            timestep (`int`): current discrete timestep in the diffusion chain.
            sample (`jnp.ndarray`):
                current instance of sample being created by diffusion process.

        Returns:
            `tuple`: `(prev_sample, state)` - the sample at the previous timestep and the updated state (with
            `counter` incremented by one).

        Raises:
            `ValueError`: if `set_timesteps` has not been called on `state` (`state.num_inference_steps` is `None`).
        """

        if state.num_inference_steps is None:
            raise ValueError(
                "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler"
            )

        # odd counters stay on the same timestep, even counters move half a step back
        diff_to_prev = jnp.where(
            state.counter % 2, 0, self.config.num_train_timesteps // state.num_inference_steps // 2
        )
        # NOTE: uses the `timestep` argument; it is only replaced by the schedule value on the next line
        prev_timestep = timestep - diff_to_prev
        # first timestep of the current group of four PRK passes
        timestep = state.prk_timesteps[state.counter // 4 * 4]

        model_output = jax.lax.select(
            (state.counter % 4) != 3,
            model_output,  # remainder 0, 1, 2
            state.cur_model_output + 1 / 6 * model_output,  # remainder 3
        )

        state = state.replace(
            # accumulate the weighted model outputs (weights 1/6, 1/3, 1/3) and reset after the fourth pass
            cur_model_output=jax.lax.select_n(
                state.counter % 4,
                state.cur_model_output + 1 / 6 * model_output,  # remainder 0
                state.cur_model_output + 1 / 3 * model_output,  # remainder 1
                state.cur_model_output + 1 / 3 * model_output,  # remainder 2
                jnp.zeros_like(state.cur_model_output),  # remainder 3
            ),
            # on the first pass of each group, shift the buffer left and append the new model output
            ets=jax.lax.select(
                (state.counter % 4) == 0,
                state.ets.at[0:3].set(state.ets[1:4]).at[3].set(model_output),  # remainder 0
                state.ets,  # remainder 1, 2, 3
            ),
            # remember the sample the group started from
            cur_sample=jax.lax.select(
                (state.counter % 4) == 0,
                sample,  # remainder 0
                state.cur_sample,  # remainder 1, 2, 3
            ),
        )

        cur_sample = state.cur_sample
        prev_sample = self._get_prev_sample(state, cur_sample, timestep, prev_timestep, model_output)
        state = state.replace(counter=state.counter + 1)

        return (prev_sample, state)

    def step_plms(
        self,
        state: PNDMSchedulerState,
        model_output: jnp.ndarray,
        timestep: int,
        sample: jnp.ndarray,
    ) -> Tuple[jnp.ndarray, PNDMSchedulerState]:
        """
        Step function propagating the sample with the linear multi-step method. This has one forward pass with multiple
        times to approximate the solution.

        Args:
            state (`PNDMSchedulerState`): the `FlaxPNDMScheduler` state data class instance.
            model_output (`jnp.ndarray`): direct output from learned diffusion model.
            timestep (`int`): current discrete timestep in the diffusion chain.
            sample (`jnp.ndarray`):
                current instance of sample being created by diffusion process.

        Returns:
            `tuple`: `(prev_sample, state)` - the sample at the previous timestep and the updated state (with
            `counter` incremented by one).

        Raises:
            `ValueError`: if `set_timesteps` has not been called on `state` (`state.num_inference_steps` is `None`).
        """

        if state.num_inference_steps is None:
            raise ValueError(
                "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler"
            )

        # NOTE: There is no way to check in the jitted runtime if the prk mode was ran before

        prev_timestep = timestep - self.config.num_train_timesteps // state.num_inference_steps
        # clamp at 0 so the previous timestep never becomes negative here
        prev_timestep = jnp.where(prev_timestep > 0, prev_timestep, 0)

        # Reference:
        # if state.counter != 1:
        #     state.ets.append(model_output)
        # else:
        #     prev_timestep = timestep
        #     timestep = timestep + self.config.num_train_timesteps // state.num_inference_steps

        prev_timestep = jnp.where(state.counter == 1, timestep, prev_timestep)
        timestep = jnp.where(
            state.counter == 1, timestep + self.config.num_train_timesteps // state.num_inference_steps, timestep
        )

        # Reference:
        # if len(state.ets) == 1 and state.counter == 0:
        #     model_output = model_output
        #     state.cur_sample = sample
        # elif len(state.ets) == 1 and state.counter == 1:
        #     model_output = (model_output + state.ets[-1]) / 2
        #     sample = state.cur_sample
        #     state.cur_sample = None
        # elif len(state.ets) == 2:
        #     model_output = (3 * state.ets[-1] - state.ets[-2]) / 2
        # elif len(state.ets) == 3:
        #     model_output = (23 * state.ets[-1] - 16 * state.ets[-2] + 5 * state.ets[-3]) / 12
        # else:
        #     model_output = (1 / 24) * (55 * state.ets[-1] - 59 * state.ets[-2] + 37 * state.ets[-3] - 9 * state.ets[-4])

        state = state.replace(
            # except at counter 1, shift the buffer left and append the new model output
            ets=jax.lax.select(
                state.counter != 1,
                state.ets.at[0:3].set(state.ets[1:4]).at[3].set(model_output),  # counter != 1
                state.ets,  # counter 1
            ),
            # at counter 1 the sample stored at counter 0 is reused
            cur_sample=jax.lax.select(
                state.counter != 1,
                sample,  # counter != 1
                state.cur_sample,  # counter 1
            ),
        )

        state = state.replace(
            # the multi-step formula is chosen by how many steps have been taken (counter clipped to 0..4)
            cur_model_output=jax.lax.select_n(
                jnp.clip(state.counter, 0, 4),
                model_output,  # counter 0
                (model_output + state.ets[-1]) / 2,  # counter 1
                (3 * state.ets[-1] - state.ets[-2]) / 2,  # counter 2
                (23 * state.ets[-1] - 16 * state.ets[-2] + 5 * state.ets[-3]) / 12,  # counter 3
                (1 / 24)
                * (55 * state.ets[-1] - 59 * state.ets[-2] + 37 * state.ets[-3] - 9 * state.ets[-4]),  # counter >= 4
            ),
        )

        sample = state.cur_sample
        model_output = state.cur_model_output
        prev_sample = self._get_prev_sample(state, sample, timestep, prev_timestep, model_output)
        state = state.replace(counter=state.counter + 1)

        return (prev_sample, state)

    def _get_prev_sample(self, state: PNDMSchedulerState, sample, timestep, prev_timestep, model_output):
        """
        Compute x_(t−δ) from x_t and the (combined) model output using formula (9) of the PNDM paper.

        Args:
            state (`PNDMSchedulerState`): provides `common.alphas_cumprod` and `final_alpha_cumprod`.
            sample: x_t, the current sample.
            timestep: index t into `alphas_cumprod`.
            prev_timestep: index t−δ into `alphas_cumprod`; a negative value selects `final_alpha_cumprod`.
            model_output: e_θ(x_t, t); converted to a noise prediction first when `prediction_type` is
                `v_prediction`.

        Returns:
            the sample at the previous timestep.

        Raises:
            `ValueError`: if `self.config.prediction_type` is neither `epsilon` nor `v_prediction`.
        """
        # See formula (9) of PNDM paper https://arxiv.org/pdf/2202.09778.pdf
        # this function computes x_(t−δ) using the formula of (9)
        # Note that x_t needs to be added to both sides of the equation

        # Notation (<variable name> -> <name in paper>)
        # alpha_prod_t -> α_t
        # alpha_prod_t_prev -> α_(t−δ)
        # beta_prod_t -> (1 - α_t)
        # beta_prod_t_prev -> (1 - α_(t−δ))
        # sample -> x_t
        # model_output -> e_θ(x_t, t)
        # prev_sample -> x_(t−δ)
        alpha_prod_t = state.common.alphas_cumprod[timestep]
        alpha_prod_t_prev = jnp.where(
            prev_timestep >= 0, state.common.alphas_cumprod[prev_timestep], state.final_alpha_cumprod
        )
        beta_prod_t = 1 - alpha_prod_t
        beta_prod_t_prev = 1 - alpha_prod_t_prev

        if self.config.prediction_type == "v_prediction":
            model_output = (alpha_prod_t**0.5) * model_output + (beta_prod_t**0.5) * sample
        elif self.config.prediction_type != "epsilon":
            raise ValueError(
                f"prediction_type given as {self.config.prediction_type} must be one of `epsilon` or `v_prediction`"
            )

        # corresponds to (α_(t−δ) - α_t) divided by
        # denominator of x_t in formula (9) and plus 1
        # Note: (α_(t−δ) - α_t) / (sqrt(α_t) * (sqrt(α_(t−δ)) + sqrt(α_t))) + 1 =
        # sqrt(α_(t−δ)) / sqrt(α_t)
        sample_coeff = (alpha_prod_t_prev / alpha_prod_t) ** (0.5)

        # corresponds to denominator of e_θ(x_t, t) in formula (9)
        model_output_denom_coeff = alpha_prod_t * beta_prod_t_prev ** (0.5) + (
            alpha_prod_t * beta_prod_t * alpha_prod_t_prev
        ) ** (0.5)

        # full formula (9)
        prev_sample = (
            sample_coeff * sample - (alpha_prod_t_prev - alpha_prod_t) * model_output / model_output_denom_coeff
        )

        return prev_sample

    def add_noise(
        self,
        state: PNDMSchedulerState,
        original_samples: jnp.ndarray,
        noise: jnp.ndarray,
        timesteps: jnp.ndarray,
    ) -> jnp.ndarray:
        """
        Add `noise` to `original_samples` at the given `timesteps`.

        Thin wrapper that forwards `state.common` and the arguments to `add_noise_common`.
        """
        return add_noise_common(state.common, original_samples, noise, timesteps)

    def __len__(self):
        """Return the number of training timesteps (`config.num_train_timesteps`)."""
        return self.config.num_train_timesteps