Unverified Commit 3c1ca869 authored by David El Malih, committed by GitHub

Improve docstrings and type hints in scheduling_ddpm.py (#12651)

* Enhance type hints and docstrings in scheduling_ddpm.py

- Added type hints for function parameters and return types across the DDPMScheduler class and related functions.
- Improved docstrings for clarity, including detailed descriptions of parameters and return values.
- Updated the alpha_transform_type and beta_schedule parameters to use Literal types for better type safety.
- Refined the _get_variance and previous_timestep methods with comprehensive documentation.

* Refactor docstrings and type hints in scheduling_ddpm.py

- Cleaned up whitespace in the rescale_zero_terminal_snr function.
- Enhanced the variance_type parameter in the DDPMScheduler class with improved formatting for better readability.
- Updated the docstring for the compute_variance method to maintain consistency and clarity in parameter descriptions and return values.

* Apply `make fix-copies`

* Refactor type hints across multiple scheduler files

- Updated type hints to include `Literal` for improved type safety in various scheduling files.
- Ensured consistency in type hinting for parameters and return types across the affected modules.
- This change enhances code clarity and maintainability.
parent 6fe4a6ff
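
Note: the core pattern this commit introduces is annotating string-option parameters with `typing.Literal` so static type checkers can flag unsupported values. Below is a minimal, self-contained sketch of that pattern; `make_betas` is an illustrative stand-in that mirrors the `betas_for_alpha_bar` signature shown in the diff, not code from the commit itself.

    import math
    from typing import Literal

    import torch


    def make_betas(
        num_steps: int,
        max_beta: float = 0.999,
        alpha_transform_type: Literal["cosine", "exp"] = "cosine",
    ) -> torch.Tensor:
        # A type checker (mypy/pyright) rejects e.g. alpha_transform_type="linear" at analysis time.
        if alpha_transform_type == "cosine":
            def alpha_bar(t: float) -> float:
                return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2
        elif alpha_transform_type == "exp":
            def alpha_bar(t: float) -> float:
                return math.exp(t * -12.0)
        else:  # unreachable for well-typed callers, kept as a runtime guard
            raise ValueError(f"Unsupported alpha_transform_type: {alpha_transform_type}")
        betas = [
            min(1 - alpha_bar((i + 1) / num_steps) / alpha_bar(i / num_steps), max_beta)
            for i in range(num_steps)
        ]
        return torch.tensor(betas, dtype=torch.float32)
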
import math
from dataclasses import dataclass
-from typing import Optional, Tuple, Union
+from typing import Literal, Optional, Tuple, Union
import torch

@@ -12,10 +12,10 @@ from .scheduling_utils import SchedulerMixin
# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
def betas_for_alpha_bar(
-num_diffusion_timesteps,
-max_beta=0.999,
-alpha_transform_type="cosine",
-):
+num_diffusion_timesteps: int,
+max_beta: float = 0.999,
+alpha_transform_type: Literal["cosine", "exp"] = "cosine",
+) -> torch.Tensor:
"""
Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
(1-beta) over time from t = [0,1].

@@ -23,16 +23,17 @@ def betas_for_alpha_bar(
Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
to that part of the diffusion process.
Args:
-num_diffusion_timesteps (`int`): the number of betas to produce.
-max_beta (`float`): the maximum beta to use; use values lower than 1 to
-prevent singularities.
-alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
-Choose from `cosine` or `exp`
+num_diffusion_timesteps (`int`):
+The number of betas to produce.
+max_beta (`float`, defaults to `0.999`):
+The maximum beta to use; use values lower than 1 to avoid numerical instability.
+alpha_transform_type (`"cosine"` or `"exp"`, defaults to `"cosine"`):
+The type of noise schedule for `alpha_bar`. Choose from `cosine` or `exp`.
Returns:
-betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
+`torch.Tensor`:
+The betas used by the scheduler to step the model outputs.
"""
if alpha_transform_type == "cosine":
...
@@ -49,10 +49,10 @@ class DDIMSchedulerOutput(BaseOutput):
# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
def betas_for_alpha_bar(
-num_diffusion_timesteps,
-max_beta=0.999,
-alpha_transform_type="cosine",
-):
+num_diffusion_timesteps: int,
+max_beta: float = 0.999,
+alpha_transform_type: Literal["cosine", "exp"] = "cosine",
+) -> torch.Tensor:
"""
Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
(1-beta) over time from t = [0,1].

@@ -60,16 +60,17 @@ def betas_for_alpha_bar(
Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
to that part of the diffusion process.
Args:
-num_diffusion_timesteps (`int`): the number of betas to produce.
-max_beta (`float`): the maximum beta to use; use values lower than 1 to
-prevent singularities.
-alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
-Choose from `cosine` or `exp`
+num_diffusion_timesteps (`int`):
+The number of betas to produce.
+max_beta (`float`, defaults to `0.999`):
+The maximum beta to use; use values lower than 1 to avoid numerical instability.
+alpha_transform_type (`"cosine"` or `"exp"`, defaults to `"cosine"`):
+The type of noise schedule for `alpha_bar`. Choose from `cosine` or `exp`.
Returns:
-betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
+`torch.Tensor`:
+The betas used by the scheduler to step the model outputs.
"""
if alpha_transform_type == "cosine":

@@ -281,6 +282,8 @@ class DDIMScheduler(SchedulerMixin, ConfigMixin):
# Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample
def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor:
"""
+Apply dynamic thresholding to the predicted sample.
"Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the
prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by
s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing

@@ -288,6 +291,14 @@ class DDIMScheduler(SchedulerMixin, ConfigMixin):
photorealism as well as better image-text alignment, especially when using very large guidance weights."
https://huggingface.co/papers/2205.11487
+Args:
+sample (`torch.Tensor`):
+The predicted sample to be thresholded.
+Returns:
+`torch.Tensor`:
+The thresholded sample.
"""
dtype = sample.dtype
batch_size, channels, *remaining_dims = sample.shape

@@ -501,6 +512,22 @@ class DDIMScheduler(SchedulerMixin, ConfigMixin):
noise: torch.Tensor,
timesteps: torch.IntTensor,
) -> torch.Tensor:
+"""
+Add noise to the original samples according to the noise magnitude at each timestep (this is the forward
+diffusion process).
+Args:
+original_samples (`torch.Tensor`):
+The original samples to which noise will be added.
+noise (`torch.Tensor`):
+The noise to add to the samples.
+timesteps (`torch.IntTensor`):
+The timesteps indicating the noise level for each sample.
+Returns:
+`torch.Tensor`:
+The noisy samples.
+"""
# Make sure alphas_cumprod and timestep have same device and dtype as original_samples
# Move the self.alphas_cumprod to device to avoid redundant CPU to GPU data movement
# for the subsequent add_noise calls

@@ -523,6 +550,21 @@ class DDIMScheduler(SchedulerMixin, ConfigMixin):
# Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.get_velocity
def get_velocity(self, sample: torch.Tensor, noise: torch.Tensor, timesteps: torch.IntTensor) -> torch.Tensor:
+"""
+Compute the velocity prediction from the sample and noise according to the velocity formula.
+Args:
+sample (`torch.Tensor`):
+The input sample.
+noise (`torch.Tensor`):
+The noise tensor.
+timesteps (`torch.IntTensor`):
+The timesteps for velocity computation.
+Returns:
+`torch.Tensor`:
+The computed velocity.
+"""
# Make sure alphas_cumprod and timestep have same device and dtype as sample
self.alphas_cumprod = self.alphas_cumprod.to(device=sample.device)
alphas_cumprod = self.alphas_cumprod.to(dtype=sample.dtype)
...
@@ -18,7 +18,7 @@
import math
from dataclasses import dataclass
-from typing import List, Optional, Tuple, Union
+from typing import List, Literal, Optional, Tuple, Union
import numpy as np
import torch

@@ -49,10 +49,10 @@ class DDIMSchedulerOutput(BaseOutput):
# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
def betas_for_alpha_bar(
-num_diffusion_timesteps,
-max_beta=0.999,
-alpha_transform_type="cosine",
-):
+num_diffusion_timesteps: int,
+max_beta: float = 0.999,
+alpha_transform_type: Literal["cosine", "exp"] = "cosine",
+) -> torch.Tensor:
"""
Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
(1-beta) over time from t = [0,1].

@@ -60,16 +60,17 @@ def betas_for_alpha_bar(
Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
to that part of the diffusion process.
Args:
-num_diffusion_timesteps (`int`): the number of betas to produce.
-max_beta (`float`): the maximum beta to use; use values lower than 1 to
-prevent singularities.
-alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
-Choose from `cosine` or `exp`
+num_diffusion_timesteps (`int`):
+The number of betas to produce.
+max_beta (`float`, defaults to `0.999`):
+The maximum beta to use; use values lower than 1 to avoid numerical instability.
+alpha_transform_type (`"cosine"` or `"exp"`, defaults to `"cosine"`):
+The type of noise schedule for `alpha_bar`. Choose from `cosine` or `exp`.
Returns:
-betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
+`torch.Tensor`:
+The betas used by the scheduler to step the model outputs.
"""
if alpha_transform_type == "cosine":

@@ -408,6 +409,22 @@ class CogVideoXDDIMScheduler(SchedulerMixin, ConfigMixin):
noise: torch.Tensor,
timesteps: torch.IntTensor,
) -> torch.Tensor:
+"""
+Add noise to the original samples according to the noise magnitude at each timestep (this is the forward
+diffusion process).
+Args:
+original_samples (`torch.Tensor`):
+The original samples to which noise will be added.
+noise (`torch.Tensor`):
+The noise to add to the samples.
+timesteps (`torch.IntTensor`):
+The timesteps indicating the noise level for each sample.
+Returns:
+`torch.Tensor`:
+The noisy samples.
+"""
# Make sure alphas_cumprod and timestep have same device and dtype as original_samples
# Move the self.alphas_cumprod to device to avoid redundant CPU to GPU data movement
# for the subsequent add_noise calls

@@ -430,6 +447,21 @@ class CogVideoXDDIMScheduler(SchedulerMixin, ConfigMixin):
# Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.get_velocity
def get_velocity(self, sample: torch.Tensor, noise: torch.Tensor, timesteps: torch.IntTensor) -> torch.Tensor:
+"""
+Compute the velocity prediction from the sample and noise according to the velocity formula.
+Args:
+sample (`torch.Tensor`):
+The input sample.
+noise (`torch.Tensor`):
+The noise tensor.
+timesteps (`torch.IntTensor`):
+The timesteps for velocity computation.
+Returns:
+`torch.Tensor`:
+The computed velocity.
+"""
# Make sure alphas_cumprod and timestep have same device and dtype as sample
self.alphas_cumprod = self.alphas_cumprod.to(device=sample.device)
alphas_cumprod = self.alphas_cumprod.to(dtype=sample.dtype)
...
@@ -16,7 +16,7 @@
# and https://github.com/hojonathanho/diffusion
import math
from dataclasses import dataclass
-from typing import List, Optional, Tuple, Union
+from typing import List, Literal, Optional, Tuple, Union
import numpy as np
import torch

@@ -47,10 +47,10 @@ class DDIMSchedulerOutput(BaseOutput):
# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
def betas_for_alpha_bar(
-num_diffusion_timesteps,
-max_beta=0.999,
-alpha_transform_type="cosine",
-):
+num_diffusion_timesteps: int,
+max_beta: float = 0.999,
+alpha_transform_type: Literal["cosine", "exp"] = "cosine",
+) -> torch.Tensor:
"""
Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
(1-beta) over time from t = [0,1].

@@ -58,16 +58,17 @@ def betas_for_alpha_bar(
Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
to that part of the diffusion process.
Args:
-num_diffusion_timesteps (`int`): the number of betas to produce.
-max_beta (`float`): the maximum beta to use; use values lower than 1 to
-prevent singularities.
-alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
-Choose from `cosine` or `exp`
+num_diffusion_timesteps (`int`):
+The number of betas to produce.
+max_beta (`float`, defaults to `0.999`):
+The maximum beta to use; use values lower than 1 to avoid numerical instability.
+alpha_transform_type (`"cosine"` or `"exp"`, defaults to `"cosine"`):
+The type of noise schedule for `alpha_bar`. Choose from `cosine` or `exp`.
Returns:
-betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
+`torch.Tensor`:
+The betas used by the scheduler to step the model outputs.
"""
if alpha_transform_type == "cosine":
...
@@ -49,10 +49,10 @@ class DDIMParallelSchedulerOutput(BaseOutput):
# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
def betas_for_alpha_bar(
-num_diffusion_timesteps,
-max_beta=0.999,
-alpha_transform_type="cosine",
-):
+num_diffusion_timesteps: int,
+max_beta: float = 0.999,
+alpha_transform_type: Literal["cosine", "exp"] = "cosine",
+) -> torch.Tensor:
"""
Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
(1-beta) over time from t = [0,1].

@@ -60,16 +60,17 @@ def betas_for_alpha_bar(
Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
to that part of the diffusion process.
Args:
-num_diffusion_timesteps (`int`): the number of betas to produce.
-max_beta (`float`): the maximum beta to use; use values lower than 1 to
-prevent singularities.
-alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
-Choose from `cosine` or `exp`
+num_diffusion_timesteps (`int`):
+The number of betas to produce.
+max_beta (`float`, defaults to `0.999`):
+The maximum beta to use; use values lower than 1 to avoid numerical instability.
+alpha_transform_type (`"cosine"` or `"exp"`, defaults to `"cosine"`):
+The type of noise schedule for `alpha_bar`. Choose from `cosine` or `exp`.
Returns:
-betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
+`torch.Tensor`:
+The betas used by the scheduler to step the model outputs.
"""
if alpha_transform_type == "cosine":

@@ -284,6 +285,8 @@ class DDIMParallelScheduler(SchedulerMixin, ConfigMixin):
# Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample
def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor:
"""
+Apply dynamic thresholding to the predicted sample.
"Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the
prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by
s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing

@@ -291,6 +294,14 @@ class DDIMParallelScheduler(SchedulerMixin, ConfigMixin):
photorealism as well as better image-text alignment, especially when using very large guidance weights."
https://huggingface.co/papers/2205.11487
+Args:
+sample (`torch.Tensor`):
+The predicted sample to be thresholded.
+Returns:
+`torch.Tensor`:
+The thresholded sample.
"""
dtype = sample.dtype
batch_size, channels, *remaining_dims = sample.shape

@@ -606,6 +617,22 @@ class DDIMParallelScheduler(SchedulerMixin, ConfigMixin):
noise: torch.Tensor,
timesteps: torch.IntTensor,
) -> torch.Tensor:
+"""
+Add noise to the original samples according to the noise magnitude at each timestep (this is the forward
+diffusion process).
+Args:
+original_samples (`torch.Tensor`):
+The original samples to which noise will be added.
+noise (`torch.Tensor`):
+The noise to add to the samples.
+timesteps (`torch.IntTensor`):
+The timesteps indicating the noise level for each sample.
+Returns:
+`torch.Tensor`:
+The noisy samples.
+"""
# Make sure alphas_cumprod and timestep have same device and dtype as original_samples
# Move the self.alphas_cumprod to device to avoid redundant CPU to GPU data movement
# for the subsequent add_noise calls

@@ -628,6 +655,21 @@ class DDIMParallelScheduler(SchedulerMixin, ConfigMixin):
# Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.get_velocity
def get_velocity(self, sample: torch.Tensor, noise: torch.Tensor, timesteps: torch.IntTensor) -> torch.Tensor:
+"""
+Compute the velocity prediction from the sample and noise according to the velocity formula.
+Args:
+sample (`torch.Tensor`):
+The input sample.
+noise (`torch.Tensor`):
+The noise tensor.
+timesteps (`torch.IntTensor`):
+The timesteps for velocity computation.
+Returns:
+`torch.Tensor`:
+The computed velocity.
+"""
# Make sure alphas_cumprod and timestep have same device and dtype as sample
self.alphas_cumprod = self.alphas_cumprod.to(device=sample.device)
alphas_cumprod = self.alphas_cumprod.to(dtype=sample.dtype)
...
@@ -16,7 +16,7 @@
import math
from dataclasses import dataclass
-from typing import List, Optional, Tuple, Union
+from typing import List, Literal, Optional, Tuple, Union
import numpy as np
import torch

@@ -46,10 +46,10 @@ class DDPMSchedulerOutput(BaseOutput):
def betas_for_alpha_bar(
-num_diffusion_timesteps,
-max_beta=0.999,
-alpha_transform_type="cosine",
-):
+num_diffusion_timesteps: int,
+max_beta: float = 0.999,
+alpha_transform_type: Literal["cosine", "exp"] = "cosine",
+) -> torch.Tensor:
"""
Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
(1-beta) over time from t = [0,1].

@@ -57,16 +57,17 @@ def betas_for_alpha_bar(
Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
to that part of the diffusion process.
Args:
-num_diffusion_timesteps (`int`): the number of betas to produce.
-max_beta (`float`): the maximum beta to use; use values lower than 1 to
-prevent singularities.
-alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
-Choose from `cosine` or `exp`
+num_diffusion_timesteps (`int`):
+The number of betas to produce.
+max_beta (`float`, defaults to `0.999`):
+The maximum beta to use; use values lower than 1 to avoid numerical instability.
+alpha_transform_type (`"cosine"` or `"exp"`, defaults to `"cosine"`):
+The type of noise schedule for `alpha_bar`. Choose from `cosine` or `exp`.
Returns:
-betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
+`torch.Tensor`:
+The betas used by the scheduler to step the model outputs.
"""
if alpha_transform_type == "cosine":

@@ -90,7 +91,7 @@ def betas_for_alpha_bar(
# Copied from diffusers.schedulers.scheduling_ddim.rescale_zero_terminal_snr
-def rescale_zero_terminal_snr(betas):
+def rescale_zero_terminal_snr(betas: torch.Tensor) -> torch.Tensor:
"""
Rescales betas to have zero terminal SNR Based on https://huggingface.co/papers/2305.08891 (Algorithm 1)

@@ -133,39 +134,37 @@ class DDPMScheduler(SchedulerMixin, ConfigMixin):
methods the library implements for all schedulers such as loading and saving.
Args:
-num_train_timesteps (`int`, defaults to 1000):
+num_train_timesteps (`int`, defaults to `1000`):
The number of diffusion steps to train the model.
-beta_start (`float`, defaults to 0.0001):
+beta_start (`float`, defaults to `0.0001`):
The starting `beta` value of inference.
-beta_end (`float`, defaults to 0.02):
+beta_end (`float`, defaults to `0.02`):
The final `beta` value.
-beta_schedule (`str`, defaults to `"linear"`):
-The beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from
-`linear`, `scaled_linear`, `squaredcos_cap_v2`, or `sigmoid`.
+beta_schedule (`"linear"`, `"scaled_linear"`, `"squaredcos_cap_v2"`, or `"sigmoid"`, defaults to `"linear"`):
+The beta schedule, a mapping from a beta range to a sequence of betas for stepping the model.
trained_betas (`np.ndarray`, *optional*):
An array of betas to pass directly to the constructor without using `beta_start` and `beta_end`.
-variance_type (`str`, defaults to `"fixed_small"`):
-Clip the variance when adding noise to the denoised sample. Choose from `fixed_small`, `fixed_small_log`,
-`fixed_large`, `fixed_large_log`, `learned` or `learned_range`.
+variance_type (`"fixed_small"`, `"fixed_small_log"`, `"fixed_large"`, `"fixed_large_log"`, `"learned"`, or `"learned_range"`, defaults to `"fixed_small"`):
+Clip the variance when adding noise to the denoised sample.
clip_sample (`bool`, defaults to `True`):
Clip the predicted sample for numerical stability.
-clip_sample_range (`float`, defaults to 1.0):
+clip_sample_range (`float`, defaults to `1.0`):
The maximum magnitude for sample clipping. Valid only when `clip_sample=True`.
-prediction_type (`str`, defaults to `epsilon`, *optional*):
+prediction_type (`"epsilon"`, `"sample"`, or `"v_prediction"`, defaults to `"epsilon"`):
Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process),
`sample` (directly predicts the noisy sample`) or `v_prediction` (see section 2.4 of [Imagen
Video](https://imagen.research.google/video/paper.pdf) paper).
thresholding (`bool`, defaults to `False`):
Whether to use the "dynamic thresholding" method. This is unsuitable for latent-space diffusion models such
as Stable Diffusion.
-dynamic_thresholding_ratio (`float`, defaults to 0.995):
+dynamic_thresholding_ratio (`float`, defaults to `0.995`):
The ratio for the dynamic thresholding method. Valid only when `thresholding=True`.
-sample_max_value (`float`, defaults to 1.0):
+sample_max_value (`float`, defaults to `1.0`):
The threshold value for dynamic thresholding. Valid only when `thresholding=True`.
-timestep_spacing (`str`, defaults to `"leading"`):
+timestep_spacing (`"linspace"`, `"leading"`, or `"trailing"`, defaults to `"leading"`):
The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and
Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information.
-steps_offset (`int`, defaults to 0):
+steps_offset (`int`, defaults to `0`):
An offset added to the inference steps, as required by some model families.
rescale_betas_zero_snr (`bool`, defaults to `False`):
Whether to rescale the betas to have zero terminal SNR. This enables the model to generate very bright and

@@ -182,16 +181,18 @@ class DDPMScheduler(SchedulerMixin, ConfigMixin):
num_train_timesteps: int = 1000,
beta_start: float = 0.0001,
beta_end: float = 0.02,
-beta_schedule: str = "linear",
+beta_schedule: Literal["linear", "scaled_linear", "squaredcos_cap_v2", "sigmoid"] = "linear",
trained_betas: Optional[Union[np.ndarray, List[float]]] = None,
-variance_type: str = "fixed_small",
+variance_type: Literal[
+"fixed_small", "fixed_small_log", "fixed_large", "fixed_large_log", "learned", "learned_range"
+] = "fixed_small",
clip_sample: bool = True,
-prediction_type: str = "epsilon",
+prediction_type: Literal["epsilon", "sample", "v_prediction"] = "epsilon",
thresholding: bool = False,
dynamic_thresholding_ratio: float = 0.995,
clip_sample_range: float = 1.0,
sample_max_value: float = 1.0,
-timestep_spacing: str = "leading",
+timestep_spacing: Literal["linspace", "leading", "trailing"] = "leading",
steps_offset: int = 0,
rescale_betas_zero_snr: bool = False,
):
@@ -321,7 +322,31 @@ class DDPMScheduler(SchedulerMixin, ConfigMixin):
self.timesteps = torch.from_numpy(timesteps).to(device)
-def _get_variance(self, t, predicted_variance=None, variance_type=None):
+def _get_variance(
+self,
+t: int,
+predicted_variance: Optional[torch.Tensor] = None,
+variance_type: Optional[
+Literal["fixed_small", "fixed_small_log", "fixed_large", "fixed_large_log", "learned", "learned_range"]
+] = None,
+) -> torch.Tensor:
+"""
+Compute the variance for a given timestep according to the specified variance type.
+Args:
+t (`int`):
+The current timestep.
+predicted_variance (`torch.Tensor`, *optional*):
+The predicted variance from the model. Used only when `variance_type` is `"learned"` or
+`"learned_range"`.
+variance_type (`"fixed_small"`, `"fixed_small_log"`, `"fixed_large"`, `"fixed_large_log"`, `"learned"`, or `"learned_range"`, *optional*):
+The type of variance to compute. If `None`, uses the variance type specified in the scheduler
+configuration.
+Returns:
+`torch.Tensor`:
+The computed variance.
+"""
prev_t = self.previous_timestep(t)
alpha_prod_t = self.alphas_cumprod[t]

@@ -363,6 +388,8 @@ class DDPMScheduler(SchedulerMixin, ConfigMixin):
def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor:
"""
+Apply dynamic thresholding to the predicted sample.
"Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the
prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by
s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing

@@ -370,6 +397,14 @@ class DDPMScheduler(SchedulerMixin, ConfigMixin):
photorealism as well as better image-text alignment, especially when using very large guidance weights."
https://huggingface.co/papers/2205.11487
+Args:
+sample (`torch.Tensor`):
+The predicted sample to be thresholded.
+Returns:
+`torch.Tensor`:
+The thresholded sample.
"""
dtype = sample.dtype
batch_size, channels, *remaining_dims = sample.shape

@@ -399,7 +434,7 @@ class DDPMScheduler(SchedulerMixin, ConfigMixin):
model_output: torch.Tensor,
timestep: int,
sample: torch.Tensor,
-generator=None,
+generator: Optional[torch.Generator] = None,
return_dict: bool = True,
) -> Union[DDPMSchedulerOutput, Tuple]:
"""

@@ -409,20 +444,19 @@ class DDPMScheduler(SchedulerMixin, ConfigMixin):
Args:
model_output (`torch.Tensor`):
The direct output from learned diffusion model.
-timestep (`float`):
+timestep (`int`):
The current discrete timestep in the diffusion chain.
sample (`torch.Tensor`):
A current instance of a sample created by the diffusion process.
generator (`torch.Generator`, *optional*):
A random number generator.
-return_dict (`bool`, *optional*, defaults to `True`):
+return_dict (`bool`, defaults to `True`):
Whether or not to return a [`~schedulers.scheduling_ddpm.DDPMSchedulerOutput`] or `tuple`.
Returns:
[`~schedulers.scheduling_ddpm.DDPMSchedulerOutput`] or `tuple`:
If return_dict is `True`, [`~schedulers.scheduling_ddpm.DDPMSchedulerOutput`] is returned, otherwise a
tuple is returned where the first element is the sample tensor.
"""
t = timestep

@@ -503,6 +537,22 @@ class DDPMScheduler(SchedulerMixin, ConfigMixin):
noise: torch.Tensor,
timesteps: torch.IntTensor,
) -> torch.Tensor:
+"""
+Add noise to the original samples according to the noise magnitude at each timestep (this is the forward
+diffusion process).
+Args:
+original_samples (`torch.Tensor`):
+The original samples to which noise will be added.
+noise (`torch.Tensor`):
+The noise to add to the samples.
+timesteps (`torch.IntTensor`):
+The timesteps indicating the noise level for each sample.
+Returns:
+`torch.Tensor`:
+The noisy samples.
+"""
# Make sure alphas_cumprod and timestep have same device and dtype as original_samples
# Move the self.alphas_cumprod to device to avoid redundant CPU to GPU data movement
# for the subsequent add_noise calls

@@ -524,6 +574,21 @@ class DDPMScheduler(SchedulerMixin, ConfigMixin):
return noisy_samples
def get_velocity(self, sample: torch.Tensor, noise: torch.Tensor, timesteps: torch.IntTensor) -> torch.Tensor:
+"""
+Compute the velocity prediction from the sample and noise according to the velocity formula.
+Args:
+sample (`torch.Tensor`):
+The input sample.
+noise (`torch.Tensor`):
+The noise tensor.
+timesteps (`torch.IntTensor`):
+The timesteps for velocity computation.
+Returns:
+`torch.Tensor`:
+The computed velocity.
+"""
# Make sure alphas_cumprod and timestep have same device and dtype as sample
self.alphas_cumprod = self.alphas_cumprod.to(device=sample.device)
alphas_cumprod = self.alphas_cumprod.to(dtype=sample.dtype)

@@ -542,10 +607,21 @@ class DDPMScheduler(SchedulerMixin, ConfigMixin):
velocity = sqrt_alpha_prod * noise - sqrt_one_minus_alpha_prod * sample
return velocity
-def __len__(self):
+def __len__(self) -> int:
return self.config.num_train_timesteps
-def previous_timestep(self, timestep):
+def previous_timestep(self, timestep: int) -> int:
+"""
+Compute the previous timestep in the diffusion chain.
+Args:
+timestep (`int`):
+The current timestep.
+Returns:
+`int`:
+The previous timestep.
+"""
if self.custom_timesteps or self.num_inference_steps:
index = (self.timesteps == timestep).nonzero(as_tuple=True)[0][0]
if index == self.timesteps.shape[0] - 1:
...
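
As context for the `add_noise` and `get_velocity` docstrings added above, here is a minimal usage sketch of those entry points on `DDPMScheduler`. Shapes and the chosen option values are illustrative, and the snippet assumes the public diffusers API as documented in this diff.

    import torch
    from diffusers import DDPMScheduler

    # Illustrative configuration; the string options now carry Literal type hints.
    scheduler = DDPMScheduler(
        num_train_timesteps=1000,
        beta_schedule="linear",
        variance_type="fixed_small",
        prediction_type="v_prediction",
    )

    clean = torch.randn(4, 3, 64, 64)         # x_0, e.g. images or latents
    noise = torch.randn_like(clean)           # epsilon
    timesteps = torch.randint(0, 1000, (4,))  # one noise level per sample

    # Forward diffusion: sqrt(alpha_bar_t) * x_0 + sqrt(1 - alpha_bar_t) * epsilon
    noisy = scheduler.add_noise(clean, noise, timesteps)

    # Training target for v-prediction: sqrt(alpha_bar_t) * epsilon - sqrt(1 - alpha_bar_t) * x_0
    target = scheduler.get_velocity(clean, noise, timesteps)
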
@@ -16,7 +16,7 @@
import math
from dataclasses import dataclass
-from typing import List, Optional, Tuple, Union
+from typing import List, Literal, Optional, Tuple, Union
import numpy as np
import torch

@@ -48,10 +48,10 @@ class DDPMParallelSchedulerOutput(BaseOutput):
# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
def betas_for_alpha_bar(
-num_diffusion_timesteps,
-max_beta=0.999,
-alpha_transform_type="cosine",
-):
+num_diffusion_timesteps: int,
+max_beta: float = 0.999,
+alpha_transform_type: Literal["cosine", "exp"] = "cosine",
+) -> torch.Tensor:
"""
Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
(1-beta) over time from t = [0,1].

@@ -59,16 +59,17 @@ def betas_for_alpha_bar(
Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
to that part of the diffusion process.
Args:
-num_diffusion_timesteps (`int`): the number of betas to produce.
-max_beta (`float`): the maximum beta to use; use values lower than 1 to
-prevent singularities.
-alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
-Choose from `cosine` or `exp`
+num_diffusion_timesteps (`int`):
+The number of betas to produce.
+max_beta (`float`, defaults to `0.999`):
+The maximum beta to use; use values lower than 1 to avoid numerical instability.
+alpha_transform_type (`"cosine"` or `"exp"`, defaults to `"cosine"`):
+The type of noise schedule for `alpha_bar`. Choose from `cosine` or `exp`.
Returns:
-betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
+`torch.Tensor`:
+The betas used by the scheduler to step the model outputs.
"""
if alpha_transform_type == "cosine":

@@ -190,16 +191,18 @@ class DDPMParallelScheduler(SchedulerMixin, ConfigMixin):
num_train_timesteps: int = 1000,
beta_start: float = 0.0001,
beta_end: float = 0.02,
-beta_schedule: str = "linear",
+beta_schedule: Literal["linear", "scaled_linear", "squaredcos_cap_v2", "sigmoid"] = "linear",
trained_betas: Optional[Union[np.ndarray, List[float]]] = None,
-variance_type: str = "fixed_small",
+variance_type: Literal[
+"fixed_small", "fixed_small_log", "fixed_large", "fixed_large_log", "learned", "learned_range"
+] = "fixed_small",
clip_sample: bool = True,
-prediction_type: str = "epsilon",
+prediction_type: Literal["epsilon", "sample", "v_prediction"] = "epsilon",
thresholding: bool = False,
dynamic_thresholding_ratio: float = 0.995,
clip_sample_range: float = 1.0,
sample_max_value: float = 1.0,
-timestep_spacing: str = "leading",
+timestep_spacing: Literal["linspace", "leading", "trailing"] = "leading",
steps_offset: int = 0,
rescale_betas_zero_snr: bool = False,
):

@@ -332,7 +335,31 @@ class DDPMParallelScheduler(SchedulerMixin, ConfigMixin):
self.timesteps = torch.from_numpy(timesteps).to(device)
# Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._get_variance
-def _get_variance(self, t, predicted_variance=None, variance_type=None):
+def _get_variance(
+self,
+t: int,
+predicted_variance: Optional[torch.Tensor] = None,
+variance_type: Optional[
+Literal["fixed_small", "fixed_small_log", "fixed_large", "fixed_large_log", "learned", "learned_range"]
+] = None,
+) -> torch.Tensor:
+"""
+Compute the variance for a given timestep according to the specified variance type.
+Args:
+t (`int`):
+The current timestep.
+predicted_variance (`torch.Tensor`, *optional*):
+The predicted variance from the model. Used only when `variance_type` is `"learned"` or
+`"learned_range"`.
+variance_type (`"fixed_small"`, `"fixed_small_log"`, `"fixed_large"`, `"fixed_large_log"`, `"learned"`, or `"learned_range"`, *optional*):
+The type of variance to compute. If `None`, uses the variance type specified in the scheduler
+configuration.
+Returns:
+`torch.Tensor`:
+The computed variance.
+"""
prev_t = self.previous_timestep(t)
alpha_prod_t = self.alphas_cumprod[t]
@@ -375,6 +402,8 @@ class DDPMParallelScheduler(SchedulerMixin, ConfigMixin):
# Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample
def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor:
"""
+Apply dynamic thresholding to the predicted sample.
"Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the
prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by
s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing

@@ -382,6 +411,14 @@ class DDPMParallelScheduler(SchedulerMixin, ConfigMixin):
photorealism as well as better image-text alignment, especially when using very large guidance weights."
https://huggingface.co/papers/2205.11487
+Args:
+sample (`torch.Tensor`):
+The predicted sample to be thresholded.
+Returns:
+`torch.Tensor`:
+The thresholded sample.
"""
dtype = sample.dtype
batch_size, channels, *remaining_dims = sample.shape

@@ -592,6 +629,22 @@ class DDPMParallelScheduler(SchedulerMixin, ConfigMixin):
noise: torch.Tensor,
timesteps: torch.IntTensor,
) -> torch.Tensor:
+"""
+Add noise to the original samples according to the noise magnitude at each timestep (this is the forward
+diffusion process).
+Args:
+original_samples (`torch.Tensor`):
+The original samples to which noise will be added.
+noise (`torch.Tensor`):
+The noise to add to the samples.
+timesteps (`torch.IntTensor`):
+The timesteps indicating the noise level for each sample.
+Returns:
+`torch.Tensor`:
+The noisy samples.
+"""
# Make sure alphas_cumprod and timestep have same device and dtype as original_samples
# Move the self.alphas_cumprod to device to avoid redundant CPU to GPU data movement
# for the subsequent add_noise calls

@@ -614,6 +667,21 @@ class DDPMParallelScheduler(SchedulerMixin, ConfigMixin):
# Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.get_velocity
def get_velocity(self, sample: torch.Tensor, noise: torch.Tensor, timesteps: torch.IntTensor) -> torch.Tensor:
+"""
+Compute the velocity prediction from the sample and noise according to the velocity formula.
+Args:
+sample (`torch.Tensor`):
+The input sample.
+noise (`torch.Tensor`):
+The noise tensor.
+timesteps (`torch.IntTensor`):
+The timesteps for velocity computation.
+Returns:
+`torch.Tensor`:
+The computed velocity.
+"""
# Make sure alphas_cumprod and timestep have same device and dtype as sample
self.alphas_cumprod = self.alphas_cumprod.to(device=sample.device)
alphas_cumprod = self.alphas_cumprod.to(dtype=sample.dtype)

@@ -637,6 +705,17 @@ class DDPMParallelScheduler(SchedulerMixin, ConfigMixin):
# Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.previous_timestep
def previous_timestep(self, timestep):
+"""
+Compute the previous timestep in the diffusion chain.
+Args:
+timestep (`int`):
+The current timestep.
+Returns:
+`int`:
+The previous timestep.
+"""
if self.custom_timesteps or self.num_inference_steps:
index = (self.timesteps == timestep).nonzero(as_tuple=True)[0][0]
if index == self.timesteps.shape[0] - 1:
...
@@ -16,7 +16,7 @@
# The codebase is modified based on https://github.com/huggingface/diffusers/blob/main/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py
import math
-from typing import List, Optional, Tuple, Union
+from typing import List, Literal, Optional, Tuple, Union
import numpy as np
import torch

@@ -32,10 +32,10 @@ if is_scipy_available():
# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
def betas_for_alpha_bar(
-num_diffusion_timesteps,
-max_beta=0.999,
-alpha_transform_type="cosine",
-):
+num_diffusion_timesteps: int,
+max_beta: float = 0.999,
+alpha_transform_type: Literal["cosine", "exp"] = "cosine",
+) -> torch.Tensor:
"""
Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
(1-beta) over time from t = [0,1].

@@ -43,16 +43,17 @@ def betas_for_alpha_bar(
Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
to that part of the diffusion process.
Args:
-num_diffusion_timesteps (`int`): the number of betas to produce.
-max_beta (`float`): the maximum beta to use; use values lower than 1 to
-prevent singularities.
-alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
-Choose from `cosine` or `exp`
+num_diffusion_timesteps (`int`):
+The number of betas to produce.
+max_beta (`float`, defaults to `0.999`):
+The maximum beta to use; use values lower than 1 to avoid numerical instability.
+alpha_transform_type (`"cosine"` or `"exp"`, defaults to `"cosine"`):
+The type of noise schedule for `alpha_bar`. Choose from `cosine` or `exp`.
Returns:
-betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
+`torch.Tensor`:
+The betas used by the scheduler to step the model outputs.
"""
if alpha_transform_type == "cosine":

@@ -320,6 +321,8 @@ class DEISMultistepScheduler(SchedulerMixin, ConfigMixin):
# Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample
def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor:
"""
+Apply dynamic thresholding to the predicted sample.
"Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the
prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by
s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing

@@ -327,6 +330,14 @@ class DEISMultistepScheduler(SchedulerMixin, ConfigMixin):
photorealism as well as better image-text alignment, especially when using very large guidance weights."
https://huggingface.co/papers/2205.11487
+Args:
+sample (`torch.Tensor`):
+The predicted sample to be thresholded.
+Returns:
+`torch.Tensor`:
+The thresholded sample.
"""
dtype = sample.dtype
batch_size, channels, *remaining_dims = sample.shape
...
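
The `_threshold_sample` docstrings touched in several hunks above describe the dynamic thresholding procedure from the Imagen paper (https://huggingface.co/papers/2205.11487). The standalone sketch below illustrates that procedure under the same per-sample quantile/clip/rescale steps; the function name and argument names are illustrative, not the library's API.

    import torch


    def dynamic_threshold(sample: torch.Tensor, ratio: float = 0.995, max_value: float = 1.0) -> torch.Tensor:
        # Per sample: take the `ratio` quantile of |x0_hat|, keep it at least `max_value`,
        # clip the prediction to [-s, s], then divide by s so values stay within [-1, 1].
        batch_size, channels, *remaining_dims = sample.shape
        flat = sample.reshape(batch_size, -1)
        s = torch.quantile(flat.abs(), ratio, dim=1)
        s = torch.clamp(s, min=max_value).unsqueeze(1)  # never shrink samples already inside [-1, 1]
        flat = torch.clamp(flat, -s, s) / s
        return flat.reshape(batch_size, channels, *remaining_dims)
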
...@@ -18,7 +18,7 @@ ...@@ -18,7 +18,7 @@
import math import math
from dataclasses import dataclass from dataclasses import dataclass
from typing import List, Optional, Tuple, Union from typing import List, Literal, Optional, Tuple, Union
import numpy as np import numpy as np
import torch import torch
...@@ -50,10 +50,10 @@ class DDIMSchedulerOutput(BaseOutput): ...@@ -50,10 +50,10 @@ class DDIMSchedulerOutput(BaseOutput):
# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
def betas_for_alpha_bar( def betas_for_alpha_bar(
num_diffusion_timesteps, num_diffusion_timesteps: int,
max_beta=0.999, max_beta: float = 0.999,
alpha_transform_type="cosine", alpha_transform_type: Literal["cosine", "exp"] = "cosine",
): ) -> torch.Tensor:
""" """
Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
(1-beta) over time from t = [0,1]. (1-beta) over time from t = [0,1].
...@@ -61,16 +61,17 @@ def betas_for_alpha_bar( ...@@ -61,16 +61,17 @@ def betas_for_alpha_bar(
Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
to that part of the diffusion process. to that part of the diffusion process.
Args: Args:
num_diffusion_timesteps (`int`): the number of betas to produce. num_diffusion_timesteps (`int`):
max_beta (`float`): the maximum beta to use; use values lower than 1 to The number of betas to produce.
prevent singularities. max_beta (`float`, defaults to `0.999`):
alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar. The maximum beta to use; use values lower than 1 to avoid numerical instability.
Choose from `cosine` or `exp` alpha_transform_type (`"cosine"` or `"exp"`, defaults to `"cosine"`):
The type of noise schedule for `alpha_bar`. Choose from `cosine` or `exp`.
Returns: Returns:
betas (`np.ndarray`): the betas used by the scheduler to step the model outputs `torch.Tensor`:
The betas used by the scheduler to step the model outputs.
""" """
if alpha_transform_type == "cosine": if alpha_transform_type == "cosine":
...@@ -445,6 +446,22 @@ class CogVideoXDPMScheduler(SchedulerMixin, ConfigMixin): ...@@ -445,6 +446,22 @@ class CogVideoXDPMScheduler(SchedulerMixin, ConfigMixin):
noise: torch.Tensor, noise: torch.Tensor,
timesteps: torch.IntTensor, timesteps: torch.IntTensor,
) -> torch.Tensor: ) -> torch.Tensor:
"""
Add noise to the original samples according to the noise magnitude at each timestep (this is the forward
diffusion process).
Args:
original_samples (`torch.Tensor`):
The original samples to which noise will be added.
noise (`torch.Tensor`):
The noise to add to the samples.
timesteps (`torch.IntTensor`):
The timesteps indicating the noise level for each sample.
Returns:
`torch.Tensor`:
The noisy samples.
"""
# Make sure alphas_cumprod and timestep have same device and dtype as original_samples # Make sure alphas_cumprod and timestep have same device and dtype as original_samples
# Move the self.alphas_cumprod to device to avoid redundant CPU to GPU data movement # Move the self.alphas_cumprod to device to avoid redundant CPU to GPU data movement
# for the subsequent add_noise calls # for the subsequent add_noise calls
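The `add_noise` docstring above corresponds to the standard forward-diffusion formula x_t = sqrt(alpha_bar_t) * x_0 + sqrt(1 - alpha_bar_t) * noise. A minimal sketch under the assumption that a precomputed `alphas_cumprod` tensor is passed in explicitly rather than read from the scheduler:

```python
import torch


def add_noise(original_samples: torch.Tensor,
              noise: torch.Tensor,
              timesteps: torch.IntTensor,
              alphas_cumprod: torch.Tensor) -> torch.Tensor:
    # x_t = sqrt(alpha_bar_t) * x_0 + sqrt(1 - alpha_bar_t) * noise
    alpha_bar = alphas_cumprod.to(device=original_samples.device, dtype=original_samples.dtype)[timesteps]
    while alpha_bar.dim() < original_samples.dim():
        alpha_bar = alpha_bar.unsqueeze(-1)  # broadcast over channel/spatial dims
    return alpha_bar.sqrt() * original_samples + (1 - alpha_bar).sqrt() * noise


betas = torch.linspace(1e-4, 0.02, 1000)          # illustrative linear schedule
alphas_cumprod = torch.cumprod(1.0 - betas, dim=0)
x0 = torch.randn(4, 3, 16, 16)
noisy = add_noise(x0, torch.randn_like(x0), torch.randint(0, 1000, (4,)), alphas_cumprod)
assert noisy.shape == x0.shape
```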
...@@ -467,6 +484,21 @@ class CogVideoXDPMScheduler(SchedulerMixin, ConfigMixin): ...@@ -467,6 +484,21 @@ class CogVideoXDPMScheduler(SchedulerMixin, ConfigMixin):
# Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.get_velocity # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.get_velocity
def get_velocity(self, sample: torch.Tensor, noise: torch.Tensor, timesteps: torch.IntTensor) -> torch.Tensor: def get_velocity(self, sample: torch.Tensor, noise: torch.Tensor, timesteps: torch.IntTensor) -> torch.Tensor:
"""
Compute the velocity prediction from the sample and noise according to the velocity formula.
Args:
sample (`torch.Tensor`):
The input sample.
noise (`torch.Tensor`):
The noise tensor.
timesteps (`torch.IntTensor`):
The timesteps for velocity computation.
Returns:
`torch.Tensor`:
The computed velocity.
"""
# Make sure alphas_cumprod and timestep have same device and dtype as sample # Make sure alphas_cumprod and timestep have same device and dtype as sample
self.alphas_cumprod = self.alphas_cumprod.to(device=sample.device) self.alphas_cumprod = self.alphas_cumprod.to(device=sample.device)
alphas_cumprod = self.alphas_cumprod.to(dtype=sample.dtype) alphas_cumprod = self.alphas_cumprod.to(dtype=sample.dtype)
......
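Similarly, the quantity documented for `get_velocity` is the usual v-prediction target, v_t = sqrt(alpha_bar_t) * noise - sqrt(1 - alpha_bar_t) * x_0. A sketch using the same explicit-`alphas_cumprod` convention as the `add_noise` example above (again an assumption for illustration, not the scheduler's exact code):

```python
import torch


def get_velocity(sample: torch.Tensor,
                 noise: torch.Tensor,
                 timesteps: torch.IntTensor,
                 alphas_cumprod: torch.Tensor) -> torch.Tensor:
    # v_t = sqrt(alpha_bar_t) * noise - sqrt(1 - alpha_bar_t) * x_0
    alpha_bar = alphas_cumprod.to(device=sample.device, dtype=sample.dtype)[timesteps]
    while alpha_bar.dim() < sample.dim():
        alpha_bar = alpha_bar.unsqueeze(-1)  # broadcast over channel/spatial dims
    return alpha_bar.sqrt() * noise - (1 - alpha_bar).sqrt() * sample


alphas_cumprod = torch.cumprod(1.0 - torch.linspace(1e-4, 0.02, 1000), dim=0)
x0, eps = torch.randn(2, 3, 16, 16), torch.randn(2, 3, 16, 16)
v = get_velocity(x0, eps, torch.randint(0, 1000, (2,)), alphas_cumprod)
assert v.shape == x0.shape
```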
...@@ -15,7 +15,7 @@ ...@@ -15,7 +15,7 @@
# DISCLAIMER: This file is strongly influenced by https://github.com/LuChengTHU/dpm-solver # DISCLAIMER: This file is strongly influenced by https://github.com/LuChengTHU/dpm-solver
import math import math
from typing import List, Optional, Tuple, Union from typing import List, Literal, Optional, Tuple, Union
import numpy as np import numpy as np
import torch import torch
...@@ -32,10 +32,10 @@ if is_scipy_available(): ...@@ -32,10 +32,10 @@ if is_scipy_available():
# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
def betas_for_alpha_bar( def betas_for_alpha_bar(
num_diffusion_timesteps, num_diffusion_timesteps: int,
max_beta=0.999, max_beta: float = 0.999,
alpha_transform_type="cosine", alpha_transform_type: Literal["cosine", "exp"] = "cosine",
): ) -> torch.Tensor:
""" """
Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
(1-beta) over time from t = [0,1]. (1-beta) over time from t = [0,1].
...@@ -43,16 +43,17 @@ def betas_for_alpha_bar( ...@@ -43,16 +43,17 @@ def betas_for_alpha_bar(
Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
to that part of the diffusion process. to that part of the diffusion process.
Args: Args:
num_diffusion_timesteps (`int`): the number of betas to produce. num_diffusion_timesteps (`int`):
max_beta (`float`): the maximum beta to use; use values lower than 1 to The number of betas to produce.
prevent singularities. max_beta (`float`, defaults to `0.999`):
alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar. The maximum beta to use; use values lower than 1 to avoid numerical instability.
Choose from `cosine` or `exp` alpha_transform_type (`"cosine"` or `"exp"`, defaults to `"cosine"`):
The type of noise schedule for `alpha_bar`. Choose from `cosine` or `exp`.
Returns: Returns:
betas (`np.ndarray`): the betas used by the scheduler to step the model outputs `torch.Tensor`:
The betas used by the scheduler to step the model outputs.
""" """
if alpha_transform_type == "cosine": if alpha_transform_type == "cosine":
...@@ -459,6 +460,8 @@ class DPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin): ...@@ -459,6 +460,8 @@ class DPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin):
# Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample
def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor: def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor:
""" """
Apply dynamic thresholding to the predicted sample.
"Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the
prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by
s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing
...@@ -466,6 +469,14 @@ class DPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin): ...@@ -466,6 +469,14 @@ class DPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin):
photorealism as well as better image-text alignment, especially when using very large guidance weights." photorealism as well as better image-text alignment, especially when using very large guidance weights."
https://huggingface.co/papers/2205.11487 https://huggingface.co/papers/2205.11487
Args:
sample (`torch.Tensor`):
The predicted sample to be thresholded.
Returns:
`torch.Tensor`:
The thresholded sample.
""" """
dtype = sample.dtype dtype = sample.dtype
batch_size, channels, *remaining_dims = sample.shape batch_size, channels, *remaining_dims = sample.shape
......
...@@ -15,7 +15,7 @@ ...@@ -15,7 +15,7 @@
# DISCLAIMER: This file is strongly influenced by https://github.com/LuChengTHU/dpm-solver # DISCLAIMER: This file is strongly influenced by https://github.com/LuChengTHU/dpm-solver
import math import math
from typing import List, Optional, Tuple, Union from typing import List, Literal, Optional, Tuple, Union
import numpy as np import numpy as np
import torch import torch
...@@ -32,10 +32,10 @@ if is_scipy_available(): ...@@ -32,10 +32,10 @@ if is_scipy_available():
# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
def betas_for_alpha_bar( def betas_for_alpha_bar(
num_diffusion_timesteps, num_diffusion_timesteps: int,
max_beta=0.999, max_beta: float = 0.999,
alpha_transform_type="cosine", alpha_transform_type: Literal["cosine", "exp"] = "cosine",
): ) -> torch.Tensor:
""" """
Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
(1-beta) over time from t = [0,1]. (1-beta) over time from t = [0,1].
...@@ -43,16 +43,17 @@ def betas_for_alpha_bar( ...@@ -43,16 +43,17 @@ def betas_for_alpha_bar(
Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
to that part of the diffusion process. to that part of the diffusion process.
Args: Args:
num_diffusion_timesteps (`int`): the number of betas to produce. num_diffusion_timesteps (`int`):
max_beta (`float`): the maximum beta to use; use values lower than 1 to The number of betas to produce.
prevent singularities. max_beta (`float`, defaults to `0.999`):
alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar. The maximum beta to use; use values lower than 1 to avoid numerical instability.
Choose from `cosine` or `exp` alpha_transform_type (`"cosine"` or `"exp"`, defaults to `"cosine"`):
The type of noise schedule for `alpha_bar`. Choose from `cosine` or `exp`.
Returns: Returns:
betas (`np.ndarray`): the betas used by the scheduler to step the model outputs `torch.Tensor`:
The betas used by the scheduler to step the model outputs.
""" """
if alpha_transform_type == "cosine": if alpha_transform_type == "cosine":
...@@ -332,6 +333,8 @@ class DPMSolverMultistepInverseScheduler(SchedulerMixin, ConfigMixin): ...@@ -332,6 +333,8 @@ class DPMSolverMultistepInverseScheduler(SchedulerMixin, ConfigMixin):
# Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample
def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor: def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor:
""" """
Apply dynamic thresholding to the predicted sample.
"Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the
prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by
s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing
...@@ -339,6 +342,14 @@ class DPMSolverMultistepInverseScheduler(SchedulerMixin, ConfigMixin): ...@@ -339,6 +342,14 @@ class DPMSolverMultistepInverseScheduler(SchedulerMixin, ConfigMixin):
photorealism as well as better image-text alignment, especially when using very large guidance weights." photorealism as well as better image-text alignment, especially when using very large guidance weights."
https://huggingface.co/papers/2205.11487 https://huggingface.co/papers/2205.11487
Args:
sample (`torch.Tensor`):
The predicted sample to be thresholded.
Returns:
`torch.Tensor`:
The thresholded sample.
""" """
dtype = sample.dtype dtype = sample.dtype
batch_size, channels, *remaining_dims = sample.shape batch_size, channels, *remaining_dims = sample.shape
......
...@@ -14,7 +14,7 @@ ...@@ -14,7 +14,7 @@
import math import math
from dataclasses import dataclass from dataclasses import dataclass
from typing import List, Optional, Tuple, Union from typing import List, Literal, Optional, Tuple, Union
import numpy as np import numpy as np
import torch import torch
...@@ -115,10 +115,10 @@ class BrownianTreeNoiseSampler: ...@@ -115,10 +115,10 @@ class BrownianTreeNoiseSampler:
# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
def betas_for_alpha_bar( def betas_for_alpha_bar(
num_diffusion_timesteps, num_diffusion_timesteps: int,
max_beta=0.999, max_beta: float = 0.999,
alpha_transform_type="cosine", alpha_transform_type: Literal["cosine", "exp"] = "cosine",
): ) -> torch.Tensor:
""" """
Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
(1-beta) over time from t = [0,1]. (1-beta) over time from t = [0,1].
...@@ -126,16 +126,17 @@ def betas_for_alpha_bar( ...@@ -126,16 +126,17 @@ def betas_for_alpha_bar(
Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
to that part of the diffusion process. to that part of the diffusion process.
Args: Args:
num_diffusion_timesteps (`int`): the number of betas to produce. num_diffusion_timesteps (`int`):
max_beta (`float`): the maximum beta to use; use values lower than 1 to The number of betas to produce.
prevent singularities. max_beta (`float`, defaults to `0.999`):
alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar. The maximum beta to use; use values lower than 1 to avoid numerical instability.
Choose from `cosine` or `exp` alpha_transform_type (`"cosine"` or `"exp"`, defaults to `"cosine"`):
The type of noise schedule for `alpha_bar`. Choose from `cosine` or `exp`.
Returns: Returns:
betas (`np.ndarray`): the betas used by the scheduler to step the model outputs `torch.Tensor`:
The betas used by the scheduler to step the model outputs.
""" """
if alpha_transform_type == "cosine": if alpha_transform_type == "cosine":
......
...@@ -15,7 +15,7 @@ ...@@ -15,7 +15,7 @@
# DISCLAIMER: This file is strongly influenced by https://github.com/LuChengTHU/dpm-solver # DISCLAIMER: This file is strongly influenced by https://github.com/LuChengTHU/dpm-solver
import math import math
from typing import List, Optional, Tuple, Union from typing import List, Literal, Optional, Tuple, Union
import numpy as np import numpy as np
import torch import torch
...@@ -34,10 +34,10 @@ logger = logging.get_logger(__name__) # pylint: disable=invalid-name ...@@ -34,10 +34,10 @@ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
def betas_for_alpha_bar( def betas_for_alpha_bar(
num_diffusion_timesteps, num_diffusion_timesteps: int,
max_beta=0.999, max_beta: float = 0.999,
alpha_transform_type="cosine", alpha_transform_type: Literal["cosine", "exp"] = "cosine",
): ) -> torch.Tensor:
""" """
Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
(1-beta) over time from t = [0,1]. (1-beta) over time from t = [0,1].
...@@ -45,16 +45,17 @@ def betas_for_alpha_bar( ...@@ -45,16 +45,17 @@ def betas_for_alpha_bar(
Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
to that part of the diffusion process. to that part of the diffusion process.
Args: Args:
num_diffusion_timesteps (`int`): the number of betas to produce. num_diffusion_timesteps (`int`):
max_beta (`float`): the maximum beta to use; use values lower than 1 to The number of betas to produce.
prevent singularities. max_beta (`float`, defaults to `0.999`):
alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar. The maximum beta to use; use values lower than 1 to avoid numerical instability.
Choose from `cosine` or `exp` alpha_transform_type (`"cosine"` or `"exp"`, defaults to `"cosine"`):
The type of noise schedule for `alpha_bar`. Choose from `cosine` or `exp`.
Returns: Returns:
betas (`np.ndarray`): the betas used by the scheduler to step the model outputs `torch.Tensor`:
The betas used by the scheduler to step the model outputs.
""" """
if alpha_transform_type == "cosine": if alpha_transform_type == "cosine":
...@@ -410,6 +411,8 @@ class DPMSolverSinglestepScheduler(SchedulerMixin, ConfigMixin): ...@@ -410,6 +411,8 @@ class DPMSolverSinglestepScheduler(SchedulerMixin, ConfigMixin):
# Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample
def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor: def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor:
""" """
Apply dynamic thresholding to the predicted sample.
"Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the
prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by
s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing
...@@ -417,6 +420,14 @@ class DPMSolverSinglestepScheduler(SchedulerMixin, ConfigMixin): ...@@ -417,6 +420,14 @@ class DPMSolverSinglestepScheduler(SchedulerMixin, ConfigMixin):
photorealism as well as better image-text alignment, especially when using very large guidance weights." photorealism as well as better image-text alignment, especially when using very large guidance weights."
https://huggingface.co/papers/2205.11487 https://huggingface.co/papers/2205.11487
Args:
sample (`torch.Tensor`):
The predicted sample to be thresholded.
Returns:
`torch.Tensor`:
The thresholded sample.
""" """
dtype = sample.dtype dtype = sample.dtype
batch_size, channels, *remaining_dims = sample.shape batch_size, channels, *remaining_dims = sample.shape
......
...@@ -299,6 +299,8 @@ class EDMDPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin): ...@@ -299,6 +299,8 @@ class EDMDPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin):
# Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample
def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor: def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor:
""" """
Apply dynamic thresholding to the predicted sample.
"Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the
prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by
s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing
...@@ -306,6 +308,14 @@ class EDMDPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin): ...@@ -306,6 +308,14 @@ class EDMDPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin):
photorealism as well as better image-text alignment, especially when using very large guidance weights." photorealism as well as better image-text alignment, especially when using very large guidance weights."
https://huggingface.co/papers/2205.11487 https://huggingface.co/papers/2205.11487
Args:
sample (`torch.Tensor`):
The predicted sample to be thresholded.
Returns:
`torch.Tensor`:
The thresholded sample.
""" """
dtype = sample.dtype dtype = sample.dtype
batch_size, channels, *remaining_dims = sample.shape batch_size, channels, *remaining_dims = sample.shape
......
...@@ -14,7 +14,7 @@ ...@@ -14,7 +14,7 @@
import math import math
from dataclasses import dataclass from dataclasses import dataclass
from typing import List, Optional, Tuple, Union from typing import List, Literal, Optional, Tuple, Union
import numpy as np import numpy as np
import torch import torch
...@@ -49,10 +49,10 @@ class EulerAncestralDiscreteSchedulerOutput(BaseOutput): ...@@ -49,10 +49,10 @@ class EulerAncestralDiscreteSchedulerOutput(BaseOutput):
# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
def betas_for_alpha_bar( def betas_for_alpha_bar(
num_diffusion_timesteps, num_diffusion_timesteps: int,
max_beta=0.999, max_beta: float = 0.999,
alpha_transform_type="cosine", alpha_transform_type: Literal["cosine", "exp"] = "cosine",
): ) -> torch.Tensor:
""" """
Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
(1-beta) over time from t = [0,1]. (1-beta) over time from t = [0,1].
...@@ -60,16 +60,17 @@ def betas_for_alpha_bar( ...@@ -60,16 +60,17 @@ def betas_for_alpha_bar(
Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
to that part of the diffusion process. to that part of the diffusion process.
Args: Args:
num_diffusion_timesteps (`int`): the number of betas to produce. num_diffusion_timesteps (`int`):
max_beta (`float`): the maximum beta to use; use values lower than 1 to The number of betas to produce.
prevent singularities. max_beta (`float`, defaults to `0.999`):
alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar. The maximum beta to use; use values lower than 1 to avoid numerical instability.
Choose from `cosine` or `exp` alpha_transform_type (`"cosine"` or `"exp"`, defaults to `"cosine"`):
The type of noise schedule for `alpha_bar`. Choose from `cosine` or `exp`.
Returns: Returns:
betas (`np.ndarray`): the betas used by the scheduler to step the model outputs `torch.Tensor`:
The betas used by the scheduler to step the model outputs.
""" """
if alpha_transform_type == "cosine": if alpha_transform_type == "cosine":
......
...@@ -14,7 +14,7 @@ ...@@ -14,7 +14,7 @@
import math import math
from dataclasses import dataclass from dataclasses import dataclass
from typing import List, Optional, Tuple, Union from typing import List, Literal, Optional, Tuple, Union
import numpy as np import numpy as np
import torch import torch
...@@ -52,10 +52,10 @@ class EulerDiscreteSchedulerOutput(BaseOutput): ...@@ -52,10 +52,10 @@ class EulerDiscreteSchedulerOutput(BaseOutput):
# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
def betas_for_alpha_bar( def betas_for_alpha_bar(
num_diffusion_timesteps, num_diffusion_timesteps: int,
max_beta=0.999, max_beta: float = 0.999,
alpha_transform_type="cosine", alpha_transform_type: Literal["cosine", "exp"] = "cosine",
): ) -> torch.Tensor:
""" """
Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
(1-beta) over time from t = [0,1]. (1-beta) over time from t = [0,1].
...@@ -63,16 +63,17 @@ def betas_for_alpha_bar( ...@@ -63,16 +63,17 @@ def betas_for_alpha_bar(
Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
to that part of the diffusion process. to that part of the diffusion process.
Args: Args:
num_diffusion_timesteps (`int`): the number of betas to produce. num_diffusion_timesteps (`int`):
max_beta (`float`): the maximum beta to use; use values lower than 1 to The number of betas to produce.
prevent singularities. max_beta (`float`, defaults to `0.999`):
alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar. The maximum beta to use; use values lower than 1 to avoid numerical instability.
Choose from `cosine` or `exp` alpha_transform_type (`"cosine"` or `"exp"`, defaults to `"cosine"`):
The type of noise schedule for `alpha_bar`. Choose from `cosine` or `exp`.
Returns: Returns:
betas (`np.ndarray`): the betas used by the scheduler to step the model outputs `torch.Tensor`:
The betas used by the scheduler to step the model outputs.
""" """
if alpha_transform_type == "cosine": if alpha_transform_type == "cosine":
......
...@@ -14,7 +14,7 @@ ...@@ -14,7 +14,7 @@
import math import math
from dataclasses import dataclass from dataclasses import dataclass
from typing import List, Optional, Tuple, Union from typing import List, Literal, Optional, Tuple, Union
import numpy as np import numpy as np
import torch import torch
...@@ -49,10 +49,10 @@ class HeunDiscreteSchedulerOutput(BaseOutput): ...@@ -49,10 +49,10 @@ class HeunDiscreteSchedulerOutput(BaseOutput):
# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
def betas_for_alpha_bar( def betas_for_alpha_bar(
num_diffusion_timesteps, num_diffusion_timesteps: int,
max_beta=0.999, max_beta: float = 0.999,
alpha_transform_type="cosine", alpha_transform_type: Literal["cosine", "exp"] = "cosine",
): ) -> torch.Tensor:
""" """
Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
(1-beta) over time from t = [0,1]. (1-beta) over time from t = [0,1].
...@@ -60,16 +60,17 @@ def betas_for_alpha_bar( ...@@ -60,16 +60,17 @@ def betas_for_alpha_bar(
Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
to that part of the diffusion process. to that part of the diffusion process.
Args: Args:
num_diffusion_timesteps (`int`): the number of betas to produce. num_diffusion_timesteps (`int`):
max_beta (`float`): the maximum beta to use; use values lower than 1 to The number of betas to produce.
prevent singularities. max_beta (`float`, defaults to `0.999`):
alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar. The maximum beta to use; use values lower than 1 to avoid numerical instability.
Choose from `cosine` or `exp` alpha_transform_type (`"cosine"` or `"exp"`, defaults to `"cosine"`):
The type of noise schedule for `alpha_bar`. Choose from `cosine` or `exp`.
Returns: Returns:
betas (`np.ndarray`): the betas used by the scheduler to step the model outputs `torch.Tensor`:
The betas used by the scheduler to step the model outputs.
""" """
if alpha_transform_type == "cosine": if alpha_transform_type == "cosine":
......
...@@ -14,7 +14,7 @@ ...@@ -14,7 +14,7 @@
import math import math
from dataclasses import dataclass from dataclasses import dataclass
from typing import List, Optional, Tuple, Union from typing import List, Literal, Optional, Tuple, Union
import numpy as np import numpy as np
import torch import torch
...@@ -50,10 +50,10 @@ class KDPM2AncestralDiscreteSchedulerOutput(BaseOutput): ...@@ -50,10 +50,10 @@ class KDPM2AncestralDiscreteSchedulerOutput(BaseOutput):
# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
def betas_for_alpha_bar( def betas_for_alpha_bar(
num_diffusion_timesteps, num_diffusion_timesteps: int,
max_beta=0.999, max_beta: float = 0.999,
alpha_transform_type="cosine", alpha_transform_type: Literal["cosine", "exp"] = "cosine",
): ) -> torch.Tensor:
""" """
Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
(1-beta) over time from t = [0,1]. (1-beta) over time from t = [0,1].
...@@ -61,16 +61,17 @@ def betas_for_alpha_bar( ...@@ -61,16 +61,17 @@ def betas_for_alpha_bar(
Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
to that part of the diffusion process. to that part of the diffusion process.
Args: Args:
num_diffusion_timesteps (`int`): the number of betas to produce. num_diffusion_timesteps (`int`):
max_beta (`float`): the maximum beta to use; use values lower than 1 to The number of betas to produce.
prevent singularities. max_beta (`float`, defaults to `0.999`):
alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar. The maximum beta to use; use values lower than 1 to avoid numerical instability.
Choose from `cosine` or `exp` alpha_transform_type (`"cosine"` or `"exp"`, defaults to `"cosine"`):
The type of noise schedule for `alpha_bar`. Choose from `cosine` or `exp`.
Returns: Returns:
betas (`np.ndarray`): the betas used by the scheduler to step the model outputs `torch.Tensor`:
The betas used by the scheduler to step the model outputs.
""" """
if alpha_transform_type == "cosine": if alpha_transform_type == "cosine":
......
...@@ -14,7 +14,7 @@ ...@@ -14,7 +14,7 @@
import math import math
from dataclasses import dataclass from dataclasses import dataclass
from typing import List, Optional, Tuple, Union from typing import List, Literal, Optional, Tuple, Union
import numpy as np import numpy as np
import torch import torch
...@@ -49,10 +49,10 @@ class KDPM2DiscreteSchedulerOutput(BaseOutput): ...@@ -49,10 +49,10 @@ class KDPM2DiscreteSchedulerOutput(BaseOutput):
# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
def betas_for_alpha_bar( def betas_for_alpha_bar(
num_diffusion_timesteps, num_diffusion_timesteps: int,
max_beta=0.999, max_beta: float = 0.999,
alpha_transform_type="cosine", alpha_transform_type: Literal["cosine", "exp"] = "cosine",
): ) -> torch.Tensor:
""" """
Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
(1-beta) over time from t = [0,1]. (1-beta) over time from t = [0,1].
...@@ -60,16 +60,17 @@ def betas_for_alpha_bar( ...@@ -60,16 +60,17 @@ def betas_for_alpha_bar(
Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
to that part of the diffusion process. to that part of the diffusion process.
Args: Args:
num_diffusion_timesteps (`int`): the number of betas to produce. num_diffusion_timesteps (`int`):
max_beta (`float`): the maximum beta to use; use values lower than 1 to The number of betas to produce.
prevent singularities. max_beta (`float`, defaults to `0.999`):
alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar. The maximum beta to use; use values lower than 1 to avoid numerical instability.
Choose from `cosine` or `exp` alpha_transform_type (`"cosine"` or `"exp"`, defaults to `"cosine"`):
The type of noise schedule for `alpha_bar`. Choose from `cosine` or `exp`.
Returns: Returns:
betas (`np.ndarray`): the betas used by the scheduler to step the model outputs `torch.Tensor`:
The betas used by the scheduler to step the model outputs.
""" """
if alpha_transform_type == "cosine": if alpha_transform_type == "cosine":
......
...@@ -17,7 +17,7 @@ ...@@ -17,7 +17,7 @@
import math import math
from dataclasses import dataclass from dataclasses import dataclass
from typing import List, Optional, Tuple, Union from typing import List, Literal, Optional, Tuple, Union
import numpy as np import numpy as np
import torch import torch
...@@ -51,10 +51,10 @@ class LCMSchedulerOutput(BaseOutput): ...@@ -51,10 +51,10 @@ class LCMSchedulerOutput(BaseOutput):
# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
def betas_for_alpha_bar( def betas_for_alpha_bar(
num_diffusion_timesteps, num_diffusion_timesteps: int,
max_beta=0.999, max_beta: float = 0.999,
alpha_transform_type="cosine", alpha_transform_type: Literal["cosine", "exp"] = "cosine",
): ) -> torch.Tensor:
""" """
Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
(1-beta) over time from t = [0,1]. (1-beta) over time from t = [0,1].
...@@ -62,16 +62,17 @@ def betas_for_alpha_bar( ...@@ -62,16 +62,17 @@ def betas_for_alpha_bar(
Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
to that part of the diffusion process. to that part of the diffusion process.
Args: Args:
num_diffusion_timesteps (`int`): the number of betas to produce. num_diffusion_timesteps (`int`):
max_beta (`float`): the maximum beta to use; use values lower than 1 to The number of betas to produce.
prevent singularities. max_beta (`float`, defaults to `0.999`):
alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar. The maximum beta to use; use values lower than 1 to avoid numerical instability.
Choose from `cosine` or `exp` alpha_transform_type (`"cosine"` or `"exp"`, defaults to `"cosine"`):
The type of noise schedule for `alpha_bar`. Choose from `cosine` or `exp`.
Returns: Returns:
betas (`np.ndarray`): the betas used by the scheduler to step the model outputs `torch.Tensor`:
The betas used by the scheduler to step the model outputs.
""" """
if alpha_transform_type == "cosine": if alpha_transform_type == "cosine":
...@@ -314,6 +315,8 @@ class LCMScheduler(SchedulerMixin, ConfigMixin): ...@@ -314,6 +315,8 @@ class LCMScheduler(SchedulerMixin, ConfigMixin):
# Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample
def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor: def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor:
""" """
Apply dynamic thresholding to the predicted sample.
"Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the
prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by
s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing
...@@ -321,6 +324,14 @@ class LCMScheduler(SchedulerMixin, ConfigMixin): ...@@ -321,6 +324,14 @@ class LCMScheduler(SchedulerMixin, ConfigMixin):
photorealism as well as better image-text alignment, especially when using very large guidance weights." photorealism as well as better image-text alignment, especially when using very large guidance weights."
https://huggingface.co/papers/2205.11487 https://huggingface.co/papers/2205.11487
Args:
sample (`torch.Tensor`):
The predicted sample to be thresholded.
Returns:
`torch.Tensor`:
The thresholded sample.
""" """
dtype = sample.dtype dtype = sample.dtype
batch_size, channels, *remaining_dims = sample.shape batch_size, channels, *remaining_dims = sample.shape
...@@ -596,6 +607,22 @@ class LCMScheduler(SchedulerMixin, ConfigMixin): ...@@ -596,6 +607,22 @@ class LCMScheduler(SchedulerMixin, ConfigMixin):
noise: torch.Tensor, noise: torch.Tensor,
timesteps: torch.IntTensor, timesteps: torch.IntTensor,
) -> torch.Tensor: ) -> torch.Tensor:
"""
Add noise to the original samples according to the noise magnitude at each timestep (this is the forward
diffusion process).
Args:
original_samples (`torch.Tensor`):
The original samples to which noise will be added.
noise (`torch.Tensor`):
The noise to add to the samples.
timesteps (`torch.IntTensor`):
The timesteps indicating the noise level for each sample.
Returns:
`torch.Tensor`:
The noisy samples.
"""
# Make sure alphas_cumprod and timestep have same device and dtype as original_samples # Make sure alphas_cumprod and timestep have same device and dtype as original_samples
# Move the self.alphas_cumprod to device to avoid redundant CPU to GPU data movement # Move the self.alphas_cumprod to device to avoid redundant CPU to GPU data movement
# for the subsequent add_noise calls # for the subsequent add_noise calls
...@@ -618,6 +645,21 @@ class LCMScheduler(SchedulerMixin, ConfigMixin): ...@@ -618,6 +645,21 @@ class LCMScheduler(SchedulerMixin, ConfigMixin):
# Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.get_velocity # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.get_velocity
def get_velocity(self, sample: torch.Tensor, noise: torch.Tensor, timesteps: torch.IntTensor) -> torch.Tensor: def get_velocity(self, sample: torch.Tensor, noise: torch.Tensor, timesteps: torch.IntTensor) -> torch.Tensor:
"""
Compute the velocity prediction from the sample and noise according to the velocity formula.
Args:
sample (`torch.Tensor`):
The input sample.
noise (`torch.Tensor`):
The noise tensor.
timesteps (`torch.IntTensor`):
The timesteps for velocity computation.
Returns:
`torch.Tensor`:
The computed velocity.
"""
# Make sure alphas_cumprod and timestep have same device and dtype as sample # Make sure alphas_cumprod and timestep have same device and dtype as sample
self.alphas_cumprod = self.alphas_cumprod.to(device=sample.device) self.alphas_cumprod = self.alphas_cumprod.to(device=sample.device)
alphas_cumprod = self.alphas_cumprod.to(dtype=sample.dtype) alphas_cumprod = self.alphas_cumprod.to(dtype=sample.dtype)
...@@ -641,6 +683,17 @@ class LCMScheduler(SchedulerMixin, ConfigMixin): ...@@ -641,6 +683,17 @@ class LCMScheduler(SchedulerMixin, ConfigMixin):
# Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.previous_timestep # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.previous_timestep
def previous_timestep(self, timestep): def previous_timestep(self, timestep):
"""
Compute the previous timestep in the diffusion chain.
Args:
timestep (`int`):
The current timestep.
Returns:
`int`:
The previous timestep.
"""
if self.custom_timesteps or self.num_inference_steps: if self.custom_timesteps or self.num_inference_steps:
index = (self.timesteps == timestep).nonzero(as_tuple=True)[0][0] index = (self.timesteps == timestep).nonzero(as_tuple=True)[0][0]
if index == self.timesteps.shape[0] - 1: if index == self.timesteps.shape[0] - 1:
......
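The `previous_timestep` docstring above amounts to a lookup in the (descending) inference schedule, as the visible body shows. A standalone sketch, with the schedule passed in explicitly for illustration:

```python
import torch


def previous_timestep(timestep: int, timesteps: torch.Tensor) -> int:
    # timesteps is the descending inference schedule, e.g. [999, 799, 599, 399, 199]
    index = (timesteps == timestep).nonzero(as_tuple=True)[0][0]
    # the next entry in the schedule, or -1 once the current timestep is the last one
    return int(timesteps[index + 1]) if index < timesteps.shape[0] - 1 else -1


timesteps = torch.arange(999, -1, -200)  # tensor([999, 799, 599, 399, 199])
assert previous_timestep(599, timesteps) == 399
assert previous_timestep(199, timesteps) == -1
```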