Unverified Commit 4a343077 authored by Sayak Paul, committed by GitHub

add: utility to format our docs too 📜 (#7314)

* add: utility to format our docs too 📜

* debugging saga

* fix: message

* checking

* should be fixed.

* revert pipeline_fixture

* remove empty line

* make style

* fix: setup.py

* style.
parent 8e963d1c
@@ -90,8 +90,8 @@ def retrieve_timesteps(
     scheduler (`SchedulerMixin`):
         The scheduler to get timesteps from.
     num_inference_steps (`int`):
-        The number of diffusion steps used when generating samples with a pre-trained model. If used,
-        `timesteps` must be `None`.
+        The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
+        must be `None`.
     device (`str` or `torch.device`, *optional*):
         The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
     timesteps (`List[int]`, *optional*):
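Reviewer note: the re-wrapped `retrieve_timesteps` docstring documents two mutually exclusive inputs. A minimal, hedged sketch of the two call styles; the checkpoint name and the step values are placeholders, and the import path follows where the helper is defined for the Stable Diffusion pipeline.

```python
import torch
from diffusers import DiffusionPipeline
from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import retrieve_timesteps

# placeholder checkpoint, only used here to obtain a scheduler instance
pipe = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16)

# 1) let the scheduler derive the spacing from a step count
timesteps, num_inference_steps = retrieve_timesteps(pipe.scheduler, num_inference_steps=30, device="cuda")

# 2) or hand over explicit timesteps (only for schedulers whose `set_timesteps` accepts a
#    `timesteps` argument); `num_inference_steps` must then stay `None`
timesteps, num_inference_steps = retrieve_timesteps(pipe.scheduler, device="cuda", timesteps=[999, 749, 499, 249, 0])
```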
@@ -773,10 +773,10 @@ class StableDiffusionLDM3DPipeline(
     ip_adapter_image: (`PipelineImageInput`, *optional*):
         Optional image input to work with IP Adapters.
     ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*):
-        Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters.
-        Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding
-        if `do_classifier_free_guidance` is set to `True`.
-        If not provided, embeddings are computed from the `ip_adapter_image` input argument.
+        Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
+        IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
+        contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
+        provided, embeddings are computed from the `ip_adapter_image` input argument.
     output_type (`str`, *optional*, defaults to `"pil"`):
         The output format of the generated image. Choose between `PIL.Image` or `np.array`.
     return_dict (`bool`, *optional*, defaults to `True`):
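The `ip_adapter_image_embeds` contract above (one tensor per adapter, negative embeddings included when classifier-free guidance is on) is easiest to satisfy by letting the pipeline precompute the list once and reusing it. A hedged sketch; the `prepare_ip_adapter_image_embeds` helper name and its keyword arguments are as I recall them from this era of the library, and the reference image URL is simply reused from the SVD example further down in this diff.

```python
import torch
from diffusers import AutoPipelineForText2Image
from diffusers.utils import load_image

pipe = AutoPipelineForText2Image.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
).to("cuda")
pipe.load_ip_adapter("h94/IP-Adapter", subfolder="sdxl_models", weight_name="ip-adapter_sdxl.bin")

# any reference image works; this URL just happens to appear elsewhere in this commit
reference = load_image(
    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/svd-docstring-example.jpeg"
)

# one tensor per loaded IP-Adapter; with classifier-free guidance it also carries the
# negative (unconditional) image embedding, as the docstring above requires
image_embeds = pipe.prepare_ip_adapter_image_embeds(
    ip_adapter_image=reference,
    ip_adapter_image_embeds=None,
    device="cuda",
    num_images_per_prompt=1,
    do_classifier_free_guidance=True,
)

image = pipe("a cat wearing a spacesuit", ip_adapter_image_embeds=image_embeds).images[0]
```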
......
@@ -90,8 +90,8 @@ def retrieve_timesteps(
     scheduler (`SchedulerMixin`):
         The scheduler to get timesteps from.
     num_inference_steps (`int`):
-        The number of diffusion steps used when generating samples with a pre-trained model. If used,
-        `timesteps` must be `None`.
+        The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
+        must be `None`.
     device (`str` or `torch.device`, *optional*):
         The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
     timesteps (`List[int]`, *optional*):
@@ -694,9 +694,9 @@ class StableDiffusionPanoramaPipeline(
         circular_padding: bool = False,
     ) -> List[Tuple[int, int, int, int]]:
         """
-        Generates a list of views based on the given parameters.
-        Here, we define the mappings F_i (see Eq. 7 in the MultiDiffusion paper https://arxiv.org/abs/2302.08113).
-        If panorama's height/width < window_size, num_blocks of height/width should return 1.
+        Generates a list of views based on the given parameters. Here, we define the mappings F_i (see Eq. 7 in the
+        MultiDiffusion paper https://arxiv.org/abs/2302.08113). If panorama's height/width < window_size, num_blocks of
+        height/width should return 1.

         Args:
             panorama_height (int): The height of the panorama.
@@ -706,8 +706,8 @@ class StableDiffusionPanoramaPipeline(
             circular_padding (bool, optional): Whether to apply circular padding. Defaults to False.

         Returns:
-            List[Tuple[int, int, int, int]]: A list of tuples representing the views. Each tuple contains
-            four integers representing the start and end coordinates of the window in the panorama.
+            List[Tuple[int, int, int, int]]: A list of tuples representing the views. Each tuple contains four integers
+            representing the start and end coordinates of the window in the panorama.
         """
         panorama_height /= 8
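As context for the re-wrapped `get_views` docstring, the window arithmetic is easier to follow as a tiny standalone sketch. This illustrates the F_i mapping rather than reproducing the pipeline's exact code; the 64/8 window size and stride are the defaults as I recall them, and the function name is only for this example.

```python
def get_views_sketch(panorama_height, panorama_width, window_size=64, stride=8):
    # the pipeline works on latents, so pixel sizes are divided by the VAE scale factor (8)
    panorama_height //= 8
    panorama_width //= 8
    # a panorama smaller than one window along an axis yields a single block on that axis
    num_blocks_height = (panorama_height - window_size) // stride + 1 if panorama_height > window_size else 1
    num_blocks_width = (panorama_width - window_size) // stride + 1 if panorama_width > window_size else 1
    views = []
    for i in range(num_blocks_height * num_blocks_width):
        h_start = (i // num_blocks_width) * stride
        w_start = (i % num_blocks_width) * stride
        views.append((h_start, h_start + window_size, w_start, w_start + window_size))
    return views

# a 512x2048 panorama becomes a single row of horizontally shifted 64x64 latent windows
print(len(get_views_sketch(512, 2048)), get_views_sketch(512, 2048)[0])
```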
@@ -800,8 +800,8 @@ class StableDiffusionPanoramaPipeline(
             The number of denoising steps. More denoising steps usually lead to a higher quality image at the
             expense of slower inference.
         timesteps (`List[int]`, *optional*):
-            The timesteps at which to generate the images. If not specified, then the default
-            timestep spacing strategy of the scheduler is used.
+            The timesteps at which to generate the images. If not specified, then the default timestep spacing
+            strategy of the scheduler is used.
         guidance_scale (`float`, *optional*, defaults to 7.5):
             A higher guidance scale value encourages the model to generate images closely linked to the text
             `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
@@ -832,10 +832,10 @@ class StableDiffusionPanoramaPipeline(
     ip_adapter_image: (`PipelineImageInput`, *optional*):
         Optional image input to work with IP Adapters.
     ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*):
-        Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters.
-        Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding
-        if `do_classifier_free_guidance` is set to `True`.
-        If not provided, embeddings are computed from the `ip_adapter_image` input argument.
+        Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
+        IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
+        contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
+        provided, embeddings are computed from the `ip_adapter_image` input argument.
     output_type (`str`, *optional*, defaults to `"pil"`):
         The output format of the generated image. Choose between `PIL.Image` or `np.array`.
     return_dict (`bool`, *optional*, defaults to `True`):
......
@@ -619,8 +619,8 @@ class StableDiffusionSAGPipeline(DiffusionPipeline, StableDiffusionMixin, Textua
     ip_adapter_image: (`PipelineImageInput`, *optional*):
         Optional image input to work with IP Adapters.
     ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*):
-        Pre-generated image embeddings for IP-Adapter. If not
-        provided, embeddings are computed from the `ip_adapter_image` input argument.
+        Pre-generated image embeddings for IP-Adapter. If not provided, embeddings are computed from the
+        `ip_adapter_image` input argument.
     output_type (`str`, *optional*, defaults to `"pil"`):
         The output format of the generated image. Choose between `PIL.Image` or `np.array`.
     return_dict (`bool`, *optional*, defaults to `True`):
......
@@ -117,8 +117,8 @@ def retrieve_timesteps(
     scheduler (`SchedulerMixin`):
         The scheduler to get timesteps from.
     num_inference_steps (`int`):
-        The number of diffusion steps used when generating samples with a pre-trained model. If used,
-        `timesteps` must be `None`.
+        The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
+        must be `None`.
     device (`str` or `torch.device`, *optional*):
         The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
     timesteps (`List[int]`, *optional*):
@@ -919,10 +919,10 @@ class StableDiffusionXLPipeline(
         input argument.
     ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
     ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*):
-        Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters.
-        Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding
-        if `do_classifier_free_guidance` is set to `True`.
-        If not provided, embeddings are computed from the `ip_adapter_image` input argument.
+        Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
+        IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
+        contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
+        provided, embeddings are computed from the `ip_adapter_image` input argument.
     output_type (`str`, *optional*, defaults to `"pil"`):
         The output format of the generate image. Choose between
        [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
......
@@ -134,8 +134,8 @@ def retrieve_timesteps(
     scheduler (`SchedulerMixin`):
         The scheduler to get timesteps from.
     num_inference_steps (`int`):
-        The number of diffusion steps used when generating samples with a pre-trained model. If used,
-        `timesteps` must be `None`.
+        The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
+        must be `None`.
     device (`str` or `torch.device`, *optional*):
         The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
     timesteps (`List[int]`, *optional*):
@@ -1067,10 +1067,10 @@ class StableDiffusionXLImg2ImgPipeline(
         input argument.
     ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
     ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*):
-        Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters.
-        Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding
-        if `do_classifier_free_guidance` is set to `True`.
-        If not provided, embeddings are computed from the `ip_adapter_image` input argument.
+        Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
+        IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
+        contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
+        provided, embeddings are computed from the `ip_adapter_image` input argument.
     output_type (`str`, *optional*, defaults to `"pil"`):
         The output format of the generate image. Choose between
        [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
......
@@ -279,8 +279,8 @@ def retrieve_timesteps(
     scheduler (`SchedulerMixin`):
         The scheduler to get timesteps from.
     num_inference_steps (`int`):
-        The number of diffusion steps used when generating samples with a pre-trained model. If used,
-        `timesteps` must be `None`.
+        The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
+        must be `None`.
     device (`str` or `torch.device`, *optional*):
         The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
     timesteps (`List[int]`, *optional*):
@@ -1255,11 +1255,12 @@ class StableDiffusionXLInpaintPipeline(
         [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0)
         and checkpoints that are not specifically fine-tuned on low resolutions.
     padding_mask_crop (`int`, *optional*, defaults to `None`):
-        The size of margin in the crop to be applied to the image and masking. If `None`, no crop is applied to image and mask_image. If
-        `padding_mask_crop` is not `None`, it will first find a rectangular region with the same aspect ration of the image and
-        contains all masked area, and then expand that area based on `padding_mask_crop`. The image and mask_image will then be cropped based on
-        the expanded area before resizing to the original image size for inpainting. This is useful when the masked area is small while the image is large
-        and contain information irrelevant for inpainting, such as background.
+        The size of margin in the crop to be applied to the image and masking. If `None`, no crop is applied to
+        image and mask_image. If `padding_mask_crop` is not `None`, it will first find a rectangular region
+        with the same aspect ration of the image and contains all masked area, and then expand that area based
+        on `padding_mask_crop`. The image and mask_image will then be cropped based on the expanded area before
+        resizing to the original image size for inpainting. This is useful when the masked area is small while
+        the image is large and contain information irrelevant for inpainting, such as background.
     strength (`float`, *optional*, defaults to 0.9999):
         Conceptually, indicates how much to transform the masked portion of the reference `image`. Must be
         between 0 and 1. `image` will be used as a starting point, adding more noise to it the larger the
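For readers new to `padding_mask_crop`, a hedged sketch of the intended use: inpainting a small masked region of a large photo so that only the cropped neighbourhood (plus a margin) is denoised and then pasted back. The checkpoint is the one referenced in the surrounding docstring; the image and mask paths and the prompt are placeholders.

```python
import torch
from diffusers import StableDiffusionXLInpaintPipeline
from diffusers.utils import load_image

pipe = StableDiffusionXLInpaintPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
).to("cuda")

image = load_image("large_photo.png")       # placeholder: a large image with a small defect
mask_image = load_image("small_mask.png")   # placeholder: white pixels mark the area to repaint

# crop a region around the mask (with a 32-pixel margin) before inpainting, then paste the
# result back at the original resolution; useful when the mask is tiny relative to the image
result = pipe(
    prompt="a clean wooden table",
    image=image,
    mask_image=mask_image,
    padding_mask_crop=32,
    strength=0.99,
).images[0]
```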
@@ -1319,10 +1320,10 @@ class StableDiffusionXLInpaintPipeline(
         input argument.
     ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
     ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*):
-        Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters.
-        Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding
-        if `do_classifier_free_guidance` is set to `True`.
-        If not provided, embeddings are computed from the `ip_adapter_image` input argument.
+        Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
+        IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
+        contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
+        provided, embeddings are computed from the `ip_adapter_image` input argument.
     num_images_per_prompt (`int`, *optional*, defaults to 1):
         The number of images to generate per prompt.
     eta (`float`, *optional*, defaults to 0.0):
......
@@ -37,10 +37,14 @@ EXAMPLE_DOC_STRING = """
        >>> from diffusers import StableVideoDiffusionPipeline
        >>> from diffusers.utils import load_image, export_to_video

-        >>> pipe = StableVideoDiffusionPipeline.from_pretrained("stabilityai/stable-video-diffusion-img2vid-xt", torch_dtype=torch.float16, variant="fp16")
+        >>> pipe = StableVideoDiffusionPipeline.from_pretrained(
+        ...     "stabilityai/stable-video-diffusion-img2vid-xt", torch_dtype=torch.float16, variant="fp16"
+        ... )
        >>> pipe.to("cuda")

-        >>> image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/svd-docstring-example.jpeg")
+        >>> image = load_image(
+        ...     "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/svd-docstring-example.jpeg"
+        ... )
        >>> image = image.resize((1024, 576))

        >>> frames = pipe(image, num_frames=25, decode_chunk_size=8).frames[0]
@@ -86,8 +90,8 @@ class StableVideoDiffusionPipelineOutput(BaseOutput):
     Args:
         frames (`[List[List[PIL.Image.Image]]`, `np.ndarray`, `torch.FloatTensor`]):
-            List of denoised PIL images of length `batch_size` or numpy array or torch tensor
-            of shape `(batch_size, num_frames, height, width, num_channels)`.
+            List of denoised PIL images of length `batch_size` or numpy array or torch tensor of shape `(batch_size,
+            num_frames, height, width, num_channels)`.
     """

     frames: Union[List[List[PIL.Image.Image]], np.ndarray, torch.FloatTensor]
@@ -104,7 +108,8 @@ class StableVideoDiffusionPipeline(DiffusionPipeline):
         vae ([`AutoencoderKLTemporalDecoder`]):
             Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
         image_encoder ([`~transformers.CLIPVisionModelWithProjection`]):
-            Frozen CLIP image-encoder ([laion/CLIP-ViT-H-14-laion2B-s32B-b79K](https://huggingface.co/laion/CLIP-ViT-H-14-laion2B-s32B-b79K)).
+            Frozen CLIP image-encoder
+            ([laion/CLIP-ViT-H-14-laion2B-s32B-b79K](https://huggingface.co/laion/CLIP-ViT-H-14-laion2B-s32B-b79K)).
         unet ([`UNetSpatioTemporalConditionModel`]):
             A `UNetSpatioTemporalConditionModel` to denoise the encoded image latents.
         scheduler ([`EulerDiscreteScheduler`]):
@@ -357,14 +362,15 @@ class StableVideoDiffusionPipeline(DiffusionPipeline):
     Args:
         image (`PIL.Image.Image` or `List[PIL.Image.Image]` or `torch.FloatTensor`):
-            Image(s) to guide image generation. If you provide a tensor, the expected value range is between `[0, 1]`.
+            Image(s) to guide image generation. If you provide a tensor, the expected value range is between `[0,
+            1]`.
         height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
             The height in pixels of the generated image.
         width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
             The width in pixels of the generated image.
         num_frames (`int`, *optional*):
-            The number of video frames to generate. Defaults to `self.unet.config.num_frames`
-            (14 for `stable-video-diffusion-img2vid` and to 25 for `stable-video-diffusion-img2vid-xt`).
+            The number of video frames to generate. Defaults to `self.unet.config.num_frames` (14 for
+            `stable-video-diffusion-img2vid` and to 25 for `stable-video-diffusion-img2vid-xt`).
         num_inference_steps (`int`, *optional*, defaults to 25):
             The number of denoising steps. More denoising steps usually lead to a higher quality video at the
             expense of slower inference. This parameter is modulated by `strength`.
@@ -373,16 +379,18 @@ class StableVideoDiffusionPipeline(DiffusionPipeline):
         max_guidance_scale (`float`, *optional*, defaults to 3.0):
             The maximum guidance scale. Used for the classifier free guidance with last frame.
         fps (`int`, *optional*, defaults to 7):
-            Frames per second. The rate at which the generated images shall be exported to a video after generation.
-            Note that Stable Diffusion Video's UNet was micro-conditioned on fps-1 during training.
+            Frames per second. The rate at which the generated images shall be exported to a video after
+            generation. Note that Stable Diffusion Video's UNet was micro-conditioned on fps-1 during training.
         motion_bucket_id (`int`, *optional*, defaults to 127):
             Used for conditioning the amount of motion for the generation. The higher the number the more motion
             will be in the video.
         noise_aug_strength (`float`, *optional*, defaults to 0.02):
-            The amount of noise added to the init image, the higher it is the less the video will look like the init image. Increase it for more motion.
+            The amount of noise added to the init image, the higher it is the less the video will look like the
+            init image. Increase it for more motion.
         decode_chunk_size (`int`, *optional*):
-            The number of frames to decode at a time. Higher chunk size leads to better temporal consistency at the expense of more memory usage. By default, the decoder decodes all frames at once for maximal
-            quality. For lower memory usage, reduce `decode_chunk_size`.
+            The number of frames to decode at a time. Higher chunk size leads to better temporal consistency at the
+            expense of more memory usage. By default, the decoder decodes all frames at once for maximal quality.
+            For lower memory usage, reduce `decode_chunk_size`.
         num_videos_per_prompt (`int`, *optional*, defaults to 1):
             The number of videos to generate per prompt.
         generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
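Tying the re-wrapped parameters above back to the example docstring earlier in this diff, here is a sketch of a full generation call; the values are the documented defaults and are shown only to label each knob.

```python
import torch
from diffusers import StableVideoDiffusionPipeline
from diffusers.utils import load_image, export_to_video

pipe = StableVideoDiffusionPipeline.from_pretrained(
    "stabilityai/stable-video-diffusion-img2vid-xt", torch_dtype=torch.float16, variant="fp16"
).to("cuda")
image = load_image(
    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/svd-docstring-example.jpeg"
).resize((1024, 576))

frames = pipe(
    image,
    num_frames=25,
    fps=7,                     # micro-conditioning; the UNet was trained on fps - 1
    motion_bucket_id=127,      # higher values mean more motion in the result
    noise_aug_strength=0.02,   # more noise on the init image -> more motion, less fidelity to it
    decode_chunk_size=8,       # smaller chunks use less memory at some cost in temporal consistency
    generator=torch.manual_seed(42),
).frames[0]
export_to_video(frames, "generated.mp4", fps=7)
```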
@@ -398,7 +406,8 @@ class StableVideoDiffusionPipeline(DiffusionPipeline):
             A function that is called at the end of each denoising step during inference. The function is called
             with the following arguments:
             `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`.
-            `callback_kwargs` will include a list of all tensors as specified by `callback_on_step_end_tensor_inputs`.
+            `callback_kwargs` will include a list of all tensors as specified by
+            `callback_on_step_end_tensor_inputs`.
         callback_on_step_end_tensor_inputs (`List`, *optional*):
             The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
             will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
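The callback contract described above, sketched with an arbitrary inspection callback. `pipe` and `image` are the ones built in the previous sketch, and `"latents"` is the tensor name the pipeline exposes, as I recall it; the printed step is arbitrary.

```python
def my_callback(pipeline, step, timestep, callback_kwargs):
    # callback_kwargs holds the tensors named in callback_on_step_end_tensor_inputs
    latents = callback_kwargs["latents"]
    if step == 10:
        print(f"step {step}, timestep {timestep}, latents shape {tuple(latents.shape)}")
    # the callback must return the (possibly modified) kwargs dict
    return callback_kwargs

frames = pipe(
    image,
    callback_on_step_end=my_callback,
    callback_on_step_end_tensor_inputs=["latents"],
).frames[0]
```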
@@ -411,8 +420,9 @@ class StableVideoDiffusionPipeline(DiffusionPipeline):
     Returns:
         [`~pipelines.stable_diffusion.StableVideoDiffusionPipelineOutput`] or `tuple`:
-            If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableVideoDiffusionPipelineOutput`] is returned,
-            otherwise a `tuple` of (`List[List[PIL.Image.Image]]` or `np.ndarray` or `torch.FloatTensor`) is returned.
+            If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableVideoDiffusionPipelineOutput`] is
+            returned, otherwise a `tuple` of (`List[List[PIL.Image.Image]]` or `np.ndarray` or `torch.FloatTensor`)
+            is returned.
     """
     # 0. Default height and width to unet
     height = height or self.unet.config.sample_size * self.vae_scale_factor
......
@@ -134,8 +134,8 @@ def retrieve_timesteps(
     scheduler (`SchedulerMixin`):
         The scheduler to get timesteps from.
     num_inference_steps (`int`):
-        The number of diffusion steps used when generating samples with a pre-trained model. If used,
-        `timesteps` must be `None`.
+        The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
+        must be `None`.
     device (`str` or `torch.device`, *optional*):
         The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
     timesteps (`List[int]`, *optional*):
......
@@ -150,8 +150,8 @@ def retrieve_timesteps(
     scheduler (`SchedulerMixin`):
         The scheduler to get timesteps from.
     num_inference_steps (`int`):
-        The number of diffusion steps used when generating samples with a pre-trained model. If used,
-        `timesteps` must be `None`.
+        The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
+        must be `None`.
     device (`str` or `torch.device`, *optional*):
         The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
     timesteps (`List[int]`, *optional*):
@@ -943,10 +943,10 @@ class StableDiffusionXLAdapterPipeline(
         input argument.
     ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
     ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*):
-        Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters.
-        Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding
-        if `do_classifier_free_guidance` is set to `True`.
-        If not provided, embeddings are computed from the `ip_adapter_image` input argument.
+        Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
+        IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
+        contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
+        provided, embeddings are computed from the `ip_adapter_image` input argument.
     output_type (`str`, *optional*, defaults to `"pil"`):
         The output format of the generate image. Choose between
        [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
......
@@ -17,7 +17,8 @@ class TextToVideoSDPipelineOutput(BaseOutput):
     Args:
         frames (`torch.Tensor`, `np.ndarray`, or List[List[PIL.Image.Image]]):
-            List of video outputs - It can be a nested list of length `batch_size,` with each sub-list containing denoised
+            List of video outputs - It can be a nested list of length `batch_size,` with each sub-list containing
+            denoised
             PIL image sequences of length `num_frames.` It can also be a NumPy array or Torch tensor of shape
             `(batch_size, num_frames, channels, height, width)`
     """
......
@@ -752,7 +752,8 @@ class UTransformer2DModel(ModelMixin, ConfigMixin):
         cross_attention_kwargs (*optional*):
             Keyword arguments to supply to the cross attention layers, if used.
         return_dict (`bool`, *optional*, defaults to `True`):
-            Whether or not to return a [`models.unets.unet_2d_condition.UNet2DConditionOutput`] instead of a plain tuple.
+            Whether or not to return a [`models.unets.unet_2d_condition.UNet2DConditionOutput`] instead of a plain
+            tuple.
         hidden_states_is_embedding (`bool`, *optional*, defaults to `False`):
             Whether or not hidden_states is an embedding directly usable by the transformer. In this case we will
             ignore input handling (e.g. continuous, vectorized, etc.) and directly feed hidden_states into the
......
@@ -85,7 +85,8 @@ class FlaxDDIMScheduler(FlaxSchedulerMixin, ConfigMixin):
     trained_betas (`jnp.ndarray`, optional):
         option to pass an array of betas directly to the constructor to bypass `beta_start`, `beta_end` etc.
     clip_sample (`bool`, default `True`):
-        option to clip predicted sample between for numerical stability. The clip range is determined by `clip_sample_range`.
+        option to clip predicted sample between for numerical stability. The clip range is determined by
+        `clip_sample_range`.
     clip_sample_range (`float`, default `1.0`):
         the maximum magnitude for sample clipping. Valid only when `clip_sample=True`.
     set_alpha_to_one (`bool`, default `True`):
......
@@ -166,8 +166,8 @@ class DPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin):
         the sampling process. If `True`, the sigmas and time steps are determined according to a sequence of
         `lambda(t)`.
     final_sigmas_type (`str`, defaults to `"zero"`):
-        The final `sigma` value for the noise schedule during the sampling process. If `"sigma_min"`, the final sigma
-        is the same as the last sigma in the training schedule. If `zero`, the final sigma is set to 0.
+        The final `sigma` value for the noise schedule during the sampling process. If `"sigma_min"`, the final
+        sigma is the same as the last sigma in the training schedule. If `zero`, the final sigma is set to 0.
     lambda_min_clipped (`float`, defaults to `-inf`):
         Clipping threshold for the minimum value of `lambda(t)` for numerical stability. This is critical for the
         cosine (`squaredcos_cap_v2`) noise schedule.
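Since `final_sigmas_type` only matters once the scheduler is attached to a pipeline, here is a hedged configuration sketch; the checkpoint is a placeholder, and the other overridden options are the ones the surrounding docstring recommends.

```python
from diffusers import DiffusionPipeline, DPMSolverMultistepScheduler

pipe = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")  # placeholder checkpoint
# "zero" (the default) drives the last sigma to 0; "sigma_min" reuses the training schedule's last sigma
pipe.scheduler = DPMSolverMultistepScheduler.from_config(
    pipe.scheduler.config, algorithm_type="dpmsolver++", solver_order=2, final_sigmas_type="zero"
)
image = pipe("an astronaut riding a horse", num_inference_steps=20).images[0]
```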
......
@@ -108,11 +108,11 @@ class DPMSolverSinglestepScheduler(SchedulerMixin, ConfigMixin):
         The threshold value for dynamic thresholding. Valid only when `thresholding=True` and
         `algorithm_type="dpmsolver++"`.
     algorithm_type (`str`, defaults to `dpmsolver++`):
-        Algorithm type for the solver; can be `dpmsolver` or `dpmsolver++`. The
-        `dpmsolver` type implements the algorithms in the [DPMSolver](https://huggingface.co/papers/2206.00927)
-        paper, and the `dpmsolver++` type implements the algorithms in the
-        [DPMSolver++](https://huggingface.co/papers/2211.01095) paper. It is recommended to use `dpmsolver++` or
-        `sde-dpmsolver++` with `solver_order=2` for guided sampling like in Stable Diffusion.
+        Algorithm type for the solver; can be `dpmsolver` or `dpmsolver++`. The `dpmsolver` type implements the
+        algorithms in the [DPMSolver](https://huggingface.co/papers/2206.00927) paper, and the `dpmsolver++` type
+        implements the algorithms in the [DPMSolver++](https://huggingface.co/papers/2211.01095) paper. It is
+        recommended to use `dpmsolver++` or `sde-dpmsolver++` with `solver_order=2` for guided sampling like in
+        Stable Diffusion.
     solver_type (`str`, defaults to `midpoint`):
         Solver type for the second-order solver; can be `midpoint` or `heun`. The solver type slightly affects the
         sample quality, especially for a small number of steps. It is recommended to use `midpoint` solvers.
@@ -123,8 +123,8 @@ class DPMSolverSinglestepScheduler(SchedulerMixin, ConfigMixin):
         Whether to use Karras sigmas for step sizes in the noise schedule during the sampling process. If `True`,
         the sigmas are determined according to a sequence of noise levels {σi}.
     final_sigmas_type (`str`, *optional*, defaults to `"zero"`):
-        The final `sigma` value for the noise schedule during the sampling process. If `"sigma_min"`, the final sigma
-        is the same as the last sigma in the training schedule. If `zero`, the final sigma is set to 0.
+        The final `sigma` value for the noise schedule during the sampling process. If `"sigma_min"`, the final
+        sigma is the same as the last sigma in the training schedule. If `zero`, the final sigma is set to 0.
     lambda_min_clipped (`float`, defaults to `-inf`):
         Clipping threshold for the minimum value of `lambda(t)` for numerical stability. This is critical for the
         cosine (`squaredcos_cap_v2`) noise schedule.
......
@@ -62,10 +62,9 @@ class EDMDPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin):
         The threshold value for dynamic thresholding. Valid only when `thresholding=True` and
         `algorithm_type="dpmsolver++"`.
     algorithm_type (`str`, defaults to `dpmsolver++`):
-        Algorithm type for the solver; can be `dpmsolver++` or `sde-dpmsolver++`. The
-        `dpmsolver++` type implements the algorithms in the
-        [DPMSolver++](https://huggingface.co/papers/2211.01095) paper. It is recommended to use `dpmsolver++` or
-        `sde-dpmsolver++` with `solver_order=2` for guided sampling like in Stable Diffusion.
+        Algorithm type for the solver; can be `dpmsolver++` or `sde-dpmsolver++`. The `dpmsolver++` type implements
+        the algorithms in the [DPMSolver++](https://huggingface.co/papers/2211.01095) paper. It is recommended to
+        use `dpmsolver++` or `sde-dpmsolver++` with `solver_order=2` for guided sampling like in Stable Diffusion.
     solver_type (`str`, defaults to `midpoint`):
         Solver type for the second-order solver; can be `midpoint` or `heun`. The solver type slightly affects the
         sample quality, especially for a small number of steps. It is recommended to use `midpoint` solvers.
@@ -77,8 +76,8 @@ class EDMDPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin):
         richness. This can stabilize the sampling of the SDE variant of DPMSolver for small number of inference
         steps, but sometimes may result in blurring.
     final_sigmas_type (`str`, defaults to `"zero"`):
-        The final `sigma` value for the noise schedule during the sampling process. If `"sigma_min"`, the final sigma
-        is the same as the last sigma in the training schedule. If `zero`, the final sigma is set to 0.
+        The final `sigma` value for the noise schedule during the sampling process. If `"sigma_min"`, the final
+        sigma is the same as the last sigma in the training schedule. If `zero`, the final sigma is set to 0.
     """

     _compatibles = []
......
@@ -278,8 +278,7 @@ class EDMEulerScheduler(SchedulerMixin, ConfigMixin):
         generator (`torch.Generator`, *optional*):
             A random number generator.
         return_dict (`bool`):
-            Whether or not to return a [`~schedulers.scheduling_euler_discrete.EDMEulerSchedulerOutput`] or
-            tuple.
+            Whether or not to return a [`~schedulers.scheduling_euler_discrete.EDMEulerSchedulerOutput`] or tuple.

     Returns:
         [`~schedulers.scheduling_euler_discrete.EDMEulerSchedulerOutput`] or `tuple`:
......
@@ -92,19 +92,20 @@ class SASolverScheduler(SchedulerMixin, ConfigMixin):
     trained_betas (`np.ndarray`, *optional*):
         Pass an array of betas directly to the constructor to bypass `beta_start` and `beta_end`.
     predictor_order (`int`, defaults to 2):
-        The predictor order which can be `1` or `2` or `3` or '4'. It is recommended to use `predictor_order=2` for guided
-        sampling, and `predictor_order=3` for unconditional sampling.
+        The predictor order which can be `1` or `2` or `3` or '4'. It is recommended to use `predictor_order=2` for
+        guided sampling, and `predictor_order=3` for unconditional sampling.
     corrector_order (`int`, defaults to 2):
-        The corrector order which can be `1` or `2` or `3` or '4'. It is recommended to use `corrector_order=2` for guided
-        sampling, and `corrector_order=3` for unconditional sampling.
+        The corrector order which can be `1` or `2` or `3` or '4'. It is recommended to use `corrector_order=2` for
+        guided sampling, and `corrector_order=3` for unconditional sampling.
     prediction_type (`str`, defaults to `epsilon`, *optional*):
         Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process),
         `sample` (directly predicts the noisy sample`) or `v_prediction` (see section 2.4 of [Imagen
         Video](https://imagen.research.google/video/paper.pdf) paper).
     tau_func (`Callable`, *optional*):
-        Stochasticity during the sampling. Default in init is `lambda t: 1 if t >= 200 and t <= 800 else 0`. SA-Solver
-        will sample from vanilla diffusion ODE if tau_func is set to `lambda t: 0`. SA-Solver will sample from vanilla
-        diffusion SDE if tau_func is set to `lambda t: 1`. For more details, please check https://arxiv.org/abs/2309.05019
+        Stochasticity during the sampling. Default in init is `lambda t: 1 if t >= 200 and t <= 800 else 0`.
+        SA-Solver will sample from vanilla diffusion ODE if tau_func is set to `lambda t: 0`. SA-Solver will sample
+        from vanilla diffusion SDE if tau_func is set to `lambda t: 1`. For more details, please check
+        https://arxiv.org/abs/2309.05019
     thresholding (`bool`, defaults to `False`):
         Whether to use the "dynamic thresholding" method. This is unsuitable for latent-space diffusion models such
         as Stable Diffusion.
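The three `tau_func` regimes called out in the docstring above, as a minimal sketch. It relies only on `tau_func` being an `__init__` argument, which the docstring itself states; all other constructor options are left at their defaults.

```python
from diffusers import SASolverScheduler

# default: stochasticity only for timesteps in [200, 800]
scheduler = SASolverScheduler()

# vanilla diffusion ODE: no noise injected at any timestep
ode_scheduler = SASolverScheduler(tau_func=lambda t: 0)

# vanilla diffusion SDE: noise injected at every timestep
sde_scheduler = SASolverScheduler(tau_func=lambda t: 1)
```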
@@ -114,8 +115,8 @@ class SASolverScheduler(SchedulerMixin, ConfigMixin):
         The threshold value for dynamic thresholding. Valid only when `thresholding=True` and
         `algorithm_type="dpmsolver++"`.
     algorithm_type (`str`, defaults to `data_prediction`):
-        Algorithm type for the solver; can be `data_prediction` or `noise_prediction`. It is recommended to use `data_prediction`
-        with `solver_order=2` for guided sampling like in Stable Diffusion.
+        Algorithm type for the solver; can be `data_prediction` or `noise_prediction`. It is recommended to use
+        `data_prediction` with `solver_order=2` for guided sampling like in Stable Diffusion.
     lower_order_final (`bool`, defaults to `True`):
         Whether to use lower-order solvers in the final steps. Default = True.
     use_karras_sigmas (`bool`, *optional*, defaults to `False`):
@@ -402,14 +403,14 @@ class SASolverScheduler(SchedulerMixin, ConfigMixin):
         **kwargs,
     ) -> torch.FloatTensor:
         """
-        Convert the model output to the corresponding type the data_prediction/noise_prediction algorithm needs. Noise_prediction is
-        designed to discretize an integral of the noise prediction model, and data_prediction is designed to discretize an
-        integral of the data prediction model.
+        Convert the model output to the corresponding type the data_prediction/noise_prediction algorithm needs.
+        Noise_prediction is designed to discretize an integral of the noise prediction model, and data_prediction is
+        designed to discretize an integral of the data prediction model.

         <Tip>

-        The algorithm and model type are decoupled. You can use either data_prediction or noise_prediction for both noise
-        prediction and data prediction models.
+        The algorithm and model type are decoupled. You can use either data_prediction or noise_prediction for both
+        noise prediction and data prediction models.

         </Tip>
......
@@ -132,8 +132,8 @@ def rescale_zero_terminal_snr(betas: torch.FloatTensor) -> torch.FloatTensor:
 class TCDScheduler(SchedulerMixin, ConfigMixin):
     """
-    `TCDScheduler` incorporates the `Strategic Stochastic Sampling` introduced by the paper `Trajectory Consistency Distillation`,
-    extending the original Multistep Consistency Sampling to enable unrestricted trajectory traversal.
+    `TCDScheduler` incorporates the `Strategic Stochastic Sampling` introduced by the paper `Trajectory Consistency
+    Distillation`, extending the original Multistep Consistency Sampling to enable unrestricted trajectory traversal.

     This code is based on the official repo of TCD(https://github.com/jabir-zheng/TCD).
@@ -543,8 +543,9 @@ class TCDScheduler(SchedulerMixin, ConfigMixin):
         sample (`torch.FloatTensor`):
             A current instance of a sample created by the diffusion process.
         eta (`float`):
-            A stochastic parameter (referred to as `gamma` in the paper) used to control the stochasticity in every step.
-            When eta = 0, it represents deterministic sampling, whereas eta = 1 indicates full stochastic sampling.
+            A stochastic parameter (referred to as `gamma` in the paper) used to control the stochasticity in every
+            step. When eta = 0, it represents deterministic sampling, whereas eta = 1 indicates full stochastic
+            sampling.
         generator (`torch.Generator`, *optional*):
             A random number generator.
         return_dict (`bool`, *optional*, defaults to `True`):
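A hedged sketch of steering the `eta` (the paper's gamma) described above through an SDXL pipeline. TCD is normally paired with a consistency-distilled LoRA, which is omitted here to keep the sketch short, so treat this purely as an illustration of the knob; the prompt and values are arbitrary.

```python
import torch
from diffusers import StableDiffusionXLPipeline, TCDScheduler

pipe = StableDiffusionXLPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
).to("cuda")
pipe.scheduler = TCDScheduler.from_config(pipe.scheduler.config)
# a TCD / consistency-distilled LoRA would normally be loaded here before sampling

# eta (gamma in the paper): 0.0 -> deterministic sampling, 1.0 -> fully stochastic sampling
image = pipe(
    "a portrait photo of an astronaut",
    num_inference_steps=4,
    guidance_scale=0,
    eta=0.3,
).images[0]
```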
......
@@ -128,8 +128,8 @@ class UniPCMultistepScheduler(SchedulerMixin, ConfigMixin):
     steps_offset (`int`, defaults to 0):
         An offset added to the inference steps, as required by some model families.
     final_sigmas_type (`str`, defaults to `"zero"`):
-        The final `sigma` value for the noise schedule during the sampling process. If `"sigma_min"`, the final sigma
-        is the same as the last sigma in the training schedule. If `zero`, the final sigma is set to 0.
+        The final `sigma` value for the noise schedule during the sampling process. If `"sigma_min"`, the final
+        sigma is the same as the last sigma in the training schedule. If `zero`, the final sigma is set to 0.
     """

     _compatibles = [e.name for e in KarrasDiffusionSchedulers]
......
@@ -246,8 +246,8 @@ def get_cached_module_file(
     <Tip>

-    You may pass a token in `token` if you are not logged in (`huggingface-cli login`) and want to use private
-    or [gated models](https://huggingface.co/docs/hub/models-gated#gated-models).
+    You may pass a token in `token` if you are not logged in (`huggingface-cli login`) and want to use private or
+    [gated models](https://huggingface.co/docs/hub/models-gated#gated-models).

     </Tip>
@@ -434,8 +434,8 @@ def get_class_from_dynamic_module(
     <Tip>

-    You may pass a token in `token` if you are not logged in (`huggingface-cli login`) and want to use private
-    or [gated models](https://huggingface.co/docs/hub/models-gated#gated-models).
+    You may pass a token in `token` if you are not logged in (`huggingface-cli login`) and want to use private or
+    [gated models](https://huggingface.co/docs/hub/models-gated#gated-models).

     </Tip>
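For context, the `token` mentioned in both tips is usually supplied through the high-level loaders rather than by calling these helpers directly. A hedged sketch with placeholder repository ids and token:

```python
from diffusers import DiffusionPipeline

# `custom_pipeline` code is fetched through get_cached_module_file / get_class_from_dynamic_module;
# pass `token` explicitly if you are not logged in via `huggingface-cli login`
pipe = DiffusionPipeline.from_pretrained(
    "your-org/your-private-model",                      # placeholder repo id
    custom_pipeline="your-org/your-custom-pipeline",    # placeholder community pipeline repo
    token="hf_...",                                     # placeholder access token
)
```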
......