Unverified Commit f09ca909 authored by Dhruv Nair, committed by GitHub

Multiple small fixes to Video Pipeline docs (#6805)



* update

* update

* update

* Update src/diffusers/pipelines/i2vgen_xl/pipeline_i2vgen_xl.py
Co-authored-by: YiYi Xu <yixu310@gmail.com>

* update

* update

---------
Co-authored-by: YiYi Xu <yixu310@gmail.com>
parent a5fc62f8
@@ -31,7 +31,7 @@ Sample output with I2VGenXL:
 <table>
 <tr>
 <td><center>
-masterpiece, bestquality, sunset.
+library.
 <br>
 <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/i2vgen-xl-example.gif"
 alt="library"
...
@@ -70,7 +70,7 @@ Here are some sample outputs:
 <table>
 <tr>
 <td><center>
-masterpiece, bestquality, sunset.
+cat in a field.
 <br>
 <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/pia-default-output.gif"
 alt="cat in a field"
@@ -119,7 +119,7 @@ image = load_image(
 "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/pix2pix/cat_6.png?download=true"
 )
 image = image.resize((512, 512))
-prompt = "cat in a hat"
+prompt = "cat in a field"
 negative_prompt = "wrong white balance, dark, sketches,worst quality,low quality"
 generator = torch.Generator("cpu").manual_seed(0)
@@ -132,7 +132,7 @@ export_to_gif(frames, "pia-freeinit-animation.gif")
 <table>
 <tr>
 <td><center>
-masterpiece, bestquality, sunset.
+cat in a field.
 <br>
 <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/pia-freeinit-output-cat.gif"
 alt="cat in a field"
...
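The hunks above only change prompt strings and captions in the PIA + FreeInit example; the surrounding setup is collapsed in this view. A minimal sketch of how the full example fits together, assuming the checkpoint names and `enable_free_init` settings used elsewhere in the PIA docs (they are not part of this diff):

```python
import torch
from diffusers import MotionAdapter, PIAPipeline
from diffusers.utils import export_to_gif, load_image

# Checkpoint names below are assumptions taken from the PIA docs, not from this commit.
adapter = MotionAdapter.from_pretrained("openmmlab/PIA-condition-adapter")
pipe = PIAPipeline.from_pretrained(
    "SG161222/Realistic_Vision_V6.0_B1_noVAE", motion_adapter=adapter, torch_dtype=torch.float16
)
pipe.enable_model_cpu_offload()

# FreeInit re-samples the initial noise over a few extra iterations for smoother motion.
pipe.enable_free_init(method="butterworth", use_fast_sampling=True)

image = load_image(
    "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/pix2pix/cat_6.png?download=true"
)
image = image.resize((512, 512))
prompt = "cat in a field"
negative_prompt = "wrong white balance, dark, sketches, worst quality, low quality"

generator = torch.Generator("cpu").manual_seed(0)
output = pipe(image=image, prompt=prompt, negative_prompt=negative_prompt, generator=generator)
frames = output.frames[0]  # first (and only) video in the batch, a list of PIL images
export_to_gif(frames, "pia-freeinit-animation.gif")
```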
@@ -41,7 +41,7 @@ pipe = DiffusionPipeline.from_pretrained("damo-vilab/text-to-video-ms-1.7b", tor
 pipe = pipe.to("cuda")
 prompt = "Spiderman is surfing"
-video_frames = pipe(prompt).frames
+video_frames = pipe(prompt).frames[0]
 video_path = export_to_video(video_frames)
 video_path
 ```
@@ -64,7 +64,7 @@ pipe.enable_model_cpu_offload()
 pipe.enable_vae_slicing()
 prompt = "Darth Vader surfing a wave"
-video_frames = pipe(prompt, num_frames=64).frames
+video_frames = pipe(prompt, num_frames=64).frames[0]
 video_path = export_to_video(video_frames)
 video_path
 ```
@@ -83,7 +83,7 @@ pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
 pipe.enable_model_cpu_offload()
 prompt = "Spiderman is surfing"
-video_frames = pipe(prompt, num_inference_steps=25).frames
+video_frames = pipe(prompt, num_inference_steps=25).frames[0]
 video_path = export_to_video(video_frames)
 video_path
 ```
@@ -130,7 +130,7 @@ pipe.unet.enable_forward_chunking(chunk_size=1, dim=1)
 pipe.enable_vae_slicing()
 prompt = "Darth Vader surfing a wave"
-video_frames = pipe(prompt, num_frames=24).frames
+video_frames = pipe(prompt, num_frames=24).frames[0]
 video_path = export_to_video(video_frames)
 video_path
 ```
@@ -148,7 +148,7 @@ pipe.enable_vae_slicing()
 video = [Image.fromarray(frame).resize((1024, 576)) for frame in video_frames]
-video_frames = pipe(prompt, video=video, strength=0.6).frames
+video_frames = pipe(prompt, video=video, strength=0.6).frames[0]
 video_path = export_to_video(video_frames)
 video_path
 ```
...
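The recurring change in these hunks is the `[0]` appended to `.frames`: the pipeline output's `frames` field is batched (one video per prompt), so the first video must be selected before it is handed to `export_to_video`. Expanded from the first hunk above into a self-contained sketch:

```python
import torch
from diffusers import DiffusionPipeline
from diffusers.utils import export_to_video

pipe = DiffusionPipeline.from_pretrained(
    "damo-vilab/text-to-video-ms-1.7b", torch_dtype=torch.float16, variant="fp16"
)
pipe = pipe.to("cuda")

prompt = "Spiderman is surfing"
output = pipe(prompt)

# `output.frames` holds one video per prompt in the batch; index into it
# before exporting, otherwise `export_to_video` would receive the whole batch.
video_frames = output.frames[0]
video_path = export_to_video(video_frames)
print(video_path)
```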
@@ -14,9 +14,10 @@ class AnimateDiffPipelineOutput(BaseOutput):
 Output class for AnimateDiff pipelines.
 Args:
-    frames (`List[List[PIL.Image.Image]]` or `torch.Tensor` or `np.ndarray`):
-        List of PIL Images of length `batch_size` or torch.Tensor or np.ndarray of shape
-        `(batch_size, num_frames, height, width, num_channels)`.
+    frames (`torch.Tensor`, `np.ndarray`, or `List[List[PIL.Image.Image]]`):
+        List of video outputs. It can be a nested list of length `batch_size`, with each sub-list containing
+        denoised PIL image sequences of length `num_frames`. It can also be a NumPy array or Torch tensor of
+        shape `(batch_size, num_frames, channels, height, width)`.
 """
-frames: Union[List[List[PIL.Image.Image]], torch.Tensor, np.ndarray]
+frames: Union[torch.Tensor, np.ndarray, List[List[PIL.Image.Image]]]
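The corrected annotation matters when consuming the output: with the default PIL output type, `frames` is a nested list (one sub-list per prompt), while NumPy/tensor outputs are a single batched array. A minimal access sketch, assuming the AnimateDiff checkpoints used in the library's docs (they are not part of this commit):

```python
import torch
from diffusers import AnimateDiffPipeline, MotionAdapter
from diffusers.utils import export_to_gif

# Checkpoint names are assumptions for illustration, not taken from this commit.
adapter = MotionAdapter.from_pretrained("guoyww/animatediff-motion-adapter-v1-5-2")
pipe = AnimateDiffPipeline.from_pretrained(
    "SG161222/Realistic_Vision_V5.1_noVAE", motion_adapter=adapter, torch_dtype=torch.float16
).to("cuda")

output = pipe(prompt="a rocket launching into space", num_frames=16)

# With the default (PIL) output type, `output.frames` is the nested list described
# in the updated docstring: one list of `num_frames` PIL images per prompt.
first_video = output.frames[0]
export_to_gif(first_video, "rocket.gif")
```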
@@ -46,6 +46,7 @@ EXAMPLE_DOC_STRING = """
 ```py
 >>> import torch
 >>> from diffusers import I2VGenXLPipeline
+>>> from diffusers.utils import export_to_gif, load_image
 >>> pipeline = I2VGenXLPipeline.from_pretrained("ali-vilab/i2vgen-xl", torch_dtype=torch.float16, variant="fp16")
 >>> pipeline.enable_model_cpu_offload()
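This hunk only adds the two utility imports to the docstring example; the rest of the example is collapsed in this view. A sketch of how those imports are typically used with I2VGenXL (the conditioning image and prompt are placeholders reused from the PIA hunk in this same commit, not the values in the actual docstring):

```python
import torch
from diffusers import I2VGenXLPipeline
from diffusers.utils import export_to_gif, load_image

pipeline = I2VGenXLPipeline.from_pretrained("ali-vilab/i2vgen-xl", torch_dtype=torch.float16, variant="fp16")
pipeline.enable_model_cpu_offload()

# Placeholder conditioning image and prompt, borrowed from the PIA example above.
image = load_image(
    "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/pix2pix/cat_6.png?download=true"
)
prompt = "cat in a field"

generator = torch.Generator("cpu").manual_seed(0)
frames = pipeline(prompt=prompt, image=image, generator=generator).frames[0]
export_to_gif(frames, "i2v.gif")
```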
@@ -98,12 +99,13 @@ class I2VGenXLPipelineOutput(BaseOutput):
 Output class for image-to-video pipeline.
 Args:
-    frames (`List[np.ndarray]` or `torch.FloatTensor`)
-        List of denoised frames (essentially images) as NumPy arrays of shape `(height, width, num_channels)` or as
-        a `torch` tensor. The length of the list denotes the video length (the number of frames).
+    frames (`torch.Tensor`, `np.ndarray`, or `List[List[PIL.Image.Image]]`):
+        List of video outputs. It can be a nested list of length `batch_size`, with each sub-list containing
+        denoised PIL image sequences of length `num_frames`. It can also be a NumPy array or Torch tensor of
+        shape `(batch_size, num_frames, channels, height, width)`.
 """
-frames: Union[List[np.ndarray], torch.FloatTensor]
+frames: Union[torch.Tensor, np.ndarray, List[List[PIL.Image.Image]]]
 class I2VGenXLPipeline(DiffusionPipeline):
...
@@ -200,13 +200,13 @@ class PIAPipelineOutput(BaseOutput):
 Output class for PIAPipeline.
 Args:
-    frames (`torch.Tensor`, `np.ndarray`, or List[PIL.Image.Image]):
+    frames (`torch.Tensor`, `np.ndarray`, or List[List[PIL.Image.Image]]):
         Nested list of length `batch_size` with denoised PIL image sequences of length `num_frames`,
         NumPy array of shape `(batch_size, num_frames, channels, height, width,
         Torch tensor of shape `(batch_size, num_frames, channels, height, width)`.
 """
-frames: Union[torch.Tensor, np.ndarray, PIL.Image.Image]
+frames: Union[torch.Tensor, np.ndarray, List[List[PIL.Image.Image]]]
 class PIAPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin):
...
@@ -2,6 +2,7 @@ from dataclasses import dataclass
 from typing import List, Union
 import numpy as np
+import PIL
 import torch
 from ...utils import (
@@ -15,9 +16,10 @@ class TextToVideoSDPipelineOutput(BaseOutput):
 Output class for text-to-video pipelines.
 Args:
-    frames (`List[np.ndarray]` or `torch.FloatTensor`)
-        List of denoised frames (essentially images) as NumPy arrays of shape `(height, width, num_channels)` or as
-        a `torch` tensor. The length of the list denotes the video length (the number of frames).
+    frames (`torch.Tensor`, `np.ndarray`, or `List[List[PIL.Image.Image]]`):
+        List of video outputs. It can be a nested list of length `batch_size`, with each sub-list containing
+        denoised PIL image sequences of length `num_frames`. It can also be a NumPy array or Torch tensor of
+        shape `(batch_size, num_frames, channels, height, width)`.
 """
-frames: Union[List[np.ndarray], torch.FloatTensor]
+frames: Union[torch.Tensor, np.ndarray, List[List[PIL.Image.Image]]]