Unverified Commit f09ca909 authored by Dhruv Nair, committed by GitHub

Multiple small fixes to Video Pipeline docs (#6805)



* update

* update

* update

* Update src/diffusers/pipelines/i2vgen_xl/pipeline_i2vgen_xl.py
Co-authored-by: YiYi Xu <yixu310@gmail.com>

* update

* update

---------
Co-authored-by: YiYi Xu <yixu310@gmail.com>
parent a5fc62f8
@@ -31,7 +31,7 @@ Sample output with I2VGenXL:
 <table>
 <tr>
 <td><center>
-masterpiece, bestquality, sunset.
+library.
 <br>
 <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/i2vgen-xl-example.gif"
 alt="library"
...
@@ -70,7 +70,7 @@ Here are some sample outputs:
 <table>
 <tr>
 <td><center>
-masterpiece, bestquality, sunset.
+cat in a field.
 <br>
 <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/pia-default-output.gif"
 alt="cat in a field"
@@ -119,7 +119,7 @@ image = load_image(
 "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/pix2pix/cat_6.png?download=true"
 )
 image = image.resize((512, 512))
-prompt = "cat in a hat"
+prompt = "cat in a field"
 negative_prompt = "wrong white balance, dark, sketches,worst quality,low quality"
 generator = torch.Generator("cpu").manual_seed(0)
@@ -132,7 +132,7 @@ export_to_gif(frames, "pia-freeinit-animation.gif")
 <table>
 <tr>
 <td><center>
-masterpiece, bestquality, sunset.
+cat in a field.
 <br>
 <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/pia-freeinit-output-cat.gif"
 alt="cat in a field"
...
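The hunks above only change prompt strings and captions in the PIA + FreeInit example; the surrounding setup is collapsed in this view. A minimal sketch of how the full example fits together, assuming the checkpoint names and `enable_free_init` settings used elsewhere in the PIA docs (they are not part of this diff):

```python
import torch
from diffusers import MotionAdapter, PIAPipeline
from diffusers.utils import export_to_gif, load_image

# Checkpoint names below are assumptions taken from the PIA docs, not from this commit.
adapter = MotionAdapter.from_pretrained("openmmlab/PIA-condition-adapter")
pipe = PIAPipeline.from_pretrained(
    "SG161222/Realistic_Vision_V6.0_B1_noVAE", motion_adapter=adapter, torch_dtype=torch.float16
)
pipe.enable_model_cpu_offload()

# FreeInit re-samples the initial noise over a few extra iterations for smoother motion.
pipe.enable_free_init(method="butterworth", use_fast_sampling=True)

image = load_image(
    "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/pix2pix/cat_6.png?download=true"
)
image = image.resize((512, 512))
prompt = "cat in a field"
negative_prompt = "wrong white balance, dark, sketches, worst quality, low quality"

generator = torch.Generator("cpu").manual_seed(0)
output = pipe(image=image, prompt=prompt, negative_prompt=negative_prompt, generator=generator)
frames = output.frames[0]  # first (and only) video in the batch, a list of PIL images
export_to_gif(frames, "pia-freeinit-animation.gif")
```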
@@ -41,7 +41,7 @@ pipe = DiffusionPipeline.from_pretrained("damo-vilab/text-to-video-ms-1.7b", tor
 pipe = pipe.to("cuda")
 prompt = "Spiderman is surfing"
-video_frames = pipe(prompt).frames
+video_frames = pipe(prompt).frames[0]
 video_path = export_to_video(video_frames)
 video_path
 ```
@@ -64,7 +64,7 @@ pipe.enable_model_cpu_offload()
 pipe.enable_vae_slicing()
 prompt = "Darth Vader surfing a wave"
-video_frames = pipe(prompt, num_frames=64).frames
+video_frames = pipe(prompt, num_frames=64).frames[0]
 video_path = export_to_video(video_frames)
 video_path
 ```
@@ -83,7 +83,7 @@ pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
 pipe.enable_model_cpu_offload()
 prompt = "Spiderman is surfing"
-video_frames = pipe(prompt, num_inference_steps=25).frames
+video_frames = pipe(prompt, num_inference_steps=25).frames[0]
 video_path = export_to_video(video_frames)
 video_path
 ```
@@ -130,7 +130,7 @@ pipe.unet.enable_forward_chunking(chunk_size=1, dim=1)
 pipe.enable_vae_slicing()
 prompt = "Darth Vader surfing a wave"
-video_frames = pipe(prompt, num_frames=24).frames
+video_frames = pipe(prompt, num_frames=24).frames[0]
 video_path = export_to_video(video_frames)
 video_path
 ```
@@ -148,7 +148,7 @@ pipe.enable_vae_slicing()
 video = [Image.fromarray(frame).resize((1024, 576)) for frame in video_frames]
-video_frames = pipe(prompt, video=video, strength=0.6).frames
+video_frames = pipe(prompt, video=video, strength=0.6).frames[0]
 video_path = export_to_video(video_frames)
 video_path
 ```
...
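The recurring change in these hunks is the `[0]` appended to `.frames`: the pipeline output's `frames` field is batched (one video per prompt), so the first video must be selected before it is handed to `export_to_video`. Expanded from the first hunk above into a self-contained sketch:

```python
import torch
from diffusers import DiffusionPipeline
from diffusers.utils import export_to_video

pipe = DiffusionPipeline.from_pretrained(
    "damo-vilab/text-to-video-ms-1.7b", torch_dtype=torch.float16, variant="fp16"
)
pipe = pipe.to("cuda")

prompt = "Spiderman is surfing"
output = pipe(prompt)

# `output.frames` holds one video per prompt in the batch; index into it
# before exporting, otherwise `export_to_video` would receive the whole batch.
video_frames = output.frames[0]
video_path = export_to_video(video_frames)
print(video_path)
```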
@@ -14,9 +14,10 @@ class AnimateDiffPipelineOutput(BaseOutput):
 Output class for AnimateDiff pipelines.
 Args:
-    frames (`List[List[PIL.Image.Image]]` or `torch.Tensor` or `np.ndarray`):
-        List of PIL Images of length `batch_size` or torch.Tensor or np.ndarray of shape
-        `(batch_size, num_frames, height, width, num_channels)`.
+    frames (`torch.Tensor`, `np.ndarray`, or `List[List[PIL.Image.Image]]`):
+        List of video outputs. It can be a nested list of length `batch_size`, with each sub-list containing
+        denoised PIL image sequences of length `num_frames`. It can also be a NumPy array or Torch tensor of
+        shape `(batch_size, num_frames, channels, height, width)`.
 """
-frames: Union[List[List[PIL.Image.Image]], torch.Tensor, np.ndarray]
+frames: Union[torch.Tensor, np.ndarray, List[List[PIL.Image.Image]]]
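The corrected annotation matters when consuming the output: with the default PIL output type, `frames` is a nested list (one sub-list per prompt), while NumPy/tensor outputs are a single batched array. A minimal access sketch, assuming the AnimateDiff checkpoints used in the library's docs (they are not part of this commit):

```python
import torch
from diffusers import AnimateDiffPipeline, MotionAdapter
from diffusers.utils import export_to_gif

# Checkpoint names are assumptions for illustration, not taken from this commit.
adapter = MotionAdapter.from_pretrained("guoyww/animatediff-motion-adapter-v1-5-2")
pipe = AnimateDiffPipeline.from_pretrained(
    "SG161222/Realistic_Vision_V5.1_noVAE", motion_adapter=adapter, torch_dtype=torch.float16
).to("cuda")

output = pipe(prompt="a rocket launching into space", num_frames=16)

# With the default (PIL) output type, `output.frames` is the nested list described
# in the updated docstring: one list of `num_frames` PIL images per prompt.
first_video = output.frames[0]
export_to_gif(first_video, "rocket.gif")
```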
@@ -46,6 +46,7 @@ EXAMPLE_DOC_STRING = """
 ```py
 >>> import torch
 >>> from diffusers import I2VGenXLPipeline
+>>> from diffusers.utils import export_to_gif, load_image
 >>> pipeline = I2VGenXLPipeline.from_pretrained("ali-vilab/i2vgen-xl", torch_dtype=torch.float16, variant="fp16")
 >>> pipeline.enable_model_cpu_offload()
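This hunk only adds the two utility imports to the docstring example; the rest of the example is collapsed in this view. A sketch of how those imports are typically used with I2VGenXL (the conditioning image and prompt are placeholders reused from the PIA hunk in this same commit, not the values in the actual docstring):

```python
import torch
from diffusers import I2VGenXLPipeline
from diffusers.utils import export_to_gif, load_image

pipeline = I2VGenXLPipeline.from_pretrained("ali-vilab/i2vgen-xl", torch_dtype=torch.float16, variant="fp16")
pipeline.enable_model_cpu_offload()

# Placeholder conditioning image and prompt, borrowed from the PIA example above.
image = load_image(
    "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/pix2pix/cat_6.png?download=true"
)
prompt = "cat in a field"

generator = torch.Generator("cpu").manual_seed(0)
frames = pipeline(prompt=prompt, image=image, generator=generator).frames[0]
export_to_gif(frames, "i2v.gif")
```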
@@ -98,12 +99,13 @@ class I2VGenXLPipelineOutput(BaseOutput):
 Output class for image-to-video pipeline.
 Args:
-    frames (`List[np.ndarray]` or `torch.FloatTensor`)
-        List of denoised frames (essentially images) as NumPy arrays of shape `(height, width, num_channels)` or as
-        a `torch` tensor. The length of the list denotes the video length (the number of frames).
+    frames (`torch.Tensor`, `np.ndarray`, or `List[List[PIL.Image.Image]]`):
+        List of video outputs. It can be a nested list of length `batch_size`, with each sub-list containing
+        denoised PIL image sequences of length `num_frames`. It can also be a NumPy array or Torch tensor of
+        shape `(batch_size, num_frames, channels, height, width)`.
 """
-frames: Union[List[np.ndarray], torch.FloatTensor]
+frames: Union[torch.Tensor, np.ndarray, List[List[PIL.Image.Image]]]
 class I2VGenXLPipeline(DiffusionPipeline):
...
@@ -200,13 +200,13 @@ class PIAPipelineOutput(BaseOutput):
 Output class for PIAPipeline.
 Args:
-    frames (`torch.Tensor`, `np.ndarray`, or List[PIL.Image.Image]):
+    frames (`torch.Tensor`, `np.ndarray`, or List[List[PIL.Image.Image]]):
         Nested list of length `batch_size` with denoised PIL image sequences of length `num_frames`,
         NumPy array of shape `(batch_size, num_frames, channels, height, width,
         Torch tensor of shape `(batch_size, num_frames, channels, height, width)`.
 """
-frames: Union[torch.Tensor, np.ndarray, PIL.Image.Image]
+frames: Union[torch.Tensor, np.ndarray, List[List[PIL.Image.Image]]]
 class PIAPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin):
...
@@ -2,6 +2,7 @@ from dataclasses import dataclass
 from typing import List, Union
 import numpy as np
+import PIL
 import torch
 from ...utils import (
@@ -15,9 +16,10 @@ class TextToVideoSDPipelineOutput(BaseOutput):
 Output class for text-to-video pipelines.
 Args:
-    frames (`List[np.ndarray]` or `torch.FloatTensor`)
-        List of denoised frames (essentially images) as NumPy arrays of shape `(height, width, num_channels)` or as
-        a `torch` tensor. The length of the list denotes the video length (the number of frames).
+    frames (`torch.Tensor`, `np.ndarray`, or `List[List[PIL.Image.Image]]`):
+        List of video outputs. It can be a nested list of length `batch_size`, with each sub-list containing
+        denoised PIL image sequences of length `num_frames`. It can also be a NumPy array or Torch tensor of
+        shape `(batch_size, num_frames, channels, height, width)`.
 """
-frames: Union[List[np.ndarray], torch.FloatTensor]
+frames: Union[torch.Tensor, np.ndarray, List[List[PIL.Image.Image]]]