Unverified Commit a9e4883b authored by dg845, committed by GitHub

Update Wan Animate Docs (#12658)

* Update the Wan Animate docs to reflect the most recent code

* Further explain input preprocessing and link to original Wan Animate preprocessing scripts
parent 63dd6017
The model can be loaded with the following code snippet.

```python
import torch
from diffusers import WanAnimateTransformer3DModel

transformer = WanAnimateTransformer3DModel.from_pretrained("Wan-AI/Wan2.2-Animate-14B-Diffusers", subfolder="transformer", torch_dtype=torch.bfloat16)
```
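When the transformer is loaded on its own like this (for example, to control its precision separately), it can be passed directly to `WanAnimatePipeline` using the usual Diffusers pattern of overriding a component in `from_pretrained`. A minimal sketch, with the remaining components assumed to come from the same repository:

```python
import torch
from diffusers import WanAnimatePipeline, WanAnimateTransformer3DModel

transformer = WanAnimateTransformer3DModel.from_pretrained(
    "Wan-AI/Wan2.2-Animate-14B-Diffusers", subfolder="transformer", torch_dtype=torch.bfloat16
)

# The pre-loaded transformer replaces the default one; the other components
# are loaded from the same repository.
pipe = WanAnimatePipeline.from_pretrained(
    "Wan-AI/Wan2.2-Animate-14B-Diffusers", transformer=transformer, torch_dtype=torch.bfloat16
)
pipe.to("cuda")
```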
## WanAnimateTransformer3DModel
For replacement mode, you additionally need:
- **Mask video**: A mask indicating where to generate content (white) vs. preserve original (black); see the sketch after this note for the expected convention

> [!NOTE]
> Raw videos should not be used for inputs such as `pose_video`; the pipeline expects these inputs to have been preprocessed to extract the relevant information. Preprocessing scripts to prepare these inputs are available in the [original Wan-Animate repository](https://github.com/Wan-Video/Wan2.2?tab=readme-ov-file#1-preprocessing). Integration of these preprocessing steps into Diffusers is planned for a future release.
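The white/black convention for the mask video can be illustrated with a small sketch. It builds a dummy mask as a list of PIL frames (the same format `load_video` returns); the resolution, frame count, fixed rectangular region, and the list-of-frames format are placeholder assumptions, and in practice the mask should be produced by the preprocessing scripts linked above so it tracks the character across frames.

```python
import numpy as np
from PIL import Image

height, width, num_frames = 480, 832, 77  # placeholder values

mask_frames = []
for _ in range(num_frames):
    # Black everywhere: preserve the original video content.
    mask = np.zeros((height, width, 3), dtype=np.uint8)
    # White region: generate new content here (e.g. where the character is).
    mask[100:400, 250:600] = 255
    mask_frames.append(Image.fromarray(mask))
```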
The example below demonstrates how to use the Wan-Animate pipeline:
```python
import numpy as np
import torch
from diffusers import AutoencoderKLWan, WanAnimatePipeline
from diffusers.utils import export_to_video, load_image, load_video

model_id = "Wan-AI/Wan2.2-Animate-14B-Diffusers"
vae = AutoencoderKLWan.from_pretrained(model_id, subfolder="vae", torch_dtype=torch.float32)
pipe = WanAnimatePipeline.from_pretrained(model_id, vae=vae, torch_dtype=torch.bfloat16)
pipe.to("cuda")

# Load character image and preprocessed videos
# ... (omitted here: input loading, prompt, and resolution setup)

output = pipe(
    # ... (omitted here: character image and preprocessed video inputs, prompt)
    negative_prompt=negative_prompt,
    height=height,
    width=width,
    segment_frame_length=77,
    guidance_scale=1.0,
    mode="animate",  # Animation mode (default)
).frames[0]

export_to_video(output, "animated_character.mp4", fps=30)
```
</hfoption>
```python
import numpy as np
import torch
from diffusers import AutoencoderKLWan, WanAnimatePipeline
from diffusers.utils import export_to_video, load_image, load_video

model_id = "Wan-AI/Wan2.2-Animate-14B-Diffusers"
vae = AutoencoderKLWan.from_pretrained(model_id, subfolder="vae", torch_dtype=torch.float32)
pipe = WanAnimatePipeline.from_pretrained(model_id, vae=vae, torch_dtype=torch.bfloat16)
pipe.to("cuda")

# Load all required inputs for replacement mode
# ... (omitted here: input loading, prompt, and resolution setup)

output = pipe(
    # ... (omitted here: background video, mask video, and other inputs)
    negative_prompt=negative_prompt,
    height=height,
    width=width,
    segment_frame_length=77,
    guidance_scale=1.0,
    mode="replace",  # Replacement mode
).frames[0]

export_to_video(output, "character_replaced.mp4", fps=30)
```
</hfoption>
```python
import numpy as np
import torch
from diffusers import AutoencoderKLWan, WanAnimatePipeline
from diffusers.utils import export_to_video, load_image, load_video

model_id = "Wan-AI/Wan2.2-Animate-14B-Diffusers"
vae = AutoencoderKLWan.from_pretrained(model_id, subfolder="vae", torch_dtype=torch.float32)
pipe = WanAnimatePipeline.from_pretrained(model_id, vae=vae, torch_dtype=torch.bfloat16)
pipe.to("cuda")

image = load_image("path/to/character.jpg")
# ... (omitted here: preprocessed video loading, prompts, resolution, and the callback definition)

output = pipe(
    # ... (omitted here: character image, preprocessed video inputs, prompt)
    negative_prompt=negative_prompt,
    height=height,
    width=width,
    segment_frame_length=77,
    num_inference_steps=50,
    guidance_scale=5.0,
    prev_segment_conditioning_frames=5,  # Use 5 frames for temporal guidance (1 or 5 recommended)
    callback_on_step_end=callback_fn,
    callback_on_step_end_tensor_inputs=["latents"],
).frames[0]

export_to_video(output, "animated_advanced.mp4", fps=30)
```
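The `callback_on_step_end` argument above expects a callable following the standard Diffusers callback signature; its definition is omitted from the excerpt above. A minimal sketch (the logging logic is only an example) could look like this:

```python
def callback_fn(pipe, step_index, timestep, callback_kwargs):
    # Tensors listed in `callback_on_step_end_tensor_inputs` (here: "latents")
    # are made available through callback_kwargs.
    latents = callback_kwargs["latents"]
    if step_index % 10 == 0:
        print(f"step {step_index} | timestep {timestep} | latent std {latents.std().item():.4f}")
    # The returned dict is passed back into the denoising loop.
    return callback_kwargs
```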
</hfoption>
#### Key Parameters

- **mode**: Choose between `"animate"` (default) or `"replace"`
- **prev_segment_conditioning_frames**: Number of frames from the previous segment used for temporal guidance (1 or 5 recommended). Using 5 provides better temporal consistency but requires more memory
- **guidance_scale**: Controls how closely the output follows the text prompt. Higher values (5-7) produce results more aligned with the prompt. For Wan-Animate, CFG is disabled by default (`guidance_scale=1.0`) but can be enabled to support negative prompts and finer control over facial expressions (note that CFG only targets the text prompt and face conditioning); see the sketch after this list for enabling it
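For instance, enabling CFG is a matter of raising `guidance_scale` above 1.0 and supplying a `negative_prompt`. The sketch below only shows the relevant arguments; the prompt text and the `guidance_scale=5.0` value are illustrative, and the other required inputs (character image, preprocessed videos, resolution) are assumed to be prepared as in the examples above.

```python
# Sketch: enable classifier-free guidance for finer control over the text
# prompt and facial expressions. Values shown are illustrative.
output = pipe(
    # ... character image and preprocessed video inputs, as in the examples above
    prompt="A person dancing energetically in a sunlit studio",
    negative_prompt="blurry, distorted face, extra limbs, artifacts",
    height=height,
    width=width,
    segment_frame_length=77,
    guidance_scale=5.0,  # > 1.0 enables CFG; 5-7 follows the prompt more closely
    mode="animate",
).frames[0]
```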
## Notes