Commit 1ff745e4 authored by helloyongyang's avatar helloyongyang
Browse files

update wan22 ti2v parallel

parent 74f2ec41
{
"infer_steps": 50,
"target_video_length": 121,
"text_len": 512,
"target_height": 704,
"target_width": 1280,
"num_channels_latents": 48,
"vae_stride": [4, 16, 16],
"self_attn_1_type": "flash_attn3",
"cross_attn_1_type": "flash_attn3",
"cross_attn_2_type": "flash_attn3",
"seed": 42,
"sample_guide_scale": 5.0,
"sample_shift": 5.0,
"enable_cfg": true,
"cpu_offload": false,
"offload_granularity": "model",
"fps": 24,
"use_image_encoder": false,
"parallel": {
"cfg_p_size": 2
}
}
{
"infer_steps": 50,
"target_video_length": 121,
"text_len": 512,
"target_height": 704,
"target_width": 1280,
"num_channels_latents": 48,
"vae_stride": [4, 16, 16],
"self_attn_1_type": "flash_attn3",
"cross_attn_1_type": "flash_attn3",
"cross_attn_2_type": "flash_attn3",
"seed": 42,
"sample_guide_scale": 5.0,
"sample_shift": 5.0,
"enable_cfg": true,
"cpu_offload": false,
"offload_granularity": "model",
"fps": 24,
"use_image_encoder": false,
"parallel": {
"seq_p_size": 4,
"seq_p_attn_type": "ulysses",
"cfg_p_size": 2
}
}
{
"infer_steps": 50,
"target_video_length": 121,
"text_len": 512,
"target_height": 704,
"target_width": 1280,
"num_channels_latents": 48,
"vae_stride": [4, 16, 16],
"self_attn_1_type": "flash_attn3",
"cross_attn_1_type": "flash_attn3",
"cross_attn_2_type": "flash_attn3",
"seed": 42,
"sample_guide_scale": 5.0,
"sample_shift": 5.0,
"enable_cfg": true,
"cpu_offload": false,
"offload_granularity": "model",
"fps": 24,
"use_image_encoder": false,
"parallel": {
"seq_p_size": 4,
"seq_p_attn_type": "ulysses"
}
}
{
"infer_steps": 50,
"target_video_length": 121,
"text_len": 512,
"target_height": 704,
"target_width": 1280,
"num_channels_latents": 48,
"vae_stride": [4, 16, 16],
"self_attn_1_type": "flash_attn3",
"cross_attn_1_type": "flash_attn3",
"cross_attn_2_type": "flash_attn3",
"seed": 42,
"sample_guide_scale": 5.0,
"sample_shift": 5.0,
"enable_cfg": true,
"cpu_offload": false,
"offload_granularity": "model",
"fps": 24,
"parallel": {
"cfg_p_size": 2
}
}
{
"infer_steps": 50,
"target_video_length": 121,
"text_len": 512,
"target_height": 704,
"target_width": 1280,
"num_channels_latents": 48,
"vae_stride": [4, 16, 16],
"self_attn_1_type": "flash_attn3",
"cross_attn_1_type": "flash_attn3",
"cross_attn_2_type": "flash_attn3",
"seed": 42,
"sample_guide_scale": 5.0,
"sample_shift": 5.0,
"enable_cfg": true,
"cpu_offload": false,
"offload_granularity": "model",
"fps": 24,
"parallel": {
"seq_p_size": 4,
"seq_p_attn_type": "ulysses",
"cfg_p_size": 2
}
}
{
"infer_steps": 50,
"target_video_length": 121,
"text_len": 512,
"target_height": 704,
"target_width": 1280,
"num_channels_latents": 48,
"vae_stride": [4, 16, 16],
"self_attn_1_type": "flash_attn3",
"cross_attn_1_type": "flash_attn3",
"cross_attn_2_type": "flash_attn3",
"seed": 42,
"sample_guide_scale": 5.0,
"sample_shift": 5.0,
"enable_cfg": true,
"cpu_offload": false,
"offload_granularity": "model",
"fps": 24,
"parallel": {
"seq_p_size": 4,
"seq_p_attn_type": "ulysses"
}
}
......@@ -12,7 +12,7 @@ class WanTransformerDistInfer(WanTransformerInfer):
self.seq_p_group = self.config["device_mesh"].get_group(mesh_dim="seq_p")
def infer(self, weights, grid_sizes, embed, x, embed0, seq_lens, freqs, context, audio_dit_blocks=None):
x = self.dist_pre_process(x)
x, embed0 = self.dist_pre_process(x, embed0)
x = super().infer(weights, grid_sizes, embed, x, embed0, seq_lens, freqs, context, audio_dit_blocks)
x = self.dist_post_process(x)
return x
......@@ -24,7 +24,7 @@ class WanTransformerDistInfer(WanTransformerInfer):
freqs_i = self.compute_freqs_dist(q.size(0), q.size(2) // 2, grid_sizes, freqs)
return freqs_i
def dist_pre_process(self, x):
def dist_pre_process(self, x, embed0):
world_size = dist.get_world_size(self.seq_p_group)
cur_rank = dist.get_rank(self.seq_p_group)
......@@ -35,7 +35,9 @@ class WanTransformerDistInfer(WanTransformerInfer):
x = F.pad(x, (0, 0, 0, padding_size)) # (后维度填充, 前维度填充)
x = torch.chunk(x, world_size, dim=0)[cur_rank]
return x
if self.config["model_cls"].startswith("wan2.2"):
embed0 = torch.chunk(embed0, world_size, dim=0)[cur_rank]
return x, embed0
def dist_post_process(self, x):
world_size = dist.get_world_size(self.seq_p_group)
......
......@@ -73,6 +73,10 @@ class WanPreInfer:
x = x.flatten(2).transpose(1, 2).contiguous()
seq_lens = torch.tensor(x.size(1), dtype=torch.long).cuda().unsqueeze(0)
# wan2.2_moe会对t做扩展,我们发现这里做不做影响不大,而且做了拓展会增加耗时,目前忠实原作代码,后续可以考虑去掉
if self.config["model_cls"] == "wan2.2_moe":
t = t.expand(seq_lens[0])
embed = sinusoidal_embedding_1d(self.freq_dim, t.flatten())
if self.enable_dynamic_cfg:
s = torch.tensor([self.cfg_scale], dtype=torch.float32).to(x.device)
......
#!/bin/bash
# Launch wan2.2 ti2v (i2v config) inference with CFG parallelism (cfg_p_size=2) on 2 GPUs.
# Fill in these two paths before running.
lightx2v_path=
model_path=

export CUDA_VISIBLE_DEVICES=0,1

# set environment variables
# Quoted so paths containing spaces do not word-split.
source "${lightx2v_path}/scripts/base/base.sh"

# NOTE(review): config name says i2v but --task is t2v — confirm this is intended for the ti2v model.
torchrun --nproc_per_node=2 -m lightx2v.infer \
    --model_cls wan2.2 \
    --task t2v \
    --model_path "${model_path}" \
    --config_json "${lightx2v_path}/configs/dist_infer/wan22_ti2v_i2v_cfg.json" \
    --prompt "Two anthropomorphic cats in comfy boxing gear and bright gloves fight intensely on a spotlighted stage" \
    --negative_prompt "色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" \
    --save_video_path "${lightx2v_path}/save_results/output_lightx2v_wan22_ti2v_i2v_parallel_cfg.mp4"
#!/bin/bash
# Launch wan2.2 ti2v (i2v config) inference combining Ulysses sequence parallelism
# (seq_p_size=4) with CFG parallelism (cfg_p_size=2) on 8 GPUs.
# Fill in these two paths before running.
lightx2v_path=
model_path=

export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7

# set environment variables
# Quoted so paths containing spaces do not word-split.
source "${lightx2v_path}/scripts/base/base.sh"

# NOTE(review): config name says i2v but --task is t2v — confirm this is intended for the ti2v model.
torchrun --nproc_per_node=8 -m lightx2v.infer \
    --model_cls wan2.2 \
    --task t2v \
    --model_path "${model_path}" \
    --config_json "${lightx2v_path}/configs/dist_infer/wan22_ti2v_i2v_cfg_ulysses.json" \
    --prompt "Two anthropomorphic cats in comfy boxing gear and bright gloves fight intensely on a spotlighted stage" \
    --negative_prompt "色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" \
    --save_video_path "${lightx2v_path}/save_results/output_lightx2v_wan22_ti2v_i2v_parallel_cfg_ulysses.mp4"
#!/bin/bash
# Launch wan2.2 ti2v (i2v config) inference with Ulysses sequence parallelism
# (seq_p_size=4) on 4 GPUs.
# Fill in these two paths before running.
lightx2v_path=
model_path=

export CUDA_VISIBLE_DEVICES=0,1,2,3

# set environment variables
# Quoted so paths containing spaces do not word-split.
source "${lightx2v_path}/scripts/base/base.sh"

# NOTE(review): config name says i2v but --task is t2v — confirm this is intended for the ti2v model.
torchrun --nproc_per_node=4 -m lightx2v.infer \
    --model_cls wan2.2 \
    --task t2v \
    --model_path "${model_path}" \
    --config_json "${lightx2v_path}/configs/dist_infer/wan22_ti2v_i2v_ulysses.json" \
    --prompt "Two anthropomorphic cats in comfy boxing gear and bright gloves fight intensely on a spotlighted stage" \
    --negative_prompt "色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" \
    --save_video_path "${lightx2v_path}/save_results/output_lightx2v_wan22_ti2v_i2v_ulysses.mp4"
#!/bin/bash
# Launch wan2.2 ti2v t2v inference with CFG parallelism (cfg_p_size=2) on 2 GPUs.
# Fill in these two paths before running.
lightx2v_path=
model_path=

export CUDA_VISIBLE_DEVICES=0,1

# set environment variables
# Quoted so paths containing spaces do not word-split.
source "${lightx2v_path}/scripts/base/base.sh"

torchrun --nproc_per_node=2 -m lightx2v.infer \
    --model_cls wan2.2 \
    --task t2v \
    --model_path "${model_path}" \
    --config_json "${lightx2v_path}/configs/dist_infer/wan22_ti2v_t2v_cfg.json" \
    --prompt "Two anthropomorphic cats in comfy boxing gear and bright gloves fight intensely on a spotlighted stage" \
    --negative_prompt "色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" \
    --save_video_path "${lightx2v_path}/save_results/output_lightx2v_wan22_ti2v_t2v_parallel_cfg.mp4"
#!/bin/bash
# Launch wan2.2 ti2v t2v inference combining Ulysses sequence parallelism
# (seq_p_size=4) with CFG parallelism (cfg_p_size=2) on 8 GPUs.
# Fill in these two paths before running.
lightx2v_path=
model_path=

export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7

# set environment variables
# Quoted so paths containing spaces do not word-split.
source "${lightx2v_path}/scripts/base/base.sh"

torchrun --nproc_per_node=8 -m lightx2v.infer \
    --model_cls wan2.2 \
    --task t2v \
    --model_path "${model_path}" \
    --config_json "${lightx2v_path}/configs/dist_infer/wan22_ti2v_t2v_cfg_ulysses.json" \
    --prompt "Two anthropomorphic cats in comfy boxing gear and bright gloves fight intensely on a spotlighted stage" \
    --negative_prompt "色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" \
    --save_video_path "${lightx2v_path}/save_results/output_lightx2v_wan22_ti2v_t2v_parallel_cfg_ulysses.mp4"
#!/bin/bash
# Launch wan2.2 ti2v t2v inference with Ulysses sequence parallelism
# (seq_p_size=4) on 4 GPUs.
# Fill in these two paths before running.
lightx2v_path=
model_path=

export CUDA_VISIBLE_DEVICES=0,1,2,3

# set environment variables
# Quoted so paths containing spaces do not word-split.
source "${lightx2v_path}/scripts/base/base.sh"

torchrun --nproc_per_node=4 -m lightx2v.infer \
    --model_cls wan2.2 \
    --task t2v \
    --model_path "${model_path}" \
    --config_json "${lightx2v_path}/configs/dist_infer/wan22_ti2v_t2v_ulysses.json" \
    --prompt "Two anthropomorphic cats in comfy boxing gear and bright gloves fight intensely on a spotlighted stage" \
    --negative_prompt "色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" \
    --save_video_path "${lightx2v_path}/save_results/output_lightx2v_wan22_ti2v_t2v_parallel_ulysses.mp4"
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment