"git@developer.sourcefind.cn:change/sglang.git" did not exist on "e9a6203dee21cda91a8f5a113ea4171f3b221571"
Commit a8aea27f authored by Yang Yong(雍洋), committed by GitHub
Browse files

update seko (#262)



* update seko talk

* update

* Update configs

* update

---------
Co-authored-by: gushiqiao <975033167@qq.com>
parent aaf5f643
{
"infer_steps": 4,
"target_fps": 16,
"video_duration": 16,
"audio_sr": 16000,
"target_video_length": 81,
"target_height": 720,
"target_width": 1280,
"self_attn_1_type": "flash_attn3",
"cross_attn_1_type": "flash_attn3",
"cross_attn_2_type": "flash_attn3",
"seed": 42,
"sample_guide_scale": 1,
"sample_shift": 5,
"enable_cfg": false,
"cpu_offload": false,
"adaptive_resize": true,
"use_31_block": false
}
{
"infer_steps": 4,
"target_fps": 16,
"video_duration": 12,
"audio_sr": 16000,
"target_video_length": 81,
"target_height": 720,
"target_width": 1280,
"self_attn_1_type": "flash_attn3",
"cross_attn_1_type": "flash_attn3",
"cross_attn_2_type": "flash_attn3",
"seed": 42,
"sample_guide_scale": 1,
"sample_shift": 5,
"enable_cfg": false,
"adaptive_resize": true,
"use_31_block": false,
"cpu_offload": true,
"offload_granularity": "block",
"offload_ratio_val": 1,
"t5_cpu_offload": true,
"t5_offload_granularity": "block",
"use_tiling_vae": true,
"audio_encoder_cpu_offload": true,
"audio_adapter_cpu_offload": false
}
{
"infer_steps": 4,
"target_fps": 16,
"video_duration": 120,
"audio_sr": 16000,
"target_video_length": 81,
"target_height": 720,
"target_width": 1280,
"self_attn_1_type": "sage_attn2",
"cross_attn_1_type": "sage_attn2",
"cross_attn_2_type": "sage_attn2",
"seed": 42,
"sample_guide_scale": 1,
"sample_shift": 5,
"enable_cfg": false,
"adaptive_resize": true,
"use_31_block": false,
"cpu_offload": true,
"offload_granularity": "block",
"offload_ratio_val": 1,
"t5_cpu_offload": true,
"t5_offload_granularity": "model",
"t5_quantized": true,
"t5_quant_scheme": "fp8-q8f",
"audio_encoder_cpu_offload": false,
"audio_adapter_cpu_offload": false,
"adapter_quantized": true,
"adapter_quant_scheme": "fp8",
"vae_cpu_offload": false,
"use_tiling_vae": true,
"mm_config": {
"mm_type": "W-fp8-channel-sym-A-fp8-channel-sym-dynamic-Q8F"
}
}
{
"infer_steps": 4,
"target_fps": 16,
"video_duration": 120,
"audio_sr": 16000,
"target_video_length": 81,
"target_height": 720,
"target_width": 1280,
"self_attn_1_type": "sage_attn2",
"cross_attn_1_type": "sage_attn2",
"cross_attn_2_type": "sage_attn2",
"seed": 42,
"sample_guide_scale": 1,
"sample_shift": 5,
"enable_cfg": false,
"adaptive_resize": true,
"use_31_block": false,
"cpu_offload": true,
"offload_granularity": "block",
"offload_ratio_val": 1,
"t5_cpu_offload": true,
"t5_offload_granularity": "model",
"t5_quantized": true,
"t5_quant_scheme": "fp8",
"audio_encoder_cpu_offload": false,
"audio_adapter_cpu_offload": false,
"adapter_quantized": true,
"adapter_quant_scheme": "fp8",
"vae_cpu_offload": false,
"use_tiling_vae": true,
"mm_config": {
"mm_type": "W-fp8-channel-sym-A-fp8-channel-sym-dynamic-Vllm"
}
}
{
"infer_steps": 4,
"target_fps": 16,
"video_duration": 120,
"audio_sr": 16000,
"target_video_length": 81,
"target_height": 720,
"target_width": 1280,
"self_attn_1_type": "sage_attn2",
"cross_attn_1_type": "sage_attn2",
"cross_attn_2_type": "sage_attn2",
"seed": 42,
"sample_guide_scale": 1,
"sample_shift": 5,
"enable_cfg": false,
"adaptive_resize": true,
"use_31_block": false,
"cpu_offload": true,
"offload_granularity": "block",
"offload_ratio_val": 1,
"t5_cpu_offload": true,
"t5_offload_granularity": "model",
"t5_quantized": true,
"t5_quant_scheme": "fp8-q8f",
"audio_encoder_cpu_offload": false,
"audio_adapter_cpu_offload": false,
"adapter_quantized": true,
"adapter_quant_scheme": "fp8",
"vae_cpu_offload": false,
"use_tiling_vae": true,
"mm_config": {
"mm_type": "W-fp8-channel-sym-A-fp8-channel-sym-dynamic-Q8F"
}
}
{
"infer_steps": 4,
"target_fps": 16,
"video_duration": 120,
"audio_sr": 16000,
"target_video_length": 81,
"target_height": 720,
"target_width": 1280,
"self_attn_1_type": "sage_attn2",
"cross_attn_1_type": "sage_attn2",
"cross_attn_2_type": "sage_attn2",
"seed": 42,
"sample_guide_scale": 1,
"sample_shift": 5,
"enable_cfg": false,
"adaptive_resize": true,
"use_31_block": false,
"cpu_offload": true,
"offload_granularity": "block",
"offload_ratio_val": 1,
"t5_cpu_offload": true,
"t5_offload_granularity": "model",
"t5_quantized": true,
"t5_quant_scheme": "fp8",
"audio_encoder_cpu_offload": false,
"audio_adapter_cpu_offload": false,
"adapter_quantized": true,
"adapter_quant_scheme": "fp8",
"vae_cpu_offload": false,
"use_tiling_vae": true,
"mm_config": {
"mm_type": "W-fp8-channel-sym-A-fp8-channel-sym-dynamic-Vllm"
}
}
...@@ -52,7 +52,7 @@ class SageAttn2Weight(AttnWeightTemplate): ...@@ -52,7 +52,7 @@ class SageAttn2Weight(AttnWeightTemplate):
) )
x = torch.cat((x1, x2), dim=1) x = torch.cat((x1, x2), dim=1)
x = x.view(max_seqlen_q, -1) x = x.view(max_seqlen_q, -1)
elif model_cls in ["wan2.1", "wan2.1_distill", "wan2.1_causvid", "wan2.1_df", "wan2.1_audio", "wan2.2", "wan2.1_vace"]: elif model_cls in ["wan2.1", "wan2.1_distill", "wan2.1_causvid", "wan2.1_df", "seko_talk", "wan2.2", "wan2.1_vace"]:
x = sageattn( x = sageattn(
q.unsqueeze(0), q.unsqueeze(0),
k.unsqueeze(0), k.unsqueeze(0),
......
...@@ -48,7 +48,7 @@ def main(): ...@@ -48,7 +48,7 @@ def main():
"wan2.1_skyreels_v2_df", "wan2.1_skyreels_v2_df",
"wan2.1_vace", "wan2.1_vace",
"cogvideox", "cogvideox",
"wan2.1_audio", "seko_talk",
"wan2.2_moe", "wan2.2_moe",
"wan2.2", "wan2.2",
"wan2.2_moe_audio", "wan2.2_moe_audio",
......
...@@ -233,7 +233,7 @@ class AudioProcessor: ...@@ -233,7 +233,7 @@ class AudioProcessor:
return segments return segments
@RUNNER_REGISTER("wan2.1_audio") @RUNNER_REGISTER("seko_talk")
class WanAudioRunner(WanRunner): # type:ignore class WanAudioRunner(WanRunner): # type:ignore
def __init__(self, config): def __init__(self, config):
super().__init__(config) super().__init__(config)
......
#!/bin/bash #!/bin/bash
# set path and first lightx2v_path=/path/to/Lightx2v
lightx2v_path= model_path=/path/to/SekoTalk-Distill
model_path=
export CUDA_VISIBLE_DEVICES=0 export CUDA_VISIBLE_DEVICES=0
...@@ -16,12 +14,12 @@ export ENABLE_GRAPH_MODE=false ...@@ -16,12 +14,12 @@ export ENABLE_GRAPH_MODE=false
export SENSITIVE_LAYER_DTYPE=None export SENSITIVE_LAYER_DTYPE=None
python -m lightx2v.infer \ python -m lightx2v.infer \
--model_cls wan2.1_audio \ --model_cls seko_talk \
--task i2v \ --task i2v \
--model_path $model_path \ --model_path $model_path \
--config_json ${lightx2v_path}/configs/audio_driven/wan_i2v_audio_offload.json \ --config_json ${lightx2v_path}/configs/seko_talk/seko_talk_01_base.json \
--prompt "The video features a old lady is saying something and knitting a sweater." \ --prompt "The video features a old lady is saying something and knitting a sweater." \
--negative_prompt 色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走 \ --negative_prompt 色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走 \
--image_path ${lightx2v_path}/assets/inputs/audio/15.png \ --image_path ${lightx2v_path}/assets/inputs/audio/15.png \
--audio_path ${lightx2v_path}/assets/inputs/audio/15.wav \ --audio_path ${lightx2v_path}/assets/inputs/audio/15.wav \
--save_video_path ${lightx2v_path}/save_results/output_lightx2v_wan_i2v_audio.mp4 --save_video_path ${lightx2v_path}/save_results/output_lightx2v_seko_talk.mp4
#!/bin/bash #!/bin/bash
# set path and first lightx2v_path=/path/to/Lightx2v
lightx2v_path= model_path=/path/to/SekoTalk-Distill-fp8
model_path=
export CUDA_VISIBLE_DEVICES=0 export CUDA_VISIBLE_DEVICES=0
...@@ -15,12 +14,12 @@ export ENABLE_GRAPH_MODE=false ...@@ -15,12 +14,12 @@ export ENABLE_GRAPH_MODE=false
export SENSITIVE_LAYER_DTYPE=None export SENSITIVE_LAYER_DTYPE=None
python -m lightx2v.infer \ python -m lightx2v.infer \
--model_cls wan2.1_audio \ --model_cls seko_talk \
--task i2v \ --task i2v \
--model_path $model_path \ --model_path $model_path \
--config_json ${lightx2v_path}/configs/audio_driven/wan_i2v_audio.json \ --config_json ${lightx2v_path}/configs/seko_talk/seko_talk_02_fp8.json \
--prompt "The video features a old lady is saying something and knitting a sweater." \ --prompt "The video features a old lady is saying something and knitting a sweater." \
--negative_prompt 色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走 \ --negative_prompt 色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走 \
--image_path ${lightx2v_path}/assets/inputs/audio/15.png \ --image_path ${lightx2v_path}/assets/inputs/audio/15.png \
--audio_path ${lightx2v_path}/assets/inputs/audio/15.wav \ --audio_path ${lightx2v_path}/assets/inputs/audio/15.wav \
--save_video_path ${lightx2v_path}/save_results/output_lightx2v_wan_i2v_audio.mp4 --save_video_path ${lightx2v_path}/save_results/output_lightx2v_seko_talk.mp4
#!/bin/bash #!/bin/bash
lightx2v_path= lightx2v_path=/path/to/Lightx2v
model_path= model_path=/path/to/SekoTalk-Distill
export CUDA_VISIBLE_DEVICES=0,1,2,3 export CUDA_VISIBLE_DEVICES=0,1,2,3
...@@ -14,12 +14,12 @@ export ENABLE_GRAPH_MODE=false ...@@ -14,12 +14,12 @@ export ENABLE_GRAPH_MODE=false
export SENSITIVE_LAYER_DTYPE=None export SENSITIVE_LAYER_DTYPE=None
torchrun --nproc-per-node 4 -m lightx2v.infer \ torchrun --nproc-per-node 4 -m lightx2v.infer \
--model_cls wan2.1_audio \ --model_cls seko_talk \
--task i2v \ --task i2v \
--model_path $model_path \ --model_path $model_path \
--config_json ${lightx2v_path}/configs/audio_driven/wan_i2v_audio_dist.json \ --config_json ${lightx2v_path}/configs/seko_talk/seko_talk_03_dist.json \
--prompt "The video features a old lady is saying something and knitting a sweater." \ --prompt "The video features a old lady is saying something and knitting a sweater." \
--negative_prompt 色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走 \ --negative_prompt 色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走 \
--image_path ${lightx2v_path}/assets/inputs/audio/15.png \ --image_path ${lightx2v_path}/assets/inputs/audio/15.png \
--audio_path ${lightx2v_path}/assets/inputs/audio/15.wav \ --audio_path ${lightx2v_path}/assets/inputs/audio/15.wav \
--save_video_path ${lightx2v_path}/save_results/output_lightx2v_wan_i2v_audio.mp4 --save_video_path ${lightx2v_path}/save_results/output_lightx2v_seko_talk.mp4
#!/bin/bash #!/bin/bash
lightx2v_path= lightx2v_path=/path/to/Lightx2v
model_path= model_path=/path/to/SekoTalk-Distill-fp8
export CUDA_VISIBLE_DEVICES=0,1,2,3 export CUDA_VISIBLE_DEVICES=0,1,2,3
...@@ -14,12 +14,12 @@ export ENABLE_GRAPH_MODE=false ...@@ -14,12 +14,12 @@ export ENABLE_GRAPH_MODE=false
export SENSITIVE_LAYER_DTYPE=None export SENSITIVE_LAYER_DTYPE=None
torchrun --nproc-per-node 4 -m lightx2v.infer \ torchrun --nproc-per-node 4 -m lightx2v.infer \
--model_cls wan2.1_audio \ --model_cls seko_talk \
--task i2v \ --task i2v \
--model_path $model_path \ --model_path $model_path \
--config_json ${lightx2v_path}/configs/audio_driven/wan_i2v_audio_quant_dist.json \ --config_json ${lightx2v_path}/configs/seko_talk/seko_talk_04_fp8_dist.json \
--prompt "The video features a old lady is saying something and knitting a sweater." \ --prompt "The video features a old lady is saying something and knitting a sweater." \
--negative_prompt 色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走 \ --negative_prompt 色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走 \
--image_path ${lightx2v_path}/assets/inputs/audio/15.png \ --image_path ${lightx2v_path}/assets/inputs/audio/15.png \
--audio_path ${lightx2v_path}/assets/inputs/audio/15.wav \ --audio_path ${lightx2v_path}/assets/inputs/audio/15.wav \
--save_video_path ${lightx2v_path}/save_results/output_lightx2v_wan_i2v_audio.mp4 --save_video_path ${lightx2v_path}/save_results/output_lightx2v_seko_talk.mp4
#!/bin/bash
# Launches lightx2v.infer with model class "seko_talk" (task i2v) on a single
# GPU, using the seko_talk_05_offload_fp8_4090.json config.
#
# Edit the two paths below before running:
#   lightx2v_path - root of the Lightx2v checkout (configs/, scripts/, assets/)
#   model_path    - directory holding the SekoTalk-Distill-fp8 weights
lightx2v_path=/path/to/Lightx2v
model_path=/path/to/SekoTalk-Distill-fp8

# Restrict the run to GPU 0.
export CUDA_VISIBLE_DEVICES=0

# set environment variables
# All path expansions are quoted so that directories containing spaces or
# glob characters are passed through as a single word.
source "${lightx2v_path}/scripts/base/base.sh"

export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
export ENABLE_GRAPH_MODE=false
export SENSITIVE_LAYER_DTYPE=None

# The prompt strings are passed verbatim; the negative prompt is quoted so it
# always reaches the program as exactly one argument.
python -m lightx2v.infer \
    --model_cls seko_talk \
    --task i2v \
    --model_path "${model_path}" \
    --config_json "${lightx2v_path}/configs/seko_talk/seko_talk_05_offload_fp8_4090.json" \
    --prompt "The video features a old lady is saying something and knitting a sweater." \
    --negative_prompt "色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" \
    --image_path "${lightx2v_path}/assets/inputs/audio/15.png" \
    --audio_path "${lightx2v_path}/assets/inputs/audio/15.wav" \
    --save_video_path "${lightx2v_path}/save_results/output_lightx2v_seko_talk.mp4"
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment