Commit 3652b385 authored by Yang Yong (雍洋), committed by GitHub

update seko model filename & update seko scripts (#257)



* update seko file names

* update

* [Update] update seko-models configs

* Add seko scripts

---------
Co-authored-by: gushiqiao <975033167@qq.com>
parent 84e756e9
 {
     "infer_steps": 4,
     "target_fps": 16,
     "video_duration": 12,
     "audio_sr": 16000,
     "target_video_length": 81,
     "target_height": 720,
     "target_width": 1280,
     "self_attn_1_type": "flash_attn3",
     "cross_attn_1_type": "flash_attn3",
     "cross_attn_2_type": "flash_attn3",
     "seed": 42,
-    "sample_guide_scale": 1,
+    "sample_guide_scale": 1.0,
     "sample_shift": 5,
     "enable_cfg": false,
     "cpu_offload": false,
-    "adaptive_resize": true,
-    "use_31_block": false
+    "use_31_block": false,
+    "adaptive_resize": true
 }
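Note (not part of the commit): a minimal sketch of loading and sanity-checking a config like the one above before inference. The required-key list and the helper name are assumptions made for illustration; the path is the one the single-GPU script below passes via --config_json.

import json

# Keys the audio-driven i2v configs in this commit appear to carry; this list is
# an assumption for illustration, not a schema defined by the repository.
REQUIRED_KEYS = {
    "infer_steps", "target_fps", "video_duration", "audio_sr",
    "target_video_length", "target_height", "target_width",
    "seed", "sample_guide_scale", "sample_shift", "enable_cfg",
}

def check_config(path: str) -> dict:
    with open(path) as f:
        cfg = json.load(f)
    missing = REQUIRED_KEYS - cfg.keys()
    if missing:
        raise KeyError(f"{path} is missing keys: {sorted(missing)}")
    return cfg

cfg = check_config("configs/audio_driven/wan_i2v_audio.json")
print(cfg["target_width"], cfg["target_height"], cfg["infer_steps"])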
{
    "infer_steps": 4,
    "target_fps": 16,
    "video_duration": 120,
    "audio_sr": 16000,
    "target_video_length": 81,
    "target_height": 720,
    "target_width": 1280,
    "self_attn_1_type": "sage_attn2",
    "cross_attn_1_type": "sage_attn2",
    "cross_attn_2_type": "sage_attn2",
    "seed": 42,
    "sample_guide_scale": 1,
    "sample_shift": 5,
    "enable_cfg": false,
    "adaptive_resize": true,
    "use_31_block": false,
    "cpu_offload": true,
    "offload_granularity": "block",
    "offload_ratio_val": 1,
    "t5_cpu_offload": true,
    "t5_offload_granularity": "model",
    "t5_quantized": true,
    "t5_quant_scheme": "fp8",
    "audio_encoder_cpu_offload": false,
    "audio_adapter_cpu_offload": false,
    "adapter_quantized": true,
    "adapter_quant_scheme": "fp8",
    "vae_cpu_offload": false,
    "use_tiling_vae": true,
    "mm_config": {
        "mm_type": "W-fp8-channel-sym-A-fp8-channel-sym-dynamic-Vllm"
    }
}
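The new config above combines CPU offload with fp8 quantization of the T5 encoder, the audio adapter, and the DiT linear layers (via mm_config). Below is a hedged sketch that only reports those switches from such a JSON file; the helper name, its output format, and the placeholder path are made up for illustration.

import json

def summarize_memory_options(path: str) -> None:
    # Print the offload/quantization switches used by the audio-driven configs.
    with open(path) as f:
        cfg = json.load(f)
    print("dit cpu_offload:    ", cfg.get("cpu_offload", False),
          "granularity:", cfg.get("offload_granularity", "-"))
    print("t5 cpu_offload:     ", cfg.get("t5_cpu_offload", False),
          "quant:", cfg.get("t5_quant_scheme") if cfg.get("t5_quantized") else "none")
    print("audio adapter quant:", cfg.get("adapter_quant_scheme") if cfg.get("adapter_quantized") else "none")
    print("vae cpu_offload:    ", cfg.get("vae_cpu_offload", False),
          "tiling:", cfg.get("use_tiling_vae", False))
    print("linear mm_type:     ", cfg.get("mm_config", {}).get("mm_type", "-"))

summarize_memory_options("path/to/config.json")  # placeholder path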
 {
     "infer_steps": 4,
     "target_fps": 16,
-    "video_duration": 16,
+    "video_duration": 12,
     "audio_sr": 16000,
     "target_video_length": 81,
     "target_height": 720,
@@ -10,15 +10,17 @@
     "cross_attn_1_type": "sage_attn2",
     "cross_attn_2_type": "sage_attn2",
     "seed": 42,
-    "sample_guide_scale": 1,
+    "sample_guide_scale": 1.0,
     "sample_shift": 5,
     "enable_cfg": false,
     "cpu_offload": false,
     "use_31_block": false,
-    "dit_quantized_ckpt": "/path/to/Wan2.1-R2V721-Audio-14B-720P/fp8",
+    "adaptive_resize": true,
     "mm_config": {
         "mm_type": "W-fp8-channel-sym-A-fp8-channel-sym-dynamic-Vllm"
     },
     "adapter_quantized": true,
-    "adapter_quant_scheme": "fp8"
+    "adapter_quant_scheme": "fp8",
+    "t5_quantized": true,
+    "t5_quant_scheme": "fp8"
 }
{
    "infer_steps": 4,
    "target_fps": 16,
    "video_duration": 12,
    "audio_sr": 16000,
    "target_video_length": 81,
    "target_height": 720,
    "target_width": 1280,
    "self_attn_1_type": "sage_attn2",
    "cross_attn_1_type": "sage_attn2",
    "cross_attn_2_type": "sage_attn2",
    "seed": 42,
    "sample_guide_scale": 1.0,
    "sample_shift": 5,
    "enable_cfg": false,
    "cpu_offload": false,
    "use_31_block": false,
    "adaptive_resize": true,
    "parallel": {
        "seq_p_size": 4,
        "seq_p_attn_type": "ulysses"
    },
    "mm_config": {
        "mm_type": "W-fp8-channel-sym-A-fp8-channel-sym-dynamic-Vllm"
    },
    "adapter_quantized": true,
    "adapter_quant_scheme": "fp8",
    "t5_quantized": true,
    "t5_quant_scheme": "fp8"
}
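A minimal sketch (not part of the commit): when a config carries a "parallel" block like the one above, the ulysses sequence-parallel degree has to be compatible with the number of processes torchrun starts (the distributed scripts below use --nproc-per-node 4). The helper below is hypothetical and only checks that relationship via the WORLD_SIZE variable torchrun sets.

import json
import os

def check_parallel_degree(config_path: str) -> None:
    with open(config_path) as f:
        cfg = json.load(f)
    seq_p = cfg.get("parallel", {}).get("seq_p_size", 1)
    world = int(os.environ.get("WORLD_SIZE", "1"))  # set by torchrun
    if world % seq_p != 0:
        raise ValueError(f"seq_p_size={seq_p} does not divide WORLD_SIZE={world}")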
@@ -534,7 +534,7 @@ class WanAudioRunner(WanRunner):  # type:ignore
         return base_model

     def load_audio_encoder(self):
-        audio_encoder_path = os.path.join(self.config["model_path"], "audio_encoder")
+        audio_encoder_path = os.path.join(self.config["model_path"], "TencentGameMate-chinese-hubert-large")
         audio_encoder_offload = self.config.get("audio_encoder_cpu_offload", self.config.get("cpu_offload", False))
         model = SekoAudioEncoderModel(audio_encoder_path, self.config["audio_sr"], audio_encoder_offload)
         return model
@@ -561,13 +561,13 @@ class WanAudioRunner(WanRunner):  # type:ignore
         audio_adapter.to(device)
         if self.config.get("adapter_quantized", False):
             if self.config.get("adapter_quant_scheme", None) in ["fp8", "fp8-q8f"]:
-                model_name = "audio_adapter_fp8.safetensors"
+                model_name = "audio_adapter_model_fp8.safetensors"
             elif self.config.get("adapter_quant_scheme", None) == "int8":
-                model_name = "audio_adapter_int8.safetensors"
+                model_name = "audio_adapter_model_int8.safetensors"
             else:
                 raise ValueError(f"Unsupported quant_scheme: {self.config.get('adapter_quant_scheme', None)}")
         else:
-            model_name = "audio_adapter.safetensors"
+            model_name = "audio_adapter_model.safetensors"
         weights_dict = load_weights(os.path.join(self.config["model_path"], model_name), cpu_offload=audio_adapter_offload)
         audio_adapter.load_state_dict(weights_dict, strict=False)
...
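For reference, the filename selection introduced by the diff above can be restated as a stand-alone function; this function is not part of the repository, it only mirrors the new "audio_adapter_model*" naming.

from typing import Optional

def adapter_weight_filename(quantized: bool, scheme: Optional[str]) -> str:
    # Mirrors the renamed adapter checkpoints chosen in load_audio_adapter.
    if not quantized:
        return "audio_adapter_model.safetensors"
    if scheme in ("fp8", "fp8-q8f"):
        return "audio_adapter_model_fp8.safetensors"
    if scheme == "int8":
        return "audio_adapter_model_int8.safetensors"
    raise ValueError(f"Unsupported quant_scheme: {scheme}")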
#!/bin/bash
lightx2v_path=
model_path=
export CUDA_VISIBLE_DEVICES=0
# set environment variables
source ${lightx2v_path}/scripts/base/base.sh
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
export ENABLE_GRAPH_MODE=false
export SENSITIVE_LAYER_DTYPE=None
python -m lightx2v.infer \
--model_cls wan2.1_audio \
--task i2v \
--model_path $model_path \
--config_json ${lightx2v_path}/configs/audio_driven/wan_i2v_audio.json \
--prompt "The video features a old lady is saying something and knitting a sweater." \
--negative_prompt 色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走 \
--image_path ${lightx2v_path}/assets/inputs/audio/15.png \
--audio_path ${lightx2v_path}/assets/inputs/audio/15.wav \
--save_video_path ${lightx2v_path}/save_results/output_lightx2v_wan_i2v_audio.mp4
#!/bin/bash
lightx2v_path=
model_path=
export CUDA_VISIBLE_DEVICES=0,1,2,3
# set environment variables
source ${lightx2v_path}/scripts/base/base.sh
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
export ENABLE_GRAPH_MODE=false
export SENSITIVE_LAYER_DTYPE=None
torchrun --nproc-per-node 4 -m lightx2v.infer \
--model_cls wan2.1_audio \
--task i2v \
--model_path $model_path \
--config_json ${lightx2v_path}/configs/audio_driven/wan_i2v_audio_dist.json \
--prompt "The video features a old lady is saying something and knitting a sweater." \
--negative_prompt 色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走 \
--image_path ${lightx2v_path}/assets/inputs/audio/15.png \
--audio_path ${lightx2v_path}/assets/inputs/audio/15.wav \
--save_video_path ${lightx2v_path}/save_results/output_lightx2v_wan_i2v_audio.mp4
#!/bin/bash
lightx2v_path=
model_path=
export CUDA_VISIBLE_DEVICES=0
# set environment variables
source ${lightx2v_path}/scripts/base/base.sh
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
export ENABLE_GRAPH_MODE=false
export SENSITIVE_LAYER_DTYPE=None
python -m lightx2v.infer \
--model_cls wan2.1_audio \
--task i2v \
--model_path $model_path \
--config_json ${lightx2v_path}/configs/audio_driven/wan_i2v_audio_quant.json \
--prompt "The video features a old lady is saying something and knitting a sweater." \
--negative_prompt 色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走 \
--image_path ${lightx2v_path}/assets/inputs/audio/15.png \
--audio_path ${lightx2v_path}/assets/inputs/audio/15.wav \
--save_video_path ${lightx2v_path}/save_results/output_lightx2v_wan_i2v_audio.mp4
#!/bin/bash
lightx2v_path=
model_path=
export CUDA_VISIBLE_DEVICES=0,1,2,3
# set environment variables
source ${lightx2v_path}/scripts/base/base.sh
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
export ENABLE_GRAPH_MODE=false
export SENSITIVE_LAYER_DTYPE=None
torchrun --nproc-per-node 4 -m lightx2v.infer \
--model_cls wan2.1_audio \
--task i2v \
--model_path $model_path \
--config_json ${lightx2v_path}/configs/audio_driven/wan_i2v_audio_quant_dist.json \
--prompt "The video features a old lady is saying something and knitting a sweater." \
--negative_prompt 色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走 \
--image_path ${lightx2v_path}/assets/inputs/audio/15.png \
--audio_path ${lightx2v_path}/assets/inputs/audio/15.wav \
--save_video_path ${lightx2v_path}/save_results/output_lightx2v_wan_i2v_audio.mp4
@@ -10,7 +10,6 @@ export CUDA_VISIBLE_DEVICES=0
 source ${lightx2v_path}/scripts/base/base.sh
 export ENABLE_GRAPH_MODE=false
-export TORCH_CUDA_ARCH_LIST="9.0"
 # Start API server with distributed inference service
 python -m lightx2v.api_server \
...
@@ -9,7 +9,6 @@ export CUDA_VISIBLE_DEVICES=0
 # set environment variables
 source ${lightx2v_path}/scripts/base/base.sh
-export TORCH_CUDA_ARCH_LIST="9.0"
 export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
 export ENABLE_GRAPH_MODE=false
...
@@ -9,7 +9,6 @@ model_path=
 source ${lightx2v_path}/scripts/base/base.sh
 export CUDA_VISIBLE_DEVICES=0,1,2,3
-export TORCH_CUDA_ARCH_LIST="9.0"
 export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
 export ENABLE_GRAPH_MODE=false
...
@@ -10,7 +10,6 @@ export CUDA_VISIBLE_DEVICES=0
 # set environment variables
 source ${lightx2v_path}/scripts/base/base.sh
-export TORCH_CUDA_ARCH_LIST="9.0"
 export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
 export ENABLE_GRAPH_MODE=false
...
@@ -9,7 +9,6 @@ export CUDA_VISIBLE_DEVICES=0
 # set environment variables
 source ${lightx2v_path}/scripts/base/base.sh
-export TORCH_CUDA_ARCH_LIST="9.0"
 export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
 export ENABLE_GRAPH_MODE=false
...
@@ -4,7 +4,7 @@ from safetensors.torch import save_file
 from lightx2v.utils.quant_utils import FloatQuantizer

-model_path = "/data/nvme0/models/Wan2.1-R2V721-Audio-14B-720P/audio_adapter.safetensors"
+model_path = "/data/nvme0/models/Wan2.1-R2V721-Audio-14B-720P/audio_adapter_model.safetensors"
 state_dict = {}
 with safetensors.safe_open(model_path, framework="pt", device="cpu") as f:
@@ -13,7 +13,7 @@ with safetensors.safe_open(model_path, framework="pt", device="cpu") as f:
 new_state_dict = {}
-new_model_path = "/data/nvme0/models/Wan2.1-R2V721-Audio-14B-720P/audio_adapter_fp8.safetensors"
+new_model_path = "/data/nvme0/models/Wan2.1-R2V721-Audio-14B-720P/audio_adapter_model_fp8.safetensors"
 for key in state_dict.keys():
     if key.startswith("ca") and ".to" in key and "weight" in key and "to_kv" not in key:
...
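The script above converts the audio adapter weights to fp8 using lightx2v.utils.quant_utils.FloatQuantizer. As a rough illustration of the idea in plain PyTorch, the sketch below does symmetric per-output-channel fp8 scaling in the spirit of the "W-fp8-channel-sym" mm_type used in the configs; it is an assumption about the general approach, not the library's exact scheme.

import torch

def quantize_fp8_per_channel(weight: torch.Tensor):
    # Symmetric per-output-channel scaling into float8_e4m3fn (PyTorch >= 2.1).
    finfo = torch.finfo(torch.float8_e4m3fn)
    scale = weight.abs().amax(dim=1, keepdim=True).clamp(min=1e-8) / finfo.max
    q = (weight / scale).clamp(finfo.min, finfo.max).to(torch.float8_e4m3fn)
    return q, scale.squeeze(1).float()  # keep the scales next to the fp8 tensor

w = torch.randn(1024, 1024)
q, s = quantize_fp8_per_channel(w)
print(q.dtype, s.shape)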