Commit 6de0a3b4 authored by Yang Yong(雍洋), committed by GitHub
Browse files

Add audio input files and update pre-commit config for larger files (#283)

parent 8de61521
......@@ -16,6 +16,7 @@ repos:
- id: check-yaml
- id: check-toml
- id: check-added-large-files
args: ['--maxkb=3000'] # Allow files up to 3MB
- id: check-case-conflict
- id: check-merge-conflict
- id: debug-statements
{
"infer_steps": 4,
"target_fps": 16,
"video_duration": 12,
"video_duration": 15,
"audio_sr": 16000,
"target_video_length": 81,
"target_height": 720,
......
{
"infer_steps": 4,
"target_fps": 16,
"video_duration": 12,
"video_duration": 15,
"audio_sr": 16000,
"target_video_length": 81,
"resize_mode": "adaptive",
......
{
"infer_steps": 4,
"target_fps": 16,
"video_duration": 12,
"video_duration": 15,
"audio_sr": 16000,
"target_video_length": 81,
"resize_mode": "adaptive",
......
{
"infer_steps": 4,
"target_fps": 16,
"video_duration": 12,
"video_duration": 15,
"audio_sr": 16000,
"target_video_length": 81,
"resize_mode": "adaptive",
......@@ -15,7 +15,7 @@
"cpu_offload": false,
"use_31_block": false,
"parallel": {
"seq_p_size": 4,
"seq_p_size": 8,
"seq_p_attn_type": "ulysses"
}
}
{
"infer_steps": 4,
"target_fps": 16,
"video_duration": 12,
"video_duration": 15,
"audio_sr": 16000,
"target_video_length": 81,
"resize_mode": "adaptive",
......@@ -15,7 +15,7 @@
"cpu_offload": false,
"use_31_block": false,
"parallel": {
"seq_p_size": 4,
"seq_p_size": 8,
"seq_p_attn_type": "ulysses"
},
"mm_config": {
......
{
"infer_steps": 4,
"target_fps": 24,
"video_duration": 12,
"video_duration": 15,
"audio_sr": 16000,
"target_video_length": 121,
"resize_mode": "adaptive",
"text_len": 512,
"num_channels_latents": 48,
"vae_stride": [4, 16, 16],
"vae_stride": [
4,
16,
16
],
"self_attn_1_type": "flash_attn3",
"cross_attn_1_type": "flash_attn3",
"cross_attn_2_type": "flash_attn3",
......@@ -22,8 +26,8 @@
"use_31_block": false,
"lora_configs": [
{
"path": "/mnt/aigc/rtxiang/pretrain/qianhai_weights/lora_model.safetensors",
"strength": 0.125
"path": "/mnt/aigc/rtxiang/pretrain/qianhai_weights/lora_model.safetensors",
"strength": 0.125
}
]
}
{
"infer_steps": 4,
"target_fps": 16,
"video_duration": 12,
"video_duration": 15,
"audio_sr": 16000,
"target_video_length": 81,
"resize_mode": "fixed_min_area",
......
{
"infer_steps": 4,
"target_fps": 16,
"video_duration": 12,
"video_duration": 15,
"audio_sr": 16000,
"target_video_length": 81,
"resize_mode": "fixed_min_area",
......
{
"infer_steps": 4,
"target_fps": 16,
"video_duration": 12,
"video_duration": 15,
"audio_sr": 16000,
"target_video_length": 81,
"resize_mode": "fixed_shape",
"fixed_shape": [240, 320],
"fixed_shape": [
240,
320
],
"self_attn_1_type": "sage_attn2",
"cross_attn_1_type": "sage_attn2",
"cross_attn_2_type": "sage_attn2",
......
{
"infer_steps": 4,
"target_fps": 16,
"video_duration": 12,
"video_duration": 15,
"audio_sr": 16000,
"target_video_length": 17,
"prev_frame_length": 1,
"resize_mode": "fixed_shape",
"fixed_shape": [480, 480],
"fixed_shape": [
480,
480
],
"self_attn_1_type": "flash_attn3",
"cross_attn_1_type": "flash_attn3",
"cross_attn_2_type": "flash_attn3",
......
......@@ -18,8 +18,8 @@ python -m lightx2v.infer \
--task i2v \
--model_path $model_path \
--config_json ${lightx2v_path}/configs/seko_talk/seko_talk_01_base.json \
--prompt "The video features a old lady is saying something and knitting a sweater." \
--prompt "The video features a male speaking to the camera with arms spread out, a slightly furrowed brow, and a focused gaze." \
--negative_prompt 色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走 \
--image_path ${lightx2v_path}/assets/inputs/audio/15.png \
--audio_path ${lightx2v_path}/assets/inputs/audio/15.wav \
--image_path ${lightx2v_path}/assets/inputs/audio/seko_input.png \
--audio_path ${lightx2v_path}/assets/inputs/audio/seko_input.wav \
--save_video_path ${lightx2v_path}/save_results/output_lightx2v_seko_talk.mp4
......@@ -18,8 +18,8 @@ python -m lightx2v.infer \
--task i2v \
--model_path $model_path \
--config_json ${lightx2v_path}/configs/seko_talk/seko_talk_02_fp8.json \
--prompt "The video features a old lady is saying something and knitting a sweater." \
--prompt "The video features a male speaking to the camera with arms spread out, a slightly furrowed brow, and a focused gaze." \
--negative_prompt 色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走 \
--image_path ${lightx2v_path}/assets/inputs/audio/15.png \
--audio_path ${lightx2v_path}/assets/inputs/audio/15.wav \
--image_path ${lightx2v_path}/assets/inputs/audio/seko_input.png \
--audio_path ${lightx2v_path}/assets/inputs/audio/seko_input.wav \
--save_video_path ${lightx2v_path}/save_results/output_lightx2v_seko_talk.mp4
......@@ -3,7 +3,7 @@
lightx2v_path=/path/to/Lightx2v
model_path=/path/to/SekoTalk-Distill
export CUDA_VISIBLE_DEVICES=0,1,2,3
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
# set environment variables
source ${lightx2v_path}/scripts/base/base.sh
......@@ -13,13 +13,13 @@ export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
export ENABLE_GRAPH_MODE=false
export SENSITIVE_LAYER_DTYPE=None
torchrun --nproc-per-node 4 -m lightx2v.infer \
torchrun --nproc-per-node 8 -m lightx2v.infer \
--model_cls seko_talk \
--task i2v \
--model_path $model_path \
--config_json ${lightx2v_path}/configs/seko_talk/seko_talk_03_dist.json \
--prompt "The video features a old lady is saying something and knitting a sweater." \
--prompt "The video features a male speaking to the camera with arms spread out, a slightly furrowed brow, and a focused gaze." \
--negative_prompt 色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走 \
--image_path ${lightx2v_path}/assets/inputs/audio/15.png \
--audio_path ${lightx2v_path}/assets/inputs/audio/15.wav \
--image_path ${lightx2v_path}/assets/inputs/audio/seko_input.png \
--audio_path ${lightx2v_path}/assets/inputs/audio/seko_input.wav \
--save_video_path ${lightx2v_path}/save_results/output_lightx2v_seko_talk.mp4
......@@ -3,7 +3,7 @@
lightx2v_path=/path/to/Lightx2v
model_path=/path/to/SekoTalk-Distill-fp8
export CUDA_VISIBLE_DEVICES=0,1,2,3
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
# set environment variables
source ${lightx2v_path}/scripts/base/base.sh
......@@ -13,13 +13,13 @@ export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
export ENABLE_GRAPH_MODE=false
export SENSITIVE_LAYER_DTYPE=None
torchrun --nproc-per-node 4 -m lightx2v.infer \
torchrun --nproc-per-node 8 -m lightx2v.infer \
--model_cls seko_talk \
--task i2v \
--model_path $model_path \
--config_json ${lightx2v_path}/configs/seko_talk/seko_talk_04_fp8_dist.json \
--prompt "The video features a old lady is saying something and knitting a sweater." \
--prompt "The video features a male speaking to the camera with arms spread out, a slightly furrowed brow, and a focused gaze." \
--negative_prompt 色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走 \
--image_path ${lightx2v_path}/assets/inputs/audio/15.png \
--audio_path ${lightx2v_path}/assets/inputs/audio/15.wav \
--image_path ${lightx2v_path}/assets/inputs/audio/seko_input.png \
--audio_path ${lightx2v_path}/assets/inputs/audio/seko_input.wav \
--save_video_path ${lightx2v_path}/save_results/output_lightx2v_seko_talk.mp4
......@@ -18,8 +18,8 @@ python -m lightx2v.infer \
--task i2v \
--model_path $model_path \
--config_json ${lightx2v_path}/configs/seko_talk/seko_talk_05_offload_fp8_4090.json \
--prompt "The video features a old lady is saying something and knitting a sweater." \
--prompt "The video features a male speaking to the camera with arms spread out, a slightly furrowed brow, and a focused gaze." \
--negative_prompt 色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走 \
--image_path ${lightx2v_path}/assets/inputs/audio/15.png \
--audio_path ${lightx2v_path}/assets/inputs/audio/15.wav \
--image_path ${lightx2v_path}/assets/inputs/audio/seko_input.png \
--audio_path ${lightx2v_path}/assets/inputs/audio/seko_input.wav \
--save_video_path ${lightx2v_path}/save_results/output_lightx2v_seko_talk.mp4
......@@ -18,8 +18,8 @@ python -m lightx2v.infer \
--task i2v \
--model_path $model_path \
--config_json ${lightx2v_path}/configs/seko_talk/seko_talk_06_offload_fp8_H100.json \
--prompt "The video features a old lady is saying something and knitting a sweater." \
--prompt "The video features a male speaking to the camera with arms spread out, a slightly furrowed brow, and a focused gaze." \
--negative_prompt 色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走 \
--image_path ${lightx2v_path}/assets/inputs/audio/15.png \
--audio_path ${lightx2v_path}/assets/inputs/audio/15.wav \
--image_path ${lightx2v_path}/assets/inputs/audio/seko_input.png \
--audio_path ${lightx2v_path}/assets/inputs/audio/seko_input.wav \
--save_video_path ${lightx2v_path}/save_results/output_lightx2v_seko_talk.mp4
......@@ -18,8 +18,8 @@ torchrun --nproc-per-node 4 -m lightx2v.infer \
--task i2v \
--model_path $model_path \
--config_json ${lightx2v_path}/configs/seko_talk/seko_talk_07_dist_offload.json \
--prompt "The video features a old lady is saying something and knitting a sweater." \
--prompt "The video features a male speaking to the camera with arms spread out, a slightly furrowed brow, and a focused gaze." \
--negative_prompt 色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走 \
--image_path ${lightx2v_path}/assets/inputs/audio/15.png \
--audio_path ${lightx2v_path}/assets/inputs/audio/15.wav \
--image_path ${lightx2v_path}/assets/inputs/audio/seko_input.png \
--audio_path ${lightx2v_path}/assets/inputs/audio/seko_input.wav \
--save_video_path ${lightx2v_path}/save_results/output_lightx2v_seko_talk.mp4
......@@ -18,8 +18,8 @@ python -m lightx2v.infer \
--task i2v \
--model_path $model_path \
--config_json ${lightx2v_path}/configs/seko_talk/seko_talk_08_5B_base.json \
--prompt "The video features a old lady is saying something and knitting a sweater." \
--prompt "The video features a male speaking to the camera with arms spread out, a slightly furrowed brow, and a focused gaze." \
--negative_prompt 色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走 \
--image_path ${lightx2v_path}/assets/inputs/audio/15.png \
--audio_path ${lightx2v_path}/assets/inputs/audio/15.wav \
--image_path ${lightx2v_path}/assets/inputs/audio/seko_input.png \
--audio_path ${lightx2v_path}/assets/inputs/audio/seko_input.wav \
--save_video_path ${lightx2v_path}/save_results/output_lightx2v_seko_talk.mp4
......@@ -18,8 +18,8 @@ python -m lightx2v.infer \
--task i2v \
--model_path $model_path \
--config_json ${lightx2v_path}/configs/seko_talk/seko_talk_09_base_fixed_min_area.json \
--prompt "The video features a old lady is saying something and knitting a sweater." \
--prompt "The video features a male speaking to the camera with arms spread out, a slightly furrowed brow, and a focused gaze." \
--negative_prompt 色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走 \
--image_path ${lightx2v_path}/assets/inputs/audio/15.png \
--audio_path ${lightx2v_path}/assets/inputs/audio/15.wav \
--image_path ${lightx2v_path}/assets/inputs/audio/seko_input.png \
--audio_path ${lightx2v_path}/assets/inputs/audio/seko_input.wav \
--save_video_path ${lightx2v_path}/save_results/output_lightx2v_seko_talk.mp4
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment