Commit 92539ed8 authored by gushiqiao

Update gradio and offload

parent 8e941d39
......@@ -109,6 +109,24 @@ def get_cpu_memory():
return available_bytes / 1024**3
def cleanup_memory():
gc.collect()
if torch.cuda.is_available():
torch.cuda.empty_cache()
torch.cuda.synchronize()
try:
if hasattr(psutil, "virtual_memory"):
if os.name == "posix":
try:
os.system("sync")
except: # noqa
pass
except: # noqa
pass
def generate_unique_filename(base_dir="./saved_videos"):
os.makedirs(base_dir, exist_ok=True)
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
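As an aside, a minimal sketch of how the new `cleanup_memory()` helper is meant to bracket a generation call, mirroring its use at the start and end of `run_inference` further down in this diff; the wrapper name and arguments below are illustrative and not part of the commit:

```python
# Hypothetical helper (not in the commit): bracket an inference call with
# cleanup_memory() so cached CUDA blocks are released before weights load
# and transient allocations are freed once the video has been written.
def generate_with_cleanup(run_fn, *args, **kwargs):
    cleanup_memory()
    try:
        return run_fn(*args, **kwargs)
    finally:
        cleanup_memory()
```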
......@@ -147,7 +165,6 @@ for op_name, is_installed in available_attn_ops:
def run_inference(
model_type,
prompt,
negative_prompt,
save_video_path,
......@@ -173,6 +190,8 @@ def run_inference(
cpu_offload,
offload_granularity,
offload_ratio,
t5_cpu_offload,
unload_modules,
t5_offload_granularity,
attention_type,
quant_op,
......@@ -181,6 +200,8 @@ def run_inference(
clean_cuda_cache,
image_path=None,
):
cleanup_memory()
quant_op = quant_op.split("(")[0].strip()
attention_type = attention_type.split("(")[0].strip()
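The two `split("(")` calls above strip the descriptive suffix that the UI dropdowns attach to each choice before the value is used as an operator name; a tiny illustration (the example label is an assumption about how the dropdown entries are formatted):

```python
# e.g. a dropdown entry such as "sage_attn2 (recommended)" becomes "sage_attn2"
label = "sage_attn2 (recommended)"
op_name = label.split("(")[0].strip()
assert op_name == "sage_attn2"
```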
......@@ -192,7 +213,7 @@ def run_inference(
model_config = json.load(f)
if task == "t2v":
if model_type == "Wan2.1 1.3B":
if model_size == "1.3b":
# 1.3B
coefficient = [
[
......@@ -287,6 +308,7 @@ def run_inference(
needs_reinit = (
lazy_load
or unload_modules
or global_runner is None
or current_config is None
or cur_dit_quant_scheme is None
......@@ -325,6 +347,8 @@ def run_inference(
if os.path.exists(os.path.join(dit_quantized_ckpt, "config.json")):
with open(os.path.join(dit_quantized_ckpt, "config.json"), "r") as f:
quant_model_config = json.load(f)
else:
quant_model_config = {}
else:
mm_type = "Default"
dit_quantized_ckpt = None
......@@ -355,6 +379,8 @@ def run_inference(
"coefficients": coefficient[0] if use_ret_steps else coefficient[1],
"use_ret_steps": use_ret_steps,
"teacache_thresh": teacache_thresh,
"t5_cpu_offload": t5_cpu_offload,
"unload_modules": unload_modules,
"t5_quantized": is_t5_quant,
"t5_quantized_ckpt": t5_quant_ckpt,
"t5_quant_scheme": t5_quant_scheme,
......@@ -425,15 +451,25 @@ def run_inference(
asyncio.run(runner.run_pipeline())
if lazy_load:
del runner
torch.cuda.empty_cache()
gc.collect()
del config, args, model_config, quant_model_config
if "dit_quantized_ckpt" in locals():
del dit_quantized_ckpt
if "t5_quant_ckpt" in locals():
del t5_quant_ckpt
if "clip_quant_ckpt" in locals():
del clip_quant_ckpt
cleanup_memory()
return save_video_path
def auto_configure(enable_auto_config, model_type, resolution):
def handle_lazy_load_change(lazy_load_enabled):
"""Handle lazy_load checkbox change to automatically enable unload_modules"""
return gr.update(value=lazy_load_enabled)
def auto_configure(enable_auto_config, resolution):
default_config = {
"torch_compile_val": False,
"lazy_load_val": False,
......@@ -443,6 +479,8 @@ def auto_configure(enable_auto_config, model_type, resolution):
"cpu_offload_val": False,
"offload_granularity_val": "block",
"offload_ratio_val": 1,
"t5_cpu_offload_val": False,
"unload_modules_val": False,
"t5_offload_granularity_val": "model",
"attention_type_val": attn_op_choices[0][1],
"quant_op_val": quant_op_choices[0][1],
......@@ -499,7 +537,7 @@ def auto_configure(enable_auto_config, model_type, resolution):
else:
res = "480p"
if model_type in ["Wan2.1 14B"]:
if model_size == "14b":
is_14b = True
else:
is_14b = False
......@@ -507,13 +545,14 @@ def auto_configure(enable_auto_config, model_type, resolution):
if res == "720p" and is_14b:
gpu_rules = [
(80, {}),
(48, {"cpu_offload_val": True, "offload_ratio_val": 0.5}),
(40, {"cpu_offload_val": True, "offload_ratio_val": 0.8}),
(32, {"cpu_offload_val": True, "offload_ratio_val": 1}),
(48, {"cpu_offload_val": True, "offload_ratio_val": 0.5, "t5_cpu_offload_val": True}),
(40, {"cpu_offload_val": True, "offload_ratio_val": 0.8, "t5_cpu_offload_val": True}),
(32, {"cpu_offload_val": True, "offload_ratio_val": 1, "t5_cpu_offload_val": True}),
(
24,
{
"cpu_offload_val": True,
"t5_cpu_offload_val": True,
"offload_ratio_val": 1,
"t5_offload_granularity_val": "block",
"precision_mode_val": "bf16",
......@@ -524,6 +563,7 @@ def auto_configure(enable_auto_config, model_type, resolution):
16,
{
"cpu_offload_val": True,
"t5_cpu_offload_val": True,
"offload_ratio_val": 1,
"t5_offload_granularity_val": "block",
"precision_mode_val": "bf16",
......@@ -537,6 +577,7 @@ def auto_configure(enable_auto_config, model_type, resolution):
12,
{
"cpu_offload_val": True,
"t5_cpu_offload_val": True,
"offload_ratio_val": 1,
"t5_offload_granularity_val": "block",
"precision_mode_val": "bf16",
......@@ -552,6 +593,7 @@ def auto_configure(enable_auto_config, model_type, resolution):
8,
{
"cpu_offload_val": True,
"t5_cpu_offload_val": True,
"offload_ratio_val": 1,
"t5_offload_granularity_val": "block",
"precision_mode_val": "bf16",
......@@ -564,6 +606,7 @@ def auto_configure(enable_auto_config, model_type, resolution):
"clip_quant_scheme_val": quant_type,
"dit_quant_scheme_val": quant_type,
"lazy_load_val": True,
"unload_modules_val": True,
"use_tiny_vae_val": True,
},
),
......@@ -572,13 +615,14 @@ def auto_configure(enable_auto_config, model_type, resolution):
elif is_14b:
gpu_rules = [
(80, {}),
(48, {"cpu_offload_val": True, "offload_ratio_val": 0.2}),
(40, {"cpu_offload_val": True, "offload_ratio_val": 0.5}),
(24, {"cpu_offload_val": True, "offload_ratio_val": 0.8}),
(48, {"cpu_offload_val": True, "offload_ratio_val": 0.2, "t5_cpu_offload_val": True}),
(40, {"cpu_offload_val": True, "offload_ratio_val": 0.5, "t5_cpu_offload_val": True}),
(24, {"cpu_offload_val": True, "offload_ratio_val": 0.8, "t5_cpu_offload_val": True}),
(
16,
{
"cpu_offload_val": True,
"t5_cpu_offload_val": True,
"offload_ratio_val": 1,
"t5_offload_granularity_val": "block",
"precision_mode_val": "bf16",
......@@ -591,6 +635,7 @@ def auto_configure(enable_auto_config, model_type, resolution):
(
{
"cpu_offload_val": True,
"t5_cpu_offload_val": True,
"offload_ratio_val": 1,
"t5_offload_granularity_val": "block",
"precision_mode_val": "bf16",
......@@ -600,6 +645,7 @@ def auto_configure(enable_auto_config, model_type, resolution):
"clip_quant_scheme_val": quant_type,
"dit_quant_scheme_val": quant_type,
"lazy_load_val": True,
"unload_modules_val": True,
"rotary_chunk_val": True,
"rotary_chunk_size_val": 10000,
"use_tiny_vae_val": True,
......@@ -607,6 +653,7 @@ def auto_configure(enable_auto_config, model_type, resolution):
if res == "540p"
else {
"cpu_offload_val": True,
"t5_cpu_offload_val": True,
"offload_ratio_val": 1,
"t5_offload_granularity_val": "block",
"precision_mode_val": "bf16",
......@@ -616,6 +663,7 @@ def auto_configure(enable_auto_config, model_type, resolution):
"clip_quant_scheme_val": quant_type,
"dit_quant_scheme_val": quant_type,
"lazy_load_val": True,
"unload_modules_val": True,
"use_tiny_vae_val": True,
}
),
......@@ -623,7 +671,17 @@ def auto_configure(enable_auto_config, model_type, resolution):
]
else:
gpu_rules = {}
gpu_rules = [
(24, {}),
(
8,
{
"t5_cpu_offload_val": True,
"t5_offload_granularity_val": "block",
"t5_quant_scheme_val": quant_type,
},
),
]
if is_14b:
cpu_rules = [
......@@ -637,11 +695,22 @@ def auto_configure(enable_auto_config, model_type, resolution):
"t5_quant_scheme_val": quant_type,
"clip_quant_scheme_val": quant_type,
"lazy_load_val": True,
"unload_modules_val": True,
},
),
]
else:
cpu_rules = {}
cpu_rules = [
(64, {}),
(
16,
{
"t5_quant_scheme_val": quant_type,
"unload_modules_val": True,
"use_tiny_vae_val": True,
},
),
]
for threshold, updates in gpu_rules:
if gpu_memory >= threshold:
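The `(threshold, updates)` tuples above are consumed by the loop that this hunk truncates; a sketch of the assumed selection logic (first rule whose memory threshold is met overrides the defaults) — the function name is illustrative, not from the repo:

```python
# Illustrative sketch of the auto-configuration rule matching assumed here:
# rules are ordered from largest to smallest threshold, and the first rule
# whose threshold fits the detected memory updates the default config.
def apply_memory_rules(rules, available_gb, config):
    for threshold, updates in rules:
        if available_gb >= threshold:
            config.update(updates)
            break
    return config

# e.g. apply_memory_rules(gpu_rules, gpu_memory, default_config)
```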
......@@ -680,20 +749,6 @@ def main():
with gr.Group():
gr.Markdown("## 📥 Input Parameters")
with gr.Row():
if task == "i2v":
model_type = gr.Dropdown(
choices=["Wan2.1 14B"],
value="Wan2.1 14B",
label="Model Type",
)
else:
model_type = gr.Dropdown(
choices=["Wan2.1 14B", "Wan2.1 1.3B"],
value="Wan2.1 14B",
label="Model Type",
)
if task == "i2v":
with gr.Row():
image_path = gr.Image(
......@@ -849,6 +904,11 @@ def main():
info="Controls the chunk size for applying rotary embeddings. Larger values may improve performance but increase memory usage. Only effective if 'rotary_chunk' is checked.",
)
unload_modules = gr.Checkbox(
label="Unload Modules",
value=False,
info="Unload modules (T5, CLIP, DIT, etc.) after inference to reduce GPU/CPU memory usage",
)
clean_cuda_cache = gr.Checkbox(
label="Clean CUDA Memory Cache",
value=False,
......@@ -883,6 +943,12 @@ def main():
value=1.0,
info="Controls how much of the Dit model is offloaded to the CPU",
)
t5_cpu_offload = gr.Checkbox(
label="T5 CPU Offloading",
value=False,
info="Offload the T5 Encoder model to CPU to reduce GPU memory usage",
)
t5_offload_granularity = gr.Dropdown(
label="T5 Encoder Offload Granularity",
choices=["model", "block"],
......@@ -971,7 +1037,7 @@ def main():
enable_auto_config.change(
fn=auto_configure,
inputs=[enable_auto_config, model_type, resolution],
inputs=[enable_auto_config, resolution],
outputs=[
torch_compile,
lazy_load,
......@@ -981,6 +1047,8 @@ def main():
cpu_offload,
offload_granularity,
offload_ratio,
t5_cpu_offload,
unload_modules,
t5_offload_granularity,
attention_type,
quant_op,
......@@ -995,11 +1063,16 @@ def main():
use_ret_steps,
],
)
lazy_load.change(
fn=handle_lazy_load_change,
inputs=[lazy_load],
outputs=[unload_modules],
)
if task == "i2v":
infer_btn.click(
fn=run_inference,
inputs=[
model_type,
prompt,
negative_prompt,
save_video_path,
......@@ -1025,6 +1098,8 @@ def main():
cpu_offload,
offload_granularity,
offload_ratio,
t5_cpu_offload,
unload_modules,
t5_offload_granularity,
attention_type,
quant_op,
......@@ -1039,7 +1114,6 @@ def main():
infer_btn.click(
fn=run_inference,
inputs=[
model_type,
prompt,
negative_prompt,
save_video_path,
......@@ -1065,6 +1139,8 @@ def main():
cpu_offload,
offload_granularity,
offload_ratio,
t5_cpu_offload,
unload_modules,
t5_offload_granularity,
attention_type,
quant_op,
......@@ -1088,14 +1164,16 @@ if __name__ == "__main__":
default="wan2.1",
help="Model class to use",
)
parser.add_argument("--model_size", type=str, required=True, choices=["14b", "1.3b"], help="Model type to use")
parser.add_argument("--task", type=str, required=True, choices=["i2v", "t2v"], help="Specify the task type. 'i2v' for image-to-video translation, 't2v' for text-to-video generation.")
parser.add_argument("--server_port", type=int, default=7862, help="Server port")
parser.add_argument("--server_name", type=str, default="0.0.0.0", help="Server ip")
args = parser.parse_args()
global model_path, model_cls
global model_path, model_cls, model_size
model_path = args.model_path
model_cls = args.model_cls
model_size = args.model_size
task = args.task
main()
......@@ -109,6 +109,26 @@ def get_cpu_memory():
return available_bytes / 1024**3
def cleanup_memory():
gc.collect()
if torch.cuda.is_available():
torch.cuda.empty_cache()
torch.cuda.synchronize()
try:
import psutil
if hasattr(psutil, "virtual_memory"):
if os.name == "posix":
try:
os.system("sync")
except: # noqa
pass
except: # noqa
pass
def generate_unique_filename(base_dir="./saved_videos"):
os.makedirs(base_dir, exist_ok=True)
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
......@@ -147,7 +167,6 @@ for op_name, is_installed in available_attn_ops:
def run_inference(
model_type,
prompt,
negative_prompt,
save_video_path,
......@@ -173,6 +192,8 @@ def run_inference(
cpu_offload,
offload_granularity,
offload_ratio,
t5_cpu_offload,
unload_modules,
t5_offload_granularity,
attention_type,
quant_op,
......@@ -181,6 +202,8 @@ def run_inference(
clean_cuda_cache,
image_path=None,
):
cleanup_memory()
quant_op = quant_op.split("(")[0].strip()
attention_type = attention_type.split("(")[0].strip()
......@@ -192,7 +215,7 @@ def run_inference(
model_config = json.load(f)
if task == "t2v":
if model_type == "Wan2.1 1.3B":
if model_size == "1.3b":
# 1.3B
coefficient = [
[
......@@ -287,6 +310,7 @@ def run_inference(
needs_reinit = (
lazy_load
or unload_modules
or global_runner is None
or current_config is None
or cur_dit_quant_scheme is None
......@@ -325,6 +349,8 @@ def run_inference(
if os.path.exists(os.path.join(dit_quantized_ckpt, "config.json")):
with open(os.path.join(dit_quantized_ckpt, "config.json"), "r") as f:
quant_model_config = json.load(f)
else:
quant_model_config = {}
else:
mm_type = "Default"
dit_quantized_ckpt = None
......@@ -355,6 +381,8 @@ def run_inference(
"coefficients": coefficient[0] if use_ret_steps else coefficient[1],
"use_ret_steps": use_ret_steps,
"teacache_thresh": teacache_thresh,
"t5_cpu_offload": t5_cpu_offload,
"unload_modules": unload_modules,
"t5_quantized": is_t5_quant,
"t5_quantized_ckpt": t5_quant_ckpt,
"t5_quant_scheme": t5_quant_scheme,
......@@ -425,15 +453,25 @@ def run_inference(
asyncio.run(runner.run_pipeline())
if lazy_load:
del runner
torch.cuda.empty_cache()
gc.collect()
del config, args, model_config, quant_model_config
if "dit_quantized_ckpt" in locals():
del dit_quantized_ckpt
if "t5_quant_ckpt" in locals():
del t5_quant_ckpt
if "clip_quant_ckpt" in locals():
del clip_quant_ckpt
cleanup_memory()
return save_video_path
def auto_configure(enable_auto_config, model_type, resolution):
def handle_lazy_load_change(lazy_load_enabled):
"""Handle lazy_load checkbox change to automatically enable unload_modules"""
return gr.update(value=lazy_load_enabled)
def auto_configure(enable_auto_config, resolution):
default_config = {
"torch_compile_val": False,
"lazy_load_val": False,
......@@ -443,6 +481,8 @@ def auto_configure(enable_auto_config, model_type, resolution):
"cpu_offload_val": False,
"offload_granularity_val": "block",
"offload_ratio_val": 1,
"t5_cpu_offload_val": False,
"unload_modules_val": False,
"t5_offload_granularity_val": "model",
"attention_type_val": attn_op_choices[0][1],
"quant_op_val": quant_op_choices[0][1],
......@@ -499,7 +539,7 @@ def auto_configure(enable_auto_config, model_type, resolution):
else:
res = "480p"
if model_type in ["Wan2.1 14B"]:
if model_size == "14b":
is_14b = True
else:
is_14b = False
......@@ -507,13 +547,14 @@ def auto_configure(enable_auto_config, model_type, resolution):
if res == "720p" and is_14b:
gpu_rules = [
(80, {}),
(48, {"cpu_offload_val": True, "offload_ratio_val": 0.5}),
(40, {"cpu_offload_val": True, "offload_ratio_val": 0.8}),
(32, {"cpu_offload_val": True, "offload_ratio_val": 1}),
(48, {"cpu_offload_val": True, "offload_ratio_val": 0.5, "t5_cpu_offload_val": True}),
(40, {"cpu_offload_val": True, "offload_ratio_val": 0.8, "t5_cpu_offload_val": True}),
(32, {"cpu_offload_val": True, "offload_ratio_val": 1, "t5_cpu_offload_val": True}),
(
24,
{
"cpu_offload_val": True,
"t5_cpu_offload_val": True,
"offload_ratio_val": 1,
"t5_offload_granularity_val": "block",
"precision_mode_val": "bf16",
......@@ -524,6 +565,7 @@ def auto_configure(enable_auto_config, model_type, resolution):
16,
{
"cpu_offload_val": True,
"t5_cpu_offload_val": True,
"offload_ratio_val": 1,
"t5_offload_granularity_val": "block",
"precision_mode_val": "bf16",
......@@ -537,6 +579,7 @@ def auto_configure(enable_auto_config, model_type, resolution):
12,
{
"cpu_offload_val": True,
"t5_cpu_offload_val": True,
"offload_ratio_val": 1,
"t5_offload_granularity_val": "block",
"precision_mode_val": "bf16",
......@@ -552,6 +595,7 @@ def auto_configure(enable_auto_config, model_type, resolution):
8,
{
"cpu_offload_val": True,
"t5_cpu_offload_val": True,
"offload_ratio_val": 1,
"t5_offload_granularity_val": "block",
"precision_mode_val": "bf16",
......@@ -564,6 +608,7 @@ def auto_configure(enable_auto_config, model_type, resolution):
"clip_quant_scheme_val": quant_type,
"dit_quant_scheme_val": quant_type,
"lazy_load_val": True,
"unload_modules_val": True,
"use_tiny_vae_val": True,
},
),
......@@ -572,13 +617,14 @@ def auto_configure(enable_auto_config, model_type, resolution):
elif is_14b:
gpu_rules = [
(80, {}),
(48, {"cpu_offload_val": True, "offload_ratio_val": 0.2}),
(40, {"cpu_offload_val": True, "offload_ratio_val": 0.5}),
(24, {"cpu_offload_val": True, "offload_ratio_val": 0.8}),
(48, {"cpu_offload_val": True, "offload_ratio_val": 0.2, "t5_cpu_offload_val": True}),
(40, {"cpu_offload_val": True, "offload_ratio_val": 0.5, "t5_cpu_offload_val": True}),
(24, {"cpu_offload_val": True, "offload_ratio_val": 0.8, "t5_cpu_offload_val": True}),
(
16,
{
"cpu_offload_val": True,
"t5_cpu_offload_val": True,
"offload_ratio_val": 1,
"t5_offload_granularity_val": "block",
"precision_mode_val": "bf16",
......@@ -591,6 +637,7 @@ def auto_configure(enable_auto_config, model_type, resolution):
(
{
"cpu_offload_val": True,
"t5_cpu_offload_val": True,
"offload_ratio_val": 1,
"t5_offload_granularity_val": "block",
"precision_mode_val": "bf16",
......@@ -600,6 +647,7 @@ def auto_configure(enable_auto_config, model_type, resolution):
"clip_quant_scheme_val": quant_type,
"dit_quant_scheme_val": quant_type,
"lazy_load_val": True,
"unload_modules_val": True,
"rotary_chunk_val": True,
"rotary_chunk_size_val": 10000,
"use_tiny_vae_val": True,
......@@ -607,6 +655,7 @@ def auto_configure(enable_auto_config, model_type, resolution):
if res == "540p"
else {
"cpu_offload_val": True,
"t5_cpu_offload_val": True,
"offload_ratio_val": 1,
"t5_offload_granularity_val": "block",
"precision_mode_val": "bf16",
......@@ -616,6 +665,7 @@ def auto_configure(enable_auto_config, model_type, resolution):
"clip_quant_scheme_val": quant_type,
"dit_quant_scheme_val": quant_type,
"lazy_load_val": True,
"unload_modules_val": True,
"use_tiny_vae_val": True,
}
),
......@@ -623,7 +673,17 @@ def auto_configure(enable_auto_config, model_type, resolution):
]
else:
gpu_rules = {}
gpu_rules = [
(24, {}),
(
8,
{
"t5_cpu_offload_val": True,
"t5_offload_granularity_val": "block",
"t5_quant_scheme_val": quant_type,
},
),
]
if is_14b:
cpu_rules = [
......@@ -637,11 +697,22 @@ def auto_configure(enable_auto_config, model_type, resolution):
"t5_quant_scheme_val": quant_type,
"clip_quant_scheme_val": quant_type,
"lazy_load_val": True,
"unload_modules_val": True,
},
),
]
else:
cpu_rules = {}
cpu_rules = [
(64, {}),
(
16,
{
"t5_quant_scheme_val": quant_type,
"unload_modules_val": True,
"use_tiny_vae_val": True,
},
),
]
for threshold, updates in gpu_rules:
if gpu_memory >= threshold:
......@@ -680,20 +751,6 @@ def main():
with gr.Group():
gr.Markdown("## 📥 输入参数")
with gr.Row():
if task == "i2v":
model_type = gr.Dropdown(
choices=["Wan2.1 14B"],
value="Wan2.1 14B",
label="模型类型",
)
else:
model_type = gr.Dropdown(
choices=["Wan2.1 14B", "Wan2.1 1.3B"],
value="Wan2.1 14B",
label="模型类型",
)
if task == "i2v":
with gr.Row():
image_path = gr.Image(
......@@ -846,7 +903,11 @@ def main():
step=100,
info="控制应用旋转编码的块大小。较大的值可能提高性能但增加内存使用。仅在'rotary_chunk'勾选时有效。",
)
unload_modules = gr.Checkbox(
label="卸载模块",
value=False,
info="推理后卸载模块(T5、CLIP、DIT等)以减少GPU/CPU内存使用",
)
clean_cuda_cache = gr.Checkbox(
label="清理CUDA内存缓存",
value=False,
......@@ -881,6 +942,11 @@ def main():
value=1.0,
info="控制将多少Dit模型卸载到CPU",
)
t5_cpu_offload = gr.Checkbox(
label="T5 CPU卸载",
value=False,
info="将T5编码器模型卸载到CPU以减少GPU内存使用",
)
t5_offload_granularity = gr.Dropdown(
label="T5编码器卸载粒度",
choices=["model", "block"],
......@@ -969,7 +1035,7 @@ def main():
enable_auto_config.change(
fn=auto_configure,
inputs=[enable_auto_config, model_type, resolution],
inputs=[enable_auto_config, resolution],
outputs=[
torch_compile,
lazy_load,
......@@ -979,6 +1045,8 @@ def main():
cpu_offload,
offload_granularity,
offload_ratio,
t5_cpu_offload,
unload_modules,
t5_offload_granularity,
attention_type,
quant_op,
......@@ -993,11 +1061,16 @@ def main():
use_ret_steps,
],
)
lazy_load.change(
fn=handle_lazy_load_change,
inputs=[lazy_load],
outputs=[unload_modules],
)
if task == "i2v":
infer_btn.click(
fn=run_inference,
inputs=[
model_type,
prompt,
negative_prompt,
save_video_path,
......@@ -1023,6 +1096,8 @@ def main():
cpu_offload,
offload_granularity,
offload_ratio,
t5_cpu_offload,
unload_modules,
t5_offload_granularity,
attention_type,
quant_op,
......@@ -1037,7 +1112,6 @@ def main():
infer_btn.click(
fn=run_inference,
inputs=[
model_type,
prompt,
negative_prompt,
save_video_path,
......@@ -1063,6 +1137,8 @@ def main():
cpu_offload,
offload_granularity,
offload_ratio,
t5_cpu_offload,
unload_modules,
t5_offload_granularity,
attention_type,
quant_op,
......@@ -1086,14 +1162,16 @@ if __name__ == "__main__":
default="wan2.1",
help="要使用的模型类别",
)
parser.add_argument("--model_size", type=str, required=True, choices=["14b", "1.3b"], help="模型大小:14b 或 1.3b")
parser.add_argument("--task", type=str, required=True, choices=["i2v", "t2v"], help="指定任务类型。'i2v'用于图像到视频转换,'t2v'用于文本到视频生成。")
parser.add_argument("--server_port", type=int, default=7862, help="服务器端口")
parser.add_argument("--server_name", type=str, default="0.0.0.0", help="服务器IP")
args = parser.parse_args()
global model_path, model_cls
global model_path, model_cls, model_size
model_path = args.model_path
model_cls = args.model_cls
model_size = args.model_size
task = args.task
main()
......@@ -15,16 +15,19 @@
# Lightx2v project root directory path
# Example: /home/user/lightx2v or /data/video_gen/lightx2v
lightx2v_path=/path/to/lightx2v
# Model path configuration
# Image-to-video model path (for i2v tasks)
# Example: /path/to/Wan2.1-I2V-14B-720P-Lightx2v
i2v_model_path=/path/to/Wan2.1-I2V-14B-720P-Lightx2v
i2v_model_path=/path/to/Wan2.1-I2V-14B-720P-Lightx2v-Step-Distill
# Text-to-video model path (for t2v tasks)
# Example: /path/to/Wan2.1-T2V-1.3B
t2v_model_path=/path/to/Wan2.1-T2V-1.3B
# Model size configuration
# Default model size (14b, 1.3b)
model_size="14b"
# Server configuration
server_name="0.0.0.0"
server_port=8032
......@@ -65,6 +68,10 @@ while [[ $# -gt 0 ]]; do
export CUDA_VISIBLE_DEVICES=$gpu_id
shift 2
;;
--model_size)
model_size="$2"
shift 2
;;
--help)
echo "🎬 Lightx2v Gradio Demo Startup Script"
echo "=========================================="
......@@ -79,6 +86,10 @@ while [[ $# -gt 0 ]]; do
echo " en: English interface"
echo " --port PORT Server port (default: 8032)"
echo " --gpu GPU_ID GPU device ID (default: 0)"
echo " --model_size MODEL_SIZE"
echo " Model size (default: 14b)"
echo " 14b: 14 billion parameters model"
echo " 1.3b: 1.3 billion parameters model"
echo " --help Show this help message"
echo ""
echo "🚀 Usage examples:"
......@@ -86,6 +97,8 @@ while [[ $# -gt 0 ]]; do
echo " $0 --task i2v --lang zh --port 8032 # Start with specified parameters"
echo " $0 --task t2v --lang en --port 7860 # Text-to-video with English interface"
echo " $0 --task i2v --gpu 1 --port 8032 # Use GPU 1"
echo " $0 --task t2v --model_size 1.3b # Use 1.3B model"
echo " $0 --task i2v --model_size 14b # Use 14B model"
echo ""
echo "📝 Notes:"
echo " - Edit script to configure model paths before first use"
......@@ -113,6 +126,12 @@ if [[ "$lang" != "zh" && "$lang" != "en" ]]; then
exit 1
fi
# Validate model size
if [[ "$model_size" != "14b" && "$model_size" != "1.3b" ]]; then
echo "Error: Model size must be '14b' or '1.3b'"
exit 1
fi
# Select model path based on task type
if [[ "$task" == "i2v" ]]; then
model_path=$i2v_model_path
......@@ -161,6 +180,7 @@ echo "=========================================="
echo "📁 Project path: $lightx2v_path"
echo "🤖 Model path: $model_path"
echo "🎯 Task type: $task"
echo "🤖 Model size: $model_size"
echo "🌏 Interface language: $lang"
echo "🖥️ GPU device: $gpu_id"
echo "🌐 Server address: $server_name:$server_port"
......@@ -190,7 +210,8 @@ python $demo_file \
--model_path "$model_path" \
--task "$task" \
--server_name "$server_name" \
--server_port "$server_port"
--server_port "$server_port" \
--model_size "$model_size"
# Display final system resource usage
echo ""
......
......@@ -11,6 +11,7 @@
"sample_shift": 5,
"enable_cfg": true,
"cpu_offload": true,
"t5_cpu_offload": true,
"offload_granularity": "block",
"mm_config": {
"mm_type": "W-int8-channel-sym-A-int8-channel-sym-dynamic-Vllm",
......
{
"infer_steps": 4,
"target_video_length": 81,
"target_height": 480,
"target_width": 832,
"self_attn_1_type": "sage_attn2",
"cross_attn_1_type": "sage_attn2",
"cross_attn_2_type": "sage_attn2",
"seed": 42,
"sample_guide_scale": 5,
"sample_shift": 5,
"enable_cfg": true,
"t5_cpu_offload": true,
"t5_offload_granularity": "block",
"t5_quantized": true,
"t5_quantized_ckpt": "/path/to/models_t5_umt5-xxl-enc-fp8.pth",
"t5_quant_scheme": "fp8",
"unload_modules": true,
"use_tiling_vae": true
}
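For reference, a hedged sketch of how a config file like the one above could be loaded and selectively overridden before building a runner; the file path and the override values are assumptions for illustration, not values taken from this commit:

```python
import json

# Hypothetical path; the commit shows the file contents, not its name.
with open("configs/offload/t2v_1_3b_t5_fp8_offload.json") as f:
    cfg = json.load(f)

# UI or CLI choices can then override individual offload switches, e.g.:
cfg.update({"t5_cpu_offload": True, "unload_modules": True})
print(cfg["t5_quant_scheme"])  # -> "fp8"
```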
......@@ -13,6 +13,7 @@
"enable_cfg": true,
"cpu_offload": true,
"offload_granularity": "block",
"t5_cpu_offload": true,
"mm_config": {
"mm_type": "W-int8-channel-sym-A-int8-channel-sym-dynamic-Vllm",
"weight_auto_quant": true
......
......@@ -18,6 +18,7 @@
"mm_type": "W-fp8-channel-sym-A-fp8-channel-sym-dynamic-Vllm",
"weight_auto_quant": false
},
"t5_cpu_offload": true,
"t5_quantized": true,
"t5_quantized_ckpt": "/path/to/models_t5_umt5-xxl-enc-fp8.pth",
"t5_quant_scheme": "fp8",
......
......@@ -18,6 +18,7 @@
"mm_type": "W-fp8-channel-sym-A-fp8-channel-sym-dynamic-Vllm",
"weight_auto_quant": false
},
"t5_cpu_offload": true,
"t5_quantized": true,
"t5_quantized_ckpt": "/path/to/models_t5_umt5-xxl-enc-fp8.pth",
"t5_quant_scheme": "fp8",
......
......@@ -12,6 +12,7 @@
"enable_cfg": true,
"cpu_offload": true,
"offload_granularity": "phase",
"t5_cpu_offload": true,
"t5_offload_granularity": "block",
"dit_quantized_ckpt": "/path/to/dit_int8",
"mm_config": {
......
......@@ -12,6 +12,7 @@
"sample_shift": 8,
"enable_cfg": true,
"cpu_offload": true,
"t5_cpu_offload": true,
"offload_granularity": "phase",
"dit_quantized_ckpt": "/path/to/dit_int8",
"mm_config": {
......
......@@ -15,7 +15,7 @@ This project contains two main demo files:
- Python 3.10+ (recommended)
- CUDA 12.4+ (recommended)
- At least 8GB GPU VRAM
- At least 16GB system memory
- At least 16GB system memory (preferably at least 32GB)
- At least 128GB of SSD storage (**💾 Strongly recommended: store model files on an SSD! With "lazy load" startup, this significantly improves model loading speed and inference performance**)
### Install Dependencies
......@@ -80,8 +80,9 @@ vim run_gradio.sh
bash run_gradio.sh
# 3. Or start with parameters (recommended)
bash run_gradio.sh --task i2v --lang en --port 8032
# bash run_gradio.sh --task t2v --lang en --port 8032
bash run_gradio.sh --task i2v --lang en --model_size 14b --port 8032
# bash run_gradio.sh --task t2v --lang en --model_size 14b --port 8032
# bash run_gradio.sh --task i2v --lang en --model_size 1.3b --port 8032
```
#### Method 2: Direct Command Line Startup
......@@ -90,6 +91,7 @@ bash run_gradio.sh --task i2v --lang en --port 8032
```bash
python gradio_demo.py \
--model_path /path/to/Wan2.1-I2V-14B-720P-Lightx2v \
--model_size 14b \
--task i2v \
--server_name 0.0.0.0 \
--server_port 7862
......@@ -99,6 +101,7 @@ python gradio_demo.py \
```bash
python gradio_demo.py \
--model_path /path/to/Wan2.1-T2V-1.3B \
--model_size 1.3b \
--task t2v \
--server_name 0.0.0.0 \
--server_port 7862
......@@ -108,6 +111,7 @@ python gradio_demo.py \
```bash
python gradio_demo_zh.py \
--model_path /path/to/model \
--model_size 14b \
--task i2v \
--server_name 0.0.0.0 \
--server_port 7862
......@@ -119,6 +123,7 @@ python gradio_demo_zh.py \
|-----------|------|----------|---------|-------------|
| `--model_path` | str | ✅ | - | Model folder path |
| `--model_cls` | str | ❌ | wan2.1 | Model class (currently only supports wan2.1) |
| `--model_size` | str | ✅ | - | Model size: `14b` (t2v or i2v) or `1.3b` (t2v only) |
| `--task` | str | ✅ | - | Task type: `i2v` (image-to-video) or `t2v` (text-to-video) |
| `--server_port` | int | ❌ | 7862 | Server port |
| `--server_name` | str | ❌ | 0.0.0.0 | Server IP address |
......@@ -127,10 +132,6 @@ python gradio_demo_zh.py \
### Basic Settings
#### Model Type Selection
- **Wan2.1 14B**: Large parameter count, high generation quality, suitable for high-quality video generation
- **Wan2.1 1.3B**: Lightweight model, fast speed, suitable for rapid prototyping and testing
#### Input Parameters
- **Prompt**: Describe the expected video content
- **Negative Prompt**: Specify elements you don't want to appear
......@@ -217,7 +218,7 @@ lightx2v/app/
## 🎨 Interface Description
### Basic Settings Tab
- **Input Parameters**: Model type, prompts, resolution, and other basic settings
- **Input Parameters**: Prompts, resolution, and other basic settings
- **Video Parameters**: FPS, frame count, CFG, and other video generation parameters
- **Output Settings**: Video save path configuration
......
......@@ -15,7 +15,7 @@ Lightx2v 是一个轻量级的视频推理和生成引擎,提供了基于 Grad
- Python 3.10+ (推荐)
- CUDA 12.4+ (推荐)
- 至少 8GB GPU 显存
- 至少 16GB 系统内存
- 至少 16GB 系统内存(最好最少有 32G)
- 至少 128GB SSD固态硬盘 (**💾 强烈建议使用SSD固态硬盘存储模型文件!"延迟加载"启动时,显著提升模型加载速度和推理性能**)
......@@ -83,8 +83,9 @@ vim run_gradio.sh
bash run_gradio.sh
# 3. 或使用参数启动(推荐)
bash run_gradio.sh --task i2v --lang zh --port 8032
# bash run_gradio.sh --task t2v --lang zh --port 8032
bash run_gradio.sh --task i2v --lang zh --model_size 14b --port 8032
# bash run_gradio.sh --task t2v --lang zh --model_size 14b --port 8032
# bash run_gradio.sh --task i2v --lang zh --model_size 1.3b --port 8032
```
#### 方式二:直接命令行启动
......@@ -93,6 +94,7 @@ bash run_gradio.sh --task i2v --lang zh --port 8032
```bash
python gradio_demo_zh.py \
--model_path /path/to/Wan2.1-I2V-14B-720P-Lightx2v \
--model_size 14b \
--task i2v \
--server_name 0.0.0.0 \
--server_port 7862
......@@ -102,6 +104,7 @@ python gradio_demo_zh.py \
```bash
python gradio_demo_zh.py \
--model_path /path/to/Wan2.1-T2V-1.3B \
--model_size 1.3b \
--task t2v \
--server_name 0.0.0.0 \
--server_port 7862
......@@ -111,6 +114,7 @@ python gradio_demo_zh.py \
```bash
python gradio_demo.py \
--model_path /path/to/model \
--model_size 14b \
--task i2v \
--server_name 0.0.0.0 \
--server_port 7862
......@@ -122,6 +126,7 @@ python gradio_demo.py \
|------|------|------|--------|------|
| `--model_path` | str | ✅ | - | 模型文件夹路径 |
| `--model_cls` | str | ❌ | wan2.1 | 模型类别(目前仅支持wan2.1) |
| `--model_size` | str | ✅ | - | 模型大小:`14b`(图像到视频或文本到视频)或 `1.3b`(仅文本到视频) |
| `--task` | str | ✅ | - | 任务类型:`i2v`(图像到视频)或 `t2v`(文本到视频) |
| `--server_port` | int | ❌ | 7862 | 服务器端口 |
| `--server_name` | str | ❌ | 0.0.0.0 | 服务器IP地址 |
......@@ -130,10 +135,6 @@ python gradio_demo.py \
### 基本设置
#### 模型类型选择
- **Wan2.1 14B**: 参数量大,生成质量高,适合高质量视频生成
- **Wan2.1 1.3B**: 轻量级模型,速度快,适合快速原型和测试
#### 输入参数
- **提示词 (Prompt)**: 描述期望的视频内容
- **负向提示词 (Negative Prompt)**: 指定不希望出现的元素
......@@ -221,7 +222,7 @@ lightx2v/app/
## 🎨 界面说明
### 基本设置标签页
- **输入参数**: 模型类型、提示词、分辨率等基本设置
- **输入参数**: 提示词、分辨率等基本设置
- **视频参数**: FPS、帧数、CFG等视频生成参数
- **输出设置**: 视频保存路径配置
......
......@@ -15,6 +15,7 @@ class WeightAsyncStreamManager(object):
self.cuda_load_stream = torch.cuda.Stream(priority=0)
self.offload_block_num = int(offload_ratio * blocks_num)
self.phases_num = phases_num
self.block_nums = blocks_num
self.offload_phases_num = blocks_num * phases_num * offload_ratio
def prefetch_weights(self, block_idx, blocks_weights):
......@@ -128,6 +129,9 @@ class LazyWeightAsyncStreamManager(WeightAsyncStreamManager):
if next_block_idx < 0:
next_block_idx = 0
if next_block_idx == self.block_nums:
return
if self.offload_gra == "phase":
for phase_idx in range(self.phases_num):
obj_key = (next_block_idx, phase_idx)
......@@ -170,6 +174,8 @@ class LazyWeightAsyncStreamManager(WeightAsyncStreamManager):
self.pin_memory_buffer.push(block_idx, block)
block_idx += 1
if block_idx == self.block_nums:
break
def prefetch_weights_from_disk(self, blocks):
if self.initial_prefetch_done:
......
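The `block_nums` bookkeeping and the early `return`/`break` added above guard against prefetching past the last transformer block; a small standalone sketch of that boundary check (the function name is illustrative):

```python
# Illustrative boundary check: with block_nums blocks indexed 0..block_nums-1,
# prefetching one block ahead must stop once the next index would run off the
# end, which is what the added `== self.block_nums` guards ensure.
def next_prefetch_index(block_idx: int, block_nums: int):
    nxt = block_idx + 1
    if nxt >= block_nums:
        return None  # nothing left to prefetch
    return nxt
```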
......@@ -56,3 +56,10 @@ class Conv2dWeight(Conv2dWeightTemplate):
if self.bias is not None:
destination[self.bias_name] = self.bias.cpu().detach().clone()
return destination
def clear(self):
attrs = ["weight", "bias"]
for attr in attrs:
if hasattr(self, attr):
delattr(self, attr)
setattr(self, attr, None)
......@@ -66,3 +66,10 @@ class Conv3dWeight(Conv3dWeightTemplate):
if self.bias is not None:
destination[self.bias_name] = self.bias.cpu().detach().clone()
return destination
def clear(self):
attrs = ["weight", "bias"]
for attr in attrs:
if hasattr(self, attr):
delattr(self, attr)
setattr(self, attr, None)
......@@ -34,9 +34,11 @@ class LNWeightTemplate(metaclass=ABCMeta):
return self.weight.numel() * self.weight.element_size()
def clear(self):
del self.weight
if self.bias is not None:
del self.bias
attrs = ["weight", "bias"]
for attr in attrs:
if hasattr(self, attr):
delattr(self, attr)
setattr(self, attr, None)
@abstractmethod
def apply(self, input_tensor):
......
......@@ -23,7 +23,11 @@ class RMSWeightTemplate(metaclass=ABCMeta):
self.pinned_weight = torch.empty(self.weight.shape, pin_memory=True, dtype=self.weight.dtype)
def clear(self):
del self.weight
attrs = ["weight"]
for attr in attrs:
if hasattr(self, attr):
delattr(self, attr)
setattr(self, attr, None)
@abstractmethod
def apply(self, input_tensor):
......
......@@ -22,7 +22,11 @@ class DefaultTensor:
self.pinned_tensor = torch.empty(self.tensor.shape, pin_memory=True, dtype=self.tensor.dtype)
def clear(self):
del self.tensor
attrs = ["tensor"]
for attr in attrs:
if hasattr(self, attr):
delattr(self, attr)
setattr(self, attr, None)
def _calculate_size(self):
return self.tensor.numel() * self.tensor.element_size()
......
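The same `clear()` pattern is applied to each weight wrapper above; as a standalone sketch (the class name is illustrative, not from the repo), it deletes the attribute and rebinds it to `None`, so later `is None` checks keep working while the underlying tensor becomes eligible for garbage collection:

```python
class _WeightHolder:  # illustrative only
    def __init__(self, weight, bias=None):
        self.weight, self.bias = weight, bias

    def clear(self):
        # delattr drops the reference; setattr(None) keeps the attribute
        # present so code such as `if self.bias is not None:` still works.
        for attr in ("weight", "bias"):
            if hasattr(self, attr):
                delattr(self, attr)
            setattr(self, attr, None)
```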