info="Enable classifier guidance to control prompt strength",
info="Auto-tune optimization settings for your GPU",
)
cfg_scale=gr.Slider(
label="CFG Scale",
minimum=1,
maximum=100,
step=1,
value=5,
info="Controls the influence strength of the prompt. Higher values mean stronger influence",
)
)
gr.Markdown("### Memory Optimization")
gr.Markdown("### GPU Memory Optimization")
withgr.Row():
withgr.Row():
lazy_load=gr.Checkbox(
rotary_chunk=gr.Checkbox(
label="Enable Lazy Loading",
label="Chunked Rotary Position Embedding",
value=False,
info="Lazily load model components during inference, suitable for memory-constrained environments",
)
torch_compile=gr.Checkbox(
label="Enable Torch Compile",
value=False,
info="Use torch.compile to accelerate the inference process",
)
use_expandable_alloc=gr.Checkbox(
label="Enable Expandable Memory Allocation",
value=False,
value=False,
info="Helps reduce memory fragmentation",
info="When enabled, processes rotary position embeddings in chunks to save GPU memory.",
)
)
rotary_chunk=gr.Checkbox(
rotary_chunk_size=gr.Slider(
label="Chunked Rotary Position Encoding",
label="Rotary Embedding Chunk Size",
value=False,
value=100,
info="When enabled, uses chunked processing for rotary position encoding to save memory.",
minimum=100,
maximum=10000,
step=100,
info="Controls the chunk size for applying rotary embeddings. Larger values may improve performance but increase memory usage. Only effective if 'rotary_chunk' is checked.",
)
)
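    # A minimal sketch of what chunked rotary-embedding application usually looks like, assuming
    # a torch backend; `apply_rotary_emb` and the [batch, seq, heads, dim] layout are illustrative,
    # not this project's actual implementation:
    #
    #     def apply_rotary_chunked(x, freqs, chunk_size):
    #         out = torch.empty_like(x)
    #         for start in range(0, x.shape[1], chunk_size):
    #             end = start + chunk_size
    #             out[:, start:end] = apply_rotary_emb(x[:, start:end], freqs[start:end])
    #         return out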
    clean_cuda_cache = gr.Checkbox(
        label="Clean CUDA Memory Cache",
        value=False,
        info="When enabled, frees up GPU memory promptly but slows down inference.",
    )
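    # Presumably this toggles periodic torch.cuda.empty_cache() (and possibly gc.collect())
    # calls between denoising steps; the exact hook is an assumption, not shown in this excerpt.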
gr.Markdown("### Asynchronous Offloading")
withgr.Row():
withgr.Row():
cpu_offload=gr.Checkbox(
cpu_offload=gr.Checkbox(
label="CPU Offload",
label="CPU Offloading",
value=False,
info="Offload parts of the model computation from GPU to CPU to reduce GPU memory usage",
)
lazy_load=gr.Checkbox(
label="Enable Lazy Loading",
value=False,
value=False,
info="Offload part of the model computation from GPU to CPU to reduce video memory usage",
info="Lazy load model components during inference. Requires CPU loading and DIT quantization.",
)
)
    offload_granularity = gr.Dropdown(
        label="Dit Offload Granularity",
        choices=["block", "phase"],
        value="phase",
        info="Sets the granularity of Dit model offloading to CPU: blocks or computational phases",
    )
    offload_ratio = gr.Slider(
        label="Offload ratio for Dit model",
        minimum=0.0,
        maximum=1.0,
        step=0.1,
        value=1.0,
        info="Controls how much of the Dit model is offloaded to the CPU",
    )
    t5_offload_granularity = gr.Dropdown(
        label="T5 Encoder Offload Granularity",
        choices=["model", "block"],
        value="model",
        info="Controls the granularity when offloading the T5 Encoder model to CPU",
    )
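    # A rough sketch of the asynchronous block-wise offloading these options are assumed to
    # configure; `prefetch`, `evict`, and `copy_stream` are illustrative helpers, not real APIs:
    #
    #     prefetch(blocks[0])                      # copy the first block's weights to the GPU
    #     for i, block in enumerate(blocks):
    #         if i + 1 < len(blocks):
    #             with torch.cuda.stream(copy_stream):
    #                 prefetch(blocks[i + 1])      # overlap the next host-to-device copy with compute
    #         x = block(x)
    #         evict(block)                         # release this block's GPU copy back to CPU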
gr.Markdown("### Low-Precision Quantization")
gr.Markdown("### Low-Precision Quantization")
withgr.Row():
withgr.Row():
torch_compile=gr.Checkbox(
label="Torch Compile",
value=False,
info="Use torch.compile to accelerate the inference process",
info="Using a suitable attention operator can accelerate inference",
info="Use appropriate attention operators to accelerate inference",
)
)
    quant_op = gr.Dropdown(
        label="Quantization Matmul Operator",
        choices=[op[1] for op in quant_op_choices],
        value=quant_op_choices[0][1],
        info="Select the quantization matrix multiplication operator to accelerate inference",
        interactive=True,
    )
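    # `quant_op_choices` is defined elsewhere in this file; given the `op[1]` access above it is
    # assumed to be a list of (backend_key, display_name) pairs, e.g. something like:
    #
    #     quant_op_choices = [("vllm", "vllm"), ("sgl", "sgl"), ("q8f", "q8f")]
    #
    # where the display names may differ from the backend keys.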
    dit_quant_scheme = gr.Dropdown(
        label="Dit",
        choices=["fp8", "int8", "bf16"],
        value="bf16",
        info="Quantization precision for the Dit model",
    )
    t5_quant_scheme = gr.Dropdown(
        label="T5 Encoder",
        choices=["fp8", "int8", "bf16"],
        value="bf16",
        info="Quantization precision for the T5 Encoder model",
    )
    clip_quant_scheme = gr.Dropdown(
        label="Clip Encoder",
        choices=["fp8", "int8", "fp16"],
        value="fp16",
        info="Quantization precision for the Clip Encoder",
    )
    precision_mode = gr.Dropdown(
        label="Sensitive Layer Precision",
        choices=["fp32", "bf16"],
        value="fp32",
        info="Select the numerical precision used for sensitive layers.",
    )
gr.Markdown("### Variational Autoencoder (VAE)")
gr.Markdown("### Variational Autoencoder (VAE)")
withgr.Row():
withgr.Row():
use_tiny_vae=gr.Checkbox(
use_tiny_vae=gr.Checkbox(
label="Use Lightweight VAE",
label="Use Tiny VAE",
value=False,
value=False,
info="Use a lightweight VAE model to accelerate the decoding process",
info="Use a lightweight VAE model to accelerate the decoding process",
)
)
    use_tiling_vae = gr.Checkbox(
        label="VAE Tiling Inference",
        value=False,
        info="Use VAE tiling inference to reduce GPU memory usage",
    )
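    # Tiled VAE decoding, roughly: the latent is split into overlapping spatial tiles, each tile
    # is decoded separately, and the overlaps are blended, so peak GPU memory scales with the tile
    # size rather than the full frame. A minimal sketch with illustrative helpers:
    #
    #     tiles = split_into_tiles(latent, tile_size, overlap)
    #     frames = blend_tiles([vae.decode(t) for t in tiles])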
gr.Markdown("### Feature Caching")
gr.Markdown("### Feature Caching")
withgr.Row():
withgr.Row():
enable_teacache=gr.Checkbox(
enable_teacache=gr.Checkbox(
label="Enable Tea Cache",
label="Tea Cache",
value=False,
value=False,
info="Cache features during inference to reduce the number of inference steps",
info="Cache features during inference to reduce the number of inference steps",
)
)
    ...
        value=0.26,
        minimum=0,
        maximum=1,
        info="Higher acceleration may lead to lower quality: setting it to 0.1 gives about 2.0x acceleration, and 0.2 gives about 3.0x acceleration",
    )
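    # The threshold above is assumed to gate a TeaCache-style skip rule (names illustrative):
    #
    #     rel_l1 = (curr_inp - prev_inp).abs().mean() / prev_inp.abs().mean()
    #     accumulated += rel_l1
    #     if accumulated < threshold:    # inputs barely changed -> reuse the cached residual
    #         reuse_cached_output()
    #     else:                          # inputs changed enough -> run the full forward, refresh cache
    #         accumulated = 0.0
    #         run_full_forward_and_cache()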
    use_ret_steps = gr.Checkbox(
        label="Cache Only Key Steps",
        value=False,
        info="When checked, the cache is written only at key steps where the scheduler returns results; when unchecked, it is written at all steps to ensure the highest quality",
    )