         # Set chunked prefill size, which depends on the gpu memory capacity
...
...
@@ -276,9 +276,9 @@ class ServerArgs:
             )
             self.page_size = 128
-        # Set CUDA graph max batch size
+        # Set cuda graph max batch size
         if self.cuda_graph_max_bs is None:
-            # Based on detailed statistics, when serving TP1/TP2 models on lower-end GPUs with HBM<25G, you can either disable CUDA graph or set `cuda_graph_max_bs` to a very small value to reduce the memory overhead of creating CUDA graphs, with almost no impact on performance. However, when serving models with TP4 or TP8, we need to enable CUDA graph to maintain high performance. In this case, we can set `cuda_graph_max_bs` to 80 (half of the default value 160) to reduce the memory overhead of creating CUDA graphs. Looking at the logs from TP4 serving of qwen2-72b, a value of 80 is sufficient and can reduce the memory overhead of creating CUDA graphs on lower-end GPUs compared to the original 160, avoiding OOM issues.
+            # Based on detailed statistics, when serving TP1/TP2 models on lower-end GPUs with HBM<25G, you can either disable cuda graph or set `cuda_graph_max_bs` to a very small value to reduce the memory overhead of creating cuda graphs, with almost no impact on performance. However, when serving models with TP4 or TP8, we need to enable cuda graph to maintain high performance. In this case, we can set `cuda_graph_max_bs` to 80 (half of the default value 160) to reduce the memory overhead of creating cuda graphs. Looking at the logs from TP4 serving of qwen2-72b, a value of 80 is sufficient and can reduce the memory overhead of creating cuda graphs on lower-end GPUs compared to the original 160, avoiding OOM issues.
             if gpu_mem is not None and gpu_mem < 25_000:
                 if self.tp_size < 4:
                     self.cuda_graph_max_bs = 8
...
...
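For reference, here is a minimal standalone sketch of the sizing heuristic encoded in the hunk above. The function name and the `gpu_mem` (MB) / `tp_size` parameters are hypothetical stand-ins for the real `ServerArgs` fields; the thresholds come directly from the diff.

```python
from typing import Optional

def pick_cuda_graph_max_bs(gpu_mem: Optional[int], tp_size: int) -> Optional[int]:
    """Sketch of the auto-tuning rule for `cuda_graph_max_bs` above."""
    # Lower-end GPUs (HBM < 25 GB): shrink the memory overhead of cuda graphs.
    if gpu_mem is not None and gpu_mem < 25_000:
        if tp_size < 4:
            # TP1/TP2: cuda graphs add almost nothing here, so keep them tiny.
            return 8
        # TP4/TP8: cuda graphs are needed for performance; 80 (half of the
        # default 160) was sufficient in the qwen2-72b TP4 logs cited above.
        return 80
    # Larger GPUs: leave unset so the default (160) applies.
    return None
```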
@@ -729,7 +729,7 @@ class ServerArgs:
"--download-dir",
type=str,
default=ServerArgs.download_dir,
help="Model download directory for HuggingFace.",
help="Model download directory for huggingface.",
)
parser.add_argument(
"--base-gpu-id",
...
...
@@ -1024,12 +1024,12 @@ class ServerArgs:
         parser.add_argument(
             "--disable-cuda-graph",
             action="store_true",
-            help="Disable CUDA graph.",
+            help="Disable cuda graph.",
         )
         parser.add_argument(
             "--disable-cuda-graph-padding",
             action="store_true",
-            help="Disable CUDA graph when padding is needed. Still uses CUDA graph when padding is not needed.",
+            help="Disable cuda graph when padding is needed. Still uses cuda graph when padding is not needed.",
         )
         parser.add_argument(
             "--enable-nccl-nvls",
...
...
@@ -1075,7 +1075,7 @@ class ServerArgs:
         parser.add_argument(
             "--enable-ep-moe",
             action="store_true",
-            help="Enabling expert parallelism for MoE. The ep size is equal to the tp size.",
+            help="Enabling expert parallelism for moe. The ep size is equal to the tp size.",
         )
         parser.add_argument(
             "--enable-torch-compile",
...
...
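A side note on `--enable-ep-moe` above: the help text states that the EP size equals the TP size. The sketch below illustrates that relationship only; `effective_ep_size` is a hypothetical helper, not the actual SGLang wiring.

```python
def effective_ep_size(tp_size: int, enable_ep_moe: bool) -> int:
    # With expert parallelism enabled, experts are sharded across the same
    # GPU group used for tensor parallelism, so ep_size == tp_size.
    return tp_size if enable_ep_moe else 1

assert effective_ep_size(4, True) == 4   # TP4 serving -> EP size 4
assert effective_ep_size(8, False) == 1  # flag off -> no expert sharding
```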
@@ -1092,13 +1092,13 @@ class ServerArgs:
"--cuda-graph-max-bs",
type=int,
default=ServerArgs.cuda_graph_max_bs,
help="Set the maximum batch size for CUDA graph. It will extend the CUDA graph capture batch size to this value.",
help="Set the maximum batch size for cuda graph. It will extend the cuda graph capture batch size to this value.",
)
parser.add_argument(
"--cuda-graph-bs",
type=int,
nargs="+",
help="Set the list of batch sizes for CUDA graph.",
help="Set the list of batch sizes for cuda graph.",