Commit d126ce21 authored by zhuwenwen's avatar zhuwenwen
Browse files

Add VLLM_USE_CUDA_GRAPH_SIZES to capture CUDA graphs for batch sizes 1-18, 24, 32, then every multiple of 8 from 40 up (instead of only 1, 2, 4, 8, 16, ...)

parent 77fccdf4
......@@ -4761,9 +4761,14 @@ class VllmConfig:
else:
cuda_graph_sizes = self.scheduler_config.cuda_graph_sizes
if len(cuda_graph_sizes) == 1:
batch_size_capture_list = [1, 2, 4] + [
i for i in range(8, cuda_graph_sizes[0] + 1, 8)
]
if not envs.VLLM_USE_CUDA_GRAPH_SIZES:
batch_size_capture_list = [1, 2, 4] + [
i for i in range(8, cuda_graph_sizes[0] + 1, 8)
]
else:
batch_size_capture_list = list(range(1, 19)) + [24, 32] + [
i for i in range(40, cuda_graph_sizes[0] + 1, 8)
]
elif len(cuda_graph_sizes) > 1:
batch_size_capture_list = sorted(cuda_graph_sizes)
else:
......
......@@ -182,6 +182,7 @@ if TYPE_CHECKING:
VLLM_USE_LIGHTOP_FILL_MOE_ALIGN: bool = False
USE_FUSED_CUSTOM_ALL_REDUCE_RMS_QUANT: bool = False
VLLM_USE_PP_BALANCE: bool = False
VLLM_USE_CUDA_GRAPH_SIZES: bool = False
def get_default_cache_root():
return os.getenv(
......@@ -1151,32 +1152,44 @@ environment_variables: dict[str, Callable[[], Any]] = {
# vllm pd separation will be used async
"VLLM_P2P_ASYNC":
lambda: bool(int(os.getenv("VLLM_P2P_ASYNC", "0"))),
# pd separation p2p async buf tokens
"VLLM_P2P_BUF_TOKENS":
lambda: int(os.getenv("VLLM_P2P_BUF_TOKENS", "30000")),
# vllm will enable minimal injection for pipeline parallel scheduling
"VLLM_SCHED_ENABLE_MINIMAL_INJECTION":
lambda: (os.getenv("VLLM_SCHED_ENABLE_MINIMAL_INJECTION", "0").lower() in
("true", "1")),
# vLLM will split prefill and decode, not mix up
"VLLM_USE_PD_SPLIT":
lambda: (os.environ.get("VLLM_USE_PD_SPLIT", "False").lower() in
("true", "1")),
# vLLM will sync to avoid pp vmfault
"VLLM_USE_PP_SYNC":
lambda: (os.environ.get("VLLM_USE_PP_SYNC", "False").lower() in
("true", "1")),
# vLLM will use lightop to fuse fill and moe align
"VLLM_USE_LIGHTOP_FILL_MOE_ALIGN":
lambda: (os.environ.get("VLLM_USE_LIGHTOP_FILL_MOE_ALIGN", "False").lower() in
("true", "1")),
# vllm will use custom-allreduce rmsquant fused op
"USE_FUSED_CUSTOM_ALL_REDUCE_RMS_QUANT":
lambda: (os.getenv('USE_FUSED_CUSTOM_ALL_REDUCE_RMS_QUANT', '0').lower() in
("true", "1")),
"VLLM_USE_PP_BALANCE":
lambda: (os.getenv('VLLM_USE_PP_BALANCE', '1').lower() in
("true", "1")),
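# When enabled and a single cuda_graph_sizes value is given, the batch-size
# capture list becomes 1..18, 24, 32, then every multiple of 8 from 40 up to
# that value (instead of the default 1, 2, 4, then multiples of 8).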
"VLLM_USE_CUDA_GRAPH_SIZES":
lambda: (os.getenv('VLLM_USE_CUDA_GRAPH_SIZES', 'False').lower() in
("true", "1")),
}
# --8<-- [end:env-vars-definition]
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment