Commit 2b47bce9 authored by zhuwenwen's avatar zhuwenwen
Browse files

Enable VLLM_USE_CUDA_GRAPH_SIZES by default (=1) to capture batch sizes 1-24... (not only 1, 2, 4, 8, 16)

Set VLLM_USE_LIGHTOP_FILL_MOE_ALIGN=1, VLLM_USE_OPT_ZEROS=1 and VLLM_USE_PP_SYNC=1 as the defaults.
parent ce755d66
...@@ -4766,7 +4766,7 @@ class VllmConfig: ...@@ -4766,7 +4766,7 @@ class VllmConfig:
i for i in range(8, cuda_graph_sizes[0] + 1, 8) i for i in range(8, cuda_graph_sizes[0] + 1, 8)
] ]
else: else:
batch_size_capture_list = list(range(1, 19)) + [24, 32] + [ batch_size_capture_list = list(range(1, 25)) + [32] + [
i for i in range(40, cuda_graph_sizes[0] + 1, 8) i for i in range(40, cuda_graph_sizes[0] + 1, 8)
] ]
elif len(cuda_graph_sizes) > 1: elif len(cuda_graph_sizes) > 1:
......
...@@ -1114,7 +1114,7 @@ environment_variables: dict[str, Callable[[], Any]] = { ...@@ -1114,7 +1114,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
("true", "1")), ("true", "1")),
# vLLM will use elementwise not triton_ # vLLM will use elementwise not triton_
"VLLM_USE_OPT_ZEROS": "VLLM_USE_OPT_ZEROS":
lambda: (os.environ.get("VLLM_USE_OPT_ZEROS", "False").lower() in lambda: (os.environ.get("VLLM_USE_OPT_ZEROS", "True").lower() in
("true", "1")), ("true", "1")),
# vLLM will use opt cat for deepseek-v3 # vLLM will use opt cat for deepseek-v3
"VLLM_USE_OPT_CAT": "VLLM_USE_OPT_CAT":
...@@ -1170,12 +1170,12 @@ environment_variables: dict[str, Callable[[], Any]] = { ...@@ -1170,12 +1170,12 @@ environment_variables: dict[str, Callable[[], Any]] = {
# vLLM will sync to avoid pp vmfault # vLLM will sync to avoid pp vmfault
"VLLM_USE_PP_SYNC": "VLLM_USE_PP_SYNC":
lambda: (os.environ.get("VLLM_USE_PP_SYNC", "False").lower() in lambda: (os.environ.get("VLLM_USE_PP_SYNC", "True").lower() in
("true", "1")), ("true", "1")),
# vLLM will use lightop to fuse fill and moe align # vLLM will use lightop to fuse fill and moe align
"VLLM_USE_LIGHTOP_FILL_MOE_ALIGN": "VLLM_USE_LIGHTOP_FILL_MOE_ALIGN":
lambda: (os.environ.get("VLLM_USE_LIGHTOP_FILL_MOE_ALIGN", "False").lower() in lambda: (os.environ.get("VLLM_USE_LIGHTOP_FILL_MOE_ALIGN", "True").lower() in
("true", "1")), ("true", "1")),
# vllm will use custom-allreduce rmsquant fused op # vllm will use custom-allreduce rmsquant fused op
...@@ -1191,9 +1191,9 @@ environment_variables: dict[str, Callable[[], Any]] = { ...@@ -1191,9 +1191,9 @@ environment_variables: dict[str, Callable[[], Any]] = {
lambda: (os.getenv('VLLM_USE_ZERO_MTP', '1').lower() in lambda: (os.getenv('VLLM_USE_ZERO_MTP', '1').lower() in
("true", "1")), ("true", "1")),
# vllm will use 1-18... (not only 1 2 4 8 16) # vllm will use 1-24... (not only 1 2 4 8 16 24)
"VLLM_USE_CUDA_GRAPH_SIZES": "VLLM_USE_CUDA_GRAPH_SIZES":
lambda: (os.getenv('VLLM_USE_CUDA_GRAPH_SIZES', 'False').lower() in lambda: (os.getenv('VLLM_USE_CUDA_GRAPH_SIZES', 'True').lower() in
("true", "1")), ("true", "1")),
} }
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment