Commit 5a14b60c authored by wangmin6
Browse files

Merge branch 'v0.15.1-dev_t2' into 'v0.15.1-dev'

增加max_cudagraph_capture_size,细化capture的范围

See merge request dcutoolkit/deeplearing/vllm!494
parents 8781f412 1b71f522
......@@ -1157,9 +1157,14 @@ class VllmConfig:
# sort to make sure the sizes are in ascending order
cudagraph_capture_sizes.sort()
else:
cudagraph_capture_sizes = [
i for i in [1, 2, 4] if i <= max_cudagraph_capture_size
]
if not envs.VLLM_USE_CUDA_GRAPH_SIZES:
cudagraph_capture_sizes = [
i for i in [1, 2, 4] if i <= max_cudagraph_capture_size
]
else:
cudagraph_capture_sizes = list(range(1, 25)) + [32] + [
i for i in range(40, max_cudagraph_capture_size + 1, 8)
]
if max_cudagraph_capture_size >= 8:
# Step size 8 for small batch sizes, up to 256 (not included)
cudagraph_capture_sizes += list(
......
......@@ -306,6 +306,7 @@ if TYPE_CHECKING:
VLLM_V1_USE_REDUCED_TOPK_TOPP_SAMPLER: bool = False
VLLM_V1_USE_FUSED_QKV_SPLIT_RMS_ROPE_KVSTORE: bool = False
VLLM_USE_FUSED_DTBMM: bool = False # DOUBLE TRANS BMM FP8
VLLM_USE_CUDA_GRAPH_SIZES: bool = False
def get_default_cache_root():
......@@ -1918,6 +1919,10 @@ environment_variables: dict[str, Callable[[], Any]] = {
"VLLM_USE_FUSED_DTBMM":
lambda: (os.environ.get("VLLM_USE_FUSED_DTBMM", "False").lower() in
("true", "1")),
# If set, vLLM captures CUDA graphs for batch sizes 1-24, 32, 40, 48, ...
# (step 8) rather than only the sparse default set (1, 2, 4, 8, 16, ...).
"VLLM_USE_CUDA_GRAPH_SIZES":
lambda: (os.getenv("VLLM_USE_CUDA_GRAPH_SIZES", "False").lower() in
("true", "1")),
}
# --8<-- [end:env-vars-definition]
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment