Unverified Commit 61c299f8 authored by Chen Xia's avatar Chen Xia Committed by GitHub
Browse files

[Misc]add configurable cuda graph size (#17201)


Signed-off-by: default avatarCXIAAAAA <cxia0209@gmail.com>
Co-authored-by: default avatarCyrus Leung <cyrus.tl.leung@gmail.com>
Co-authored-by: default avatarHarry Mellor <19981378+hmellor@users.noreply.github.com>
parent 4acfa335
...@@ -1865,6 +1865,13 @@ class SchedulerConfig: ...@@ -1865,6 +1865,13 @@ class SchedulerConfig:
This config has no static default. If left unspecified by the user, it will This config has no static default. If left unspecified by the user, it will
be set in `EngineArgs.create_engine_config` based on the usage context.""" be set in `EngineArgs.create_engine_config` based on the usage context."""
cuda_graph_sizes: list[int] = field(default_factory=lambda: [512])
"""Cuda graph capture sizes, default is 512.
1. if one value is provided, then the capture list would follow the pattern:
[1, 2, 4] + [i for i in range(8, cuda_graph_sizes + 1, 8)]
2. more than one value (e.g. 1 2 128) is provided,
then the capture list will follow the provided list."""
max_num_seqs: int = None # type: ignore max_num_seqs: int = None # type: ignore
"""Maximum number of sequences to be processed in a single iteration. """Maximum number of sequences to be processed in a single iteration.
...@@ -4235,13 +4242,20 @@ class VllmConfig: ...@@ -4235,13 +4242,20 @@ class VllmConfig:
batch_size_capture_list = [] batch_size_capture_list = []
if self.model_config is not None and \ if self.model_config is not None and \
not self.model_config.enforce_eager: not self.model_config.enforce_eager:
batch_size_capture_list = [1, 2, 4 cuda_graph_sizes = self.scheduler_config.cuda_graph_sizes
] + [i for i in range(8, 513, 8)] if len(cuda_graph_sizes) == 1:
batch_size_capture_list = [1, 2, 4] + [
i for i in range(8, cuda_graph_sizes[0] + 1, 8)
]
elif len(cuda_graph_sizes) > 1:
batch_size_capture_list = sorted(cuda_graph_sizes)
else:
raise TypeError(
f"Invalid value for {cuda_graph_sizes=}.")
if self.parallel_config.tensor_parallel_size > 1 and \ if self.parallel_config.tensor_parallel_size > 1 and \
self.compilation_config.pass_config.enable_sequence_parallelism: self.compilation_config.pass_config.enable_sequence_parallelism:
batch_size_capture_list = \ batch_size_capture_list = \
self.update_sizes_for_sequence_parallelism(batch_size_capture_list) self.update_sizes_for_sequence_parallelism(batch_size_capture_list)
max_num_tokens = self.scheduler_config.max_num_batched_tokens max_num_tokens = self.scheduler_config.max_num_batched_tokens
batch_size_capture_list = [ batch_size_capture_list = [
size for size in batch_size_capture_list size for size in batch_size_capture_list
......
...@@ -231,6 +231,8 @@ class EngineArgs: ...@@ -231,6 +231,8 @@ class EngineArgs:
kv_cache_dtype: CacheDType = CacheConfig.cache_dtype kv_cache_dtype: CacheDType = CacheConfig.cache_dtype
seed: Optional[int] = ModelConfig.seed seed: Optional[int] = ModelConfig.seed
max_model_len: Optional[int] = ModelConfig.max_model_len max_model_len: Optional[int] = ModelConfig.max_model_len
cuda_graph_sizes: list[int] = get_field(SchedulerConfig,
"cuda_graph_sizes")
# Note: Specifying a custom executor backend by passing a class # Note: Specifying a custom executor backend by passing a class
# is intended for expert use only. The API may change without # is intended for expert use only. The API may change without
# notice. # notice.
...@@ -711,6 +713,8 @@ class EngineArgs: ...@@ -711,6 +713,8 @@ class EngineArgs:
scheduler_group.add_argument( scheduler_group.add_argument(
"--max-long-partial-prefills", "--max-long-partial-prefills",
**scheduler_kwargs["max_long_partial_prefills"]) **scheduler_kwargs["max_long_partial_prefills"])
scheduler_group.add_argument('--cuda-graph-sizes',
**scheduler_kwargs["cuda_graph_sizes"])
scheduler_group.add_argument( scheduler_group.add_argument(
"--long-prefill-token-threshold", "--long-prefill-token-threshold",
**scheduler_kwargs["long_prefill_token_threshold"]) **scheduler_kwargs["long_prefill_token_threshold"])
...@@ -1042,6 +1046,7 @@ class EngineArgs: ...@@ -1042,6 +1046,7 @@ class EngineArgs:
max_num_batched_tokens=self.max_num_batched_tokens, max_num_batched_tokens=self.max_num_batched_tokens,
max_num_seqs=self.max_num_seqs, max_num_seqs=self.max_num_seqs,
max_model_len=model_config.max_model_len, max_model_len=model_config.max_model_len,
cuda_graph_sizes=self.cuda_graph_sizes,
num_lookahead_slots=num_lookahead_slots, num_lookahead_slots=num_lookahead_slots,
delay_factor=self.scheduler_delay_factor, delay_factor=self.scheduler_delay_factor,
enable_chunked_prefill=self.enable_chunked_prefill, enable_chunked_prefill=self.enable_chunked_prefill,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment