Unverified Commit c075702e authored by Matthew Bonanni's avatar Matthew Bonanni Committed by GitHub
Browse files

[Misc][UX] Suppress confusing `num_gpu_blocks` log lines (#40402)


Signed-off-by: default avatarMatthew Bonanni <mbonanni@redhat.com>
Co-authored-by: default avatarmergify[bot] <37929162+mergify[bot]@users.noreply.github.com>
parent 21b086d0
......@@ -820,24 +820,31 @@ def get_max_concurrency_for_kv_cache_config(
return max_concurrency
def may_override_num_blocks(vllm_config: VllmConfig, num_blocks: int) -> int:
def may_override_num_blocks(
vllm_config: VllmConfig, num_blocks: int, suppress_log: bool = False
) -> int:
"""
Override the number of kv cache blocks if `num_gpu_blocks_override` is set.
"""
if vllm_config.cache_config.num_gpu_blocks_override is not None:
num_gpu_blocks_override = vllm_config.cache_config.num_gpu_blocks_override
logger.info(
"Overriding num_gpu_blocks=%d with num_gpu_blocks_override=%d",
num_blocks,
num_gpu_blocks_override,
)
if not suppress_log:
logger.info(
"Overriding num_gpu_blocks=%d with num_gpu_blocks_override=%d",
num_blocks,
num_gpu_blocks_override,
)
num_blocks = num_gpu_blocks_override
return num_blocks
def get_num_blocks(
vllm_config: VllmConfig, num_layers: int, available_memory: int, page_size: int
vllm_config: VllmConfig,
num_layers: int,
available_memory: int,
page_size: int,
suppress_log: bool = False,
) -> int:
"""
Get the number of kv cache blocks.
......@@ -847,10 +854,14 @@ def get_num_blocks(
num_layers: The number of layers
available_memory: Memory available for KV cache in bytes.
page_size: The page size of the KV cache.
suppress_log: Whether to suppress override log messages. Used when creating a
temporary/dummy KV cache config, e.g. during CG memory profiling
"""
num_blocks = int(available_memory // page_size // num_layers)
num_blocks = max(num_blocks, 0)
num_blocks = may_override_num_blocks(vllm_config, num_blocks)
num_blocks = may_override_num_blocks(
vllm_config, num_blocks, suppress_log=suppress_log
)
return num_blocks
......@@ -1082,6 +1093,7 @@ def get_kv_cache_config_from_groups(
vllm_config: VllmConfig,
kv_cache_groups: list[KVCacheGroupSpec],
available_memory: int,
suppress_log: bool = False,
) -> KVCacheConfig:
"""
Generate the KV cache configuration from the KV cache groups and spec
......@@ -1113,7 +1125,9 @@ def get_kv_cache_config_from_groups(
num_blocks = (
available_memory // kv_cache_groups[0].kv_cache_spec.page_size_bytes
)
num_blocks = may_override_num_blocks(vllm_config, num_blocks)
num_blocks = may_override_num_blocks(
vllm_config, num_blocks, suppress_log=suppress_log
)
per_layer_specs = kv_cache_groups[0].kv_cache_spec.kv_cache_specs
kv_cache_tensors = [
KVCacheTensor(
......@@ -1138,7 +1152,11 @@ def get_kv_cache_config_from_groups(
)
assert group_size > 0, "group_size must be greater than 0"
num_blocks = get_num_blocks(
vllm_config, group_size, available_memory, page_size
vllm_config,
group_size,
available_memory,
page_size,
suppress_log=suppress_log,
)
kv_cache_tensors = []
for i in range(group_size):
......
......@@ -5853,7 +5853,7 @@ class GPUModelRunner(
saved_override = self.cache_config.num_gpu_blocks_override
self.cache_config.num_gpu_blocks_override = min_blocks
minimal_config = get_kv_cache_config_from_groups(
self.vllm_config, kv_cache_groups, available_memory=0
self.vllm_config, kv_cache_groups, available_memory=0, suppress_log=True
)
self.cache_config.num_gpu_blocks_override = saved_override
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment