Unverified commit 3d7182b8, authored by Neal Vaidya, committed by GitHub
Browse files

fix: read block size from vllm at runtime (#5713)


Signed-off-by: Neal Vaidya <nealv@nvidia.com>
parent 2c3066bd
...@@ -556,7 +556,7 @@ async def register_vllm_model( ...@@ -556,7 +556,7 @@ async def register_vllm_model(
generate_endpoint, generate_endpoint,
config.model, config.model,
config.served_model_name, config.served_model_name,
kv_cache_block_size=config.engine_args.block_size, kv_cache_block_size=runtime_values["block_size"],
runtime_config=runtime_config, runtime_config=runtime_config,
custom_template_path=config.custom_jinja_template, custom_template_path=config.custom_jinja_template,
media_decoder=media_decoder, media_decoder=media_decoder,
...@@ -860,6 +860,7 @@ def get_engine_cache_info(engine: AsyncLLM): ...@@ -860,6 +860,7 @@ def get_engine_cache_info(engine: AsyncLLM):
# Get values directly from vllm_config instead of collective_rpc # Get values directly from vllm_config instead of collective_rpc
cache_values = { cache_values = {
"num_gpu_blocks": engine.vllm_config.cache_config.num_gpu_blocks, "num_gpu_blocks": engine.vllm_config.cache_config.num_gpu_blocks,
"block_size": engine.vllm_config.cache_config.block_size,
} }
scheduler_values = { scheduler_values = {
...@@ -871,6 +872,7 @@ def get_engine_cache_info(engine: AsyncLLM): ...@@ -871,6 +872,7 @@ def get_engine_cache_info(engine: AsyncLLM):
logging.info(f"Scheduler config values: {scheduler_values}") logging.info(f"Scheduler config values: {scheduler_values}")
return { return {
"num_gpu_blocks": cache_values["num_gpu_blocks"], "num_gpu_blocks": cache_values["num_gpu_blocks"],
"block_size": cache_values["block_size"],
"max_num_seqs": scheduler_values["max_num_seqs"], "max_num_seqs": scheduler_values["max_num_seqs"],
"max_num_batched_tokens": scheduler_values["max_num_batched_tokens"], "max_num_batched_tokens": scheduler_values["max_num_batched_tokens"],
} }
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment