Unverified commit f79b3d62, authored by Richard Huo and committed by GitHub
Browse files

fix: remove forcing the kv block size as 16 in vLLM backend (#7690)

parent be5b8a58
...@@ -227,9 +227,8 @@ def update_engine_config_with_dynamo( ...@@ -227,9 +227,8 @@ def update_engine_config_with_dynamo(
engine_config.enable_prefix_caching = True engine_config.enable_prefix_caching = True
if getattr(engine_config, "block_size", None) is None: if getattr(engine_config, "block_size", None) is None:
engine_config.block_size = 16
logger.debug( logger.debug(
f"Setting reasonable default of {engine_config.block_size} for block_size" "block_size is not set in engine config. vLLM engine block_size will be determined at runtime based on the model and attention backend."
) )
if _uses_nixl_connector(engine_config): if _uses_nixl_connector(engine_config):
......
...@@ -576,6 +576,10 @@ def setup_vllm_engine( ...@@ -576,6 +576,10 @@ def setup_vllm_engine(
logger.info(f"VllmWorker for {config.served_model_name} has been initialized") logger.info(f"VllmWorker for {config.served_model_name} has been initialized")
# update block_size in vllm_config based on final engine cache info for later use
runtime_values = get_engine_cache_info(engine_client)
vllm_config.cache_config.block_size = runtime_values["block_size"]
return ( return (
engine_client, engine_client,
vllm_config, vllm_config,
......
...@@ -46,6 +46,13 @@ VLLM_ARGS: Dict[str, Any] = { ...@@ -46,6 +46,13 @@ VLLM_ARGS: Dict[str, Any] = {
"enforce_eager": True, # Disable CUDA graphs for faster startup & lower memory "enforce_eager": True, # Disable CUDA graphs for faster startup & lower memory
} }
# vLLM worker arguments that deliberately carry no "block_size" key, so
# VLLMProcess launches the worker without a --block-size flag.
VLLM_ARGS_NO_BLOCK_SIZE: Dict[str, Any] = {
    "model": MODEL_NAME,
    # Limit VRAM allocation per worker
    "gpu_memory_utilization": 0.4,
    # Limit context length to reduce KV cache size
    "max_model_len": 1024,
    # Disable CUDA graphs for faster startup & lower memory
    "enforce_eager": True,
}
class VLLMProcess(ManagedEngineProcessMixin): class VLLMProcess(ManagedEngineProcessMixin):
"""Manages vLLM workers using dynamo.vllm (HTTP API + KV events). """Manages vLLM workers using dynamo.vllm (HTTP API + KV events).
...@@ -73,7 +80,6 @@ class VLLMProcess(ManagedEngineProcessMixin): ...@@ -73,7 +80,6 @@ class VLLMProcess(ManagedEngineProcessMixin):
Args: Args:
request: pytest request fixture for log directory request: pytest request fixture for log directory
vllm_args: Configuration dict with keys: vllm_args: Configuration dict with keys:
- block_size: KV cache block size (default: 16)
- model: Model name/path (default: TinyLlama-1.1B) - model: Model name/path (default: TinyLlama-1.1B)
- gpu_memory_utilization: Fraction of GPU memory to allocate (optional) - gpu_memory_utilization: Fraction of GPU memory to allocate (optional)
- num_gpu_blocks_override: Cap on number of KV cache blocks (optional) - num_gpu_blocks_override: Cap on number of KV cache blocks (optional)
...@@ -110,7 +116,6 @@ class VLLMProcess(ManagedEngineProcessMixin): ...@@ -110,7 +116,6 @@ class VLLMProcess(ManagedEngineProcessMixin):
if vllm_args is None: if vllm_args is None:
vllm_args = {} vllm_args = {}
block_size = vllm_args.get("block_size", BLOCK_SIZE)
model = vllm_args.get("model", MODEL_NAME) model = vllm_args.get("model", MODEL_NAME)
gpu_memory_utilization = vllm_args.get("gpu_memory_utilization") gpu_memory_utilization = vllm_args.get("gpu_memory_utilization")
num_gpu_blocks_override = vllm_args.get("num_gpu_blocks_override") num_gpu_blocks_override = vllm_args.get("num_gpu_blocks_override")
...@@ -144,15 +149,10 @@ class VLLMProcess(ManagedEngineProcessMixin): ...@@ -144,15 +149,10 @@ class VLLMProcess(ManagedEngineProcessMixin):
# No DP; worker sees one GPU # No DP; worker sees one GPU
gpu_device = str(worker_idx) gpu_device = str(worker_idx)
command = [ command = ["python3", "-m", "dynamo.vllm", "--model", model]
"python3",
"-m", if "block_size" in vllm_args:
"dynamo.vllm", command.extend(["--block-size", str(vllm_args["block_size"])])
"--model",
model,
"--block-size",
str(block_size),
]
# Disable CUDA graphs for faster startup & lower memory # Disable CUDA graphs for faster startup & lower memory
if enforce_eager: if enforce_eager:
...@@ -277,6 +277,30 @@ def test_vllm_kv_router_basic( ...@@ -277,6 +277,30 @@ def test_vllm_kv_router_basic(
) )
@pytest.mark.pre_merge
@pytest.mark.gpu_1
@pytest.mark.timeout(150)  # ~3x average (~43s/test), rounded up
@pytest.mark.parametrize("request_plane", ["tcp"], indirect=True)
def test_vllm_kv_router_without_block_size_specified_in_vllm_args(
    request,
    runtime_services_dynamic_ports,
    predownload_models,
    set_ucx_tls_no_mm,
    request_plane,
):
    """Run the basic KV router test with vLLM args that omit ``block_size``.

    The workers are configured from ``VLLM_ARGS_NO_BLOCK_SIZE``, which has no
    ``block_size`` entry, while the shared router test helper is still given
    ``BLOCK_SIZE``.
    """
    # NOTE(review): presumably BLOCK_SIZE has to equal the block size vLLM
    # picks at runtime for this model/attention backend -- confirm.
    run_basic_router_test(
        # Engine under test and how its arguments are passed through.
        engine_process_cls=VLLMProcess,
        engine_args_name="vllm_args",
        engine_args=VLLM_ARGS_NO_BLOCK_SIZE,
        # Deployment shape: two workers on a single GPU.
        num_workers=2,
        single_gpu=True,
        # Pytest plumbing and transport selection.
        request=request,
        request_plane=request_plane,
        # Router-side expectations.
        block_size=BLOCK_SIZE,
        model_name=MODEL_NAME,
    )
@pytest.mark.pre_merge @pytest.mark.pre_merge
@pytest.mark.gpu_1 @pytest.mark.gpu_1
@pytest.mark.timeout(150) # ~3x average (~43s/test), rounded up @pytest.mark.timeout(150) # ~3x average (~43s/test), rounded up
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment