"docs/vscode:/vscode.git/clone" did not exist on "aaacf173243d7700a7a245489198a5f22d96f745"
Unverified commit c9e66585, authored by Chendi.Xue, committed by GitHub
Browse files

[NIXL] heterogeneous block_size support (#26759)


Signed-off-by: Chendi Xue <chendi.xue@intel.com>
Signed-off-by: Chendi.Xue <chendi.xue@intel.com>
Co-authored-by: Nicolò Lucchesi <nicolo.lucchesi@gmail.com>
parent 363aaeef
......@@ -49,6 +49,8 @@ NUM_DECODE_INSTANCES=${NUM_DECODE_INSTANCES:-1} # Default to 1
PREFILLER_TP_SIZE=${PREFILLER_TP_SIZE:-1}
DECODER_TP_SIZE=${DECODER_TP_SIZE:-1}
GPU_MEMORY_UTILIZATION=${GPU_MEMORY_UTILIZATION:-0.2}
PREFILL_BLOCK_SIZE=${PREFILL_BLOCK_SIZE:-16}
DECODE_BLOCK_SIZE=${DECODE_BLOCK_SIZE:-16}
# Find the git repository root directory
GIT_ROOT=$(git rev-parse --show-toplevel)
......@@ -136,6 +138,7 @@ run_tests_for_model() {
vllm serve $model_name \
--port $PORT \
--enforce-eager \
--block-size ${PREFILL_BLOCK_SIZE} \
--gpu-memory-utilization $GPU_MEMORY_UTILIZATION \
--tensor-parallel-size $PREFILLER_TP_SIZE \
--kv-transfer-config '$KV_CONFIG'"
......@@ -177,6 +180,7 @@ run_tests_for_model() {
vllm serve $model_name \
--port $PORT \
--enforce-eager \
--block-size ${DECODE_BLOCK_SIZE} \
--gpu-memory-utilization $GPU_MEMORY_UTILIZATION \
--kv-transfer-config '$KV_CONFIG'"
......
......@@ -407,6 +407,7 @@ class FakeNixlConnectorWorker(NixlConnectorWorker):
# `self.kv_cache_layout` is only forced to HND when vllm engine
# is started. We mock HND here.
kv_cache_layout="HND",
block_size=self.block_size,
),
remote_tp_size=remote_tp_size,
)
......@@ -652,6 +653,7 @@ class TestNixlHandshake:
block_lens=worker.block_len_per_layer,
attn_backend_name=worker.backend_name,
kv_cache_layout=mismatched_layout,
block_size=worker.block_size,
)
with pytest.raises(RuntimeError):
......@@ -706,6 +708,7 @@ class TestNixlHandshake:
block_lens=[i * 2 for i in worker.block_len_per_layer],
attn_backend_name=worker.backend_name,
kv_cache_layout="HND",
block_size=worker.block_size,
)
# We don't check layout for homogeneous TP and MLA for now, as the
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment