Unverified commit c9e66585, authored by Chendi.Xue and committed by GitHub
Browse files

[NIXL] heterogeneous block_size support (#26759)


Signed-off-by: Chendi Xue <chendi.xue@intel.com>
Signed-off-by: Chendi.Xue <chendi.xue@intel.com>
Co-authored-by: Nicolò Lucchesi <nicolo.lucchesi@gmail.com>
parent 363aaeef
...@@ -49,6 +49,8 @@ NUM_DECODE_INSTANCES=${NUM_DECODE_INSTANCES:-1} # Default to 1 ...@@ -49,6 +49,8 @@ NUM_DECODE_INSTANCES=${NUM_DECODE_INSTANCES:-1} # Default to 1
PREFILLER_TP_SIZE=${PREFILLER_TP_SIZE:-1} PREFILLER_TP_SIZE=${PREFILLER_TP_SIZE:-1}
DECODER_TP_SIZE=${DECODER_TP_SIZE:-1} DECODER_TP_SIZE=${DECODER_TP_SIZE:-1}
GPU_MEMORY_UTILIZATION=${GPU_MEMORY_UTILIZATION:-0.2} GPU_MEMORY_UTILIZATION=${GPU_MEMORY_UTILIZATION:-0.2}
PREFILL_BLOCK_SIZE=${PREFILL_BLOCK_SIZE:-16}
DECODE_BLOCK_SIZE=${DECODE_BLOCK_SIZE:-16}
# Find the git repository root directory # Find the git repository root directory
GIT_ROOT=$(git rev-parse --show-toplevel) GIT_ROOT=$(git rev-parse --show-toplevel)
...@@ -136,6 +138,7 @@ run_tests_for_model() { ...@@ -136,6 +138,7 @@ run_tests_for_model() {
vllm serve $model_name \ vllm serve $model_name \
--port $PORT \ --port $PORT \
--enforce-eager \ --enforce-eager \
--block-size ${PREFILL_BLOCK_SIZE} \
--gpu-memory-utilization $GPU_MEMORY_UTILIZATION \ --gpu-memory-utilization $GPU_MEMORY_UTILIZATION \
--tensor-parallel-size $PREFILLER_TP_SIZE \ --tensor-parallel-size $PREFILLER_TP_SIZE \
--kv-transfer-config '$KV_CONFIG'" --kv-transfer-config '$KV_CONFIG'"
...@@ -177,6 +180,7 @@ run_tests_for_model() { ...@@ -177,6 +180,7 @@ run_tests_for_model() {
vllm serve $model_name \ vllm serve $model_name \
--port $PORT \ --port $PORT \
--enforce-eager \ --enforce-eager \
--block-size ${DECODE_BLOCK_SIZE} \
--gpu-memory-utilization $GPU_MEMORY_UTILIZATION \ --gpu-memory-utilization $GPU_MEMORY_UTILIZATION \
--kv-transfer-config '$KV_CONFIG'" --kv-transfer-config '$KV_CONFIG'"
......
...@@ -407,6 +407,7 @@ class FakeNixlConnectorWorker(NixlConnectorWorker): ...@@ -407,6 +407,7 @@ class FakeNixlConnectorWorker(NixlConnectorWorker):
# `self.kv_cache_layout` is only forced to HND when vllm engine # `self.kv_cache_layout` is only forced to HND when vllm engine
# is started. We mock HND here. # is started. We mock HND here.
kv_cache_layout="HND", kv_cache_layout="HND",
block_size=self.block_size,
), ),
remote_tp_size=remote_tp_size, remote_tp_size=remote_tp_size,
) )
...@@ -652,6 +653,7 @@ class TestNixlHandshake: ...@@ -652,6 +653,7 @@ class TestNixlHandshake:
block_lens=worker.block_len_per_layer, block_lens=worker.block_len_per_layer,
attn_backend_name=worker.backend_name, attn_backend_name=worker.backend_name,
kv_cache_layout=mismatched_layout, kv_cache_layout=mismatched_layout,
block_size=worker.block_size,
) )
with pytest.raises(RuntimeError): with pytest.raises(RuntimeError):
...@@ -706,6 +708,7 @@ class TestNixlHandshake: ...@@ -706,6 +708,7 @@ class TestNixlHandshake:
block_lens=[i * 2 for i in worker.block_len_per_layer], block_lens=[i * 2 for i in worker.block_len_per_layer],
attn_backend_name=worker.backend_name, attn_backend_name=worker.backend_name,
kv_cache_layout="HND", kv_cache_layout="HND",
block_size=worker.block_size,
) )
# We don't check layout for homogeneous TP and MLA for now, as the # We don't check layout for homogeneous TP and MLA for now, as the
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment