[NIXL] heterogeneous block_size support (#26759)

Signed-off-by: Chendi Xue <chendi.xue@intel.com>
Signed-off-by: Chendi.Xue <chendi.xue@intel.com>
Co-authored-by: Nicolò Lucchesi <nicolo.lucchesi@gmail.com>

[NIXL] heterogeneous block_size support (#26759)
Signed-off-by: Chendi Xue <chendi.xue@intel.com>
Signed-off-by: Chendi.Xue <chendi.xue@intel.com>
Co-authored-by: Nicolò Lucchesi <nicolo.lucchesi@gmail.com>
c9e66585 · Chendi.Xue · GitHub · 363aaeef · c9e66585 · c9e66585
Unverified Commit c9e66585 authored Nov 14, 2025 by Chendi.Xue Committed by GitHub Nov 14, 2025
3 changed files
--- a/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh
+++ b/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh
@@ -49,6 +49,8 @@ NUM_DECODE_INSTANCES=${NUM_DECODE_INSTANCES:-1}   # Default to 1
 PREFILLER_TP_SIZE=${PREFILLER_TP_SIZE:-1}
 DECODER_TP_SIZE=${DECODER_TP_SIZE:-1}
 GPU_MEMORY_UTILIZATION=${GPU_MEMORY_UTILIZATION:-0.2}
+PREFILL_BLOCK_SIZE=${PREFILL_BLOCK_SIZE:-16}
+DECODE_BLOCK_SIZE=${DECODE_BLOCK_SIZE:-16}

 # Find the git repository root directory
 GIT_ROOT=$(git rev-parse --show-toplevel)
@@ -136,6 +138,7 @@ run_tests_for_model() {
    vllm serve $model_name \
    --port $PORT \
    --enforce-eager \
+    --block-size ${PREFILL_BLOCK_SIZE} \
    --gpu-memory-utilization $GPU_MEMORY_UTILIZATION \
    --tensor-parallel-size $PREFILLER_TP_SIZE \
    --kv-transfer-config '$KV_CONFIG'"
@@ -177,6 +180,7 @@ run_tests_for_model() {
    vllm serve $model_name \
    --port $PORT \
    --enforce-eager \
+    --block-size ${DECODE_BLOCK_SIZE} \
    --gpu-memory-utilization $GPU_MEMORY_UTILIZATION \
    --kv-transfer-config '$KV_CONFIG'"
  

--- a/tests/v1/kv_connector/unit/test_nixl_connector.py
+++ b/tests/v1/kv_connector/unit/test_nixl_connector.py
@@ -407,6 +407,7 @@ class FakeNixlConnectorWorker(NixlConnectorWorker):
                # `self.kv_cache_layout` is only forced to HND when vllm engine
                # is started. We mock HND here.
                kv_cache_layout="HND",
+                block_size=self.block_size,
            ),
            remote_tp_size=remote_tp_size,
        )
@@ -652,6 +653,7 @@ class TestNixlHandshake:
                block_lens=worker.block_len_per_layer,
                attn_backend_name=worker.backend_name,
                kv_cache_layout=mismatched_layout,
+                block_size=worker.block_size,
            )

            with pytest.raises(RuntimeError):
@@ -706,6 +708,7 @@ class TestNixlHandshake:
                block_lens=[i * 2 for i in worker.block_len_per_layer],
                attn_backend_name=worker.backend_name,
                kv_cache_layout="HND",
+                block_size=worker.block_size,
            )

            # We don't check layout for homogeneous TP and MLA for now, as the

--- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py