[PD] Change kv_load_failure_policy Default from "recompute" to "fail" (#34896)

Signed-off-by: NickLucche <nlucches@redhat.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>

[PD] Change kv_load_failure_policy Default from "recompute" to "fail" (#34896)
Signed-off-by: NickLucche <nlucches@redhat.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
ab6f3487 · Nicolò Lucchesi · GitHub · 8dc8a99b · ab6f3487 · ab6f3487
Unverified Commit ab6f3487 authored Feb 21, 2026 by Nicolò Lucchesi Committed by GitHub Feb 21, 2026
5 changed files
--- a/docs/features/nixl_connector_usage.md
+++ b/docs/features/nixl_connector_usage.md
@@ -197,8 +197,8 @@ For multi-host DP deployment, only need to provide the host/port of the head ins
 The `kv_load_failure_policy` setting controls how the system handles failures when the decoder instance loads KV cache blocks from the prefiller instance:
- **fail** (recommended): Immediately fail the request with an error when KV load fails. This prevents performance degradation by avoiding recomputation of prefill work on the decode instance.
+- **fail** (default): Immediately fail the request with an error when KV load fails. This prevents performance degradation by avoiding recomputation of prefill work on the decode instance.
- **recompute** (default): Recompute failed blocks locally on the decode instance. This may cause performance _jitter_ on decode instances as the scheduled prefill will delay and interfere with other decodes. Furthermore, decode instances are typically configured with low-latency optimizations.
+- **recompute**: Recompute failed blocks locally on the decode instance. This may cause performance _jitter_ on decode instances as the scheduled prefill will delay and interfere with other decodes. Furthermore, decode instances are typically configured with low-latency optimizations.
 !!! warning
    Using `kv_load_failure_policy="recompute"` can lead to performance degradation in production deployments. When KV loads fail, the decode instance will execute prefill work with decode-optimized configurations, which is inefficient and defeats the purpose of disaggregated prefilling. This also increases tail latency for other ongoing decode requests.

--- a/examples/offline_inference/kv_load_failure_recovery/decode_example.py
+++ b/examples/offline_inference/kv_load_failure_recovery/decode_example.py
@@ -42,6 +42,7 @@ def main():
                "async_load": args.async_load,
            },
            kv_connector_module_path="load_recovery_example_connector",
+            kv_load_failure_policy="recompute",
        )
        out_file = (
            "async_decode_recovered_output.txt"

--- a/tests/v1/kv_connector/unit/test_kv_load_failure_recovery.py
+++ b/tests/v1/kv_connector/unit/test_kv_load_failure_recovery.py
@@ -30,7 +30,7 @@ def _make_get_num_new_matched_tokens(
 @pytest.fixture
 def scheduler():
-    vllm_config = create_vllm_config()
+    vllm_config = create_vllm_config(kv_load_failure_policy="recompute")
    return create_scheduler(vllm_config)

--- a/tests/v1/kv_connector/unit/utils.py
+++ b/tests/v1/kv_connector/unit/utils.py
@@ -5,7 +5,7 @@ from collections import defaultdict
 from collections.abc import Callable
 from dataclasses import dataclass
 from itertools import chain, count
-from typing import Any
+from typing import Any, Literal
 import torch
@@ -96,6 +96,7 @@ def create_vllm_config(
    cache_dtype: str = "auto",
    hf_overrides: dict[str, Any] | None = None,
    attention_backend: str | None = None,
+    kv_load_failure_policy: Literal["recompute", "fail"] = "fail",
 ) -> VllmConfig:
    """Initialize VllmConfig For Testing."""
    model_config = ModelConfig(
@@ -125,6 +126,7 @@ def create_vllm_config(
        kv_role="kv_both",
        enable_permute_local_kv=enable_permute_local_kv,
        kv_connector_extra_config=kv_connector_extra_config or {},
+        kv_load_failure_policy=kv_load_failure_policy,
    )
    attention_config = AttentionConfig(backend=attention_backend)
    return VllmConfig(

--- a/vllm/config/kv_transfer.py
+++ b/vllm/config/kv_transfer.py
@@ -61,10 +61,10 @@ class KVTransferConfig:
    enable_permute_local_kv: bool = False
    """Experiment feature flag to enable HND to NHD KV Transfer"""
-    kv_load_failure_policy: Literal["recompute", "fail"] = "recompute"
+    kv_load_failure_policy: Literal["recompute", "fail"] = "fail"
    """Policy for handling KV cache load failures.
-    'recompute': reschedule the request to recompute failed blocks (default)
+    'recompute': reschedule the request to recompute failed blocks
-    'fail': immediately fail the request with an error finish reason"""
+    'fail': immediately fail the request with an error finish reason (default)"""
    def compute_hash(self) -> str:
        """