Revert "[BugFix] Fix engine hanging after KV cache initialization failure" (#36262)

26bd43b5 · Nick Hill · GitHub · 6b625a88 · 26bd43b5 · 26bd43b5
Unverified Commit 26bd43b5 authored Mar 06, 2026 by Nick Hill Committed by GitHub Mar 06, 2026
Hide whitespace changes
Inline Side-by-side

Showing 2 changed files with 25 additions and 59 deletions

vllm/v1/engine/core.py +25 -54

vllm/v1/engine/utils.py +0 -5

No files found.
--- a/vllm/v1/engine/core.py
+++ b/vllm/v1/engine/core.py
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import contextlib
 import os
 import queue
 import signal
@@ -120,17 +119,9 @@ class EngineCore:
            self._eep_scale_up_before_kv_init()

        # Setup KV Caches and update CacheConfig after profiling.
-        try:
-            num_gpu_blocks, num_cpu_blocks, kv_cache_config = (
-                self._initialize_kv_caches(vllm_config)
-            )
-        except Exception:
-            logger.exception(
-                "EngineCore failed during KV cache initialization; "
-                "shutting down executor."
-            )
-            self.model_executor.shutdown()
-            raise
+        num_gpu_blocks, num_cpu_blocks, kv_cache_config = self._initialize_kv_caches(
+            vllm_config
+        )

        vllm_config.cache_config.num_gpu_blocks = num_gpu_blocks
        vllm_config.cache_config.num_cpu_blocks = num_cpu_blocks
@@ -976,49 +967,29 @@ class EngineCoreProc(EngineCore):
            addresses = self.startup_handshake(
                handshake_socket, local_client, headless, parallel_config_to_update
            )
-            exc_during_init = False
-            try:
-                yield addresses
-            except Exception:
-                exc_during_init = True
-                raise
-            finally:
-                if exc_during_init:
-                    # Send FAILED status so the front-end detects init
-                    # failure immediately via ZMQ instead of waiting for
-                    # process sentinel (which may be delayed by cleanup).
-                    with contextlib.suppress(Exception):
-                        handshake_socket.send(
-                            msgspec.msgpack.encode(
-                                {
-                                    "status": "FAILED",
-                                    "local": local_client,
-                                    "headless": headless,
-                                }
-                            )
-                        )
-                else:
-                    # Send ready message.
-                    num_gpu_blocks = vllm_config.cache_config.num_gpu_blocks
-                    # We pass back the coordinator stats update address
-                    # here for the external LB case for our colocated
-                    # front-end to use (coordinator only runs with rank 0).
-                    dp_stats_address = self.frontend_stats_publish_address
-
-                    # Include config hash for DP configuration validation
-                    ready_msg = {
-                        "status": "READY",
-                        "local": local_client,
-                        "headless": headless,
-                        "num_gpu_blocks": num_gpu_blocks,
-                        "dp_stats_address": dp_stats_address,
-                    }
-                    if vllm_config.parallel_config.data_parallel_size > 1:
-                        ready_msg["parallel_config_hash"] = (
-                            vllm_config.parallel_config.compute_hash()
-                        )
+            yield addresses
+
+            # Send ready message.
+            num_gpu_blocks = vllm_config.cache_config.num_gpu_blocks
+            # We pass back the coordinator stats update address here for the
+            # external LB case for our colocated front-end to use (coordinator
+            # only runs with rank 0).
+            dp_stats_address = self.frontend_stats_publish_address
+
+            # Include config hash for DP configuration validation
+            ready_msg = {
+                "status": "READY",
+                "local": local_client,
+                "headless": headless,
+                "num_gpu_blocks": num_gpu_blocks,
+                "dp_stats_address": dp_stats_address,
+            }
+            if vllm_config.parallel_config.data_parallel_size > 1:
+                ready_msg["parallel_config_hash"] = (
+                    vllm_config.parallel_config.compute_hash()
+                )

-                    handshake_socket.send(msgspec.msgpack.encode(ready_msg))
+            handshake_socket.send(msgspec.msgpack.encode(ready_msg))

    @staticmethod
    def startup_handshake(

--- a/vllm/v1/engine/utils.py
+++ b/vllm/v1/engine/utils.py
@@ -1130,11 +1130,6 @@ def wait_for_engine_startup(

            start_pending[0 if local else 1] -= 1
            engine.state = CoreEngineState.READY
-        elif status == "FAILED":
-            raise RuntimeError(
-                f"Engine core {eng_index} reported initialization failure. "
-                "See root cause above."
-            )
        else:
            raise RuntimeError(
                f"Unexpected {status} message for "