fix: make sleep/wake errors fatal in failover mode and correct RO memory accounting (#7681)

Signed-off-by: mohammedabdulwahhab <furkhan324@berkeley.edu>
Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

fix: make sleep/wake errors fatal in failover mode and correct RO memory accounting (#7681)
Signed-off-by: mohammedabdulwahhab <furkhan324@berkeley.edu>
Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
a565f105 · mohammedabdulwahhab · GitHub · 64c6eb66 · a565f105 · a565f105
Unverified Commit a565f105 authored Mar 30, 2026 by mohammedabdulwahhab Committed by GitHub Mar 30, 2026
Showing with 17 additions and 6 deletions

components/src/dynamo/vllm/worker_factory.py components/src/dynamo/vllm/worker_factory.py +3 -2

lib/gpu_memory_service/integrations/vllm/worker.py lib/gpu_memory_service/integrations/vllm/worker.py +14 -4

No files found.
--- a/components/src/dynamo/vllm/worker_factory.py
+++ b/components/src/dynamo/vllm/worker_factory.py
@@ -521,7 +521,7 @@ class WorkerFactory:
        if config.gms_shadow_mode:
            # Shadow mode: lock-driven activation.
            # Flow: sleep → startup probe passes → block on lock → wake → register.
-            await handler.sleep({"level": 1})
+            await handler._quiesce_controller.quiesce(1)

            runtime.set_health_status(True)
            logger.info(
@@ -536,7 +536,8 @@ class WorkerFactory:
            await lock.acquire(engine_id=f"engine-{engine_id}")
            logger.info("[Shadow] Lock acquired, waking engine")

-            await handler.wake_up({})
+            await handler._quiesce_controller.resume()
+            handler._quiesce_controller.mark_resumed()
            logger.info("[Shadow] Engine awake, registering with discovery")

        await self.register_vllm_model(

--- a/lib/gpu_memory_service/integrations/vllm/worker.py
+++ b/lib/gpu_memory_service/integrations/vllm/worker.py
@@ -106,21 +106,31 @@ class GMSWorker(Worker):
        if not is_shadow_mode():
            return super().determine_available_memory()

-        # TODO: Need a more robust way for shadow engines to profile memory while they are sharing GPUs with other engines.
-        # For now this gets the job done.
        torch.cuda.reset_peak_memory_stats()
        self.model_runner.profile_run()
        torch.cuda.synchronize()
-        non_kv_cache_memory = torch.cuda.max_memory_allocated()
+        torch_peak = torch.cuda.max_memory_allocated()
+
+        # If weights are strictly loaded (RO), torch's memory accounting will miss them since we didn't go through the mempool
+        # We therefore add in the memory of the weights into our accounting here
+        # This is not an issue on engines that write the weights and then downgrade to RO
+        weights_memory = int(getattr(self.model_runner, "model_memory_usage", 0))
+        if torch_peak < weights_memory:
+            non_kv_cache_memory = torch_peak + weights_memory
+        else:
+            non_kv_cache_memory = torch_peak

        projected_available = self.requested_memory - non_kv_cache_memory

        logger.info(
            "[GMS] Shadow mode: projected available memory "
-            "%.2f GiB (requested=%.2f GiB, non_kv=%.2f GiB)",
+            "%.2f GiB (requested=%.2f GiB, non_kv=%.2f GiB, "
+            "torch_peak=%.2f GiB, weights=%.2f GiB)",
            projected_available / (1 << 30),
            self.requested_memory / (1 << 30),
            non_kv_cache_memory / (1 << 30),
+            torch_peak / (1 << 30),
+            weights_memory / (1 << 30),
        )

        return int(projected_available)