refactor: clean up checkpoint orchestration (#7309)

Signed-off-by: Schwinn Saereesitthipitak <schwinns@nvidia.com>

refactor: clean up checkpoint orchestration (#7309)
Signed-off-by: Schwinn Saereesitthipitak <schwinns@nvidia.com>
38bb9d37 · Schwinn Saereesitthipitak · GitHub · 9ea3acad · 38bb9d37 · 38bb9d37
Unverified Commit 38bb9d37 authored Mar 18, 2026 by Schwinn Saereesitthipitak Committed by GitHub Mar 18, 2026
20 changed files
--- a/components/src/dynamo/common/utils/snapshot.py
+++ b/components/src/dynamo/common/utils/snapshot.py
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+"""Shared Dynamo snapshot helpers for checkpoint lifecycle."""
+import asyncio
+import logging
+import os
+import signal
+from dataclasses import dataclass
+from typing import Any, Generic, TypeVar
+from dynamo.common.utils.namespace import get_worker_namespace
+logger = logging.getLogger(__name__)
+PODINFO_ROOT = "/etc/podinfo"
+REQUIRED_PODINFO_FILES = {
+    "DYN_NAMESPACE": "dyn_namespace",
+    "DYN_COMPONENT": "dyn_component",
+    "DYN_PARENT_DGD_K8S_NAME": "dyn_parent_dgd_k8s_name",
+    "DYN_PARENT_DGD_K8S_NAMESPACE": "dyn_parent_dgd_k8s_namespace",
+}
+OPTIONAL_PODINFO_FILES = {
+    "DYN_NAMESPACE_WORKER_SUFFIX": "dyn_namespace_worker_suffix",
+}
+EngineT = TypeVar("EngineT")
+class CheckpointConfig:
+    """Parsed checkpoint configuration plus the watcher-driven lifecycle."""
+    # Holds the readiness-marker path and two asyncio events that the signal
+    # handlers set: one for "checkpoint complete" (SIGUSR1) and one for
+    # "restore complete" (SIGCONT).
+    def __init__(self, ready_file: str):
+        """Remember the readiness-marker path and create the two signal events."""
+        self.ready_file = ready_file
+        self._checkpoint_done = asyncio.Event()
+        self._restore_done = asyncio.Event()
+    @classmethod
+    def from_env(cls) -> "CheckpointConfig | None":
+        """Build a config from DYN_READY_FOR_CHECKPOINT_FILE, or return None.
+        Returns None when the variable is unset or empty. Otherwise the
+        transport environment is rewritten for checkpoint mode before the
+        config is returned.
+        """
+        ready_file = os.environ.get("DYN_READY_FOR_CHECKPOINT_FILE")
+        if not ready_file:
+            return None
+        # Side effect: mutates os.environ for the whole process.
+        configure_checkpoint_transport_env()
+        return cls(ready_file=ready_file)
+    async def run_lifecycle(
+        self,
+        quiesce_controller: Any,
+        *quiesce_args: object,
+    ) -> bool:
+        """Quiesce the engine, signal readiness, then wait for the watcher.
+        Args:
+            quiesce_controller: Object providing awaitable ``quiesce(*args)``
+                and ``resume()`` plus a synchronous ``mark_resumed()``.
+            *quiesce_args: Forwarded unchanged to ``quiesce_controller.quiesce``.
+        Returns:
+            True when the restore signal (SIGCONT) arrived and the engine was
+            resumed; False when the checkpoint-complete signal (SIGUSR1) arrived.
+        Raises:
+            Exception: Anything raised by ``quiesce`` or ``resume`` propagates.
+                A failure while writing the ready file is re-raised after the
+                signal handlers have been removed again.
+        """
+        logger.info("Quiescing model")
+        await quiesce_controller.quiesce(*quiesce_args)
+        # Handlers are installed before the ready file is written so that a
+        # signal sent in response to the marker never meets the default
+        # signal disposition.
+        self._install_signal_handlers()
+        try:
+            with open(self.ready_file, "w", encoding="utf-8") as ready_file:
+                ready_file.write("ready")
+        except Exception:
+            # Readiness was never signalled, so do not leave handlers behind.
+            self._remove_signal_handlers()
+            raise
+        logger.info(
+            "Ready for checkpoint. Waiting for watcher signal "
+            "(SIGUSR1=checkpoint complete, SIGCONT=restore complete)"
+        )
+        try:
+            event = await self._wait_for_watcher_signal()
+            if event == "restore":
+                logger.info("Restore signal detected (SIGCONT)")
+                logger.info("Resuming model after restore")
+                await quiesce_controller.resume()
+                # The quiesced state is cleared only after resume() succeeded.
+                quiesce_controller.mark_resumed()
+                return True
+            logger.info("Checkpoint completion signal detected (SIGUSR1)")
+            return False
+        finally:
+            self._remove_signal_handlers()
+            # Remove the marker so a stale ready file is not left behind;
+            # any OSError from the unlink is deliberately ignored.
+            try:
+                os.unlink(self.ready_file)
+            except OSError:
+                pass
+    def _install_signal_handlers(self) -> None:
+        """Route SIGUSR1 and SIGCONT on the running loop to the two events."""
+        loop = asyncio.get_running_loop()
+        loop.add_signal_handler(signal.SIGUSR1, self._checkpoint_done.set)
+        loop.add_signal_handler(signal.SIGCONT, self._restore_done.set)
+    def _remove_signal_handlers(self) -> None:
+        """Remove the SIGUSR1 and SIGCONT handlers from the running loop."""
+        loop = asyncio.get_running_loop()
+        loop.remove_signal_handler(signal.SIGUSR1)
+        loop.remove_signal_handler(signal.SIGCONT)
+    async def _wait_for_watcher_signal(self) -> str:
+        """Wait for either event and return "checkpoint" or "restore".
+        If both events are already set when the wait returns, the label comes
+        from whichever finished task ``done.pop()`` happens to yield.
+        """
+        # Map each waiter task to the label reported for it.
+        waiters = {
+            asyncio.create_task(self._checkpoint_done.wait()): "checkpoint",
+            asyncio.create_task(self._restore_done.wait()): "restore",
+        }
+        try:
+            done, pending = await asyncio.wait(
+                waiters.keys(), return_when=asyncio.FIRST_COMPLETED
+            )
+            for task in pending:
+                task.cancel()
+            winner = done.pop()
+            # Awaiting the finished task surfaces any exception it holds.
+            await winner
+            return waiters[winner]
+        finally:
+            # Also covers the case where this coroutine itself is cancelled
+            # or fails before the pending tasks were cancelled above.
+            for task in waiters:
+                if not task.done():
+                    task.cancel()
+def configure_checkpoint_transport_env() -> None:
+    """Force transport-related environment variables for checkpoint mode.
+    Each variable below is set unconditionally in ``os.environ``. A warning
+    is logged first only when the variable already held a different,
+    non-empty value. TORCH_NCCL_DUMP_ON_TIMEOUT is the exception: it is only
+    given a default and an existing value is kept.
+    """
+    # Gloo and NCCL sockets are pinned to the loopback interface.
+    gloo_ifname = os.environ.get("GLOO_SOCKET_IFNAME")
+    if gloo_ifname and gloo_ifname != "lo":
+        logger.warning(
+            "Overriding GLOO_SOCKET_IFNAME=%r with 'lo' for checkpoint mode "
+            "because CRIU cannot restore sockets bound to non-loopback addresses",
+            gloo_ifname,
+        )
+    os.environ["GLOO_SOCKET_IFNAME"] = "lo"
+    nccl_ifname = os.environ.get("NCCL_SOCKET_IFNAME")
+    if nccl_ifname and nccl_ifname != "lo":
+        logger.warning(
+            "Overriding NCCL_SOCKET_IFNAME=%r with 'lo' for checkpoint mode "
+            "because CRIU cannot restore sockets bound to non-loopback addresses",
+            nccl_ifname,
+        )
+    os.environ["NCCL_SOCKET_IFNAME"] = "lo"
+    # NCCL feature toggles: cuMem off, P2P left enabled, NVLS off, IB off.
+    nccl_cumem_enable = os.environ.get("NCCL_CUMEM_ENABLE")
+    if nccl_cumem_enable and nccl_cumem_enable != "0":
+        logger.warning(
+            "Overriding NCCL_CUMEM_ENABLE=%r with '0' for checkpoint mode "
+            "because cuda-checkpoint does not support cuMem-backed NCCL allocations",
+            nccl_cumem_enable,
+        )
+    os.environ["NCCL_CUMEM_ENABLE"] = "0"
+    nccl_p2p_disable = os.environ.get("NCCL_P2P_DISABLE")
+    if nccl_p2p_disable and nccl_p2p_disable != "0":
+        logger.warning(
+            "Overriding NCCL_P2P_DISABLE=%r with '0' for checkpoint mode "
+            "to keep NCCL on GPU P2P transport when topology allows it",
+            nccl_p2p_disable,
+        )
+    os.environ["NCCL_P2P_DISABLE"] = "0"
+    nccl_nvls_enable = os.environ.get("NCCL_NVLS_ENABLE")
+    if nccl_nvls_enable and nccl_nvls_enable != "0":
+        logger.warning(
+            "Overriding NCCL_NVLS_ENABLE=%r with '0' for checkpoint mode "
+            "to avoid NVLS and keep NCCL on the legacy P2P path",
+            nccl_nvls_enable,
+        )
+    os.environ["NCCL_NVLS_ENABLE"] = "0"
+    nccl_ib_disable = os.environ.get("NCCL_IB_DISABLE")
+    if nccl_ib_disable and nccl_ib_disable != "1":
+        logger.warning(
+            "Overriding NCCL_IB_DISABLE=%r with '1' for checkpoint mode "
+            "because CRIU and cuda-checkpoint cannot restore InfiniBand state",
+            nccl_ib_disable,
+        )
+    os.environ["NCCL_IB_DISABLE"] = "1"
+    # PyTorch's NCCL process-group monitoring is switched off.
+    torch_nccl_monitoring = os.environ.get("TORCH_NCCL_ENABLE_MONITORING")
+    if torch_nccl_monitoring and torch_nccl_monitoring != "0":
+        logger.warning(
+            "Overriding TORCH_NCCL_ENABLE_MONITORING=%r with '0' for checkpoint mode "
+            "because ProcessGroupNCCL monitoring can terminate restored processes",
+            torch_nccl_monitoring,
+        )
+    os.environ["TORCH_NCCL_ENABLE_MONITORING"] = "0"
+    # Unlike the settings above this is only a default: a value that is
+    # already present in the environment is left untouched.
+    os.environ.setdefault("TORCH_NCCL_DUMP_ON_TIMEOUT", "0")
+@dataclass
+class EngineSnapshotController(Generic[EngineT]):
+    """Bundle an engine with the pieces needed to run its checkpoint lifecycle."""
+    # The engine instance; its type is the generic parameter EngineT.
+    engine: EngineT
+    # Handed to CheckpointConfig.run_lifecycle as the quiesce controller.
+    quiesce_controller: Any
+    # Provides run_lifecycle, which drives the checkpoint/restore wait.
+    checkpoint_config: CheckpointConfig
+    # Extra positional arguments forwarded to run_lifecycle; empty by default.
+    quiesce_args: tuple[object, ...] = ()
+    async def wait_for_restore(self) -> bool:
+        """Run the checkpoint lifecycle and return its boolean result.
+        The value is whatever ``CheckpointConfig.run_lifecycle`` returns when
+        called with this controller's quiesce controller and arguments.
+        """
+        return await self.checkpoint_config.run_lifecycle(
+            self.quiesce_controller,
+            *self.quiesce_args,
+        )
+    def reload_restore_identity(self) -> tuple[str, str]:
+        """Delegate to the module-level ``reload_snapshot_restore_identity``."""
+        return reload_snapshot_restore_identity()
+def reload_snapshot_restore_identity() -> tuple[str, str]:
+    """Refresh identity environment variables from the podinfo files.
+    Every entry in REQUIRED_PODINFO_FILES must exist under PODINFO_ROOT and
+    contain non-whitespace text; its stripped content is written to the
+    matching environment variable. Entries in OPTIONAL_PODINFO_FILES are
+    applied the same way when present and non-empty, otherwise the
+    environment variable is removed.
+    Returns:
+        A tuple of ``get_worker_namespace()`` and the literal "kubernetes".
+    Raises:
+        RuntimeError: A required podinfo file is missing or empty.
+    """
+    for env_name, podinfo_file in REQUIRED_PODINFO_FILES.items():
+        podinfo_path = os.path.join(PODINFO_ROOT, podinfo_file)
+        if not os.path.isfile(podinfo_path):
+            raise RuntimeError(f"snapshot restore requires {podinfo_path}")
+        with open(podinfo_path, encoding="utf-8") as podinfo:
+            value = podinfo.read().strip()
+        if not value:
+            raise RuntimeError(f"snapshot restore requires a non-empty {podinfo_path}")
+        os.environ[env_name] = value
+    for env_name, podinfo_file in OPTIONAL_PODINFO_FILES.items():
+        podinfo_path = os.path.join(PODINFO_ROOT, podinfo_file)
+        if not os.path.isfile(podinfo_path):
+            # Drop any value already in the environment for this variable.
+            os.environ.pop(env_name, None)
+            continue
+        with open(podinfo_path, encoding="utf-8") as podinfo:
+            value = podinfo.read().strip()
+        if not value:
+            # An empty optional file is treated the same as a missing one.
+            os.environ.pop(env_name, None)
+            continue
+        os.environ[env_name] = value
+    # Snapshot restore only runs in Kubernetes-managed pods, so discovery resets here.
+    os.environ["DYN_DISCOVERY_BACKEND"] = "kubernetes"
+    # NOTE(review): get_worker_namespace presumably reads the variables set
+    # above; it is defined in another module, so confirm there.
+    return get_worker_namespace(), "kubernetes"
--- a/components/src/dynamo/sglang/main.py
+++ b/components/src/dynamo/sglang/main.py
@@ -26,9 +26,10 @@ from dynamo.sglang.init_multimodal import (
    init_multimodal_worker,
 )
 from dynamo.sglang.shutdown import install_graceful_shutdown
-from dynamo.sglang.snapshot import handle_checkpoint_mode
+from dynamo.sglang.snapshot import prepare_snapshot_engine
 configure_dynamo_logging()
+logger = logging.getLogger(__name__)
 async def worker():
@@ -41,11 +42,17 @@ async def worker():
        config.server_args.load_format = setup_gms(config.server_args)
    # Checkpoint mode: engine must be created BEFORE runtime (no NATS/etcd during CRIU)
-    should_exit, snapshot_engine = await handle_checkpoint_mode(config.server_args)
+    snapshot_controller = await prepare_snapshot_engine(config.server_args)
-    if should_exit:
-        return
    dynamo_args = config.dynamo_args
+    snapshot_engine = None
+    if snapshot_controller is not None:
+        snapshot_engine = snapshot_controller.engine
+        (
+            dynamo_args.namespace,
+            dynamo_args.discovery_backend,
+        ) = snapshot_controller.reload_restore_identity()
    shutdown_event = asyncio.Event()
    shutdown_endpoints: list = []
    runtime, loop = create_runtime(
@@ -58,7 +65,7 @@ async def worker():
    run_deferred_handlers = install_graceful_shutdown(
        loop, runtime, shutdown_endpoints, shutdown_event
    )
-    logging.info(
+    logger.info(
        "Signal handlers set up for graceful shutdown "
        "(discovery unregister + grace period, with chaining)"
    )

--- a/components/src/dynamo/sglang/request_handlers/handler_base.py
+++ b/components/src/dynamo/sglang/request_handlers/handler_base.py
@@ -19,9 +19,57 @@ from dynamo.runtime import DistributedRuntime
 from dynamo.sglang.args import Config
 from dynamo.sglang.publisher import DynamoSglangPublisher
-# Keep default tags minimal and safe for general use.
-# "cuda_graph" can still be requested explicitly, but it requires LD_PRELOAD setup.
+class SGLangEngineQuiesceController:
+    """Pause/release and resume/continue an sgl.Engine, tracking quiesced state."""
-DEFAULT_MEMORY_OCCUPATION_TAGS = ["kv_cache", "weights"]
+    def __init__(self, engine: sgl.Engine):
+        """Wrap ``engine``; the controller starts in the not-quiesced state."""
+        self._engine = engine
+        # Tags passed to the last successful quiesce(); reused by resume().
+        self._quiesced_tags: Optional[list[str]] = None
+        self._is_quiesced = False
+    @property
+    def is_quiesced(self) -> bool:
+        """True between a successful quiesce() and the next mark_resumed()."""
+        return self._is_quiesced
+    async def quiesce(self, tags: Optional[list[str]] = None) -> bool:
+        """Pause generation and release memory occupation for ``tags``.
+        Returns False without touching the engine when already quiesced,
+        True after both engine calls have completed.
+        """
+        if self._is_quiesced:
+            return False
+        # Imported at call time rather than at module import.
+        from sglang.srt.managers.io_struct import (
+            PauseGenerationReqInput,
+            ReleaseMemoryOccupationReqInput,
+        )
+        await self._engine.tokenizer_manager.pause_generation(PauseGenerationReqInput())
+        await self._engine.tokenizer_manager.release_memory_occupation(
+            ReleaseMemoryOccupationReqInput(tags=tags),
+            None,
+        )
+        # Keep a copy so later mutation of the caller's list has no effect.
+        self._quiesced_tags = None if tags is None else list(tags)
+        self._is_quiesced = True
+        return True
+    async def resume(self, tags: Optional[list[str]] = None) -> bool:
+        """Resume memory occupation, then continue generation.
+        When ``tags`` is None the tags recorded by quiesce() are used.
+        Returns False without touching the engine when not quiesced, True
+        otherwise. This method does not clear the quiesced state; the caller
+        does that with mark_resumed() once its own follow-up work succeeded.
+        """
+        if not self._is_quiesced:
+            return False
+        # Imported at call time rather than at module import.
+        from sglang.srt.managers.io_struct import (
+            ContinueGenerationReqInput,
+            ResumeMemoryOccupationReqInput,
+        )
+        request_tags = self._quiesced_tags if tags is None else list(tags)
+        await self._engine.tokenizer_manager.resume_memory_occupation(
+            ResumeMemoryOccupationReqInput(tags=request_tags),
+            None,
+        )
+        await self._engine.tokenizer_manager.continue_generation(
+            ContinueGenerationReqInput()
+        )
+        return True
+    def mark_resumed(self) -> None:
+        """Clear the recorded tags and the quiesced flag."""
+        self._quiesced_tags = None
+        self._is_quiesced = False
 class BaseGenerativeHandler(ABC):
@@ -148,8 +196,10 @@ class BaseWorkerHandler(BaseGenerativeHandler):
            # have an sgl.Engine.
            self.input_param_manager = InputParamManager(None)
            self._engine_supports_priority = False
-        self._memory_occupation_lock = asyncio.Lock()
+        self._quiesce_controller = (
-        self._memory_released = False
+            SGLangEngineQuiesceController(engine) if engine is not None else None
+        )
+        self._quiesce_lock = asyncio.Lock()
    def _priority_kwargs(self, priority: Any) -> Dict[str, Any]:
        if priority is not None and self._engine_supports_priority:
@@ -160,32 +210,23 @@ class BaseWorkerHandler(BaseGenerativeHandler):
        """Release GPU memory occupation and unregister from discovery.
        Args:
-            body: Unused. Release always targets default tags.
+            body: Optional dict with "tags" to target specific memory regions.
        Order of operations:
        1. Unregister from discovery - stop accepting new requests
        2. Pause generation - drain in-flight requests
        3. Release memory - safe now that no requests are active
        """
-        from sglang.srt.managers.io_struct import (
+        if self._quiesce_controller is None:
-            PauseGenerationReqInput,
-            ReleaseMemoryOccupationReqInput,
-        )
-        tags = list(DEFAULT_MEMORY_OCCUPATION_TAGS)
-        tokenizer_manager = (
-            getattr(self.engine, "tokenizer_manager", None)
-            if self.engine is not None
-            else None
-        )
-        if tokenizer_manager is None:
            return {
                "status": "error",
                "message": "memory control not supported on this worker",
            }
-        async with self._memory_occupation_lock:
+        body = body or {}
-            if self._memory_released:
+        tags = body.get("tags")
+        async with self._quiesce_lock:
+            if self._quiesce_controller.is_quiesced:
                return {
                    "status": "ok",
                    "message": "Memory already released",
@@ -196,16 +237,15 @@ class BaseWorkerHandler(BaseGenerativeHandler):
                if self.generate_endpoint is not None:
                    await self.generate_endpoint.unregister_endpoint_instance()
-                pause_req = PauseGenerationReqInput()
+                await self._quiesce_controller.quiesce(tags)
-                await tokenizer_manager.pause_generation(pause_req)
-                release_req = ReleaseMemoryOccupationReqInput(tags=tags)
-                await tokenizer_manager.release_memory_occupation(release_req, None)
-                self._memory_released = True
                return {
                    "status": "ok",
-                    "message": f"Memory released for tags: {tags}",
+                    "message": (
+                        f"Memory released for tags: {tags}"
+                        if tags is not None
+                        else "Memory released"
+                    ),
                }
            except Exception as e:
                logging.error(f"Failed to release memory occupation: {e}")
@@ -215,51 +255,42 @@ class BaseWorkerHandler(BaseGenerativeHandler):
        """Resume GPU memory occupation and re-register to discovery.
        Args:
-            body: Unused. Resume always targets default tags.
+            body: Optional dict with "tags" to target specific memory regions.
        Order of operations:
        1. Resume memory - restore GPU allocations
        2. Continue generation - ready to serve requests
        3. Re-register to discovery - allow frontend to route here
        """
-        from sglang.srt.managers.io_struct import (
+        if self._quiesce_controller is None:
-            ContinueGenerationReqInput,
-            ResumeMemoryOccupationReqInput,
-        )
-        tags = list(DEFAULT_MEMORY_OCCUPATION_TAGS)
-        tokenizer_manager = (
-            getattr(self.engine, "tokenizer_manager", None)
-            if self.engine is not None
-            else None
-        )
-        if tokenizer_manager is None:
            return {
                "status": "error",
                "message": "memory control not supported on this worker",
            }
-        async with self._memory_occupation_lock:
+        body = body or {}
-            if not self._memory_released:
+        tags = body.get("tags")
+        async with self._quiesce_lock:
+            if not self._quiesce_controller.is_quiesced:
                return {
                    "status": "ok",
                    "message": "Memory already resumed",
                }
            try:
-                resume_req = ResumeMemoryOccupationReqInput(tags=tags)
+                await self._quiesce_controller.resume(tags)
-                await tokenizer_manager.resume_memory_occupation(resume_req, None)
-                continue_req = ContinueGenerationReqInput()
-                await tokenizer_manager.continue_generation(continue_req)
                if self.generate_endpoint is not None:
                    await self.generate_endpoint.register_endpoint_instance()
+                self._quiesce_controller.mark_resumed()
-                self._memory_released = False
                return {
                    "status": "ok",
-                    "message": f"Memory resumed for tags: {tags}",
+                    "message": (
+                        f"Memory resumed for tags: {tags}"
+                        if tags is not None
+                        else "Memory resumed"
+                    ),
                }
            except Exception as e:
                logging.error(f"Failed to resume memory occupation: {e}")

--- a/components/src/dynamo/sglang/snapshot.py
+++ b/components/src/dynamo/sglang/snapshot.py
 # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
-"""
+"""Dynamo Snapshot integration for SGLang workers."""
-Dynamo Snapshot integration for SGLang workers.
-Handles the checkpoint job pod lifecycle:
-1. Early exit if a checkpoint already exists (idempotency)
-2. Sleep model for CRIU-friendly GPU state
-3. Signal readiness for DaemonSet to begin checkpoint
-4. Wait for watcher signals from the DaemonSet
-5. Wake model after restore
-SGLang does not have a native sleep/wake API like vLLM.  Instead we use
-release_memory_occupation / resume_memory_occupation through the
-SGLangCheckpointAdapter, which presents the same sleep()/wake_up()
-interface that CheckpointConfig.run_lifecycle expects.
-Environment variables:
- DYN_READY_FOR_CHECKPOINT_FILE: Path where this worker writes readiness marker
- DYN_CHECKPOINT_STORAGE_TYPE: Storage backend (pvc, s3, oci) (optional, defaults to pvc)
- DYN_CHECKPOINT_LOCATION: Full checkpoint path (optional when PATH+HASH are provided)
- DYN_CHECKPOINT_PATH + DYN_CHECKPOINT_HASH: PVC base path + hash (used to derive location)
-Signals handled in checkpoint mode:
- SIGUSR1: Checkpoint completed, exit process
- SIGCONT: Restore completed, wake model and continue
- SIGKILL (from watcher on failure): Process is terminated immediately (unhandleable)
-"""
-import asyncio
 import logging
-import os
-import signal
 import time
-from typing import Optional
 import sglang as sgl
-logger = logging.getLogger(__name__)
+from dynamo.common.utils.snapshot import CheckpointConfig, EngineSnapshotController
-_SLEEP_MODE_LEVEL = 1
-# Memory tags to release/resume for CRIU checkpoint/restore.
+from .request_handlers.handler_base import SGLangEngineQuiesceController
-# All GPU resources must be released so CRIU can snapshot the process cleanly.
-_MEMORY_TAGS = ["kv_cache", "weights", "cuda_graph"]
+logger = logging.getLogger(__name__)
-class SGLangCheckpointAdapter:
-    """Adapts an sgl.Engine to the sleep/wake_up interface expected by
-    CheckpointConfig.run_lifecycle (matching vLLM's AsyncLLM API).
-    sleep():   pause generation -> release GPU memory
-    wake_up(): resume GPU memory -> continue generation
-    """
-    def __init__(self, engine: sgl.Engine):
+async def prepare_snapshot_engine(
-        self._engine = engine
+    server_args,
+) -> EngineSnapshotController[sgl.Engine] | None:
-    async def sleep(self, level: int = 1) -> None:
-        from sglang.srt.managers.io_struct import (
-            PauseGenerationReqInput,
-            ReleaseMemoryOccupationReqInput,
-        )
-        # Drain in-flight requests before touching GPU memory
-        await self._engine.tokenizer_manager.pause_generation(PauseGenerationReqInput())
-        await self._engine.tokenizer_manager.release_memory_occupation(
-            ReleaseMemoryOccupationReqInput(tags=_MEMORY_TAGS), None
-        )
-    async def wake_up(self) -> None:
-        from sglang.srt.managers.io_struct import (
-            ContinueGenerationReqInput,
-            ResumeMemoryOccupationReqInput,
-        )
-        await self._engine.tokenizer_manager.resume_memory_occupation(
-            ResumeMemoryOccupationReqInput(tags=_MEMORY_TAGS), None
-        )
-        await self._engine.tokenizer_manager.continue_generation(
-            ContinueGenerationReqInput()
-        )
-class CheckpointConfig:
-    """Parsed and validated checkpoint configuration from environment variables."""
-    def __init__(self):
-        self.ready_file = os.environ["DYN_READY_FOR_CHECKPOINT_FILE"]
-        self.storage_type = os.environ.get("DYN_CHECKPOINT_STORAGE_TYPE", "pvc")
-        self.location = os.environ.get("DYN_CHECKPOINT_LOCATION", "")
-        if not self.location:
-            checkpoint_path = os.environ.get("DYN_CHECKPOINT_PATH", "").rstrip("/")
-            checkpoint_hash = os.environ.get("DYN_CHECKPOINT_HASH", "")
-            if checkpoint_path and checkpoint_hash:
-                self.location = f"{checkpoint_path}/{checkpoint_hash}"
-        self.is_checkpoint_job = bool(self.location)
-        self._checkpoint_done = asyncio.Event()
-        self._restore_done = asyncio.Event()
-    def checkpoint_exists(self) -> bool:
-        """Check if a completed checkpoint already exists (idempotency).
-        A checkpoint is complete when its directory exists at the base path root
-        (not under the tmp/ staging area). Directory presence = done.
-        """
-        if self.storage_type != "pvc":
-            return False
-        if os.path.isdir(self.location):
-            logger.info(f"Existing checkpoint found at {self.location}, skipping")
-            return True
-        logger.info(f"No checkpoint at {self.location}, creating new one")
-        return False
-    async def run_lifecycle(self, engine_client, sleep_level: int) -> bool:
-        """Run the full checkpoint lifecycle after the engine is loaded.
-        1. Put model to sleep (CRIU-friendly GPU state)
-        2. Write ready file (triggers DaemonSet checkpoint via readiness probe)
-        3. Wait for watcher signal (checkpoint complete, restore complete, or failure)
-        4. If restored: wake model and return True (caller proceeds with registration)
-        5. If checkpoint done: return False (caller should exit)
-        """
-        # Sleep model for checkpoint
-        logger.info(f"Putting model to sleep (level={sleep_level})")
-        await engine_client.sleep(level=sleep_level)
-        # Install signal handlers before writing the ready file so there is no
-        # window where the DaemonSet can send SIGUSR1/SIGCONT while the default
-        # signal disposition (terminate) is still in effect.
-        self._install_signal_handlers()
-        # Signal readiness
-        with open(self.ready_file, "w") as f:
-            f.write("ready")
-        logger.info(
-            "Ready for checkpoint. Waiting for watcher signal "
-            "(SIGUSR1=checkpoint complete, SIGCONT=restore complete)"
-        )
-        try:
-            event = await self._wait_for_watcher_signal()
-            if event == "restore":
-                logger.info("Restore signal detected (SIGCONT)")
-                logger.info("Waking up model after restore")
-                await engine_client.wake_up()
-                return True
-            # SIGUSR1: checkpoint complete
-            logger.info("Checkpoint completion signal detected (SIGUSR1)")
-            return False
-        finally:
-            self._remove_signal_handlers()
-            # Remove the ready file so that a restarting pod does not leave a
-            # stale marker that could trick the DaemonSet into acting on it.
-            try:
-                os.unlink(self.ready_file)
-            except OSError:
-                pass
-    def _install_signal_handlers(self) -> None:
-        loop = asyncio.get_running_loop()
-        loop.add_signal_handler(signal.SIGUSR1, self._checkpoint_done.set)
-        # SIGCONT is used as the restore-complete signal. The snapshot DaemonSet
-        # watcher is the only sender, so there is no conflict with POSIX
-        # job-control semantics in practice.
-        loop.add_signal_handler(signal.SIGCONT, self._restore_done.set)
-        # No handler for checkpoint failure: the watcher sends SIGKILL, which
-        # terminates the process immediately (cannot be caught).
-    def _remove_signal_handlers(self) -> None:
-        loop = asyncio.get_running_loop()
-        loop.remove_signal_handler(signal.SIGUSR1)
-        loop.remove_signal_handler(signal.SIGCONT)
-    async def _wait_for_watcher_signal(self) -> str:
-        waiters = {
-            asyncio.create_task(self._checkpoint_done.wait()): "checkpoint",
-            asyncio.create_task(self._restore_done.wait()): "restore",
-        }
-        try:
-            done, pending = await asyncio.wait(
-                waiters.keys(), return_when=asyncio.FIRST_COMPLETED
-            )
-            for task in pending:
-                task.cancel()
-            winner = done.pop()
-            await winner
-            return waiters[winner]
-        finally:
-            for task in waiters:
-                if not task.done():
-                    task.cancel()
-async def handle_checkpoint_mode(server_args) -> tuple[bool, Optional[sgl.Engine]]:
    """Single entry point for Dynamo Snapshot integration.
    Must be called BEFORE runtime creation so the engine can be checkpointed
    without active NATS/etcd connections.
    Returns:
-        (should_exit, engine) where:
+        None when not in checkpoint mode.
-        - (True, None): caller should return immediately (checkpoint already
+        A snapshot controller when restore completed and the caller should use
-          exists, or checkpoint completed successfully).
+        the restored engine.
-        - (False, None): not in checkpoint mode — cold-start normally.
-        - (False, engine): restore completed — caller should use this engine.
-    """
-    if "DYN_READY_FOR_CHECKPOINT_FILE" not in os.environ:
-        return False, None
-    # Validate: either a full location or path + hash must be set.
-    if not os.environ.get("DYN_CHECKPOINT_LOCATION"):
-        path = os.environ.get("DYN_CHECKPOINT_PATH", "")
-        hash_ = os.environ.get("DYN_CHECKPOINT_HASH", "")
-        if not path or not hash_:
-            raise EnvironmentError(
-                "Checkpoint mode requires either DYN_CHECKPOINT_LOCATION or both "
-                "DYN_CHECKPOINT_PATH and DYN_CHECKPOINT_HASH"
-            )
-    cfg = CheckpointConfig()
-    checkpoint_exists = cfg.checkpoint_exists()
-    if cfg.is_checkpoint_job and checkpoint_exists:
+        If checkpointing completed successfully, this function exits the
-        return True, None
+        process with status 0.
+    """
-    if not cfg.is_checkpoint_job and not checkpoint_exists:
+    checkpoint_cfg = CheckpointConfig.from_env()
-        return False, None
+    if checkpoint_cfg is None:
+        return None
    logger.info("Checkpoint mode enabled (watcher-driven signals)")
@@ -244,8 +48,12 @@ async def handle_checkpoint_mode(server_args) -> tuple[bool, Optional[sgl.Engine
        f"SGLang engine loaded in {time.time() - start_time:.2f}s (checkpoint mode)"
    )
-    adapter = SGLangCheckpointAdapter(engine)
+    snapshot_controller = EngineSnapshotController(
-    if not await cfg.run_lifecycle(adapter, _SLEEP_MODE_LEVEL):
+        engine=engine,
-        return True, None
+        quiesce_controller=SGLangEngineQuiesceController(engine),
+        checkpoint_config=checkpoint_cfg,
+    )
+    if not await snapshot_controller.wait_for_restore():
+        raise SystemExit(0)
-    return False, engine
+    return snapshot_controller
--- a/components/src/dynamo/sglang/tests/test_sglang_memory_occupation_handlers.py
+++ b/components/src/dynamo/sglang/tests/test_sglang_memory_occupation_handlers.py
@@ -10,8 +10,8 @@ from unittest.mock import AsyncMock
 import pytest
 from dynamo.sglang.request_handlers.handler_base import (
-    DEFAULT_MEMORY_OCCUPATION_TAGS,
    BaseWorkerHandler,
+    SGLangEngineQuiesceController,
 )
 pytestmark = [
@@ -59,8 +59,8 @@ def _make_handler() -> _TestWorkerHandler:
        unregister_endpoint_instance=AsyncMock(),
        register_endpoint_instance=AsyncMock(),
    )
-    handler._memory_occupation_lock = asyncio.Lock()
+    handler._quiesce_controller = SGLangEngineQuiesceController(handler.engine)
-    handler._memory_released = False
+    handler._quiesce_lock = asyncio.Lock()
    return handler
@@ -93,7 +93,6 @@ async def test_release_and_resume_are_idempotent():
    assert second_resume["status"] == "ok"
    assert second_release["message"] == "Memory already released"
    assert second_resume["message"] == "Memory already resumed"
-    assert DEFAULT_MEMORY_OCCUPATION_TAGS == ["kv_cache", "weights"]
    release_req = (
        handler.engine.tokenizer_manager.release_memory_occupation.await_args.args[0]
@@ -101,8 +100,8 @@ async def test_release_and_resume_are_idempotent():
    resume_req = (
        handler.engine.tokenizer_manager.resume_memory_occupation.await_args.args[0]
    )
-    assert release_req.tags == DEFAULT_MEMORY_OCCUPATION_TAGS
+    assert release_req.tags is None
-    assert resume_req.tags == DEFAULT_MEMORY_OCCUPATION_TAGS
+    assert resume_req.tags is None
    handler.engine.tokenizer_manager.pause_generation.assert_awaited_once()
    handler.engine.tokenizer_manager.release_memory_occupation.assert_awaited_once()
@@ -114,17 +113,37 @@ async def test_release_and_resume_are_idempotent():
 @pytest.mark.asyncio
-async def test_resume_uses_default_tags_even_when_request_specifies_subset():
+async def test_release_and_resume_use_explicit_request_tags():
    handler = _make_handler()
    await handler.release_memory_occupation({"tags": ["weights"]})
    resume_result = await handler.resume_memory_occupation({"tags": ["weights"]})
+    assert resume_result["status"] == "ok"
+    release_req = (
+        handler.engine.tokenizer_manager.release_memory_occupation.await_args.args[0]
+    )
+    resume_req = (
+        handler.engine.tokenizer_manager.resume_memory_occupation.await_args.args[0]
+    )
+    assert release_req.tags == ["weights"]
+    assert resume_req.tags == ["weights"]
+    handler.engine.tokenizer_manager.continue_generation.assert_awaited_once()
+    handler.generate_endpoint.register_endpoint_instance.assert_awaited_once()
+@pytest.mark.asyncio
+async def test_resume_reuses_release_tags_when_request_omits_them():
+    handler = _make_handler()
+    await handler.release_memory_occupation({"tags": ["weights"]})
+    resume_result = await handler.resume_memory_occupation({})
    assert resume_result["status"] == "ok"
    resume_req = (
        handler.engine.tokenizer_manager.resume_memory_occupation.await_args.args[0]
    )
-    assert resume_req.tags == DEFAULT_MEMORY_OCCUPATION_TAGS
+    assert resume_req.tags == ["weights"]
    handler.engine.tokenizer_manager.continue_generation.assert_awaited_once()
    handler.generate_endpoint.register_endpoint_instance.assert_awaited_once()
@@ -146,6 +165,7 @@ async def test_resume_with_no_sleeping_state_is_noop():
 async def test_release_returns_error_when_worker_has_no_tokenizer_manager():
    handler = _make_handler()
    handler.engine = None
+    handler._quiesce_controller = None
    result = await handler.release_memory_occupation({})
@@ -160,6 +180,7 @@ async def test_release_returns_error_when_worker_has_no_tokenizer_manager():
 async def test_resume_returns_error_when_worker_has_no_tokenizer_manager():
    handler = _make_handler()
    handler.engine = None
+    handler._quiesce_controller = None
    result = await handler.resume_memory_occupation({})
@@ -168,3 +189,18 @@ async def test_resume_returns_error_when_worker_has_no_tokenizer_manager():
        "message": "memory control not supported on this worker",
    }
    handler.generate_endpoint.register_endpoint_instance.assert_not_awaited()
+@pytest.mark.asyncio
+async def test_resume_keeps_quiesced_state_when_register_fails():
+    """A failing endpoint re-registration makes resume report an error and stay quiesced."""
+    handler = _make_handler()
+    await handler.release_memory_occupation({})
+    # Make the re-registration step raise after the worker has been released.
+    handler.generate_endpoint.register_endpoint_instance = AsyncMock(
+        side_effect=RuntimeError("discovery write timeout")
+    )
+    result = await handler.resume_memory_occupation({})
+    assert result["status"] == "error"
+    assert handler._quiesce_controller is not None
+    assert handler._quiesce_controller.is_quiesced is True
--- a/components/src/dynamo/vllm/backend_args.py
+++ b/components/src/dynamo/vllm/backend_args.py
@@ -63,16 +63,6 @@ class DynamoVllmArgGroup(ArgGroup):
            help="Use vLLM's tokenizer for pre and post processing. This bypasses Dynamo's preprocessor and only v1/chat/completions will be available through the Dynamo frontend.",
        )
-        add_argument(
-            g,
-            flag_name="--sleep-mode-level",
-            env_var="DYN_VLLM_SLEEP_MODE_LEVEL",
-            default=1,
-            help="Sleep mode level (1=offload to CPU, 2=discard weights, 3=discard all).",
-            choices=[1, 2, 3],
-            arg_type=int,
-        )
        # Multimodal
        add_negatable_bool_argument(
            g,
@@ -178,7 +168,6 @@ class DynamoVllmConfig(ConfigBase):
    is_prefill_worker: bool
    is_decode_worker: bool
    use_vllm_tokenizer: bool
-    sleep_mode_level: int
    # Multimodal
    route_to_encoder: bool

--- a/components/src/dynamo/vllm/handlers.py
+++ b/components/src/dynamo/vllm/handlers.py
@@ -52,6 +52,43 @@ configure_dynamo_logging()
 logger = logging.getLogger(__name__)
+class VllmEngineQuiesceController:
+    """Pause/sleep and wake/resume a vLLM engine client while tracking quiesced state."""
+    def __init__(self, engine_client: Any):
+        # engine_client is awaited for pause_generation/sleep/wake_up/resume_generation.
+        self._engine_client = engine_client
+        self._is_quiesced = False
+    @property
+    def is_quiesced(self) -> bool:
+        """True after a successful quiesce() until mark_resumed() is called."""
+        return self._is_quiesced
+    async def quiesce(self, *args: object) -> bool:
+        """Pause generation, then put the engine to sleep.
+        Args:
+            *args: Optional; the first positional value is the sleep level. When it
+                is absent or None, sleep() is called with no arguments.
+        Returns:
+            False if already quiesced (nothing is done), True after sleeping.
+        """
+        if self._is_quiesced:
+            return False
+        level = args[0] if args else None
+        # NOTE(review): the flag is only set after sleep() returns, so a sleep()
+        # failure leaves pause_generation() applied with is_quiesced still False;
+        # confirm callers handle that case.
+        await self._engine_client.pause_generation()
+        if level is None:
+            await self._engine_client.sleep()
+        else:
+            await self._engine_client.sleep(level)
+        self._is_quiesced = True
+        return True
+    async def resume(self, tags: list[str] | None = None) -> bool:
+        """Wake the engine and resume generation without clearing the quiesced flag.
+        Args:
+            tags: Passed to wake_up() when not None; otherwise wake_up() is called
+                with no arguments.
+        Returns:
+            False if not quiesced (nothing is done), True otherwise. The flag stays
+            set until mark_resumed() is called.
+        """
+        if not self._is_quiesced:
+            return False
+        if tags is None:
+            await self._engine_client.wake_up()
+        else:
+            await self._engine_client.wake_up(tags)
+        await self._engine_client.resume_generation()
+        return True
+    def mark_resumed(self) -> None:
+        """Clear the quiesced flag; resume() does not do this itself."""
+        self._is_quiesced = False
 @dataclass(frozen=True)
 class LoRAInfo:
    """Metadata for a loaded LoRA adapter."""
@@ -332,8 +369,8 @@ class BaseWorkerHandler(ABC):
        self.use_vllm_tokenizer = use_vllm_tokenizer
        self.dp_range = get_dp_range_for_worker(self.engine_client.vllm_config)
-        self._sleep_wake_lock = asyncio.Lock()
+        self._quiesce_controller = VllmEngineQuiesceController(self.engine_client)
-        self._engine_is_sleeping = False
+        self._quiesce_lock = asyncio.Lock()
        # Initialize InputParamManager for text-in-text-out mode
        tokenizer = None
@@ -357,8 +394,8 @@ class BaseWorkerHandler(ABC):
        """
        body = body or {}
        level = body.get("level", 1)
-        async with self._sleep_wake_lock:
+        async with self._quiesce_lock:
-            if self._engine_is_sleeping:
+            if self._quiesce_controller.is_quiesced:
                return {
                    "status": "ok",
                    "message": "Engine already sleeping",
@@ -374,11 +411,11 @@ class BaseWorkerHandler(ABC):
                # Step 2: Abort in-flight requests and wait for them to drain so the
                # GPU is fully quiesced before unmapping memory.
-                await self.engine_client.pause_generation()
+                if not await self._quiesce_controller.quiesce(level):
+                    return {
-                # Step 3: Now safe to sleep - no in-flight GPU work
+                        "status": "ok",
-                await self.engine_client.sleep(level)
+                        "message": "Engine already sleeping",
-                self._engine_is_sleeping = True
+                    }
                return {
                    "status": "ok",
@@ -392,29 +429,27 @@ class BaseWorkerHandler(ABC):
        """Wake the engine to restore GPU memory and re-register to discovery.
        Args:
-            body: Unused. Wake always restores all sleep-managed memory.
+            body: Optional dict with "tags" to request a partial wake.
        Order of operations:
        1. Wake engine - restore GPU memory
        2. Re-register endpoint instance - allow frontend to route requests here again
        """
-        async with self._sleep_wake_lock:
+        body = body or {}
-            if not self._engine_is_sleeping:
+        tags = body.get("tags")
+        async with self._quiesce_lock:
+            if not self._quiesce_controller.is_quiesced:
                return {"status": "ok", "message": "Engine already awake"}
            try:
                # Step 1: Wake engine first - must be ready before accepting requests
-                await self.engine_client.wake_up()
+                await self._quiesce_controller.resume(tags)
-                # Step 2: Resume generation and re-register.
-                await self.engine_client.resume_generation()
                if self.generate_endpoint is not None:
                    await self.generate_endpoint.register_endpoint_instance()
                    logger.info(
                        "[Wake] Re-registered endpoint to discovery - worker added back to routing pool"
                    )
+                self._quiesce_controller.mark_resumed()
-                self._engine_is_sleeping = False
                return {
                    "status": "ok",

--- a/components/src/dynamo/vllm/main.py
+++ b/components/src/dynamo/vllm/main.py
@@ -44,7 +44,7 @@ from .constants import DisaggregationMode
 from .handlers import DecodeWorkerHandler, PrefillWorkerHandler, get_dp_range_for_worker
 from .health_check import VllmHealthCheckPayload, VllmPrefillHealthCheckPayload
 from .publisher import DYNAMO_COMPONENT_REGISTRY, StatLoggerFactory
-from .snapshot import get_checkpoint_config
+from .snapshot import prepare_snapshot_engine
 # Optional imports for frontend decoding support
 MediaDecoder: type | None = None
@@ -61,7 +61,6 @@ except ImportError:
 configure_dynamo_logging()
 logger = logging.getLogger(__name__)
 shutdown_endpoints: list = []
-CHECKPOINT_SLEEP_MODE_LEVEL = 1
 def build_headless_namespace(config: Config) -> argparse.Namespace:
@@ -102,11 +101,6 @@ async def worker() -> None:
    if not config.served_model_name:
        config.served_model_name = config.engine_args.served_model_name = config.model
-    # Check checkpoint mode and validate env vars EARLY (fail fast if misconfigured)
-    early_exit, checkpoint_cfg = get_checkpoint_config()
-    if early_exit:
-        return
    # Download the model if necessary using modelexpress.
    # We want it on disk before we start vllm to avoid downloading from HuggingFace.
    #
@@ -119,35 +113,27 @@ async def worker() -> None:
    if not os.path.exists(config.model):
        await fetch_model(config.model)
+    # CHECKPOINT MODE: Load engine BEFORE runtime creation
+    # This allows checkpointing GPU state before runtime connections are established
+    snapshot_controller = await prepare_snapshot_engine(
+        config,
+        setup_vllm_engine,
+    )
+    snapshot_engine = None
+    if snapshot_controller is not None:
+        snapshot_engine = snapshot_controller.engine
+        (
+            config.namespace,
+            config.discovery_backend,
+        ) = snapshot_controller.reload_restore_identity()
    # HEADLESS MODE: bypass DistributedRuntime entirely.
    # Workers run vLLM only (no NATS, etcd, or dynamo endpoints).
    if config.headless:
-        if checkpoint_cfg is not None:
-            raise ValueError(
-                "--headless is incompatible with checkpoint mode "
-                "(DYN_CHECKPOINT_SIGNAL_FILE is set). "
-                "Remove --headless or unset DYN_CHECKPOINT_SIGNAL_FILE."
-            )
        run_dynamo_headless(config)
        return
-    # CHECKPOINT MODE: Load engine BEFORE runtime creation
-    # This allows checkpointing GPU state before runtime connections are established
-    snapshot_engine = None
-    if checkpoint_cfg is not None:
-        logger.info("Checkpoint mode enabled (watcher-driven signals)")
-        # Checkpoint mode requires sleep mode — enable before engine init
-        config.engine_args.enable_sleep_mode = True
-        snapshot_engine = setup_vllm_engine(config)
-        engine_client = snapshot_engine[0]
-        if not await checkpoint_cfg.run_lifecycle(
-            engine_client, CHECKPOINT_SLEEP_MODE_LEVEL
-        ):
-            return
    shutdown_event = asyncio.Event()
    runtime, loop = create_runtime(
        discovery_backend=config.discovery_backend,

--- a/components/src/dynamo/vllm/snapshot.py
+++ b/components/src/dynamo/vllm/snapshot.py
 # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
-"""
-Dynamo Snapshot integration for vLLM workers.
-Handles the checkpoint job pod lifecycle:
-1. Early exit if a checkpoint already exists (idempotency)
-2. Sleep model for CRIU-friendly GPU state
-3. Signal readiness for DaemonSet to begin checkpoint
-4. Wait for watcher signals from the DaemonSet
-5. Wake model after restore
-Environment variables:
- DYN_READY_FOR_CHECKPOINT_FILE: Path where this worker writes readiness marker
- DYN_CHECKPOINT_STORAGE_TYPE: Storage backend (pvc, s3, oci) (optional, defaults to pvc)
- DYN_CHECKPOINT_LOCATION: Full checkpoint path (optional when PATH+HASH are provided)
- DYN_CHECKPOINT_PATH + DYN_CHECKPOINT_HASH: PVC base path + hash (used to derive location)
-Signals handled in checkpoint mode:
- SIGUSR1: Checkpoint completed, exit process
- SIGCONT: Restore completed, wake model and continue
- SIGKILL (from watcher on failure): Process is terminated immediately (unhandleable)
-"""
-import asyncio
 import logging
-import os
+from collections.abc import Callable
-import signal
-from typing import Optional
-logger = logging.getLogger(__name__)
+from dynamo.common.utils.snapshot import CheckpointConfig, EngineSnapshotController
-class CheckpointConfig:
+from .args import Config
-    """Parsed and validated checkpoint configuration from environment variables."""
+from .handlers import VllmEngineQuiesceController
+from .worker_factory import EngineSetupResult
-    def __init__(self):
+logger = logging.getLogger(__name__)
-        self.ready_file = os.environ["DYN_READY_FOR_CHECKPOINT_FILE"]
-        self.storage_type = os.environ.get("DYN_CHECKPOINT_STORAGE_TYPE", "pvc")
-        self.location = os.environ.get("DYN_CHECKPOINT_LOCATION", "")
-        if not self.location:
-            checkpoint_path = os.environ.get("DYN_CHECKPOINT_PATH", "").rstrip("/")
-            checkpoint_hash = os.environ.get("DYN_CHECKPOINT_HASH", "")
-            if checkpoint_path and checkpoint_hash:
-                self.location = f"{checkpoint_path}/{checkpoint_hash}"
-        self.is_checkpoint_job = bool(self.location)
-        self._checkpoint_done = asyncio.Event()
-        self._restore_done = asyncio.Event()
-    def checkpoint_exists(self) -> bool:
-        """Check if a completed checkpoint already exists (idempotency).
-        A checkpoint is complete when its directory exists at the base path root
-        (not under the tmp/ staging area). Directory presence = done.
-        """
-        if self.storage_type != "pvc":
-            return False
-        if os.path.isdir(self.location):
-            logger.info(f"Existing checkpoint found at {self.location}, skipping")
-            return True
-        logger.info(f"No checkpoint at {self.location}, creating new one")
-        return False
-    async def run_lifecycle(self, engine_client, sleep_level: int) -> bool:
-        """Run the full checkpoint lifecycle after the engine is loaded.
-        1. Put model to sleep (CRIU-friendly GPU state)
-        2. Write ready file (triggers DaemonSet checkpoint via readiness probe)
-        3. Wait for watcher signal (checkpoint complete, restore complete, or failure)
-        4. If restored: wake model and return True (caller proceeds with registration)
-        5. If checkpoint done: return False (caller should exit)
-        """
-        # Sleep model for checkpoint
-        logger.info(f"Putting model to sleep (level={sleep_level})")
-        await engine_client.sleep(level=sleep_level)
-        # Install signal handlers before writing the ready file so there is no
+async def prepare_snapshot_engine(
-        # window where the DaemonSet can send SIGUSR1/SIGCONT while the default
+    config: Config,
-        # signal disposition (terminate) is still in effect.
+    setup_vllm_engine: Callable[[Config], EngineSetupResult],
-        self._install_signal_handlers()
+) -> EngineSnapshotController[EngineSetupResult] | None:
+    """Set up the vLLM engine for checkpoint mode and wait on the snapshot controller.
+    Args:
+        config: Worker configuration; ``engine_args.enable_sleep_mode`` is forced
+            to True before the engine is created.
+        setup_vllm_engine: Factory that builds the engine from ``config``; the first
+            element of its result is used as the engine client.
+    Returns:
+        None when ``CheckpointConfig.from_env()`` returns None (checkpoint mode
+        off), otherwise the controller whose ``wait_for_restore()`` was truthy.
+    Raises:
+        ValueError: If checkpoint mode is enabled together with ``--headless``.
+        SystemExit: With code 0 when ``wait_for_restore()`` returns a falsy value.
+    """
+    checkpoint_config = CheckpointConfig.from_env()
+    if checkpoint_config is None:
+        return None
-        # Signal readiness
+    # Checkpoint mode is keyed off DYN_READY_FOR_CHECKPOINT_FILE, so name that
+    # variable in the error to point operators at the setting they must change.
+    if config.headless:
-        with open(self.ready_file, "w") as f:
+        raise ValueError(
-            f.write("ready")
+            "--headless is incompatible with checkpoint mode "
-        logger.info(
+            "(DYN_READY_FOR_CHECKPOINT_FILE is set). "
-            "Ready for checkpoint. Waiting for watcher signal "
+            "Remove --headless or unset DYN_READY_FOR_CHECKPOINT_FILE."
-            "(SIGUSR1=checkpoint complete, SIGCONT=restore complete)"
        )
-        try:
+    logger.info("Checkpoint mode enabled (watcher-driven signals)")
-            event = await self._wait_for_watcher_signal()
+    config.engine_args.enable_sleep_mode = True
-            if event == "restore":
-                logger.info("Restore signal detected (SIGCONT)")
-                logger.info("Waking up model after restore")
-                await engine_client.wake_up()
-                return True
-            # SIGUSR1: checkpoint complete
-            logger.info("Checkpoint completion signal detected (SIGUSR1)")
-            return False
-        finally:
-            self._remove_signal_handlers()
-            # Remove the ready file so that a restarting pod does not leave a
-            # stale marker that could trick the DaemonSet into acting on it.
-            try:
-                os.unlink(self.ready_file)
-            except OSError:
-                pass
-    def _install_signal_handlers(self) -> None:
-        loop = asyncio.get_running_loop()
-        loop.add_signal_handler(signal.SIGUSR1, self._checkpoint_done.set)
-        # SIGCONT is used as the restore-complete signal. The snapshot DaemonSet
-        # watcher is the only sender, so there is no conflict with POSIX
-        # job-control semantics in practice.
-        loop.add_signal_handler(signal.SIGCONT, self._restore_done.set)
-        # No handler for checkpoint failure: the watcher sends SIGKILL, which
-        # terminates the process immediately (cannot be caught).
-    def _remove_signal_handlers(self) -> None:
-        loop = asyncio.get_running_loop()
-        loop.remove_signal_handler(signal.SIGUSR1)
-        loop.remove_signal_handler(signal.SIGCONT)
-    async def _wait_for_watcher_signal(self) -> str:
-        waiters = {
-            asyncio.create_task(self._checkpoint_done.wait()): "checkpoint",
-            asyncio.create_task(self._restore_done.wait()): "restore",
-        }
-        try:
-            done, pending = await asyncio.wait(
-                waiters.keys(), return_when=asyncio.FIRST_COMPLETED
-            )
-            for task in pending:
-                task.cancel()
-            winner = done.pop()
-            await winner
-            return waiters[winner]
-        finally:
-            for task in waiters:
-                if not task.done():
-                    task.cancel()
-def get_checkpoint_config() -> tuple[bool, Optional[CheckpointConfig]]:
-    """Resolve checkpoint configuration, handling early-exit and cold-start cases.
-    Checkpoint mode is detected by DYN_READY_FOR_CHECKPOINT_FILE being set.
-    Returns:
-        (early_exit, config) where:
-        - early_exit=True, config=None: checkpoint job re-run, checkpoint already
-          exists — caller should return immediately.
-        - early_exit=False, config=None: not in checkpoint mode, or regular worker
-          with no checkpoint available yet — cold-start normally.
-        - early_exit=False, config=CheckpointConfig: checkpoint lifecycle should run.
-    """
-    if "DYN_READY_FOR_CHECKPOINT_FILE" not in os.environ:
-        return False, None
-    # Validate checkpoint location: either a full location or path + hash must be set.
-    # Check the value (not just presence) so an empty string is treated as unset.
-    if not os.environ.get("DYN_CHECKPOINT_LOCATION"):
-        path = os.environ.get("DYN_CHECKPOINT_PATH", "")
-        hash_ = os.environ.get("DYN_CHECKPOINT_HASH", "")
-        if not path or not hash_:
-            raise EnvironmentError(
-                "Checkpoint mode requires either DYN_CHECKPOINT_LOCATION or both "
-                "DYN_CHECKPOINT_PATH and DYN_CHECKPOINT_HASH"
-            )
-    cfg = CheckpointConfig()
-    checkpoint_exists = cfg.checkpoint_exists()
-    if cfg.is_checkpoint_job and checkpoint_exists:
-        # Idempotent checkpoint job re-run: checkpoint already exists.
-        return True, None
-    if not cfg.is_checkpoint_job and not checkpoint_exists:
+    engine = setup_vllm_engine(config)
-        # Regular worker with no checkpoint available yet: cold-start normally.
+    snapshot_controller = EngineSnapshotController(
-        return False, None
+        engine=engine,
+        quiesce_controller=VllmEngineQuiesceController(engine[0]),
+        checkpoint_config=checkpoint_config,
+        quiesce_args=(None,),
+    )
+    if not await snapshot_controller.wait_for_restore():
+        raise SystemExit(0)
-    return False, cfg
+    return snapshot_controller
--- a/components/src/dynamo/vllm/tests/test_vllm_sleep_wake_handlers.py
+++ b/components/src/dynamo/vllm/tests/test_vllm_sleep_wake_handlers.py
@@ -7,7 +7,7 @@ from unittest.mock import AsyncMock
 import pytest
-from dynamo.vllm.handlers import BaseWorkerHandler
+from dynamo.vllm.handlers import BaseWorkerHandler, VllmEngineQuiesceController
 pytestmark = [
    pytest.mark.unit,
@@ -34,8 +34,8 @@ def _make_handler() -> _TestWorkerHandler:
        unregister_endpoint_instance=AsyncMock(),
        register_endpoint_instance=AsyncMock(),
    )
-    handler._sleep_wake_lock = asyncio.Lock()
+    handler._quiesce_controller = VllmEngineQuiesceController(handler.engine_client)
-    handler._engine_is_sleeping = False
+    handler._quiesce_lock = asyncio.Lock()
    return handler
@@ -74,6 +74,36 @@ async def test_sleep_and_wake_are_idempotent():
    handler.generate_endpoint.register_endpoint_instance.assert_awaited_once()
+@pytest.mark.asyncio
+async def test_quiesce_without_level_uses_vllm_default_sleep():
+    """quiesce(None) pauses generation and calls sleep() with no level argument."""
+    engine_client = SimpleNamespace(
+        pause_generation=AsyncMock(),
+        sleep=AsyncMock(),
+        wake_up=AsyncMock(),
+        resume_generation=AsyncMock(),
+    )
+    controller = VllmEngineQuiesceController(engine_client)
+    changed = await controller.quiesce(None)
+    assert changed is True
+    engine_client.pause_generation.assert_awaited_once()
+    # No positional level must be forwarded when the caller passes None.
+    engine_client.sleep.assert_awaited_once_with()
+@pytest.mark.asyncio
+async def test_wake_up_passes_explicit_tags_from_request():
+    """wake_up forwards ``tags`` from the request body to the engine client's wake_up()."""
+    handler = _make_handler()
+    # Quiesce through the controller directly so wake_up has a sleeping engine to wake.
+    await handler._quiesce_controller.quiesce(1)
+    result = await handler.wake_up({"tags": ["weights"]})
+    assert result["status"] == "ok"
+    handler.engine_client.wake_up.assert_awaited_once_with(["weights"])
+    handler.engine_client.resume_generation.assert_awaited_once()
+    handler.generate_endpoint.register_endpoint_instance.assert_awaited_once()
 @pytest.mark.asyncio
 async def test_sleep_returns_error_for_unregister_failure():
    handler = _make_handler()
@@ -91,7 +121,7 @@ async def test_sleep_returns_error_for_unregister_failure():
 @pytest.mark.asyncio
 async def test_wake_up_returns_error_for_register_failure():
    handler = _make_handler()
-    handler._engine_is_sleeping = True
+    await handler._quiesce_controller.quiesce(1)
    handler.generate_endpoint.register_endpoint_instance = AsyncMock(
        side_effect=RuntimeError("discovery write timeout")
    )
@@ -101,3 +131,4 @@ async def test_wake_up_returns_error_for_register_failure():
    assert result["status"] == "error"
    handler.engine_client.wake_up.assert_awaited_once_with()
    handler.engine_client.resume_generation.assert_awaited_once()
+    assert handler._quiesce_controller.is_quiesced is True
--- a/deploy/helm/charts/platform/components/operator/crds/nvidia.com_dynamocheckpoints.yaml
+++ b/deploy/helm/charts/platform/components/operator/crds/nvidia.com_dynamocheckpoints.yaml
@@ -124,11 +124,12 @@ spec:
                      default: 3600
                      description: ActiveDeadlineSeconds specifies the maximum time the Job can run
                      format: int64
+                      minimum: 1
                      type: integer
                    backoffLimit:
-                      default: 3
+                      description: 'Deprecated: BackoffLimit is ignored. Checkpoint Jobs never retry.'
-                      description: BackoffLimit specifies the number of retries before marking the Job failed
                      format: int32
+                      minimum: 0
                      type: integer
                    podTemplateSpec:
                      description: |-
@@ -8154,10 +8155,28 @@ spec:
                            - containers
                          type: object
                      type: object
+                    sharedMemory:
+                      description: |-
+                        SharedMemory controls the tmpfs mounted at /dev/shm for the checkpoint Job pod.
+                        When omitted, checkpoint Jobs use the same default 8Gi tmpfs as Dynamo components.
+                      properties:
+                        disabled:
+                          type: boolean
+                        size:
+                          anyOf:
+                            - type: integer
+                            - type: string
+                          pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
+                          x-kubernetes-int-or-string: true
+                      type: object
+                      x-kubernetes-validations:
+                        - message: sharedMemory.size must not be set when sharedMemory.disabled is true
+                          rule: '!(self.disabled && has(self.size))'
                    ttlSecondsAfterFinished:
                      default: 300
                      description: TTLSecondsAfterFinished specifies how long to keep the Job after completion
                      format: int32
+                      minimum: 0
                      type: integer
                  required:
                    - podTemplateSpec
@@ -8170,7 +8189,7 @@ spec:
              description: DynamoCheckpointStatus defines the observed state of DynamoCheckpoint
              properties:
                conditions:
-                  description: Conditions represent the latest available observations of the checkpoint's state
+                  description: 'DEPRECATED: Conditions are deprecated. Use status.phase instead.'
                  items:
                    description: Condition contains details for one aspect of the current state of this API Resource.
                    properties:

--- a/deploy/helm/charts/platform/components/operator/crds/nvidia.com_dynamocomponentdeployments.yaml
+++ b/deploy/helm/charts/platform/components/operator/crds/nvidia.com_dynamocomponentdeployments.yaml
@@ -678,8 +678,8 @@ spec:
                  properties:
                    checkpointRef:
                      description: |-
-                        CheckpointRef references an existing Checkpoint CR to use
+                        CheckpointRef references an existing DynamoCheckpoint CR by metadata.name.
-                        If specified, Identity is ignored and this checkpoint is used directly
+                        If specified, this service's Identity is ignored and the referenced checkpoint is used directly.
                      type: string
                    enabled:
                      default: false
@@ -11211,6 +11211,9 @@ spec:
                      pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
                      x-kubernetes-int-or-string: true
                  type: object
+                  x-kubernetes-validations:
+                    - message: sharedMemory.size must not be set when sharedMemory.disabled is true
+                      rule: '!(self.disabled && has(self.size))'
                subComponentType:
                  description: SubComponentType indicates the sub-role of this component (for example, "prefill").
                  type: string

--- a/deploy/helm/charts/platform/components/operator/crds/nvidia.com_dynamographdeployments.yaml
+++ b/deploy/helm/charts/platform/components/operator/crds/nvidia.com_dynamographdeployments.yaml
@@ -887,8 +887,8 @@ spec:
                        properties:
                          checkpointRef:
                            description: |-
-                              CheckpointRef references an existing Checkpoint CR to use
+                              CheckpointRef references an existing DynamoCheckpoint CR by metadata.name.
-                              If specified, Identity is ignored and this checkpoint is used directly
+                              If specified, this service's Identity is ignored and the referenced checkpoint is used directly.
                            type: string
                          enabled:
                            default: false
@@ -11420,6 +11420,9 @@ spec:
                            pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
                            x-kubernetes-int-or-string: true
                        type: object
+                        x-kubernetes-validations:
+                          - message: sharedMemory.size must not be set when sharedMemory.disabled is true
+                            rule: '!(self.disabled && has(self.size))'
                      subComponentType:
                        description: SubComponentType indicates the sub-role of this component (for example, "prefill").
                        type: string
@@ -11466,7 +11469,7 @@ spec:
                        description: IdentityHash is the computed hash of the checkpoint identity
                        type: string
                      ready:
-                        description: Ready indicates if the checkpoint is ready for use
+                        description: Ready indicates if the checkpoint was visible to the worker at startup
                        type: boolean
                    type: object
                  description: |-

--- a/deploy/helm/charts/snapshot/README.md
+++ b/deploy/helm/charts/snapshot/README.md
@@ -33,6 +33,8 @@ dynamo-operator:
        basePath: /checkpoints
 ```
+The snapshot-agent no longer reads `basePath` from its ConfigMap, but the operator still uses its configured PVC base path when it annotates checkpoint and restore pods. That path must match `storage.pvc.basePath` here so the mounted checkpoint location is valid inside the agent pod.
 Cross-node restore requires a shared `ReadWriteMany` storage class. The chart defaults to `storage.pvc.accessMode=ReadWriteMany`.
 For better restore times, use a fast `ReadWriteMany` StorageClass for the checkpoint PVC.
@@ -81,7 +83,7 @@ kubectl get pods -n ${NAMESPACE} -l app.kubernetes.io/name=snapshot -o wide
 | `storage.pvc.size` | Requested PVC size | `1Ti` |
 | `storage.pvc.storageClass` | Storage class name | `""` |
 | `storage.pvc.accessMode` | Access mode for the checkpoint PVC | `ReadWriteMany` |
-| `storage.pvc.basePath` | Checkpoint root inside the PVC | `/checkpoints` |
+| `storage.pvc.basePath` | PVC mount path inside the snapshot-agent pod | `/checkpoints` |
 | `daemonset.image.repository` | Snapshot agent image repository | `nvcr.io/nvidia/ai-dynamo/snapshot-agent` |
 | `daemonset.image.tag` | Snapshot agent image tag | `1.0.0` |
 | `daemonset.imagePullSecrets` | Image pull secrets for the agent | `[{name: ngc-secret}]` |

--- a/deploy/helm/charts/snapshot/templates/configmap.yaml
+++ b/deploy/helm/charts/snapshot/templates/configmap.yaml
@@ -10,12 +10,8 @@ metadata:
    {{- include "snapshot.labels" . | nindent 4 }}
 data:
  config.yaml: |
-    basePath: {{ .Values.storage.pvc.basePath | quote }}
    overlay:
-      systemDirs: {{ toYaml .Values.config.overlay.systemDirs | nindent 8 }}
+      exclusions: {{ toYaml .Values.config.overlay.exclusions | nindent 8 }}
-      cacheDirs: {{ toYaml .Values.config.overlay.cacheDirs | nindent 8 }}
-      additionalExclusions: {{ toYaml .Values.config.overlay.additionalExclusions | nindent 8 }}
    restore:
      nsRestorePath: {{ .Values.config.restore.nsRestorePath | quote }}
@@ -29,6 +25,7 @@ data:
      leaveRunning: {{ .Values.config.criu.leaveRunning }}
      shellJob: {{ .Values.config.criu.shellJob }}
      tcpClose: {{ .Values.config.criu.tcpClose }}
+      tcpEstablished: {{ .Values.config.criu.tcpEstablished }}
      fileLocks: {{ .Values.config.criu.fileLocks }}
      orphanPtsMaster: {{ .Values.config.criu.orphanPtsMaster }}
      extUnixSk: {{ .Values.config.criu.extUnixSk }}

--- a/deploy/helm/charts/snapshot/templates/role.yaml
+++ b/deploy/helm/charts/snapshot/templates/role.yaml
@@ -16,6 +16,14 @@ rules:
  - apiGroups: [""]
    resources: ["pods"]
    verbs: ["get", "list", "watch", "patch", "update"]
+  # Patch checkpoint Jobs with terminal checkpoint status
+  - apiGroups: ["batch"]
+    resources: ["jobs"]
+    verbs: ["get", "patch", "update"]
+  # Coordinate checkpoint ownership with per-Job leases
+  - apiGroups: ["coordination.k8s.io"]
+    resources: ["leases"]
+    verbs: ["get", "create", "update", "delete"]
  # Emit operational events on pod/restore lifecycle updates
  - apiGroups: [""]
    resources: ["events"]
@@ -33,6 +41,14 @@ rules:
  - apiGroups: [""]
    resources: ["pods"]
    verbs: ["get", "list", "watch", "patch", "update"]
+  # Patch checkpoint Jobs with terminal checkpoint status
+  - apiGroups: ["batch"]
+    resources: ["jobs"]
+    verbs: ["get", "patch", "update"]
+  # Coordinate checkpoint ownership with per-Job leases
+  - apiGroups: ["coordination.k8s.io"]
+    resources: ["leases"]
+    verbs: ["get", "create", "update", "delete"]
  # Emit operational events on pod/restore lifecycle updates
  - apiGroups: [""]
    resources: ["events"]

--- a/deploy/helm/charts/snapshot/values.yaml
+++ b/deploy/helm/charts/snapshot/values.yaml
@@ -34,7 +34,8 @@ storage:
    storageClass: ""
    # Access mode - ReadWriteMany required for multi-pod access
    accessMode: ReadWriteMany
-    # Base path for checkpoints (mounted in pods)
+    # PVC mount path inside the snapshot-agent pod.
+    # This must match the operator checkpoint.storage.pvc.basePath setting.
    basePath: /checkpoints
  # S3 configuration (when type=s3)
@@ -123,17 +124,15 @@ rbac:
 # Dynamic values (NODE_NAME, RESTRICTED_NAMESPACE, etc.) come from environment variables
 config:
  overlay:
-    # Virtual FS dirs are COW artifacts in the overlay upperdir.
+    # Rootfs diff tar exclusions. Absolute-looking paths are normalized
-    systemDirs:
+    # relative to the tar root, and patterns starting with * are passed
+    # through as tar globs unchanged.
+    exclusions:
      - /proc
      - /sys
      - /dev
-    # Cache directories to exclude (reduces checkpoint size)
+      - "*/.cache/huggingface"
-    cacheDirs:
+      - "*/.cache/vllm/torch_compile_cache"
-      - /.cache/huggingface
-    # Python bytecode is already loaded in memory at restore time and
-    # regenerated automatically on cold start.
-    additionalExclusions:
      - "*/__pycache__"
      - "*.pyc"
@@ -157,7 +156,8 @@ config:
    # K8s-specific options (recommended defaults for containers)
    leaveRunning: true      # Keep process running after checkpoint
    shellJob: true          # Containers are often session leaders
-    tcpClose: true          # Pod IPs change on restore/migration
+    tcpClose: true          # Close non-listening TCP sockets on restore
+    tcpEstablished: false   # Preserve established TCP sockets during restore
    fileLocks: true         # Applications use file locks
    orphanPtsMaster: true   # Containers with TTYs
    extUnixSk: true         # External Unix sockets

--- a/deploy/operator/api/v1alpha1/common.go
+++ b/deploy/operator/api/v1alpha1/common.go
@@ -71,6 +71,7 @@ type Autoscaling struct {
 	Metrics []autoscalingv2.MetricSpec `json:"metrics,omitempty"`
 }
+// +kubebuilder:validation:XValidation:rule="!(self.disabled && has(self.size))",message="sharedMemory.size must not be set when sharedMemory.disabled is true"
 type SharedMemorySpec struct {
 	Disabled bool              `json:"disabled,omitempty"`
 	Size     resource.Quantity `json:"size,omitempty"`
@@ -192,8 +193,8 @@ type ServiceCheckpointConfig struct {
 	// +kubebuilder:default=Auto
 	Mode CheckpointMode `json:"mode,omitempty"`
-	// CheckpointRef references an existing Checkpoint CR to use
+	// CheckpointRef references an existing DynamoCheckpoint CR by metadata.name.
-	// If specified, Identity is ignored and this checkpoint is used directly
+	// If specified, this service's Identity is ignored and the referenced checkpoint is used directly.
 	// +optional
 	CheckpointRef *string `json:"checkpointRef,omitempty"`

--- a/deploy/operator/api/v1alpha1/dynamocheckpoint_types.go
+++ b/deploy/operator/api/v1alpha1/dynamocheckpoint_types.go
@@ -93,18 +93,25 @@ type DynamoCheckpointJobConfig struct {
 	// +kubebuilder:validation:Required
 	PodTemplateSpec corev1.PodTemplateSpec `json:"podTemplateSpec"`
+	// SharedMemory controls the tmpfs mounted at /dev/shm for the checkpoint Job pod.
+	// When omitted, checkpoint Jobs use the same default 8Gi tmpfs as Dynamo components.
+	// +optional
+	SharedMemory *SharedMemorySpec `json:"sharedMemory,omitempty"`
 	// ActiveDeadlineSeconds specifies the maximum time the Job can run
 	// +optional
+	// +kubebuilder:validation:Minimum=1
 	// +kubebuilder:default=3600
 	ActiveDeadlineSeconds *int64 `json:"activeDeadlineSeconds,omitempty"`
-	// BackoffLimit specifies the number of retries before marking the Job failed
+	// Deprecated: BackoffLimit is ignored. Checkpoint Jobs never retry.
 	// +optional
-	// +kubebuilder:default=3
+	// +kubebuilder:validation:Minimum=0
 	BackoffLimit *int32 `json:"backoffLimit,omitempty"`
 	// TTLSecondsAfterFinished specifies how long to keep the Job after completion
 	// +optional
+	// +kubebuilder:validation:Minimum=0
 	// +kubebuilder:default=300
 	TTLSecondsAfterFinished *int32 `json:"ttlSecondsAfterFinished,omitempty"`
 }
@@ -124,9 +131,9 @@ type DynamoCheckpointSpec struct {
 type DynamoCheckpointConditionType string
 const (
-	// DynamoCheckpointConditionJobCreated indicates whether the checkpoint Job has been created
+	// Deprecated: DynamoCheckpointConditionJobCreated is deprecated. Use status.phase instead.
 	DynamoCheckpointConditionJobCreated DynamoCheckpointConditionType = "JobCreated"
-	// DynamoCheckpointConditionJobCompleted indicates whether the checkpoint Job has completed
+	// Deprecated: DynamoCheckpointConditionJobCompleted is deprecated. Use status.phase instead.
 	DynamoCheckpointConditionJobCompleted DynamoCheckpointConditionType = "JobCompleted"
 )
@@ -164,7 +171,7 @@ type DynamoCheckpointStatus struct {
 	// +optional
 	Message string `json:"message,omitempty"`
-	// Conditions represent the latest available observations of the checkpoint's state
+	// Deprecated: Conditions are deprecated. Use status.phase instead.
 	// +optional
 	Conditions []metav1.Condition `json:"conditions,omitempty"`
 }

--- a/deploy/operator/api/v1alpha1/dynamographdeployment_types.go
+++ b/deploy/operator/api/v1alpha1/dynamographdeployment_types.go
@@ -144,7 +144,7 @@ type ServiceCheckpointStatus struct {
 	// IdentityHash is the computed hash of the checkpoint identity
 	// +optional
 	IdentityHash string `json:"identityHash,omitempty"`
-	// Ready indicates if the checkpoint is ready for use
+	// Ready indicates if the checkpoint was visible to the worker at startup
 	// +optional
 	Ready bool `json:"ready,omitempty"`
 }