Unverified Commit 38bb9d37 authored by Schwinn Saereesitthipitak, committed by GitHub
Browse files

refactor: clean up checkpoint orchestration (#7309)


Signed-off-by: Schwinn Saereesitthipitak <schwinns@nvidia.com>
parent 9ea3acad
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""Shared Dynamo snapshot helpers for checkpoint lifecycle."""
import asyncio
import logging
import os
import signal
from dataclasses import dataclass
from typing import Any, Generic, TypeVar
from dynamo.common.utils.namespace import get_worker_namespace
# Module-level logger used by the snapshot helpers in this file.
logger = logging.getLogger(__name__)

# Directory that reload_snapshot_restore_identity() reads identity files from.
# NOTE(review): presumably a Kubernetes downward-API volume mount — confirm.
PODINFO_ROOT = "/etc/podinfo"

# Environment variable name -> file name under PODINFO_ROOT.
# reload_snapshot_restore_identity() raises RuntimeError when any of these
# files is missing or contains only whitespace.
REQUIRED_PODINFO_FILES = {
    "DYN_NAMESPACE": "dyn_namespace",
    "DYN_COMPONENT": "dyn_component",
    "DYN_PARENT_DGD_K8S_NAME": "dyn_parent_dgd_k8s_name",
    "DYN_PARENT_DGD_K8S_NAMESPACE": "dyn_parent_dgd_k8s_namespace",
}

# Same mapping shape as REQUIRED_PODINFO_FILES, but a missing or empty file is
# not an error: the corresponding environment variable is removed instead.
OPTIONAL_PODINFO_FILES = {
    "DYN_NAMESPACE_WORKER_SUFFIX": "dyn_namespace_worker_suffix",
}

# Type parameter for the engine object carried by EngineSnapshotController.
EngineT = TypeVar("EngineT")
class CheckpointConfig:
"""Parsed checkpoint configuration plus the watcher-driven lifecycle."""
def __init__(self, ready_file: str):
self.ready_file = ready_file
self._checkpoint_done = asyncio.Event()
self._restore_done = asyncio.Event()
@classmethod
def from_env(cls) -> "CheckpointConfig | None":
ready_file = os.environ.get("DYN_READY_FOR_CHECKPOINT_FILE")
if not ready_file:
return None
configure_checkpoint_transport_env()
return cls(ready_file=ready_file)
async def run_lifecycle(
self,
quiesce_controller: Any,
*quiesce_args: object,
) -> bool:
logger.info("Quiescing model")
await quiesce_controller.quiesce(*quiesce_args)
self._install_signal_handlers()
try:
with open(self.ready_file, "w", encoding="utf-8") as ready_file:
ready_file.write("ready")
except Exception:
self._remove_signal_handlers()
raise
logger.info(
"Ready for checkpoint. Waiting for watcher signal "
"(SIGUSR1=checkpoint complete, SIGCONT=restore complete)"
)
try:
event = await self._wait_for_watcher_signal()
if event == "restore":
logger.info("Restore signal detected (SIGCONT)")
logger.info("Resuming model after restore")
await quiesce_controller.resume()
quiesce_controller.mark_resumed()
return True
logger.info("Checkpoint completion signal detected (SIGUSR1)")
return False
finally:
self._remove_signal_handlers()
try:
os.unlink(self.ready_file)
except OSError:
pass
def _install_signal_handlers(self) -> None:
loop = asyncio.get_running_loop()
loop.add_signal_handler(signal.SIGUSR1, self._checkpoint_done.set)
loop.add_signal_handler(signal.SIGCONT, self._restore_done.set)
def _remove_signal_handlers(self) -> None:
loop = asyncio.get_running_loop()
loop.remove_signal_handler(signal.SIGUSR1)
loop.remove_signal_handler(signal.SIGCONT)
async def _wait_for_watcher_signal(self) -> str:
waiters = {
asyncio.create_task(self._checkpoint_done.wait()): "checkpoint",
asyncio.create_task(self._restore_done.wait()): "restore",
}
try:
done, pending = await asyncio.wait(
waiters.keys(), return_when=asyncio.FIRST_COMPLETED
)
for task in pending:
task.cancel()
winner = done.pop()
await winner
return waiters[winner]
finally:
for task in waiters:
if not task.done():
task.cancel()
def configure_checkpoint_transport_env() -> None:
    """Force the transport-related environment variables used in checkpoint mode.

    Each variable in the table below is set unconditionally to its pinned
    value. A warning is logged first only when the variable already held a
    different, non-empty value. ``TORCH_NCCL_DUMP_ON_TIMEOUT`` is the one
    exception: it is only defaulted to ``"0"`` and an existing value is kept.
    """
    # (variable, pinned value, warning template taking the previous value)
    pinned_settings = (
        (
            "GLOO_SOCKET_IFNAME",
            "lo",
            "Overriding GLOO_SOCKET_IFNAME=%r with 'lo' for checkpoint mode "
            "because CRIU cannot restore sockets bound to non-loopback addresses",
        ),
        (
            "NCCL_SOCKET_IFNAME",
            "lo",
            "Overriding NCCL_SOCKET_IFNAME=%r with 'lo' for checkpoint mode "
            "because CRIU cannot restore sockets bound to non-loopback addresses",
        ),
        (
            "NCCL_CUMEM_ENABLE",
            "0",
            "Overriding NCCL_CUMEM_ENABLE=%r with '0' for checkpoint mode "
            "because cuda-checkpoint does not support cuMem-backed NCCL allocations",
        ),
        (
            "NCCL_P2P_DISABLE",
            "0",
            "Overriding NCCL_P2P_DISABLE=%r with '0' for checkpoint mode "
            "to keep NCCL on GPU P2P transport when topology allows it",
        ),
        (
            "NCCL_NVLS_ENABLE",
            "0",
            "Overriding NCCL_NVLS_ENABLE=%r with '0' for checkpoint mode "
            "to avoid NVLS and keep NCCL on the legacy P2P path",
        ),
        (
            "NCCL_IB_DISABLE",
            "1",
            "Overriding NCCL_IB_DISABLE=%r with '1' for checkpoint mode "
            "because CRIU and cuda-checkpoint cannot restore InfiniBand state",
        ),
        (
            "TORCH_NCCL_ENABLE_MONITORING",
            "0",
            "Overriding TORCH_NCCL_ENABLE_MONITORING=%r with '0' for checkpoint mode "
            "because ProcessGroupNCCL monitoring can terminate restored processes",
        ),
    )

    for env_name, pinned_value, warning_template in pinned_settings:
        previous = os.environ.get(env_name)
        # Unset, empty, or already-correct values are overwritten silently.
        if previous and previous != pinned_value:
            logger.warning(warning_template, previous)
        os.environ[env_name] = pinned_value

    os.environ.setdefault("TORCH_NCCL_DUMP_ON_TIMEOUT", "0")
@dataclass
class EngineSnapshotController(Generic[EngineT]):
    """Bundle an engine with the objects that drive its snapshot lifecycle.

    The engine itself is only stored here; the methods below delegate to the
    checkpoint configuration and to the module-level identity reload helper.
    """

    # Engine instance carried for the caller; not used by the methods here.
    engine: EngineT
    # Passed as the first argument to CheckpointConfig.run_lifecycle().
    quiesce_controller: Any
    # Provides run_lifecycle(), which performs the actual wait.
    checkpoint_config: CheckpointConfig
    # Extra positional arguments forwarded to run_lifecycle().
    quiesce_args: tuple[object, ...] = ()

    async def wait_for_restore(self) -> bool:
        """Run the checkpoint lifecycle and return its result unchanged.

        Returns:
            The boolean produced by ``checkpoint_config.run_lifecycle``.
        """
        lifecycle = self.checkpoint_config.run_lifecycle
        restored = await lifecycle(self.quiesce_controller, *self.quiesce_args)
        return restored

    def reload_restore_identity(self) -> tuple[str, str]:
        """Delegate to ``reload_snapshot_restore_identity`` and return its pair."""
        identity = reload_snapshot_restore_identity()
        return identity
def reload_snapshot_restore_identity() -> tuple[str, str]:
    """Refresh identity environment variables from the podinfo files.

    Required files must exist and be non-empty; their stripped contents are
    written to the mapped environment variables. Optional files set their
    variable when present and non-empty, and otherwise remove it. Finally
    ``DYN_DISCOVERY_BACKEND`` is forced to ``"kubernetes"``.

    Returns:
        ``(get_worker_namespace(), "kubernetes")``.

    Raises:
        RuntimeError: A required podinfo file is missing or empty.
    """

    def _read_podinfo(file_name):
        # Returns (path, stripped contents), or (path, None) when the path is
        # not a regular file.
        path = os.path.join(PODINFO_ROOT, file_name)
        if not os.path.isfile(path):
            return path, None
        with open(path, encoding="utf-8") as handle:
            return path, handle.read().strip()

    for env_name, file_name in REQUIRED_PODINFO_FILES.items():
        path, contents = _read_podinfo(file_name)
        if contents is None:
            raise RuntimeError(f"snapshot restore requires {path}")
        if not contents:
            raise RuntimeError(f"snapshot restore requires a non-empty {path}")
        os.environ[env_name] = contents

    for env_name, file_name in OPTIONAL_PODINFO_FILES.items():
        _, contents = _read_podinfo(file_name)
        if contents:
            os.environ[env_name] = contents
        else:
            # Missing or empty: drop any value left over in the environment.
            os.environ.pop(env_name, None)

    # Snapshot restore only runs in Kubernetes-managed pods, so discovery resets here.
    os.environ["DYN_DISCOVERY_BACKEND"] = "kubernetes"
    return get_worker_namespace(), "kubernetes"
...@@ -26,9 +26,10 @@ from dynamo.sglang.init_multimodal import ( ...@@ -26,9 +26,10 @@ from dynamo.sglang.init_multimodal import (
init_multimodal_worker, init_multimodal_worker,
) )
from dynamo.sglang.shutdown import install_graceful_shutdown from dynamo.sglang.shutdown import install_graceful_shutdown
from dynamo.sglang.snapshot import handle_checkpoint_mode from dynamo.sglang.snapshot import prepare_snapshot_engine
configure_dynamo_logging() configure_dynamo_logging()
logger = logging.getLogger(__name__)
async def worker(): async def worker():
...@@ -41,11 +42,17 @@ async def worker(): ...@@ -41,11 +42,17 @@ async def worker():
config.server_args.load_format = setup_gms(config.server_args) config.server_args.load_format = setup_gms(config.server_args)
# Checkpoint mode: engine must be created BEFORE runtime (no NATS/etcd during CRIU) # Checkpoint mode: engine must be created BEFORE runtime (no NATS/etcd during CRIU)
should_exit, snapshot_engine = await handle_checkpoint_mode(config.server_args) snapshot_controller = await prepare_snapshot_engine(config.server_args)
if should_exit:
return
dynamo_args = config.dynamo_args dynamo_args = config.dynamo_args
snapshot_engine = None
if snapshot_controller is not None:
snapshot_engine = snapshot_controller.engine
(
dynamo_args.namespace,
dynamo_args.discovery_backend,
) = snapshot_controller.reload_restore_identity()
shutdown_event = asyncio.Event() shutdown_event = asyncio.Event()
shutdown_endpoints: list = [] shutdown_endpoints: list = []
runtime, loop = create_runtime( runtime, loop = create_runtime(
...@@ -58,7 +65,7 @@ async def worker(): ...@@ -58,7 +65,7 @@ async def worker():
run_deferred_handlers = install_graceful_shutdown( run_deferred_handlers = install_graceful_shutdown(
loop, runtime, shutdown_endpoints, shutdown_event loop, runtime, shutdown_endpoints, shutdown_event
) )
logging.info( logger.info(
"Signal handlers set up for graceful shutdown " "Signal handlers set up for graceful shutdown "
"(discovery unregister + grace period, with chaining)" "(discovery unregister + grace period, with chaining)"
) )
......
...@@ -19,9 +19,57 @@ from dynamo.runtime import DistributedRuntime ...@@ -19,9 +19,57 @@ from dynamo.runtime import DistributedRuntime
from dynamo.sglang.args import Config from dynamo.sglang.args import Config
from dynamo.sglang.publisher import DynamoSglangPublisher from dynamo.sglang.publisher import DynamoSglangPublisher
# Keep default tags minimal and safe for general use.
# "cuda_graph" can still be requested explicitly, but it requires LD_PRELOAD setup. class SGLangEngineQuiesceController:
DEFAULT_MEMORY_OCCUPATION_TAGS = ["kv_cache", "weights"] def __init__(self, engine: sgl.Engine):
self._engine = engine
self._quiesced_tags: Optional[list[str]] = None
self._is_quiesced = False
@property
def is_quiesced(self) -> bool:
return self._is_quiesced
async def quiesce(self, tags: Optional[list[str]] = None) -> bool:
if self._is_quiesced:
return False
from sglang.srt.managers.io_struct import (
PauseGenerationReqInput,
ReleaseMemoryOccupationReqInput,
)
await self._engine.tokenizer_manager.pause_generation(PauseGenerationReqInput())
await self._engine.tokenizer_manager.release_memory_occupation(
ReleaseMemoryOccupationReqInput(tags=tags),
None,
)
self._quiesced_tags = None if tags is None else list(tags)
self._is_quiesced = True
return True
async def resume(self, tags: Optional[list[str]] = None) -> bool:
if not self._is_quiesced:
return False
from sglang.srt.managers.io_struct import (
ContinueGenerationReqInput,
ResumeMemoryOccupationReqInput,
)
request_tags = self._quiesced_tags if tags is None else list(tags)
await self._engine.tokenizer_manager.resume_memory_occupation(
ResumeMemoryOccupationReqInput(tags=request_tags),
None,
)
await self._engine.tokenizer_manager.continue_generation(
ContinueGenerationReqInput()
)
return True
def mark_resumed(self) -> None:
self._quiesced_tags = None
self._is_quiesced = False
class BaseGenerativeHandler(ABC): class BaseGenerativeHandler(ABC):
...@@ -148,8 +196,10 @@ class BaseWorkerHandler(BaseGenerativeHandler): ...@@ -148,8 +196,10 @@ class BaseWorkerHandler(BaseGenerativeHandler):
# have an sgl.Engine. # have an sgl.Engine.
self.input_param_manager = InputParamManager(None) self.input_param_manager = InputParamManager(None)
self._engine_supports_priority = False self._engine_supports_priority = False
self._memory_occupation_lock = asyncio.Lock() self._quiesce_controller = (
self._memory_released = False SGLangEngineQuiesceController(engine) if engine is not None else None
)
self._quiesce_lock = asyncio.Lock()
def _priority_kwargs(self, priority: Any) -> Dict[str, Any]: def _priority_kwargs(self, priority: Any) -> Dict[str, Any]:
if priority is not None and self._engine_supports_priority: if priority is not None and self._engine_supports_priority:
...@@ -160,32 +210,23 @@ class BaseWorkerHandler(BaseGenerativeHandler): ...@@ -160,32 +210,23 @@ class BaseWorkerHandler(BaseGenerativeHandler):
"""Release GPU memory occupation and unregister from discovery. """Release GPU memory occupation and unregister from discovery.
Args: Args:
body: Unused. Release always targets default tags. body: Optional dict with "tags" to target specific memory regions.
Order of operations: Order of operations:
1. Unregister from discovery - stop accepting new requests 1. Unregister from discovery - stop accepting new requests
2. Pause generation - drain in-flight requests 2. Pause generation - drain in-flight requests
3. Release memory - safe now that no requests are active 3. Release memory - safe now that no requests are active
""" """
from sglang.srt.managers.io_struct import ( if self._quiesce_controller is None:
PauseGenerationReqInput,
ReleaseMemoryOccupationReqInput,
)
tags = list(DEFAULT_MEMORY_OCCUPATION_TAGS)
tokenizer_manager = (
getattr(self.engine, "tokenizer_manager", None)
if self.engine is not None
else None
)
if tokenizer_manager is None:
return { return {
"status": "error", "status": "error",
"message": "memory control not supported on this worker", "message": "memory control not supported on this worker",
} }
async with self._memory_occupation_lock: body = body or {}
if self._memory_released: tags = body.get("tags")
async with self._quiesce_lock:
if self._quiesce_controller.is_quiesced:
return { return {
"status": "ok", "status": "ok",
"message": "Memory already released", "message": "Memory already released",
...@@ -196,16 +237,15 @@ class BaseWorkerHandler(BaseGenerativeHandler): ...@@ -196,16 +237,15 @@ class BaseWorkerHandler(BaseGenerativeHandler):
if self.generate_endpoint is not None: if self.generate_endpoint is not None:
await self.generate_endpoint.unregister_endpoint_instance() await self.generate_endpoint.unregister_endpoint_instance()
pause_req = PauseGenerationReqInput() await self._quiesce_controller.quiesce(tags)
await tokenizer_manager.pause_generation(pause_req)
release_req = ReleaseMemoryOccupationReqInput(tags=tags)
await tokenizer_manager.release_memory_occupation(release_req, None)
self._memory_released = True
return { return {
"status": "ok", "status": "ok",
"message": f"Memory released for tags: {tags}", "message": (
f"Memory released for tags: {tags}"
if tags is not None
else "Memory released"
),
} }
except Exception as e: except Exception as e:
logging.error(f"Failed to release memory occupation: {e}") logging.error(f"Failed to release memory occupation: {e}")
...@@ -215,51 +255,42 @@ class BaseWorkerHandler(BaseGenerativeHandler): ...@@ -215,51 +255,42 @@ class BaseWorkerHandler(BaseGenerativeHandler):
"""Resume GPU memory occupation and re-register to discovery. """Resume GPU memory occupation and re-register to discovery.
Args: Args:
body: Unused. Resume always targets default tags. body: Optional dict with "tags" to target specific memory regions.
Order of operations: Order of operations:
1. Resume memory - restore GPU allocations 1. Resume memory - restore GPU allocations
2. Continue generation - ready to serve requests 2. Continue generation - ready to serve requests
3. Re-register to discovery - allow frontend to route here 3. Re-register to discovery - allow frontend to route here
""" """
from sglang.srt.managers.io_struct import ( if self._quiesce_controller is None:
ContinueGenerationReqInput,
ResumeMemoryOccupationReqInput,
)
tags = list(DEFAULT_MEMORY_OCCUPATION_TAGS)
tokenizer_manager = (
getattr(self.engine, "tokenizer_manager", None)
if self.engine is not None
else None
)
if tokenizer_manager is None:
return { return {
"status": "error", "status": "error",
"message": "memory control not supported on this worker", "message": "memory control not supported on this worker",
} }
async with self._memory_occupation_lock: body = body or {}
if not self._memory_released: tags = body.get("tags")
async with self._quiesce_lock:
if not self._quiesce_controller.is_quiesced:
return { return {
"status": "ok", "status": "ok",
"message": "Memory already resumed", "message": "Memory already resumed",
} }
try: try:
resume_req = ResumeMemoryOccupationReqInput(tags=tags) await self._quiesce_controller.resume(tags)
await tokenizer_manager.resume_memory_occupation(resume_req, None)
continue_req = ContinueGenerationReqInput()
await tokenizer_manager.continue_generation(continue_req)
if self.generate_endpoint is not None: if self.generate_endpoint is not None:
await self.generate_endpoint.register_endpoint_instance() await self.generate_endpoint.register_endpoint_instance()
self._quiesce_controller.mark_resumed()
self._memory_released = False
return { return {
"status": "ok", "status": "ok",
"message": f"Memory resumed for tags: {tags}", "message": (
f"Memory resumed for tags: {tags}"
if tags is not None
else "Memory resumed"
),
} }
except Exception as e: except Exception as e:
logging.error(f"Failed to resume memory occupation: {e}") logging.error(f"Failed to resume memory occupation: {e}")
......
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
""" """Dynamo Snapshot integration for SGLang workers."""
Dynamo Snapshot integration for SGLang workers.
Handles the checkpoint job pod lifecycle:
1. Early exit if a checkpoint already exists (idempotency)
2. Sleep model for CRIU-friendly GPU state
3. Signal readiness for DaemonSet to begin checkpoint
4. Wait for watcher signals from the DaemonSet
5. Wake model after restore
SGLang does not have a native sleep/wake API like vLLM. Instead we use
release_memory_occupation / resume_memory_occupation through the
SGLangCheckpointAdapter, which presents the same sleep()/wake_up()
interface that CheckpointConfig.run_lifecycle expects.
Environment variables:
- DYN_READY_FOR_CHECKPOINT_FILE: Path where this worker writes readiness marker
- DYN_CHECKPOINT_STORAGE_TYPE: Storage backend (pvc, s3, oci) (optional, defaults to pvc)
- DYN_CHECKPOINT_LOCATION: Full checkpoint path (optional when PATH+HASH are provided)
- DYN_CHECKPOINT_PATH + DYN_CHECKPOINT_HASH: PVC base path + hash (used to derive location)
Signals handled in checkpoint mode:
- SIGUSR1: Checkpoint completed, exit process
- SIGCONT: Restore completed, wake model and continue
- SIGKILL (from watcher on failure): Process is terminated immediately (unhandleable)
"""
import asyncio
import logging import logging
import os
import signal
import time import time
from typing import Optional
import sglang as sgl import sglang as sgl
logger = logging.getLogger(__name__) from dynamo.common.utils.snapshot import CheckpointConfig, EngineSnapshotController
_SLEEP_MODE_LEVEL = 1
# Memory tags to release/resume for CRIU checkpoint/restore. from .request_handlers.handler_base import SGLangEngineQuiesceController
# All GPU resources must be released so CRIU can snapshot the process cleanly.
_MEMORY_TAGS = ["kv_cache", "weights", "cuda_graph"]
logger = logging.getLogger(__name__)
class SGLangCheckpointAdapter:
"""Adapts an sgl.Engine to the sleep/wake_up interface expected by
CheckpointConfig.run_lifecycle (matching vLLM's AsyncLLM API).
sleep(): pause generation -> release GPU memory
wake_up(): resume GPU memory -> continue generation
"""
def __init__(self, engine: sgl.Engine): async def prepare_snapshot_engine(
self._engine = engine server_args,
) -> EngineSnapshotController[sgl.Engine] | None:
async def sleep(self, level: int = 1) -> None:
from sglang.srt.managers.io_struct import (
PauseGenerationReqInput,
ReleaseMemoryOccupationReqInput,
)
# Drain in-flight requests before touching GPU memory
await self._engine.tokenizer_manager.pause_generation(PauseGenerationReqInput())
await self._engine.tokenizer_manager.release_memory_occupation(
ReleaseMemoryOccupationReqInput(tags=_MEMORY_TAGS), None
)
async def wake_up(self) -> None:
from sglang.srt.managers.io_struct import (
ContinueGenerationReqInput,
ResumeMemoryOccupationReqInput,
)
await self._engine.tokenizer_manager.resume_memory_occupation(
ResumeMemoryOccupationReqInput(tags=_MEMORY_TAGS), None
)
await self._engine.tokenizer_manager.continue_generation(
ContinueGenerationReqInput()
)
class CheckpointConfig:
"""Parsed and validated checkpoint configuration from environment variables."""
def __init__(self):
self.ready_file = os.environ["DYN_READY_FOR_CHECKPOINT_FILE"]
self.storage_type = os.environ.get("DYN_CHECKPOINT_STORAGE_TYPE", "pvc")
self.location = os.environ.get("DYN_CHECKPOINT_LOCATION", "")
if not self.location:
checkpoint_path = os.environ.get("DYN_CHECKPOINT_PATH", "").rstrip("/")
checkpoint_hash = os.environ.get("DYN_CHECKPOINT_HASH", "")
if checkpoint_path and checkpoint_hash:
self.location = f"{checkpoint_path}/{checkpoint_hash}"
self.is_checkpoint_job = bool(self.location)
self._checkpoint_done = asyncio.Event()
self._restore_done = asyncio.Event()
def checkpoint_exists(self) -> bool:
"""Check if a completed checkpoint already exists (idempotency).
A checkpoint is complete when its directory exists at the base path root
(not under the tmp/ staging area). Directory presence = done.
"""
if self.storage_type != "pvc":
return False
if os.path.isdir(self.location):
logger.info(f"Existing checkpoint found at {self.location}, skipping")
return True
logger.info(f"No checkpoint at {self.location}, creating new one")
return False
async def run_lifecycle(self, engine_client, sleep_level: int) -> bool:
"""Run the full checkpoint lifecycle after the engine is loaded.
1. Put model to sleep (CRIU-friendly GPU state)
2. Write ready file (triggers DaemonSet checkpoint via readiness probe)
3. Wait for watcher signal (checkpoint complete, restore complete, or failure)
4. If restored: wake model and return True (caller proceeds with registration)
5. If checkpoint done: return False (caller should exit)
"""
# Sleep model for checkpoint
logger.info(f"Putting model to sleep (level={sleep_level})")
await engine_client.sleep(level=sleep_level)
# Install signal handlers before writing the ready file so there is no
# window where the DaemonSet can send SIGUSR1/SIGCONT while the default
# signal disposition (terminate) is still in effect.
self._install_signal_handlers()
# Signal readiness
with open(self.ready_file, "w") as f:
f.write("ready")
logger.info(
"Ready for checkpoint. Waiting for watcher signal "
"(SIGUSR1=checkpoint complete, SIGCONT=restore complete)"
)
try:
event = await self._wait_for_watcher_signal()
if event == "restore":
logger.info("Restore signal detected (SIGCONT)")
logger.info("Waking up model after restore")
await engine_client.wake_up()
return True
# SIGUSR1: checkpoint complete
logger.info("Checkpoint completion signal detected (SIGUSR1)")
return False
finally:
self._remove_signal_handlers()
# Remove the ready file so that a restarting pod does not leave a
# stale marker that could trick the DaemonSet into acting on it.
try:
os.unlink(self.ready_file)
except OSError:
pass
def _install_signal_handlers(self) -> None:
loop = asyncio.get_running_loop()
loop.add_signal_handler(signal.SIGUSR1, self._checkpoint_done.set)
# SIGCONT is used as the restore-complete signal. The snapshot DaemonSet
# watcher is the only sender, so there is no conflict with POSIX
# job-control semantics in practice.
loop.add_signal_handler(signal.SIGCONT, self._restore_done.set)
# No handler for checkpoint failure: the watcher sends SIGKILL, which
# terminates the process immediately (cannot be caught).
def _remove_signal_handlers(self) -> None:
loop = asyncio.get_running_loop()
loop.remove_signal_handler(signal.SIGUSR1)
loop.remove_signal_handler(signal.SIGCONT)
async def _wait_for_watcher_signal(self) -> str:
waiters = {
asyncio.create_task(self._checkpoint_done.wait()): "checkpoint",
asyncio.create_task(self._restore_done.wait()): "restore",
}
try:
done, pending = await asyncio.wait(
waiters.keys(), return_when=asyncio.FIRST_COMPLETED
)
for task in pending:
task.cancel()
winner = done.pop()
await winner
return waiters[winner]
finally:
for task in waiters:
if not task.done():
task.cancel()
async def handle_checkpoint_mode(server_args) -> tuple[bool, Optional[sgl.Engine]]:
"""Single entry point for Dynamo Snapshot integration. """Single entry point for Dynamo Snapshot integration.
Must be called BEFORE runtime creation so the engine can be checkpointed Must be called BEFORE runtime creation so the engine can be checkpointed
without active NATS/etcd connections. without active NATS/etcd connections.
Returns: Returns:
(should_exit, engine) where: None when not in checkpoint mode.
- (True, None): caller should return immediately (checkpoint already A snapshot controller when restore completed and the caller should use
exists, or checkpoint completed successfully). the restored engine.
- (False, None): not in checkpoint mode — cold-start normally.
- (False, engine): restore completed — caller should use this engine.
"""
if "DYN_READY_FOR_CHECKPOINT_FILE" not in os.environ:
return False, None
# Validate: either a full location or path + hash must be set.
if not os.environ.get("DYN_CHECKPOINT_LOCATION"):
path = os.environ.get("DYN_CHECKPOINT_PATH", "")
hash_ = os.environ.get("DYN_CHECKPOINT_HASH", "")
if not path or not hash_:
raise EnvironmentError(
"Checkpoint mode requires either DYN_CHECKPOINT_LOCATION or both "
"DYN_CHECKPOINT_PATH and DYN_CHECKPOINT_HASH"
)
cfg = CheckpointConfig()
checkpoint_exists = cfg.checkpoint_exists()
if cfg.is_checkpoint_job and checkpoint_exists: If checkpointing completed successfully, this function exits the
return True, None process with status 0.
"""
if not cfg.is_checkpoint_job and not checkpoint_exists: checkpoint_cfg = CheckpointConfig.from_env()
return False, None if checkpoint_cfg is None:
return None
logger.info("Checkpoint mode enabled (watcher-driven signals)") logger.info("Checkpoint mode enabled (watcher-driven signals)")
...@@ -244,8 +48,12 @@ async def handle_checkpoint_mode(server_args) -> tuple[bool, Optional[sgl.Engine ...@@ -244,8 +48,12 @@ async def handle_checkpoint_mode(server_args) -> tuple[bool, Optional[sgl.Engine
f"SGLang engine loaded in {time.time() - start_time:.2f}s (checkpoint mode)" f"SGLang engine loaded in {time.time() - start_time:.2f}s (checkpoint mode)"
) )
adapter = SGLangCheckpointAdapter(engine) snapshot_controller = EngineSnapshotController(
if not await cfg.run_lifecycle(adapter, _SLEEP_MODE_LEVEL): engine=engine,
return True, None quiesce_controller=SGLangEngineQuiesceController(engine),
checkpoint_config=checkpoint_cfg,
)
if not await snapshot_controller.wait_for_restore():
raise SystemExit(0)
return False, engine return snapshot_controller
...@@ -10,8 +10,8 @@ from unittest.mock import AsyncMock ...@@ -10,8 +10,8 @@ from unittest.mock import AsyncMock
import pytest import pytest
from dynamo.sglang.request_handlers.handler_base import ( from dynamo.sglang.request_handlers.handler_base import (
DEFAULT_MEMORY_OCCUPATION_TAGS,
BaseWorkerHandler, BaseWorkerHandler,
SGLangEngineQuiesceController,
) )
pytestmark = [ pytestmark = [
...@@ -59,8 +59,8 @@ def _make_handler() -> _TestWorkerHandler: ...@@ -59,8 +59,8 @@ def _make_handler() -> _TestWorkerHandler:
unregister_endpoint_instance=AsyncMock(), unregister_endpoint_instance=AsyncMock(),
register_endpoint_instance=AsyncMock(), register_endpoint_instance=AsyncMock(),
) )
handler._memory_occupation_lock = asyncio.Lock() handler._quiesce_controller = SGLangEngineQuiesceController(handler.engine)
handler._memory_released = False handler._quiesce_lock = asyncio.Lock()
return handler return handler
...@@ -93,7 +93,6 @@ async def test_release_and_resume_are_idempotent(): ...@@ -93,7 +93,6 @@ async def test_release_and_resume_are_idempotent():
assert second_resume["status"] == "ok" assert second_resume["status"] == "ok"
assert second_release["message"] == "Memory already released" assert second_release["message"] == "Memory already released"
assert second_resume["message"] == "Memory already resumed" assert second_resume["message"] == "Memory already resumed"
assert DEFAULT_MEMORY_OCCUPATION_TAGS == ["kv_cache", "weights"]
release_req = ( release_req = (
handler.engine.tokenizer_manager.release_memory_occupation.await_args.args[0] handler.engine.tokenizer_manager.release_memory_occupation.await_args.args[0]
...@@ -101,8 +100,8 @@ async def test_release_and_resume_are_idempotent(): ...@@ -101,8 +100,8 @@ async def test_release_and_resume_are_idempotent():
resume_req = ( resume_req = (
handler.engine.tokenizer_manager.resume_memory_occupation.await_args.args[0] handler.engine.tokenizer_manager.resume_memory_occupation.await_args.args[0]
) )
assert release_req.tags == DEFAULT_MEMORY_OCCUPATION_TAGS assert release_req.tags is None
assert resume_req.tags == DEFAULT_MEMORY_OCCUPATION_TAGS assert resume_req.tags is None
handler.engine.tokenizer_manager.pause_generation.assert_awaited_once() handler.engine.tokenizer_manager.pause_generation.assert_awaited_once()
handler.engine.tokenizer_manager.release_memory_occupation.assert_awaited_once() handler.engine.tokenizer_manager.release_memory_occupation.assert_awaited_once()
...@@ -114,17 +113,37 @@ async def test_release_and_resume_are_idempotent(): ...@@ -114,17 +113,37 @@ async def test_release_and_resume_are_idempotent():
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_resume_uses_default_tags_even_when_request_specifies_subset(): async def test_release_and_resume_use_explicit_request_tags():
handler = _make_handler() handler = _make_handler()
await handler.release_memory_occupation({"tags": ["weights"]}) await handler.release_memory_occupation({"tags": ["weights"]})
resume_result = await handler.resume_memory_occupation({"tags": ["weights"]}) resume_result = await handler.resume_memory_occupation({"tags": ["weights"]})
assert resume_result["status"] == "ok"
release_req = (
handler.engine.tokenizer_manager.release_memory_occupation.await_args.args[0]
)
resume_req = (
handler.engine.tokenizer_manager.resume_memory_occupation.await_args.args[0]
)
assert release_req.tags == ["weights"]
assert resume_req.tags == ["weights"]
handler.engine.tokenizer_manager.continue_generation.assert_awaited_once()
handler.generate_endpoint.register_endpoint_instance.assert_awaited_once()
@pytest.mark.asyncio
async def test_resume_reuses_release_tags_when_request_omits_them():
handler = _make_handler()
await handler.release_memory_occupation({"tags": ["weights"]})
resume_result = await handler.resume_memory_occupation({})
assert resume_result["status"] == "ok" assert resume_result["status"] == "ok"
resume_req = ( resume_req = (
handler.engine.tokenizer_manager.resume_memory_occupation.await_args.args[0] handler.engine.tokenizer_manager.resume_memory_occupation.await_args.args[0]
) )
assert resume_req.tags == DEFAULT_MEMORY_OCCUPATION_TAGS assert resume_req.tags == ["weights"]
handler.engine.tokenizer_manager.continue_generation.assert_awaited_once() handler.engine.tokenizer_manager.continue_generation.assert_awaited_once()
handler.generate_endpoint.register_endpoint_instance.assert_awaited_once() handler.generate_endpoint.register_endpoint_instance.assert_awaited_once()
...@@ -146,6 +165,7 @@ async def test_resume_with_no_sleeping_state_is_noop(): ...@@ -146,6 +165,7 @@ async def test_resume_with_no_sleeping_state_is_noop():
async def test_release_returns_error_when_worker_has_no_tokenizer_manager(): async def test_release_returns_error_when_worker_has_no_tokenizer_manager():
handler = _make_handler() handler = _make_handler()
handler.engine = None handler.engine = None
handler._quiesce_controller = None
result = await handler.release_memory_occupation({}) result = await handler.release_memory_occupation({})
...@@ -160,6 +180,7 @@ async def test_release_returns_error_when_worker_has_no_tokenizer_manager(): ...@@ -160,6 +180,7 @@ async def test_release_returns_error_when_worker_has_no_tokenizer_manager():
async def test_resume_returns_error_when_worker_has_no_tokenizer_manager(): async def test_resume_returns_error_when_worker_has_no_tokenizer_manager():
handler = _make_handler() handler = _make_handler()
handler.engine = None handler.engine = None
handler._quiesce_controller = None
result = await handler.resume_memory_occupation({}) result = await handler.resume_memory_occupation({})
...@@ -168,3 +189,18 @@ async def test_resume_returns_error_when_worker_has_no_tokenizer_manager(): ...@@ -168,3 +189,18 @@ async def test_resume_returns_error_when_worker_has_no_tokenizer_manager():
"message": "memory control not supported on this worker", "message": "memory control not supported on this worker",
} }
handler.generate_endpoint.register_endpoint_instance.assert_not_awaited() handler.generate_endpoint.register_endpoint_instance.assert_not_awaited()
@pytest.mark.asyncio
async def test_resume_keeps_quiesced_state_when_register_fails():
    """A failed endpoint re-registration must leave the controller quiesced."""
    handler = _make_handler()
    await handler.release_memory_occupation({})

    # Make the discovery re-registration step blow up during resume.
    failing_register = AsyncMock(side_effect=RuntimeError("discovery write timeout"))
    handler.generate_endpoint.register_endpoint_instance = failing_register

    outcome = await handler.resume_memory_occupation({})

    assert outcome["status"] == "error"
    # The quiesced flag is only expected to clear once resume fully succeeds.
    assert handler._quiesce_controller is not None
    assert handler._quiesce_controller.is_quiesced is True
...@@ -63,16 +63,6 @@ class DynamoVllmArgGroup(ArgGroup): ...@@ -63,16 +63,6 @@ class DynamoVllmArgGroup(ArgGroup):
help="Use vLLM's tokenizer for pre and post processing. This bypasses Dynamo's preprocessor and only v1/chat/completions will be available through the Dynamo frontend.", help="Use vLLM's tokenizer for pre and post processing. This bypasses Dynamo's preprocessor and only v1/chat/completions will be available through the Dynamo frontend.",
) )
add_argument(
g,
flag_name="--sleep-mode-level",
env_var="DYN_VLLM_SLEEP_MODE_LEVEL",
default=1,
help="Sleep mode level (1=offload to CPU, 2=discard weights, 3=discard all).",
choices=[1, 2, 3],
arg_type=int,
)
# Multimodal # Multimodal
add_negatable_bool_argument( add_negatable_bool_argument(
g, g,
...@@ -178,7 +168,6 @@ class DynamoVllmConfig(ConfigBase): ...@@ -178,7 +168,6 @@ class DynamoVllmConfig(ConfigBase):
is_prefill_worker: bool is_prefill_worker: bool
is_decode_worker: bool is_decode_worker: bool
use_vllm_tokenizer: bool use_vllm_tokenizer: bool
sleep_mode_level: int
# Multimodal # Multimodal
route_to_encoder: bool route_to_encoder: bool
......
...@@ -52,6 +52,43 @@ configure_dynamo_logging() ...@@ -52,6 +52,43 @@ configure_dynamo_logging()
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
class VllmEngineQuiesceController:
def __init__(self, engine_client: Any):
self._engine_client = engine_client
self._is_quiesced = False
@property
def is_quiesced(self) -> bool:
return self._is_quiesced
async def quiesce(self, *args: object) -> bool:
if self._is_quiesced:
return False
level = args[0] if args else None
await self._engine_client.pause_generation()
if level is None:
await self._engine_client.sleep()
else:
await self._engine_client.sleep(level)
self._is_quiesced = True
return True
async def resume(self, tags: list[str] | None = None) -> bool:
if not self._is_quiesced:
return False
if tags is None:
await self._engine_client.wake_up()
else:
await self._engine_client.wake_up(tags)
await self._engine_client.resume_generation()
return True
def mark_resumed(self) -> None:
self._is_quiesced = False
@dataclass(frozen=True) @dataclass(frozen=True)
class LoRAInfo: class LoRAInfo:
"""Metadata for a loaded LoRA adapter.""" """Metadata for a loaded LoRA adapter."""
...@@ -332,8 +369,8 @@ class BaseWorkerHandler(ABC): ...@@ -332,8 +369,8 @@ class BaseWorkerHandler(ABC):
self.use_vllm_tokenizer = use_vllm_tokenizer self.use_vllm_tokenizer = use_vllm_tokenizer
self.dp_range = get_dp_range_for_worker(self.engine_client.vllm_config) self.dp_range = get_dp_range_for_worker(self.engine_client.vllm_config)
self._sleep_wake_lock = asyncio.Lock() self._quiesce_controller = VllmEngineQuiesceController(self.engine_client)
self._engine_is_sleeping = False self._quiesce_lock = asyncio.Lock()
# Initialize InputParamManager for text-in-text-out mode # Initialize InputParamManager for text-in-text-out mode
tokenizer = None tokenizer = None
...@@ -357,8 +394,8 @@ class BaseWorkerHandler(ABC): ...@@ -357,8 +394,8 @@ class BaseWorkerHandler(ABC):
""" """
body = body or {} body = body or {}
level = body.get("level", 1) level = body.get("level", 1)
async with self._sleep_wake_lock: async with self._quiesce_lock:
if self._engine_is_sleeping: if self._quiesce_controller.is_quiesced:
return { return {
"status": "ok", "status": "ok",
"message": "Engine already sleeping", "message": "Engine already sleeping",
...@@ -374,11 +411,11 @@ class BaseWorkerHandler(ABC): ...@@ -374,11 +411,11 @@ class BaseWorkerHandler(ABC):
# Step 2: Abort in-flight requests and wait for them to drain so the # Step 2: Abort in-flight requests and wait for them to drain so the
# GPU is fully quiesced before unmapping memory. # GPU is fully quiesced before unmapping memory.
await self.engine_client.pause_generation() if not await self._quiesce_controller.quiesce(level):
return {
# Step 3: Now safe to sleep - no in-flight GPU work "status": "ok",
await self.engine_client.sleep(level) "message": "Engine already sleeping",
self._engine_is_sleeping = True }
return { return {
"status": "ok", "status": "ok",
...@@ -392,29 +429,27 @@ class BaseWorkerHandler(ABC): ...@@ -392,29 +429,27 @@ class BaseWorkerHandler(ABC):
"""Wake the engine to restore GPU memory and re-register to discovery. """Wake the engine to restore GPU memory and re-register to discovery.
Args: Args:
body: Unused. Wake always restores all sleep-managed memory. body: Optional dict with "tags" to request a partial wake.
Order of operations: Order of operations:
1. Wake engine - restore GPU memory 1. Wake engine - restore GPU memory
2. Re-register endpoint instance - allow frontend to route requests here again 2. Re-register endpoint instance - allow frontend to route requests here again
""" """
async with self._sleep_wake_lock: body = body or {}
if not self._engine_is_sleeping: tags = body.get("tags")
async with self._quiesce_lock:
if not self._quiesce_controller.is_quiesced:
return {"status": "ok", "message": "Engine already awake"} return {"status": "ok", "message": "Engine already awake"}
try: try:
# Step 1: Wake engine first - must be ready before accepting requests # Step 1: Wake engine first - must be ready before accepting requests
await self.engine_client.wake_up() await self._quiesce_controller.resume(tags)
# Step 2: Resume generation and re-register.
await self.engine_client.resume_generation()
if self.generate_endpoint is not None: if self.generate_endpoint is not None:
await self.generate_endpoint.register_endpoint_instance() await self.generate_endpoint.register_endpoint_instance()
logger.info( logger.info(
"[Wake] Re-registered endpoint to discovery - worker added back to routing pool" "[Wake] Re-registered endpoint to discovery - worker added back to routing pool"
) )
self._quiesce_controller.mark_resumed()
self._engine_is_sleeping = False
return { return {
"status": "ok", "status": "ok",
......
...@@ -44,7 +44,7 @@ from .constants import DisaggregationMode ...@@ -44,7 +44,7 @@ from .constants import DisaggregationMode
from .handlers import DecodeWorkerHandler, PrefillWorkerHandler, get_dp_range_for_worker from .handlers import DecodeWorkerHandler, PrefillWorkerHandler, get_dp_range_for_worker
from .health_check import VllmHealthCheckPayload, VllmPrefillHealthCheckPayload from .health_check import VllmHealthCheckPayload, VllmPrefillHealthCheckPayload
from .publisher import DYNAMO_COMPONENT_REGISTRY, StatLoggerFactory from .publisher import DYNAMO_COMPONENT_REGISTRY, StatLoggerFactory
from .snapshot import get_checkpoint_config from .snapshot import prepare_snapshot_engine
# Optional imports for frontend decoding support # Optional imports for frontend decoding support
MediaDecoder: type | None = None MediaDecoder: type | None = None
...@@ -61,7 +61,6 @@ except ImportError: ...@@ -61,7 +61,6 @@ except ImportError:
configure_dynamo_logging() configure_dynamo_logging()
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
shutdown_endpoints: list = [] shutdown_endpoints: list = []
CHECKPOINT_SLEEP_MODE_LEVEL = 1
def build_headless_namespace(config: Config) -> argparse.Namespace: def build_headless_namespace(config: Config) -> argparse.Namespace:
...@@ -102,11 +101,6 @@ async def worker() -> None: ...@@ -102,11 +101,6 @@ async def worker() -> None:
if not config.served_model_name: if not config.served_model_name:
config.served_model_name = config.engine_args.served_model_name = config.model config.served_model_name = config.engine_args.served_model_name = config.model
# Check checkpoint mode and validate env vars EARLY (fail fast if misconfigured)
early_exit, checkpoint_cfg = get_checkpoint_config()
if early_exit:
return
# Download the model if necessary using modelexpress. # Download the model if necessary using modelexpress.
# We want it on disk before we start vllm to avoid downloading from HuggingFace. # We want it on disk before we start vllm to avoid downloading from HuggingFace.
# #
...@@ -119,35 +113,27 @@ async def worker() -> None: ...@@ -119,35 +113,27 @@ async def worker() -> None:
if not os.path.exists(config.model): if not os.path.exists(config.model):
await fetch_model(config.model) await fetch_model(config.model)
# CHECKPOINT MODE: Load engine BEFORE runtime creation
# This allows checkpointing GPU state before runtime connections are established
snapshot_controller = await prepare_snapshot_engine(
config,
setup_vllm_engine,
)
snapshot_engine = None
if snapshot_controller is not None:
snapshot_engine = snapshot_controller.engine
(
config.namespace,
config.discovery_backend,
) = snapshot_controller.reload_restore_identity()
# HEADLESS MODE: bypass DistributedRuntime entirely. # HEADLESS MODE: bypass DistributedRuntime entirely.
# Workers run vLLM only (no NATS, etcd, or dynamo endpoints). # Workers run vLLM only (no NATS, etcd, or dynamo endpoints).
if config.headless: if config.headless:
if checkpoint_cfg is not None:
raise ValueError(
"--headless is incompatible with checkpoint mode "
"(DYN_CHECKPOINT_SIGNAL_FILE is set). "
"Remove --headless or unset DYN_CHECKPOINT_SIGNAL_FILE."
)
run_dynamo_headless(config) run_dynamo_headless(config)
return return
# CHECKPOINT MODE: Load engine BEFORE runtime creation
# This allows checkpointing GPU state before runtime connections are established
snapshot_engine = None
if checkpoint_cfg is not None:
logger.info("Checkpoint mode enabled (watcher-driven signals)")
# Checkpoint mode requires sleep mode — enable before engine init
config.engine_args.enable_sleep_mode = True
snapshot_engine = setup_vllm_engine(config)
engine_client = snapshot_engine[0]
if not await checkpoint_cfg.run_lifecycle(
engine_client, CHECKPOINT_SLEEP_MODE_LEVEL
):
return
shutdown_event = asyncio.Event() shutdown_event = asyncio.Event()
runtime, loop = create_runtime( runtime, loop = create_runtime(
discovery_backend=config.discovery_backend, discovery_backend=config.discovery_backend,
......
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
"""
Dynamo Snapshot integration for vLLM workers.
Handles the checkpoint job pod lifecycle:
1. Early exit if a checkpoint already exists (idempotency)
2. Sleep model for CRIU-friendly GPU state
3. Signal readiness for DaemonSet to begin checkpoint
4. Wait for watcher signals from the DaemonSet
5. Wake model after restore
Environment variables:
- DYN_READY_FOR_CHECKPOINT_FILE: Path where this worker writes readiness marker
- DYN_CHECKPOINT_STORAGE_TYPE: Storage backend (pvc, s3, oci) (optional, defaults to pvc)
- DYN_CHECKPOINT_LOCATION: Full checkpoint path (optional when PATH+HASH are provided)
- DYN_CHECKPOINT_PATH + DYN_CHECKPOINT_HASH: PVC base path + hash (used to derive location)
Signals handled in checkpoint mode:
- SIGUSR1: Checkpoint completed, exit process
- SIGCONT: Restore completed, wake model and continue
- SIGKILL (from watcher on failure): Process is terminated immediately (unhandleable)
"""
import asyncio
import logging import logging
import os from collections.abc import Callable
import signal
from typing import Optional
logger = logging.getLogger(__name__)
from dynamo.common.utils.snapshot import CheckpointConfig, EngineSnapshotController
class CheckpointConfig: from .args import Config
"""Parsed and validated checkpoint configuration from environment variables.""" from .handlers import VllmEngineQuiesceController
from .worker_factory import EngineSetupResult
def __init__(self): logger = logging.getLogger(__name__)
self.ready_file = os.environ["DYN_READY_FOR_CHECKPOINT_FILE"]
self.storage_type = os.environ.get("DYN_CHECKPOINT_STORAGE_TYPE", "pvc")
self.location = os.environ.get("DYN_CHECKPOINT_LOCATION", "")
if not self.location:
checkpoint_path = os.environ.get("DYN_CHECKPOINT_PATH", "").rstrip("/")
checkpoint_hash = os.environ.get("DYN_CHECKPOINT_HASH", "")
if checkpoint_path and checkpoint_hash:
self.location = f"{checkpoint_path}/{checkpoint_hash}"
self.is_checkpoint_job = bool(self.location)
self._checkpoint_done = asyncio.Event()
self._restore_done = asyncio.Event()
def checkpoint_exists(self) -> bool:
"""Check if a completed checkpoint already exists (idempotency).
A checkpoint is complete when its directory exists at the base path root
(not under the tmp/ staging area). Directory presence = done.
"""
if self.storage_type != "pvc":
return False
if os.path.isdir(self.location):
logger.info(f"Existing checkpoint found at {self.location}, skipping")
return True
logger.info(f"No checkpoint at {self.location}, creating new one")
return False
async def run_lifecycle(self, engine_client, sleep_level: int) -> bool:
"""Run the full checkpoint lifecycle after the engine is loaded.
1. Put model to sleep (CRIU-friendly GPU state)
2. Write ready file (triggers DaemonSet checkpoint via readiness probe)
3. Wait for watcher signal (checkpoint complete, restore complete, or failure)
4. If restored: wake model and return True (caller proceeds with registration)
5. If checkpoint done: return False (caller should exit)
"""
# Sleep model for checkpoint
logger.info(f"Putting model to sleep (level={sleep_level})")
await engine_client.sleep(level=sleep_level)
# Install signal handlers before writing the ready file so there is no async def prepare_snapshot_engine(
# window where the DaemonSet can send SIGUSR1/SIGCONT while the default config: Config,
# signal disposition (terminate) is still in effect. setup_vllm_engine: Callable[[Config], EngineSetupResult],
self._install_signal_handlers() ) -> EngineSnapshotController[EngineSetupResult] | None:
checkpoint_config = CheckpointConfig.from_env()
if checkpoint_config is None:
return None
# Signal readiness if config.headless:
with open(self.ready_file, "w") as f: raise ValueError(
f.write("ready") "--headless is incompatible with checkpoint mode "
logger.info( "(DYN_CHECKPOINT_SIGNAL_FILE is set). "
"Ready for checkpoint. Waiting for watcher signal " "Remove --headless or unset DYN_CHECKPOINT_SIGNAL_FILE."
"(SIGUSR1=checkpoint complete, SIGCONT=restore complete)"
) )
try: logger.info("Checkpoint mode enabled (watcher-driven signals)")
event = await self._wait_for_watcher_signal() config.engine_args.enable_sleep_mode = True
if event == "restore":
logger.info("Restore signal detected (SIGCONT)")
logger.info("Waking up model after restore")
await engine_client.wake_up()
return True
# SIGUSR1: checkpoint complete
logger.info("Checkpoint completion signal detected (SIGUSR1)")
return False
finally:
self._remove_signal_handlers()
# Remove the ready file so that a restarting pod does not leave a
# stale marker that could trick the DaemonSet into acting on it.
try:
os.unlink(self.ready_file)
except OSError:
pass
def _install_signal_handlers(self) -> None:
loop = asyncio.get_running_loop()
loop.add_signal_handler(signal.SIGUSR1, self._checkpoint_done.set)
# SIGCONT is used as the restore-complete signal. The snapshot DaemonSet
# watcher is the only sender, so there is no conflict with POSIX
# job-control semantics in practice.
loop.add_signal_handler(signal.SIGCONT, self._restore_done.set)
# No handler for checkpoint failure: the watcher sends SIGKILL, which
# terminates the process immediately (cannot be caught).
def _remove_signal_handlers(self) -> None:
loop = asyncio.get_running_loop()
loop.remove_signal_handler(signal.SIGUSR1)
loop.remove_signal_handler(signal.SIGCONT)
async def _wait_for_watcher_signal(self) -> str:
waiters = {
asyncio.create_task(self._checkpoint_done.wait()): "checkpoint",
asyncio.create_task(self._restore_done.wait()): "restore",
}
try:
done, pending = await asyncio.wait(
waiters.keys(), return_when=asyncio.FIRST_COMPLETED
)
for task in pending:
task.cancel()
winner = done.pop()
await winner
return waiters[winner]
finally:
for task in waiters:
if not task.done():
task.cancel()
def get_checkpoint_config() -> tuple[bool, Optional[CheckpointConfig]]:
"""Resolve checkpoint configuration, handling early-exit and cold-start cases.
Checkpoint mode is detected by DYN_READY_FOR_CHECKPOINT_FILE being set.
Returns:
(early_exit, config) where:
- early_exit=True, config=None: checkpoint job re-run, checkpoint already
exists — caller should return immediately.
- early_exit=False, config=None: not in checkpoint mode, or regular worker
with no checkpoint available yet — cold-start normally.
- early_exit=False, config=CheckpointConfig: checkpoint lifecycle should run.
"""
if "DYN_READY_FOR_CHECKPOINT_FILE" not in os.environ:
return False, None
# Validate checkpoint location: either a full location or path + hash must be set.
# Check the value (not just presence) so an empty string is treated as unset.
if not os.environ.get("DYN_CHECKPOINT_LOCATION"):
path = os.environ.get("DYN_CHECKPOINT_PATH", "")
hash_ = os.environ.get("DYN_CHECKPOINT_HASH", "")
if not path or not hash_:
raise EnvironmentError(
"Checkpoint mode requires either DYN_CHECKPOINT_LOCATION or both "
"DYN_CHECKPOINT_PATH and DYN_CHECKPOINT_HASH"
)
cfg = CheckpointConfig()
checkpoint_exists = cfg.checkpoint_exists()
if cfg.is_checkpoint_job and checkpoint_exists:
# Idempotent checkpoint job re-run: checkpoint already exists.
return True, None
if not cfg.is_checkpoint_job and not checkpoint_exists: engine = setup_vllm_engine(config)
# Regular worker with no checkpoint available yet: cold-start normally. snapshot_controller = EngineSnapshotController(
return False, None engine=engine,
quiesce_controller=VllmEngineQuiesceController(engine[0]),
checkpoint_config=checkpoint_config,
quiesce_args=(None,),
)
if not await snapshot_controller.wait_for_restore():
raise SystemExit(0)
return False, cfg return snapshot_controller
...@@ -7,7 +7,7 @@ from unittest.mock import AsyncMock ...@@ -7,7 +7,7 @@ from unittest.mock import AsyncMock
import pytest import pytest
from dynamo.vllm.handlers import BaseWorkerHandler from dynamo.vllm.handlers import BaseWorkerHandler, VllmEngineQuiesceController
pytestmark = [ pytestmark = [
pytest.mark.unit, pytest.mark.unit,
...@@ -34,8 +34,8 @@ def _make_handler() -> _TestWorkerHandler: ...@@ -34,8 +34,8 @@ def _make_handler() -> _TestWorkerHandler:
unregister_endpoint_instance=AsyncMock(), unregister_endpoint_instance=AsyncMock(),
register_endpoint_instance=AsyncMock(), register_endpoint_instance=AsyncMock(),
) )
handler._sleep_wake_lock = asyncio.Lock() handler._quiesce_controller = VllmEngineQuiesceController(handler.engine_client)
handler._engine_is_sleeping = False handler._quiesce_lock = asyncio.Lock()
return handler return handler
...@@ -74,6 +74,36 @@ async def test_sleep_and_wake_are_idempotent(): ...@@ -74,6 +74,36 @@ async def test_sleep_and_wake_are_idempotent():
handler.generate_endpoint.register_endpoint_instance.assert_awaited_once() handler.generate_endpoint.register_endpoint_instance.assert_awaited_once()
@pytest.mark.asyncio
async def test_quiesce_without_level_uses_vllm_default_sleep():
    """Quiescing with a ``None`` level must call ``sleep`` with no arguments."""
    engine_methods = ("pause_generation", "sleep", "wake_up", "resume_generation")
    fake_engine = SimpleNamespace(**{name: AsyncMock() for name in engine_methods})
    quiescer = VllmEngineQuiesceController(fake_engine)

    did_change = await quiescer.quiesce(None)

    assert did_change is True
    fake_engine.pause_generation.assert_awaited_once()
    # No positional level is forwarded, so the engine's own default applies.
    fake_engine.sleep.assert_awaited_once_with()
@pytest.mark.asyncio
async def test_wake_up_passes_explicit_tags_from_request():
    """Tags supplied in the wake request body are forwarded to the engine."""
    worker = _make_handler()
    # Put the controller into the quiesced state so wake_up has work to do.
    await worker._quiesce_controller.quiesce(1)

    response = await worker.wake_up({"tags": ["weights"]})

    assert response["status"] == "ok"
    engine = worker.engine_client
    engine.wake_up.assert_awaited_once_with(["weights"])
    engine.resume_generation.assert_awaited_once()
    worker.generate_endpoint.register_endpoint_instance.assert_awaited_once()
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_sleep_returns_error_for_unregister_failure(): async def test_sleep_returns_error_for_unregister_failure():
handler = _make_handler() handler = _make_handler()
...@@ -91,7 +121,7 @@ async def test_sleep_returns_error_for_unregister_failure(): ...@@ -91,7 +121,7 @@ async def test_sleep_returns_error_for_unregister_failure():
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_wake_up_returns_error_for_register_failure(): async def test_wake_up_returns_error_for_register_failure():
handler = _make_handler() handler = _make_handler()
handler._engine_is_sleeping = True await handler._quiesce_controller.quiesce(1)
handler.generate_endpoint.register_endpoint_instance = AsyncMock( handler.generate_endpoint.register_endpoint_instance = AsyncMock(
side_effect=RuntimeError("discovery write timeout") side_effect=RuntimeError("discovery write timeout")
) )
...@@ -101,3 +131,4 @@ async def test_wake_up_returns_error_for_register_failure(): ...@@ -101,3 +131,4 @@ async def test_wake_up_returns_error_for_register_failure():
assert result["status"] == "error" assert result["status"] == "error"
handler.engine_client.wake_up.assert_awaited_once_with() handler.engine_client.wake_up.assert_awaited_once_with()
handler.engine_client.resume_generation.assert_awaited_once() handler.engine_client.resume_generation.assert_awaited_once()
assert handler._quiesce_controller.is_quiesced is True
...@@ -124,11 +124,12 @@ spec: ...@@ -124,11 +124,12 @@ spec:
default: 3600 default: 3600
description: ActiveDeadlineSeconds specifies the maximum time the Job can run description: ActiveDeadlineSeconds specifies the maximum time the Job can run
format: int64 format: int64
minimum: 1
type: integer type: integer
backoffLimit: backoffLimit:
default: 3 description: 'Deprecated: BackoffLimit is ignored. Checkpoint Jobs never retry.'
description: BackoffLimit specifies the number of retries before marking the Job failed
format: int32 format: int32
minimum: 0
type: integer type: integer
podTemplateSpec: podTemplateSpec:
description: |- description: |-
...@@ -8154,10 +8155,28 @@ spec: ...@@ -8154,10 +8155,28 @@ spec:
- containers - containers
type: object type: object
type: object type: object
sharedMemory:
description: |-
SharedMemory controls the tmpfs mounted at /dev/shm for the checkpoint Job pod.
When omitted, checkpoint Jobs use the same default 8Gi tmpfs as Dynamo components.
properties:
disabled:
type: boolean
size:
anyOf:
- type: integer
- type: string
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
x-kubernetes-int-or-string: true
type: object
x-kubernetes-validations:
- message: sharedMemory.size must not be set when sharedMemory.disabled is true
rule: '!(self.disabled && has(self.size))'
ttlSecondsAfterFinished: ttlSecondsAfterFinished:
default: 300 default: 300
description: TTLSecondsAfterFinished specifies how long to keep the Job after completion description: TTLSecondsAfterFinished specifies how long to keep the Job after completion
format: int32 format: int32
minimum: 0
type: integer type: integer
required: required:
- podTemplateSpec - podTemplateSpec
...@@ -8170,7 +8189,7 @@ spec: ...@@ -8170,7 +8189,7 @@ spec:
description: DynamoCheckpointStatus defines the observed state of DynamoCheckpoint description: DynamoCheckpointStatus defines the observed state of DynamoCheckpoint
properties: properties:
conditions: conditions:
description: Conditions represent the latest available observations of the checkpoint's state description: 'DEPRECATED: Conditions are deprecated. Use status.phase instead.'
items: items:
description: Condition contains details for one aspect of the current state of this API Resource. description: Condition contains details for one aspect of the current state of this API Resource.
properties: properties:
......
...@@ -678,8 +678,8 @@ spec: ...@@ -678,8 +678,8 @@ spec:
properties: properties:
checkpointRef: checkpointRef:
description: |- description: |-
CheckpointRef references an existing Checkpoint CR to use CheckpointRef references an existing DynamoCheckpoint CR by metadata.name.
If specified, Identity is ignored and this checkpoint is used directly If specified, this service's Identity is ignored and the referenced checkpoint is used directly.
type: string type: string
enabled: enabled:
default: false default: false
...@@ -11211,6 +11211,9 @@ spec: ...@@ -11211,6 +11211,9 @@ spec:
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
x-kubernetes-int-or-string: true x-kubernetes-int-or-string: true
type: object type: object
x-kubernetes-validations:
- message: sharedMemory.size must not be set when sharedMemory.disabled is true
rule: '!(self.disabled && has(self.size))'
subComponentType: subComponentType:
description: SubComponentType indicates the sub-role of this component (for example, "prefill"). description: SubComponentType indicates the sub-role of this component (for example, "prefill").
type: string type: string
......
...@@ -887,8 +887,8 @@ spec: ...@@ -887,8 +887,8 @@ spec:
properties: properties:
checkpointRef: checkpointRef:
description: |- description: |-
CheckpointRef references an existing Checkpoint CR to use CheckpointRef references an existing DynamoCheckpoint CR by metadata.name.
If specified, Identity is ignored and this checkpoint is used directly If specified, this service's Identity is ignored and the referenced checkpoint is used directly.
type: string type: string
enabled: enabled:
default: false default: false
...@@ -11420,6 +11420,9 @@ spec: ...@@ -11420,6 +11420,9 @@ spec:
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
x-kubernetes-int-or-string: true x-kubernetes-int-or-string: true
type: object type: object
x-kubernetes-validations:
- message: sharedMemory.size must not be set when sharedMemory.disabled is true
rule: '!(self.disabled && has(self.size))'
subComponentType: subComponentType:
description: SubComponentType indicates the sub-role of this component (for example, "prefill"). description: SubComponentType indicates the sub-role of this component (for example, "prefill").
type: string type: string
...@@ -11466,7 +11469,7 @@ spec: ...@@ -11466,7 +11469,7 @@ spec:
description: IdentityHash is the computed hash of the checkpoint identity description: IdentityHash is the computed hash of the checkpoint identity
type: string type: string
ready: ready:
description: Ready indicates if the checkpoint is ready for use description: Ready indicates if the checkpoint was visible to the worker at startup
type: boolean type: boolean
type: object type: object
description: |- description: |-
......
...@@ -33,6 +33,8 @@ dynamo-operator: ...@@ -33,6 +33,8 @@ dynamo-operator:
basePath: /checkpoints basePath: /checkpoints
``` ```
The snapshot-agent no longer reads `basePath` from its ConfigMap, but the operator still uses its configured PVC base path when it annotates checkpoint and restore pods. That path must match `storage.pvc.basePath` here so the mounted checkpoint location is valid inside the agent pod.
Cross-node restore requires a shared `ReadWriteMany` storage class. The chart defaults to `storage.pvc.accessMode=ReadWriteMany`. Cross-node restore requires a shared `ReadWriteMany` storage class. The chart defaults to `storage.pvc.accessMode=ReadWriteMany`.
For better restore times, use a fast `ReadWriteMany` StorageClass for the checkpoint PVC. For better restore times, use a fast `ReadWriteMany` StorageClass for the checkpoint PVC.
...@@ -81,7 +83,7 @@ kubectl get pods -n ${NAMESPACE} -l app.kubernetes.io/name=snapshot -o wide ...@@ -81,7 +83,7 @@ kubectl get pods -n ${NAMESPACE} -l app.kubernetes.io/name=snapshot -o wide
| `storage.pvc.size` | Requested PVC size | `1Ti` | | `storage.pvc.size` | Requested PVC size | `1Ti` |
| `storage.pvc.storageClass` | Storage class name | `""` | | `storage.pvc.storageClass` | Storage class name | `""` |
| `storage.pvc.accessMode` | Access mode for the checkpoint PVC | `ReadWriteMany` | | `storage.pvc.accessMode` | Access mode for the checkpoint PVC | `ReadWriteMany` |
| `storage.pvc.basePath` | Checkpoint root inside the PVC | `/checkpoints` | | `storage.pvc.basePath` | PVC mount path inside the snapshot-agent pod | `/checkpoints` |
| `daemonset.image.repository` | Snapshot agent image repository | `nvcr.io/nvidia/ai-dynamo/snapshot-agent` | | `daemonset.image.repository` | Snapshot agent image repository | `nvcr.io/nvidia/ai-dynamo/snapshot-agent` |
| `daemonset.image.tag` | Snapshot agent image tag | `1.0.0` | | `daemonset.image.tag` | Snapshot agent image tag | `1.0.0` |
| `daemonset.imagePullSecrets` | Image pull secrets for the agent | `[{name: ngc-secret}]` | | `daemonset.imagePullSecrets` | Image pull secrets for the agent | `[{name: ngc-secret}]` |
......
...@@ -10,12 +10,8 @@ metadata: ...@@ -10,12 +10,8 @@ metadata:
{{- include "snapshot.labels" . | nindent 4 }} {{- include "snapshot.labels" . | nindent 4 }}
data: data:
config.yaml: | config.yaml: |
basePath: {{ .Values.storage.pvc.basePath | quote }}
overlay: overlay:
systemDirs: {{ toYaml .Values.config.overlay.systemDirs | nindent 8 }} exclusions: {{ toYaml .Values.config.overlay.exclusions | nindent 8 }}
cacheDirs: {{ toYaml .Values.config.overlay.cacheDirs | nindent 8 }}
additionalExclusions: {{ toYaml .Values.config.overlay.additionalExclusions | nindent 8 }}
restore: restore:
nsRestorePath: {{ .Values.config.restore.nsRestorePath | quote }} nsRestorePath: {{ .Values.config.restore.nsRestorePath | quote }}
...@@ -29,6 +25,7 @@ data: ...@@ -29,6 +25,7 @@ data:
leaveRunning: {{ .Values.config.criu.leaveRunning }} leaveRunning: {{ .Values.config.criu.leaveRunning }}
shellJob: {{ .Values.config.criu.shellJob }} shellJob: {{ .Values.config.criu.shellJob }}
tcpClose: {{ .Values.config.criu.tcpClose }} tcpClose: {{ .Values.config.criu.tcpClose }}
tcpEstablished: {{ .Values.config.criu.tcpEstablished }}
fileLocks: {{ .Values.config.criu.fileLocks }} fileLocks: {{ .Values.config.criu.fileLocks }}
orphanPtsMaster: {{ .Values.config.criu.orphanPtsMaster }} orphanPtsMaster: {{ .Values.config.criu.orphanPtsMaster }}
extUnixSk: {{ .Values.config.criu.extUnixSk }} extUnixSk: {{ .Values.config.criu.extUnixSk }}
......
...@@ -16,6 +16,14 @@ rules: ...@@ -16,6 +16,14 @@ rules:
- apiGroups: [""] - apiGroups: [""]
resources: ["pods"] resources: ["pods"]
verbs: ["get", "list", "watch", "patch", "update"] verbs: ["get", "list", "watch", "patch", "update"]
# Patch checkpoint Jobs with terminal checkpoint status
- apiGroups: ["batch"]
resources: ["jobs"]
verbs: ["get", "patch", "update"]
# Coordinate checkpoint ownership with per-Job leases
- apiGroups: ["coordination.k8s.io"]
resources: ["leases"]
verbs: ["get", "create", "update", "delete"]
# Emit operational events on pod/restore lifecycle updates # Emit operational events on pod/restore lifecycle updates
- apiGroups: [""] - apiGroups: [""]
resources: ["events"] resources: ["events"]
...@@ -33,6 +41,14 @@ rules: ...@@ -33,6 +41,14 @@ rules:
- apiGroups: [""] - apiGroups: [""]
resources: ["pods"] resources: ["pods"]
verbs: ["get", "list", "watch", "patch", "update"] verbs: ["get", "list", "watch", "patch", "update"]
# Patch checkpoint Jobs with terminal checkpoint status
- apiGroups: ["batch"]
resources: ["jobs"]
verbs: ["get", "patch", "update"]
# Coordinate checkpoint ownership with per-Job leases
- apiGroups: ["coordination.k8s.io"]
resources: ["leases"]
verbs: ["get", "create", "update", "delete"]
# Emit operational events on pod/restore lifecycle updates # Emit operational events on pod/restore lifecycle updates
- apiGroups: [""] - apiGroups: [""]
resources: ["events"] resources: ["events"]
......
...@@ -34,7 +34,8 @@ storage: ...@@ -34,7 +34,8 @@ storage:
storageClass: "" storageClass: ""
# Access mode - ReadWriteMany required for multi-pod access # Access mode - ReadWriteMany required for multi-pod access
accessMode: ReadWriteMany accessMode: ReadWriteMany
# Base path for checkpoints (mounted in pods) # PVC mount path inside the snapshot-agent pod.
# This must match the operator checkpoint.storage.pvc.basePath setting.
basePath: /checkpoints basePath: /checkpoints
# S3 configuration (when type=s3) # S3 configuration (when type=s3)
...@@ -123,17 +124,15 @@ rbac: ...@@ -123,17 +124,15 @@ rbac:
# Dynamic values (NODE_NAME, RESTRICTED_NAMESPACE, etc.) come from environment variables # Dynamic values (NODE_NAME, RESTRICTED_NAMESPACE, etc.) come from environment variables
config: config:
overlay: overlay:
# Virtual FS dirs are COW artifacts in the overlay upperdir. # Rootfs diff tar exclusions. Absolute-looking paths are normalized
systemDirs: # relative to the tar root, and patterns starting with * are passed
# through as tar globs unchanged.
exclusions:
- /proc - /proc
- /sys - /sys
- /dev - /dev
# Cache directories to exclude (reduces checkpoint size) - "*/.cache/huggingface"
cacheDirs: - "*/.cache/vllm/torch_compile_cache"
- /.cache/huggingface
# Python bytecode is already loaded in memory at restore time and
# regenerated automatically on cold start.
additionalExclusions:
- "*/__pycache__" - "*/__pycache__"
- "*.pyc" - "*.pyc"
...@@ -157,7 +156,8 @@ config: ...@@ -157,7 +156,8 @@ config:
# K8s-specific options (recommended defaults for containers) # K8s-specific options (recommended defaults for containers)
leaveRunning: true # Keep process running after checkpoint leaveRunning: true # Keep process running after checkpoint
shellJob: true # Containers are often session leaders shellJob: true # Containers are often session leaders
tcpClose: true # Pod IPs change on restore/migration tcpClose: true # Close non-listening TCP sockets on restore
tcpEstablished: false # Preserve established TCP sockets during restore
fileLocks: true # Applications use file locks fileLocks: true # Applications use file locks
orphanPtsMaster: true # Containers with TTYs orphanPtsMaster: true # Containers with TTYs
extUnixSk: true # External Unix sockets extUnixSk: true # External Unix sockets
......
...@@ -71,6 +71,7 @@ type Autoscaling struct { ...@@ -71,6 +71,7 @@ type Autoscaling struct {
Metrics []autoscalingv2.MetricSpec `json:"metrics,omitempty"` Metrics []autoscalingv2.MetricSpec `json:"metrics,omitempty"`
} }
// +kubebuilder:validation:XValidation:rule="!(self.disabled && has(self.size))",message="sharedMemory.size must not be set when sharedMemory.disabled is true"
type SharedMemorySpec struct { type SharedMemorySpec struct {
Disabled bool `json:"disabled,omitempty"` Disabled bool `json:"disabled,omitempty"`
Size resource.Quantity `json:"size,omitempty"` Size resource.Quantity `json:"size,omitempty"`
...@@ -192,8 +193,8 @@ type ServiceCheckpointConfig struct { ...@@ -192,8 +193,8 @@ type ServiceCheckpointConfig struct {
// +kubebuilder:default=Auto // +kubebuilder:default=Auto
Mode CheckpointMode `json:"mode,omitempty"` Mode CheckpointMode `json:"mode,omitempty"`
// CheckpointRef references an existing Checkpoint CR to use // CheckpointRef references an existing DynamoCheckpoint CR by metadata.name.
// If specified, Identity is ignored and this checkpoint is used directly // If specified, this service's Identity is ignored and the referenced checkpoint is used directly.
// +optional // +optional
CheckpointRef *string `json:"checkpointRef,omitempty"` CheckpointRef *string `json:"checkpointRef,omitempty"`
......
...@@ -93,18 +93,25 @@ type DynamoCheckpointJobConfig struct { ...@@ -93,18 +93,25 @@ type DynamoCheckpointJobConfig struct {
// +kubebuilder:validation:Required // +kubebuilder:validation:Required
PodTemplateSpec corev1.PodTemplateSpec `json:"podTemplateSpec"` PodTemplateSpec corev1.PodTemplateSpec `json:"podTemplateSpec"`
// SharedMemory controls the tmpfs mounted at /dev/shm for the checkpoint Job pod.
// When omitted, checkpoint Jobs use the same default 8Gi tmpfs as Dynamo components.
// +optional
SharedMemory *SharedMemorySpec `json:"sharedMemory,omitempty"`
// ActiveDeadlineSeconds specifies the maximum time the Job can run // ActiveDeadlineSeconds specifies the maximum time the Job can run
// +optional // +optional
// +kubebuilder:validation:Minimum=1
// +kubebuilder:default=3600 // +kubebuilder:default=3600
ActiveDeadlineSeconds *int64 `json:"activeDeadlineSeconds,omitempty"` ActiveDeadlineSeconds *int64 `json:"activeDeadlineSeconds,omitempty"`
// BackoffLimit specifies the number of retries before marking the Job failed // Deprecated: BackoffLimit is ignored. Checkpoint Jobs never retry.
// +optional // +optional
// +kubebuilder:default=3 // +kubebuilder:validation:Minimum=0
BackoffLimit *int32 `json:"backoffLimit,omitempty"` BackoffLimit *int32 `json:"backoffLimit,omitempty"`
// TTLSecondsAfterFinished specifies how long to keep the Job after completion // TTLSecondsAfterFinished specifies how long to keep the Job after completion
// +optional // +optional
// +kubebuilder:validation:Minimum=0
// +kubebuilder:default=300 // +kubebuilder:default=300
TTLSecondsAfterFinished *int32 `json:"ttlSecondsAfterFinished,omitempty"` TTLSecondsAfterFinished *int32 `json:"ttlSecondsAfterFinished,omitempty"`
} }
...@@ -124,9 +131,9 @@ type DynamoCheckpointSpec struct { ...@@ -124,9 +131,9 @@ type DynamoCheckpointSpec struct {
type DynamoCheckpointConditionType string type DynamoCheckpointConditionType string
const ( const (
// DynamoCheckpointConditionJobCreated indicates whether the checkpoint Job has been created // Deprecated: DynamoCheckpointConditionJobCreated is deprecated. Use status.phase instead.
DynamoCheckpointConditionJobCreated DynamoCheckpointConditionType = "JobCreated" DynamoCheckpointConditionJobCreated DynamoCheckpointConditionType = "JobCreated"
// DynamoCheckpointConditionJobCompleted indicates whether the checkpoint Job has completed // Deprecated: DynamoCheckpointConditionJobCompleted is deprecated. Use status.phase instead.
DynamoCheckpointConditionJobCompleted DynamoCheckpointConditionType = "JobCompleted" DynamoCheckpointConditionJobCompleted DynamoCheckpointConditionType = "JobCompleted"
) )
...@@ -164,7 +171,7 @@ type DynamoCheckpointStatus struct { ...@@ -164,7 +171,7 @@ type DynamoCheckpointStatus struct {
// +optional // +optional
Message string `json:"message,omitempty"` Message string `json:"message,omitempty"`
// Conditions represent the latest available observations of the checkpoint's state // Deprecated: Conditions are deprecated. Use status.phase instead.
// +optional // +optional
Conditions []metav1.Condition `json:"conditions,omitempty"` Conditions []metav1.Condition `json:"conditions,omitempty"`
} }
......
...@@ -144,7 +144,7 @@ type ServiceCheckpointStatus struct { ...@@ -144,7 +144,7 @@ type ServiceCheckpointStatus struct {
// IdentityHash is the computed hash of the checkpoint identity // IdentityHash is the computed hash of the checkpoint identity
// +optional // +optional
IdentityHash string `json:"identityHash,omitempty"` IdentityHash string `json:"identityHash,omitempty"`
// Ready indicates if the checkpoint is ready for use // Ready indicates if the checkpoint was visible to the worker at startup
// +optional // +optional
Ready bool `json:"ready,omitempty"` Ready bool `json:"ready,omitempty"`
} }
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment