Unverified commit 6831020f, authored by Schwinn Saereesitthipitak, committed by GitHub
Browse files

chore: rename chrek to Dynamo Snapshot (#7028)


Signed-off-by: Schwinn Saereesitthipitak <17022745+galletas1712@users.noreply.github.com>
parent 7dbebf3c
...@@ -108,7 +108,7 @@ deploy: ...@@ -108,7 +108,7 @@ deploy:
- *ci - *ci
- 'deploy/helm/**' - 'deploy/helm/**'
- 'deploy/utils/**' - 'deploy/utils/**'
- 'deploy/chrek/**' - 'deploy/snapshot/**'
- 'tests/deploy/**' - 'tests/deploy/**'
planner: planner:
......
...@@ -61,16 +61,16 @@ async def init_decode( ...@@ -61,16 +61,16 @@ async def init_decode(
shutdown_event: asyncio.Event, shutdown_event: asyncio.Event,
shutdown_endpoints: list, shutdown_endpoints: list,
run_deferred_handlers: Callable[[], Awaitable[None]] | None = None, run_deferred_handlers: Callable[[], Awaitable[None]] | None = None,
checkpoint_restore_engine: Optional[sgl.Engine] = None, snapshot_engine: Optional[sgl.Engine] = None,
) -> None: ) -> None:
server_args, dynamo_args = config.server_args, config.dynamo_args server_args, dynamo_args = config.server_args, config.dynamo_args
if server_args.node_rank >= 1: if server_args.node_rank >= 1:
os.environ["SGLANG_BLOCK_NONZERO_RANK_CHILDREN"] = "0" os.environ["SGLANG_BLOCK_NONZERO_RANK_CHILDREN"] = "0"
# Use pre-created engine if provided (checkpoint/restore mode) # Use pre-created engine if provided (snapshot mode)
if checkpoint_restore_engine is not None: if snapshot_engine is not None:
engine = checkpoint_restore_engine engine = snapshot_engine
load_time = 0.0 load_time = 0.0
else: else:
start_time = time.time() start_time = time.time()
...@@ -151,16 +151,16 @@ async def init_prefill( ...@@ -151,16 +151,16 @@ async def init_prefill(
shutdown_event: asyncio.Event, shutdown_event: asyncio.Event,
shutdown_endpoints: list, shutdown_endpoints: list,
run_deferred_handlers: Callable[[], Awaitable[None]] | None = None, run_deferred_handlers: Callable[[], Awaitable[None]] | None = None,
checkpoint_restore_engine: Optional[sgl.Engine] = None, snapshot_engine: Optional[sgl.Engine] = None,
) -> None: ) -> None:
server_args, dynamo_args = config.server_args, config.dynamo_args server_args, dynamo_args = config.server_args, config.dynamo_args
if server_args.node_rank >= 1: if server_args.node_rank >= 1:
os.environ["SGLANG_BLOCK_NONZERO_RANK_CHILDREN"] = "0" os.environ["SGLANG_BLOCK_NONZERO_RANK_CHILDREN"] = "0"
# Use pre-created engine if provided (checkpoint/restore mode) # Use pre-created engine if provided (snapshot mode)
if checkpoint_restore_engine is not None: if snapshot_engine is not None:
engine = checkpoint_restore_engine engine = snapshot_engine
else: else:
engine = sgl.Engine(server_args=server_args) engine = sgl.Engine(server_args=server_args)
......
...@@ -12,7 +12,6 @@ from dynamo.common.constants import DisaggregationMode ...@@ -12,7 +12,6 @@ from dynamo.common.constants import DisaggregationMode
from dynamo.common.utils.runtime import create_runtime from dynamo.common.utils.runtime import create_runtime
from dynamo.runtime.logging import configure_dynamo_logging from dynamo.runtime.logging import configure_dynamo_logging
from dynamo.sglang.args import parse_args from dynamo.sglang.args import parse_args
from dynamo.sglang.checkpoint_restore import handle_checkpoint_mode
from dynamo.sglang.init_diffusion import ( from dynamo.sglang.init_diffusion import (
init_image_diffusion, init_image_diffusion,
init_llm_diffusion, init_llm_diffusion,
...@@ -27,6 +26,7 @@ from dynamo.sglang.init_multimodal import ( ...@@ -27,6 +26,7 @@ from dynamo.sglang.init_multimodal import (
init_multimodal_worker, init_multimodal_worker,
) )
from dynamo.sglang.shutdown import install_graceful_shutdown from dynamo.sglang.shutdown import install_graceful_shutdown
from dynamo.sglang.snapshot import handle_checkpoint_mode
configure_dynamo_logging() configure_dynamo_logging()
...@@ -41,9 +41,7 @@ async def worker(): ...@@ -41,9 +41,7 @@ async def worker():
config.server_args.load_format = setup_gms(config.server_args) config.server_args.load_format = setup_gms(config.server_args)
# Checkpoint mode: engine must be created BEFORE runtime (no NATS/etcd during CRIU) # Checkpoint mode: engine must be created BEFORE runtime (no NATS/etcd during CRIU)
should_exit, checkpoint_restore_engine = await handle_checkpoint_mode( should_exit, snapshot_engine = await handle_checkpoint_mode(config.server_args)
config.server_args
)
if should_exit: if should_exit:
return return
...@@ -129,7 +127,7 @@ async def worker(): ...@@ -129,7 +127,7 @@ async def worker():
shutdown_event, shutdown_event,
shutdown_endpoints, shutdown_endpoints,
run_deferred_handlers, run_deferred_handlers,
checkpoint_restore_engine=checkpoint_restore_engine, snapshot_engine=snapshot_engine,
) )
else: else:
await init_prefill( await init_prefill(
...@@ -138,7 +136,7 @@ async def worker(): ...@@ -138,7 +136,7 @@ async def worker():
shutdown_event, shutdown_event,
shutdown_endpoints, shutdown_endpoints,
run_deferred_handlers, run_deferred_handlers,
checkpoint_restore_engine=checkpoint_restore_engine, snapshot_engine=snapshot_engine,
) )
......
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
""" """
Checkpoint/restore (chrek) integration for SGLang workers. Dynamo Snapshot integration for SGLang workers.
Handles the checkpoint job pod lifecycle: Handles the checkpoint job pod lifecycle:
1. Early exit if a checkpoint already exists (idempotency) 1. Early exit if a checkpoint already exists (idempotency)
...@@ -164,7 +164,7 @@ class CheckpointConfig: ...@@ -164,7 +164,7 @@ class CheckpointConfig:
def _install_signal_handlers(self) -> None: def _install_signal_handlers(self) -> None:
loop = asyncio.get_running_loop() loop = asyncio.get_running_loop()
loop.add_signal_handler(signal.SIGUSR1, self._checkpoint_done.set) loop.add_signal_handler(signal.SIGUSR1, self._checkpoint_done.set)
# SIGCONT is used as the restore-complete signal. The chrek DaemonSet # SIGCONT is used as the restore-complete signal. The snapshot DaemonSet
# watcher is the only sender, so there is no conflict with POSIX # watcher is the only sender, so there is no conflict with POSIX
# job-control semantics in practice. # job-control semantics in practice.
loop.add_signal_handler(signal.SIGCONT, self._restore_done.set) loop.add_signal_handler(signal.SIGCONT, self._restore_done.set)
...@@ -197,7 +197,7 @@ class CheckpointConfig: ...@@ -197,7 +197,7 @@ class CheckpointConfig:
async def handle_checkpoint_mode(server_args) -> tuple[bool, Optional[sgl.Engine]]: async def handle_checkpoint_mode(server_args) -> tuple[bool, Optional[sgl.Engine]]:
"""Single entry point for checkpoint/restore integration. """Single entry point for Dynamo Snapshot integration.
Must be called BEFORE runtime creation so the engine can be checkpointed Must be called BEFORE runtime creation so the engine can be checkpointed
without active NATS/etcd connections. without active NATS/etcd connections.
......
...@@ -42,7 +42,6 @@ from dynamo.runtime.logging import configure_dynamo_logging ...@@ -42,7 +42,6 @@ from dynamo.runtime.logging import configure_dynamo_logging
from dynamo.vllm.worker_factory import WorkerFactory from dynamo.vllm.worker_factory import WorkerFactory
from .args import Config, _uses_dynamo_connector, parse_args from .args import Config, _uses_dynamo_connector, parse_args
from .checkpoint_restore import get_checkpoint_config
from .constants import DisaggregationMode from .constants import DisaggregationMode
from .handlers import DecodeWorkerHandler, PrefillWorkerHandler, get_dp_range_for_worker from .handlers import DecodeWorkerHandler, PrefillWorkerHandler, get_dp_range_for_worker
from .health_check import ( from .health_check import (
...@@ -51,6 +50,7 @@ from .health_check import ( ...@@ -51,6 +50,7 @@ from .health_check import (
VllmPrefillHealthCheckPayload, VllmPrefillHealthCheckPayload,
) )
from .publisher import DYNAMO_COMPONENT_REGISTRY, StatLoggerFactory from .publisher import DYNAMO_COMPONENT_REGISTRY, StatLoggerFactory
from .snapshot import get_checkpoint_config
# Optional imports for frontend decoding support # Optional imports for frontend decoding support
MediaDecoder: type | None = None MediaDecoder: type | None = None
...@@ -135,15 +135,15 @@ async def worker() -> None: ...@@ -135,15 +135,15 @@ async def worker() -> None:
# CHECKPOINT MODE: Load engine BEFORE runtime creation # CHECKPOINT MODE: Load engine BEFORE runtime creation
# This allows checkpointing GPU state before runtime connections are established # This allows checkpointing GPU state before runtime connections are established
checkpoint_restore_engine = None snapshot_engine = None
if checkpoint_cfg is not None: if checkpoint_cfg is not None:
logger.info("Checkpoint mode enabled (watcher-driven signals)") logger.info("Checkpoint mode enabled (watcher-driven signals)")
# Checkpoint mode requires sleep mode — enable before engine init # Checkpoint mode requires sleep mode — enable before engine init
config.engine_args.enable_sleep_mode = True config.engine_args.enable_sleep_mode = True
checkpoint_restore_engine = setup_vllm_engine(config) snapshot_engine = setup_vllm_engine(config)
engine_client = checkpoint_restore_engine[0] engine_client = snapshot_engine[0]
if not await checkpoint_cfg.run_lifecycle( if not await checkpoint_cfg.run_lifecycle(
engine_client, CHECKPOINT_SLEEP_MODE_LEVEL engine_client, CHECKPOINT_SLEEP_MODE_LEVEL
...@@ -173,7 +173,7 @@ async def worker() -> None: ...@@ -173,7 +173,7 @@ async def worker() -> None:
config, config,
shutdown_event, shutdown_event,
shutdown_endpoints, shutdown_endpoints,
checkpoint_restore_engine=checkpoint_restore_engine, snapshot_engine=snapshot_engine,
) )
logger.debug("multimodal worker completed") logger.debug("multimodal worker completed")
elif config.omni: elif config.omni:
...@@ -184,7 +184,7 @@ async def worker() -> None: ...@@ -184,7 +184,7 @@ async def worker() -> None:
runtime, runtime,
config, config,
shutdown_event, shutdown_event,
checkpoint_restore_engine=checkpoint_restore_engine, snapshot_engine=snapshot_engine,
) )
logger.debug("init_prefill completed") logger.debug("init_prefill completed")
else: else:
...@@ -192,7 +192,7 @@ async def worker() -> None: ...@@ -192,7 +192,7 @@ async def worker() -> None:
runtime, runtime,
config, config,
shutdown_event, shutdown_event,
checkpoint_restore_engine=checkpoint_restore_engine, snapshot_engine=snapshot_engine,
) )
logger.debug("init completed") logger.debug("init completed")
...@@ -597,7 +597,7 @@ async def init_prefill( ...@@ -597,7 +597,7 @@ async def init_prefill(
runtime: DistributedRuntime, runtime: DistributedRuntime,
config: Config, config: Config,
shutdown_event: asyncio.Event, shutdown_event: asyncio.Event,
checkpoint_restore_engine: Optional[ snapshot_engine: Optional[
tuple[AsyncLLM, VllmConfig, Any, Any, LLMBackendMetrics] tuple[AsyncLLM, VllmConfig, Any, Any, LLMBackendMetrics]
] = None, ] = None,
) -> None: ) -> None:
...@@ -612,14 +612,14 @@ async def init_prefill( ...@@ -612,14 +612,14 @@ async def init_prefill(
) )
# Use pre-created engine if provided (checkpoint mode), otherwise create new # Use pre-created engine if provided (checkpoint mode), otherwise create new
if checkpoint_restore_engine is not None: if snapshot_engine is not None:
( (
engine_client, engine_client,
vllm_config, vllm_config,
default_sampling_params, default_sampling_params,
prometheus_temp_dir, prometheus_temp_dir,
_component_gauges, _component_gauges,
) = checkpoint_restore_engine ) = snapshot_engine
else: else:
( (
engine_client, engine_client,
...@@ -741,7 +741,7 @@ async def init( ...@@ -741,7 +741,7 @@ async def init(
runtime: DistributedRuntime, runtime: DistributedRuntime,
config: Config, config: Config,
shutdown_event: asyncio.Event, shutdown_event: asyncio.Event,
checkpoint_restore_engine: Optional[ snapshot_engine: Optional[
tuple[AsyncLLM, VllmConfig, Any, Any, LLMBackendMetrics] tuple[AsyncLLM, VllmConfig, Any, Any, LLMBackendMetrics]
] = None, ] = None,
) -> None: ) -> None:
...@@ -782,14 +782,14 @@ async def init( ...@@ -782,14 +782,14 @@ async def init(
) )
# Use pre-created engine if provided (checkpoint mode), otherwise create new # Use pre-created engine if provided (checkpoint mode), otherwise create new
if checkpoint_restore_engine is not None: if snapshot_engine is not None:
( (
engine_client, engine_client,
vllm_config, vllm_config,
default_sampling_params, default_sampling_params,
prometheus_temp_dir, prometheus_temp_dir,
component_gauges, component_gauges,
) = checkpoint_restore_engine ) = snapshot_engine
# Factory is created after unpack so component_gauges is available # Factory is created after unpack so component_gauges is available
factory = StatLoggerFactory( factory = StatLoggerFactory(
endpoint=generate_endpoint, endpoint=generate_endpoint,
......
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
""" """
Checkpoint/restore (chrek) integration for vLLM workers. Dynamo Snapshot integration for vLLM workers.
Handles the checkpoint job pod lifecycle: Handles the checkpoint job pod lifecycle:
1. Early exit if a checkpoint already exists (idempotency) 1. Early exit if a checkpoint already exists (idempotency)
...@@ -113,7 +113,7 @@ class CheckpointConfig: ...@@ -113,7 +113,7 @@ class CheckpointConfig:
def _install_signal_handlers(self) -> None: def _install_signal_handlers(self) -> None:
loop = asyncio.get_running_loop() loop = asyncio.get_running_loop()
loop.add_signal_handler(signal.SIGUSR1, self._checkpoint_done.set) loop.add_signal_handler(signal.SIGUSR1, self._checkpoint_done.set)
# SIGCONT is used as the restore-complete signal. The chrek DaemonSet # SIGCONT is used as the restore-complete signal. The snapshot DaemonSet
# watcher is the only sender, so there is no conflict with POSIX # watcher is the only sender, so there is no conflict with POSIX
# job-control semantics in practice. # job-control semantics in practice.
loop.add_signal_handler(signal.SIGCONT, self._restore_done.set) loop.add_signal_handler(signal.SIGCONT, self._restore_done.set)
......
...@@ -103,14 +103,12 @@ class TestCreate: ...@@ -103,14 +103,12 @@ class TestCreate:
factory._create_multimodal_worker.assert_called_once() # type: ignore[union-attr] factory._create_multimodal_worker.assert_called_once() # type: ignore[union-attr]
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_passes_checkpoint_restore_engine( async def test_passes_snapshot_engine(self, factory: WorkerFactory) -> None:
self, factory: WorkerFactory
) -> None:
config = _make_config(multimodal_worker=True) config = _make_config(multimodal_worker=True)
runtime = Mock() runtime = Mock()
shutdown_event = asyncio.Event() shutdown_event = asyncio.Event()
shutdown_endpoints: list = [] shutdown_endpoints: list = []
checkpoint_restore_engine: EngineSetupResult = ( snapshot_engine: EngineSetupResult = (
Mock(), Mock(),
Mock(), Mock(),
Mock(), Mock(),
...@@ -123,7 +121,7 @@ class TestCreate: ...@@ -123,7 +121,7 @@ class TestCreate:
config, config,
shutdown_event, shutdown_event,
shutdown_endpoints, shutdown_endpoints,
checkpoint_restore_engine=checkpoint_restore_engine, snapshot_engine=snapshot_engine,
) )
factory._create_multimodal_worker.assert_called_once_with( # type: ignore[union-attr] factory._create_multimodal_worker.assert_called_once_with( # type: ignore[union-attr]
...@@ -131,7 +129,7 @@ class TestCreate: ...@@ -131,7 +129,7 @@ class TestCreate:
config, config,
shutdown_event, shutdown_event,
shutdown_endpoints, shutdown_endpoints,
checkpoint_restore_engine=checkpoint_restore_engine, snapshot_engine=snapshot_engine,
) )
@pytest.mark.asyncio @pytest.mark.asyncio
......
...@@ -58,7 +58,7 @@ class WorkerFactory: ...@@ -58,7 +58,7 @@ class WorkerFactory:
config: Config, config: Config,
shutdown_event: asyncio.Event, shutdown_event: asyncio.Event,
shutdown_endpoints: list, shutdown_endpoints: list,
checkpoint_restore_engine: Optional[EngineSetupResult] = None, snapshot_engine: Optional[EngineSetupResult] = None,
) -> None: ) -> None:
"""Create the appropriate multimodal worker based on config flags.""" """Create the appropriate multimodal worker based on config flags."""
...@@ -72,7 +72,7 @@ class WorkerFactory: ...@@ -72,7 +72,7 @@ class WorkerFactory:
config, config,
shutdown_event, shutdown_event,
shutdown_endpoints, shutdown_endpoints,
checkpoint_restore_engine=checkpoint_restore_engine, snapshot_engine=snapshot_engine,
) )
else: else:
raise ValueError( raise ValueError(
...@@ -85,7 +85,7 @@ class WorkerFactory: ...@@ -85,7 +85,7 @@ class WorkerFactory:
config: Config, config: Config,
shutdown_event: asyncio.Event, shutdown_event: asyncio.Event,
shutdown_endpoints: list, # mutated in place shutdown_endpoints: list, # mutated in place
checkpoint_restore_engine: Optional[EngineSetupResult] = None, snapshot_engine: Optional[EngineSetupResult] = None,
) -> None: ) -> None:
""" """
Initialize multimodal worker component. Initialize multimodal worker component.
...@@ -121,14 +121,14 @@ class WorkerFactory: ...@@ -121,14 +121,14 @@ class WorkerFactory:
[load_lora_endpoint, unload_lora_endpoint, list_loras_endpoint] [load_lora_endpoint, unload_lora_endpoint, list_loras_endpoint]
) )
# Use pre-created engine if provided (checkpoint mode), otherwise create new # Use pre-created engine if provided (checkpoint mode), otherwise create new
if checkpoint_restore_engine is not None: if snapshot_engine is not None:
( (
engine_client, engine_client,
vllm_config, vllm_config,
_default_sampling_params, _default_sampling_params,
prometheus_temp_dir, prometheus_temp_dir,
_component_gauges, _component_gauges,
) = checkpoint_restore_engine ) = snapshot_engine
else: else:
( (
engine_client, engine_client,
......
...@@ -163,7 +163,7 @@ The chart includes built-in validation to prevent all operator conflicts: ...@@ -163,7 +163,7 @@ The chart includes built-in validation to prevent all operator conflicts:
| dynamo-operator.checkpoint.enabled | bool | `false` | Whether to enable checkpoint/restore functionality | | dynamo-operator.checkpoint.enabled | bool | `false` | Whether to enable checkpoint/restore functionality |
| dynamo-operator.checkpoint.readyForCheckpointFilePath | string | `"/tmp/ready-for-checkpoint"` | Path written by worker when model is loaded and ready for checkpointing | | dynamo-operator.checkpoint.readyForCheckpointFilePath | string | `"/tmp/ready-for-checkpoint"` | Path written by worker when model is loaded and ready for checkpointing |
| dynamo-operator.checkpoint.storage.type | string | `"pvc"` | Storage backend type: pvc, s3, or oci | | dynamo-operator.checkpoint.storage.type | string | `"pvc"` | Storage backend type: pvc, s3, or oci |
| dynamo-operator.checkpoint.storage.pvc.pvcName | string | `"chrek-pvc"` | Name of the PVC created by the chrek chart | | dynamo-operator.checkpoint.storage.pvc.pvcName | string | `"snapshot-pvc"` | Name of the PVC created by the snapshot chart |
| dynamo-operator.checkpoint.storage.pvc.basePath | string | `"/checkpoints"` | Base path within the PVC for storing checkpoints | | dynamo-operator.checkpoint.storage.pvc.basePath | string | `"/checkpoints"` | Base path within the PVC for storing checkpoints |
| dynamo-operator.checkpoint.storage.s3.uri | string | `""` | S3 URI in format: s3://[endpoint/]bucket/prefix | | dynamo-operator.checkpoint.storage.s3.uri | string | `""` | S3 URI in format: s3://[endpoint/]bucket/prefix |
| dynamo-operator.checkpoint.storage.s3.credentialsSecretRef | string | `""` | Reference to a secret containing AWS credentials | | dynamo-operator.checkpoint.storage.s3.credentialsSecretRef | string | `""` | Reference to a secret containing AWS credentials |
......
...@@ -141,7 +141,7 @@ data: ...@@ -141,7 +141,7 @@ data:
{{- end }} {{- end }}
{{- if or (eq (.Values.checkpoint.storage.type | toString) "pvc") (not .Values.checkpoint.storage.type) }} {{- if or (eq (.Values.checkpoint.storage.type | toString) "pvc") (not .Values.checkpoint.storage.type) }}
pvc: pvc:
pvcName: {{ (.Values.checkpoint.storage.pvc.pvcName | default "chrek-pvc") | quote }} pvcName: {{ (.Values.checkpoint.storage.pvc.pvcName | default "snapshot-pvc") | quote }}
basePath: {{ (.Values.checkpoint.storage.pvc.basePath | default "/checkpoints") | quote }} basePath: {{ (.Values.checkpoint.storage.pvc.basePath | default "/checkpoints") | quote }}
{{- end }} {{- end }}
{{- if eq .Values.checkpoint.storage.type "s3" }} {{- if eq .Values.checkpoint.storage.type "s3" }}
......
...@@ -137,7 +137,7 @@ modelExpressURL: "" ...@@ -137,7 +137,7 @@ modelExpressURL: ""
# Checkpoint configuration for fast pod restore # Checkpoint configuration for fast pod restore
# NOTE: The checkpoint infrastructure (PVC + DaemonSet) must be installed separately # NOTE: The checkpoint infrastructure (PVC + DaemonSet) must be installed separately
# using the chrek Helm chart in each namespace where checkpointing is needed. # using the snapshot Helm chart in each namespace where checkpointing is needed.
checkpoint: checkpoint:
# Enable checkpoint/restore functionality # Enable checkpoint/restore functionality
enabled: false enabled: false
...@@ -148,16 +148,16 @@ checkpoint: ...@@ -148,16 +148,16 @@ checkpoint:
# Storage configuration # Storage configuration
# These settings tell the operator where to find checkpoint storage # These settings tell the operator where to find checkpoint storage
# Must match the configuration in the chrek chart # Must match the configuration in the snapshot chart
storage: storage:
# Storage backend type: pvc, s3, or oci # Storage backend type: pvc, s3, or oci
type: pvc type: pvc
# PVC configuration (used when type=pvc) # PVC configuration (used when type=pvc)
pvc: pvc:
# Name of the PVC created by the chrek chart # Name of the PVC created by the snapshot chart
# Must match the PVC name in the chrek chart # Must match the PVC name in the snapshot chart
pvcName: "chrek-pvc" pvcName: "snapshot-pvc"
# Base path within the PVC for storing checkpoints # Base path within the PVC for storing checkpoints
basePath: "/checkpoints" basePath: "/checkpoints"
......
...@@ -222,7 +222,7 @@ dynamo-operator: ...@@ -222,7 +222,7 @@ dynamo-operator:
# Checkpoint configuration for fast pod restore using CRIU/cuda-checkpoint # Checkpoint configuration for fast pod restore using CRIU/cuda-checkpoint
# NOTE: The checkpoint infrastructure (PVC + DaemonSet) must be installed separately # NOTE: The checkpoint infrastructure (PVC + DaemonSet) must be installed separately
# using the chrek Helm chart in each namespace where checkpointing is needed. # using the snapshot Helm chart in each namespace where checkpointing is needed.
checkpoint: checkpoint:
# -- Whether to enable checkpoint/restore functionality # -- Whether to enable checkpoint/restore functionality
enabled: false enabled: false
...@@ -232,15 +232,15 @@ dynamo-operator: ...@@ -232,15 +232,15 @@ dynamo-operator:
# Storage configuration # Storage configuration
# These settings tell the operator where to find checkpoint storage # These settings tell the operator where to find checkpoint storage
# Must match the configuration in the chrek chart # Must match the configuration in the snapshot chart
storage: storage:
# -- Storage backend type: pvc, s3, or oci # -- Storage backend type: pvc, s3, or oci
type: pvc type: pvc
# PVC storage configuration (used when type=pvc) # PVC storage configuration (used when type=pvc)
pvc: pvc:
# -- Name of the PVC created by the chrek chart # -- Name of the PVC created by the snapshot chart
pvcName: "chrek-pvc" pvcName: "snapshot-pvc"
# -- Base path within the PVC for storing checkpoints # -- Base path within the PVC for storing checkpoints
basePath: "/checkpoints" basePath: "/checkpoints"
......
...@@ -13,7 +13,7 @@ ...@@ -13,7 +13,7 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
apiVersion: v2 apiVersion: v2
name: chrek name: snapshot
description: Checkpoint/Restore infrastructure for Dynamo (PVC + DaemonSet + CRIU Agent) description: Checkpoint/Restore infrastructure for Dynamo (PVC + DaemonSet + CRIU Agent)
type: application type: application
version: 1.0.0 version: 1.0.0
......
# Chrek Helm Chart # Dynamo Snapshot Helm Chart
> ⚠️ **Experimental Feature**: ChReK is currently in **beta/preview**. The DaemonSet runs in privileged mode to perform CRIU operations. See [Prerequisites](#prerequisites) for security considerations. > ⚠️ **Experimental Feature**: Dynamo Snapshot is currently in **beta/preview**. The DaemonSet runs in privileged mode to perform CRIU operations. See [Prerequisites](#prerequisites) for security considerations.
This Helm chart deploys the checkpoint/restore infrastructure for NVIDIA Dynamo, including: This Helm chart deploys the checkpoint/restore infrastructure for NVIDIA Dynamo, including:
- Persistent Volume Claim (PVC) for checkpoint storage - Persistent Volume Claim (PVC) for checkpoint storage
...@@ -14,18 +14,18 @@ This Helm chart deploys the checkpoint/restore infrastructure for NVIDIA Dynamo, ...@@ -14,18 +14,18 @@ This Helm chart deploys the checkpoint/restore infrastructure for NVIDIA Dynamo,
## Prerequisites ## Prerequisites
⚠️ **Security Warning**: The ChReK DaemonSet runs in **privileged mode** with `hostPID`, `hostIPC`, and `hostNetwork` to perform CRIU checkpoint/restore operations. Workload pods do not need privileged mode. Only deploy in environments where a privileged DaemonSet is acceptable. ⚠️ **Security Warning**: The Dynamo Snapshot DaemonSet runs in **privileged mode** with `hostPID`, `hostIPC`, and `hostNetwork` to perform CRIU checkpoint/restore operations. Workload pods do not need privileged mode. Only deploy in environments where a privileged DaemonSet is acceptable.
- Kubernetes 1.21+ - Kubernetes 1.21+
- GPU nodes with NVIDIA runtime (`nvidia` runtime class) - GPU nodes with NVIDIA runtime (`nvidia` runtime class)
- containerd runtime (for container inspection; CRIU is bundled in ChReK images) - containerd runtime (for container inspection; CRIU is bundled in Dynamo Snapshot images)
- NVIDIA Dynamo operator installed (cluster-wide or namespace-scoped) - NVIDIA Dynamo operator installed (cluster-wide or namespace-scoped)
- RWX (ReadWriteMany) storage class for multi-node deployments - RWX (ReadWriteMany) storage class for multi-node deployments
- **Security clearance for privileged DaemonSet** (the ChReK agent runs privileged with hostPID/hostIPC/hostNetwork) - **Security clearance for privileged DaemonSet** (the Dynamo Snapshot agent runs privileged with hostPID/hostIPC/hostNetwork)
## Installation ## Installation
> **Note:** The ChReK Helm chart is not yet published to a public Helm repository. For now, you must build and deploy from source. > **Note:** The Dynamo Snapshot Helm chart is not yet published to a public Helm repository. For now, you must build and deploy from source.
### Building from Source ### Building from Source
...@@ -35,17 +35,17 @@ export NAMESPACE=my-team # Your target namespace ...@@ -35,17 +35,17 @@ export NAMESPACE=my-team # Your target namespace
export DOCKER_SERVER=your-registry.com/ # Your container registry export DOCKER_SERVER=your-registry.com/ # Your container registry
export IMAGE_TAG=latest export IMAGE_TAG=latest
# Build ChReK agent image # Build Dynamo Snapshot agent image
cd deploy/chrek cd deploy/snapshot
docker build --target agent -t $DOCKER_SERVER/chrek-agent:$IMAGE_TAG . docker build --target agent -t $DOCKER_SERVER/snapshot-agent:$IMAGE_TAG .
docker push $DOCKER_SERVER/chrek-agent:$IMAGE_TAG docker push $DOCKER_SERVER/snapshot-agent:$IMAGE_TAG
cd - cd -
# Install ChReK chart with custom image # Install Dynamo Snapshot chart with custom image
helm install chrek ./deploy/helm/charts/chrek/ \ helm install snapshot ./deploy/helm/charts/snapshot/ \
--namespace ${NAMESPACE} \ --namespace ${NAMESPACE} \
--create-namespace \ --create-namespace \
--set daemonset.image.repository=${DOCKER_SERVER}/chrek-agent \ --set daemonset.image.repository=${DOCKER_SERVER}/snapshot-agent \
--set daemonset.image.tag=${IMAGE_TAG} \ --set daemonset.image.tag=${IMAGE_TAG} \
--set daemonset.imagePullSecrets[0].name=your-registry-secret --set daemonset.imagePullSecrets[0].name=your-registry-secret
``` ```
...@@ -60,10 +60,11 @@ See `values.yaml` for all configuration options. ...@@ -60,10 +60,11 @@ See `values.yaml` for all configuration options.
|-----------|-------------|---------| |-----------|-------------|---------|
| `storage.type` | Storage type: `pvc` (the only type currently supported; `s3` and `oci` are planned) | `pvc` | | `storage.type` | Storage type: `pvc` (the only type currently supported; `s3` and `oci` are planned) | `pvc` |
| `storage.pvc.create` | Create a new PVC | `true` | | `storage.pvc.create` | Create a new PVC | `true` |
| `storage.pvc.name` | PVC name (must match operator config) | `chrek-pvc` | | `storage.pvc.name` | PVC name (must match operator config) | `snapshot-pvc` |
| `storage.pvc.size` | PVC size | `100Gi` | | `storage.pvc.size` | PVC size | `100Gi` |
| `storage.pvc.storageClass` | Storage class name | `""` (default) | | `storage.pvc.storageClass` | Storage class name | `""` (default) |
| `daemonset.image.repository` | DaemonSet image repository | `nvcr.io/nvidian/dynamo-dev/chrek-agent` | | `daemonset.image.repository` | DaemonSet image repository | `nvcr.io/nvidian/dynamo-dev/snapshot-agent` |
| `daemonset.snapshotLogLevel` | Snapshot agent and nsrestore log level (`trace`, `debug`, `info`, `warn`, `error`) | `info` |
| `daemonset.nodeSelector` | Node selector for GPU nodes | `nvidia.com/gpu.present: "true"` | | `daemonset.nodeSelector` | Node selector for GPU nodes | `nvidia.com/gpu.present: "true"` |
| `config.checkpoint.criu.ghostLimit` | CRIU ghost file size limit in bytes | `536870912` (512 MiB) | | `config.checkpoint.criu.ghostLimit` | CRIU ghost file size limit in bytes | `536870912` (512 MiB) |
| `config.checkpoint.criu.logLevel` | CRIU logging verbosity (0-4) | `4` | | `config.checkpoint.criu.logLevel` | CRIU logging verbosity (0-4) | `4` |
...@@ -96,10 +97,10 @@ To enable checkpointing in multiple namespaces, install this chart in each names ...@@ -96,10 +97,10 @@ To enable checkpointing in multiple namespaces, install this chart in each names
```bash ```bash
# Namespace A # Namespace A
helm install chrek nvidia/chrek -n team-a helm install snapshot nvidia/snapshot -n team-a
# Namespace B # Namespace B
helm install chrek nvidia/chrek -n team-b helm install snapshot nvidia/snapshot -n team-b
``` ```
Each namespace will have its own isolated checkpoint storage. Each namespace will have its own isolated checkpoint storage.
...@@ -108,25 +109,25 @@ Each namespace will have its own isolated checkpoint storage. ...@@ -108,25 +109,25 @@ Each namespace will have its own isolated checkpoint storage.
```bash ```bash
# Check PVC # Check PVC
kubectl get pvc chrek-pvc -n my-team kubectl get pvc snapshot-pvc -n my-team
# Check DaemonSet # Check DaemonSet
kubectl get daemonset -n my-team kubectl get daemonset -n my-team
# Check DaemonSet pods are running # Check DaemonSet pods are running
kubectl get pods -n my-team -l app.kubernetes.io/name=chrek kubectl get pods -n my-team -l app.kubernetes.io/name=snapshot
``` ```
## Uninstallation ## Uninstallation
```bash ```bash
helm uninstall chrek -n my-team helm uninstall snapshot -n my-team
``` ```
**Note:** This will NOT delete the PVC by default. To delete the PVC: **Note:** This will NOT delete the PVC by default. To delete the PVC:
```bash ```bash
kubectl delete pvc chrek-pvc -n my-team kubectl delete pvc snapshot-pvc -n my-team
``` ```
## Troubleshooting ## Troubleshooting
...@@ -151,7 +152,7 @@ kubectl label node <node-name> nvidia.com/gpu.present=true ...@@ -151,7 +152,7 @@ kubectl label node <node-name> nvidia.com/gpu.present=true
Check DaemonSet logs: Check DaemonSet logs:
```bash ```bash
kubectl logs -n my-team -l app.kubernetes.io/name=chrek kubectl logs -n my-team -l app.kubernetes.io/name=snapshot
``` ```
### PVC not mounting ### PVC not mounting
...@@ -159,15 +160,15 @@ kubectl logs -n my-team -l app.kubernetes.io/name=chrek ...@@ -159,15 +160,15 @@ kubectl logs -n my-team -l app.kubernetes.io/name=chrek
Check PVC status and events: Check PVC status and events:
```bash ```bash
kubectl describe pvc chrek-pvc -n my-team kubectl describe pvc snapshot-pvc -n my-team
``` ```
Ensure your storage class supports `ReadWriteMany` access mode for multi-node deployments. Ensure your storage class supports `ReadWriteMany` access mode for multi-node deployments.
## Related Documentation ## Related Documentation
- [ChReK Overview](../../../../docs/kubernetes/chrek/README.md) - ChReK architecture and use cases - [Dynamo Snapshot Overview](../../../../docs/kubernetes/snapshot/README.md) - Dynamo Snapshot architecture and use cases
- [ChReK with Dynamo Platform](../../../../docs/kubernetes/chrek/dynamo.md) - Integration guide - [Dynamo Snapshot with Dynamo Platform](../../../../docs/kubernetes/snapshot/dynamo.md) - Integration guide
## License ## License
......
...@@ -15,14 +15,14 @@ ...@@ -15,14 +15,14 @@
{{/* {{/*
Expand the name of the chart. Expand the name of the chart.
*/}} */}}
{{- define "chrek.name" -}} {{- define "snapshot.name" -}}
{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} {{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }}
{{- end }} {{- end }}
{{/* {{/*
Create a default fully qualified app name. Create a default fully qualified app name.
*/}} */}}
{{- define "chrek.fullname" -}} {{- define "snapshot.fullname" -}}
{{- if .Values.fullnameOverride }} {{- if .Values.fullnameOverride }}
{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }} {{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }}
{{- else }} {{- else }}
...@@ -38,16 +38,16 @@ Create a default fully qualified app name. ...@@ -38,16 +38,16 @@ Create a default fully qualified app name.
{{/* {{/*
Create chart name and version as used by the chart label. Create chart name and version as used by the chart label.
*/}} */}}
{{- define "chrek.chart" -}} {{- define "snapshot.chart" -}}
{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} {{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }}
{{- end }} {{- end }}
{{/* {{/*
Common labels Common labels
*/}} */}}
{{- define "chrek.labels" -}} {{- define "snapshot.labels" -}}
helm.sh/chart: {{ include "chrek.chart" . }} helm.sh/chart: {{ include "snapshot.chart" . }}
{{ include "chrek.selectorLabels" . }} {{ include "snapshot.selectorLabels" . }}
{{- if .Chart.AppVersion }} {{- if .Chart.AppVersion }}
app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} app.kubernetes.io/version: {{ .Chart.AppVersion | quote }}
{{- end }} {{- end }}
...@@ -58,17 +58,17 @@ app.kubernetes.io/component: checkpoint-agent ...@@ -58,17 +58,17 @@ app.kubernetes.io/component: checkpoint-agent
{{/* {{/*
Selector labels Selector labels
*/}} */}}
{{- define "chrek.selectorLabels" -}} {{- define "snapshot.selectorLabels" -}}
app.kubernetes.io/name: {{ include "chrek.name" . }} app.kubernetes.io/name: {{ include "snapshot.name" . }}
app.kubernetes.io/instance: {{ .Release.Name }} app.kubernetes.io/instance: {{ .Release.Name }}
{{- end }} {{- end }}
{{/* {{/*
Create the name of the service account to use Create the name of the service account to use
*/}} */}}
{{- define "chrek.serviceAccountName" -}} {{- define "snapshot.serviceAccountName" -}}
{{- if .Values.serviceAccount.create }} {{- if .Values.serviceAccount.create }}
{{- default (include "chrek.fullname" . ) .Values.serviceAccount.name }} {{- default (include "snapshot.fullname" . ) .Values.serviceAccount.name }}
{{- else }} {{- else }}
{{- default "default" .Values.serviceAccount.name }} {{- default "default" .Values.serviceAccount.name }}
{{- end }} {{- end }}
......
...@@ -4,10 +4,10 @@ ...@@ -4,10 +4,10 @@
apiVersion: v1 apiVersion: v1
kind: ConfigMap kind: ConfigMap
metadata: metadata:
name: {{ include "chrek.fullname" . }}-config name: {{ include "snapshot.fullname" . }}-config
namespace: {{ .Release.Namespace }} namespace: {{ .Release.Namespace }}
labels: labels:
{{- include "chrek.labels" . | nindent 4 }} {{- include "snapshot.labels" . | nindent 4 }}
data: data:
config.yaml: | config.yaml: |
basePath: {{ .Values.storage.pvc.basePath | quote }} basePath: {{ .Values.storage.pvc.basePath | quote }}
......
...@@ -4,18 +4,18 @@ ...@@ -4,18 +4,18 @@
apiVersion: apps/v1 apiVersion: apps/v1
kind: DaemonSet kind: DaemonSet
metadata: metadata:
name: {{ include "chrek.fullname" . }}-agent name: {{ include "snapshot.fullname" . }}-agent
namespace: {{ .Release.Namespace }} namespace: {{ .Release.Namespace }}
labels: labels:
{{- include "chrek.labels" . | nindent 4 }} {{- include "snapshot.labels" . | nindent 4 }}
spec: spec:
selector: selector:
matchLabels: matchLabels:
{{- include "chrek.selectorLabels" . | nindent 6 }} {{- include "snapshot.selectorLabels" . | nindent 6 }}
template: template:
metadata: metadata:
labels: labels:
{{- include "chrek.selectorLabels" . | nindent 8 }} {{- include "snapshot.selectorLabels" . | nindent 8 }}
{{- with .Values.daemonset.podLabels }} {{- with .Values.daemonset.podLabels }}
{{- toYaml . | nindent 8 }} {{- toYaml . | nindent 8 }}
{{- end }} {{- end }}
...@@ -24,7 +24,7 @@ spec: ...@@ -24,7 +24,7 @@ spec:
{{- toYaml . | nindent 8 }} {{- toYaml . | nindent 8 }}
{{- end }} {{- end }}
spec: spec:
serviceAccountName: {{ include "chrek.serviceAccountName" . }} serviceAccountName: {{ include "snapshot.serviceAccountName" . }}
hostPID: true hostPID: true
hostIPC: true hostIPC: true
hostNetwork: true hostNetwork: true
...@@ -76,6 +76,8 @@ spec: ...@@ -76,6 +76,8 @@ spec:
valueFrom: valueFrom:
fieldRef: fieldRef:
fieldPath: spec.nodeName fieldPath: spec.nodeName
- name: SNAPSHOT_LOG_LEVEL
value: {{ .Values.daemonset.snapshotLogLevel | quote }}
{{- if .Values.rbac.namespaceRestricted }} {{- if .Values.rbac.namespaceRestricted }}
# Restrict pod watching to this namespace (namespace-scoped RBAC) # Restrict pod watching to this namespace (namespace-scoped RBAC)
- name: RESTRICTED_NAMESPACE - name: RESTRICTED_NAMESPACE
...@@ -86,7 +88,7 @@ spec: ...@@ -86,7 +88,7 @@ spec:
volumeMounts: volumeMounts:
# Mount configuration ConfigMap # Mount configuration ConfigMap
- name: config - name: config
mountPath: /etc/chrek mountPath: /etc/snapshot
readOnly: true readOnly: true
{{- if eq .Values.storage.type "pvc" }} {{- if eq .Values.storage.type "pvc" }}
# Mount the checkpoint PVC (only for PVC storage type) # Mount the checkpoint PVC (only for PVC storage type)
...@@ -131,12 +133,12 @@ spec: ...@@ -131,12 +133,12 @@ spec:
# Configuration ConfigMap # Configuration ConfigMap
- name: config - name: config
configMap: configMap:
name: {{ include "chrek.fullname" . }}-config name: {{ include "snapshot.fullname" . }}-config
{{- if .Values.seccomp.deploy }} {{- if .Values.seccomp.deploy }}
# Seccomp profile ConfigMap (used by initContainer) # Seccomp profile ConfigMap (used by initContainer)
- name: seccomp-profiles - name: seccomp-profiles
configMap: configMap:
name: {{ include "chrek.fullname" . }}-seccomp name: {{ include "snapshot.fullname" . }}-seccomp
# Host seccomp directory (for deploying the profile) # Host seccomp directory (for deploying the profile)
- name: host-seccomp - name: host-seccomp
hostPath: hostPath:
......
...@@ -8,7 +8,7 @@ metadata: ...@@ -8,7 +8,7 @@ metadata:
name: {{ .Values.storage.pvc.name }} name: {{ .Values.storage.pvc.name }}
namespace: {{ .Release.Namespace }} namespace: {{ .Release.Namespace }}
labels: labels:
app.kubernetes.io/name: {{ include "chrek.name" . }} app.kubernetes.io/name: {{ include "snapshot.name" . }}
app.kubernetes.io/instance: {{ .Release.Name }} app.kubernetes.io/instance: {{ .Release.Name }}
app.kubernetes.io/component: storage app.kubernetes.io/component: storage
spec: spec:
......
...@@ -6,10 +6,10 @@ ...@@ -6,10 +6,10 @@
apiVersion: rbac.authorization.k8s.io/v1 apiVersion: rbac.authorization.k8s.io/v1
kind: Role kind: Role
metadata: metadata:
name: {{ include "chrek.fullname" . }}-agent name: {{ include "snapshot.fullname" . }}-agent
namespace: {{ .Release.Namespace }} namespace: {{ .Release.Namespace }}
labels: labels:
{{- include "chrek.labels" . | nindent 4 }} {{- include "snapshot.labels" . | nindent 4 }}
app.kubernetes.io/component: checkpoint-agent app.kubernetes.io/component: checkpoint-agent
rules: rules:
# Watch and annotate pods in this namespace to drive checkpoint/restore lifecycle # Watch and annotate pods in this namespace to drive checkpoint/restore lifecycle
...@@ -24,9 +24,9 @@ rules: ...@@ -24,9 +24,9 @@ rules:
apiVersion: rbac.authorization.k8s.io/v1 apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole kind: ClusterRole
metadata: metadata:
name: {{ include "chrek.fullname" . }}-agent name: {{ include "snapshot.fullname" . }}-agent
labels: labels:
{{- include "chrek.labels" . | nindent 4 }} {{- include "snapshot.labels" . | nindent 4 }}
app.kubernetes.io/component: checkpoint-agent app.kubernetes.io/component: checkpoint-agent
rules: rules:
# Watch and annotate pods cluster-wide on assigned nodes # Watch and annotate pods cluster-wide on assigned nodes
......
...@@ -6,34 +6,34 @@ ...@@ -6,34 +6,34 @@
apiVersion: rbac.authorization.k8s.io/v1 apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding kind: RoleBinding
metadata: metadata:
name: {{ include "chrek.fullname" . }}-agent name: {{ include "snapshot.fullname" . }}-agent
namespace: {{ .Release.Namespace }} namespace: {{ .Release.Namespace }}
labels: labels:
{{- include "chrek.labels" . | nindent 4 }} {{- include "snapshot.labels" . | nindent 4 }}
app.kubernetes.io/component: checkpoint-agent app.kubernetes.io/component: checkpoint-agent
roleRef: roleRef:
apiGroup: rbac.authorization.k8s.io apiGroup: rbac.authorization.k8s.io
kind: Role kind: Role
name: {{ include "chrek.fullname" . }}-agent name: {{ include "snapshot.fullname" . }}-agent
subjects: subjects:
- kind: ServiceAccount - kind: ServiceAccount
name: {{ include "chrek.serviceAccountName" . }} name: {{ include "snapshot.serviceAccountName" . }}
namespace: {{ .Release.Namespace }} namespace: {{ .Release.Namespace }}
{{- else }} {{- else }}
apiVersion: rbac.authorization.k8s.io/v1 apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding kind: ClusterRoleBinding
metadata: metadata:
name: {{ include "chrek.fullname" . }}-agent name: {{ include "snapshot.fullname" . }}-agent
labels: labels:
{{- include "chrek.labels" . | nindent 4 }} {{- include "snapshot.labels" . | nindent 4 }}
app.kubernetes.io/component: checkpoint-agent app.kubernetes.io/component: checkpoint-agent
roleRef: roleRef:
apiGroup: rbac.authorization.k8s.io apiGroup: rbac.authorization.k8s.io
kind: ClusterRole kind: ClusterRole
name: {{ include "chrek.fullname" . }}-agent name: {{ include "snapshot.fullname" . }}-agent
subjects: subjects:
- kind: ServiceAccount - kind: ServiceAccount
name: {{ include "chrek.serviceAccountName" . }} name: {{ include "snapshot.serviceAccountName" . }}
namespace: {{ .Release.Namespace }} namespace: {{ .Release.Namespace }}
{{- end }} {{- end }}
{{- end }} {{- end }}
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment