Unverified Commit 6831020f authored by Schwinn Saereesitthipitak, committed by GitHub
Browse files

chore: rename chrek to Dynamo Snapshot (#7028)


Signed-off-by: Schwinn Saereesitthipitak <17022745+galletas1712@users.noreply.github.com>
parent 7dbebf3c
......@@ -108,7 +108,7 @@ deploy:
- *ci
- 'deploy/helm/**'
- 'deploy/utils/**'
- 'deploy/chrek/**'
- 'deploy/snapshot/**'
- 'tests/deploy/**'
planner:
......
......@@ -61,16 +61,16 @@ async def init_decode(
shutdown_event: asyncio.Event,
shutdown_endpoints: list,
run_deferred_handlers: Callable[[], Awaitable[None]] | None = None,
checkpoint_restore_engine: Optional[sgl.Engine] = None,
snapshot_engine: Optional[sgl.Engine] = None,
) -> None:
server_args, dynamo_args = config.server_args, config.dynamo_args
if server_args.node_rank >= 1:
os.environ["SGLANG_BLOCK_NONZERO_RANK_CHILDREN"] = "0"
# Use pre-created engine if provided (checkpoint/restore mode)
if checkpoint_restore_engine is not None:
engine = checkpoint_restore_engine
# Use pre-created engine if provided (snapshot mode)
if snapshot_engine is not None:
engine = snapshot_engine
load_time = 0.0
else:
start_time = time.time()
......@@ -151,16 +151,16 @@ async def init_prefill(
shutdown_event: asyncio.Event,
shutdown_endpoints: list,
run_deferred_handlers: Callable[[], Awaitable[None]] | None = None,
checkpoint_restore_engine: Optional[sgl.Engine] = None,
snapshot_engine: Optional[sgl.Engine] = None,
) -> None:
server_args, dynamo_args = config.server_args, config.dynamo_args
if server_args.node_rank >= 1:
os.environ["SGLANG_BLOCK_NONZERO_RANK_CHILDREN"] = "0"
# Use pre-created engine if provided (checkpoint/restore mode)
if checkpoint_restore_engine is not None:
engine = checkpoint_restore_engine
# Use pre-created engine if provided (snapshot mode)
if snapshot_engine is not None:
engine = snapshot_engine
else:
engine = sgl.Engine(server_args=server_args)
......
......@@ -12,7 +12,6 @@ from dynamo.common.constants import DisaggregationMode
from dynamo.common.utils.runtime import create_runtime
from dynamo.runtime.logging import configure_dynamo_logging
from dynamo.sglang.args import parse_args
from dynamo.sglang.checkpoint_restore import handle_checkpoint_mode
from dynamo.sglang.init_diffusion import (
init_image_diffusion,
init_llm_diffusion,
......@@ -27,6 +26,7 @@ from dynamo.sglang.init_multimodal import (
init_multimodal_worker,
)
from dynamo.sglang.shutdown import install_graceful_shutdown
from dynamo.sglang.snapshot import handle_checkpoint_mode
configure_dynamo_logging()
......@@ -41,9 +41,7 @@ async def worker():
config.server_args.load_format = setup_gms(config.server_args)
# Checkpoint mode: engine must be created BEFORE runtime (no NATS/etcd during CRIU)
should_exit, checkpoint_restore_engine = await handle_checkpoint_mode(
config.server_args
)
should_exit, snapshot_engine = await handle_checkpoint_mode(config.server_args)
if should_exit:
return
......@@ -129,7 +127,7 @@ async def worker():
shutdown_event,
shutdown_endpoints,
run_deferred_handlers,
checkpoint_restore_engine=checkpoint_restore_engine,
snapshot_engine=snapshot_engine,
)
else:
await init_prefill(
......@@ -138,7 +136,7 @@ async def worker():
shutdown_event,
shutdown_endpoints,
run_deferred_handlers,
checkpoint_restore_engine=checkpoint_restore_engine,
snapshot_engine=snapshot_engine,
)
......
......@@ -2,7 +2,7 @@
# SPDX-License-Identifier: Apache-2.0
"""
Checkpoint/restore (chrek) integration for SGLang workers.
Dynamo Snapshot integration for SGLang workers.
Handles the checkpoint job pod lifecycle:
1. Early exit if a checkpoint already exists (idempotency)
......@@ -164,7 +164,7 @@ class CheckpointConfig:
def _install_signal_handlers(self) -> None:
loop = asyncio.get_running_loop()
loop.add_signal_handler(signal.SIGUSR1, self._checkpoint_done.set)
# SIGCONT is used as the restore-complete signal. The chrek DaemonSet
# SIGCONT is used as the restore-complete signal. The snapshot DaemonSet
# watcher is the only sender, so there is no conflict with POSIX
# job-control semantics in practice.
loop.add_signal_handler(signal.SIGCONT, self._restore_done.set)
......@@ -197,7 +197,7 @@ class CheckpointConfig:
async def handle_checkpoint_mode(server_args) -> tuple[bool, Optional[sgl.Engine]]:
"""Single entry point for checkpoint/restore integration.
"""Single entry point for Dynamo Snapshot integration.
Must be called BEFORE runtime creation so the engine can be checkpointed
without active NATS/etcd connections.
......
......@@ -42,7 +42,6 @@ from dynamo.runtime.logging import configure_dynamo_logging
from dynamo.vllm.worker_factory import WorkerFactory
from .args import Config, _uses_dynamo_connector, parse_args
from .checkpoint_restore import get_checkpoint_config
from .constants import DisaggregationMode
from .handlers import DecodeWorkerHandler, PrefillWorkerHandler, get_dp_range_for_worker
from .health_check import (
......@@ -51,6 +50,7 @@ from .health_check import (
VllmPrefillHealthCheckPayload,
)
from .publisher import DYNAMO_COMPONENT_REGISTRY, StatLoggerFactory
from .snapshot import get_checkpoint_config
# Optional imports for frontend decoding support
MediaDecoder: type | None = None
......@@ -135,15 +135,15 @@ async def worker() -> None:
# CHECKPOINT MODE: Load engine BEFORE runtime creation
# This allows checkpointing GPU state before runtime connections are established
checkpoint_restore_engine = None
snapshot_engine = None
if checkpoint_cfg is not None:
logger.info("Checkpoint mode enabled (watcher-driven signals)")
# Checkpoint mode requires sleep mode — enable before engine init
config.engine_args.enable_sleep_mode = True
checkpoint_restore_engine = setup_vllm_engine(config)
engine_client = checkpoint_restore_engine[0]
snapshot_engine = setup_vllm_engine(config)
engine_client = snapshot_engine[0]
if not await checkpoint_cfg.run_lifecycle(
engine_client, CHECKPOINT_SLEEP_MODE_LEVEL
......@@ -173,7 +173,7 @@ async def worker() -> None:
config,
shutdown_event,
shutdown_endpoints,
checkpoint_restore_engine=checkpoint_restore_engine,
snapshot_engine=snapshot_engine,
)
logger.debug("multimodal worker completed")
elif config.omni:
......@@ -184,7 +184,7 @@ async def worker() -> None:
runtime,
config,
shutdown_event,
checkpoint_restore_engine=checkpoint_restore_engine,
snapshot_engine=snapshot_engine,
)
logger.debug("init_prefill completed")
else:
......@@ -192,7 +192,7 @@ async def worker() -> None:
runtime,
config,
shutdown_event,
checkpoint_restore_engine=checkpoint_restore_engine,
snapshot_engine=snapshot_engine,
)
logger.debug("init completed")
......@@ -597,7 +597,7 @@ async def init_prefill(
runtime: DistributedRuntime,
config: Config,
shutdown_event: asyncio.Event,
checkpoint_restore_engine: Optional[
snapshot_engine: Optional[
tuple[AsyncLLM, VllmConfig, Any, Any, LLMBackendMetrics]
] = None,
) -> None:
......@@ -612,14 +612,14 @@ async def init_prefill(
)
# Use pre-created engine if provided (checkpoint mode), otherwise create new
if checkpoint_restore_engine is not None:
if snapshot_engine is not None:
(
engine_client,
vllm_config,
default_sampling_params,
prometheus_temp_dir,
_component_gauges,
) = checkpoint_restore_engine
) = snapshot_engine
else:
(
engine_client,
......@@ -741,7 +741,7 @@ async def init(
runtime: DistributedRuntime,
config: Config,
shutdown_event: asyncio.Event,
checkpoint_restore_engine: Optional[
snapshot_engine: Optional[
tuple[AsyncLLM, VllmConfig, Any, Any, LLMBackendMetrics]
] = None,
) -> None:
......@@ -782,14 +782,14 @@ async def init(
)
# Use pre-created engine if provided (checkpoint mode), otherwise create new
if checkpoint_restore_engine is not None:
if snapshot_engine is not None:
(
engine_client,
vllm_config,
default_sampling_params,
prometheus_temp_dir,
component_gauges,
) = checkpoint_restore_engine
) = snapshot_engine
# Factory is created after unpack so component_gauges is available
factory = StatLoggerFactory(
endpoint=generate_endpoint,
......
......@@ -2,7 +2,7 @@
# SPDX-License-Identifier: Apache-2.0
"""
Checkpoint/restore (chrek) integration for vLLM workers.
Dynamo Snapshot integration for vLLM workers.
Handles the checkpoint job pod lifecycle:
1. Early exit if a checkpoint already exists (idempotency)
......@@ -113,7 +113,7 @@ class CheckpointConfig:
def _install_signal_handlers(self) -> None:
loop = asyncio.get_running_loop()
loop.add_signal_handler(signal.SIGUSR1, self._checkpoint_done.set)
# SIGCONT is used as the restore-complete signal. The chrek DaemonSet
# SIGCONT is used as the restore-complete signal. The snapshot DaemonSet
# watcher is the only sender, so there is no conflict with POSIX
# job-control semantics in practice.
loop.add_signal_handler(signal.SIGCONT, self._restore_done.set)
......
......@@ -103,14 +103,12 @@ class TestCreate:
factory._create_multimodal_worker.assert_called_once() # type: ignore[union-attr]
@pytest.mark.asyncio
async def test_passes_checkpoint_restore_engine(
self, factory: WorkerFactory
) -> None:
async def test_passes_snapshot_engine(self, factory: WorkerFactory) -> None:
config = _make_config(multimodal_worker=True)
runtime = Mock()
shutdown_event = asyncio.Event()
shutdown_endpoints: list = []
checkpoint_restore_engine: EngineSetupResult = (
snapshot_engine: EngineSetupResult = (
Mock(),
Mock(),
Mock(),
......@@ -123,7 +121,7 @@ class TestCreate:
config,
shutdown_event,
shutdown_endpoints,
checkpoint_restore_engine=checkpoint_restore_engine,
snapshot_engine=snapshot_engine,
)
factory._create_multimodal_worker.assert_called_once_with( # type: ignore[union-attr]
......@@ -131,7 +129,7 @@ class TestCreate:
config,
shutdown_event,
shutdown_endpoints,
checkpoint_restore_engine=checkpoint_restore_engine,
snapshot_engine=snapshot_engine,
)
@pytest.mark.asyncio
......
......@@ -58,7 +58,7 @@ class WorkerFactory:
config: Config,
shutdown_event: asyncio.Event,
shutdown_endpoints: list,
checkpoint_restore_engine: Optional[EngineSetupResult] = None,
snapshot_engine: Optional[EngineSetupResult] = None,
) -> None:
"""Create the appropriate multimodal worker based on config flags."""
......@@ -72,7 +72,7 @@ class WorkerFactory:
config,
shutdown_event,
shutdown_endpoints,
checkpoint_restore_engine=checkpoint_restore_engine,
snapshot_engine=snapshot_engine,
)
else:
raise ValueError(
......@@ -85,7 +85,7 @@ class WorkerFactory:
config: Config,
shutdown_event: asyncio.Event,
shutdown_endpoints: list, # mutated in place
checkpoint_restore_engine: Optional[EngineSetupResult] = None,
snapshot_engine: Optional[EngineSetupResult] = None,
) -> None:
"""
Initialize multimodal worker component.
......@@ -121,14 +121,14 @@ class WorkerFactory:
[load_lora_endpoint, unload_lora_endpoint, list_loras_endpoint]
)
# Use pre-created engine if provided (checkpoint mode), otherwise create new
if checkpoint_restore_engine is not None:
if snapshot_engine is not None:
(
engine_client,
vllm_config,
_default_sampling_params,
prometheus_temp_dir,
_component_gauges,
) = checkpoint_restore_engine
) = snapshot_engine
else:
(
engine_client,
......
......@@ -163,7 +163,7 @@ The chart includes built-in validation to prevent all operator conflicts:
| dynamo-operator.checkpoint.enabled | bool | `false` | Whether to enable checkpoint/restore functionality |
| dynamo-operator.checkpoint.readyForCheckpointFilePath | string | `"/tmp/ready-for-checkpoint"` | Path written by worker when model is loaded and ready for checkpointing |
| dynamo-operator.checkpoint.storage.type | string | `"pvc"` | Storage backend type: pvc, s3, or oci |
| dynamo-operator.checkpoint.storage.pvc.pvcName | string | `"chrek-pvc"` | Name of the PVC created by the chrek chart |
| dynamo-operator.checkpoint.storage.pvc.pvcName | string | `"snapshot-pvc"` | Name of the PVC created by the snapshot chart |
| dynamo-operator.checkpoint.storage.pvc.basePath | string | `"/checkpoints"` | Base path within the PVC for storing checkpoints |
| dynamo-operator.checkpoint.storage.s3.uri | string | `""` | S3 URI in format: s3://[endpoint/]bucket/prefix |
| dynamo-operator.checkpoint.storage.s3.credentialsSecretRef | string | `""` | Reference to a secret containing AWS credentials |
......
......@@ -141,7 +141,7 @@ data:
{{- end }}
{{- if or (eq (.Values.checkpoint.storage.type | toString) "pvc") (not .Values.checkpoint.storage.type) }}
pvc:
pvcName: {{ (.Values.checkpoint.storage.pvc.pvcName | default "chrek-pvc") | quote }}
pvcName: {{ (.Values.checkpoint.storage.pvc.pvcName | default "snapshot-pvc") | quote }}
basePath: {{ (.Values.checkpoint.storage.pvc.basePath | default "/checkpoints") | quote }}
{{- end }}
{{- if eq .Values.checkpoint.storage.type "s3" }}
......
......@@ -137,7 +137,7 @@ modelExpressURL: ""
# Checkpoint configuration for fast pod restore
# NOTE: The checkpoint infrastructure (PVC + DaemonSet) must be installed separately
# using the chrek Helm chart in each namespace where checkpointing is needed.
# using the snapshot Helm chart in each namespace where checkpointing is needed.
checkpoint:
# Enable checkpoint/restore functionality
enabled: false
......@@ -148,16 +148,16 @@ checkpoint:
# Storage configuration
# These settings tell the operator where to find checkpoint storage
# Must match the configuration in the chrek chart
# Must match the configuration in the snapshot chart
storage:
# Storage backend type: pvc, s3, or oci
type: pvc
# PVC configuration (used when type=pvc)
pvc:
# Name of the PVC created by the chrek chart
# Must match the PVC name in the chrek chart
pvcName: "chrek-pvc"
# Name of the PVC created by the snapshot chart
# Must match the PVC name in the snapshot chart
pvcName: "snapshot-pvc"
# Base path within the PVC for storing checkpoints
basePath: "/checkpoints"
......
......@@ -222,7 +222,7 @@ dynamo-operator:
# Checkpoint configuration for fast pod restore using CRIU/cuda-checkpoint
# NOTE: The checkpoint infrastructure (PVC + DaemonSet) must be installed separately
# using the chrek Helm chart in each namespace where checkpointing is needed.
# using the snapshot Helm chart in each namespace where checkpointing is needed.
checkpoint:
# -- Whether to enable checkpoint/restore functionality
enabled: false
......@@ -232,15 +232,15 @@ dynamo-operator:
# Storage configuration
# These settings tell the operator where to find checkpoint storage
# Must match the configuration in the chrek chart
# Must match the configuration in the snapshot chart
storage:
# -- Storage backend type: pvc, s3, or oci
type: pvc
# PVC storage configuration (used when type=pvc)
pvc:
# -- Name of the PVC created by the chrek chart
pvcName: "chrek-pvc"
# -- Name of the PVC created by the snapshot chart
pvcName: "snapshot-pvc"
# -- Base path within the PVC for storing checkpoints
basePath: "/checkpoints"
......
......@@ -13,7 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
apiVersion: v2
name: chrek
name: snapshot
description: Checkpoint/Restore infrastructure for Dynamo (PVC + DaemonSet + CRIU Agent)
type: application
version: 1.0.0
......
# Chrek Helm Chart
# Dynamo Snapshot Helm Chart
> ⚠️ **Experimental Feature**: ChReK is currently in **beta/preview**. The DaemonSet runs in privileged mode to perform CRIU operations. See [Prerequisites](#prerequisites) for security considerations.
> ⚠️ **Experimental Feature**: Dynamo Snapshot is currently in **beta/preview**. The DaemonSet runs in privileged mode to perform CRIU operations. See [Prerequisites](#prerequisites) for security considerations.
This Helm chart deploys the checkpoint/restore infrastructure for NVIDIA Dynamo, including:
- Persistent Volume Claim (PVC) for checkpoint storage
......@@ -14,18 +14,18 @@ This Helm chart deploys the checkpoint/restore infrastructure for NVIDIA Dynamo,
## Prerequisites
⚠️ **Security Warning**: The ChReK DaemonSet runs in **privileged mode** with `hostPID`, `hostIPC`, and `hostNetwork` to perform CRIU checkpoint/restore operations. Workload pods do not need privileged mode. Only deploy in environments where a privileged DaemonSet is acceptable.
⚠️ **Security Warning**: The Dynamo Snapshot DaemonSet runs in **privileged mode** with `hostPID`, `hostIPC`, and `hostNetwork` to perform CRIU checkpoint/restore operations. Workload pods do not need privileged mode. Only deploy in environments where a privileged DaemonSet is acceptable.
- Kubernetes 1.21+
- GPU nodes with NVIDIA runtime (`nvidia` runtime class)
- containerd runtime (for container inspection; CRIU is bundled in ChReK images)
- containerd runtime (for container inspection; CRIU is bundled in Dynamo Snapshot images)
- NVIDIA Dynamo operator installed (cluster-wide or namespace-scoped)
- RWX (ReadWriteMany) storage class for multi-node deployments
- **Security clearance for privileged DaemonSet** (the ChReK agent runs privileged with hostPID/hostIPC/hostNetwork)
- **Security clearance for privileged DaemonSet** (the Dynamo Snapshot agent runs privileged with hostPID/hostIPC/hostNetwork)
## Installation
> **Note:** The ChReK Helm chart is not yet published to a public Helm repository. For now, you must build and deploy from source.
> **Note:** The Dynamo Snapshot Helm chart is not yet published to a public Helm repository. For now, you must build and deploy from source.
### Building from Source
......@@ -35,17 +35,17 @@ export NAMESPACE=my-team # Your target namespace
export DOCKER_SERVER=your-registry.com/ # Your container registry
export IMAGE_TAG=latest
# Build ChReK agent image
cd deploy/chrek
docker build --target agent -t $DOCKER_SERVER/chrek-agent:$IMAGE_TAG .
docker push $DOCKER_SERVER/chrek-agent:$IMAGE_TAG
# Build Dynamo Snapshot agent image
cd deploy/snapshot
docker build --target agent -t $DOCKER_SERVER/snapshot-agent:$IMAGE_TAG .
docker push $DOCKER_SERVER/snapshot-agent:$IMAGE_TAG
cd -
# Install ChReK chart with custom image
helm install chrek ./deploy/helm/charts/chrek/ \
# Install Dynamo Snapshot chart with custom image
helm install snapshot ./deploy/helm/charts/snapshot/ \
--namespace ${NAMESPACE} \
--create-namespace \
--set daemonset.image.repository=${DOCKER_SERVER}/chrek-agent \
--set daemonset.image.repository=${DOCKER_SERVER}/snapshot-agent \
--set daemonset.image.tag=${IMAGE_TAG} \
--set daemonset.imagePullSecrets[0].name=your-registry-secret
```
......@@ -60,10 +60,11 @@ See `values.yaml` for all configuration options.
|-----------|-------------|---------|
| `storage.type` | Storage type: only `pvc` is currently supported (`s3` and `oci` are planned) | `pvc` |
| `storage.pvc.create` | Create a new PVC | `true` |
| `storage.pvc.name` | PVC name (must match operator config) | `chrek-pvc` |
| `storage.pvc.name` | PVC name (must match operator config) | `snapshot-pvc` |
| `storage.pvc.size` | PVC size | `100Gi` |
| `storage.pvc.storageClass` | Storage class name | `""` (default) |
| `daemonset.image.repository` | DaemonSet image repository | `nvcr.io/nvidian/dynamo-dev/chrek-agent` |
| `daemonset.image.repository` | DaemonSet image repository | `nvcr.io/nvidian/dynamo-dev/snapshot-agent` |
| `daemonset.snapshotLogLevel` | Snapshot agent and nsrestore log level (`trace`, `debug`, `info`, `warn`, `error`) | `info` |
| `daemonset.nodeSelector` | Node selector for GPU nodes | `nvidia.com/gpu.present: "true"` |
| `config.checkpoint.criu.ghostLimit` | CRIU ghost file size limit in bytes | `536870912` (512MB) |
| `config.checkpoint.criu.logLevel` | CRIU logging verbosity (0-4) | `4` |
......@@ -96,10 +97,10 @@ To enable checkpointing in multiple namespaces, install this chart in each names
```bash
# Namespace A
helm install chrek nvidia/chrek -n team-a
helm install snapshot nvidia/snapshot -n team-a
# Namespace B
helm install chrek nvidia/chrek -n team-b
helm install snapshot nvidia/snapshot -n team-b
```
Each namespace will have its own isolated checkpoint storage.
......@@ -108,25 +109,25 @@ Each namespace will have its own isolated checkpoint storage.
```bash
# Check PVC
kubectl get pvc chrek-pvc -n my-team
kubectl get pvc snapshot-pvc -n my-team
# Check DaemonSet
kubectl get daemonset -n my-team
# Check DaemonSet pods are running
kubectl get pods -n my-team -l app.kubernetes.io/name=chrek
kubectl get pods -n my-team -l app.kubernetes.io/name=snapshot
```
## Uninstallation
```bash
helm uninstall chrek -n my-team
helm uninstall snapshot -n my-team
```
**Note:** This will NOT delete the PVC by default. To delete the PVC:
```bash
kubectl delete pvc chrek-pvc -n my-team
kubectl delete pvc snapshot-pvc -n my-team
```
## Troubleshooting
......@@ -151,7 +152,7 @@ kubectl label node <node-name> nvidia.com/gpu.present=true
Check DaemonSet logs:
```bash
kubectl logs -n my-team -l app.kubernetes.io/name=chrek
kubectl logs -n my-team -l app.kubernetes.io/name=snapshot
```
### PVC not mounting
......@@ -159,15 +160,15 @@ kubectl logs -n my-team -l app.kubernetes.io/name=chrek
Check PVC status and events:
```bash
kubectl describe pvc chrek-pvc -n my-team
kubectl describe pvc snapshot-pvc -n my-team
```
Ensure your storage class supports `ReadWriteMany` access mode for multi-node deployments.
## Related Documentation
- [ChReK Overview](../../../../docs/kubernetes/chrek/README.md) - ChReK architecture and use cases
- [ChReK with Dynamo Platform](../../../../docs/kubernetes/chrek/dynamo.md) - Integration guide
- [Dynamo Snapshot Overview](../../../../docs/kubernetes/snapshot/README.md) - Dynamo Snapshot architecture and use cases
- [Dynamo Snapshot with Dynamo Platform](../../../../docs/kubernetes/snapshot/dynamo.md) - Integration guide
## License
......
......@@ -15,14 +15,14 @@
{{/*
Expand the name of the chart.
*/}}
{{- define "chrek.name" -}}
{{- define "snapshot.name" -}}
{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }}
{{- end }}
{{/*
Create a default fully qualified app name.
*/}}
{{- define "chrek.fullname" -}}
{{- define "snapshot.fullname" -}}
{{- if .Values.fullnameOverride }}
{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }}
{{- else }}
......@@ -38,16 +38,16 @@ Create a default fully qualified app name.
{{/*
Create chart name and version as used by the chart label.
*/}}
{{- define "chrek.chart" -}}
{{- define "snapshot.chart" -}}
{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }}
{{- end }}
{{/*
Common labels
*/}}
{{- define "chrek.labels" -}}
helm.sh/chart: {{ include "chrek.chart" . }}
{{ include "chrek.selectorLabels" . }}
{{- define "snapshot.labels" -}}
helm.sh/chart: {{ include "snapshot.chart" . }}
{{ include "snapshot.selectorLabels" . }}
{{- if .Chart.AppVersion }}
app.kubernetes.io/version: {{ .Chart.AppVersion | quote }}
{{- end }}
......@@ -58,17 +58,17 @@ app.kubernetes.io/component: checkpoint-agent
{{/*
Selector labels
*/}}
{{- define "chrek.selectorLabels" -}}
app.kubernetes.io/name: {{ include "chrek.name" . }}
{{- define "snapshot.selectorLabels" -}}
app.kubernetes.io/name: {{ include "snapshot.name" . }}
app.kubernetes.io/instance: {{ .Release.Name }}
{{- end }}
{{/*
Create the name of the service account to use
*/}}
{{- define "chrek.serviceAccountName" -}}
{{- define "snapshot.serviceAccountName" -}}
{{- if .Values.serviceAccount.create }}
{{- default (include "chrek.fullname" . ) .Values.serviceAccount.name }}
{{- default (include "snapshot.fullname" . ) .Values.serviceAccount.name }}
{{- else }}
{{- default "default" .Values.serviceAccount.name }}
{{- end }}
......
......@@ -4,10 +4,10 @@
apiVersion: v1
kind: ConfigMap
metadata:
name: {{ include "chrek.fullname" . }}-config
name: {{ include "snapshot.fullname" . }}-config
namespace: {{ .Release.Namespace }}
labels:
{{- include "chrek.labels" . | nindent 4 }}
{{- include "snapshot.labels" . | nindent 4 }}
data:
config.yaml: |
basePath: {{ .Values.storage.pvc.basePath | quote }}
......
......@@ -4,18 +4,18 @@
apiVersion: apps/v1
kind: DaemonSet
metadata:
name: {{ include "chrek.fullname" . }}-agent
name: {{ include "snapshot.fullname" . }}-agent
namespace: {{ .Release.Namespace }}
labels:
{{- include "chrek.labels" . | nindent 4 }}
{{- include "snapshot.labels" . | nindent 4 }}
spec:
selector:
matchLabels:
{{- include "chrek.selectorLabels" . | nindent 6 }}
{{- include "snapshot.selectorLabels" . | nindent 6 }}
template:
metadata:
labels:
{{- include "chrek.selectorLabels" . | nindent 8 }}
{{- include "snapshot.selectorLabels" . | nindent 8 }}
{{- with .Values.daemonset.podLabels }}
{{- toYaml . | nindent 8 }}
{{- end }}
......@@ -24,7 +24,7 @@ spec:
{{- toYaml . | nindent 8 }}
{{- end }}
spec:
serviceAccountName: {{ include "chrek.serviceAccountName" . }}
serviceAccountName: {{ include "snapshot.serviceAccountName" . }}
hostPID: true
hostIPC: true
hostNetwork: true
......@@ -76,6 +76,8 @@ spec:
valueFrom:
fieldRef:
fieldPath: spec.nodeName
- name: SNAPSHOT_LOG_LEVEL
value: {{ .Values.daemonset.snapshotLogLevel | quote }}
{{- if .Values.rbac.namespaceRestricted }}
# Restrict pod watching to this namespace (namespace-scoped RBAC)
- name: RESTRICTED_NAMESPACE
......@@ -86,7 +88,7 @@ spec:
volumeMounts:
# Mount configuration ConfigMap
- name: config
mountPath: /etc/chrek
mountPath: /etc/snapshot
readOnly: true
{{- if eq .Values.storage.type "pvc" }}
# Mount the checkpoint PVC (only for PVC storage type)
......@@ -131,12 +133,12 @@ spec:
# Configuration ConfigMap
- name: config
configMap:
name: {{ include "chrek.fullname" . }}-config
name: {{ include "snapshot.fullname" . }}-config
{{- if .Values.seccomp.deploy }}
# Seccomp profile ConfigMap (used by initContainer)
- name: seccomp-profiles
configMap:
name: {{ include "chrek.fullname" . }}-seccomp
name: {{ include "snapshot.fullname" . }}-seccomp
# Host seccomp directory (for deploying the profile)
- name: host-seccomp
hostPath:
......
......@@ -8,7 +8,7 @@ metadata:
name: {{ .Values.storage.pvc.name }}
namespace: {{ .Release.Namespace }}
labels:
app.kubernetes.io/name: {{ include "chrek.name" . }}
app.kubernetes.io/name: {{ include "snapshot.name" . }}
app.kubernetes.io/instance: {{ .Release.Name }}
app.kubernetes.io/component: storage
spec:
......
......@@ -6,10 +6,10 @@
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
name: {{ include "chrek.fullname" . }}-agent
name: {{ include "snapshot.fullname" . }}-agent
namespace: {{ .Release.Namespace }}
labels:
{{- include "chrek.labels" . | nindent 4 }}
{{- include "snapshot.labels" . | nindent 4 }}
app.kubernetes.io/component: checkpoint-agent
rules:
# Watch and annotate pods in this namespace to drive checkpoint/restore lifecycle
......@@ -24,9 +24,9 @@ rules:
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: {{ include "chrek.fullname" . }}-agent
name: {{ include "snapshot.fullname" . }}-agent
labels:
{{- include "chrek.labels" . | nindent 4 }}
{{- include "snapshot.labels" . | nindent 4 }}
app.kubernetes.io/component: checkpoint-agent
rules:
# Watch and annotate pods cluster-wide on assigned nodes
......
......@@ -6,34 +6,34 @@
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
name: {{ include "chrek.fullname" . }}-agent
name: {{ include "snapshot.fullname" . }}-agent
namespace: {{ .Release.Namespace }}
labels:
{{- include "chrek.labels" . | nindent 4 }}
{{- include "snapshot.labels" . | nindent 4 }}
app.kubernetes.io/component: checkpoint-agent
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: Role
name: {{ include "chrek.fullname" . }}-agent
name: {{ include "snapshot.fullname" . }}-agent
subjects:
- kind: ServiceAccount
name: {{ include "chrek.serviceAccountName" . }}
name: {{ include "snapshot.serviceAccountName" . }}
namespace: {{ .Release.Namespace }}
{{- else }}
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: {{ include "chrek.fullname" . }}-agent
name: {{ include "snapshot.fullname" . }}-agent
labels:
{{- include "chrek.labels" . | nindent 4 }}
{{- include "snapshot.labels" . | nindent 4 }}
app.kubernetes.io/component: checkpoint-agent
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: {{ include "chrek.fullname" . }}-agent
name: {{ include "snapshot.fullname" . }}-agent
subjects:
- kind: ServiceAccount
name: {{ include "chrek.serviceAccountName" . }}
name: {{ include "snapshot.serviceAccountName" . }}
namespace: {{ .Release.Namespace }}
{{- end }}
{{- end }}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment