Unverified commit 6831020f, authored by Schwinn Saereesitthipitak, committed by GitHub
Browse files

chore: rename chrek to Dynamo Snapshot (#7028)


Signed-off-by: Schwinn Saereesitthipitak <17022745+galletas1712@users.noreply.github.com>
parent 7dbebf3c
......@@ -5,10 +5,10 @@
apiVersion: v1
kind: ConfigMap
metadata:
name: {{ include "chrek.fullname" . }}-seccomp
name: {{ include "snapshot.fullname" . }}-seccomp
namespace: {{ .Release.Namespace }}
labels:
{{- include "chrek.labels" . | nindent 4 }}
{{- include "snapshot.labels" . | nindent 4 }}
app.kubernetes.io/component: seccomp
data:
block-iouring.json: |
......
......@@ -5,10 +5,10 @@
apiVersion: v1
kind: ServiceAccount
metadata:
name: {{ include "chrek.serviceAccountName" . }}
name: {{ include "snapshot.serviceAccountName" . }}
namespace: {{ .Release.Namespace }}
labels:
{{- include "chrek.labels" . | nindent 4 }}
{{- include "snapshot.labels" . | nindent 4 }}
app.kubernetes.io/component: checkpoint-agent
{{- with .Values.serviceAccount.annotations }}
annotations:
......
......@@ -12,7 +12,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Chrek - Checkpoint/Restore Infrastructure
# Dynamo Snapshot - Checkpoint/Restore Infrastructure
# This chart deploys the checkpoint storage (PVC) and CRIU agent (DaemonSet)
# in a single namespace. Install this chart in each namespace where you want
# to enable checkpoint/restore functionality for DynamoGraphDeployments.
......@@ -27,7 +27,7 @@ storage:
# Create a new PVC (set to false if using existing PVC)
create: true
# PVC name - must match operator configuration
name: chrek-pvc
name: snapshot-pvc
# PVC size
size: 100Gi
# Storage class (leave empty for default)
......@@ -48,14 +48,17 @@ storage:
# OCI URI (e.g., oci://registry.io/repo/checkpoints)
uri: ""
# DaemonSet configuration for chrek (checkpoint/restore) agent
# DaemonSet configuration for snapshot (checkpoint/restore) agent
daemonset:
# Container image
image:
repository: nvcr.io/nvidia/ai-dynamo/chrek-agent
repository: nvcr.io/nvidia/ai-dynamo/snapshot-agent
tag: 1.0.0
pullPolicy: Always
# Snapshot agent and nsrestore log level (trace, debug, info, warn, error)
snapshotLogLevel: info
# Image pull secrets
imagePullSecrets:
- name: ngc-secret
......
......@@ -93,7 +93,7 @@ func SetDefaultsOperatorConfiguration(obj *OperatorConfiguration) {
obj.Checkpoint.Storage.Type = CheckpointStorageTypePVC
}
if obj.Checkpoint.Storage.PVC.PVCName == "" {
obj.Checkpoint.Storage.PVC.PVCName = "chrek-pvc"
obj.Checkpoint.Storage.PVC.PVCName = "snapshot-pvc"
}
if obj.Checkpoint.Storage.PVC.BasePath == "" {
obj.Checkpoint.Storage.PVC.BasePath = "/checkpoints"
......
......@@ -265,7 +265,7 @@ type CheckpointStorageConfiguration struct {
// CheckpointPVCConfig holds PVC storage configuration.
type CheckpointPVCConfig struct {
// PVCName is the name of the PVC
// +kubebuilder:default="chrek-pvc"
// +kubebuilder:default="snapshot-pvc"
PVCName string `json:"pvcName"`
// BasePath is the base directory within the PVC
// +kubebuilder:default="/checkpoints"
......
......@@ -325,7 +325,7 @@ func InjectPodInfoVolumeMount(container *corev1.Container) {
}
// InjectCheckpointIntoPodSpec injects checkpoint configuration into a pod spec for
// external restore via the chrek DaemonSet. The pod image is expected to be a
// external restore via the snapshot DaemonSet. The pod image is expected to be a
// runtime-compatible restore image (runtime + CRIU tooling). For ready checkpoints,
// the operator overrides command to `sleep infinity` so the watcher can trigger
// external restore via nsenter + nsrestore.
......
......@@ -44,7 +44,7 @@ func testPVCConfig() *configv1alpha1.CheckpointConfiguration {
Storage: configv1alpha1.CheckpointStorageConfiguration{
Type: configv1alpha1.CheckpointStorageTypePVC,
PVC: configv1alpha1.CheckpointPVCConfig{
PVCName: "chrek-pvc",
PVCName: "snapshot-pvc",
BasePath: "/checkpoints",
},
},
......@@ -113,7 +113,7 @@ func TestHelpers(t *testing.T) {
func TestInjectionIdempotency(t *testing.T) {
// Volume injection is idempotent
podSpec := &corev1.PodSpec{Volumes: []corev1.Volume{{Name: consts.CheckpointVolumeName}, {Name: consts.PodInfoVolumeName}}}
InjectCheckpointVolume(podSpec, "chrek-pvc")
InjectCheckpointVolume(podSpec, "snapshot-pvc")
InjectPodInfoVolume(podSpec)
assert.Len(t, podSpec.Volumes, 2)
......@@ -260,7 +260,7 @@ func TestInjectCheckpointIntoPodSpec(t *testing.T) {
for _, v := range podSpec.Volumes {
volNames[v.Name] = true
if v.Name == consts.CheckpointVolumeName {
assert.Equal(t, "chrek-pvc", v.PersistentVolumeClaim.ClaimName)
assert.Equal(t, "snapshot-pvc", v.PersistentVolumeClaim.ClaimName)
}
}
assert.True(t, volNames[consts.CheckpointVolumeName])
......@@ -329,7 +329,7 @@ func TestInjectCheckpointIntoPodSpec(t *testing.T) {
Storage: configv1alpha1.CheckpointStorageConfiguration{Type: "pvc", PVC: configv1alpha1.CheckpointPVCConfig{BasePath: "/checkpoints"}},
}, "no PVC name"},
{"PVC base path missing", testPodSpec(), testInfo(), &configv1alpha1.CheckpointConfiguration{
Storage: configv1alpha1.CheckpointStorageConfiguration{Type: "pvc", PVC: configv1alpha1.CheckpointPVCConfig{PVCName: "chrek-pvc"}},
Storage: configv1alpha1.CheckpointStorageConfiguration{Type: "pvc", PVC: configv1alpha1.CheckpointPVCConfig{PVCName: "snapshot-pvc"}},
}, "no PVC base path"},
{"S3 URI missing", testPodSpec(), testInfo(), &configv1alpha1.CheckpointConfiguration{
Storage: configv1alpha1.CheckpointStorageConfiguration{Type: "s3"},
......
......@@ -140,13 +140,13 @@ const (
ResourceStateUnknown = "unknown"
// Checkpoint/restore constants
// CROSS-REFERENCE: Some constants below are duplicated in the chrek package at
// deploy/chrek/pkg/config/constants.go. If you change a value here, update there too.
// CROSS-REFERENCE: Some constants below are duplicated in the snapshot package at
// deploy/snapshot/pkg/config/constants.go. If you change a value here, update there too.
// Kubernetes labels
KubeLabelIsCheckpointSource = "nvidia.com/chrek-is-checkpoint-source" // Pod label that triggers DaemonSet auto-checkpoint
KubeLabelCheckpointHash = "nvidia.com/chrek-checkpoint-hash" // Checkpoint identity hash (= DynamoCheckpoint CR name)
KubeLabelIsRestoreTarget = "nvidia.com/chrek-is-restore-target" // Pod label that triggers DaemonSet auto-restore
KubeLabelIsCheckpointSource = "nvidia.com/snapshot-is-checkpoint-source" // Pod label that triggers DaemonSet auto-checkpoint
KubeLabelCheckpointHash = "nvidia.com/snapshot-checkpoint-hash" // Checkpoint identity hash (= DynamoCheckpoint CR name)
KubeLabelIsRestoreTarget = "nvidia.com/snapshot-is-restore-target" // Pod label that triggers DaemonSet auto-restore
// Environment variables injected into pods
EnvCheckpointStorageType = "DYN_CHECKPOINT_STORAGE_TYPE" // Storage backend (pvc, s3, oci) — checkpoint job pods only
......@@ -159,7 +159,7 @@ const (
CheckpointVolumeName = "checkpoint-storage" // Pod-internal volume name for checkpoint PVC
// SeccompProfilePath is the localhost seccomp profile that blocks io_uring syscalls.
// Deployed to nodes by the chrek DaemonSet init container.
// Deployed to nodes by the snapshot DaemonSet init container.
SeccompProfilePath = "profiles/block-iouring.json"
// Pod identity (Downward API) ---
......
......@@ -58,7 +58,7 @@ func checkpointTestConfig() *configv1alpha1.OperatorConfiguration {
Storage: configv1alpha1.CheckpointStorageConfiguration{
Type: configv1alpha1.CheckpointStorageTypePVC,
PVC: configv1alpha1.CheckpointPVCConfig{
PVCName: "chrek-pvc",
PVCName: "snapshot-pvc",
BasePath: "/checkpoints",
},
},
......@@ -143,7 +143,7 @@ func TestBuildCheckpointJob(t *testing.T) {
volNames[v.Name] = true
if v.Name == consts.CheckpointVolumeName {
require.NotNil(t, v.PersistentVolumeClaim)
assert.Equal(t, "chrek-pvc", v.PersistentVolumeClaim.ClaimName)
assert.Equal(t, "snapshot-pvc", v.PersistentVolumeClaim.ClaimName)
}
if v.Name == consts.PodInfoVolumeName {
require.NotNil(t, v.DownwardAPI)
......
......@@ -1298,7 +1298,7 @@ func TestDynamoComponentDeploymentReconciler_generatePodTemplateSpec_RestoreLabe
Storage: configv1alpha1.CheckpointStorageConfiguration{
Type: configv1alpha1.CheckpointStorageTypePVC,
PVC: configv1alpha1.CheckpointPVCConfig{
PVCName: "chrek-pvc",
PVCName: "snapshot-pvc",
BasePath: "/checkpoints",
},
},
......@@ -1430,7 +1430,7 @@ func TestDynamoComponentDeploymentReconciler_generateDeployment_RestoreStrategy(
Storage: configv1alpha1.CheckpointStorageConfiguration{
Type: configv1alpha1.CheckpointStorageTypePVC,
PVC: configv1alpha1.CheckpointPVCConfig{
PVCName: "chrek-pvc",
PVCName: "snapshot-pvc",
BasePath: "/checkpoints",
},
},
......
# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Unified Dockerfile for chrek-agent and placeholder images.
# Unified Dockerfile for snapshot-agent and placeholder images.
#
# Build targets:
# docker build --target agent -t chrek-agent:latest .
# docker build --target agent -t snapshot-agent:latest .
# docker build --target placeholder --build-arg BASE_IMAGE=<app-image> -t placeholder:latest .
#
# Optional targets for CI:
......@@ -66,7 +66,7 @@ FROM go-base AS builder
ARG TARGETOS=linux
ARG TARGETARCH=amd64
RUN CGO_ENABLED=0 GOOS=${TARGETOS} GOARCH=${TARGETARCH} go build -ldflags="-w -s" -o /chrek-agent ./cmd/agent
RUN CGO_ENABLED=0 GOOS=${TARGETOS} GOARCH=${TARGETARCH} go build -ldflags="-w -s" -o /snapshot-agent ./cmd/agent
RUN CGO_ENABLED=0 GOOS=${TARGETOS} GOARCH=${TARGETARCH} go build -ldflags="-w -s" -o /nsrestore ./cmd/nsrestore
# =============================================================================
......@@ -105,7 +105,7 @@ RUN git clone --branch ${CRIU_VERSION} https://github.com/checkpoint-restore/cri
RUN git clone https://github.com/NVIDIA/cuda-checkpoint.git /tmp/cuda-checkpoint
# =============================================================================
# Stage: Agent - Final chrek-agent image
# Stage: Agent - Final snapshot-agent image
# =============================================================================
FROM ${AGENT_BASE_IMAGE} AS agent
......@@ -137,15 +137,15 @@ COPY --from=criu-builder /tmp/cuda-checkpoint/bin/x86_64_Linux/cuda-checkpoint /
RUN chmod +x /usr/local/sbin/cuda-checkpoint
# Copy the built binaries
COPY --from=builder /chrek-agent /usr/local/bin/chrek-agent
COPY --from=builder /snapshot-agent /usr/local/bin/snapshot-agent
COPY --from=builder /nsrestore /usr/local/bin/nsrestore
# Create directories
RUN mkdir -p /checkpoints /var/run/chrek
RUN mkdir -p /checkpoints /var/run/snapshot
USER root
ENTRYPOINT ["/usr/local/bin/chrek-agent"]
ENTRYPOINT ["/usr/local/bin/snapshot-agent"]
# =============================================================================
# Stage: Placeholder - Runtime-compatible restore image (requires BASE_IMAGE arg)
......
......@@ -2,7 +2,7 @@
# SPDX-License-Identifier: Apache-2.0
# Image URL to use all building/pushing image targets
IMG ?= nvcr.io/nvidian/dynamo-dev/chrek-agent:latest
IMG ?= nvcr.io/nvidian/dynamo-dev/snapshot-agent:latest
PLACEHOLDER_IMG ?= nvcr.io/nvidian/dynamo-dev/dynamo-vllm-placeholder:latest
# PLACEHOLDER_BASE_IMG must be provided when building placeholder (no default)
......@@ -54,12 +54,12 @@ lint-fix: golangci-lint ## Run golangci-lint linter and perform fixes.
##@ Build
.PHONY: build
build: fmt vet ## Build chrek-agent binary.
CGO_ENABLED=0 go build -ldflags="-w -s" -o bin/chrek-agent ./cmd/agent
build: fmt vet ## Build snapshot-agent binary.
CGO_ENABLED=0 go build -ldflags="-w -s" -o bin/snapshot-agent ./cmd/agent
.PHONY: run
run: build ## Run chrek-agent from your host.
./bin/chrek-agent
run: build ## Run snapshot-agent from your host.
./bin/snapshot-agent
.PHONY: clean
clean: ## Remove build artifacts.
......@@ -69,15 +69,15 @@ clean: ## Remove build artifacts.
##@ Docker
.PHONY: docker-build-agent
docker-build-agent: ## Build chrek-agent docker image.
docker-build-agent: ## Build snapshot-agent docker image.
$(CONTAINER_TOOL) build --target agent -t ${IMG} .
.PHONY: docker-build-agent-lint
docker-build-agent-lint: ## Build chrek-agent docker image up to lint stage.
docker-build-agent-lint: ## Build snapshot-agent docker image up to lint stage.
$(CONTAINER_TOOL) build --target linter -t ${IMG}-lint .
.PHONY: docker-build-agent-test
docker-build-agent-test: ## Build chrek-agent docker image up to test stage.
docker-build-agent-test: ## Build snapshot-agent docker image up to test stage.
$(CONTAINER_TOOL) build --target tester -t ${IMG}-test .
.PHONY: docker-build-placeholder
......@@ -97,7 +97,7 @@ endif
-t ${PLACEHOLDER_IMG} .
.PHONY: docker-push-agent
docker-push-agent: ## Push chrek-agent docker image.
docker-push-agent: ## Push snapshot-agent docker image.
$(CONTAINER_TOOL) push ${IMG}
.PHONY: docker-push-placeholder
......
......@@ -8,11 +8,11 @@ import (
"gopkg.in/yaml.v3"
"github.com/ai-dynamo/dynamo/deploy/chrek/pkg/types"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/types"
)
// ConfigMapPath is the default path where the ConfigMap is mounted.
const ConfigMapPath = "/etc/chrek/config.yaml"
const ConfigMapPath = "/etc/snapshot/config.yaml"
// LoadConfig loads the agent configuration from a YAML file.
func LoadConfig(path string) (*types.AgentConfig, error) {
......
// Package main provides the chrek DaemonSet agent.
// Package main provides the snapshot DaemonSet agent.
// The agent watches for pods with checkpoint/restore labels on its node
// and triggers operations via the orchestrators.
package main
......@@ -12,9 +12,9 @@ import (
"github.com/containerd/containerd"
"github.com/go-logr/logr"
"github.com/ai-dynamo/dynamo/deploy/chrek/pkg/common"
"github.com/ai-dynamo/dynamo/deploy/chrek/pkg/logging"
"github.com/ai-dynamo/dynamo/deploy/chrek/pkg/watcher"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/common"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/logging"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/watcher"
)
func main() {
......@@ -41,7 +41,7 @@ func main() {
sigChan := make(chan os.Signal, 1)
signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM)
agentLog.Info("Starting chrek agent",
agentLog.Info("Starting snapshot agent",
"node", cfg.NodeName,
"checkpoint_dir", cfg.BasePath,
"watch_namespace", cfg.RestrictedNamespace,
......
......@@ -8,8 +8,8 @@ import (
"github.com/go-logr/logr"
"github.com/ai-dynamo/dynamo/deploy/chrek/pkg/logging"
"github.com/ai-dynamo/dynamo/deploy/chrek/pkg/orchestrate"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/logging"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/orchestrate"
)
func main() {
......
module github.com/ai-dynamo/dynamo/deploy/chrek
module github.com/ai-dynamo/dynamo/deploy/snapshot
go 1.25.0
......
......@@ -10,7 +10,7 @@ import (
"github.com/moby/sys/mountinfo"
specs "github.com/opencontainers/runtime-spec/specs-go"
"github.com/ai-dynamo/dynamo/deploy/chrek/pkg/types"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/types"
)
// ReadMountInfo reads and parses mountinfo for a container process via /host/proc.
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment