Unverified commit bb8fc8a4, authored by Schwinn Saereesitthipitak, committed by GitHub
Browse files

feat(chrek): external restore, signal-based IPC, and package refactor (#6286)


Co-authored-by: Dan Feigin <dfeigin@nvidia.com>
parent c8423b57
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""
Checkpoint/restore (chrek) integration for vLLM workers.
Handles the checkpoint job pod lifecycle:
1. Early exit if a checkpoint already exists (idempotency)
2. Sleep model for CRIU-friendly GPU state
3. Signal readiness for DaemonSet to begin checkpoint
4. Wait for watcher signals from the DaemonSet
5. Wake model after restore
Environment variables:
- DYN_READY_FOR_CHECKPOINT_FILE: Path where this worker writes readiness marker
- DYN_CHECKPOINT_STORAGE_TYPE: Storage backend (pvc, s3, oci) (optional, defaults to pvc)
- DYN_CHECKPOINT_LOCATION: Full checkpoint path (optional when PATH+HASH are provided)
- DYN_CHECKPOINT_PATH + DYN_CHECKPOINT_HASH: PVC base path + hash (used to derive location)
Signals handled in checkpoint mode:
- SIGUSR1: Checkpoint completed, exit process
- SIGCONT: Restore completed, wake model and continue
- SIGUSR2: Checkpoint/restore failed
"""
import asyncio
import logging
import os
import signal
from typing import Optional
logger = logging.getLogger(__name__)
class CheckpointConfig:
    """Checkpoint settings read from the environment plus the signal-driven lifecycle.

    The checkpoint location is taken from ``DYN_CHECKPOINT_LOCATION`` when set,
    otherwise it is derived as ``<DYN_CHECKPOINT_PATH>/<DYN_CHECKPOINT_HASH>``.
    ``is_checkpoint_job`` is True whenever a non-empty location was resolved.
    """

    def __init__(self):
        """Read checkpoint settings from the environment.

        Raises:
            KeyError: If ``DYN_READY_FOR_CHECKPOINT_FILE`` is not set.
        """
        self.ready_file = os.environ["DYN_READY_FOR_CHECKPOINT_FILE"]
        self.storage_type = os.environ.get("DYN_CHECKPOINT_STORAGE_TYPE", "pvc")
        self.location = os.environ.get("DYN_CHECKPOINT_LOCATION", "")
        if not self.location:
            checkpoint_path = os.environ.get("DYN_CHECKPOINT_PATH", "")
            checkpoint_hash = os.environ.get("DYN_CHECKPOINT_HASH", "")
            # Test the raw path for presence and strip trailing slashes only
            # when joining, so a base path of "/" still yields "/<hash>"
            # instead of being treated as unset.
            if checkpoint_path and checkpoint_hash:
                self.location = f"{checkpoint_path.rstrip('/')}/{checkpoint_hash}"
        self.is_checkpoint_job = bool(self.location)
        # One event per watcher signal; each is set by its signal handler.
        self._checkpoint_done = asyncio.Event()
        self._restore_done = asyncio.Event()
        self._checkpoint_failed = asyncio.Event()

    def checkpoint_exists(self) -> bool:
        """Check if a completed checkpoint already exists (idempotency).

        A checkpoint is complete when its directory exists at the base path root
        (not under the tmp/ staging area). Directory presence = done.

        Returns:
            True when the storage type is ``pvc`` and ``self.location`` is an
            existing directory; False otherwise (including every non-PVC
            storage type, for which no check is performed).
        """
        if self.storage_type != "pvc":
            return False
        if os.path.isdir(self.location):
            logger.info(f"Existing checkpoint found at {self.location}, skipping")
            return True
        logger.info(f"No checkpoint at {self.location}, creating new one")
        return False

    async def run_lifecycle(self, engine_client, sleep_level: int) -> bool:
        """Run the full checkpoint lifecycle after the engine is loaded.

        1. Put model to sleep (CRIU-friendly GPU state)
        2. Write ready file (triggers DaemonSet checkpoint via readiness probe)
        3. Wait for watcher signal (checkpoint complete, restore complete, or failure)
        4. If restored: wake model and return True (caller proceeds with registration)
        5. If checkpoint done: return False (caller should exit)

        Args:
            engine_client: Object exposing awaitable ``sleep(level=...)`` and
                ``wake_up()`` methods.
            sleep_level: Level forwarded to ``engine_client.sleep``.

        Returns:
            True after a restore signal (the model has been woken up), False
            after a checkpoint-complete signal.

        Raises:
            RuntimeError: If the failure signal (SIGUSR2) is received.
            OSError: If the ready file cannot be written.
        """
        # Sleep model for checkpoint
        logger.info(f"Putting model to sleep (level={sleep_level})")
        await engine_client.sleep(level=sleep_level)

        # Install signal handlers before writing the ready file so there is no
        # window where the DaemonSet can send SIGUSR1/SIGUSR2/SIGCONT while the
        # default signal disposition (terminate) is still in effect.
        self._install_signal_handlers()
        try:
            # Signal readiness. This happens inside the try block so that a
            # failed write still goes through the cleanup below and does not
            # leave the handlers installed.
            with open(self.ready_file, "w") as f:
                f.write("ready")
            logger.info(
                "Ready for checkpoint. Waiting for watcher signal "
                "(SIGUSR1=checkpoint complete, SIGCONT=restore complete, SIGUSR2=failure)"
            )

            event = await self._wait_for_watcher_signal()
            if event == "restore":
                logger.info("Restore signal detected (SIGCONT)")
                logger.info("Waking up model after restore")
                await engine_client.wake_up()
                return True
            if event == "checkpoint":
                logger.info("Checkpoint completion signal detected (SIGUSR1)")
                return False
            raise RuntimeError("Checkpoint failed (received SIGUSR2 from watcher)")
        finally:
            self._remove_signal_handlers()
            # Remove the ready file so that a restarting pod does not leave a
            # stale marker that could trick the DaemonSet into acting on it.
            try:
                os.unlink(self.ready_file)
            except OSError:
                # Best effort: the file may never have been created.
                pass

    def _install_signal_handlers(self) -> None:
        """Register loop-level handlers that set the matching event per signal."""
        loop = asyncio.get_running_loop()
        loop.add_signal_handler(signal.SIGUSR1, self._checkpoint_done.set)
        # SIGCONT is used as the restore-complete signal because SIGUSR1 and
        # SIGUSR2 are already taken (checkpoint-complete and checkpoint-failed
        # respectively). The chrek DaemonSet watcher is the only sender, so
        # there is no conflict with POSIX job-control semantics in practice.
        loop.add_signal_handler(signal.SIGCONT, self._restore_done.set)
        loop.add_signal_handler(signal.SIGUSR2, self._checkpoint_failed.set)

    def _remove_signal_handlers(self) -> None:
        """Unregister the handlers added by ``_install_signal_handlers``."""
        loop = asyncio.get_running_loop()
        loop.remove_signal_handler(signal.SIGUSR1)
        loop.remove_signal_handler(signal.SIGCONT)
        loop.remove_signal_handler(signal.SIGUSR2)

    async def _wait_for_watcher_signal(self) -> str:
        """Block until one of the three events is set and return its label.

        Returns:
            ``"checkpoint"``, ``"restore"`` or ``"failed"``. If several events
            are already set when the wait returns, one of them is picked
            arbitrarily (it is popped from the ``done`` set).
        """
        waiters = {
            asyncio.create_task(self._checkpoint_done.wait()): "checkpoint",
            asyncio.create_task(self._restore_done.wait()): "restore",
            asyncio.create_task(self._checkpoint_failed.wait()): "failed",
        }
        try:
            done, pending = await asyncio.wait(
                waiters.keys(), return_when=asyncio.FIRST_COMPLETED
            )
            for task in pending:
                task.cancel()
            winner = done.pop()
            # Awaiting the finished task surfaces any exception it raised.
            await winner
            return waiters[winner]
        finally:
            # Covers cancellation of this coroutine while still waiting.
            for task in waiters:
                if not task.done():
                    task.cancel()
def get_checkpoint_config() -> tuple[bool, Optional[CheckpointConfig]]:
    """Decide whether the checkpoint lifecycle applies to this process.

    Checkpoint mode is active only when DYN_READY_FOR_CHECKPOINT_FILE is set.

    Returns:
        A pair ``(early_exit, config)``:
        - ``(True, None)``: checkpoint job re-run and the checkpoint already
          exists; the caller should return immediately.
        - ``(False, None)``: not in checkpoint mode, or no checkpoint job and
          no checkpoint available yet; the caller cold-starts normally.
        - ``(False, CheckpointConfig)``: the checkpoint lifecycle should run.

    Raises:
        EnvironmentError: In checkpoint mode when neither
            DYN_CHECKPOINT_LOCATION nor both DYN_CHECKPOINT_PATH and
            DYN_CHECKPOINT_HASH carry a non-empty value.
    """
    env = os.environ
    if "DYN_READY_FOR_CHECKPOINT_FILE" not in env:
        return False, None

    # An empty DYN_CHECKPOINT_LOCATION counts as unset, hence the truthiness
    # test on the value rather than a membership test on the key.
    if not env.get("DYN_CHECKPOINT_LOCATION"):
        base_path = env.get("DYN_CHECKPOINT_PATH", "")
        digest = env.get("DYN_CHECKPOINT_HASH", "")
        if not (base_path and digest):
            raise EnvironmentError(
                "Checkpoint mode requires either DYN_CHECKPOINT_LOCATION or both "
                "DYN_CHECKPOINT_PATH and DYN_CHECKPOINT_HASH"
            )

    cfg = CheckpointConfig()
    already_done = cfg.checkpoint_exists()

    if cfg.is_checkpoint_job:
        # Idempotent re-run exits early; otherwise the lifecycle runs.
        return (True, None) if already_done else (False, cfg)
    # Not a checkpoint job: cold-start unless a checkpoint is available.
    return (False, cfg) if already_done else (False, None)
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""
Checkpoint/restore (chrek) integration for vLLM workers.
Handles the checkpoint job pod lifecycle:
1. Early exit if a checkpoint already exists (idempotency)
2. Sleep model for CRIU-friendly GPU state
3. Signal readiness for DaemonSet to begin checkpoint
4. Poll for checkpoint completion or CRIU restore detection
5. Wake model after restore
Environment variables (all required in checkpoint mode, no fallbacks):
- DYN_CHECKPOINT_SIGNAL_FILE: Path where DaemonSet writes completion signal
- DYN_READY_FOR_CHECKPOINT_FILE: Path where this worker writes readiness marker
- DYN_CHECKPOINT_STORAGE_TYPE: Storage backend (pvc, s3, oci)
- DYN_CHECKPOINT_LOCATION: Full checkpoint path (for idempotency check)
- DYN_RESTORE_MARKER_FILE: Path written by restore-entrypoint before CRIU restore
"""
import asyncio
import json
import logging
import os
from typing import Optional
logger = logging.getLogger(__name__)
# Environment variables that must all be present once checkpoint mode is
# detected; get_checkpoint_config() raises EnvironmentError naming any that
# are missing.
_REQUIRED_ENV_VARS = [
"DYN_CHECKPOINT_SIGNAL_FILE",
"DYN_READY_FOR_CHECKPOINT_FILE",
"DYN_CHECKPOINT_STORAGE_TYPE",
"DYN_CHECKPOINT_LOCATION",
"DYN_RESTORE_MARKER_FILE",
]
class CheckpointConfig:
    """Checkpoint settings read from the environment plus the file-polling lifecycle.

    All five ``DYN_*`` variables are read without fallbacks; a missing one
    raises ``KeyError`` from ``__init__``.
    """

    def __init__(self):
        """Read every required checkpoint setting from the environment."""
        self.signal_file = os.environ["DYN_CHECKPOINT_SIGNAL_FILE"]
        self.ready_file = os.environ["DYN_READY_FOR_CHECKPOINT_FILE"]
        self.storage_type = os.environ["DYN_CHECKPOINT_STORAGE_TYPE"]
        self.location = os.environ["DYN_CHECKPOINT_LOCATION"]
        self.restore_marker = os.environ["DYN_RESTORE_MARKER_FILE"]

    def _read_status_file(self, path: str) -> dict:
        """Load a JSON status file and validate its ``success`` field.

        Args:
            path: Location of the JSON file to read.

        Returns:
            The decoded JSON object; ``status["success"]`` is always a bool.

        Raises:
            OSError: If the file cannot be opened or read.
            json.JSONDecodeError: If the content is not valid JSON.
            ValueError: If the content is not a JSON object, or its
                ``success`` field is missing or not a boolean.
        """
        with open(path, encoding="utf-8") as f:
            status = json.load(f)
        # Valid JSON that is not an object (e.g. "[]") has no .get(); report it
        # as ValueError so callers' existing except clauses handle it instead
        # of an AttributeError escaping.
        if not isinstance(status, dict):
            raise ValueError(f"expected a JSON object in {path}")
        success = status.get("success")
        if not isinstance(success, bool):
            raise ValueError(f"missing or invalid success field in {path}")
        return status

    def checkpoint_exists(self) -> bool:
        """Check if a completed checkpoint already exists (idempotency).

        For PVC storage, checks for checkpoint.done marker at the location.

        Returns:
            True if the marker exists and reports success, meaning the job
            should exit without loading the model. False when the marker is
            absent, unreadable, invalid, or reports failure, and for every
            non-PVC storage type (no existence check is implemented for them).
        """
        # Explicit guard instead of an assert: asserts are stripped under -O,
        # and a non-PVC backend should not crash the worker here.
        if self.storage_type != "pvc":
            return False
        if self.location:
            done_marker = f"{self.location}/checkpoint.done"
            if os.path.exists(done_marker):
                try:
                    status = self._read_status_file(done_marker)
                except (OSError, ValueError, json.JSONDecodeError) as exc:
                    logger.warning(
                        f"Invalid checkpoint.done marker at {done_marker}, ignoring stale checkpoint: {exc}"
                    )
                    return False
                if status["success"]:
                    logger.info(
                        f"Existing successful checkpoint found at {self.location}, skipping"
                    )
                    return True
                logger.warning(
                    f"Existing checkpoint marker reports failure at {self.location}: "
                    f"{status.get('error', 'unknown error')}"
                )
                return False
        logger.info(f"No checkpoint at {self.location}, creating new one")
        return False

    async def run_lifecycle(self, engine_client, sleep_level: int) -> bool:
        """Run the full checkpoint lifecycle after the engine is loaded.

        1. Put model to sleep (CRIU-friendly GPU state)
        2. Write ready file (triggers DaemonSet checkpoint via readiness probe)
        3. Poll for signal file (checkpoint done) or restore marker (CRIU restored us)
        4. If restored: wake model and return True (caller proceeds with registration)
        5. If checkpoint done: return False (caller should exit)

        Args:
            engine_client: Object exposing awaitable ``sleep(level=...)`` and
                ``wake_up()`` methods.
            sleep_level: Level forwarded to ``engine_client.sleep``.

        Returns:
            True once the restore marker is seen (the model has been woken
            up), False once the signal file reports success.

        Raises:
            RuntimeError: If the signal file is unreadable or invalid, or if
                it reports a failed checkpoint.
        """
        # Sleep model for checkpoint
        logger.info(f"Putting model to sleep (level={sleep_level})")
        await engine_client.sleep(level=sleep_level)

        # Signal readiness
        with open(self.ready_file, "w") as f:
            f.write("ready")
        logger.info(
            f"Ready for checkpoint. Waiting for signal: {self.signal_file} "
            f"or restore marker: {self.restore_marker}"
        )

        # Poll once per second; the restore marker is checked before the
        # signal file on every iteration.
        while True:
            if os.path.exists(self.restore_marker):
                logger.info(f"Restore detected (marker: {self.restore_marker})")
                logger.info("Waking up model after restore")
                await engine_client.wake_up()
                return True
            if os.path.exists(self.signal_file):
                try:
                    status = self._read_status_file(self.signal_file)
                except (OSError, ValueError, json.JSONDecodeError) as exc:
                    raise RuntimeError(
                        f"Invalid checkpoint signal file {self.signal_file}: {exc}"
                    ) from exc
                if status["success"]:
                    logger.info(f"Checkpoint complete (signal: {self.signal_file})")
                    return False
                raise RuntimeError(
                    f"Checkpoint failed (signal: {self.signal_file}): "
                    f"{status.get('error', 'unknown error')}"
                )
            await asyncio.sleep(1)
def get_checkpoint_config() -> Optional[CheckpointConfig]:
    """Build a CheckpointConfig when checkpoint mode is active.

    Checkpoint mode is active when DYN_CHECKPOINT_SIGNAL_FILE is present in
    the environment.

    Returns:
        A CheckpointConfig in checkpoint mode, otherwise None.

    Raises:
        EnvironmentError: In checkpoint mode when any variable listed in
            ``_REQUIRED_ENV_VARS`` is absent; the message names them.
    """
    if os.environ.get("DYN_CHECKPOINT_SIGNAL_FILE") is None:
        return None

    absent = [name for name in _REQUIRED_ENV_VARS if os.environ.get(name) is None]
    if not absent:
        return CheckpointConfig()

    raise EnvironmentError(
        "Checkpoint mode requires these environment variables: " + ", ".join(absent)
    )
......@@ -49,7 +49,7 @@ from dynamo.runtime.logging import configure_dynamo_logging
from dynamo.vllm.worker_factory import WorkerFactory
from .args import Config, parse_args
from .chrek import get_checkpoint_config
from .checkpoint_restore import get_checkpoint_config
from .handlers import DecodeWorkerHandler, PrefillWorkerHandler
from .health_check import (
VllmHealthCheckPayload,
......@@ -100,8 +100,8 @@ async def worker():
config.served_model_name = config.engine_args.served_model_name = config.model
# Check checkpoint mode and validate env vars EARLY (fail fast if misconfigured)
checkpoint_cfg = get_checkpoint_config()
if checkpoint_cfg and checkpoint_cfg.checkpoint_exists():
early_exit, checkpoint_cfg = get_checkpoint_config()
if early_exit:
return
# Download the model if necessary using modelexpress.
......@@ -120,9 +120,7 @@ async def worker():
# This allows checkpointing GPU state before runtime connections are established
pre_created_engine = None
if checkpoint_cfg is not None:
logger.info(
f"Checkpoint mode enabled (signal_file={checkpoint_cfg.signal_file})"
)
logger.info("Checkpoint mode enabled (watcher-driven signals)")
# Checkpoint mode requires sleep mode — enable before engine init
config.engine_args.enable_sleep_mode = True
......
......@@ -2,10 +2,6 @@
bin/
*.exe
# Reference source repos (clone separately if needed)
containerd/
runc/
# Build artifacts
*.o
*.a
......
......@@ -67,7 +67,7 @@ ARG TARGETOS=linux
ARG TARGETARCH=amd64
RUN CGO_ENABLED=0 GOOS=${TARGETOS} GOARCH=${TARGETARCH} go build -ldflags="-w -s" -o /chrek-agent ./cmd/agent
RUN CGO_ENABLED=0 GOOS=${TARGETOS} GOARCH=${TARGETARCH} go build -ldflags="-w -s" -o /restore-entrypoint ./cmd/restore-entrypoint
RUN CGO_ENABLED=0 GOOS=${TARGETOS} GOARCH=${TARGETARCH} go build -ldflags="-w -s" -o /nsrestore ./cmd/nsrestore
# =============================================================================
# Stage: CRIU Builder - Build CRIU with CUDA plugin
......@@ -125,6 +125,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
uuid-runtime \
tar \
ca-certificates \
util-linux \
&& rm -rf /var/lib/apt/lists/*
# Copy CRIU from builder
......@@ -137,17 +138,20 @@ RUN chmod +x /usr/local/sbin/cuda-checkpoint
# Copy the built binaries
COPY --from=builder /chrek-agent /usr/local/bin/chrek-agent
COPY --from=builder /restore-entrypoint /restore-entrypoint
COPY --from=builder /nsrestore /usr/local/bin/nsrestore
# Create checkpoint directory
RUN mkdir -p /checkpoints
# Create directories
RUN mkdir -p /checkpoints /var/run/chrek
USER root
ENTRYPOINT ["/usr/local/bin/chrek-agent"]
# =============================================================================
# Stage: Placeholder - Restore placeholder image (requires BASE_IMAGE arg)
# Stage: Placeholder - Runtime-compatible restore image (requires BASE_IMAGE arg)
# This image is a superset of the runtime image: same default execution contract
# (entrypoint/cmd/user), plus CRIU/cuda-checkpoint tooling for external restore.
# The operator may still override command to "sleep infinity" for restore pods.
# =============================================================================
FROM ${BASE_IMAGE} AS placeholder
......@@ -156,7 +160,7 @@ ENV ORIGINAL_BASE_IMAGE=${BASE_IMAGE}
USER root
# Install CRIU runtime dependencies
# Install minimal runtime dependencies for CRIU restore (nsrestore runs here via nsenter)
RUN apt-get update && apt-get install -y --no-install-recommends \
libbsd0 \
libcap2 \
......@@ -174,20 +178,18 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
ca-certificates \
&& rm -rf /var/lib/apt/lists/*
# Copy CRIU from builder
# Copy CRIU from builder (needed by nsrestore running inside these namespaces)
COPY --from=criu-builder /criu-install/usr/local /usr/local
RUN criu --version && echo "CRIU installed successfully"
# Copy cuda-checkpoint binary
# Copy cuda-checkpoint binary (used for external CUDA state checkpoint/restore)
COPY --from=criu-builder /tmp/cuda-checkpoint/bin/x86_64_Linux/cuda-checkpoint /usr/local/sbin/cuda-checkpoint
RUN chmod +x /usr/local/sbin/cuda-checkpoint
# Copy nsrestore binary (invoked by DaemonSet via nsenter)
COPY --from=builder /nsrestore /usr/local/bin/nsrestore
RUN chmod +x /usr/local/bin/nsrestore
# Create directories
RUN mkdir -p /checkpoints /var/run/criu /var/criu-work
# Copy restore binaries
COPY --from=builder /restore-entrypoint /restore-entrypoint
RUN chmod +x /restore-entrypoint
ENTRYPOINT ["/restore-entrypoint"]
CMD []
......@@ -54,17 +54,8 @@ lint-fix: golangci-lint ## Run golangci-lint linter and perform fixes.
##@ Build
.PHONY: build
build: fmt vet ## Build chrek-agent and restore-entrypoint binaries.
build: fmt vet ## Build chrek-agent binary.
CGO_ENABLED=0 go build -ldflags="-w -s" -o bin/chrek-agent ./cmd/agent
CGO_ENABLED=0 go build -ldflags="-w -s" -o bin/restore-entrypoint ./cmd/restore-entrypoint
.PHONY: build-agent
build-agent: fmt vet ## Build chrek-agent binary only.
CGO_ENABLED=0 go build -ldflags="-w -s" -o bin/chrek-agent ./cmd/agent
.PHONY: build-restore
build-restore: fmt vet ## Build restore-entrypoint binary only.
CGO_ENABLED=0 go build -ldflags="-w -s" -o bin/restore-entrypoint ./cmd/restore-entrypoint
.PHONY: run
run: build ## Run chrek-agent from your host.
......@@ -94,8 +85,15 @@ docker-build-placeholder: ## Build placeholder image for checkpoint restore. Req
ifndef PLACEHOLDER_BASE_IMG
$(error PLACEHOLDER_BASE_IMG is required. Example: make docker-build-placeholder PLACEHOLDER_BASE_IMG=nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.8.1-cuda13)
endif
@BASE_IMAGE_USER="$$( $(CONTAINER_TOOL) image inspect --format '{{.Config.User}}' ${PLACEHOLDER_BASE_IMG} 2>/dev/null || true )"; \
if [ -z "$$BASE_IMAGE_USER" ]; then \
$(CONTAINER_TOOL) pull ${PLACEHOLDER_BASE_IMG} >/dev/null; \
BASE_IMAGE_USER="$$( $(CONTAINER_TOOL) image inspect --format '{{.Config.User}}' ${PLACEHOLDER_BASE_IMG} 2>/dev/null || true )"; \
fi; \
if [ -z "$$BASE_IMAGE_USER" ]; then BASE_IMAGE_USER=root; fi; \
$(CONTAINER_TOOL) build --target placeholder \
--build-arg BASE_IMAGE=${PLACEHOLDER_BASE_IMG} \
--build-arg BASE_IMAGE_USER=$$BASE_IMAGE_USER \
-t ${PLACEHOLDER_IMG} .
.PHONY: docker-push-agent
......
......@@ -2,129 +2,44 @@
package main
import (
"errors"
"fmt"
"os"
"gopkg.in/yaml.v3"
"github.com/ai-dynamo/dynamo/deploy/chrek/pkg/checkpoint"
"github.com/ai-dynamo/dynamo/deploy/chrek/pkg/types"
)
// ConfigMapPath is the default path where the ConfigMap is mounted.
const ConfigMapPath = "/etc/chrek/config.yaml"
// CheckpointSignalSource determines how checkpoint operations are triggered.
type CheckpointSignalSource string
const (
// SignalFromHTTP triggers checkpoints via HTTP API requests.
SignalFromHTTP CheckpointSignalSource = "http"
// SignalFromWatcher triggers checkpoints automatically when pods become Ready.
SignalFromWatcher CheckpointSignalSource = "watcher"
)
// FullConfig is the root configuration structure loaded from the ConfigMap.
type FullConfig struct {
Agent AgentConfig `yaml:"agent"`
Checkpoint checkpoint.CheckpointSpec `yaml:"checkpoint"`
}
// AgentConfig holds the runtime configuration for the checkpoint agent daemon.
type AgentConfig struct {
// SignalSource determines how checkpoints are triggered: "http" or "watcher"
SignalSource string `yaml:"signalSource"`
// ListenAddr is the HTTP server address for health checks and API
ListenAddr string `yaml:"listenAddr"`
// NodeName is the Kubernetes node name (from NODE_NAME env, downward API)
NodeName string `yaml:"-"`
// RestrictedNamespace restricts pod watching to this namespace (optional)
RestrictedNamespace string `yaml:"-"`
}
// ConfigError represents a configuration validation error.
type ConfigError struct {
Field string
Message string
}
// Error implements the error interface, rendering the failure as
// "config error: <Field>: <Message>".
func (e *ConfigError) Error() string {
	return "config error: " + e.Field + ": " + e.Message
}
// LoadConfig loads the full configuration from a YAML file.
func LoadConfig(path string) (*FullConfig, error) {
// LoadConfig loads the agent configuration from a YAML file.
func LoadConfig(path string) (*types.AgentConfig, error) {
data, err := os.ReadFile(path)
if err != nil {
return nil, fmt.Errorf("failed to read config file %s: %w", path, err)
}
cfg := &FullConfig{}
cfg := &types.AgentConfig{}
if err := yaml.Unmarshal(data, cfg); err != nil {
return nil, fmt.Errorf("failed to parse config file %s: %w", path, err)
}
// Apply environment variable overrides
cfg.Agent.loadEnvOverrides()
cfg.LoadEnvOverrides()
return cfg, nil
}
// LoadConfigOrDefault loads configuration from a file, falling back to zero values if the file doesn't exist.
func LoadConfigOrDefault(path string) (*FullConfig, error) {
// LoadConfigOrDefault loads configuration from a file, falling back to defaults if the file doesn't exist.
func LoadConfigOrDefault(path string) (*types.AgentConfig, error) {
cfg, err := LoadConfig(path)
if err != nil {
if os.IsNotExist(err) {
cfg = &FullConfig{}
cfg.Agent.loadEnvOverrides()
if errors.Is(err, os.ErrNotExist) {
cfg = &types.AgentConfig{}
cfg.LoadEnvOverrides()
return cfg, nil
}
return nil, err
}
return cfg, nil
}
// loadEnvOverrides copies NODE_NAME and RESTRICTED_NAMESPACE from the
// environment into the AgentConfig. A variable that is unset or empty leaves
// the corresponding field untouched.
func (c *AgentConfig) loadEnvOverrides() {
	overrides := []struct {
		envVar string
		target *string
	}{
		{envVar: "NODE_NAME", target: &c.NodeName},
		{envVar: "RESTRICTED_NAMESPACE", target: &c.RestrictedNamespace},
	}
	for _, o := range overrides {
		if value := os.Getenv(o.envVar); value != "" {
			*o.target = value
		}
	}
}
// GetSignalSource converts the raw SignalSource string into the typed
// CheckpointSignalSource. No validation is performed here.
func (c *AgentConfig) GetSignalSource() CheckpointSignalSource {
	source := CheckpointSignalSource(c.SignalSource)
	return source
}
// Validate reports a *ConfigError when SignalSource is neither "http" nor
// "watcher", or when it is "http" and ListenAddr is empty. It returns nil for
// a valid configuration.
func (c *AgentConfig) Validate() error {
	switch CheckpointSignalSource(c.SignalSource) {
	case SignalFromHTTP:
		// The HTTP mode needs an address to listen on.
		if c.ListenAddr == "" {
			return &ConfigError{
				Field:   "listenAddr",
				Message: "cannot be empty when signalSource is 'http'",
			}
		}
	case SignalFromWatcher:
		// No additional requirements.
	default:
		return &ConfigError{
			Field:   "signalSource",
			Message: "must be 'http' or 'watcher'",
		}
	}
	return nil
}
// Validate checks the agent section first and the checkpoint section second,
// returning the first error encountered or nil when both are valid.
func (c *FullConfig) Validate() error {
	agentErr := c.Agent.Validate()
	if agentErr != nil {
		return agentErr
	}
	if specErr := c.Checkpoint.Validate(); specErr != nil {
		return specErr
	}
	return nil
}
// Package main provides the CRIU node agent with HTTP API and/or pod watching.
// The agent supports two modes that can be enabled independently:
// - HTTP API mode: Exposes REST endpoints for checkpoint/restore operations
// - Watcher mode: Automatically checkpoints pods with nvidia.com/checkpoint-source=true label
// Package main provides the chrek DaemonSet agent.
// The agent watches for pods with checkpoint/restore labels on its node
// and triggers operations via the orchestrators.
package main
import (
"context"
"log"
"net/http"
"os"
"os/signal"
"syscall"
"time"
"github.com/ai-dynamo/dynamo/deploy/chrek/pkg/checkpoint"
httpApiServer "github.com/ai-dynamo/dynamo/deploy/chrek/pkg/http_api_server"
"github.com/containerd/containerd"
"github.com/go-logr/logr"
"github.com/ai-dynamo/dynamo/deploy/chrek/pkg/common"
"github.com/ai-dynamo/dynamo/deploy/chrek/pkg/logging"
"github.com/ai-dynamo/dynamo/deploy/chrek/pkg/watcher"
)
func main() {
// Load configuration from ConfigMap (or use defaults if not found)
rootLog := logging.ConfigureLogger("stdout")
agentLog := rootLog.WithName("agent")
cfg, err := LoadConfigOrDefault(ConfigMapPath)
if err != nil {
log.Fatalf("Failed to load configuration: %v", err)
fatal(agentLog, err, "Failed to load configuration")
}
// Validate configuration
if err := cfg.Agent.Validate(); err != nil {
log.Fatalf("Invalid configuration: %v", err)
if err := cfg.Validate(); err != nil {
fatal(agentLog, err, "Invalid configuration")
}
// Create discovery client
discoveryClient, err := checkpoint.NewDiscoveryClient()
ctrd, err := containerd.New(common.ContainerdSocket)
if err != nil {
log.Fatalf("Failed to create discovery client: %v", err)
fatal(agentLog, err, "Failed to connect to containerd")
}
defer discoveryClient.Close()
defer ctrd.Close()
// Create checkpointer
checkpointer := checkpoint.NewCheckpointer(discoveryClient)
// Context for graceful shutdown
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
// Handle graceful shutdown
sigChan := make(chan os.Signal, 1)
signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM)
log.Printf("CRIU Node Agent starting (node: %s)", cfg.Agent.NodeName)
log.Printf("Checkpoint directory: %s", cfg.Checkpoint.BasePath)
log.Printf("Signal source: %s", cfg.Agent.SignalSource)
agentLog.Info("Starting chrek agent",
"node", cfg.NodeName,
"checkpoint_dir", cfg.BasePath,
"watch_namespace", cfg.RestrictedNamespace,
)
switch cfg.Agent.GetSignalSource() {
case SignalFromHTTP:
serverCfg := httpApiServer.ServerConfig{
ListenAddr: cfg.Agent.ListenAddr,
NodeName: cfg.Agent.NodeName,
CheckpointSpec: &cfg.Checkpoint,
}
srv := httpApiServer.NewServer(serverCfg, checkpointer)
podWatcher, err := watcher.NewWatcher(cfg, ctrd, rootLog.WithName("watcher"))
if err != nil {
fatal(agentLog, err, "Failed to create pod watcher")
}
// Handle graceful shutdown
go func() {
<-sigChan
shutdownCtx, shutdownCancel := context.WithTimeout(context.Background(), 30*time.Second)
defer shutdownCancel()
if err := srv.Shutdown(shutdownCtx); err != nil {
log.Printf("HTTP server shutdown error: %v", err)
// Run watcher in the background
watcherDone := make(chan error, 1)
go func() {
agentLog.Info("Pod watcher started")
watcherDone <- podWatcher.Start(ctx)
}()
// Wait for signal or watcher exit
select {
case <-sigChan:
agentLog.Info("Shutting down")
cancel()
select {
case err := <-watcherDone:
if err != nil {
agentLog.Error(err, "Pod watcher exited with error during shutdown")
}
}()
if err := srv.Start(); err != http.ErrServerClosed {
log.Fatalf("HTTP server error: %v", err)
default:
}
case SignalFromWatcher:
watcherConfig := watcher.WatcherConfig{
NodeName: cfg.Agent.NodeName,
ListenAddr: cfg.Agent.ListenAddr,
RestrictedNamespace: cfg.Agent.RestrictedNamespace,
CheckpointSpec: &cfg.Checkpoint,
}
podWatcher, err := watcher.NewWatcher(watcherConfig, discoveryClient, checkpointer)
case err := <-watcherDone:
if err != nil {
log.Fatalf("Failed to create pod watcher: %v", err)
fatal(agentLog, err, "Pod watcher exited with error")
}
}
// Handle graceful shutdown
go func() {
<-sigChan
log.Println("Shutting down pod watcher...")
cancel()
}()
log.Printf("Pod watcher started (watching for label: %s=true)", checkpoint.KubeLabelCheckpointSource)
log.Printf("Health check endpoint: http://0.0.0.0%s/health", cfg.Agent.ListenAddr)
if err := podWatcher.Start(ctx); err != nil {
log.Printf("Pod watcher error: %v", err)
}
agentLog.Info("Agent stopped")
}
default:
log.Fatalf("Unknown signal source: %s", cfg.Agent.SignalSource)
func fatal(log logr.Logger, err error, msg string, keysAndValues ...interface{}) {
if err != nil {
log.Error(err, msg, keysAndValues...)
} else {
log.Info(msg, keysAndValues...)
}
log.Println("Agent stopped")
os.Exit(1)
}
package main
import (
"context"
"encoding/json"
"flag"
"os"
"github.com/go-logr/logr"
"github.com/ai-dynamo/dynamo/deploy/chrek/pkg/logging"
"github.com/ai-dynamo/dynamo/deploy/chrek/pkg/orchestrate"
)
// main parses the restore flags, runs the restore via
// orchestrate.RestoreInNamespace, and writes the restored PID to stdout as a
// single JSON object. Any failure is logged and the process exits with
// status 1.
func main() {
	// Logs go to stderr so stdout is reserved for the structured result.
	logger := logging.ConfigureLogger("stderr").WithName("nsrestore")

	var checkpointPath, cudaDeviceMap, cgroupRoot string
	flag.StringVar(&checkpointPath, "checkpoint-path", "", "Path to checkpoint directory")
	flag.StringVar(&cudaDeviceMap, "cuda-device-map", "", "CUDA device map for cuda-checkpoint restore")
	flag.StringVar(&cgroupRoot, "cgroup-root", "", "CRIU cgroup root remap path")
	flag.Parse()

	// The checkpoint path is the only mandatory flag.
	if checkpointPath == "" {
		fatal(logger, nil, "--checkpoint-path is required")
	}

	restoredPID, err := orchestrate.RestoreInNamespace(
		context.Background(),
		orchestrate.RestoreOptions{
			CheckpointPath: checkpointPath,
			CUDADeviceMap:  cudaDeviceMap,
			CgroupRoot:     cgroupRoot,
		},
		logger,
	)
	if err != nil {
		fatal(logger, err, "restore failed")
	}

	type restoreResult struct {
		RestoredPID int `json:"restoredPID"`
	}
	encoder := json.NewEncoder(os.Stdout)
	if encodeErr := encoder.Encode(restoreResult{RestoredPID: restoredPID}); encodeErr != nil {
		fatal(logger, encodeErr, "Failed to write restore result")
	}
}
// fatal logs msg with the optional key/value pairs and terminates the process
// with exit status 1. With a non-nil err the message goes through log.Error;
// with a nil err it goes through log.Info.
func fatal(log logr.Logger, err error, msg string, keysAndValues ...interface{}) {
	if err == nil {
		log.Info(msg, keysAndValues...)
		os.Exit(1)
	}
	log.Error(err, msg, keysAndValues...)
	os.Exit(1)
}
// Package main provides the restore-entrypoint binary for self-restoring placeholder containers.
// This binary replaces the shell script restore-entrypoint.sh with a Go implementation
// that uses the go-criu library for CRIU operations.
package main
import (
"context"
"fmt"
"os"
"os/exec"
"os/signal"
"path/filepath"
"strings"
"syscall"
"github.com/sirupsen/logrus"
"github.com/ai-dynamo/dynamo/deploy/chrek/pkg/restore"
)
// logGPUDiagnostics prints a GPU visibility report to stdout, framed by
// header and footer lines that carry label. The report contains the output of
// two nvidia-smi invocations, the /dev/nvidia* device nodes, the
// NVIDIA_VISIBLE_DEVICES and CUDA_VISIBLE_DEVICES variables, and the
// namespace links of PID 1. Command and readlink failures are printed inline
// rather than returned.
func logGPUDiagnostics(label string) {
	fmt.Printf("=== GPU DIAGNOSTICS [%s] ===\n", label)

	// Device listing first, then per-GPU memory usage.
	queries := []struct {
		args   []string
		errFmt string
		okFmt  string
	}{
		{
			args:   []string{"-L"},
			errFmt: "nvidia-smi -L: error: %v\n",
			okFmt:  "nvidia-smi -L:\n%s",
		},
		{
			args:   []string{"--query-gpu=index,uuid,memory.used,memory.total,memory.free", "--format=csv,noheader"},
			errFmt: "nvidia-smi memory query: error: %v\n",
			okFmt:  "nvidia-smi memory:\n%s",
		},
	}
	for _, q := range queries {
		output, err := exec.Command("nvidia-smi", q.args...).CombinedOutput()
		if err != nil {
			fmt.Printf(q.errFmt, err)
			continue
		}
		fmt.Printf(q.okFmt, output)
	}

	// Device nodes; the Glob error is ignored.
	devices, _ := filepath.Glob("/dev/nvidia*")
	fmt.Printf("/dev/nvidia* devices: %s\n", strings.Join(devices, ", "))

	// Visibility-related environment variables.
	fmt.Printf("NVIDIA_VISIBLE_DEVICES=%s\n", os.Getenv("NVIDIA_VISIBLE_DEVICES"))
	fmt.Printf("CUDA_VISIBLE_DEVICES=%s\n", os.Getenv("CUDA_VISIBLE_DEVICES"))

	// Namespace links of PID 1; a readlink failure is printed in place of the link.
	for _, ns := range []string{"mnt", "pid", "ipc", "net", "uts", "cgroup"} {
		target, err := os.Readlink("/proc/1/ns/" + ns)
		if err != nil {
			target = err.Error()
		}
		fmt.Printf("ns/%s: %s\n", ns, target)
	}

	fmt.Printf("=== END GPU DIAGNOSTICS [%s] ===\n", label)
}
// main is the restore-entrypoint: it optionally prints GPU diagnostics,
// configures logging, builds the restore request from the command-line
// arguments, and runs restore.Run under a context that is cancelled on
// SIGTERM or SIGINT.
func main() {
	// Diagnostics come before anything else and only when DEBUG=1.
	if os.Getenv("DEBUG") == "1" {
		logGPUDiagnostics("PRE-RESTORE")
	}

	logger := logrus.New()
	logger.SetOutput(os.Stdout)
	logger.SetFormatter(&logrus.TextFormatter{
		FullTimestamp:   true,
		TimestampFormat: "2006-01-02 15:04:05",
	})

	// os.Args[1:] are the cold start command args (passed by the operator via pod spec).
	req, err := restore.NewRestoreRequest(os.Args[1:])
	if err != nil {
		logger.WithError(err).Fatal("Failed to load restore configuration")
	}

	// Verbosity follows the request's Debug flag.
	level := logrus.InfoLevel
	if req.Debug {
		level = logrus.DebugLevel
	}
	logger.SetLevel(level)

	entry := logger.WithField("component", "restore-entrypoint")

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	// Cancel the context on the first shutdown signal received.
	signals := make(chan os.Signal, 1)
	signal.Notify(signals, syscall.SIGTERM, syscall.SIGINT)
	go func() {
		received := <-signals
		entry.WithField("signal", received).Info("Received shutdown signal")
		cancel()
	}()

	if runErr := restore.Run(ctx, req, entry); runErr != nil {
		entry.WithError(runErr).Fatal("Restore entrypoint failed")
	}
}
......@@ -3,15 +3,23 @@ module github.com/ai-dynamo/dynamo/deploy/chrek
go 1.25.0
require (
github.com/checkpoint-restore/go-criu/v7 v7.2.0
github.com/checkpoint-restore/go-criu/v8 v8.2.0
github.com/containerd/containerd v1.7.30
github.com/cyphar/filepath-securejoin v0.5.1
github.com/go-logr/logr v1.4.3
github.com/go-logr/zapr v1.3.0
github.com/moby/sys/mountinfo v0.7.1
github.com/opencontainers/runtime-spec v1.2.0
github.com/sirupsen/logrus v1.9.4
github.com/prometheus/procfs v0.16.1
go.uber.org/zap v1.27.1
golang.org/x/sys v0.40.0
google.golang.org/grpc v1.72.2
google.golang.org/protobuf v1.36.11
gopkg.in/yaml.v3 v3.0.1
k8s.io/api v0.35.0
k8s.io/apimachinery v0.35.0
k8s.io/client-go v0.35.0
k8s.io/kubelet v0.35.0
)
require (
......@@ -28,30 +36,26 @@ require (
github.com/containerd/platforms v0.2.1 // indirect
github.com/containerd/ttrpc v1.2.7 // indirect
github.com/containerd/typeurl/v2 v2.1.1 // indirect
github.com/cyphar/filepath-securejoin v0.5.1 // indirect
github.com/davecgh/go-spew v1.1.1 // indirect
github.com/distribution/reference v0.6.0 // indirect
github.com/docker/go-events v0.0.0-20190806004212-e31b211e4f1c // indirect
github.com/emicklei/go-restful/v3 v3.12.2 // indirect
github.com/felixge/httpsnoop v1.0.3 // indirect
github.com/felixge/httpsnoop v1.0.4 // indirect
github.com/fxamacker/cbor/v2 v2.9.0 // indirect
github.com/go-logr/logr v1.4.3 // indirect
github.com/go-logr/stdr v1.2.2 // indirect
github.com/go-openapi/jsonpointer v0.21.0 // indirect
github.com/go-openapi/jsonreference v0.20.2 // indirect
github.com/go-openapi/swag v0.23.0 // indirect
github.com/gogo/protobuf v1.3.2 // indirect
github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect
github.com/golang/protobuf v1.5.4 // indirect
github.com/google/gnostic-models v0.7.0 // indirect
github.com/google/go-cmp v0.7.0 // indirect
github.com/google/uuid v1.6.0 // indirect
github.com/josharian/intern v1.0.0 // indirect
github.com/json-iterator/go v1.1.12 // indirect
github.com/klauspost/compress v1.16.7 // indirect
github.com/klauspost/compress v1.18.0 // indirect
github.com/mailru/easyjson v0.7.7 // indirect
github.com/moby/locker v1.0.1 // indirect
github.com/moby/sys/mountinfo v0.7.1 // indirect
github.com/moby/sys/sequential v0.5.0 // indirect
github.com/moby/sys/signal v0.7.0 // indirect
github.com/moby/sys/user v0.3.0 // indirect
......@@ -64,13 +68,15 @@ require (
github.com/opencontainers/selinux v1.13.1 // indirect
github.com/pkg/errors v0.9.1 // indirect
github.com/pmezard/go-difflib v1.0.0 // indirect
github.com/spf13/pflag v1.0.9 // indirect
github.com/sirupsen/logrus v1.9.3 // indirect
github.com/x448/float16 v0.8.4 // indirect
go.opencensus.io v0.24.0 // indirect
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.45.0 // indirect
go.opentelemetry.io/otel v1.21.0 // indirect
go.opentelemetry.io/otel/metric v1.21.0 // indirect
go.opentelemetry.io/otel/trace v1.21.0 // indirect
go.opentelemetry.io/auto/sdk v1.1.0 // indirect
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.61.0 // indirect
go.opentelemetry.io/otel v1.36.0 // indirect
go.opentelemetry.io/otel/metric v1.36.0 // indirect
go.opentelemetry.io/otel/trace v1.36.0 // indirect
go.uber.org/multierr v1.11.0 // indirect
go.yaml.in/yaml/v2 v2.4.3 // indirect
go.yaml.in/yaml/v3 v3.0.4 // indirect
golang.org/x/net v0.47.0 // indirect
......@@ -80,16 +86,14 @@ require (
golang.org/x/text v0.31.0 // indirect
golang.org/x/time v0.12.0 // indirect
google.golang.org/genproto v0.0.0-20231211222908-989df2bf70f3 // indirect
google.golang.org/genproto/googleapis/rpc v0.0.0-20240401170217-c3f982113cda // indirect
google.golang.org/grpc v1.59.0 // indirect
google.golang.org/genproto/googleapis/rpc v0.0.0-20250528174236-200df99c418a // indirect
gopkg.in/evanphx/json-patch.v4 v4.13.0 // indirect
gopkg.in/inf.v0 v0.9.1 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect
k8s.io/klog/v2 v2.130.1 // indirect
k8s.io/kube-openapi v0.0.0-20250910181357-589584f1c912 // indirect
k8s.io/utils v0.0.0-20251002143259-bc988d571ff4 // indirect
sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730 // indirect
sigs.k8s.io/randfill v1.0.0 // indirect
sigs.k8s.io/structured-merge-diff/v6 v6.3.0 // indirect
sigs.k8s.io/structured-merge-diff/v6 v6.3.2-0.20260122202528-d9cc6641c482 // indirect
sigs.k8s.io/yaml v1.6.0 // indirect
)
......@@ -11,8 +11,8 @@ github.com/Microsoft/go-winio v0.6.2/go.mod h1:yd8OoFMLzJbo9gZq8j5qaps8bJ9aShtEA
github.com/Microsoft/hcsshim v0.11.7 h1:vl/nj3Bar/CvJSYo7gIQPyRWc9f3c6IeSNavBTSZNZQ=
github.com/Microsoft/hcsshim v0.11.7/go.mod h1:MV8xMfmECjl5HdO7U/3/hFVnkmSBjAjmA09d4bExKcU=
github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU=
github.com/checkpoint-restore/go-criu/v7 v7.2.0 h1:qGiWA4App1gGlEfIJ68WR9jbezV9J7yZdjzglezcqKo=
github.com/checkpoint-restore/go-criu/v7 v7.2.0/go.mod h1:u0LCWLg0w4yqqu14aXhiB4YD3a1qd8EcCEg7vda5dwo=
github.com/checkpoint-restore/go-criu/v8 v8.2.0 h1:dsgMgj/eJtZNKn3qn/+Ri0b4bd0uo6o2zt1yd8Nj2NI=
github.com/checkpoint-restore/go-criu/v8 v8.2.0/go.mod h1:HVKJ1dK+bowJcFI1MtdL2ECIuY+/AtRMHzD9Lqa4uA4=
github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDkc90ppPyw=
github.com/cncf/udpa/go v0.0.0-20191209042840-269d4d468f6f/go.mod h1:M8M6+tZqaGXZJjfX53e64911xZQV5JYwmTeXPW+k8Sc=
github.com/containerd/cgroups v1.1.0 h1:v8rEWFl6EoqHB+swVNjVoCJE8o3jX7e8nqBGPLaDFBM=
......@@ -51,8 +51,8 @@ github.com/envoyproxy/go-control-plane v0.9.0/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymF
github.com/envoyproxy/go-control-plane v0.9.1-0.20191026205805-5f8ba28d4473/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4=
github.com/envoyproxy/go-control-plane v0.9.4/go.mod h1:6rpuAdCZL397s3pYoYcLgu1mIlRU8Am5FuJP05cCM98=
github.com/envoyproxy/protoc-gen-validate v0.1.0/go.mod h1:iSmxcyjqTsJpI2R4NaDN7+kN2VEUnK/pcBlmesArF7c=
github.com/felixge/httpsnoop v1.0.3 h1:s/nj+GCswXYzN5v2DpNMuMQYe+0DDwt5WVCU6CWBdXk=
github.com/felixge/httpsnoop v1.0.3/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U=
github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg=
github.com/felixge/httpsnoop v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U=
github.com/fxamacker/cbor/v2 v2.9.0 h1:NpKPmjDBgUfBms6tr6JZkTHtfFGcMKsw3eGcmD/sapM=
github.com/fxamacker/cbor/v2 v2.9.0/go.mod h1:vM4b+DJCtHn+zz7h3FFp/hDAI9WNWCsZj23V5ytsSxQ=
github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A=
......@@ -60,6 +60,8 @@ github.com/go-logr/logr v1.4.3 h1:CjnDlHq8ikf6E492q6eKboGOC0T8CDaOvkHCIg8idEI=
github.com/go-logr/logr v1.4.3/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY=
github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag=
github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE=
github.com/go-logr/zapr v1.3.0 h1:XGdV8XW8zdwFiwOA2Dryh1gj2KRQyOOoNmBy4EplIcQ=
github.com/go-logr/zapr v1.3.0/go.mod h1:YKepepNBd1u/oyhd/yQmtjVXmm9uML4IXUgMOwR8/Gg=
github.com/go-openapi/jsonpointer v0.19.6/go.mod h1:osyAmYz/mB/C3I+WsTTSgw1ONzaLJoLCyoi6/zppojs=
github.com/go-openapi/jsonpointer v0.21.0 h1:YgdVicSA9vH5RiHs9TZW5oyafXZFc6+2Vc1rr/O9oNQ=
github.com/go-openapi/jsonpointer v0.21.0/go.mod h1:IUyH9l/+uyhIYQ/PXVA41Rexl+kOkAPDdXEYns6fzUY=
......@@ -112,8 +114,8 @@ github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnr
github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo=
github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8=
github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck=
github.com/klauspost/compress v1.16.7 h1:2mk3MPGNzKyxErAw8YaohYh69+pa4sIQSC0fPGCFR9I=
github.com/klauspost/compress v1.16.7/go.mod h1:ntbaceVETuRiXiv4DpjP66DpAtAGkEQskQzEyD//IeE=
github.com/klauspost/compress v1.18.0 h1:c/Cqfb0r+Yi+JtIEq73FWXVkRonBlf0CRNYc8Zttxdo=
github.com/klauspost/compress v1.18.0/go.mod h1:2Pp+KzxcywXVXMr50+X0Q/Lsb43OQHYWRCY2AiWywWQ=
github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI=
github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE=
github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk=
......@@ -160,12 +162,12 @@ github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINE
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA=
github.com/prometheus/procfs v0.10.1 h1:kYK1Va/YMlutzCGazswoHKo//tZVlFpKYh+PymziUAg=
github.com/prometheus/procfs v0.10.1/go.mod h1:nwNm2aOCAYw8uTR/9bWRREkZFxAUcWzPHWJq+XBB/FM=
github.com/prometheus/procfs v0.16.1 h1:hZ15bTNuirocR6u0JZ6BAHHmwS1p8B4P6MRqxtzMyRg=
github.com/prometheus/procfs v0.16.1/go.mod h1:teAbpZRB1iIAJYREa1LsoWUXykVXA1KlTmWl8x/U+Is=
github.com/rogpeppe/go-internal v1.14.1 h1:UQB4HGPB6osV0SQTLymcB4TgvyWu6ZyliaW0tI/otEQ=
github.com/rogpeppe/go-internal v1.14.1/go.mod h1:MaRKkUm5W0goXpeCfT7UZI6fk/L7L7so1lCWt35ZSgc=
github.com/sirupsen/logrus v1.9.4 h1:TsZE7l11zFCLZnZ+teH4Umoq5BhEIfIzfRDZ1Uzql2w=
github.com/sirupsen/logrus v1.9.4/go.mod h1:ftWc9WdOfJ0a92nsE2jF5u5ZwH8Bv2zdeOC42RjbV2g=
github.com/sirupsen/logrus v1.9.3 h1:dueUQJ1C2q9oE3F7wvmSGAaVtTmUizReu6fjN8uqzbQ=
github.com/sirupsen/logrus v1.9.3/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ=
github.com/spf13/pflag v1.0.9 h1:9exaQaMOCwffKiiiYk6/BndUBv+iRViNW+4lEMi0PvY=
github.com/spf13/pflag v1.0.9/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
......@@ -174,6 +176,7 @@ github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpE
github.com/stretchr/objx v0.5.2 h1:xuMeJ0Sdp5ZMRXx/aWO6RZxdr3beISkG5/G/aIRr3pY=
github.com/stretchr/objx v0.5.2/go.mod h1:FRsXN1f5AsAjCGJKqEizvkpNtU+EGNCLh3NxZ/8L+MA=
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU=
github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
......@@ -185,16 +188,26 @@ github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9de
github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
go.opencensus.io v0.24.0 h1:y73uSU6J157QMP2kn2r30vwW1A2W2WFwSCGnAVxeaD0=
go.opencensus.io v0.24.0/go.mod h1:vNK8G9p7aAivkbmorf4v+7Hgx+Zs0yY+0fOtgBfjQKo=
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.45.0 h1:x8Z78aZx8cOF0+Kkazoc7lwUNMGy0LrzEMxTm4BbTxg=
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.45.0/go.mod h1:62CPTSry9QZtOaSsE3tOzhx6LzDhHnXJ6xHeMNNiM6Q=
go.opentelemetry.io/otel v1.21.0 h1:hzLeKBZEL7Okw2mGzZ0cc4k/A7Fta0uoPgaJCr8fsFc=
go.opentelemetry.io/otel v1.21.0/go.mod h1:QZzNPQPm1zLX4gZK4cMi+71eaorMSGT3A4znnUvNNEo=
go.opentelemetry.io/otel/metric v1.21.0 h1:tlYWfeo+Bocx5kLEloTjbcDwBuELRrIFxwdQ36PlJu4=
go.opentelemetry.io/otel/metric v1.21.0/go.mod h1:o1p3CA8nNHW8j5yuQLdc1eeqEaPfzug24uvsyIEJRWM=
go.opentelemetry.io/otel/trace v1.21.0 h1:WD9i5gzvoUPuXIXH24ZNBudiarZDKuekPqi/E8fpfLc=
go.opentelemetry.io/otel/trace v1.21.0/go.mod h1:LGbsEB0f9LGjN+OZaQQ26sohbOmiMR+BaslueVtS/qQ=
go.opentelemetry.io/auto/sdk v1.1.0 h1:cH53jehLUN6UFLY71z+NDOiNJqDdPRaXzTel0sJySYA=
go.opentelemetry.io/auto/sdk v1.1.0/go.mod h1:3wSPjt5PWp2RhlCcmmOial7AvC4DQqZb7a7wCow3W8A=
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.61.0 h1:F7Jx+6hwnZ41NSFTO5q4LYDtJRXBf2PD0rNBkeB/lus=
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.61.0/go.mod h1:UHB22Z8QsdRDrnAtX4PntOl36ajSxcdUMt1sF7Y6E7Q=
go.opentelemetry.io/otel v1.36.0 h1:UumtzIklRBY6cI/lllNZlALOF5nNIzJVb16APdvgTXg=
go.opentelemetry.io/otel v1.36.0/go.mod h1:/TcFMXYjyRNh8khOAO9ybYkqaDBb/70aVwkNML4pP8E=
go.opentelemetry.io/otel/metric v1.36.0 h1:MoWPKVhQvJ+eeXWHFBOPoBOi20jh6Iq2CcCREuTYufE=
go.opentelemetry.io/otel/metric v1.36.0/go.mod h1:zC7Ks+yeyJt4xig9DEw9kuUFe5C3zLbVjV2PzT6qzbs=
go.opentelemetry.io/otel/sdk v1.36.0 h1:b6SYIuLRs88ztox4EyrvRti80uXIFy+Sqzoh9kFULbs=
go.opentelemetry.io/otel/sdk v1.36.0/go.mod h1:+lC+mTgD+MUWfjJubi2vvXWcVxyr9rmlshZni72pXeY=
go.opentelemetry.io/otel/sdk/metric v1.36.0 h1:r0ntwwGosWGaa0CrSt8cuNuTcccMXERFwHX4dThiPis=
go.opentelemetry.io/otel/sdk/metric v1.36.0/go.mod h1:qTNOhFDfKRwX0yXOqJYegL5WRaW376QbB7P4Pb0qva4=
go.opentelemetry.io/otel/trace v1.36.0 h1:ahxWNuqZjpdiFAyrIoQ4GIiAIhxAunQR6MUoKrsNd4w=
go.opentelemetry.io/otel/trace v1.36.0/go.mod h1:gQ+OnDZzrybY4k4seLzPAWNwVBBVlF2szhehOBB/tGA=
go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto=
go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE=
go.uber.org/multierr v1.11.0 h1:blXXJkSxSSfBVBlC76pxqeO+LN3aDfLQo+309xJstO0=
go.uber.org/multierr v1.11.0/go.mod h1:20+QtiLqy0Nd6FdQB9TLXag12DsQkrbs3htMFfDN80Y=
go.uber.org/zap v1.27.1 h1:08RqriUEv8+ArZRYSTXy1LeBScaMpVSTBhCeaZYfMYc=
go.uber.org/zap v1.27.1/go.mod h1:GB2qFLM7cTU87MWRP2mPIjqfIDnGu+VIO4V/SdhGo2E=
go.yaml.in/yaml/v2 v2.4.3 h1:6gvOSjQoTB3vt1l+CU+tSyi/HOjfOjRLJ4YwYZGwRO0=
go.yaml.in/yaml/v2 v2.4.3/go.mod h1:zSxWcmIDjOzPXpjlTTbAsKokqkDNAVtZO0WOMiT90s8=
go.yaml.in/yaml/v3 v3.0.4 h1:tfq32ie2Jv2UxXFdLJdh3jXuOzWiL1fo0bu/FbuKpbc=
......@@ -237,6 +250,7 @@ golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7w
golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20211025201205-69cdffdb9359/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.40.0 h1:DBZZqJ2Rkml6QMQsZywtnjnnGvHza6BTfYFWY9kjEWQ=
golang.org/x/sys v0.40.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks=
golang.org/x/term v0.37.0 h1:8EGAD0qCmHYZg6J17DvsMy9/wJ7/D/4pV/wfnld5lTU=
......@@ -268,15 +282,15 @@ google.golang.org/genproto v0.0.0-20190819201941-24fa4b261c55/go.mod h1:DMBHOl98
google.golang.org/genproto v0.0.0-20200526211855-cb27e3aa2013/go.mod h1:NbSheEEYHJ7i3ixzK3sjbqSGDJWnxyFXZblF3eUsNvo=
google.golang.org/genproto v0.0.0-20231211222908-989df2bf70f3 h1:1hfbdAfFbkmpg41000wDVqr7jUpK/Yo+LPnIxxGzmkg=
google.golang.org/genproto v0.0.0-20231211222908-989df2bf70f3/go.mod h1:5RBcpGRxr25RbDzY5w+dmaqpSEvl8Gwl1x2CICf60ic=
google.golang.org/genproto/googleapis/rpc v0.0.0-20240401170217-c3f982113cda h1:LI5DOvAxUPMv/50agcLLoo+AdWc1irS9Rzz4vPuD1V4=
google.golang.org/genproto/googleapis/rpc v0.0.0-20240401170217-c3f982113cda/go.mod h1:WtryC6hu0hhx87FDGxWCDptyssuo68sk10vYjF+T9fY=
google.golang.org/genproto/googleapis/rpc v0.0.0-20250528174236-200df99c418a h1:v2PbRU4K3llS09c7zodFpNePeamkAwG3mPrAery9VeE=
google.golang.org/genproto/googleapis/rpc v0.0.0-20250528174236-200df99c418a/go.mod h1:qQ0YXyHHx3XkvlzUtpXDkS29lDSafHMZBAZDc03LQ3A=
google.golang.org/grpc v1.19.0/go.mod h1:mqu4LbDTu4XGKhr4mRzUsmM4RtVoemTSY81AxZiDr8c=
google.golang.org/grpc v1.23.0/go.mod h1:Y5yQAOtifL1yxbo5wqy6BxZv8vAUGQwXBOALyacEbxg=
google.golang.org/grpc v1.25.1/go.mod h1:c3i+UQWmh7LiEpx4sFZnkU36qjEYZ0imhYfXVyQciAY=
google.golang.org/grpc v1.27.0/go.mod h1:qbnxyOmOxrQa7FizSgH+ReBfzJrCY1pSN7KXBS8abTk=
google.golang.org/grpc v1.33.2/go.mod h1:JMHMWHQWaTccqQQlmk3MJZS+GWXOdAesneDmEnv2fbc=
google.golang.org/grpc v1.59.0 h1:Z5Iec2pjwb+LEOqzpB2MR12/eKFhDPhuqW91O+4bwUk=
google.golang.org/grpc v1.59.0/go.mod h1:aUPDwccQo6OTjy7Hct4AfBPD1GptF4fyUjIkQ9YtF98=
google.golang.org/grpc v1.72.2 h1:TdbGzwb82ty4OusHWepvFWGLgIbNo1/SUynEN0ssqv8=
google.golang.org/grpc v1.72.2/go.mod h1:wH5Aktxcg25y1I3w7H69nHfXdOG3UiadoBtjh3izSDM=
google.golang.org/protobuf v0.0.0-20200109180630-ec00e32a8dfd/go.mod h1:DFci5gLYBciE7Vtevhsrf46CRTquxDuWsQurQQe4oz8=
google.golang.org/protobuf v0.0.0-20200221191635-4d8936d0db64/go.mod h1:kwYJMbMJ01Woi6D6+Kah6886xMZcty6N08ah7+eCXa0=
google.golang.org/protobuf v0.0.0-20200228230310-ab0ca4ff8a60/go.mod h1:cfTl7dwQJ+fmap5saPgwCLgHXTUD7jkjRqWcaiX5VyM=
......@@ -312,13 +326,15 @@ k8s.io/klog/v2 v2.130.1 h1:n9Xl7H1Xvksem4KFG4PYbdQCQxqc/tTUyrgXaOhHSzk=
k8s.io/klog/v2 v2.130.1/go.mod h1:3Jpz1GvMt720eyJH1ckRHK1EDfpxISzJ7I9OYgaDtPE=
k8s.io/kube-openapi v0.0.0-20250910181357-589584f1c912 h1:Y3gxNAuB0OBLImH611+UDZcmKS3g6CthxToOb37KgwE=
k8s.io/kube-openapi v0.0.0-20250910181357-589584f1c912/go.mod h1:kdmbQkyfwUagLfXIad1y2TdrjPFWp2Q89B3qkRwf/pQ=
k8s.io/kubelet v0.35.0 h1:8cgJHCBCKLYuuQ7/Pxb/qWbJfX1LXIw7790ce9xHq7c=
k8s.io/kubelet v0.35.0/go.mod h1:ciRzAXn7C4z5iB7FhG1L2CGPPXLTVCABDlbXt/Zz8YA=
k8s.io/utils v0.0.0-20251002143259-bc988d571ff4 h1:SjGebBtkBqHFOli+05xYbK8YF1Dzkbzn+gDM4X9T4Ck=
k8s.io/utils v0.0.0-20251002143259-bc988d571ff4/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0=
sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730 h1:IpInykpT6ceI+QxKBbEflcR5EXP7sU1kvOlxwZh5txg=
sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730/go.mod h1:mdzfpAEoE6DHQEN0uh9ZbOCuHbLK5wOm7dK4ctXE9Tg=
sigs.k8s.io/randfill v1.0.0 h1:JfjMILfT8A6RbawdsK2JXGBR5AQVfd+9TbzrlneTyrU=
sigs.k8s.io/randfill v1.0.0/go.mod h1:XeLlZ/jmk4i1HRopwe7/aU3H5n1zNUcX6TM94b3QxOY=
sigs.k8s.io/structured-merge-diff/v6 v6.3.0 h1:jTijUJbW353oVOd9oTlifJqOGEkUw2jB/fXCbTiQEco=
sigs.k8s.io/structured-merge-diff/v6 v6.3.0/go.mod h1:M3W8sfWvn2HhQDIbGWj3S099YozAsymCo/wrT5ohRUE=
sigs.k8s.io/structured-merge-diff/v6 v6.3.2-0.20260122202528-d9cc6641c482 h1:2WOzJpHUBVrrkDjU4KBT8n5LDcj824eX0I5UKcgeRUs=
sigs.k8s.io/structured-merge-diff/v6 v6.3.2-0.20260122202528-d9cc6641c482/go.mod h1:M3W8sfWvn2HhQDIbGWj3S099YozAsymCo/wrT5ohRUE=
sigs.k8s.io/yaml v1.6.0 h1:G8fkbMSAFqgEFgh4b1wmtzDnioxFCUgTZhlbj5P9QYs=
sigs.k8s.io/yaml v1.6.0/go.mod h1:796bPqUfzR/0jLAl6XjHl3Ck7MiyVv8dbTdyT3/pMf4=
// Package checkpoint provides CRIU checkpoint (dump) operations.
package checkpoint
import (
"context"
"fmt"
"os"
"path/filepath"
"time"
criurpc "github.com/checkpoint-restore/go-criu/v7/rpc"
specs "github.com/opencontainers/runtime-spec/specs-go"
"github.com/sirupsen/logrus"
"google.golang.org/protobuf/proto"
"github.com/ai-dynamo/dynamo/deploy/chrek/pkg/common"
)
// ContainerInfoSnapshot holds runtime/container info needed for checkpointing.
// It is filled in by Checkpointer.introspect and then read when building the
// CRIU dump options, the checkpoint manifest, and the post-dump captures.
type ContainerInfoSnapshot struct {
// PID is the process ID returned by DiscoveryClient.ResolveContainer.
PID int
// RootFS is the value returned by GetRootFS for PID.
RootFS string
// UpperDir is the value returned by GetOverlayUpperDir for PID.
UpperDir string
// OCISpec is the OCI runtime spec returned by DiscoveryClient.ResolveContainer.
OCISpec *specs.Spec
// MountInfo is the result of ReadMountInfoFromHostProcPath for PID.
MountInfo []MountInfo
// Namespaces is the result of GetAllNamespaces for PID, keyed by namespace type.
Namespaces map[NamespaceType]*NamespaceInfo
}
// CheckpointManifest is saved as manifest.yaml at checkpoint time and loaded at restore.
// It is assembled by NewCheckpointManifest from the per-module manifest sections.
type CheckpointManifest struct {
// CheckpointID identifies the checkpoint this manifest belongs to.
CheckpointID string `yaml:"checkpointId"`
// CreatedAt is the UTC time at which NewCheckpointManifest built the manifest.
CreatedAt time.Time `yaml:"createdAt"`
// CRIUDump is the CRIU section produced by NewCRIUDumpManifest.
CRIUDump CRIUDumpManifest `yaml:"criuDump"`
// K8s is the source pod section produced by NewSourcePodManifest.
K8s SourcePodManifest `yaml:"k8s"`
// Filesystem is the filesystem section produced by NewFilesystemManifest.
Filesystem FilesystemManifest `yaml:"filesystem"`
// Namespaces holds the entries produced by NewNamespaceManifestEntries.
Namespaces []NamespaceManifestEntry `yaml:"namespaces"`
}
// NewCheckpointManifest combines the sections produced by the per-module
// builders into a single CheckpointManifest and stamps it with the current
// UTC time.
func NewCheckpointManifest(
	checkpointID string,
	criuDump CRIUDumpManifest,
	k8s SourcePodManifest,
	filesystem FilesystemManifest,
	namespaces []NamespaceManifestEntry,
) *CheckpointManifest {
	assembled := new(CheckpointManifest)
	assembled.CheckpointID = checkpointID
	assembled.CreatedAt = time.Now().UTC()
	assembled.CRIUDump = criuDump
	assembled.K8s = k8s
	assembled.Filesystem = filesystem
	assembled.Namespaces = namespaces
	return assembled
}
// CheckpointRequest holds per-checkpoint identifiers for a checkpoint operation.
// The whole request is also handed to NewSourcePodManifest when the manifest is built.
type CheckpointRequest struct {
// ContainerID is the container to checkpoint; it is resolved through
// DiscoveryClient.ResolveContainer.
ContainerID string
ContainerName string // K8s container name (for K8s API volume type lookup)
// CheckpointID names the checkpoint; it is also the name of the
// per-checkpoint subdirectory created under CheckpointDir.
CheckpointID string
// CheckpointDir is the base directory under which the per-checkpoint
// directory (CheckpointDir/CheckpointID) is created.
CheckpointDir string
// NodeName, PodName and PodNamespace describe the source pod.
// NOTE(review): presumably recorded by NewSourcePodManifest — confirm.
NodeName string
PodName string
PodNamespace string
}
// CheckpointOutcome contains the result of a checkpoint operation.
type CheckpointOutcome struct {
// CheckpointID is copied from the request.
CheckpointID string
// CheckpointDir is the per-checkpoint directory, i.e. the request's
// CheckpointDir joined with its CheckpointID.
CheckpointDir string
// Data is the checkpoint manifest built during the configure phase.
Data *CheckpointManifest
}
// Checkpointer performs CRIU checkpoint operations
type Checkpointer struct {
// discoveryClient resolves a container ID to its PID and OCI spec.
discoveryClient *DiscoveryClient
// log is the logrus entry used for all checkpoint logging; NewCheckpointer
// tags it with component=checkpointer.
log *logrus.Entry
}
// NewCheckpointer returns a Checkpointer that resolves containers through
// discoveryClient and logs through a logrus entry tagged with
// component=checkpointer.
func NewCheckpointer(discoveryClient *DiscoveryClient) *Checkpointer {
	checkpointer := new(Checkpointer)
	checkpointer.discoveryClient = discoveryClient
	checkpointer.log = logrus.WithField("component", "checkpointer")
	return checkpointer
}
// Checkpoint performs a CRIU dump of the container identified by req.
//
// The work is split into three phases: introspect (gather runtime state),
// configure (build CRIU options and write the manifest), and capture (CRIU
// dump plus post-dump captures). All artifacts are written to
// req.CheckpointDir/req.CheckpointID, which is created if missing.
//
// It returns an error when spec is nil, when the checkpoint directory cannot
// be created or opened, or when any phase fails.
func (c *Checkpointer) Checkpoint(ctx context.Context, req CheckpointRequest, spec *CheckpointSpec) (*CheckpointOutcome, error) {
	if spec == nil {
		return nil, fmt.Errorf("checkpoint spec is required")
	}

	startedAt := time.Now()
	c.log.Info("=== Starting checkpoint operation ===")

	// Every checkpoint lives in its own subdirectory of the base directory.
	targetDir := filepath.Join(req.CheckpointDir, req.CheckpointID)
	mkdirErr := os.MkdirAll(targetDir, 0700)
	if mkdirErr != nil {
		return nil, fmt.Errorf("failed to create checkpoint directory: %w", mkdirErr)
	}

	// The image directory FD must stay open through both the configure and
	// capture phases because CRIU's swrk child process inherits it.
	imagesHandle, imagesFD, openErr := common.OpenPathForCRIU(targetDir)
	if openErr != nil {
		return nil, fmt.Errorf("failed to open image directory: %w", openErr)
	}
	defer imagesHandle.Close()

	// Phase 1: introspect container state.
	snapshot, err := c.introspect(ctx, req.ContainerID)
	if err != nil {
		return nil, err
	}

	// Phase 2: configure CRIU options and build the checkpoint manifest.
	dumpOpts, manifest, err := c.configure(snapshot, req, spec, targetDir, imagesFD)
	if err != nil {
		return nil, err
	}

	// Phase 3: capture — CRIU dump, /dev/shm, rootfs diff.
	dumpElapsed, err := c.capture(dumpOpts, manifest, snapshot, targetDir)
	if err != nil {
		return nil, err
	}

	timings := logrus.Fields{
		"total_duration":     time.Since(startedAt),
		"criu_dump_duration": dumpElapsed,
	}
	c.log.WithFields(timings).Info("=== Checkpoint operation completed ===")

	outcome := &CheckpointOutcome{
		CheckpointID:  req.CheckpointID,
		CheckpointDir: targetDir,
		Data:          manifest,
	}
	return outcome, nil
}
// introspect resolves containerID to a PID and OCI spec, then collects the
// rootfs, overlay upperdir, mount info and namespaces for that PID. The first
// lookup that fails aborts the call with a wrapped error.
func (c *Checkpointer) introspect(ctx context.Context, containerID string) (*ContainerInfoSnapshot, error) {
	containerPID, runtimeSpec, err := c.discoveryClient.ResolveContainer(ctx, containerID)
	if err != nil {
		return nil, fmt.Errorf("failed to resolve container: %w", err)
	}

	rootPath, err := GetRootFS(containerPID)
	if err != nil {
		return nil, fmt.Errorf("failed to get rootfs: %w", err)
	}

	overlayUpper, err := GetOverlayUpperDir(containerPID)
	if err != nil {
		return nil, fmt.Errorf("failed to get overlay upperdir: %w", err)
	}

	mounts, err := ReadMountInfoFromHostProcPath(containerPID)
	if err != nil {
		return nil, fmt.Errorf("failed to parse mountinfo: %w", err)
	}

	nsByType, err := GetAllNamespaces(containerPID)
	if err != nil {
		return nil, fmt.Errorf("failed to get namespaces: %w", err)
	}

	snapshot := new(ContainerInfoSnapshot)
	snapshot.PID = containerPID
	snapshot.RootFS = rootPath
	snapshot.UpperDir = overlayUpper
	snapshot.OCISpec = runtimeSpec
	snapshot.MountInfo = mounts
	snapshot.Namespaces = nsByType
	return snapshot, nil
}
// configure turns the runtime snapshot and the static spec into the inputs of
// the dump: the CRIU RPC options and the checkpoint manifest.
//
// As side effects it writes the CRIU config file (for options that cannot be
// passed via RPC) and the checkpoint manifest into checkpointDir. The returned
// options reference the written config file through ConfigFile.
func (c *Checkpointer) configure(
	state *ContainerInfoSnapshot,
	req CheckpointRequest,
	spec *CheckpointSpec,
	checkpointDir string,
	imageDirFD int32,
) (*criurpc.CriuOpts, *CheckpointManifest, error) {
	dumpOpts, err := BuildCRIUDumpOptions(
		&spec.CRIU,
		state.PID,
		imageDirFD,
		state.RootFS,
		state.MountInfo,
		state.OCISpec,
		state.Namespaces,
	)
	if err != nil {
		return nil, nil, err
	}

	// Options unavailable via RPC go into a config file next to the images.
	confPath := filepath.Join(checkpointDir, CheckpointCRIUConfFilename)
	confBody := []byte(spec.CRIU.GenerateCRIUConfContent())
	if writeErr := os.WriteFile(confPath, confBody, 0644); writeErr != nil {
		return nil, nil, fmt.Errorf("failed to write CRIU config file: %w", writeErr)
	}
	dumpOpts.ConfigFile = proto.String(confPath)

	// Build each manifest section, then assemble and persist the manifest.
	criuSection := NewCRIUDumpManifest(dumpOpts, spec.CRIU)
	podSection := NewSourcePodManifest(req, state.PID)
	fsSection := NewFilesystemManifest(spec.RootfsExclusions, state.UpperDir, state.OCISpec)
	nsSection := NewNamespaceManifestEntries(state.Namespaces)
	assembled := NewCheckpointManifest(req.CheckpointID, criuSection, podSection, fsSection, nsSection)

	if saveErr := WriteCheckpointManifest(checkpointDir, assembled); saveErr != nil {
		return nil, nil, fmt.Errorf("failed to write checkpoint manifest: %w", saveErr)
	}
	return dumpOpts, assembled, nil
}
// capture runs the CRIU dump and then the post-dump captures (/dev/shm and
// rootfs state). A dump failure aborts the call; a /dev/shm capture failure is
// only logged as a warning. The returned duration is the CRIU dump time, used
// for timing reporting.
func (c *Checkpointer) capture(
	criuOpts *criurpc.CriuOpts,
	data *CheckpointManifest,
	state *ContainerInfoSnapshot,
	checkpointDir string,
) (time.Duration, error) {
	dumpElapsed, dumpErr := ExecuteCRIUDump(criuOpts, checkpointDir, c.log)
	if dumpErr != nil {
		return 0, dumpErr
	}

	// /dev/shm must be captured after the dump so it reflects the final
	// process state; failure here is best-effort.
	shmErr := CaptureDevShm(state.PID, checkpointDir, c.log)
	if shmErr != nil {
		c.log.WithError(shmErr).Warn("Failed to capture /dev/shm contents")
	}

	// Rootfs diff and deleted files.
	CaptureRootfsState(state.UpperDir, checkpointDir, data, c.log)

	return dumpElapsed, nil
}
// config.go defines the static checkpoint spec loaded from ConfigMap YAML.
package checkpoint
import "fmt"
// CheckpointSpec is the static checkpoint spec loaded from ConfigMap YAML.
type CheckpointSpec struct {
// BasePath is the base directory for checkpoint storage (PVC mount point).
BasePath string `yaml:"basePath"`
// CRIU options for dump operations. These settings feed the CRIU RPC options
// and also produce the criu.conf content written at checkpoint time.
CRIU CRIUSettings `yaml:"criu"`
// RootfsExclusions defines paths to exclude from rootfs diff capture.
// This is the only part of the spec checked by Validate.
RootfsExclusions FilesystemConfig `yaml:"rootfsExclusions"`
}
// Validate checks the CheckpointSpec by delegating to the validation of its
// RootfsExclusions; the result of that validation is returned unchanged.
func (c *CheckpointSpec) Validate() error {
	exclusionsResult := c.RootfsExclusions.Validate()
	return exclusionsResult
}
// ConfigError describes a configuration validation failure tied to one field.
type ConfigError struct {
	// Field names the configuration field that failed validation.
	Field string
	// Message explains what is wrong with the field.
	Message string
}

// Error implements the error interface. The message has the form
// "config error: <Field>: <Message>".
func (cfgErr *ConfigError) Error() string {
	// All operands are strings, so fmt.Sprint inserts no extra spaces.
	return fmt.Sprint("config error: ", cfgErr.Field, ": ", cfgErr.Message)
}
// constants.go defines shared constants used across checkpoint and restore packages.
package checkpoint
// Shared paths, file names and Kubernetes label keys for checkpoint artifacts.
const (
// HostProcPath is the mount point for the host's /proc in DaemonSet pods.
HostProcPath = "/host/proc"
// DevShmDirName is the directory name for captured /dev/shm contents.
DevShmDirName = "dev-shm"
// KubeLabelCheckpointSource is the pod label that triggers automatic checkpointing.
// Set by the operator on checkpoint-eligible pods.
KubeLabelCheckpointSource = "nvidia.com/checkpoint-source"
// KubeLabelCheckpointHash is the pod label specifying the checkpoint identity hash.
// Set by the operator on checkpoint-eligible pods.
KubeLabelCheckpointHash = "nvidia.com/checkpoint-hash"
// DumpLogFilename is the CRIU dump (checkpoint) log filename.
// BuildCRIUDumpOptions passes it to CRIU as the LogFile option.
DumpLogFilename = "dump.log"
// CheckpointCRIUConfFilename is the CRIU config file written at checkpoint time.
// Checkpointer.configure writes it into the checkpoint directory and points
// the CRIU ConfigFile option at it.
CheckpointCRIUConfFilename = "criu.conf"
// CheckpointDoneFilename is the marker file written to the checkpoint directory
// after all checkpoint artifacts are complete. Used to detect checkpoint readiness.
// Also hard-coded in vLLM for early-exit when checkpoint already exists.
CheckpointDoneFilename = "checkpoint.done"
// CheckpointManifestFilename is the name of the manifest file in checkpoint directories.
CheckpointManifestFilename = "manifest.yaml"
// DescriptorsFilename is the name of the file descriptors file.
DescriptorsFilename = "descriptors.yaml"
// RootfsDiffFilename is the name of the rootfs diff tar in checkpoint directories.
RootfsDiffFilename = "rootfs-diff.tar"
// DeletedFilesFilename is the name of the deleted files JSON in checkpoint directories.
DeletedFilesFilename = "deleted-files.json"
)
// criu provides CRIU-specific configuration and utilities for checkpoint operations.
package checkpoint
import (
"fmt"
"time"
criu "github.com/checkpoint-restore/go-criu/v7"
criurpc "github.com/checkpoint-restore/go-criu/v7/rpc"
specs "github.com/opencontainers/runtime-spec/specs-go"
"github.com/sirupsen/logrus"
"google.golang.org/protobuf/proto"
)
// CRIUSettings holds CRIU-specific configuration options.
//
// The options fall into two groups, depending on how they reach CRIU:
//   - RPC options: passed via the go-criu CriuOpts protobuf
//   - conf file options: written to criu.conf because they are NOT available
//     via RPC (see GenerateCRIUConfContent)
type CRIUSettings struct {
	// === RPC Options (passed via go-criu CriuOpts) ===

	// GhostLimit is the maximum ghost file size in bytes. Ghost files are
	// deleted-but-open files that CRIU needs to checkpoint. 512MB is
	// recommended for GPU workloads with large memory allocations.
	GhostLimit uint32 `yaml:"ghostLimit"`
	// Timeout is the CRIU operation timeout in seconds. 6 hours (21600s) is
	// recommended for large GPU model checkpoints.
	Timeout uint32 `yaml:"timeout"`
	// LogLevel is the CRIU logging verbosity (0-4).
	LogLevel int32 `yaml:"logLevel"`
	// WorkDir is the CRIU work directory for temporary files.
	WorkDir string `yaml:"workDir"`
	// AutoDedup enables auto-deduplication of memory pages.
	AutoDedup bool `yaml:"autoDedup"`
	// LazyPages enables lazy page migration (experimental).
	LazyPages bool `yaml:"lazyPages"`
	// LeaveRunning keeps the process running after checkpoint (dump only).
	LeaveRunning bool `yaml:"leaveRunning"`
	// ShellJob allows checkpointing session leaders (containers are often
	// session leaders).
	ShellJob bool `yaml:"shellJob"`
	// TcpClose closes TCP connections instead of preserving them (pod IPs
	// change on restore).
	TcpClose bool `yaml:"tcpClose"`
	// FileLocks allows checkpointing processes with file locks.
	FileLocks bool `yaml:"fileLocks"`
	// OrphanPtsMaster allows checkpointing containers with TTYs.
	OrphanPtsMaster bool `yaml:"orphanPtsMaster"`
	// ExtUnixSk allows external Unix sockets.
	ExtUnixSk bool `yaml:"extUnixSk"`
	// LinkRemap handles deleted-but-open files.
	LinkRemap bool `yaml:"linkRemap"`
	// ExtMasters allows external bind mount masters.
	ExtMasters bool `yaml:"extMasters"`
	// ManageCgroupsMode controls cgroup handling: "ignore" lets K8s manage
	// cgroups. The dump option builder recognizes "soft", "full" and
	// "strict"; any other value is treated as ignore.
	ManageCgroupsMode string `yaml:"manageCgroupsMode"`

	// === CRIU Conf File Options (NOT available via RPC - written to criu.conf) ===

	// LibDir is the path to the CRIU plugin directory (e.g.,
	// /usr/local/lib/criu). Required for CUDA checkpoint/restore.
	LibDir string `yaml:"libDir"`
	// AllowUprobes allows user-space probes (required for CUDA checkpoints).
	AllowUprobes bool `yaml:"allowUprobes"`
	// SkipInFlight skips in-flight TCP connections during checkpoint/restore.
	SkipInFlight bool `yaml:"skipInFlight"`
}

// GenerateCRIUConfContent renders the criu.conf text for the options that
// cannot be passed via RPC. Each enabled option becomes one newline-terminated
// line, in the order libdir, allow-uprobes, skip-in-flight; the result is the
// empty string when none of them is set.
func (c *CRIUSettings) GenerateCRIUConfContent() string {
	directives := make([]string, 0, 3)
	if c.LibDir != "" {
		directives = append(directives, "libdir "+c.LibDir)
	}
	if c.AllowUprobes {
		directives = append(directives, "allow-uprobes")
	}
	if c.SkipInFlight {
		directives = append(directives, "skip-in-flight")
	}

	rendered := ""
	for _, directive := range directives {
		rendered += directive + "\n"
	}
	return rendered
}
// ExternalMountManifestEntry is a serializable CRIU ext-mount entry in checkpoint manifests.
// NewCRIUDumpManifest creates one per non-nil CRIU ExtMnt option with a non-empty key.
type ExternalMountManifestEntry struct {
// Key is the key of the CRIU ext-mount option.
Key string `yaml:"key"`
// Val is the value of the CRIU ext-mount option.
Val string `yaml:"val"`
}
// CRIUDumpManifest stores the resolved dump-time CRIU mount plan used for restore.
type CRIUDumpManifest struct {
// CRIU is the CRIUSettings value the dump was configured with.
CRIU CRIUSettings `yaml:"criu"`
// ExtMnt lists the ext-mount key/value pairs taken from the dump options.
ExtMnt []ExternalMountManifestEntry `yaml:"extMnt,omitempty"`
// External is a copy of the dump options' External list.
External []string `yaml:"external,omitempty"`
// SkipMnt is a copy of the dump options' SkipMnt list.
SkipMnt []string `yaml:"skipMnt,omitempty"`
}
// NewCRIUDumpManifest captures the resolved dump options in a serializable
// form for restore. The settings are always stored; when criuOpts is nil the
// mount-related lists are left empty. ExtMnt entries that are nil or have an
// empty key are dropped, and External/SkipMnt are copied rather than aliased.
func NewCRIUDumpManifest(criuOpts *criurpc.CriuOpts, settings CRIUSettings) CRIUDumpManifest {
	result := CRIUDumpManifest{CRIU: settings}

	if criuOpts != nil {
		for idx := range criuOpts.ExtMnt {
			extMount := criuOpts.ExtMnt[idx]
			if extMount == nil {
				continue
			}
			key := extMount.GetKey()
			if key == "" {
				continue
			}
			entry := ExternalMountManifestEntry{Key: key, Val: extMount.GetVal()}
			result.ExtMnt = append(result.ExtMnt, entry)
		}

		// Appending to a nil slice copies the data and keeps an empty input nil.
		result.External = append([]string(nil), criuOpts.External...)
		result.SkipMnt = append([]string(nil), criuOpts.SkipMnt...)
	}

	return result
}
// BuildCRIUDumpOptions assembles the CRIU RPC options for a dump from the spec
// settings and the runtime state of the target container.
//
// The mount policy derived from mountInfo/ociSpec/rootFS supplies the ExtMnt
// and SkipMnt lists, and namespaces supplies the External list. These, plus
// pid, imageDirFD, rootFS and the dump log file name, are always set. When
// settings is nil nothing else is configured. Otherwise the boolean/log-level
// options are copied from settings, cgroup management is enabled with the mode
// named by settings.ManageCgroupsMode ("soft", "full", "strict"; anything else
// selects IGNORE), and GhostLimit/Timeout are set only when positive.
//
// The returned error is currently always nil.
func BuildCRIUDumpOptions(
	settings *CRIUSettings,
	pid int,
	imageDirFD int32,
	rootFS string,
	mountInfo []MountInfo,
	ociSpec *specs.Spec,
	namespaces map[NamespaceType]*NamespaceInfo,
) (*criurpc.CriuOpts, error) {
	policy := BuildMountPolicy(mountInfo, ociSpec, rootFS)
	extMounts := buildExternalMountMaps(policy.Externalized)
	externalRefs := buildExternalNamespaces(namespaces)

	logrus.WithFields(logrus.Fields{
		"externalized_count": len(policy.Externalized),
		"skipped_count":      len(policy.Skipped),
	}).Debug("Resolved mount policy for CRIU dump")

	opts := &criurpc.CriuOpts{
		Pid:         proto.Int32(int32(pid)),
		ImagesDirFd: proto.Int32(imageDirFD),
		Root:        proto.String(rootFS),
		LogFile:     proto.String(DumpLogFilename),
		ExtMnt:      extMounts,
		External:    externalRefs,
		SkipMnt:     policy.Skipped,
	}
	if settings == nil {
		// Without spec settings only the runtime-derived options apply.
		return opts, nil
	}

	// Options that map one-to-one onto RPC fields.
	opts.LogLevel = proto.Int32(settings.LogLevel)
	opts.LeaveRunning = proto.Bool(settings.LeaveRunning)
	opts.ShellJob = proto.Bool(settings.ShellJob)
	opts.TcpClose = proto.Bool(settings.TcpClose)
	opts.FileLocks = proto.Bool(settings.FileLocks)
	opts.OrphanPtsMaster = proto.Bool(settings.OrphanPtsMaster)
	opts.ExtUnixSk = proto.Bool(settings.ExtUnixSk)
	opts.LinkRemap = proto.Bool(settings.LinkRemap)
	opts.ExtMasters = proto.Bool(settings.ExtMasters)
	opts.AutoDedup = proto.Bool(settings.AutoDedup)
	opts.LazyPages = proto.Bool(settings.LazyPages)

	// Cgroup management is always enabled; only the mode varies.
	opts.ManageCgroups = proto.Bool(true)
	mode := criurpc.CriuCgMode_IGNORE
	switch settings.ManageCgroupsMode {
	case "strict":
		mode = criurpc.CriuCgMode_STRICT
	case "full":
		mode = criurpc.CriuCgMode_FULL
	case "soft":
		mode = criurpc.CriuCgMode_SOFT
	}
	opts.ManageCgroupsMode = &mode

	// Numeric limits are left unset unless a positive value was configured.
	if settings.Timeout > 0 {
		opts.Timeout = proto.Uint32(settings.Timeout)
	}
	if settings.GhostLimit > 0 {
		opts.GhostLimit = proto.Uint32(settings.GhostLimit)
	}

	return opts, nil
}
// buildExternalMountMaps turns externalized mount paths into CRIU ext-mount
// map entries whose key and value are both the path itself.
//
// Empty paths are dropped and duplicates are emitted only once; the first
// occurrence determines the position in the result.
func buildExternalMountMaps(paths []string) []*criurpc.ExtMountMap {
	seen := make(map[string]bool, len(paths))
	entries := make([]*criurpc.ExtMountMap, 0, len(paths))
	for _, p := range paths {
		if p == "" || seen[p] {
			continue
		}
		seen[p] = true
		entries = append(entries, &criurpc.ExtMountMap{
			Key: proto.String(p),
			Val: proto.String(p),
		})
	}
	return entries
}
// buildExternalNamespaces builds the CRIU "external" references for namespaces.
//
// Only the network namespace is marked external (to preserve socket bindings).
// Its entry has the form "net[<inode>]:extNetNs". The result is an empty,
// non-nil slice when namespaces has no usable network namespace entry; a nil
// *NamespaceInfo stored under the net key is treated as absent instead of
// being dereferenced.
func buildExternalNamespaces(namespaces map[NamespaceType]*NamespaceInfo) []string {
	external := make([]string, 0, 1)

	netNs, ok := namespaces[NamespaceNet]
	if !ok || netNs == nil {
		return external
	}

	// Mark network namespace as external for socket binding preservation
	external = append(external, fmt.Sprintf("%s[%d]:%s", NamespaceNet, netNs.Inode, "extNetNs"))
	logrus.WithField("inode", netNs.Inode).Debug("Marked network namespace as external")
	return external
}
// ExecuteCRIUDump performs the CRIU dump described by criuOpts and reports how
// long it took.
//
// On success the elapsed time is logged at Info level and returned. On failure
// the elapsed time, checkpointDir and the expected dump log path
// (<checkpointDir>/<DumpLogFilename>) are logged at Error level and the CRIU
// error is returned wrapped; the returned duration is 0 in that case.
func ExecuteCRIUDump(criuOpts *criurpc.CriuOpts, checkpointDir string, log *logrus.Entry) (time.Duration, error) {
	startedAt := time.Now()
	dumpErr := criu.MakeCriu().Dump(criuOpts, nil)
	elapsed := time.Since(startedAt)

	if dumpErr == nil {
		log.WithField("duration", elapsed).Info("CRIU dump completed")
		return elapsed, nil
	}

	log.WithFields(logrus.Fields{
		"duration":       elapsed,
		"checkpoint_dir": checkpointDir,
		"dump_log_path":  fmt.Sprintf("%s/%s", checkpointDir, DumpLogFilename),
	}).Error("CRIU dump failed")
	return 0, fmt.Errorf("CRIU dump failed: %w", dumpErr)
}
// k8s contains containerd discovery and Kubernetes path classification helpers.
package checkpoint
import (
"context"
"fmt"
"github.com/containerd/containerd"
"github.com/containerd/containerd/namespaces"
specs "github.com/opencontainers/runtime-spec/specs-go"
)
// Defaults used when talking to containerd on the node.
const (
// K8sNamespace is the containerd namespace used by Kubernetes.
// ResolveContainer scopes its lookups to this namespace.
K8sNamespace = "k8s.io"
// ContainerdSocket is the default containerd socket path.
// NewDiscoveryClient connects to this address.
ContainerdSocket = "/run/containerd/containerd.sock"
)
// SourcePodManifest records, in YAML-serializable form, which container and pod
// a checkpoint was taken from.
type SourcePodManifest struct {
// ContainerID is the container ID taken from the checkpoint request.
ContainerID string `yaml:"containerId"`
// PID is the process ID supplied alongside the checkpoint request.
PID int `yaml:"pid"`
// SourceNode is the node name taken from the checkpoint request.
SourceNode string `yaml:"sourceNode"`
// PodName is the pod name taken from the checkpoint request.
PodName string `yaml:"podName"`
// PodNamespace is the pod namespace taken from the checkpoint request.
PodNamespace string `yaml:"podNamespace"`
}
// NewSourcePodManifest builds the source-pod manifest for a checkpoint from the
// request parameters and the PID supplied by the caller.
func NewSourcePodManifest(params CheckpointRequest, pid int) SourcePodManifest {
	var manifest SourcePodManifest
	manifest.PID = pid
	manifest.ContainerID = params.ContainerID
	manifest.SourceNode = params.NodeName
	manifest.PodName = params.PodName
	manifest.PodNamespace = params.PodNamespace
	return manifest
}
// DiscoveryClient wraps a containerd client used to look up containers.
// Create it with NewDiscoveryClient and release it with Close.
type DiscoveryClient struct {
// client is the underlying containerd connection; Close tolerates it being nil.
client *containerd.Client
}
// NewDiscoveryClient connects to containerd at ContainerdSocket and returns a
// DiscoveryClient wrapping that connection. The connection error, if any, is
// returned wrapped together with the socket path.
func NewDiscoveryClient() (*DiscoveryClient, error) {
	conn, err := containerd.New(ContainerdSocket)
	if err == nil {
		return &DiscoveryClient{client: conn}, nil
	}
	return nil, fmt.Errorf("failed to connect to containerd at %s: %w", ContainerdSocket, err)
}
// Close closes the underlying containerd client and returns its error.
// It is a no-op returning nil when no client is attached.
func (c *DiscoveryClient) Close() error {
	if c.client == nil {
		return nil
	}
	return c.client.Close()
}
// ResolveContainer looks up containerID in the Kubernetes containerd namespace
// and returns the PID reported by the container's task together with the
// container's OCI spec.
//
// Lookups happen in order: container, task, spec. The first failure is
// returned wrapped with the container ID, with a zero PID and nil spec.
func (c *DiscoveryClient) ResolveContainer(ctx context.Context, containerID string) (int, *specs.Spec, error) {
	nsCtx := namespaces.WithNamespace(ctx, K8sNamespace)

	ctr, err := c.client.LoadContainer(nsCtx, containerID)
	if err != nil {
		return 0, nil, fmt.Errorf("failed to load container %s: %w", containerID, err)
	}

	runningTask, err := ctr.Task(nsCtx, nil)
	if err != nil {
		return 0, nil, fmt.Errorf("failed to get task for container %s: %w", containerID, err)
	}
	taskPID := int(runningTask.Pid())

	ociSpec, err := ctr.Spec(nsCtx)
	if err != nil {
		return 0, nil, fmt.Errorf("failed to get spec for container %s: %w", containerID, err)
	}

	return taskPID, ociSpec, nil
}
// mounts parses runtime mount state from /proc.
package checkpoint
import (
	"fmt"
	"path"
	"path/filepath"
	"sort"
	"strings"

	specs "github.com/opencontainers/runtime-spec/specs-go"

	"github.com/ai-dynamo/dynamo/deploy/chrek/pkg/common"
)
// MountInfo is one mount entry of a process, as parsed from its mountinfo file
// (see ReadMountInfoFromHostProcPath). All fields are kept as strings.
type MountInfo struct {
// MountID is the parsed mount ID.
MountID string
// ParentID is the parsed parent mount ID.
ParentID string
// MountPoint is the path the mount is attached at.
MountPoint string
// Root is the root of the mount within its filesystem; BuildMountPolicy
// treats a value other than "/" or "." as a bind-like mount.
Root string
// FSType is the filesystem type (for example "tmpfs" or "overlay").
FSType string
// Source is the mount source.
Source string
// Options holds the parsed per-mount options.
Options string
// SuperOptions holds the parsed super-block options.
SuperOptions string
}
// MountPolicy is the classified mount plan for CRIU dump options.
// It is produced by BuildMountPolicy and consumed by BuildCRIUDumpOptions.
type MountPolicy struct {
// Externalized holds mount points passed to CRIU as ext-mount map entries.
Externalized []string
// Skipped holds mount points passed to CRIU as SkipMnt entries.
Skipped []string
}
// BuildMountPolicy classifies mounts into CRIU extMnt and skipMnt lists.
//
// Rule order and precedence (top to bottom):
//  1. Skip non-OCI proc/sys submounts and non-OCI runtime /run submounts.
//     These mounts are typically node/kernel/runtime specific and are the
//     highest-risk source of cross-node restore failures, so skip wins.
//  2. Externalize mounts owned by runtime/OCI:
//     - "/" (rootfs is recreated by runtime in OCI restore path)
//     - OCI mount destinations
//     - OCI masked/readonly paths
//  3. Externalize non-OCI bind-like mounts (mount root is not "/" or ".").
//     This captures runtime-injected file mounts (for example driver files)
//     so CRIU does not try to recreate them from checkpoint data.
//  4. Anything else is left unflagged and handled by CRIU default behavior.
//
// Precedence: skip > externalize. If a path is classified as skipped by any
// mountinfo entry, it is removed from the externalized set regardless of the
// order in which entries for that mount point appear.
//
// Both returned lists are sorted lexically so that the generated CRIU options
// and any manifest derived from them are deterministic. The lists are always
// non-nil.
func BuildMountPolicy(mountInfo []MountInfo, ociSpec *specs.Spec, rootFS string) *MountPolicy {
	ociManagedSet := collectOCIManagedDestinations(ociSpec, rootFS)
	externalizedSet := make(map[string]struct{}, len(mountInfo)+len(ociManagedSet))
	skippedSet := make(map[string]struct{}, len(mountInfo))

	for _, mount := range mountInfo {
		mp := normalizeMountPath(mount.MountPoint)
		if mp == "" {
			continue
		}
		source := path.Clean(strings.TrimSpace(mount.Source))
		root := path.Clean(strings.TrimSpace(mount.Root))
		isOCIManaged := isOCIManagedMountPoint(ociManagedSet, mp)

		// Runtime-owned /run mounts are usually ephemeral tmpfs/overlay mounts
		// or bind-like mounts sourced from host runtime directories.
		// We skip these unless OCI explicitly manages that destination.
		isRunRuntimeMount := strings.HasPrefix(mp, "/run/") &&
			(mount.FSType == "tmpfs" ||
				mount.FSType == "overlay" ||
				strings.HasPrefix(source, "/run/") ||
				strings.HasPrefix(source, "/var/run/") ||
				strings.HasPrefix(root, "/run/") ||
				strings.HasPrefix(root, "/var/run/"))

		if !isOCIManaged && (strings.HasPrefix(mp, "/proc/") || strings.HasPrefix(mp, "/sys/") || isRunRuntimeMount) {
			skippedSet[mp] = struct{}{}
			continue
		}
		if mp == "/" || isOCIManaged || (root != "." && root != "/") {
			externalizedSet[mp] = struct{}{}
		}
	}

	// Ensure OCI-managed destinations are externalized, even when mountinfo does not
	// include a direct entry (e.g., runtime-managed masked/readonly paths).
	for mp := range ociManagedSet {
		externalizedSet[mp] = struct{}{}
	}

	// Enforce skip > externalize once, after all entries were seen. Doing this
	// here (rather than only when a skip is recorded) keeps the result
	// independent of mountinfo ordering when one mount point has several
	// entries that classify differently.
	for mp := range skippedSet {
		delete(externalizedSet, mp)
	}

	return &MountPolicy{
		Externalized: sortedMountPaths(externalizedSet),
		Skipped:      sortedMountPaths(skippedSet),
	}
}

// isOCIManagedMountPoint reports whether mp is an OCI-managed destination,
// treating /run/<x> and /var/run/<x> as aliases of each other.
func isOCIManagedMountPoint(ociManagedSet map[string]struct{}, mp string) bool {
	if _, ok := ociManagedSet[mp]; ok {
		return true
	}
	if strings.HasPrefix(mp, "/run/") {
		if _, ok := ociManagedSet["/var"+mp]; ok {
			return true
		}
	}
	if strings.HasPrefix(mp, "/var/run/") {
		if _, ok := ociManagedSet[strings.TrimPrefix(mp, "/var")]; ok {
			return true
		}
	}
	return false
}

// sortedMountPaths returns the keys of set as a non-nil, lexically sorted slice.
func sortedMountPaths(set map[string]struct{}) []string {
	paths := make([]string, 0, len(set))
	for mp := range set {
		paths = append(paths, mp)
	}
	sort.Strings(paths)
	return paths
}
// collectOCIManagedDestinations gathers every mount target the OCI spec owns,
// as a set of normalized paths.
//
// The set contains the destinations of the spec's mounts and, when the Linux
// section is present, its masked and readonly paths. Masked/readonly paths may
// have no mountinfo entry of their own but still count as runtime-owned. Each
// path is normalized against rootFS; paths that normalize to "" are dropped.
// A nil spec yields an empty, non-nil set.
func collectOCIManagedDestinations(ociSpec *specs.Spec, rootFS string) map[string]struct{} {
	managed := make(map[string]struct{})
	if ociSpec == nil {
		return managed
	}

	record := func(raw string) {
		if normalized := normalizeOCIDestinationPath(raw, rootFS); normalized != "" {
			managed[normalized] = struct{}{}
		}
	}

	for i := range ociSpec.Mounts {
		record(ociSpec.Mounts[i].Destination)
	}
	if linux := ociSpec.Linux; linux != nil {
		for _, masked := range linux.MaskedPaths {
			record(masked)
		}
		for _, readonly := range linux.ReadonlyPaths {
			record(readonly)
		}
	}
	return managed
}
// normalizeMountPath performs purely lexical normalization of a mount path:
// surrounding whitespace is trimmed, the path is cleaned, and the result is
// made absolute. Blank input yields "". No filesystem access is performed,
// since mountinfo paths are already kernel truth for the container namespace.
func normalizeMountPath(raw string) string {
	trimmed := strings.TrimSpace(raw)
	if trimmed == "" {
		return ""
	}

	cleaned := path.Clean(trimmed)
	if strings.HasPrefix(cleaned, "/") {
		// Already absolute; cleaning again would be a no-op.
		return cleaned
	}
	// Anchor relative results at "/" and clean once more so that leading
	// ".." or "." elements collapse against the root.
	return path.Clean("/" + cleaned)
}
// normalizeOCIDestinationPath canonicalizes an OCI destination by resolving
// symlinks beneath the container rootfs (for example /var/run -> /run).
//
// The lexically normalized path is returned unchanged when it is empty, when
// rootFS is empty, when symlink resolution or relativization fails, or when
// the resolved location lies outside rootFS. Otherwise the resolved location
// is returned as an absolute path relative to rootFS.
func normalizeOCIDestinationPath(raw, rootFS string) string {
	lexical := normalizeMountPath(raw)
	if lexical == "" || rootFS == "" {
		return lexical
	}

	onHost := filepath.Join(rootFS, strings.TrimPrefix(lexical, "/"))
	resolved, err := filepath.EvalSymlinks(onHost)
	if err != nil {
		return lexical
	}

	relative, err := filepath.Rel(rootFS, resolved)
	if err != nil {
		return lexical
	}
	relative = filepath.ToSlash(relative)

	switch {
	case relative == ".":
		// The destination resolves to the rootfs itself.
		return "/"
	case relative == ".." || strings.HasPrefix(relative, "../"):
		// Resolution escaped the rootfs; keep the lexical form.
		return lexical
	}
	return normalizeMountPath("/" + relative)
}
// ReadMountInfoFromHostProcPath parses <HostProcPath>/<pid>/mountinfo and
// converts each parsed entry into a MountInfo, preserving the parsed order.
// A parse failure is returned wrapped together with the file path.
func ReadMountInfoFromHostProcPath(pid int) ([]MountInfo, error) {
	procFile := fmt.Sprintf("%s/%d/mountinfo", HostProcPath, pid)

	entries, err := common.ParseMountInfoFile(procFile)
	if err != nil {
		return nil, fmt.Errorf("failed to parse mountinfo at %s: %w", procFile, err)
	}

	result := make([]MountInfo, len(entries))
	for i, entry := range entries {
		result[i] = MountInfo{
			MountID:      entry.MountID,
			ParentID:     entry.ParentID,
			MountPoint:   entry.Path,
			Root:         entry.Root,
			FSType:       entry.FSType,
			Source:       entry.Source,
			Options:      entry.Options,
			SuperOptions: entry.SuperOpts,
		}
	}
	return result, nil
}
// namespaces provides Linux namespace introspection for CRIU checkpoint.
package checkpoint
import (
"fmt"
"golang.org/x/sys/unix"
)
// NamespaceManifestEntry stores namespace information saved in checkpoint manifests.
// It is the YAML-serializable counterpart of NamespaceInfo, with the namespace
// type flattened to a plain string.
type NamespaceManifestEntry struct {
Type string `yaml:"type"` // net, pid, mnt, etc.
Inode uint64 `yaml:"inode"` // Namespace inode
IsExternal bool `yaml:"isExternal"` // Whether namespace is external (shared)
}
// NamespaceType represents a Linux namespace type.
// The string value is used as the file name under <proc>/<pid>/ns/ when a
// namespace is inspected, and as the type prefix of CRIU external references.
type NamespaceType string
// Namespace types that GetAllNamespaces inspects.
const (
NamespaceNet NamespaceType = "net"
NamespacePID NamespaceType = "pid"
NamespaceMnt NamespaceType = "mnt"
NamespaceUTS NamespaceType = "uts"
NamespaceIPC NamespaceType = "ipc"
NamespaceUser NamespaceType = "user"
NamespaceCgroup NamespaceType = "cgroup"
)
// NamespaceInfo holds namespace identification information
// for a single namespace of a process, as gathered by GetNamespaceInfo.
type NamespaceInfo struct {
// Type is the namespace type that was inspected.
Type NamespaceType
// Inode is the inode number of the namespace file.
Inode uint64
// GetNamespaceInfo sets this when the inode differs from PID 1's namespace
// of the same type under HostProcPath.
IsExternal bool // Whether NS is external (shared with pause container)
}
// NewNamespaceManifestEntries constructs namespace manifest entries from
// introspected namespaces.
//
// It returns nil for an empty or nil map. Map entries whose value is nil are
// skipped rather than dereferenced. The order of the returned entries follows
// Go map iteration and is therefore unspecified.
func NewNamespaceManifestEntries(namespaces map[NamespaceType]*NamespaceInfo) []NamespaceManifestEntry {
	if len(namespaces) == 0 {
		return nil
	}
	result := make([]NamespaceManifestEntry, 0, len(namespaces))
	for nsType, nsInfo := range namespaces {
		if nsInfo == nil {
			// Nothing to record for a namespace without introspection data.
			continue
		}
		result = append(result, NamespaceManifestEntry{
			Type:       string(nsType),
			Inode:      nsInfo.Inode,
			IsExternal: nsInfo.IsExternal,
		})
	}
	return result
}
// GetNamespaceInode stats <HostProcPath>/<pid>/ns/<nsType> and returns the
// inode number of that namespace file. A stat failure is returned wrapped
// together with the path.
func GetNamespaceInode(pid int, nsType NamespaceType) (uint64, error) {
	var st unix.Stat_t
	target := fmt.Sprintf("%s/%d/ns/%s", HostProcPath, pid, nsType)

	err := unix.Stat(target, &st)
	if err != nil {
		return 0, fmt.Errorf("failed to stat namespace %s: %w", target, err)
	}
	return st.Ino, nil
}
// GetNamespaceInfo inspects one namespace of a process and returns its type,
// inode, and whether it is considered external.
//
// The namespace is flagged external when its inode differs from the inode of
// PID 1's namespace of the same type under HostProcPath. If PID 1's namespace
// cannot be stat'ed the flag stays false. Failure to stat the target
// namespace is returned wrapped together with the path.
func GetNamespaceInfo(pid int, nsType NamespaceType) (*NamespaceInfo, error) {
	inodeOf := func(nsPath string) (uint64, error) {
		var st unix.Stat_t
		if err := unix.Stat(nsPath, &st); err != nil {
			return 0, err
		}
		return st.Ino, nil
	}

	targetPath := fmt.Sprintf("%s/%d/ns/%s", HostProcPath, pid, nsType)
	inode, err := inodeOf(targetPath)
	if err != nil {
		return nil, fmt.Errorf("failed to stat namespace %s: %w", targetPath, err)
	}

	info := &NamespaceInfo{Type: nsType, Inode: inode}

	// Compare against init (PID 1); a failed lookup leaves IsExternal false.
	initPath := fmt.Sprintf("%s/1/ns/%s", HostProcPath, nsType)
	if initInode, initErr := inodeOf(initPath); initErr == nil {
		info.IsExternal = inode != initInode
	}
	return info, nil
}
// GetAllNamespaces collects namespace information for a process across the
// net, pid, mnt, uts, ipc, user and cgroup namespace types.
//
// Namespace types whose lookup fails are left out of the result without
// reporting an error; the returned error is always nil.
func GetAllNamespaces(pid int) (map[NamespaceType]*NamespaceInfo, error) {
	found := make(map[NamespaceType]*NamespaceInfo)
	for _, nsType := range [...]NamespaceType{
		NamespaceNet,
		NamespacePID,
		NamespaceMnt,
		NamespaceUTS,
		NamespaceIPC,
		NamespaceUser,
		NamespaceCgroup,
	} {
		info, err := GetNamespaceInfo(pid, nsType)
		if err != nil {
			// Best effort: an unreadable namespace is simply omitted.
			continue
		}
		found[nsType] = info
	}
	return found, nil
}
// Package checkpoint provides CRIU checkpoint (dump) operations.
package checkpoint
import (
"fmt"
"io"
"os"
"path/filepath"
"github.com/sirupsen/logrus"
)
// CaptureDevShm captures files from /dev/shm to the checkpoint directory.
// This is needed because /dev/shm is a tmpfs mount that is not part of the
// container's overlay filesystem, so rootfs diff doesn't capture it.
//
// Semaphores (sem.* files) are included so that sem_unlink() calls succeed
// after restore. The semaphore kernel state won't be perfectly restored,
// but the files will exist for cleanup operations.
//
// Only regular files are captured. Directories are skipped, and so are
// symlinks, FIFOs, sockets and device nodes: opening a FIFO can block
// indefinitely, and a symlink reached through <proc>/<pid>/root may resolve
// outside the container's filesystem.
//
// The files are saved to <checkpointDir>/dev-shm/ and can be restored
// using RestoreDevShm before CRIU restore.
//
// A missing /dev/shm is not an error. Failures on individual files are logged
// as warnings and that file is skipped; only failing to list /dev/shm or to
// create the destination directory is returned as an error.
func CaptureDevShm(pid int, checkpointDir string, log *logrus.Entry) error {
	// Access container's /dev/shm via /proc/<pid>/root
	shmPath := filepath.Join(HostProcPath, fmt.Sprintf("%d/root/dev/shm", pid))

	entries, err := os.ReadDir(shmPath)
	if err != nil {
		if os.IsNotExist(err) {
			log.Debug("Container /dev/shm does not exist, skipping capture")
			return nil
		}
		return fmt.Errorf("failed to read container /dev/shm: %w", err)
	}

	// Keep regular files only.
	var filesToCapture []os.DirEntry
	for _, entry := range entries {
		// Skip directories (unlikely in /dev/shm but be safe)
		if entry.IsDir() {
			log.WithField("dir", entry.Name()).Debug("Skipping directory in /dev/shm")
			continue
		}
		if !entry.Type().IsRegular() {
			log.WithFields(logrus.Fields{
				"file": entry.Name(),
				"type": entry.Type().String(),
			}).Debug("Skipping non-regular file in /dev/shm")
			continue
		}
		filesToCapture = append(filesToCapture, entry)
	}

	if len(filesToCapture) == 0 {
		log.Debug("No files to capture from /dev/shm")
		return nil
	}

	// Create destination directory
	destDir := filepath.Join(checkpointDir, DevShmDirName)
	if err := os.MkdirAll(destDir, 0755); err != nil {
		return fmt.Errorf("failed to create dev-shm directory: %w", err)
	}

	var captured []string
	var totalSize int64

	for _, entry := range filesToCapture {
		name := entry.Name()
		srcPath := filepath.Join(shmPath, name)
		destPath := filepath.Join(destDir, name)

		info, err := entry.Info()
		if err != nil {
			log.WithError(err).WithField("file", name).Warn("Failed to get file info, skipping")
			continue
		}
		// The entry may have been replaced since the directory was listed.
		if !info.Mode().IsRegular() {
			log.WithField("file", name).Debug("Skipping non-regular file in /dev/shm")
			continue
		}
		size := info.Size()

		// Copy the file
		if err := copyFile(srcPath, destPath, info.Mode()); err != nil {
			log.WithError(err).WithField("file", name).Warn("Failed to copy file, skipping")
			continue
		}

		captured = append(captured, name)
		totalSize += size
		log.WithFields(logrus.Fields{
			"file": name,
			"size": size,
		}).Debug("Captured /dev/shm file")
	}

	if len(captured) > 0 {
		log.WithFields(logrus.Fields{
			"count":      len(captured),
			"total_size": totalSize,
			"files":      captured,
		}).Info("Captured /dev/shm files")
	}

	return nil
}
// copyFile copies a file from src to dest with the given permissions.
func copyFile(src, dest string, mode os.FileMode) error {
srcFile, err := os.Open(src)
if err != nil {
return fmt.Errorf("failed to open source: %w", err)
}
defer srcFile.Close()
destFile, err := os.OpenFile(dest, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, mode)
if err != nil {
return fmt.Errorf("failed to create destination: %w", err)
}
defer destFile.Close()
if _, err := io.Copy(destFile, srcFile); err != nil {
return fmt.Errorf("failed to copy contents: %w", err)
}
// Sync to ensure durability for checkpoint data
if err := destFile.Sync(); err != nil {
return fmt.Errorf("failed to sync destination: %w", err)
}
return nil
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment