Unverified Commit 93530057 authored by Schwinn Saereesitthipitak's avatar Schwinn Saereesitthipitak Committed by GitHub
Browse files

build(snapshot): default CRIU build to upstream criu-dev (#7744)

parent 24523a1c
...@@ -16,8 +16,12 @@ ...@@ -16,8 +16,12 @@
# ============================================================================= # =============================================================================
ARG DOCKER_PROXY ARG DOCKER_PROXY
ARG GO_VERSION=1.25 ARG GO_VERSION=1.25
ARG CRIU_REPO=https://github.com/dfeigin-nv/criu.git # Default to upstream CRIU development branch. Custom forks can override both
ARG CRIU_COMMIT=777baaf27f6a76f743c9bf24b64886297dc0129b # args at build time, for example:
# --build-arg CRIU_REPO=<git-remote-url>
# --build-arg CRIU_REF=<fork-branch-or-sha>
ARG CRIU_REPO=https://github.com/checkpoint-restore/criu.git
ARG CRIU_REF=criu-dev
ARG AGENT_BASE_IMAGE=nvcr.io/nvidia/cuda-dl-base:25.11-cuda13.0-devel-ubuntu24.04 ARG AGENT_BASE_IMAGE=nvcr.io/nvidia/cuda-dl-base:25.11-cuda13.0-devel-ubuntu24.04
# For placeholder target only - this default allows agent builds to succeed, # For placeholder target only - this default allows agent builds to succeed,
...@@ -70,13 +74,32 @@ ARG TARGETARCH=amd64 ...@@ -70,13 +74,32 @@ ARG TARGETARCH=amd64
RUN CGO_ENABLED=0 GOOS=${TARGETOS} GOARCH=${TARGETARCH} go build -ldflags="-w -s" -o /snapshot-agent ./cmd/agent RUN CGO_ENABLED=0 GOOS=${TARGETOS} GOARCH=${TARGETARCH} go build -ldflags="-w -s" -o /snapshot-agent ./cmd/agent
RUN CGO_ENABLED=0 GOOS=${TARGETOS} GOARCH=${TARGETARCH} go build -ldflags="-w -s" -o /nsrestore ./cmd/nsrestore RUN CGO_ENABLED=0 GOOS=${TARGETOS} GOARCH=${TARGETARCH} go build -ldflags="-w -s" -o /nsrestore ./cmd/nsrestore
# =============================================================================
# Stage: CUDA checkpoint helper builder
# Compiles cmd/cuda-checkpoint-helper/main.c into /cuda-checkpoint-helper so
# later stages can COPY the binary out of this stage.
# =============================================================================
FROM ${AGENT_BASE_IMAGE} AS cuda-helper-builder

RUN apt-get update \
    && apt-get install -y --no-install-recommends build-essential \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /workspace
COPY cmd/cuda-checkpoint-helper/main.c ./cmd/cuda-checkpoint-helper/main.c

# Headers come from the CUDA toolkit include directory and libcuda is resolved
# from the toolkit's stubs directory. -lcuda stays after the source file so the
# linker sees the undefined symbols before it scans the library.
RUN gcc -O2 -Wall -Wextra \
        -I/usr/local/cuda/include \
        -L/usr/local/cuda/lib64/stubs \
        -o /cuda-checkpoint-helper \
        ./cmd/cuda-checkpoint-helper/main.c \
        -lcuda
# ============================================================================= # =============================================================================
# Stage: CRIU Builder - Build CRIU with CUDA plugin # Stage: CRIU Builder - Build CRIU with CUDA plugin
# ============================================================================= # =============================================================================
FROM ubuntu:24.04 AS criu-builder FROM ubuntu:24.04 AS criu-builder
ARG CRIU_REPO ARG CRIU_REPO
ARG CRIU_COMMIT ARG CRIU_REF
RUN apt-get update && apt-get install -y --no-install-recommends \ RUN apt-get update && apt-get install -y --no-install-recommends \
git \ git \
...@@ -102,7 +125,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ ...@@ -102,7 +125,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
RUN git init /tmp/criu \ RUN git init /tmp/criu \
&& cd /tmp/criu \ && cd /tmp/criu \
&& git remote add origin ${CRIU_REPO} \ && git remote add origin ${CRIU_REPO} \
&& git fetch --depth 1 origin ${CRIU_COMMIT} \ && git fetch --depth 1 origin ${CRIU_REF} \
&& git checkout FETCH_HEAD \ && git checkout FETCH_HEAD \
&& make -j$(nproc) \ && make -j$(nproc) \
&& make DESTDIR=/criu-install install-criu install-lib install-cuda_plugin && make DESTDIR=/criu-install install-criu install-lib install-cuda_plugin
...@@ -143,9 +166,10 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ ...@@ -143,9 +166,10 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
COPY --from=criu-builder /criu-install/usr/local /usr/local COPY --from=criu-builder /criu-install/usr/local /usr/local
RUN criu --version RUN criu --version
# Copy cuda-checkpoint binary # Copy CUDA checkpoint binaries
COPY --from=criu-builder /tmp/cuda-checkpoint/bin/x86_64_Linux/cuda-checkpoint /usr/local/sbin/cuda-checkpoint COPY --from=criu-builder /tmp/cuda-checkpoint/bin/x86_64_Linux/cuda-checkpoint /usr/local/sbin/cuda-checkpoint
RUN chmod +x /usr/local/sbin/cuda-checkpoint COPY --from=cuda-helper-builder /cuda-checkpoint-helper /usr/local/bin/cuda-checkpoint-helper
RUN chmod +x /usr/local/sbin/cuda-checkpoint /usr/local/bin/cuda-checkpoint-helper
# Copy the built binaries # Copy the built binaries
COPY --from=builder /snapshot-agent /usr/local/bin/snapshot-agent COPY --from=builder /snapshot-agent /usr/local/bin/snapshot-agent
...@@ -198,9 +222,10 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ ...@@ -198,9 +222,10 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
COPY --from=criu-builder /criu-install/usr/local /usr/local COPY --from=criu-builder /criu-install/usr/local /usr/local
RUN criu --version && echo "CRIU installed successfully" RUN criu --version && echo "CRIU installed successfully"
# Copy cuda-checkpoint binary (used for external CUDA state checkpoint/restore) # Copy CUDA checkpoint binaries
COPY --from=criu-builder /tmp/cuda-checkpoint/bin/x86_64_Linux/cuda-checkpoint /usr/local/sbin/cuda-checkpoint COPY --from=criu-builder /tmp/cuda-checkpoint/bin/x86_64_Linux/cuda-checkpoint /usr/local/sbin/cuda-checkpoint
RUN chmod +x /usr/local/sbin/cuda-checkpoint COPY --from=cuda-helper-builder /cuda-checkpoint-helper /usr/local/bin/cuda-checkpoint-helper
RUN chmod +x /usr/local/sbin/cuda-checkpoint /usr/local/bin/cuda-checkpoint-helper
# Copy nsrestore binary (invoked by DaemonSet via nsenter) # Copy nsrestore binary (invoked by DaemonSet via nsenter)
COPY --from=builder /nsrestore /usr/local/bin/nsrestore COPY --from=builder /nsrestore /usr/local/bin/nsrestore
......
...@@ -5,6 +5,9 @@ ...@@ -5,6 +5,9 @@
IMG ?= nvcr.io/nvidian/dynamo-dev/snapshot-agent:latest IMG ?= nvcr.io/nvidian/dynamo-dev/snapshot-agent:latest
PLACEHOLDER_IMG ?= nvcr.io/nvidian/dynamo-dev/dynamo-vllm-placeholder:latest PLACEHOLDER_IMG ?= nvcr.io/nvidian/dynamo-dev/dynamo-vllm-placeholder:latest
# PLACEHOLDER_BASE_IMG must be provided when building placeholder (no default) # PLACEHOLDER_BASE_IMG must be provided when building placeholder (no default)
# Optional CRIU source override for snapshot image builds. If unset, the
# Dockerfile defaults are used.
CRIU_BUILD_ARGS = $(if $(CRIU_REPO),--build-arg CRIU_REPO=$(CRIU_REPO),) $(if $(CRIU_REF),--build-arg CRIU_REF=$(CRIU_REF),)
# Get the currently used golang install path (in GOPATH/bin, unless GOBIN is set) # Get the currently used golang install path (in GOPATH/bin, unless GOBIN is set)
ifeq (,$(shell go env GOBIN)) ifeq (,$(shell go env GOBIN))
...@@ -63,7 +66,9 @@ build: fmt vet ## Build snapshot-agent binary. ...@@ -63,7 +66,9 @@ build: fmt vet ## Build snapshot-agent binary.
.PHONY: docker-build-agent .PHONY: docker-build-agent
docker-build-agent: ## Build snapshot-agent docker image (linux/amd64 only). docker-build-agent: ## Build snapshot-agent docker image (linux/amd64 only).
$(CONTAINER_TOOL) build --platform ${RUNTIME_IMAGE_PLATFORM} --target agent -t ${IMG} . $(CONTAINER_TOOL) build --platform ${RUNTIME_IMAGE_PLATFORM} --target agent \
$(CRIU_BUILD_ARGS) \
-t ${IMG} .
.PHONY: docker-build-placeholder .PHONY: docker-build-placeholder
docker-build-placeholder: ## Build placeholder image for checkpoint restore (linux/amd64 only). Requires PLACEHOLDER_BASE_IMG. docker-build-placeholder: ## Build placeholder image for checkpoint restore (linux/amd64 only). Requires PLACEHOLDER_BASE_IMG.
...@@ -77,6 +82,7 @@ endif ...@@ -77,6 +82,7 @@ endif
fi; \ fi; \
if [ -z "$$BASE_IMAGE_USER" ]; then BASE_IMAGE_USER=root; fi; \ if [ -z "$$BASE_IMAGE_USER" ]; then BASE_IMAGE_USER=root; fi; \
$(CONTAINER_TOOL) build --platform ${RUNTIME_IMAGE_PLATFORM} --target placeholder \ $(CONTAINER_TOOL) build --platform ${RUNTIME_IMAGE_PLATFORM} --target placeholder \
$(CRIU_BUILD_ARGS) \
--build-arg BASE_IMAGE=${PLACEHOLDER_BASE_IMG} \ --build-arg BASE_IMAGE=${PLACEHOLDER_BASE_IMG} \
--build-arg BASE_IMAGE_USER=$$BASE_IMAGE_USER \ --build-arg BASE_IMAGE_USER=$$BASE_IMAGE_USER \
-t ${PLACEHOLDER_IMG} . -t ${PLACEHOLDER_IMG} .
......
#include <ctype.h>
#include <cuda.h>
#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/*
 * Writes the command-line synopsis to `stream`.
 *
 * Returns 0 when the text was written and 1 when the write failed. The result
 * reports only whether printing worked; it says nothing about whether the
 * invocation that triggered the message was valid.
 */
static int
print_usage(FILE* stream)
{
  static const char usage_text[] =
      "Usage:\n"
      "  cuda-checkpoint-helper --get-state --pid <pid>\n"
      "  cuda-checkpoint-helper --get-restore-tid --pid <pid>\n"
      "  cuda-checkpoint-helper --action lock|checkpoint|restore|unlock --pid <pid> [--timeout <ms>] "
      "[--device-map <uuids>]\n";
  int written = fprintf(stream, "%s", usage_text);

  if (written < 0) {
    return 1;
  }
  return 0;
}
/*
 * Prints "<error name>: <error description>" for `status` on stderr.
 *
 * The lookups are best effort: their return values are ignored, and any
 * pointer that is still NULL afterwards is replaced by a generic placeholder
 * so the message is always well formed.
 */
static void
print_cuda_error(CUresult status)
{
  const char* error_name = NULL;
  const char* error_text = NULL;

  (void)cuGetErrorName(status, &error_name);
  (void)cuGetErrorString(status, &error_text);

  fprintf(
      stderr, "%s: %s\n", error_name != NULL ? error_name : "CUDA_ERROR_UNKNOWN",
      error_text != NULL ? error_text : "unknown CUDA error");
}
/*
 * Parses a strictly decimal, positive process id.
 *
 * pid_str: NUL-terminated text; must consist of decimal digits only.
 * pid_out: receives the parsed value on success; untouched on failure.
 *
 * Returns 0 on success and -1 when the text is empty, has any non-digit
 * character (including leading whitespace or a sign), is zero, or does not
 * fit in an int.
 */
static int
parse_pid(const char* pid_str, int* pid_out)
{
  char* end = NULL;
  long pid;

  /* strtol would silently skip whitespace and accept '+'/'-'; insist on a digit up front. */
  if (pid_str == NULL || pid_out == NULL || !isdigit((unsigned char)pid_str[0])) {
    return -1;
  }

  /* errno is the only way to detect overflow when long is no wider than int. */
  errno = 0;
  pid = strtol(pid_str, &end, 10);
  if (errno != 0 || end == pid_str || *end != '\0' || pid <= 0 || pid > INT_MAX) {
    return -1;
  }

  *pid_out = (int)pid;
  return 0;
}
/*
 * Parses a strictly decimal, non-negative timeout in milliseconds.
 *
 * timeout_str:    NUL-terminated text; must consist of decimal digits only.
 * timeout_ms_out: receives the parsed value on success; untouched on failure.
 *
 * Returns 0 on success and -1 when the text is empty, has any non-digit
 * character (including leading whitespace or a sign), or exceeds UINT_MAX.
 */
static int
parse_timeout_ms(const char* timeout_str, unsigned int* timeout_ms_out)
{
  char* end = NULL;
  unsigned long timeout_ms;

  /*
   * strtoul accepts a leading '-' and returns the negated value, so a negative
   * argument could come back as a small positive timeout. Requiring a leading
   * digit rejects signs and whitespace before the conversion runs.
   */
  if (timeout_str == NULL || timeout_ms_out == NULL || !isdigit((unsigned char)timeout_str[0])) {
    return -1;
  }

  /* errno is the only way to detect overflow when unsigned long is no wider than unsigned int. */
  errno = 0;
  timeout_ms = strtoul(timeout_str, &end, 10);
  if (errno != 0 || end == timeout_str || *end != '\0' || timeout_ms > UINT_MAX) {
    return -1;
  }

  *timeout_ms_out = (unsigned int)timeout_ms;
  return 0;
}
/*
 * Decodes exactly two hexadecimal digits at `src` into one byte.
 *
 * src:      points at the two characters to decode; characters after them
 *           are ignored.
 * byte_out: receives the decoded value on success; untouched on failure.
 *
 * Returns 0 on success and -1 when either character is not a hex digit.
 */
static int
parse_hex_byte(const char* src, unsigned char* byte_out)
{
  char tmp[3];
  char* end = NULL;
  long value;

  /*
   * Validate before copying: this rejects whitespace and signs that strtol
   * would otherwise accept, and the short-circuit keeps src[1] from being read
   * when src[0] is already the terminator.
   */
  if (src == NULL || byte_out == NULL || !isxdigit((unsigned char)src[0]) || !isxdigit((unsigned char)src[1])) {
    return -1;
  }

  tmp[0] = src[0];
  tmp[1] = src[1];
  tmp[2] = '\0';

  value = strtol(tmp, &end, 16);
  if (end != tmp + 2 || value < 0 || value > 255) {
    return -1;
  }

  *byte_out = (unsigned char)value;
  return 0;
}
/*
 * Parses a textual UUID into `uuid_out->bytes`.
 *
 * Accepted forms are the canonical 36-character layout
 * "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" and the same text prefixed by "GPU-"
 * (40 characters). Dashes must sit exactly at the canonical positions; a
 * string of the right length with dashes elsewhere is rejected.
 *
 * Returns 0 on success and -1 on any malformed input. On failure the bytes
 * decoded before the error may already have been written to `uuid_out`.
 */
static int
parse_uuid(const char* uuid_str, CUuuid* uuid_out)
{
  size_t len;
  int i;

  if (uuid_str == NULL || uuid_out == NULL) {
    return -1;
  }

  len = strlen(uuid_str);
  if (len == 40) {
    if (strncmp(uuid_str, "GPU-", 4) != 0) {
      return -1;
    }
    uuid_str += 4;
    len -= 4;
  }
  if (len != 36) {
    return -1;
  }

  for (i = 0; i < 16; ++i) {
    /* The 8-4-4-4-12 layout puts a dash in front of bytes 4, 6, 8 and 10 only. */
    if (i == 4 || i == 6 || i == 8 || i == 10) {
      if (*uuid_str != '-') {
        return -1;
      }
      ++uuid_str;
    }
    /* A dash anywhere else fails this check because '-' is not a hex digit. */
    if (!isxdigit((unsigned char)uuid_str[0]) || !isxdigit((unsigned char)uuid_str[1])) {
      return -1;
    }
    if (parse_hex_byte(uuid_str, (unsigned char*)&uuid_out->bytes[i]) != 0) {
      return -1;
    }
    uuid_str += 2;
  }

  return *uuid_str == '\0' ? 0 : -1;
}
/*
 * Parses a device map of the form "<old>=<new>[,<old>=<new>...]" into a
 * heap-allocated array of GPU UUID pairs.
 *
 * device_map: map text; NULL or "" yields an empty result.
 * pairs_out:  receives the array (caller frees it) or NULL.
 * count_out:  receives the number of parsed pairs.
 *
 * Returns 0 on success and -1 on allocation failure or malformed input; on
 * failure *pairs_out is NULL and *count_out is 0.
 */
static int
parse_device_map(const char* device_map, CUcheckpointGpuPair** pairs_out, unsigned int* count_out)
{
  char* scratch = NULL;
  char* cursor = NULL;
  char* entry = NULL;
  char* entry_state = NULL;
  CUcheckpointGpuPair* table = NULL;
  unsigned int capacity = 1;
  unsigned int used = 0;
  int rc = -1;

  *pairs_out = NULL;
  *count_out = 0;

  if (device_map == NULL || device_map[0] == '\0') {
    return 0;
  }

  /* strtok_r writes into its input, so tokenize a private copy. */
  scratch = strdup(device_map);
  if (scratch == NULL) {
    return -1;
  }

  /* One slot per comma plus one is an upper bound on the number of entries. */
  for (cursor = scratch; *cursor != '\0'; ++cursor) {
    if (*cursor == ',') {
      ++capacity;
    }
  }

  table = calloc(capacity, sizeof(*table));
  if (table == NULL) {
    goto done;
  }

  for (entry = strtok_r(scratch, ",", &entry_state); entry != NULL; entry = strtok_r(NULL, ",", &entry_state)) {
    char* field_state = NULL;
    char* source_uuid = strtok_r(entry, "=", &field_state);
    char* target_uuid = strtok_r(NULL, "=", &field_state);

    /* Each entry must split into exactly two '='-separated fields. */
    if (source_uuid == NULL || target_uuid == NULL || strtok_r(NULL, "=", &field_state) != NULL) {
      goto done;
    }
    if (parse_uuid(source_uuid, &table[used].oldUuid) != 0 || parse_uuid(target_uuid, &table[used].newUuid) != 0) {
      goto done;
    }
    ++used;
  }

  /* Hand ownership to the caller; clearing `table` keeps the cleanup below from freeing it. */
  *pairs_out = table;
  *count_out = used;
  table = NULL;
  rc = 0;

done:
  free(table);
  free(scratch);
  return rc;
}
/*
 * Maps a CUDA process state to the lowercase word printed by --get-state.
 * Any value without an explicit mapping yields "unknown".
 */
static const char*
process_state_string(CUprocessState state)
{
  if (state == CU_PROCESS_STATE_RUNNING) {
    return "running";
  }
  if (state == CU_PROCESS_STATE_LOCKED) {
    return "locked";
  }
  if (state == CU_PROCESS_STATE_CHECKPOINTED) {
    return "checkpointed";
  }
  if (state == CU_PROCESS_STATE_FAILED) {
    return "failed";
  }
  return "unknown";
}
/*
 * Calls cuCheckpointProcessLock for `pid`, forwarding `timeout_ms` in an
 * otherwise zeroed argument struct. Returns the driver's status unchanged.
 */
static CUresult
do_lock(int pid, unsigned int timeout_ms)
{
  CUcheckpointLockArgs lock_args;
  CUresult result;

  memset(&lock_args, 0, sizeof(lock_args));
  lock_args.timeoutMs = timeout_ms;

  result = cuCheckpointProcessLock(pid, &lock_args);
  return result;
}
/*
 * Calls cuCheckpointProcessCheckpoint for `pid` with a zeroed argument
 * struct. Returns the driver's status unchanged.
 */
static CUresult
do_checkpoint(int pid)
{
  CUcheckpointCheckpointArgs checkpoint_args;
  CUresult result;

  memset(&checkpoint_args, 0, sizeof(checkpoint_args));

  result = cuCheckpointProcessCheckpoint(pid, &checkpoint_args);
  return result;
}
/*
 * Calls cuCheckpointProcessRestore for `pid`, passing the GPU pairs parsed
 * from `device_map` (an empty map yields no pairs).
 *
 * Returns CUDA_ERROR_INVALID_VALUE when the map cannot be parsed, otherwise
 * the driver's status unchanged.
 */
static CUresult
do_restore(int pid, const char* device_map)
{
  CUcheckpointRestoreArgs restore_args;
  CUcheckpointGpuPair* gpu_pairs = NULL;
  unsigned int gpu_pair_count = 0;
  CUresult result;

  memset(&restore_args, 0, sizeof(restore_args));

  if (parse_device_map(device_map, &gpu_pairs, &gpu_pair_count) != 0) {
    return CUDA_ERROR_INVALID_VALUE;
  }

  restore_args.gpuPairs = gpu_pairs;
  restore_args.gpuPairsCount = gpu_pair_count;
  result = cuCheckpointProcessRestore(pid, &restore_args);

  /* The pair array is owned here; release it once the call has returned. */
  free(gpu_pairs);
  return result;
}
/*
 * Calls cuCheckpointProcessUnlock for `pid` with a zeroed argument struct.
 * Returns the driver's status unchanged.
 */
static CUresult
do_unlock(int pid)
{
  CUcheckpointUnlockArgs unlock_args;
  CUresult result;

  memset(&unlock_args, 0, sizeof(unlock_args));

  result = cuCheckpointProcessUnlock(pid, &unlock_args);
  return result;
}
/*
 * Calls cuCheckpointProcessGetState for `pid`, storing the state through
 * `state_out`. Returns the driver's status unchanged.
 */
static CUresult
do_get_state(int pid, CUprocessState* state_out)
{
  CUresult result = cuCheckpointProcessGetState(pid, state_out);

  return result;
}
/*
 * Calls cuCheckpointProcessGetRestoreThreadId for `pid`, storing the thread
 * id through `tid_out`. Returns the driver's status unchanged.
 */
static CUresult
do_get_restore_tid(int pid, int* tid_out)
{
  CUresult result = cuCheckpointProcessGetRestoreThreadId(pid, tid_out);

  return result;
}
/*
 * Reports an invalid invocation: prints the usage text to stderr and returns
 * the non-zero exit status to use. print_usage() only reports whether the
 * write succeeded, so its result must not be used as the exit status here.
 */
static int
usage_error(void)
{
  (void)print_usage(stderr);
  return 1;
}

/*
 * Entry point for cuda-checkpoint-helper.
 *
 * Exactly one mode must be selected: --get-state, --get-restore-tid, or
 * --action <lock|checkpoint|restore|unlock>. --pid is always required.
 * --timeout is accepted only with "lock" and --device-map only with
 * "restore".
 *
 * Exit status: 0 on success (including --help), 1 on invalid usage, on a CUDA
 * error, or when writing the result to stdout fails.
 */
int
main(int argc, char** argv)
{
  const char* action = NULL;
  const char* device_map = "";
  int pid = 0;
  int have_pid = 0;
  int do_get_state_flag = 0;
  int do_get_restore_tid_flag = 0;
  unsigned int timeout_ms = 0;
  int i;
  CUresult status;

  if (argc == 1) {
    return usage_error();
  }

  for (i = 1; i < argc; ++i) {
    if (strcmp(argv[i], "--get-state") == 0) {
      do_get_state_flag = 1;
      continue;
    }
    if (strcmp(argv[i], "--get-restore-tid") == 0) {
      do_get_restore_tid_flag = 1;
      continue;
    }
    if (strcmp(argv[i], "--action") == 0) {
      if (++i >= argc) {
        return usage_error();
      }
      action = argv[i];
      continue;
    }
    if (strcmp(argv[i], "--pid") == 0 || strcmp(argv[i], "-p") == 0) {
      if (++i >= argc || parse_pid(argv[i], &pid) != 0) {
        return usage_error();
      }
      have_pid = 1;
      continue;
    }
    if (strcmp(argv[i], "--timeout") == 0 || strcmp(argv[i], "-t") == 0) {
      if (++i >= argc || parse_timeout_ms(argv[i], &timeout_ms) != 0) {
        return usage_error();
      }
      continue;
    }
    if (strcmp(argv[i], "--device-map") == 0 || strcmp(argv[i], "-d") == 0) {
      if (++i >= argc) {
        return usage_error();
      }
      device_map = argv[i];
      continue;
    }
    if (strcmp(argv[i], "--help") == 0 || strcmp(argv[i], "-h") == 0) {
      /* Explicit help request: usage goes to stdout and the exit status is 0 unless the write fails. */
      return print_usage(stdout);
    }
    /* Unrecognized argument. */
    return usage_error();
  }

  /* The three modes are mutually exclusive and one of them is mandatory. */
  if ((do_get_state_flag + do_get_restore_tid_flag + (action != NULL ? 1 : 0)) != 1) {
    return usage_error();
  }
  if (!have_pid) {
    return usage_error();
  }

  if (do_get_state_flag) {
    CUprocessState state;

    if (timeout_ms != 0 || device_map[0] != '\0') {
      return usage_error();
    }
    status = do_get_state(pid, &state);
    if (status != CUDA_SUCCESS) {
      print_cuda_error(status);
      return 1;
    }
    return fprintf(stdout, "%s\n", process_state_string(state)) < 0 ? 1 : 0;
  }

  if (do_get_restore_tid_flag) {
    int tid = 0;

    if (timeout_ms != 0 || device_map[0] != '\0') {
      return usage_error();
    }
    status = do_get_restore_tid(pid, &tid);
    if (status != CUDA_SUCCESS) {
      print_cuda_error(status);
      return 1;
    }
    return fprintf(stdout, "%d\n", tid) < 0 ? 1 : 0;
  }

  if (strcmp(action, "lock") == 0) {
    /* A device map has no meaning for lock; reject it like checkpoint and unlock do. */
    if (device_map[0] != '\0') {
      return usage_error();
    }
    status = do_lock(pid, timeout_ms);
  } else if (strcmp(action, "checkpoint") == 0) {
    if (timeout_ms != 0 || device_map[0] != '\0') {
      return usage_error();
    }
    status = do_checkpoint(pid);
  } else if (strcmp(action, "restore") == 0) {
    if (timeout_ms != 0) {
      return usage_error();
    }
    status = do_restore(pid, device_map);
  } else if (strcmp(action, "unlock") == 0) {
    if (timeout_ms != 0 || device_map[0] != '\0') {
      return usage_error();
    }
    status = do_unlock(pid);
  } else {
    return usage_error();
  }

  if (status != CUDA_SUCCESS) {
    print_cuda_error(status);
    return 1;
  }
  return 0;
}
...@@ -17,7 +17,7 @@ func main() { ...@@ -17,7 +17,7 @@ func main() {
log := logging.ConfigureLogger("stderr").WithName("nsrestore") log := logging.ConfigureLogger("stderr").WithName("nsrestore")
checkpointPath := flag.String("checkpoint-path", "", "Path to checkpoint directory") checkpointPath := flag.String("checkpoint-path", "", "Path to checkpoint directory")
cudaDeviceMap := flag.String("cuda-device-map", "", "CUDA device map for cuda-checkpoint restore") cudaDeviceMap := flag.String("cuda-device-map", "", "CUDA device map for cuda-checkpoint-helper restore")
cgroupRoot := flag.String("cgroup-root", "", "CRIU cgroup root remap path") cgroupRoot := flag.String("cgroup-root", "", "CRIU cgroup root remap path")
flag.Parse() flag.Parse()
......
...@@ -104,6 +104,20 @@ func ReadProcessDetails(procRoot string, pid int) (ProcessDetails, error) { ...@@ -104,6 +104,20 @@ func ReadProcessDetails(procRoot string, pid int) (ProcessDetails, error) {
}, nil }, nil
} }
// ReadProcessDetailsOrDefault returns the details reported by ReadProcessDetails
// for pid. When that call fails, it falls back to a record in which every PID
// field is pid itself, so callers can still emit pid-scoped log fields.
func ReadProcessDetailsOrDefault(procRoot string, pid int) ProcessDetails {
	if parsed, err := ReadProcessDetails(procRoot, pid); err == nil {
		return parsed
	}
	return ProcessDetails{
		ObservedPID:   pid,
		OutermostPID:  pid,
		InnermostPID:  pid,
		NamespacePIDs: []int{pid},
	}
}
// ReadProcessTable snapshots every numeric proc entry under procRoot. // ReadProcessTable snapshots every numeric proc entry under procRoot.
// Used by restore-side PID remap and diagnostics after CRIU restore. // Used by restore-side PID remap and diagnostics after CRIU restore.
func ReadProcessTable(procRoot string) ([]ProcessDetails, error) { func ReadProcessTable(procRoot string) ([]ProcessDetails, error) {
......
...@@ -107,6 +107,22 @@ func TestReadProcessDetails(t *testing.T) { ...@@ -107,6 +107,22 @@ func TestReadProcessDetails(t *testing.T) {
} }
} }
// TestReadProcessDetailsOrDefault checks the fallback record: with a fresh temp
// directory as procRoot (presumably no entry for the pid exists, so the proc
// read fails), every PID field must equal the requested pid.
func TestReadProcessDetailsOrDefault(t *testing.T) {
	got := ReadProcessDetailsOrDefault(t.TempDir(), 1234)

	if got.ObservedPID != 1234 {
		t.Fatalf("ObservedPID = %d, want 1234", got.ObservedPID)
	}
	if got.OutermostPID != 1234 {
		t.Fatalf("OutermostPID = %d, want 1234", got.OutermostPID)
	}
	if got.InnermostPID != 1234 {
		t.Fatalf("InnermostPID = %d, want 1234", got.InnermostPID)
	}
	if nsPIDs := got.NamespacePIDs; len(nsPIDs) != 1 || nsPIDs[0] != 1234 {
		t.Fatalf("NamespacePIDs = %v, want [1234]", nsPIDs)
	}
}
func TestReadProcessTable(t *testing.T) { func TestReadProcessTable(t *testing.T) {
procRoot := t.TempDir() procRoot := t.TempDir()
writeEntry := func(pid int, status string, cmdline string) { writeEntry := func(pid int, status string, cmdline string) {
......
...@@ -106,7 +106,7 @@ func FilterProcesses(ctx context.Context, allPIDs []int, log logr.Logger) []int ...@@ -106,7 +106,7 @@ func FilterProcesses(ctx context.Context, allPIDs []int, log logr.Logger) []int
if pid <= 0 { if pid <= 0 {
continue continue
} }
cmd := exec.CommandContext(ctx, cudaCheckpointBinary, "--get-restore-tid", "--pid", strconv.Itoa(pid)) cmd := exec.CommandContext(ctx, cudaCheckpointHelperBinary, "--get-restore-tid", "--pid", strconv.Itoa(pid))
output, err := cmd.CombinedOutput() output, err := cmd.CombinedOutput()
if err != nil { if err != nil {
if ctx.Err() != nil { if ctx.Err() != nil {
...@@ -122,7 +122,7 @@ func FilterProcesses(ctx context.Context, allPIDs []int, log logr.Logger) []int ...@@ -122,7 +122,7 @@ func FilterProcesses(ctx context.Context, allPIDs []int, log logr.Logger) []int
return cudaPIDs return cudaPIDs
} }
// BuildDeviceMap creates a cuda-checkpoint --device-map value from source and target GPU UUID lists. // BuildDeviceMap creates a cuda-checkpoint-helper --device-map value from source and target GPU UUID lists.
// When a source UUID exists in the target set, it maps to itself (identity mapping) to avoid // When a source UUID exists in the target set, it maps to itself (identity mapping) to avoid
// unnecessary cross-GPU restore on same-node restores where kubelet returns GPUs in different order. // unnecessary cross-GPU restore on same-node restores where kubelet returns GPUs in different order.
// Remaining unmatched source UUIDs are paired with remaining unmatched target UUIDs positionally. // Remaining unmatched source UUIDs are paired with remaining unmatched target UUIDs positionally.
...@@ -201,7 +201,7 @@ func RestoreAndUnlockProcessTree(ctx context.Context, cudaPIDs []int, deviceMap ...@@ -201,7 +201,7 @@ func RestoreAndUnlockProcessTree(ctx context.Context, cudaPIDs []int, deviceMap
if err := unlock(ctx, pid, log); err != nil { if err := unlock(ctx, pid, log); err != nil {
state, stateErr := getState(ctx, pid) state, stateErr := getState(ctx, pid)
if stateErr == nil && state == "running" { if stateErr == nil && state == "running" {
log.Info("cuda-checkpoint unlock returned error but process is already running", "pid", pid) log.Info("cuda-checkpoint-helper unlock returned error but process is already running", "pid", pid)
continue continue
} }
return err return err
......
...@@ -177,7 +177,7 @@ func TestGetPodGPUUUIDs(t *testing.T) { ...@@ -177,7 +177,7 @@ func TestGetPodGPUUUIDs(t *testing.T) {
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
defer cancel() defer cancel()
got, err := GetPodGPUUUIDs(ctx, nil, "test-pod", "default", "main", logr.Discard()) got, err := GetPodGPUUUIDs(ctx, "test-pod", "default", "main")
if err != nil { if err != nil {
t.Fatalf("GetPodGPUUUIDs: %v", err) t.Fatalf("GetPodGPUUUIDs: %v", err)
} }
......
...@@ -14,7 +14,7 @@ import ( ...@@ -14,7 +14,7 @@ import (
) )
const ( const (
cudaCheckpointBinary = "/usr/local/sbin/cuda-checkpoint" cudaCheckpointHelperBinary = "/usr/local/bin/cuda-checkpoint-helper"
actionLock = "lock" actionLock = "lock"
actionCheckpoint = "checkpoint" actionCheckpoint = "checkpoint"
...@@ -39,14 +39,14 @@ func unlock(ctx context.Context, pid int, log logr.Logger) error { ...@@ -39,14 +39,14 @@ func unlock(ctx context.Context, pid int, log logr.Logger) error {
} }
func getState(ctx context.Context, pid int) (string, error) { func getState(ctx context.Context, pid int) (string, error) {
cmd := exec.CommandContext(ctx, cudaCheckpointBinary, "--get-state", "--pid", strconv.Itoa(pid)) cmd := exec.CommandContext(ctx, cudaCheckpointHelperBinary, "--get-state", "--pid", strconv.Itoa(pid))
output, err := cmd.CombinedOutput() output, err := cmd.CombinedOutput()
state := strings.TrimSpace(string(output)) state := strings.TrimSpace(string(output))
if err != nil { if err != nil {
return "", fmt.Errorf("cuda-checkpoint --get-state failed for pid %d: %w (output: %s)", pid, err, state) return "", fmt.Errorf("cuda-checkpoint-helper --get-state failed for pid %d: %w (output: %s)", pid, err, state)
} }
if state == "" { if state == "" {
return "", fmt.Errorf("cuda-checkpoint --get-state returned empty state for pid %d", pid) return "", fmt.Errorf("cuda-checkpoint-helper --get-state returned empty state for pid %d", pid)
} }
return state, nil return state, nil
} }
...@@ -56,22 +56,14 @@ func runAction(ctx context.Context, pid int, action, deviceMap string, log logr. ...@@ -56,22 +56,14 @@ func runAction(ctx context.Context, pid int, action, deviceMap string, log logr.
if action == actionRestore && deviceMap != "" { if action == actionRestore && deviceMap != "" {
args = append(args, "--device-map", deviceMap) args = append(args, "--device-map", deviceMap)
} }
cmd := exec.CommandContext(ctx, cudaCheckpointBinary, args...) cmd := exec.CommandContext(ctx, cudaCheckpointHelperBinary, args...)
details := common.ProcessDetails{ details := common.ReadProcessDetailsOrDefault("/proc", pid)
ObservedPID: pid,
OutermostPID: pid,
InnermostPID: pid,
NamespacePIDs: []int{pid},
}
if process, err := common.ReadProcessDetails("/proc", pid); err == nil {
details = process
}
start := time.Now() start := time.Now()
output, err := cmd.CombinedOutput() output, err := cmd.CombinedOutput()
duration := time.Since(start) duration := time.Since(start)
out := strings.TrimSpace(string(output)) out := strings.TrimSpace(string(output))
if err != nil { if err != nil {
log.Error(err, "cuda-checkpoint command failed", log.Error(err, "cuda-checkpoint-helper command failed",
"pid", pid, "pid", pid,
"outermost_pid", details.OutermostPID, "outermost_pid", details.OutermostPID,
"innermost_pid", details.InnermostPID, "innermost_pid", details.InnermostPID,
...@@ -80,9 +72,9 @@ func runAction(ctx context.Context, pid int, action, deviceMap string, log logr. ...@@ -80,9 +72,9 @@ func runAction(ctx context.Context, pid int, action, deviceMap string, log logr.
"duration", duration, "duration", duration,
"output", out, "output", out,
) )
return fmt.Errorf("cuda-checkpoint %v failed for pid %d after %s: %w (output: %s)", args, pid, duration, err, out) return fmt.Errorf("cuda-checkpoint-helper %v failed for pid %d after %s: %w (output: %s)", args, pid, duration, err, out)
} }
log.Info("cuda-checkpoint command succeeded", log.Info("cuda-checkpoint-helper command succeeded",
"pid", pid, "pid", pid,
"outermost_pid", details.OutermostPID, "outermost_pid", details.OutermostPID,
"innermost_pid", details.InnermostPID, "innermost_pid", details.InnermostPID,
......
...@@ -47,6 +47,23 @@ make docker-push-placeholder \ ...@@ -47,6 +47,23 @@ make docker-push-placeholder \
This flow is defined in [deploy/snapshot/Makefile](https://github.com/ai-dynamo/dynamo/blob/main/deploy/snapshot/Makefile) and [deploy/snapshot/Dockerfile](https://github.com/ai-dynamo/dynamo/blob/main/deploy/snapshot/Dockerfile). The placeholder image preserves the base runtime entrypoint and command contract, and adds the CRIU, `cuda-checkpoint`, and `nsrestore` tooling needed for restore. This flow is defined in [deploy/snapshot/Makefile](https://github.com/ai-dynamo/dynamo/blob/main/deploy/snapshot/Makefile) and [deploy/snapshot/Dockerfile](https://github.com/ai-dynamo/dynamo/blob/main/deploy/snapshot/Dockerfile). The placeholder image preserves the base runtime entrypoint and command contract, and adds the CRIU, `cuda-checkpoint`, and `nsrestore` tooling needed for restore.
To build either snapshot image against a custom CRIU fork or ref, pass
`CRIU_REPO` and `CRIU_REF` through `make`. If they are unset, the Dockerfile
defaults are used.
```bash
make docker-build-agent \
IMG=registry.example.com/dynamo/snapshot-agent:1.0.0 \
CRIU_REPO="${YOUR_CRIU_REPO}" \
CRIU_REF="branch-or-sha"
make docker-build-placeholder \
PLACEHOLDER_BASE_IMG="${RUNTIME_IMAGE}" \
PLACEHOLDER_IMG="${PLACEHOLDER_IMAGE}" \
CRIU_REPO="${YOUR_CRIU_REPO}" \
CRIU_REF="branch-or-sha"
```
### 2. Enable checkpointing in the platform and verify it ### 2. Enable checkpointing in the platform and verify it
Whether you are installing or upgrading `dynamo-platform`, the operator must have checkpointing enabled and must point at the same storage that the snapshot chart will use: Whether you are installing or upgrading `dynamo-platform`, the operator must have checkpointing enabled and must point at the same storage that the snapshot chart will use:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment