Commit 38bb9d37 (unverified), authored by Schwinn Saereesitthipitak, committed by GitHub
Browse files

refactor: clean up checkpoint orchestration (#7309)


Signed-off-by: Schwinn Saereesitthipitak <schwinns@nvidia.com>
parent 9ea3acad
......@@ -16,7 +16,8 @@
# =============================================================================
ARG DOCKER_PROXY
ARG GO_VERSION=1.25
ARG CRIU_VERSION=v4.2
ARG CRIU_REPO=https://github.com/dfeigin-nv/criu.git
ARG CRIU_VERSION=add-aio-and-parallel-memfd
ARG AGENT_BASE_IMAGE=nvcr.io/nvidia/cuda-dl-base:25.11-cuda13.0-devel-ubuntu24.04
# For placeholder target only - this default allows agent builds to succeed,
......@@ -74,6 +75,7 @@ RUN CGO_ENABLED=0 GOOS=${TARGETOS} GOARCH=${TARGETARCH} go build -ldflags="-w -s
# =============================================================================
FROM ubuntu:24.04 AS criu-builder
ARG CRIU_REPO
ARG CRIU_VERSION
RUN apt-get update && apt-get install -y --no-install-recommends \
......@@ -97,7 +99,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
uuid-dev \
&& rm -rf /var/lib/apt/lists/*
RUN git clone --branch ${CRIU_VERSION} https://github.com/checkpoint-restore/criu.git /tmp/criu \
RUN git clone --depth 1 --branch ${CRIU_VERSION} ${CRIU_REPO} /tmp/criu \
&& cd /tmp/criu \
&& make -j$(nproc) \
&& make DESTDIR=/criu-install install-criu install-lib install-cuda_plugin
......
This diff is collapsed.
......@@ -8,8 +8,8 @@ import (
"github.com/go-logr/logr"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/executor"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/logging"
"github.com/ai-dynamo/dynamo/deploy/snapshot/pkg/orchestrate"
)
func main() {
......@@ -25,13 +25,13 @@ func main() {
fatal(log, nil, "--checkpoint-path is required")
}
opts := orchestrate.RestoreOptions{
opts := executor.RestoreOptions{
CheckpointPath: *checkpointPath,
CUDADeviceMap: *cudaDeviceMap,
CgroupRoot: *cgroupRoot,
}
restoredPID, err := orchestrate.RestoreInNamespace(context.Background(), opts, log)
restoredPID, err := executor.RestoreInNamespace(context.Background(), opts, log)
if err != nil {
fatal(log, err, "restore failed")
}
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment