# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Unified Dockerfile for the snapshot-agent and placeholder images.
#
# Build targets:
#   docker build --platform linux/amd64 --target agent -t snapshot-agent:latest .
#   docker build --platform linux/amd64 --target placeholder --build-arg BASE_IMAGE=<runtime-image> -t placeholder:latest .
#
# Optional targets for CI:
#   docker build --target linter .   # Run linting
#   docker build --target tester .   # Run tests

# =============================================================================
# Build Arguments
# Declared before the first FROM so they can be referenced from FROM lines;
# a stage that needs one of them in its body has to re-declare it with ARG.
# =============================================================================

# Optional registry prefix, prepended verbatim to the golang image reference.
ARG DOCKER_PROXY
ARG GO_VERSION=1.25

# CRIU source repository and the exact commit that gets built.
ARG CRIU_REPO=https://github.com/dfeigin-nv/criu.git
ARG CRIU_COMMIT=777baaf27f6a76f743c9bf24b64886297dc0129b

# Base image of the final agent stage.
ARG AGENT_BASE_IMAGE=nvcr.io/nvidia/cuda-dl-base:25.11-cuda13.0-devel-ubuntu24.04

# Used by the placeholder target only. The dummy default keeps the Dockerfile
# parseable for agent builds; placeholder builds MUST override it with
# --build-arg BASE_IMAGE=<runtime-image>.
ARG BASE_IMAGE=placeholder-requires-base-image-arg

# =============================================================================
# Stage: Go base - Toolchain, module cache and sources shared by the Go stages
# =============================================================================
FROM ${DOCKER_PROXY}golang:${GO_VERSION} AS go-base

# Target platform; the defaults apply when the builder does not supply values.
ARG TARGETOS=linux
ARG TARGETARCH=amd64
RUN echo "Building for ${TARGETOS}/${TARGETARCH}"

RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        ca-certificates \
        git \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /workspace

# Download modules before copying the sources so this layer stays cached
# until go.mod or go.sum changes.
COPY go.mod go.sum ./
RUN go mod download

COPY . .
# =============================================================================
# Stage: Linter - Run golangci-lint
# =============================================================================
FROM go-base AS linter

# The linter version is pinned so lint results do not drift between builds.
RUN go install github.com/golangci/golangci-lint/cmd/golangci-lint@v1.62.2
RUN golangci-lint run --timeout=5m

# =============================================================================
# Stage: Tester - Run tests
# =============================================================================
FROM go-base AS tester

RUN go test ./... -v

# =============================================================================
# Stage: Builder - Build Go binaries
# =============================================================================
FROM go-base AS builder

ARG TARGETOS=linux
ARG TARGETARCH=amd64

# CGO_ENABLED=0 builds without cgo; -ldflags="-w -s" omits DWARF data and the
# symbol table to keep the binaries small.
RUN CGO_ENABLED=0 GOOS=${TARGETOS} GOARCH=${TARGETARCH} go build -ldflags="-w -s" -o /snapshot-agent ./cmd/agent
RUN CGO_ENABLED=0 GOOS=${TARGETOS} GOARCH=${TARGETARCH} go build -ldflags="-w -s" -o /nsrestore ./cmd/nsrestore

# =============================================================================
# Stage: CRIU Builder - Build CRIU with CUDA plugin
# =============================================================================
# Pulled through the same optional DOCKER_PROXY prefix as the golang image so
# proxied builds do not fall back to Docker Hub for this one stage. With the
# argument unset the reference is plain ubuntu:24.04.
FROM ${DOCKER_PROXY}ubuntu:24.04 AS criu-builder

# Re-declare the global arguments so they are visible inside this stage.
ARG CRIU_REPO
ARG CRIU_COMMIT

RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        build-essential \
        ca-certificates \
        git \
        libbsd-dev \
        libcap-dev \
        libgnutls28-dev \
        libnet1-dev \
        libnftables-dev \
        libnl-3-dev \
        libnl-route-3-dev \
        libprotobuf-c-dev \
        libprotobuf-dev \
        pkg-config \
        protobuf-c-compiler \
        protobuf-compiler \
        python3 \
        python3-protobuf \
        uuid-dev \
    && rm -rf /var/lib/apt/lists/*

# Fetch only the pinned commit (shallow, no branch history), then build and
# install CRIU, its library and the CUDA plugin under /criu-install.
WORKDIR /tmp/criu
RUN git init . \
    && git remote add origin "${CRIU_REPO}" \
    && git fetch --depth 1 origin "${CRIU_COMMIT}" \
    && git checkout FETCH_HEAD \
    && make -j"$(nproc)" \
    && make DESTDIR=/criu-install install-criu install-lib install-cuda_plugin

# Only the checked-out tree is needed (a prebuilt binary is copied out of it
# by later stages), so a shallow clone is sufficient.
# NOTE(review): this tracks the tip of the repository's default branch and is
# not pinned to a tag or commit - consider pinning for reproducible builds.
RUN git clone --depth 1 https://github.com/NVIDIA/cuda-checkpoint.git /tmp/cuda-checkpoint

# =============================================================================
# Stage: Agent - Final snapshot-agent image
# =============================================================================
FROM ${AGENT_BASE_IMAGE} AS agent

# Fail early on unsupported architectures: only the x86_64 cuda-checkpoint
# binary is copied into this image below.
ARG TARGETARCH=amd64
RUN if [ "${TARGETARCH}" != "amd64" ]; then \
        echo "ERROR: Dynamo Snapshot requires x86_64 (cuda-checkpoint has no ${TARGETARCH} binary)" >&2; exit 1; \
    fi

# Install CRIU runtime dependencies
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        ca-certificates \
        iproute2 \
        iptables \
        libbsd0 \
        libcap2 \
        libgnutls30t64 \
        libnet1 \
        libnftables1 \
        libnl-3-200 \
        libnl-route-3-200 \
        libprotobuf-c1 \
        procps \
        tar \
        util-linux \
        uuid-runtime \
    && rm -rf /var/lib/apt/lists/*

# Copy CRIU from builder; running it once verifies that the runtime libraries
# installed above are sufficient to start the binary.
COPY --from=criu-builder /criu-install/usr/local /usr/local
RUN criu --version

# Copy cuda-checkpoint binary
COPY --from=criu-builder /tmp/cuda-checkpoint/bin/x86_64_Linux/cuda-checkpoint /usr/local/sbin/cuda-checkpoint
RUN chmod +x /usr/local/sbin/cuda-checkpoint

# Copy the built binaries
COPY --from=builder /snapshot-agent /usr/local/bin/snapshot-agent
COPY --from=builder /nsrestore /usr/local/bin/nsrestore

# Create directories
RUN mkdir -p /checkpoints /var/run/snapshot

# NOTE(review): the agent is left running as root, presumably because
# checkpoint/restore needs elevated privileges - confirm before changing.
USER root
ENTRYPOINT ["/usr/local/bin/snapshot-agent"]

# =============================================================================
# Stage: Placeholder - Runtime-compatible restore image (requires BASE_IMAGE arg)
# This image is a superset of the runtime image: same default execution contract
# (entrypoint/cmd/user), plus CRIU/cuda-checkpoint tooling for external restore.
# The operator may still override command to "sleep infinity" for restore pods.
# =============================================================================
FROM ${BASE_IMAGE} AS placeholder

# Re-declared inside the stage so the image reference given to FROM can be
# recorded in the environment below.
ARG BASE_IMAGE
ARG TARGETARCH=amd64

ENV ORIGINAL_BASE_IMAGE=${BASE_IMAGE}

# Root is required for the package installation and file copies that follow.
# NOTE(review): no later USER instruction is visible in this stage, so root
# stays the default user unless it is reset further down - confirm this
# matches the "same user as the runtime image" contract.
USER root

# Abort on anything but amd64: only the x86_64 cuda-checkpoint binary is
# copied into this image.
RUN test "${TARGETARCH}" = "amd64" || { \
        echo "ERROR: Dynamo Snapshot requires x86_64 (cuda-checkpoint has no ${TARGETARCH} binary)" >&2; \
        exit 1; \
    }

# Minimal runtime libraries and tools for CRIU restore (nsrestore runs in this
# image's namespaces via nsenter).
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        ca-certificates \
        iproute2 \
        iptables \
        libbsd0 \
        libcap2 \
        libgnutls30t64 \
        libnet1 \
        libnftables1 \
        libnl-3-200 \
        libnl-route-3-200 \
        libprotobuf-c1 \
        procps \
        tar \
        uuid-runtime \
    && rm -rf /var/lib/apt/lists/*

# CRIU from the builder stage (needed by nsrestore running inside these
# namespaces); the version call doubles as a smoke test of the install.
COPY --from=criu-builder /criu-install/usr/local /usr/local
RUN criu --version && echo "CRIU installed successfully"

# cuda-checkpoint binary (used for external CUDA state checkpoint/restore).
COPY --from=criu-builder /tmp/cuda-checkpoint/bin/x86_64_Linux/cuda-checkpoint /usr/local/sbin/cuda-checkpoint
RUN chmod +x /usr/local/sbin/cuda-checkpoint

# nsrestore binary (invoked by the DaemonSet via nsenter).
COPY --from=builder /nsrestore /usr/local/bin/nsrestore
RUN chmod +x /usr/local/bin/nsrestore

# Working directories for checkpoints and CRIU.
RUN mkdir -p /checkpoints /var/run/criu /var/criu-work