Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
bb8fc8a4
Unverified
Commit
bb8fc8a4
authored
Feb 20, 2026
by
Schwinn Saereesitthipitak
Committed by
GitHub
Feb 21, 2026
Browse files
feat(chrek): external restore, signal-based IPC, and package refactor (#6286)
Co-authored-by:
Dan Feigin
<
dfeigin@nvidia.com
>
parent
c8423b57
Changes
86
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
399 additions
and
1614 deletions
+399
-1614
components/src/dynamo/vllm/checkpoint_restore.py
components/src/dynamo/vllm/checkpoint_restore.py
+191
-0
components/src/dynamo/vllm/chrek.py
components/src/dynamo/vllm/chrek.py
+0
-157
components/src/dynamo/vllm/main.py
components/src/dynamo/vllm/main.py
+4
-6
deploy/chrek/.gitignore
deploy/chrek/.gitignore
+0
-4
deploy/chrek/Dockerfile
deploy/chrek/Dockerfile
+16
-14
deploy/chrek/Makefile
deploy/chrek/Makefile
+8
-10
deploy/chrek/cmd/agent/config.go
deploy/chrek/cmd/agent/config.go
+11
-96
deploy/chrek/cmd/agent/main.go
deploy/chrek/cmd/agent/main.go
+54
-73
deploy/chrek/cmd/nsrestore/main.go
deploy/chrek/cmd/nsrestore/main.go
+54
-0
deploy/chrek/cmd/restore-entrypoint/main.go
deploy/chrek/cmd/restore-entrypoint/main.go
+0
-107
deploy/chrek/go.mod
deploy/chrek/go.mod
+21
-17
deploy/chrek/go.sum
deploy/chrek/go.sum
+40
-24
deploy/chrek/pkg/checkpoint/checkpoint.go
deploy/chrek/pkg/checkpoint/checkpoint.go
+0
-243
deploy/chrek/pkg/checkpoint/config.go
deploy/chrek/pkg/checkpoint/config.go
+0
-31
deploy/chrek/pkg/checkpoint/constants.go
deploy/chrek/pkg/checkpoint/constants.go
+0
-41
deploy/chrek/pkg/checkpoint/criu.go
deploy/chrek/pkg/checkpoint/criu.go
+0
-258
deploy/chrek/pkg/checkpoint/k8s.go
deploy/chrek/pkg/checkpoint/k8s.go
+0
-79
deploy/chrek/pkg/checkpoint/mounts.go
deploy/chrek/pkg/checkpoint/mounts.go
+0
-218
deploy/chrek/pkg/checkpoint/namespaces.go
deploy/chrek/pkg/checkpoint/namespaces.go
+0
-111
deploy/chrek/pkg/checkpoint/shm.go
deploy/chrek/pkg/checkpoint/shm.go
+0
-125
No files found.
components/src/dynamo/vllm/checkpoint_restore.py
0 → 100644
View file @
bb8fc8a4
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""
Checkpoint/restore (chrek) integration for vLLM workers.
Handles the checkpoint job pod lifecycle:
1. Early exit if a checkpoint already exists (idempotency)
2. Sleep model for CRIU-friendly GPU state
3. Signal readiness for DaemonSet to begin checkpoint
4. Wait for watcher signals from the DaemonSet
5. Wake model after restore
Environment variables:
- DYN_READY_FOR_CHECKPOINT_FILE: Path where this worker writes readiness marker
- DYN_CHECKPOINT_STORAGE_TYPE: Storage backend (pvc, s3, oci) (optional, defaults to pvc)
- DYN_CHECKPOINT_LOCATION: Full checkpoint path (optional when PATH+HASH are provided)
- DYN_CHECKPOINT_PATH + DYN_CHECKPOINT_HASH: PVC base path + hash (used to derive location)
Signals handled in checkpoint mode:
- SIGUSR1: Checkpoint completed, exit process
- SIGCONT: Restore completed, wake model and continue
- SIGUSR2: Checkpoint/restore failed
"""
import
asyncio
import
logging
import
os
import
signal
from
typing
import
Optional
logger
=
logging
.
getLogger
(
__name__
)
class CheckpointConfig:
    """Checkpoint configuration read from the environment, plus the job lifecycle.

    Environment variables read at construction time:

    - ``DYN_READY_FOR_CHECKPOINT_FILE`` (required): path of the readiness marker
      this worker writes. A missing variable raises ``KeyError``.
    - ``DYN_CHECKPOINT_STORAGE_TYPE`` (optional): defaults to ``"pvc"``.
    - ``DYN_CHECKPOINT_LOCATION`` (optional): full checkpoint path.
    - ``DYN_CHECKPOINT_PATH`` + ``DYN_CHECKPOINT_HASH``: when no explicit
      location is given and both are non-empty, the location is derived as
      ``<path>/<hash>``.

    Attributes:
        ready_file: Path of the readiness marker file.
        storage_type: Storage backend name.
        location: Resolved checkpoint location, or ``""`` if none could be
            resolved.
        is_checkpoint_job: ``True`` when ``location`` is non-empty.
    """

    def __init__(self) -> None:
        self.ready_file = os.environ["DYN_READY_FOR_CHECKPOINT_FILE"]
        self.storage_type = os.environ.get("DYN_CHECKPOINT_STORAGE_TYPE", "pvc")
        self.location = os.environ.get("DYN_CHECKPOINT_LOCATION", "")

        if not self.location:
            checkpoint_path = os.environ.get("DYN_CHECKPOINT_PATH", "")
            checkpoint_hash = os.environ.get("DYN_CHECKPOINT_HASH", "")
            # Test the *raw* path for presence and strip trailing slashes only
            # when joining; otherwise a base path of "/" collapses to "" and
            # is mistaken for "unset".
            if checkpoint_path and checkpoint_hash:
                base = checkpoint_path.rstrip("/")
                self.location = f"{base}/{checkpoint_hash}"

        self.is_checkpoint_job = bool(self.location)

        # One event per watcher signal; set from the event-loop signal handlers.
        self._checkpoint_done = asyncio.Event()
        self._restore_done = asyncio.Event()
        self._checkpoint_failed = asyncio.Event()

    def checkpoint_exists(self) -> bool:
        """Check if a completed checkpoint already exists (idempotency).

        A checkpoint is complete when its directory exists at the base path root
        (not under the tmp/ staging area). Directory presence = done.

        Returns:
            ``True`` if ``location`` is an existing directory on PVC storage;
            ``False`` otherwise, including for every non-PVC storage type.
        """
        if self.storage_type != "pvc":
            return False
        if os.path.isdir(self.location):
            logger.info("Existing checkpoint found at %s, skipping", self.location)
            return True
        logger.info("No checkpoint at %s, creating new one", self.location)
        return False

    async def run_lifecycle(self, engine_client, sleep_level: int) -> bool:
        """Run the full checkpoint lifecycle after the engine is loaded.

        1. Put model to sleep (CRIU-friendly GPU state)
        2. Write ready file (triggers DaemonSet checkpoint via readiness probe)
        3. Wait for watcher signal (checkpoint complete, restore complete, or failure)
        4. If restored: wake model and return True (caller proceeds with registration)
        5. If checkpoint done: return False (caller should exit)

        Args:
            engine_client: Object exposing awaitable ``sleep(level=...)`` and
                ``wake_up()`` methods.
            sleep_level: Level forwarded to ``engine_client.sleep``.

        Returns:
            ``True`` after a restore (model woken up), ``False`` after a
            completed checkpoint.

        Raises:
            RuntimeError: If the watcher reported a failure (SIGUSR2).
            OSError: If the ready file cannot be written.
        """
        logger.info("Putting model to sleep (level=%s)", sleep_level)
        await engine_client.sleep(level=sleep_level)

        try:
            # Install signal handlers before writing the ready file so there is
            # no window where the DaemonSet can send SIGUSR1/SIGUSR2/SIGCONT
            # while the default signal disposition (terminate) is still in
            # effect. Installation happens inside the ``try`` so the handlers
            # are removed again even if writing the ready file fails.
            self._install_signal_handlers()

            # Signal readiness
            with open(self.ready_file, "w") as f:
                f.write("ready")

            logger.info(
                "Ready for checkpoint. Waiting for watcher signal "
                "(SIGUSR1=checkpoint complete, SIGCONT=restore complete, SIGUSR2=failure)"
            )

            event = await self._wait_for_watcher_signal()
            if event == "restore":
                logger.info("Restore signal detected (SIGCONT)")
                logger.info("Waking up model after restore")
                await engine_client.wake_up()
                return True
            if event == "checkpoint":
                logger.info("Checkpoint completion signal detected (SIGUSR1)")
                return False
            raise RuntimeError("Checkpoint failed (received SIGUSR2 from watcher)")
        finally:
            self._remove_signal_handlers()
            # Remove the ready file so that a restarting pod does not leave a
            # stale marker that could trick the DaemonSet into acting on it.
            # Cleanup stays best-effort: a missing file is expected, anything
            # else is logged but never masks the primary outcome.
            try:
                os.unlink(self.ready_file)
            except FileNotFoundError:
                pass
            except OSError as exc:
                logger.warning(
                    "Could not remove ready file %s: %s", self.ready_file, exc
                )

    def _install_signal_handlers(self) -> None:
        """Route the three watcher signals to their events on the running loop."""
        loop = asyncio.get_running_loop()
        loop.add_signal_handler(signal.SIGUSR1, self._checkpoint_done.set)
        # SIGCONT is used as the restore-complete signal because SIGUSR1 and
        # SIGUSR2 are already taken (checkpoint-complete and checkpoint-failed
        # respectively). The chrek DaemonSet watcher is the only sender, so
        # there is no conflict with POSIX job-control semantics in practice.
        loop.add_signal_handler(signal.SIGCONT, self._restore_done.set)
        loop.add_signal_handler(signal.SIGUSR2, self._checkpoint_failed.set)

    def _remove_signal_handlers(self) -> None:
        """Remove the handlers added by ``_install_signal_handlers``."""
        loop = asyncio.get_running_loop()
        for signum in (signal.SIGUSR1, signal.SIGCONT, signal.SIGUSR2):
            loop.remove_signal_handler(signum)

    async def _wait_for_watcher_signal(self) -> str:
        """Block until a watcher signal arrives and return its name.

        Returns:
            ``"failed"``, ``"restore"`` or ``"checkpoint"``. When several
            events are set at once the first in that order is reported, so the
            result is deterministic and a failure is never reported as success.
        """
        by_priority = (
            ("failed", self._checkpoint_failed),
            ("restore", self._restore_done),
            ("checkpoint", self._checkpoint_done),
        )
        waiters = [asyncio.create_task(event.wait()) for _, event in by_priority]
        try:
            await asyncio.wait(waiters, return_when=asyncio.FIRST_COMPLETED)
        finally:
            # Also runs when this coroutine itself is cancelled.
            for task in waiters:
                if not task.done():
                    task.cancel()

        for name, event in by_priority:
            if event.is_set():
                return name
        # asyncio.wait only returns once a waiter finished, which requires its
        # event to be set, and events are never cleared.
        raise RuntimeError("watcher wait returned without any signal event set")
def get_checkpoint_config() -> tuple[bool, Optional[CheckpointConfig]]:
    """Work out whether, and how, this worker takes part in checkpointing.

    Checkpoint mode is switched on by the presence of
    DYN_READY_FOR_CHECKPOINT_FILE in the environment.

    Returns:
        A pair ``(early_exit, config)``:
        - ``(True, None)``: checkpoint job re-run and the checkpoint already
          exists, so the caller should return immediately.
        - ``(False, None)``: not in checkpoint mode, or a regular worker with
          no checkpoint available yet, so cold-start normally.
        - ``(False, CheckpointConfig)``: the checkpoint lifecycle should run.

    Raises:
        EnvironmentError: In checkpoint mode, when neither
            DYN_CHECKPOINT_LOCATION nor both DYN_CHECKPOINT_PATH and
            DYN_CHECKPOINT_HASH carry a non-empty value.
    """
    env = os.environ
    if "DYN_READY_FOR_CHECKPOINT_FILE" not in env:
        return False, None

    # Values, not mere presence, are tested: an empty string counts as unset.
    has_location = bool(env.get("DYN_CHECKPOINT_LOCATION"))
    if not has_location:
        base_path = env.get("DYN_CHECKPOINT_PATH", "")
        digest = env.get("DYN_CHECKPOINT_HASH", "")
        if not (base_path and digest):
            raise EnvironmentError(
                "Checkpoint mode requires either DYN_CHECKPOINT_LOCATION or both "
                "DYN_CHECKPOINT_PATH and DYN_CHECKPOINT_HASH"
            )

    cfg = CheckpointConfig()
    already_done = cfg.checkpoint_exists()

    if cfg.is_checkpoint_job:
        if already_done:
            # Idempotent checkpoint job re-run: checkpoint already exists.
            return True, None
        return False, cfg

    if already_done:
        return False, cfg
    # Regular worker with no checkpoint available yet: cold-start normally.
    return False, None
components/src/dynamo/vllm/chrek.py
deleted
100644 → 0
View file @
c8423b57
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""
Checkpoint/restore (chrek) integration for vLLM workers.
Handles the checkpoint job pod lifecycle:
1. Early exit if a checkpoint already exists (idempotency)
2. Sleep model for CRIU-friendly GPU state
3. Signal readiness for DaemonSet to begin checkpoint
4. Poll for checkpoint completion or CRIU restore detection
5. Wake model after restore
Environment variables (all required in checkpoint mode, no fallbacks):
- DYN_CHECKPOINT_SIGNAL_FILE: Path where DaemonSet writes completion signal
- DYN_READY_FOR_CHECKPOINT_FILE: Path where this worker writes readiness marker
- DYN_CHECKPOINT_STORAGE_TYPE: Storage backend (pvc, s3, oci)
- DYN_CHECKPOINT_LOCATION: Full checkpoint path (for idempotency check)
- DYN_RESTORE_MARKER_FILE: Path written by restore-entrypoint before CRIU restore
"""
import
asyncio
import
json
import
logging
import
os
from
typing
import
Optional
logger
=
logging
.
getLogger
(
__name__
)
# Environment variables that must all be present in checkpoint mode;
# get_checkpoint_config() raises EnvironmentError naming any that are missing.
_REQUIRED_ENV_VARS = [
    "DYN_CHECKPOINT_SIGNAL_FILE",
    "DYN_READY_FOR_CHECKPOINT_FILE",
    "DYN_CHECKPOINT_STORAGE_TYPE",
    "DYN_CHECKPOINT_LOCATION",
    "DYN_RESTORE_MARKER_FILE",
]
class CheckpointConfig:
    """Parsed checkpoint configuration and the file-polling checkpoint lifecycle.

    All five environment variables are required; a missing one raises
    ``KeyError`` at construction time.

    Attributes:
        signal_file: Value of DYN_CHECKPOINT_SIGNAL_FILE; JSON status file
            polled for checkpoint completion.
        ready_file: Value of DYN_READY_FOR_CHECKPOINT_FILE; readiness marker
            written by this worker.
        storage_type: Value of DYN_CHECKPOINT_STORAGE_TYPE.
        location: Value of DYN_CHECKPOINT_LOCATION.
        restore_marker: Value of DYN_RESTORE_MARKER_FILE; its existence is
            treated as "this process was restored".
    """

    def __init__(self) -> None:
        self.signal_file = os.environ["DYN_CHECKPOINT_SIGNAL_FILE"]
        self.ready_file = os.environ["DYN_READY_FOR_CHECKPOINT_FILE"]
        self.storage_type = os.environ["DYN_CHECKPOINT_STORAGE_TYPE"]
        self.location = os.environ["DYN_CHECKPOINT_LOCATION"]
        self.restore_marker = os.environ["DYN_RESTORE_MARKER_FILE"]

    def _read_status_file(self, path: str) -> dict:
        """Load a JSON status file and validate its ``success`` field.

        Args:
            path: File to read.

        Returns:
            The parsed JSON object; ``status["success"]`` is a ``bool``.

        Raises:
            OSError: If the file cannot be opened or read.
            ValueError: If the content is not valid JSON
                (``json.JSONDecodeError``), is not a JSON object, or has no
                boolean ``success`` field.
        """
        with open(path) as f:
            status = json.load(f)
        # Valid JSON that is not an object (e.g. ``[]`` or ``"ok"``) has no
        # usable ``success`` field; report it as ValueError so callers'
        # handlers see it instead of an AttributeError from ``.get``.
        success = status.get("success") if isinstance(status, dict) else None
        if not isinstance(success, bool):
            raise ValueError(f"missing or invalid success field in {path}")
        return status

    def checkpoint_exists(self) -> bool:
        """Check if a completed checkpoint already exists (idempotency).

        For PVC storage, checks for checkpoint.done marker at the location.
        Returns True if the job should exit without loading the model.

        An unreadable or invalid marker, or one reporting failure, is logged
        and treated as "no checkpoint".
        """
        # NOTE(review): assert is stripped under ``python -O``; the condition
        # below repeats the storage-type check so that case still returns False.
        assert (
            self.storage_type == "pvc"
        ), "Checkpoint existence check is only implemented for PVC storage"

        if self.storage_type == "pvc" and self.location:
            done_marker = f"{self.location}/checkpoint.done"
            if os.path.exists(done_marker):
                try:
                    status = self._read_status_file(done_marker)
                except (OSError, ValueError, json.JSONDecodeError) as exc:
                    logger.warning(
                        f"Invalid checkpoint.done marker at {done_marker}, ignoring stale checkpoint: {exc}"
                    )
                    return False

                if status["success"]:
                    logger.info(
                        f"Existing successful checkpoint found at {self.location}, skipping"
                    )
                    return True

                logger.warning(
                    f"Existing checkpoint marker reports failure at {self.location}: "
                    f"{status.get('error', 'unknown error')}"
                )
                return False

        logger.info(f"No checkpoint at {self.location}, creating new one")
        return False

    async def run_lifecycle(self, engine_client, sleep_level: int) -> bool:
        """Run the full checkpoint lifecycle after the engine is loaded.

        1. Put model to sleep (CRIU-friendly GPU state)
        2. Write ready file (triggers DaemonSet checkpoint via readiness probe)
        3. Poll for signal file (checkpoint done) or restore marker (CRIU restored us)
        4. If restored: wake model and return True (caller proceeds with registration)
        5. If checkpoint done: return False (caller should exit)

        Args:
            engine_client: Object exposing awaitable ``sleep(level=...)`` and
                ``wake_up()`` methods.
            sleep_level: Level forwarded to ``engine_client.sleep``.

        Raises:
            RuntimeError: If the signal file is unreadable/invalid or reports
                a failed checkpoint.
        """
        # Sleep model for checkpoint
        logger.info(f"Putting model to sleep (level={sleep_level})")
        await engine_client.sleep(level=sleep_level)

        # Signal readiness
        with open(self.ready_file, "w") as f:
            f.write("ready")

        # The trailing space keeps the two concatenated fragments from running
        # together in the log output.
        logger.info(
            f"Ready for checkpoint. Waiting for signal: {self.signal_file} "
            f"or restore marker: {self.restore_marker}"
        )

        # Poll once per second; the restore marker is checked before the
        # signal file on every iteration.
        while True:
            if os.path.exists(self.restore_marker):
                logger.info(f"Restore detected (marker: {self.restore_marker})")
                logger.info("Waking up model after restore")
                await engine_client.wake_up()
                return True

            if os.path.exists(self.signal_file):
                try:
                    status = self._read_status_file(self.signal_file)
                except (OSError, ValueError, json.JSONDecodeError) as exc:
                    raise RuntimeError(
                        f"Invalid checkpoint signal file {self.signal_file}: {exc}"
                    ) from exc

                if status["success"]:
                    logger.info(f"Checkpoint complete (signal: {self.signal_file})")
                    return False

                raise RuntimeError(
                    f"Checkpoint failed (signal: {self.signal_file}): "
                    f"{status.get('error', 'unknown error')}"
                )

            await asyncio.sleep(1)
def get_checkpoint_config() -> Optional[CheckpointConfig]:
    """Build a CheckpointConfig when checkpoint mode is active.

    Checkpoint mode is active exactly when DYN_CHECKPOINT_SIGNAL_FILE is
    present in the environment.

    Returns:
        A new ``CheckpointConfig`` in checkpoint mode, otherwise ``None``.

    Raises:
        EnvironmentError: In checkpoint mode, when any variable listed in
            ``_REQUIRED_ENV_VARS`` is absent; the message names them.
    """
    env = os.environ
    if "DYN_CHECKPOINT_SIGNAL_FILE" in env:
        absent = [name for name in _REQUIRED_ENV_VARS if name not in env]
        if absent:
            raise EnvironmentError(
                f"Checkpoint mode requires these environment variables: {', '.join(absent)}"
            )
        return CheckpointConfig()
    return None
components/src/dynamo/vllm/main.py
View file @
bb8fc8a4
...
...
@@ -49,7 +49,7 @@ from dynamo.runtime.logging import configure_dynamo_logging
from
dynamo.vllm.worker_factory
import
WorkerFactory
from
.args
import
Config
,
parse_args
from
.chre
k
import
get_checkpoint_config
from
.ch
eckpoint_resto
re
import
get_checkpoint_config
from
.handlers
import
DecodeWorkerHandler
,
PrefillWorkerHandler
from
.health_check
import
(
VllmHealthCheckPayload
,
...
...
@@ -100,8 +100,8 @@ async def worker():
config
.
served_model_name
=
config
.
engine_args
.
served_model_name
=
config
.
model
# Check checkpoint mode and validate env vars EARLY (fail fast if misconfigured)
checkpoint_cfg
=
get_checkpoint_config
()
if
checkpoint_cfg
and
checkpoint_cfg
.
checkpoint_exists
()
:
early_exit
,
checkpoint_cfg
=
get_checkpoint_config
()
if
early_exit
:
return
# Download the model if necessary using modelexpress.
...
...
@@ -120,9 +120,7 @@ async def worker():
# This allows checkpointing GPU state before runtime connections are established
pre_created_engine
=
None
if
checkpoint_cfg
is
not
None
:
logger
.
info
(
f
"Checkpoint mode enabled (signal_file=
{
checkpoint_cfg
.
signal_file
}
)"
)
logger
.
info
(
"Checkpoint mode enabled (watcher-driven signals)"
)
# Checkpoint mode requires sleep mode — enable before engine init
config
.
engine_args
.
enable_sleep_mode
=
True
...
...
deploy/chrek/.gitignore
View file @
bb8fc8a4
...
...
@@ -2,10 +2,6 @@
bin/
*.exe
# Reference source repos (clone separately if needed)
containerd/
runc/
# Build artifacts
*.o
*.a
...
...
deploy/chrek/Dockerfile
View file @
bb8fc8a4
...
...
@@ -67,7 +67,7 @@ ARG TARGETOS=linux
ARG
TARGETARCH=amd64
RUN
CGO_ENABLED
=
0
GOOS
=
${
TARGETOS
}
GOARCH
=
${
TARGETARCH
}
go build
-ldflags
=
"-w -s"
-o
/chrek-agent ./cmd/agent
RUN
CGO_ENABLED
=
0
GOOS
=
${
TARGETOS
}
GOARCH
=
${
TARGETARCH
}
go build
-ldflags
=
"-w -s"
-o
/restore
-entrypoint
./cmd/restore
-entrypoint
RUN
CGO_ENABLED
=
0
GOOS
=
${
TARGETOS
}
GOARCH
=
${
TARGETARCH
}
go build
-ldflags
=
"-w -s"
-o
/
ns
restore ./cmd/
ns
restore
# =============================================================================
# Stage: CRIU Builder - Build CRIU with CUDA plugin
...
...
@@ -125,6 +125,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
uuid-runtime
\
tar
\
ca-certificates
\
util-linux
\
&&
rm
-rf
/var/lib/apt/lists/
*
# Copy CRIU from builder
...
...
@@ -137,17 +138,20 @@ RUN chmod +x /usr/local/sbin/cuda-checkpoint
# Copy the built binaries
COPY
--from=builder /chrek-agent /usr/local/bin/chrek-agent
COPY
--from=builder /restore
-entrypoint /restore-entrypoint
COPY
--from=builder /
ns
restore
/usr/local/bin/nsrestore
# Create
checkpoint
director
y
RUN
mkdir
-p
/checkpoints
# Create director
ies
RUN
mkdir
-p
/checkpoints
/var/run/chrek
USER
root
ENTRYPOINT
["/usr/local/bin/chrek-agent"]
# =============================================================================
# Stage: Placeholder - Restore placeholder image (requires BASE_IMAGE arg)
# Stage: Placeholder - Runtime-compatible restore image (requires BASE_IMAGE arg)
# This image is a superset of the runtime image: same default execution contract
# (entrypoint/cmd/user), plus CRIU/cuda-checkpoint tooling for external restore.
# The operator may still override command to "sleep infinity" for restore pods.
# =============================================================================
FROM
${BASE_IMAGE} AS placeholder
...
...
@@ -156,7 +160,7 @@ ENV ORIGINAL_BASE_IMAGE=${BASE_IMAGE}
USER
root
# Install
CRIU
runtime dependencies
# Install
minimal
runtime dependencies
for CRIU restore (nsrestore runs here via nsenter)
RUN
apt-get update
&&
apt-get
install
-y
--no-install-recommends
\
libbsd0
\
libcap2
\
...
...
@@ -174,20 +178,18 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
ca-certificates
\
&&
rm
-rf
/var/lib/apt/lists/
*
# Copy CRIU from builder
# Copy CRIU from builder
(needed by nsrestore running inside these namespaces)
COPY
--from=criu-builder /criu-install/usr/local /usr/local
RUN
criu
--version
&&
echo
"CRIU installed successfully"
# Copy cuda-checkpoint binary
# Copy cuda-checkpoint binary
(used for external CUDA state checkpoint/restore)
COPY
--from=criu-builder /tmp/cuda-checkpoint/bin/x86_64_Linux/cuda-checkpoint /usr/local/sbin/cuda-checkpoint
RUN
chmod
+x /usr/local/sbin/cuda-checkpoint
# Copy nsrestore binary (invoked by DaemonSet via nsenter)
COPY
--from=builder /nsrestore /usr/local/bin/nsrestore
RUN
chmod
+x /usr/local/bin/nsrestore
# Create directories
RUN
mkdir
-p
/checkpoints /var/run/criu /var/criu-work
# Copy restore binaries
COPY
--from=builder /restore-entrypoint /restore-entrypoint
RUN
chmod
+x /restore-entrypoint
ENTRYPOINT
["/restore-entrypoint"]
CMD
[]
deploy/chrek/Makefile
View file @
bb8fc8a4
...
...
@@ -54,17 +54,8 @@ lint-fix: golangci-lint ## Run golangci-lint linter and perform fixes.
##@ Build
.PHONY
:
build
build
:
fmt vet
##
Build chrek-agent
and restore-entrypoint
binar
ies
.
build
:
fmt vet
##
Build chrek-agent binar
y
.
CGO_ENABLED
=
0 go build
-ldflags
=
"-w -s"
-o
bin/chrek-agent ./cmd/agent
CGO_ENABLED
=
0 go build
-ldflags
=
"-w -s"
-o
bin/restore-entrypoint ./cmd/restore-entrypoint
.PHONY
:
build-agent
build-agent
:
fmt vet
##
Build chrek-agent binary only.
CGO_ENABLED
=
0 go build
-ldflags
=
"-w -s"
-o
bin/chrek-agent ./cmd/agent
.PHONY
:
build-restore
build-restore
:
fmt vet
##
Build restore-entrypoint binary only.
CGO_ENABLED
=
0 go build
-ldflags
=
"-w -s"
-o
bin/restore-entrypoint ./cmd/restore-entrypoint
.PHONY
:
run
run
:
build
##
Run chrek-agent from your host.
...
...
@@ -94,8 +85,15 @@ docker-build-placeholder: ## Build placeholder image for checkpoint restore. Req
ifndef
PLACEHOLDER_BASE_IMG
$(
error
PLACEHOLDER_BASE_IMG is required. Example: make docker-build-placeholder
PLACEHOLDER_BASE_IMG
=
nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.8.1-cuda13
)
endif
@
BASE_IMAGE_USER
=
"
$$
(
$(CONTAINER_TOOL)
image inspect --format '{{.Config.User}}'
${PLACEHOLDER_BASE_IMG}
2>/dev/null || true )"
;
\
if
[
-z
"
$$
BASE_IMAGE_USER"
]
;
then
\
$(CONTAINER_TOOL)
pull
${PLACEHOLDER_BASE_IMG}
>
/dev/null
;
\
BASE_IMAGE_USER
=
"
$$
(
$(CONTAINER_TOOL)
image inspect --format '{{.Config.User}}'
${PLACEHOLDER_BASE_IMG}
2>/dev/null || true )"
;
\
fi
;
\
if
[
-z
"
$$
BASE_IMAGE_USER"
]
;
then
BASE_IMAGE_USER
=
root
;
fi
;
\
$(CONTAINER_TOOL)
build
--target
placeholder
\
--build-arg
BASE_IMAGE
=
${PLACEHOLDER_BASE_IMG}
\
--build-arg
BASE_IMAGE_USER
=
$$
BASE_IMAGE_USER
\
-t
${PLACEHOLDER_IMG}
.
.PHONY
:
docker-push-agent
...
...
deploy/chrek/cmd/agent/config.go
View file @
bb8fc8a4
...
...
@@ -2,129 +2,44 @@
package
main
import
(
"errors"
"fmt"
"os"
"gopkg.in/yaml.v3"
"github.com/ai-dynamo/dynamo/deploy/chrek/pkg/
checkpoint
"
"github.com/ai-dynamo/dynamo/deploy/chrek/pkg/
types
"
)
// ConfigMapPath is the default path where the ConfigMap is mounted.
const
ConfigMapPath
=
"/etc/chrek/config.yaml"
// CheckpointSignalSource determines how checkpoint operations are triggered.
type
CheckpointSignalSource
string
const
(
// SignalFromHTTP triggers checkpoints via HTTP API requests.
SignalFromHTTP
CheckpointSignalSource
=
"http"
// SignalFromWatcher triggers checkpoints automatically when pods become Ready.
SignalFromWatcher
CheckpointSignalSource
=
"watcher"
)
// FullConfig is the root configuration structure loaded from the ConfigMap.
type
FullConfig
struct
{
Agent
AgentConfig
`yaml:"agent"`
Checkpoint
checkpoint
.
CheckpointSpec
`yaml:"checkpoint"`
}
// AgentConfig holds the runtime configuration for the checkpoint agent daemon.
type
AgentConfig
struct
{
// SignalSource determines how checkpoints are triggered: "http" or "watcher"
SignalSource
string
`yaml:"signalSource"`
// ListenAddr is the HTTP server address for health checks and API
ListenAddr
string
`yaml:"listenAddr"`
// NodeName is the Kubernetes node name (from NODE_NAME env, downward API)
NodeName
string
`yaml:"-"`
// RestrictedNamespace restricts pod watching to this namespace (optional)
RestrictedNamespace
string
`yaml:"-"`
}
// ConfigError represents a configuration validation error.
type
ConfigError
struct
{
Field
string
Message
string
}
func
(
e
*
ConfigError
)
Error
()
string
{
return
fmt
.
Sprintf
(
"config error: %s: %s"
,
e
.
Field
,
e
.
Message
)
}
// LoadConfig loads the full configuration from a YAML file.
func
LoadConfig
(
path
string
)
(
*
FullConfig
,
error
)
{
// LoadConfig loads the agent configuration from a YAML file.
func
LoadConfig
(
path
string
)
(
*
types
.
AgentConfig
,
error
)
{
data
,
err
:=
os
.
ReadFile
(
path
)
if
err
!=
nil
{
return
nil
,
fmt
.
Errorf
(
"failed to read config file %s: %w"
,
path
,
err
)
}
cfg
:=
&
Full
Config
{}
cfg
:=
&
types
.
Agent
Config
{}
if
err
:=
yaml
.
Unmarshal
(
data
,
cfg
);
err
!=
nil
{
return
nil
,
fmt
.
Errorf
(
"failed to parse config file %s: %w"
,
path
,
err
)
}
// Apply environment variable overrides
cfg
.
Agent
.
loadEnvOverrides
()
cfg
.
LoadEnvOverrides
()
return
cfg
,
nil
}
// LoadConfigOrDefault loads configuration from a file, falling back to
zero value
s if the file doesn't exist.
func
LoadConfigOrDefault
(
path
string
)
(
*
Full
Config
,
error
)
{
// LoadConfigOrDefault loads configuration from a file, falling back to
default
s if the file doesn't exist.
func
LoadConfigOrDefault
(
path
string
)
(
*
types
.
Agent
Config
,
error
)
{
cfg
,
err
:=
LoadConfig
(
path
)
if
err
!=
nil
{
if
os
.
Is
NotExist
(
err
)
{
cfg
=
&
Full
Config
{}
cfg
.
Agent
.
l
oadEnvOverrides
()
if
errors
.
Is
(
err
,
os
.
Err
NotExist
)
{
cfg
=
&
types
.
Agent
Config
{}
cfg
.
L
oadEnvOverrides
()
return
cfg
,
nil
}
return
nil
,
err
}
return
cfg
,
nil
}
// loadEnvOverrides applies environment variable overrides to the AgentConfig.
func
(
c
*
AgentConfig
)
loadEnvOverrides
()
{
if
v
:=
os
.
Getenv
(
"NODE_NAME"
);
v
!=
""
{
c
.
NodeName
=
v
}
if
v
:=
os
.
Getenv
(
"RESTRICTED_NAMESPACE"
);
v
!=
""
{
c
.
RestrictedNamespace
=
v
}
}
// GetSignalSource returns the signal source as a CheckpointSignalSource type.
func
(
c
*
AgentConfig
)
GetSignalSource
()
CheckpointSignalSource
{
return
CheckpointSignalSource
(
c
.
SignalSource
)
}
// Validate checks that the AgentConfig has valid values.
func
(
c
*
AgentConfig
)
Validate
()
error
{
if
c
.
SignalSource
!=
string
(
SignalFromHTTP
)
&&
c
.
SignalSource
!=
string
(
SignalFromWatcher
)
{
return
&
ConfigError
{
Field
:
"signalSource"
,
Message
:
"must be 'http' or 'watcher'"
,
}
}
if
c
.
SignalSource
==
string
(
SignalFromHTTP
)
&&
c
.
ListenAddr
==
""
{
return
&
ConfigError
{
Field
:
"listenAddr"
,
Message
:
"cannot be empty when signalSource is 'http'"
,
}
}
return
nil
}
// Validate validates the full configuration.
func
(
c
*
FullConfig
)
Validate
()
error
{
if
err
:=
c
.
Agent
.
Validate
();
err
!=
nil
{
return
err
}
if
err
:=
c
.
Checkpoint
.
Validate
();
err
!=
nil
{
return
err
}
return
nil
}
deploy/chrek/cmd/agent/main.go
View file @
bb8fc8a4
// Package main provides the CRIU node agent with HTTP API and/or pod watching.
// The agent supports two modes that can be enabled independently:
// - HTTP API mode: Exposes REST endpoints for checkpoint/restore operations
// - Watcher mode: Automatically checkpoints pods with nvidia.com/checkpoint-source=true label
// Package main provides the chrek DaemonSet agent.
// The agent watches for pods with checkpoint/restore labels on its node
// and triggers operations via the orchestrators.
package
main
import
(
"context"
"log"
"net/http"
"os"
"os/signal"
"syscall"
"time"
"github.com/ai-dynamo/dynamo/deploy/chrek/pkg/checkpoint"
httpApiServer
"github.com/ai-dynamo/dynamo/deploy/chrek/pkg/http_api_server"
"github.com/containerd/containerd"
"github.com/go-logr/logr"
"github.com/ai-dynamo/dynamo/deploy/chrek/pkg/common"
"github.com/ai-dynamo/dynamo/deploy/chrek/pkg/logging"
"github.com/ai-dynamo/dynamo/deploy/chrek/pkg/watcher"
)
func
main
()
{
// Load configuration from ConfigMap (or use defaults if not found)
rootLog
:=
logging
.
ConfigureLogger
(
"stdout"
)
agentLog
:=
rootLog
.
WithName
(
"agent"
)
cfg
,
err
:=
LoadConfigOrDefault
(
ConfigMapPath
)
if
err
!=
nil
{
log
.
Fatalf
(
"Failed to load configuration
: %v"
,
err
)
fatal
(
agentLog
,
err
,
"Failed to load configuration
"
)
}
// Validate configuration
if
err
:=
cfg
.
Agent
.
Validate
();
err
!=
nil
{
log
.
Fatalf
(
"Invalid configuration: %v"
,
err
)
if
err
:=
cfg
.
Validate
();
err
!=
nil
{
fatal
(
agentLog
,
err
,
"Invalid configuration"
)
}
// Create discovery client
discoveryClient
,
err
:=
checkpoint
.
NewDiscoveryClient
()
ctrd
,
err
:=
containerd
.
New
(
common
.
ContainerdSocket
)
if
err
!=
nil
{
log
.
Fatalf
(
"Failed to create discovery client: %v"
,
err
)
fatal
(
agentLog
,
err
,
"Failed to connect to containerd"
)
}
defer
discoveryClient
.
Close
()
defer
ctrd
.
Close
()
// Create checkpointer
checkpointer
:=
checkpoint
.
NewCheckpointer
(
discoveryClient
)
// Context for graceful shutdown
ctx
,
cancel
:=
context
.
WithCancel
(
context
.
Background
())
defer
cancel
()
// Handle graceful shutdown
sigChan
:=
make
(
chan
os
.
Signal
,
1
)
signal
.
Notify
(
sigChan
,
syscall
.
SIGINT
,
syscall
.
SIGTERM
)
log
.
Printf
(
"CRIU Node Agent starting (node: %s)"
,
cfg
.
Agent
.
NodeName
)
log
.
Printf
(
"Checkpoint directory: %s"
,
cfg
.
Checkpoint
.
BasePath
)
log
.
Printf
(
"Signal source: %s"
,
cfg
.
Agent
.
SignalSource
)
agentLog
.
Info
(
"Starting chrek agent"
,
"node"
,
cfg
.
NodeName
,
"checkpoint_dir"
,
cfg
.
BasePath
,
"watch_namespace"
,
cfg
.
RestrictedNamespace
,
)
switch
cfg
.
Agent
.
GetSignalSource
()
{
case
SignalFromHTTP
:
serverCfg
:=
httpApiServer
.
ServerConfig
{
ListenAddr
:
cfg
.
Agent
.
ListenAddr
,
NodeName
:
cfg
.
Agent
.
NodeName
,
CheckpointSpec
:
&
cfg
.
Checkpoint
,
podWatcher
,
err
:=
watcher
.
NewWatcher
(
cfg
,
ctrd
,
rootLog
.
WithName
(
"watcher"
))
if
err
!=
nil
{
fatal
(
agentLog
,
err
,
"Failed to create pod watcher"
)
}
srv
:=
httpApiServer
.
NewServer
(
serverCfg
,
checkpointer
)
// Handle graceful shutdown
// Run watcher in the background
watcherDone
:=
make
(
chan
error
,
1
)
go
func
()
{
<-
sigChan
shutdownCtx
,
shutdownCancel
:=
context
.
WithTimeout
(
context
.
Background
(),
30
*
time
.
Second
)
defer
shutdownCancel
()
if
err
:=
srv
.
Shutdown
(
shutdownCtx
);
err
!=
nil
{
log
.
Printf
(
"HTTP server shutdown error: %v"
,
err
)
}
agentLog
.
Info
(
"Pod watcher started"
)
watcherDone
<-
podWatcher
.
Start
(
ctx
)
}()
if
err
:=
srv
.
Start
();
err
!=
http
.
ErrServerClosed
{
log
.
Fatalf
(
"HTTP server error: %v"
,
err
)
// Wait for signal or watcher exit
select
{
case
<-
sigChan
:
agentLog
.
Info
(
"Shutting down"
)
cancel
()
select
{
case
err
:=
<-
watcherDone
:
if
err
!=
nil
{
agentLog
.
Error
(
err
,
"Pod watcher exited with error during shutdown"
)
}
case
SignalFromWatcher
:
watcherConfig
:=
watcher
.
WatcherConfig
{
NodeName
:
cfg
.
Agent
.
NodeName
,
ListenAddr
:
cfg
.
Agent
.
ListenAddr
,
RestrictedNamespace
:
cfg
.
Agent
.
RestrictedNamespace
,
CheckpointSpec
:
&
cfg
.
Checkpoint
,
default
:
}
podWatcher
,
err
:=
watcher
.
NewWatcher
(
watcherConfig
,
discoveryClient
,
checkpointer
)
case
err
:=
<-
watcherDone
:
if
err
!=
nil
{
log
.
Fatalf
(
"Failed to create pod watcher: %v"
,
err
)
fatal
(
agentLog
,
err
,
"Pod watcher exited with
err
or"
)
}
// Handle graceful shutdown
go
func
()
{
<-
sigChan
log
.
Println
(
"Shutting down pod watcher..."
)
cancel
()
}()
log
.
Printf
(
"Pod watcher started (watching for label: %s=true)"
,
checkpoint
.
KubeLabelCheckpointSource
)
log
.
Printf
(
"Health check endpoint: http://0.0.0.0%s/health"
,
cfg
.
Agent
.
ListenAddr
)
if
err
:=
podWatcher
.
Start
(
ctx
);
err
!=
nil
{
log
.
Printf
(
"Pod watcher error: %v"
,
err
)
}
default
:
log
.
Fatalf
(
"Unknown signal source: %s"
,
cfg
.
Agent
.
SignalSource
)
}
agentLog
.
Info
(
"Agent stopped"
)
}
log
.
Println
(
"Agent stopped"
)
func
fatal
(
log
logr
.
Logger
,
err
error
,
msg
string
,
keysAndValues
...
interface
{})
{
if
err
!=
nil
{
log
.
Error
(
err
,
msg
,
keysAndValues
...
)
}
else
{
log
.
Info
(
msg
,
keysAndValues
...
)
}
os
.
Exit
(
1
)
}
deploy/chrek/cmd/nsrestore/main.go
0 → 100644
View file @
bb8fc8a4
package
main
import
(
"context"
"encoding/json"
"flag"
"os"
"github.com/go-logr/logr"
"github.com/ai-dynamo/dynamo/deploy/chrek/pkg/logging"
"github.com/ai-dynamo/dynamo/deploy/chrek/pkg/orchestrate"
)
func
main
()
{
// Logs go to stderr so stdout is reserved for the structured result.
log
:=
logging
.
ConfigureLogger
(
"stderr"
)
.
WithName
(
"nsrestore"
)
checkpointPath
:=
flag
.
String
(
"checkpoint-path"
,
""
,
"Path to checkpoint directory"
)
cudaDeviceMap
:=
flag
.
String
(
"cuda-device-map"
,
""
,
"CUDA device map for cuda-checkpoint restore"
)
cgroupRoot
:=
flag
.
String
(
"cgroup-root"
,
""
,
"CRIU cgroup root remap path"
)
flag
.
Parse
()
if
*
checkpointPath
==
""
{
fatal
(
log
,
nil
,
"--checkpoint-path is required"
)
}
opts
:=
orchestrate
.
RestoreOptions
{
CheckpointPath
:
*
checkpointPath
,
CUDADeviceMap
:
*
cudaDeviceMap
,
CgroupRoot
:
*
cgroupRoot
,
}
restoredPID
,
err
:=
orchestrate
.
RestoreInNamespace
(
context
.
Background
(),
opts
,
log
)
if
err
!=
nil
{
fatal
(
log
,
err
,
"restore failed"
)
}
result
:=
struct
{
RestoredPID
int
`json:"restoredPID"`
}{
RestoredPID
:
restoredPID
}
if
err
:=
json
.
NewEncoder
(
os
.
Stdout
)
.
Encode
(
result
);
err
!=
nil
{
fatal
(
log
,
err
,
"Failed to write restore result"
)
}
}
func
fatal
(
log
logr
.
Logger
,
err
error
,
msg
string
,
keysAndValues
...
interface
{})
{
if
err
!=
nil
{
log
.
Error
(
err
,
msg
,
keysAndValues
...
)
}
else
{
log
.
Info
(
msg
,
keysAndValues
...
)
}
os
.
Exit
(
1
)
}
deploy/chrek/cmd/restore-entrypoint/main.go
deleted
100644 → 0
View file @
c8423b57
// Package main provides the restore-entrypoint binary for self-restoring placeholder containers.
// This binary replaces the shell script restore-entrypoint.sh with a Go implementation
// that uses the go-criu library for CRIU operations.
package
main
import
(
"context"
"fmt"
"os"
"os/exec"
"os/signal"
"path/filepath"
"strings"
"syscall"
"github.com/sirupsen/logrus"
"github.com/ai-dynamo/dynamo/deploy/chrek/pkg/restore"
)
// logGPUDiagnostics prints a GPU-visibility report to stdout, bracketed by
// banner lines that carry label.
//
// The report contains, in order: the output of `nvidia-smi -L`, a per-GPU
// memory query from nvidia-smi, the paths matching /dev/nvidia*, the values of
// NVIDIA_VISIBLE_DEVICES and CUDA_VISIBLE_DEVICES, and the namespace links of
// PID 1. Failures are printed inline instead of being returned.
func logGPUDiagnostics(label string) {
	fmt.Printf("=== GPU DIAGNOSTICS [%s] ===\n", label)

	// GPU inventory as reported by nvidia-smi.
	gpuList, listErr := exec.Command("nvidia-smi", "-L").CombinedOutput()
	if listErr == nil {
		fmt.Printf("nvidia-smi -L:\n%s", gpuList)
	} else {
		fmt.Printf("nvidia-smi -L: error: %v\n", listErr)
	}

	// Per-GPU memory usage.
	memQuery := exec.Command(
		"nvidia-smi",
		"--query-gpu=index,uuid,memory.used,memory.total,memory.free",
		"--format=csv,noheader",
	)
	memUsage, memErr := memQuery.CombinedOutput()
	if memErr == nil {
		fmt.Printf("nvidia-smi memory:\n%s", memUsage)
	} else {
		fmt.Printf("nvidia-smi memory query: error: %v\n", memErr)
	}

	// Device nodes matching /dev/nvidia*; the glob error is ignored.
	deviceNodes, _ := filepath.Glob("/dev/nvidia*")
	fmt.Printf("/dev/nvidia* devices: %s\n", strings.Join(deviceNodes, ", "))

	// Device-selection environment variables.
	fmt.Printf("NVIDIA_VISIBLE_DEVICES=%s\n", os.Getenv("NVIDIA_VISIBLE_DEVICES"))
	fmt.Printf("CUDA_VISIBLE_DEVICES=%s\n", os.Getenv("CUDA_VISIBLE_DEVICES"))

	// Linux namespaces of PID 1; a failed readlink is reported in place of the link.
	namespaceKinds := []string{"mnt", "pid", "ipc", "net", "uts", "cgroup"}
	for _, kind := range namespaceKinds {
		target, readErr := os.Readlink(fmt.Sprintf("/proc/1/ns/%s", kind))
		if readErr != nil {
			target = readErr.Error()
		}
		fmt.Printf("ns/%s: %s\n", kind, target)
	}

	fmt.Printf("=== END GPU DIAGNOSTICS [%s] ===\n", label)
}
func
main
()
{
// Log GPU diagnostics BEFORE anything else (gated on DEBUG for production quietness)
if
os
.
Getenv
(
"DEBUG"
)
==
"1"
{
logGPUDiagnostics
(
"PRE-RESTORE"
)
}
// Set up logging
log
:=
logrus
.
New
()
log
.
SetOutput
(
os
.
Stdout
)
log
.
SetFormatter
(
&
logrus
.
TextFormatter
{
FullTimestamp
:
true
,
TimestampFormat
:
"2006-01-02 15:04:05"
,
})
// Load configuration from hardcoded defaults + operator-injected env vars.
// os.Args[1:] are the cold start command args (passed by the operator via pod spec).
cfg
,
err
:=
restore
.
NewRestoreRequest
(
os
.
Args
[
1
:
])
if
err
!=
nil
{
log
.
WithError
(
err
)
.
Fatal
(
"Failed to load restore configuration"
)
}
// Set log level based on DEBUG flag
if
cfg
.
Debug
{
log
.
SetLevel
(
logrus
.
DebugLevel
)
}
else
{
log
.
SetLevel
(
logrus
.
InfoLevel
)
}
entry
:=
log
.
WithField
(
"component"
,
"restore-entrypoint"
)
// Set up context with signal handling for graceful shutdown
ctx
,
cancel
:=
context
.
WithCancel
(
context
.
Background
())
defer
cancel
()
// Handle shutdown signals
sigChan
:=
make
(
chan
os
.
Signal
,
1
)
signal
.
Notify
(
sigChan
,
syscall
.
SIGTERM
,
syscall
.
SIGINT
)
go
func
()
{
sig
:=
<-
sigChan
entry
.
WithField
(
"signal"
,
sig
)
.
Info
(
"Received shutdown signal"
)
cancel
()
}()
// Run the restore entrypoint
if
err
:=
restore
.
Run
(
ctx
,
cfg
,
entry
);
err
!=
nil
{
entry
.
WithError
(
err
)
.
Fatal
(
"Restore entrypoint failed"
)
}
}
deploy/chrek/go.mod
View file @
bb8fc8a4
...
...
@@ -3,15 +3,23 @@ module github.com/ai-dynamo/dynamo/deploy/chrek
go 1.25.0
require (
github.com/checkpoint-restore/go-criu/v
7
v
7
.2.0
github.com/checkpoint-restore/go-criu/v
8
v
8
.2.0
github.com/containerd/containerd v1.7.30
github.com/cyphar/filepath-securejoin v0.5.1
github.com/go-logr/logr v1.4.3
github.com/go-logr/zapr v1.3.0
github.com/moby/sys/mountinfo v0.7.1
github.com/opencontainers/runtime-spec v1.2.0
github.com/sirupsen/logrus v1.9.4
github.com/prometheus/procfs v0.16.1
go.uber.org/zap v1.27.1
golang.org/x/sys v0.40.0
google.golang.org/grpc v1.72.2
google.golang.org/protobuf v1.36.11
gopkg.in/yaml.v3 v3.0.1
k8s.io/api v0.35.0
k8s.io/apimachinery v0.35.0
k8s.io/client-go v0.35.0
k8s.io/kubelet v0.35.0
)
require (
...
...
@@ -28,30 +36,26 @@ require (
github.com/containerd/platforms v0.2.1 // indirect
github.com/containerd/ttrpc v1.2.7 // indirect
github.com/containerd/typeurl/v2 v2.1.1 // indirect
github.com/cyphar/filepath-securejoin v0.5.1 // indirect
github.com/davecgh/go-spew v1.1.1 // indirect
github.com/distribution/reference v0.6.0 // indirect
github.com/docker/go-events v0.0.0-20190806004212-e31b211e4f1c // indirect
github.com/emicklei/go-restful/v3 v3.12.2 // indirect
github.com/felixge/httpsnoop v1.0.
3
// indirect
github.com/felixge/httpsnoop v1.0.
4
// indirect
github.com/fxamacker/cbor/v2 v2.9.0 // indirect
github.com/go-logr/logr v1.4.3 // indirect
github.com/go-logr/stdr v1.2.2 // indirect
github.com/go-openapi/jsonpointer v0.21.0 // indirect
github.com/go-openapi/jsonreference v0.20.2 // indirect
github.com/go-openapi/swag v0.23.0 // indirect
github.com/gogo/protobuf v1.3.2 // indirect
github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect
github.com/golang/protobuf v1.5.4 // indirect
github.com/google/gnostic-models v0.7.0 // indirect
github.com/google/go-cmp v0.7.0 // indirect
github.com/google/uuid v1.6.0 // indirect
github.com/josharian/intern v1.0.0 // indirect
github.com/json-iterator/go v1.1.12 // indirect
github.com/klauspost/compress v1.1
6.7
// indirect
github.com/klauspost/compress v1.1
8.0
// indirect
github.com/mailru/easyjson v0.7.7 // indirect
github.com/moby/locker v1.0.1 // indirect
github.com/moby/sys/mountinfo v0.7.1 // indirect
github.com/moby/sys/sequential v0.5.0 // indirect
github.com/moby/sys/signal v0.7.0 // indirect
github.com/moby/sys/user v0.3.0 // indirect
...
...
@@ -64,13 +68,15 @@ require (
github.com/opencontainers/selinux v1.13.1 // indirect
github.com/pkg/errors v0.9.1 // indirect
github.com/pmezard/go-difflib v1.0.0 // indirect
github.com/s
pf13/pflag
v1.
0.9
// indirect
github.com/s
irupsen/logrus
v1.
9.3
// indirect
github.com/x448/float16 v0.8.4 // indirect
go.opencensus.io v0.24.0 // indirect
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.45.0 // indirect
go.opentelemetry.io/otel v1.21.0 // indirect
go.opentelemetry.io/otel/metric v1.21.0 // indirect
go.opentelemetry.io/otel/trace v1.21.0 // indirect
go.opentelemetry.io/auto/sdk v1.1.0 // indirect
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.61.0 // indirect
go.opentelemetry.io/otel v1.36.0 // indirect
go.opentelemetry.io/otel/metric v1.36.0 // indirect
go.opentelemetry.io/otel/trace v1.36.0 // indirect
go.uber.org/multierr v1.11.0 // indirect
go.yaml.in/yaml/v2 v2.4.3 // indirect
go.yaml.in/yaml/v3 v3.0.4 // indirect
golang.org/x/net v0.47.0 // indirect
...
...
@@ -80,16 +86,14 @@ require (
golang.org/x/text v0.31.0 // indirect
golang.org/x/time v0.12.0 // indirect
google.golang.org/genproto v0.0.0-20231211222908-989df2bf70f3 // indirect
google.golang.org/genproto/googleapis/rpc v0.0.0-20240401170217-c3f982113cda // indirect
google.golang.org/grpc v1.59.0 // indirect
google.golang.org/genproto/googleapis/rpc v0.0.0-20250528174236-200df99c418a // indirect
gopkg.in/evanphx/json-patch.v4 v4.13.0 // indirect
gopkg.in/inf.v0 v0.9.1 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect
k8s.io/klog/v2 v2.130.1 // indirect
k8s.io/kube-openapi v0.0.0-20250910181357-589584f1c912 // indirect
k8s.io/utils v0.0.0-20251002143259-bc988d571ff4 // indirect
sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730 // indirect
sigs.k8s.io/randfill v1.0.0 // indirect
sigs.k8s.io/structured-merge-diff/v6 v6.3.
0
// indirect
sigs.k8s.io/structured-merge-diff/v6 v6.3.
2-0.20260122202528-d9cc6641c482
// indirect
sigs.k8s.io/yaml v1.6.0 // indirect
)
deploy/chrek/go.sum
View file @
bb8fc8a4
...
...
@@ -11,8 +11,8 @@ github.com/Microsoft/go-winio v0.6.2/go.mod h1:yd8OoFMLzJbo9gZq8j5qaps8bJ9aShtEA
github.com/Microsoft/hcsshim v0.11.7 h1:vl/nj3Bar/CvJSYo7gIQPyRWc9f3c6IeSNavBTSZNZQ=
github.com/Microsoft/hcsshim v0.11.7/go.mod h1:MV8xMfmECjl5HdO7U/3/hFVnkmSBjAjmA09d4bExKcU=
github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU=
github.com/checkpoint-restore/go-criu/v
7
v
7
.2.0 h1:
qGiWA4App1gGlEfIJ68WR9jbezV9J7yZdjzglezcqKo
=
github.com/checkpoint-restore/go-criu/v
7
v
7
.2.0/go.mod h1:
u0LCWLg0w4yqqu14aXhiB4YD3a1qd8EcCEg7vda5dwo
=
github.com/checkpoint-restore/go-criu/v
8
v
8
.2.0 h1:
dsgMgj/eJtZNKn3qn/+Ri0b4bd0uo6o2zt1yd8Nj2NI
=
github.com/checkpoint-restore/go-criu/v
8
v
8
.2.0/go.mod h1:
HVKJ1dK+bowJcFI1MtdL2ECIuY+/AtRMHzD9Lqa4uA4
=
github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDkc90ppPyw=
github.com/cncf/udpa/go v0.0.0-20191209042840-269d4d468f6f/go.mod h1:M8M6+tZqaGXZJjfX53e64911xZQV5JYwmTeXPW+k8Sc=
github.com/containerd/cgroups v1.1.0 h1:v8rEWFl6EoqHB+swVNjVoCJE8o3jX7e8nqBGPLaDFBM=
...
...
@@ -51,8 +51,8 @@ github.com/envoyproxy/go-control-plane v0.9.0/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymF
github.com/envoyproxy/go-control-plane v0.9.1-0.20191026205805-5f8ba28d4473/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4=
github.com/envoyproxy/go-control-plane v0.9.4/go.mod h1:6rpuAdCZL397s3pYoYcLgu1mIlRU8Am5FuJP05cCM98=
github.com/envoyproxy/protoc-gen-validate v0.1.0/go.mod h1:iSmxcyjqTsJpI2R4NaDN7+kN2VEUnK/pcBlmesArF7c=
github.com/felixge/httpsnoop v1.0.
3
h1:
s/nj+GCswXYzN5v2DpNMuMQYe+0DDwt5WVCU6CWBdXk
=
github.com/felixge/httpsnoop v1.0.
3
/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U=
github.com/felixge/httpsnoop v1.0.
4
h1:
NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg
=
github.com/felixge/httpsnoop v1.0.
4
/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U=
github.com/fxamacker/cbor/v2 v2.9.0 h1:NpKPmjDBgUfBms6tr6JZkTHtfFGcMKsw3eGcmD/sapM=
github.com/fxamacker/cbor/v2 v2.9.0/go.mod h1:vM4b+DJCtHn+zz7h3FFp/hDAI9WNWCsZj23V5ytsSxQ=
github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A=
...
...
@@ -60,6 +60,8 @@ github.com/go-logr/logr v1.4.3 h1:CjnDlHq8ikf6E492q6eKboGOC0T8CDaOvkHCIg8idEI=
github.com/go-logr/logr v1.4.3/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY=
github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag=
github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE=
github.com/go-logr/zapr v1.3.0 h1:XGdV8XW8zdwFiwOA2Dryh1gj2KRQyOOoNmBy4EplIcQ=
github.com/go-logr/zapr v1.3.0/go.mod h1:YKepepNBd1u/oyhd/yQmtjVXmm9uML4IXUgMOwR8/Gg=
github.com/go-openapi/jsonpointer v0.19.6/go.mod h1:osyAmYz/mB/C3I+WsTTSgw1ONzaLJoLCyoi6/zppojs=
github.com/go-openapi/jsonpointer v0.21.0 h1:YgdVicSA9vH5RiHs9TZW5oyafXZFc6+2Vc1rr/O9oNQ=
github.com/go-openapi/jsonpointer v0.21.0/go.mod h1:IUyH9l/+uyhIYQ/PXVA41Rexl+kOkAPDdXEYns6fzUY=
...
...
@@ -112,8 +114,8 @@ github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnr
github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo=
github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8=
github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck=
github.com/klauspost/compress v1.1
6.7
h1:
2mk3MPGNzKyxErAw8YaohYh69+pa4sIQSC0fPGCFR9I
=
github.com/klauspost/compress v1.1
6.7
/go.mod h1:
ntbaceVETuRiXiv4DpjP66DpAtAGkEQskQzEyD//IeE
=
github.com/klauspost/compress v1.1
8.0
h1:
c/Cqfb0r+Yi+JtIEq73FWXVkRonBlf0CRNYc8Zttxdo
=
github.com/klauspost/compress v1.1
8.0
/go.mod h1:
2Pp+KzxcywXVXMr50+X0Q/Lsb43OQHYWRCY2AiWywWQ
=
github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI=
github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE=
github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk=
...
...
@@ -160,12 +162,12 @@ github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINE
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA=
github.com/prometheus/procfs v0.1
0
.1 h1:
kYK1Va/YMlutzCGazswoHKo//tZVlFpKYh+PymziUA
g=
github.com/prometheus/procfs v0.1
0
.1/go.mod h1:
nwNm2aOCAYw8uTR/9bWRREkZFxAUcWzPHWJq+XBB/FM
=
github.com/prometheus/procfs v0.1
6
.1 h1:
hZ15bTNuirocR6u0JZ6BAHHmwS1p8B4P6MRqxtzMyR
g=
github.com/prometheus/procfs v0.1
6
.1/go.mod h1:
teAbpZRB1iIAJYREa1LsoWUXykVXA1KlTmWl8x/U+Is
=
github.com/rogpeppe/go-internal v1.14.1 h1:UQB4HGPB6osV0SQTLymcB4TgvyWu6ZyliaW0tI/otEQ=
github.com/rogpeppe/go-internal v1.14.1/go.mod h1:MaRKkUm5W0goXpeCfT7UZI6fk/L7L7so1lCWt35ZSgc=
github.com/sirupsen/logrus v1.9.
4
h1:
TsZE7l11zFCLZnZ+teH4Umoq5BhEIfIzfRDZ1Uzql2w
=
github.com/sirupsen/logrus v1.9.
4
/go.mod h1:
ftWc9WdOfJ0a92nsE2jF5u5ZwH8Bv2zdeOC42RjbV2g
=
github.com/sirupsen/logrus v1.9.
3
h1:
dueUQJ1C2q9oE3F7wvmSGAaVtTmUizReu6fjN8uqzbQ
=
github.com/sirupsen/logrus v1.9.
3
/go.mod h1:
naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ
=
github.com/spf13/pflag v1.0.9 h1:9exaQaMOCwffKiiiYk6/BndUBv+iRViNW+4lEMi0PvY=
github.com/spf13/pflag v1.0.9/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
...
...
@@ -174,6 +176,7 @@ github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpE
github.com/stretchr/objx v0.5.2 h1:xuMeJ0Sdp5ZMRXx/aWO6RZxdr3beISkG5/G/aIRr3pY=
github.com/stretchr/objx v0.5.2/go.mod h1:FRsXN1f5AsAjCGJKqEizvkpNtU+EGNCLh3NxZ/8L+MA=
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU=
github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
...
...
@@ -185,16 +188,26 @@ github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9de
github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
go.opencensus.io v0.24.0 h1:y73uSU6J157QMP2kn2r30vwW1A2W2WFwSCGnAVxeaD0=
go.opencensus.io v0.24.0/go.mod h1:vNK8G9p7aAivkbmorf4v+7Hgx+Zs0yY+0fOtgBfjQKo=
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.45.0 h1:x8Z78aZx8cOF0+Kkazoc7lwUNMGy0LrzEMxTm4BbTxg=
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.45.0/go.mod h1:62CPTSry9QZtOaSsE3tOzhx6LzDhHnXJ6xHeMNNiM6Q=
go.opentelemetry.io/otel v1.21.0 h1:hzLeKBZEL7Okw2mGzZ0cc4k/A7Fta0uoPgaJCr8fsFc=
go.opentelemetry.io/otel v1.21.0/go.mod h1:QZzNPQPm1zLX4gZK4cMi+71eaorMSGT3A4znnUvNNEo=
go.opentelemetry.io/otel/metric v1.21.0 h1:tlYWfeo+Bocx5kLEloTjbcDwBuELRrIFxwdQ36PlJu4=
go.opentelemetry.io/otel/metric v1.21.0/go.mod h1:o1p3CA8nNHW8j5yuQLdc1eeqEaPfzug24uvsyIEJRWM=
go.opentelemetry.io/otel/trace v1.21.0 h1:WD9i5gzvoUPuXIXH24ZNBudiarZDKuekPqi/E8fpfLc=
go.opentelemetry.io/otel/trace v1.21.0/go.mod h1:LGbsEB0f9LGjN+OZaQQ26sohbOmiMR+BaslueVtS/qQ=
go.opentelemetry.io/auto/sdk v1.1.0 h1:cH53jehLUN6UFLY71z+NDOiNJqDdPRaXzTel0sJySYA=
go.opentelemetry.io/auto/sdk v1.1.0/go.mod h1:3wSPjt5PWp2RhlCcmmOial7AvC4DQqZb7a7wCow3W8A=
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.61.0 h1:F7Jx+6hwnZ41NSFTO5q4LYDtJRXBf2PD0rNBkeB/lus=
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.61.0/go.mod h1:UHB22Z8QsdRDrnAtX4PntOl36ajSxcdUMt1sF7Y6E7Q=
go.opentelemetry.io/otel v1.36.0 h1:UumtzIklRBY6cI/lllNZlALOF5nNIzJVb16APdvgTXg=
go.opentelemetry.io/otel v1.36.0/go.mod h1:/TcFMXYjyRNh8khOAO9ybYkqaDBb/70aVwkNML4pP8E=
go.opentelemetry.io/otel/metric v1.36.0 h1:MoWPKVhQvJ+eeXWHFBOPoBOi20jh6Iq2CcCREuTYufE=
go.opentelemetry.io/otel/metric v1.36.0/go.mod h1:zC7Ks+yeyJt4xig9DEw9kuUFe5C3zLbVjV2PzT6qzbs=
go.opentelemetry.io/otel/sdk v1.36.0 h1:b6SYIuLRs88ztox4EyrvRti80uXIFy+Sqzoh9kFULbs=
go.opentelemetry.io/otel/sdk v1.36.0/go.mod h1:+lC+mTgD+MUWfjJubi2vvXWcVxyr9rmlshZni72pXeY=
go.opentelemetry.io/otel/sdk/metric v1.36.0 h1:r0ntwwGosWGaa0CrSt8cuNuTcccMXERFwHX4dThiPis=
go.opentelemetry.io/otel/sdk/metric v1.36.0/go.mod h1:qTNOhFDfKRwX0yXOqJYegL5WRaW376QbB7P4Pb0qva4=
go.opentelemetry.io/otel/trace v1.36.0 h1:ahxWNuqZjpdiFAyrIoQ4GIiAIhxAunQR6MUoKrsNd4w=
go.opentelemetry.io/otel/trace v1.36.0/go.mod h1:gQ+OnDZzrybY4k4seLzPAWNwVBBVlF2szhehOBB/tGA=
go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto=
go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE=
go.uber.org/multierr v1.11.0 h1:blXXJkSxSSfBVBlC76pxqeO+LN3aDfLQo+309xJstO0=
go.uber.org/multierr v1.11.0/go.mod h1:20+QtiLqy0Nd6FdQB9TLXag12DsQkrbs3htMFfDN80Y=
go.uber.org/zap v1.27.1 h1:08RqriUEv8+ArZRYSTXy1LeBScaMpVSTBhCeaZYfMYc=
go.uber.org/zap v1.27.1/go.mod h1:GB2qFLM7cTU87MWRP2mPIjqfIDnGu+VIO4V/SdhGo2E=
go.yaml.in/yaml/v2 v2.4.3 h1:6gvOSjQoTB3vt1l+CU+tSyi/HOjfOjRLJ4YwYZGwRO0=
go.yaml.in/yaml/v2 v2.4.3/go.mod h1:zSxWcmIDjOzPXpjlTTbAsKokqkDNAVtZO0WOMiT90s8=
go.yaml.in/yaml/v3 v3.0.4 h1:tfq32ie2Jv2UxXFdLJdh3jXuOzWiL1fo0bu/FbuKpbc=
...
...
@@ -237,6 +250,7 @@ golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7w
golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20211025201205-69cdffdb9359/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.40.0 h1:DBZZqJ2Rkml6QMQsZywtnjnnGvHza6BTfYFWY9kjEWQ=
golang.org/x/sys v0.40.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks=
golang.org/x/term v0.37.0 h1:8EGAD0qCmHYZg6J17DvsMy9/wJ7/D/4pV/wfnld5lTU=
...
...
@@ -268,15 +282,15 @@ google.golang.org/genproto v0.0.0-20190819201941-24fa4b261c55/go.mod h1:DMBHOl98
google.golang.org/genproto v0.0.0-20200526211855-cb27e3aa2013/go.mod h1:NbSheEEYHJ7i3ixzK3sjbqSGDJWnxyFXZblF3eUsNvo=
google.golang.org/genproto v0.0.0-20231211222908-989df2bf70f3 h1:1hfbdAfFbkmpg41000wDVqr7jUpK/Yo+LPnIxxGzmkg=
google.golang.org/genproto v0.0.0-20231211222908-989df2bf70f3/go.mod h1:5RBcpGRxr25RbDzY5w+dmaqpSEvl8Gwl1x2CICf60ic=
google.golang.org/genproto/googleapis/rpc v0.0.0-202
40401170217-c3f982113cda h1:LI5DOvAxUPMv/50agcLLoo+AdWc1irS9Rzz4vPuD1V4
=
google.golang.org/genproto/googleapis/rpc v0.0.0-202
40401170217-c3f982113cda/go.mod h1:WtryC6hu0hhx87FDGxWCDptyssuo68sk10vYjF+T9fY
=
google.golang.org/genproto/googleapis/rpc v0.0.0-202
50528174236-200df99c418a h1:v2PbRU4K3llS09c7zodFpNePeamkAwG3mPrAery9VeE
=
google.golang.org/genproto/googleapis/rpc v0.0.0-202
50528174236-200df99c418a/go.mod h1:qQ0YXyHHx3XkvlzUtpXDkS29lDSafHMZBAZDc03LQ3A
=
google.golang.org/grpc v1.19.0/go.mod h1:mqu4LbDTu4XGKhr4mRzUsmM4RtVoemTSY81AxZiDr8c=
google.golang.org/grpc v1.23.0/go.mod h1:Y5yQAOtifL1yxbo5wqy6BxZv8vAUGQwXBOALyacEbxg=
google.golang.org/grpc v1.25.1/go.mod h1:c3i+UQWmh7LiEpx4sFZnkU36qjEYZ0imhYfXVyQciAY=
google.golang.org/grpc v1.27.0/go.mod h1:qbnxyOmOxrQa7FizSgH+ReBfzJrCY1pSN7KXBS8abTk=
google.golang.org/grpc v1.33.2/go.mod h1:JMHMWHQWaTccqQQlmk3MJZS+GWXOdAesneDmEnv2fbc=
google.golang.org/grpc v1.
59.0 h1:Z5Iec2pjwb+LEOqzpB2MR12/eKFhDPhuqW91O+4bwUk
=
google.golang.org/grpc v1.
59.0
/go.mod h1:
aUPDwccQo6OTjy7Hct4AfBPD1GptF4fyUjIkQ9YtF98
=
google.golang.org/grpc v1.
72.2 h1:TdbGzwb82ty4OusHWepvFWGLgIbNo1/SUynEN0ssqv8
=
google.golang.org/grpc v1.
72.2
/go.mod h1:
wH5Aktxcg25y1I3w7H69nHfXdOG3UiadoBtjh3izSDM
=
google.golang.org/protobuf v0.0.0-20200109180630-ec00e32a8dfd/go.mod h1:DFci5gLYBciE7Vtevhsrf46CRTquxDuWsQurQQe4oz8=
google.golang.org/protobuf v0.0.0-20200221191635-4d8936d0db64/go.mod h1:kwYJMbMJ01Woi6D6+Kah6886xMZcty6N08ah7+eCXa0=
google.golang.org/protobuf v0.0.0-20200228230310-ab0ca4ff8a60/go.mod h1:cfTl7dwQJ+fmap5saPgwCLgHXTUD7jkjRqWcaiX5VyM=
...
...
@@ -312,13 +326,15 @@ k8s.io/klog/v2 v2.130.1 h1:n9Xl7H1Xvksem4KFG4PYbdQCQxqc/tTUyrgXaOhHSzk=
k8s.io/klog/v2 v2.130.1/go.mod h1:3Jpz1GvMt720eyJH1ckRHK1EDfpxISzJ7I9OYgaDtPE=
k8s.io/kube-openapi v0.0.0-20250910181357-589584f1c912 h1:Y3gxNAuB0OBLImH611+UDZcmKS3g6CthxToOb37KgwE=
k8s.io/kube-openapi v0.0.0-20250910181357-589584f1c912/go.mod h1:kdmbQkyfwUagLfXIad1y2TdrjPFWp2Q89B3qkRwf/pQ=
k8s.io/kubelet v0.35.0 h1:8cgJHCBCKLYuuQ7/Pxb/qWbJfX1LXIw7790ce9xHq7c=
k8s.io/kubelet v0.35.0/go.mod h1:ciRzAXn7C4z5iB7FhG1L2CGPPXLTVCABDlbXt/Zz8YA=
k8s.io/utils v0.0.0-20251002143259-bc988d571ff4 h1:SjGebBtkBqHFOli+05xYbK8YF1Dzkbzn+gDM4X9T4Ck=
k8s.io/utils v0.0.0-20251002143259-bc988d571ff4/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0=
sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730 h1:IpInykpT6ceI+QxKBbEflcR5EXP7sU1kvOlxwZh5txg=
sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730/go.mod h1:mdzfpAEoE6DHQEN0uh9ZbOCuHbLK5wOm7dK4ctXE9Tg=
sigs.k8s.io/randfill v1.0.0 h1:JfjMILfT8A6RbawdsK2JXGBR5AQVfd+9TbzrlneTyrU=
sigs.k8s.io/randfill v1.0.0/go.mod h1:XeLlZ/jmk4i1HRopwe7/aU3H5n1zNUcX6TM94b3QxOY=
sigs.k8s.io/structured-merge-diff/v6 v6.3.
0 h1:jTijUJbW353oVOd9oTlifJqOGEkUw2jB/fXCbTiQEco
=
sigs.k8s.io/structured-merge-diff/v6 v6.3.
0
/go.mod h1:M3W8sfWvn2HhQDIbGWj3S099YozAsymCo/wrT5ohRUE=
sigs.k8s.io/structured-merge-diff/v6 v6.3.
2-0.20260122202528-d9cc6641c482 h1:2WOzJpHUBVrrkDjU4KBT8n5LDcj824eX0I5UKcgeRUs
=
sigs.k8s.io/structured-merge-diff/v6 v6.3.
2-0.20260122202528-d9cc6641c482
/go.mod h1:M3W8sfWvn2HhQDIbGWj3S099YozAsymCo/wrT5ohRUE=
sigs.k8s.io/yaml v1.6.0 h1:G8fkbMSAFqgEFgh4b1wmtzDnioxFCUgTZhlbj5P9QYs=
sigs.k8s.io/yaml v1.6.0/go.mod h1:796bPqUfzR/0jLAl6XjHl3Ck7MiyVv8dbTdyT3/pMf4=
deploy/chrek/pkg/checkpoint/checkpoint.go
deleted
100644 → 0
View file @
c8423b57
// Package checkpoint provides CRIU checkpoint (dump) operations.
package
checkpoint
import
(
"context"
"fmt"
"os"
"path/filepath"
"time"
criurpc
"github.com/checkpoint-restore/go-criu/v7/rpc"
specs
"github.com/opencontainers/runtime-spec/specs-go"
"github.com/sirupsen/logrus"
"google.golang.org/protobuf/proto"
"github.com/ai-dynamo/dynamo/deploy/chrek/pkg/common"
)
// ContainerInfoSnapshot holds runtime/container info needed for checkpointing.
// It is assembled by (*Checkpointer).introspect and consumed when building the
// CRIU dump options and the checkpoint manifest.
type ContainerInfoSnapshot struct {
	// PID is the process ID returned by DiscoveryClient.ResolveContainer.
	PID int
	// RootFS is the value returned by GetRootFS for PID.
	RootFS string
	// UpperDir is the value returned by GetOverlayUpperDir for PID.
	UpperDir string
	// OCISpec is the OCI runtime spec returned by DiscoveryClient.ResolveContainer.
	OCISpec *specs.Spec
	// MountInfo is the mount table returned by ReadMountInfoFromHostProcPath for PID.
	MountInfo []MountInfo
	// Namespaces is the namespace set returned by GetAllNamespaces for PID,
	// keyed by namespace type.
	Namespaces map[NamespaceType]*NamespaceInfo
}
// CheckpointManifest is saved as manifest.yaml at checkpoint time and loaded at restore.
// The yaml tags on the fields define the serialized key names.
type CheckpointManifest struct {
	// CheckpointID identifies the checkpoint this manifest describes.
	CheckpointID string `yaml:"checkpointId"`
	// CreatedAt is the creation timestamp; NewCheckpointManifest sets it to
	// the current time in UTC.
	CreatedAt time.Time `yaml:"createdAt"`

	// CRIUDump is the CRIU dump section of the manifest.
	CRIUDump CRIUDumpManifest `yaml:"criuDump"`
	// K8s is the source pod section of the manifest.
	K8s SourcePodManifest `yaml:"k8s"`
	// Filesystem is the filesystem section of the manifest.
	Filesystem FilesystemManifest `yaml:"filesystem"`
	// Namespaces lists the namespace entries recorded for the checkpoint.
	Namespaces []NamespaceManifestEntry `yaml:"namespaces"`
}
// NewCheckpointManifest assembles a CheckpointManifest from per-module builders.
func
NewCheckpointManifest
(
checkpointID
string
,
criuDump
CRIUDumpManifest
,
k8s
SourcePodManifest
,
filesystem
FilesystemManifest
,
namespaces
[]
NamespaceManifestEntry
,
)
*
CheckpointManifest
{
return
&
CheckpointManifest
{
CheckpointID
:
checkpointID
,
CreatedAt
:
time
.
Now
()
.
UTC
(),
CRIUDump
:
criuDump
,
K8s
:
k8s
,
Filesystem
:
filesystem
,
Namespaces
:
namespaces
,
}
}
// CheckpointRequest holds per-checkpoint identifiers for a checkpoint operation.
type CheckpointRequest struct {
	// ContainerID is the container to checkpoint; it is passed to
	// DiscoveryClient.ResolveContainer.
	ContainerID string
	// ContainerName is the K8s container name (for K8s API volume type lookup).
	ContainerName string
	// CheckpointID identifies this checkpoint and is also used as the name of
	// its subdirectory under CheckpointDir.
	CheckpointID string
	// CheckpointDir is the parent directory; artifacts are written to
	// CheckpointDir/CheckpointID.
	CheckpointDir string
	// NodeName, PodName and PodNamespace identify the source pod; they are
	// handed to NewSourcePodManifest as part of the request.
	NodeName     string
	PodName      string
	PodNamespace string
}
// CheckpointOutcome contains the result of a checkpoint operation.
type CheckpointOutcome struct {
	// CheckpointID is copied from the originating CheckpointRequest.
	CheckpointID string
	// CheckpointDir is the directory the artifacts were written to, i.e. the
	// request's CheckpointDir joined with its CheckpointID.
	CheckpointDir string
	// Data is the manifest that was built and written for this checkpoint.
	Data *CheckpointManifest
}
// Checkpointer performs CRIU checkpoint operations
type Checkpointer struct {
	// discoveryClient resolves a container ID to its PID and OCI spec.
	discoveryClient *DiscoveryClient
	// log is the logger shared by all checkpoint phases.
	log *logrus.Entry
}
// NewCheckpointer creates a new checkpointer
func
NewCheckpointer
(
discoveryClient
*
DiscoveryClient
)
*
Checkpointer
{
return
&
Checkpointer
{
discoveryClient
:
discoveryClient
,
log
:
logrus
.
WithField
(
"component"
,
"checkpointer"
),
}
}
// Checkpoint performs a CRIU dump of a container.
// The operation has three phases: introspect, configure, capture.
func
(
c
*
Checkpointer
)
Checkpoint
(
ctx
context
.
Context
,
req
CheckpointRequest
,
spec
*
CheckpointSpec
)
(
*
CheckpointOutcome
,
error
)
{
if
spec
==
nil
{
return
nil
,
fmt
.
Errorf
(
"checkpoint spec is required"
)
}
checkpointStart
:=
time
.
Now
()
c
.
log
.
Info
(
"=== Starting checkpoint operation ==="
)
checkpointDir
:=
filepath
.
Join
(
req
.
CheckpointDir
,
req
.
CheckpointID
)
if
err
:=
os
.
MkdirAll
(
checkpointDir
,
0700
);
err
!=
nil
{
return
nil
,
fmt
.
Errorf
(
"failed to create checkpoint directory: %w"
,
err
)
}
// Open image directory FD for CRIU — must stay open through both configure and capture
// phases since CRIU's swrk child process inherits this FD.
imageDir
,
imageDirFD
,
err
:=
common
.
OpenPathForCRIU
(
checkpointDir
)
if
err
!=
nil
{
return
nil
,
fmt
.
Errorf
(
"failed to open image directory: %w"
,
err
)
}
defer
imageDir
.
Close
()
// Phase 1: Introspect container state
state
,
err
:=
c
.
introspect
(
ctx
,
req
.
ContainerID
)
if
err
!=
nil
{
return
nil
,
err
}
// Phase 2: Configure CRIU options and build checkpoint manifest.
criuOpts
,
data
,
err
:=
c
.
configure
(
state
,
req
,
spec
,
checkpointDir
,
imageDirFD
)
if
err
!=
nil
{
return
nil
,
err
}
// Phase 3: Capture — CRIU dump, /dev/shm, rootfs diff
criuDumpDuration
,
err
:=
c
.
capture
(
criuOpts
,
data
,
state
,
checkpointDir
)
if
err
!=
nil
{
return
nil
,
err
}
totalDuration
:=
time
.
Since
(
checkpointStart
)
c
.
log
.
WithFields
(
logrus
.
Fields
{
"total_duration"
:
totalDuration
,
"criu_dump_duration"
:
criuDumpDuration
,
})
.
Info
(
"=== Checkpoint operation completed ==="
)
return
&
CheckpointOutcome
{
CheckpointID
:
req
.
CheckpointID
,
CheckpointDir
:
checkpointDir
,
Data
:
data
,
},
nil
}
// introspect resolves the container and gathers all runtime state from containerd and /proc.
func
(
c
*
Checkpointer
)
introspect
(
ctx
context
.
Context
,
containerID
string
)
(
*
ContainerInfoSnapshot
,
error
)
{
pid
,
ociSpec
,
err
:=
c
.
discoveryClient
.
ResolveContainer
(
ctx
,
containerID
)
if
err
!=
nil
{
return
nil
,
fmt
.
Errorf
(
"failed to resolve container: %w"
,
err
)
}
rootFS
,
err
:=
GetRootFS
(
pid
)
if
err
!=
nil
{
return
nil
,
fmt
.
Errorf
(
"failed to get rootfs: %w"
,
err
)
}
upperDir
,
err
:=
GetOverlayUpperDir
(
pid
)
if
err
!=
nil
{
return
nil
,
fmt
.
Errorf
(
"failed to get overlay upperdir: %w"
,
err
)
}
mountInfo
,
err
:=
ReadMountInfoFromHostProcPath
(
pid
)
if
err
!=
nil
{
return
nil
,
fmt
.
Errorf
(
"failed to parse mountinfo: %w"
,
err
)
}
namespaces
,
err
:=
GetAllNamespaces
(
pid
)
if
err
!=
nil
{
return
nil
,
fmt
.
Errorf
(
"failed to get namespaces: %w"
,
err
)
}
return
&
ContainerInfoSnapshot
{
PID
:
pid
,
RootFS
:
rootFS
,
UpperDir
:
upperDir
,
OCISpec
:
ociSpec
,
MountInfo
:
mountInfo
,
Namespaces
:
namespaces
,
},
nil
}
// configure builds CRIU options and checkpoint manifest from runtime snapshot and spec.
func
(
c
*
Checkpointer
)
configure
(
state
*
ContainerInfoSnapshot
,
req
CheckpointRequest
,
spec
*
CheckpointSpec
,
checkpointDir
string
,
imageDirFD
int32
,
)
(
*
criurpc
.
CriuOpts
,
*
CheckpointManifest
,
error
)
{
criuOpts
,
err
:=
BuildCRIUDumpOptions
(
&
spec
.
CRIU
,
state
.
PID
,
imageDirFD
,
state
.
RootFS
,
state
.
MountInfo
,
state
.
OCISpec
,
state
.
Namespaces
,
)
if
err
!=
nil
{
return
nil
,
nil
,
err
}
// Write CRIU config file (for options unavailable via RPC)
configPath
:=
filepath
.
Join
(
checkpointDir
,
CheckpointCRIUConfFilename
)
if
err
:=
os
.
WriteFile
(
configPath
,
[]
byte
(
spec
.
CRIU
.
GenerateCRIUConfContent
()),
0644
);
err
!=
nil
{
return
nil
,
nil
,
fmt
.
Errorf
(
"failed to write CRIU config file: %w"
,
err
)
}
criuOpts
.
ConfigFile
=
proto
.
String
(
configPath
)
// Build and save the checkpoint manifest.
manifest
:=
NewCheckpointManifest
(
req
.
CheckpointID
,
NewCRIUDumpManifest
(
criuOpts
,
spec
.
CRIU
),
NewSourcePodManifest
(
req
,
state
.
PID
),
NewFilesystemManifest
(
spec
.
RootfsExclusions
,
state
.
UpperDir
,
state
.
OCISpec
),
NewNamespaceManifestEntries
(
state
.
Namespaces
),
)
if
err
:=
WriteCheckpointManifest
(
checkpointDir
,
manifest
);
err
!=
nil
{
return
nil
,
nil
,
fmt
.
Errorf
(
"failed to write checkpoint manifest: %w"
,
err
)
}
return
criuOpts
,
manifest
,
nil
}
// capture executes the CRIU dump and post-dump captures (/dev/shm, rootfs diff).
// Returns the CRIU dump duration for timing reporting.
func
(
c
*
Checkpointer
)
capture
(
criuOpts
*
criurpc
.
CriuOpts
,
data
*
CheckpointManifest
,
state
*
ContainerInfoSnapshot
,
checkpointDir
string
,
)
(
time
.
Duration
,
error
)
{
criuDumpDuration
,
err
:=
ExecuteCRIUDump
(
criuOpts
,
checkpointDir
,
c
.
log
)
if
err
!=
nil
{
return
0
,
err
}
// Capture /dev/shm contents (must happen after dump for final process state)
if
err
:=
CaptureDevShm
(
state
.
PID
,
checkpointDir
,
c
.
log
);
err
!=
nil
{
c
.
log
.
WithError
(
err
)
.
Warn
(
"Failed to capture /dev/shm contents"
)
}
// Capture rootfs diff and deleted files
CaptureRootfsState
(
state
.
UpperDir
,
checkpointDir
,
data
,
c
.
log
)
return
criuDumpDuration
,
nil
}
deploy/chrek/pkg/checkpoint/config.go
deleted
100644 → 0
View file @
c8423b57
// config.go defines the static checkpoint spec loaded from ConfigMap YAML.
package
checkpoint
import
"fmt"
// CheckpointSpec is the static checkpoint spec loaded from ConfigMap YAML.
// The yaml tags on the fields define the expected key names.
type CheckpointSpec struct {
	// BasePath is the base directory for checkpoint storage (PVC mount point).
	BasePath string `yaml:"basePath"`

	// CRIU options for dump operations
	CRIU CRIUSettings `yaml:"criu"`

	// RootfsExclusions defines paths to exclude from rootfs diff capture.
	// It is the only field checked by Validate.
	RootfsExclusions FilesystemConfig `yaml:"rootfsExclusions"`
}
// Validate checks that the CheckpointSpec has valid values.
func
(
c
*
CheckpointSpec
)
Validate
()
error
{
return
c
.
RootfsExclusions
.
Validate
()
}
// ConfigError describes a configuration validation failure: Field names the
// offending setting and Message explains what is wrong with it.
type ConfigError struct {
	Field   string
	Message string
}

// Error implements the error interface and renders the failure as
// "config error: <Field>: <Message>".
func (e *ConfigError) Error() string {
	field, message := e.Field, e.Message
	return fmt.Sprintf("config error: %s: %s", field, message)
}
deploy/chrek/pkg/checkpoint/constants.go
deleted
100644 → 0
View file @
c8423b57
// constants.go defines shared constants used across checkpoint and restore packages.
package
checkpoint
// Paths, Kubernetes label keys and artifact filenames shared by the
// checkpoint and restore code.
const (
	// HostProcPath is the mount point for the host's /proc in DaemonSet pods.
	HostProcPath = "/host/proc"

	// DevShmDirName is the directory name for captured /dev/shm contents.
	DevShmDirName = "dev-shm"

	// KubeLabelCheckpointSource is the pod label that triggers automatic checkpointing.
	// Set by the operator on checkpoint-eligible pods.
	KubeLabelCheckpointSource = "nvidia.com/checkpoint-source"

	// KubeLabelCheckpointHash is the pod label specifying the checkpoint identity hash.
	// Set by the operator on checkpoint-eligible pods.
	KubeLabelCheckpointHash = "nvidia.com/checkpoint-hash"

	// DumpLogFilename is the CRIU dump (checkpoint) log filename.
	DumpLogFilename = "dump.log"

	// CheckpointCRIUConfFilename is the CRIU config file written at checkpoint time.
	CheckpointCRIUConfFilename = "criu.conf"

	// CheckpointDoneFilename is the marker file written to the checkpoint directory
	// after all checkpoint artifacts are complete. Used to detect checkpoint readiness.
	// Also hard-coded in vLLM for early-exit when checkpoint already exists.
	CheckpointDoneFilename = "checkpoint.done"

	// CheckpointManifestFilename is the name of the manifest file in checkpoint directories.
	CheckpointManifestFilename = "manifest.yaml"

	// DescriptorsFilename is the name of the file descriptors file.
	DescriptorsFilename = "descriptors.yaml"

	// RootfsDiffFilename is the name of the rootfs diff tar in checkpoint directories.
	RootfsDiffFilename = "rootfs-diff.tar"

	// DeletedFilesFilename is the name of the deleted files JSON in checkpoint directories.
	DeletedFilesFilename = "deleted-files.json"
)
deploy/chrek/pkg/checkpoint/criu.go
deleted
100644 → 0
View file @
c8423b57
// criu provides CRIU-specific configuration and utilities for checkpoint operations.
package
checkpoint
import
(
"fmt"
"time"
criu
"github.com/checkpoint-restore/go-criu/v7"
criurpc
"github.com/checkpoint-restore/go-criu/v7/rpc"
specs
"github.com/opencontainers/runtime-spec/specs-go"
"github.com/sirupsen/logrus"
"google.golang.org/protobuf/proto"
)
// CRIUSettings holds CRIU-specific configuration options.
// Options are categorized by how they are passed to CRIU:
//   - RPC options: passed via the go-criu CriuOpts protobuf
//   - CRIU conf file options: written to criu.conf (NOT available via RPC)
//
// The yaml tags define the keys used when the settings are serialized.
type CRIUSettings struct {
	// === RPC Options (passed via go-criu CriuOpts) ===

	// GhostLimit is the maximum ghost file size in bytes.
	// Ghost files are deleted-but-open files that CRIU needs to checkpoint.
	// 512MB is recommended for GPU workloads with large memory allocations.
	// BuildCRIUDumpOptions only forwards this value when it is greater than 0.
	GhostLimit uint32 `yaml:"ghostLimit"`

	// Timeout is the CRIU operation timeout in seconds.
	// 6 hours (21600s) is recommended for large GPU model checkpoints.
	// BuildCRIUDumpOptions only forwards this value when it is greater than 0.
	Timeout uint32 `yaml:"timeout"`

	// LogLevel is the CRIU logging verbosity (0-4).
	LogLevel int32 `yaml:"logLevel"`

	// WorkDir is the CRIU work directory for temporary files.
	// NOTE(review): BuildCRIUDumpOptions in this package does not copy this
	// field into the dump options; confirm where it is consumed.
	WorkDir string `yaml:"workDir"`

	// AutoDedup enables auto-deduplication of memory pages.
	AutoDedup bool `yaml:"autoDedup"`

	// LazyPages enables lazy page migration (experimental).
	LazyPages bool `yaml:"lazyPages"`

	// LeaveRunning keeps the process running after checkpoint (dump only).
	LeaveRunning bool `yaml:"leaveRunning"`

	// ShellJob allows checkpointing session leaders (containers are often session leaders).
	ShellJob bool `yaml:"shellJob"`

	// TcpClose closes TCP connections instead of preserving them (pod IPs change on restore).
	TcpClose bool `yaml:"tcpClose"`

	// FileLocks allows checkpointing processes with file locks.
	FileLocks bool `yaml:"fileLocks"`

	// OrphanPtsMaster allows checkpointing containers with TTYs.
	OrphanPtsMaster bool `yaml:"orphanPtsMaster"`

	// ExtUnixSk allows external Unix sockets.
	ExtUnixSk bool `yaml:"extUnixSk"`

	// LinkRemap handles deleted-but-open files.
	LinkRemap bool `yaml:"linkRemap"`

	// ExtMasters allows external bind mount masters.
	ExtMasters bool `yaml:"extMasters"`

	// ManageCgroupsMode controls cgroup handling: "ignore" lets K8s manage cgroups.
	// BuildCRIUDumpOptions recognizes "soft", "full" and "strict"; every other
	// value (including the empty string) is treated as ignore.
	ManageCgroupsMode string `yaml:"manageCgroupsMode"`

	// === CRIU Conf File Options (NOT available via RPC - written to criu.conf) ===

	// LibDir is the path to CRIU plugin directory (e.g., /usr/local/lib/criu).
	// Required for CUDA checkpoint/restore.
	// Emitted as a "libdir <path>" line when non-empty.
	LibDir string `yaml:"libDir"`

	// AllowUprobes allows user-space probes (required for CUDA checkpoints).
	// Emitted as an "allow-uprobes" line when true.
	AllowUprobes bool `yaml:"allowUprobes"`

	// SkipInFlight skips in-flight TCP connections during checkpoint/restore.
	// Emitted as a "skip-in-flight" line when true.
	SkipInFlight bool `yaml:"skipInFlight"`
}
// GenerateCRIUConfContent generates the criu.conf file content for options
// that cannot be passed via RPC.
func
(
c
*
CRIUSettings
)
GenerateCRIUConfContent
()
string
{
var
content
string
if
c
.
LibDir
!=
""
{
content
+=
"libdir "
+
c
.
LibDir
+
"
\n
"
}
if
c
.
AllowUprobes
{
content
+=
"allow-uprobes
\n
"
}
if
c
.
SkipInFlight
{
content
+=
"skip-in-flight
\n
"
}
return
content
}
// ExternalMountManifestEntry is a serializable CRIU ext-mount entry in checkpoint manifests.
// NewCRIUDumpManifest fills it from the key/value pair of a CRIU ExtMountMap.
type ExternalMountManifestEntry struct {
	// Key is the ext-mount key; entries with an empty key are never recorded.
	Key string `yaml:"key"`
	// Val is the ext-mount value paired with Key.
	Val string `yaml:"val"`
}
// CRIUDumpManifest stores the resolved dump-time CRIU mount plan used for restore.
type CRIUDumpManifest struct {
	// CRIU is the settings value that was supplied when the manifest was built.
	CRIU CRIUSettings `yaml:"criu"`
	// ExtMnt lists the external mount key/value pairs taken from the dump options.
	// Omitted from the YAML when empty.
	ExtMnt []ExternalMountManifestEntry `yaml:"extMnt,omitempty"`
	// External is a copy of the dump options' External list.
	// Omitted from the YAML when empty.
	External []string `yaml:"external,omitempty"`
	// SkipMnt is a copy of the dump options' SkipMnt list.
	// Omitted from the YAML when empty.
	SkipMnt []string `yaml:"skipMnt,omitempty"`
}
// NewCRIUDumpManifest serializes resolved dump options for restore.
func
NewCRIUDumpManifest
(
criuOpts
*
criurpc
.
CriuOpts
,
settings
CRIUSettings
)
CRIUDumpManifest
{
manifest
:=
CRIUDumpManifest
{
CRIU
:
settings
}
if
criuOpts
==
nil
{
return
manifest
}
for
_
,
mount
:=
range
criuOpts
.
ExtMnt
{
if
mount
==
nil
||
mount
.
GetKey
()
==
""
{
continue
}
manifest
.
ExtMnt
=
append
(
manifest
.
ExtMnt
,
ExternalMountManifestEntry
{
Key
:
mount
.
GetKey
(),
Val
:
mount
.
GetVal
(),
})
}
manifest
.
External
=
append
([]
string
(
nil
),
criuOpts
.
External
...
)
manifest
.
SkipMnt
=
append
([]
string
(
nil
),
criuOpts
.
SkipMnt
...
)
return
manifest
}
// BuildCRIUDumpOptions creates CRIU options directly from spec settings and runtime state.
func
BuildCRIUDumpOptions
(
settings
*
CRIUSettings
,
pid
int
,
imageDirFD
int32
,
rootFS
string
,
mountInfo
[]
MountInfo
,
ociSpec
*
specs
.
Spec
,
namespaces
map
[
NamespaceType
]
*
NamespaceInfo
,
)
(
*
criurpc
.
CriuOpts
,
error
)
{
mountPolicy
:=
BuildMountPolicy
(
mountInfo
,
ociSpec
,
rootFS
)
extMnt
:=
buildExternalMountMaps
(
mountPolicy
.
Externalized
)
skipMnt
:=
mountPolicy
.
Skipped
external
:=
buildExternalNamespaces
(
namespaces
)
logrus
.
WithFields
(
logrus
.
Fields
{
"externalized_count"
:
len
(
mountPolicy
.
Externalized
),
"skipped_count"
:
len
(
mountPolicy
.
Skipped
),
})
.
Debug
(
"Resolved mount policy for CRIU dump"
)
criuOpts
:=
&
criurpc
.
CriuOpts
{
Pid
:
proto
.
Int32
(
int32
(
pid
)),
ImagesDirFd
:
proto
.
Int32
(
imageDirFD
),
Root
:
proto
.
String
(
rootFS
),
LogFile
:
proto
.
String
(
DumpLogFilename
),
}
criuOpts
.
ExtMnt
=
extMnt
criuOpts
.
External
=
external
criuOpts
.
SkipMnt
=
skipMnt
if
settings
==
nil
{
return
criuOpts
,
nil
}
// RPC options from spec.
criuOpts
.
LogLevel
=
proto
.
Int32
(
settings
.
LogLevel
)
criuOpts
.
LeaveRunning
=
proto
.
Bool
(
settings
.
LeaveRunning
)
criuOpts
.
ShellJob
=
proto
.
Bool
(
settings
.
ShellJob
)
criuOpts
.
TcpClose
=
proto
.
Bool
(
settings
.
TcpClose
)
criuOpts
.
FileLocks
=
proto
.
Bool
(
settings
.
FileLocks
)
criuOpts
.
OrphanPtsMaster
=
proto
.
Bool
(
settings
.
OrphanPtsMaster
)
criuOpts
.
ExtUnixSk
=
proto
.
Bool
(
settings
.
ExtUnixSk
)
criuOpts
.
LinkRemap
=
proto
.
Bool
(
settings
.
LinkRemap
)
criuOpts
.
ExtMasters
=
proto
.
Bool
(
settings
.
ExtMasters
)
criuOpts
.
AutoDedup
=
proto
.
Bool
(
settings
.
AutoDedup
)
criuOpts
.
LazyPages
=
proto
.
Bool
(
settings
.
LazyPages
)
// Cgroup management mode
criuOpts
.
ManageCgroups
=
proto
.
Bool
(
true
)
cgMode
:=
criurpc
.
CriuCgMode_IGNORE
switch
settings
.
ManageCgroupsMode
{
case
"soft"
:
cgMode
=
criurpc
.
CriuCgMode_SOFT
case
"full"
:
cgMode
=
criurpc
.
CriuCgMode_FULL
case
"strict"
:
cgMode
=
criurpc
.
CriuCgMode_STRICT
}
criuOpts
.
ManageCgroupsMode
=
&
cgMode
// Optional numeric options
if
settings
.
GhostLimit
>
0
{
criuOpts
.
GhostLimit
=
proto
.
Uint32
(
settings
.
GhostLimit
)
}
if
settings
.
Timeout
>
0
{
criuOpts
.
Timeout
=
proto
.
Uint32
(
settings
.
Timeout
)
}
return
criuOpts
,
nil
}
// buildExternalMountMaps serializes externalized mount paths into CRIU map entries.
func
buildExternalMountMaps
(
paths
[]
string
)
[]
*
criurpc
.
ExtMountMap
{
extMnt
:=
make
([]
*
criurpc
.
ExtMountMap
,
0
,
len
(
paths
))
existing
:=
make
(
map
[
string
]
struct
{},
len
(
paths
))
for
_
,
path
:=
range
paths
{
if
path
==
""
{
continue
}
if
_
,
ok
:=
existing
[
path
];
ok
{
continue
}
extMnt
=
append
(
extMnt
,
&
criurpc
.
ExtMountMap
{
Key
:
proto
.
String
(
path
),
Val
:
proto
.
String
(
path
),
})
existing
[
path
]
=
struct
{}{}
}
return
extMnt
}
// buildExternalNamespaces builds external namespace/mount references.
func
buildExternalNamespaces
(
namespaces
map
[
NamespaceType
]
*
NamespaceInfo
)
[]
string
{
external
:=
make
([]
string
,
0
,
1
)
// Mark network namespace as external for socket binding preservation
if
netNs
,
ok
:=
namespaces
[
NamespaceNet
];
ok
{
external
=
append
(
external
,
fmt
.
Sprintf
(
"%s[%d]:%s"
,
NamespaceNet
,
netNs
.
Inode
,
"extNetNs"
))
logrus
.
WithField
(
"inode"
,
netNs
.
Inode
)
.
Debug
(
"Marked network namespace as external"
)
}
return
external
}
// ExecuteCRIUDump runs the CRIU dump and logs timing plus dump-log location on failure.
func
ExecuteCRIUDump
(
criuOpts
*
criurpc
.
CriuOpts
,
checkpointDir
string
,
log
*
logrus
.
Entry
)
(
time
.
Duration
,
error
)
{
criuDumpStart
:=
time
.
Now
()
criuClient
:=
criu
.
MakeCriu
()
if
err
:=
criuClient
.
Dump
(
criuOpts
,
nil
);
err
!=
nil
{
dumpDuration
:=
time
.
Since
(
criuDumpStart
)
log
.
WithFields
(
logrus
.
Fields
{
"duration"
:
dumpDuration
,
"checkpoint_dir"
:
checkpointDir
,
"dump_log_path"
:
fmt
.
Sprintf
(
"%s/%s"
,
checkpointDir
,
DumpLogFilename
),
})
.
Error
(
"CRIU dump failed"
)
return
0
,
fmt
.
Errorf
(
"CRIU dump failed: %w"
,
err
)
}
criuDumpDuration
:=
time
.
Since
(
criuDumpStart
)
log
.
WithField
(
"duration"
,
criuDumpDuration
)
.
Info
(
"CRIU dump completed"
)
return
criuDumpDuration
,
nil
}
deploy/chrek/pkg/checkpoint/k8s.go
deleted
100644 → 0
View file @
c8423b57
// k8s contains containerd discovery and Kubernetes path classification helpers.
package
checkpoint
import
(
"context"
"fmt"
"github.com/containerd/containerd"
"github.com/containerd/containerd/namespaces"
specs
"github.com/opencontainers/runtime-spec/specs-go"
)
// Connection parameters used when talking to containerd.
const (
	// K8sNamespace is the containerd namespace used by Kubernetes.
	K8sNamespace = "k8s.io"

	// ContainerdSocket is the default containerd socket path.
	// NewDiscoveryClient always connects to this path.
	ContainerdSocket = "/run/containerd/containerd.sock"
)
// SourcePodManifest is the YAML-serializable record of where a checkpoint
// came from: the container, its process ID, and the node and pod it belonged to.
type SourcePodManifest struct {
	// ContainerID is copied from the checkpoint request's ContainerID.
	ContainerID string `yaml:"containerId"`
	// PID is the process ID supplied to NewSourcePodManifest.
	PID int `yaml:"pid"`
	// SourceNode is copied from the checkpoint request's NodeName.
	SourceNode string `yaml:"sourceNode"`
	// PodName is copied from the checkpoint request's PodName.
	PodName string `yaml:"podName"`
	// PodNamespace is copied from the checkpoint request's PodNamespace.
	PodNamespace string `yaml:"podNamespace"`
}
func
NewSourcePodManifest
(
params
CheckpointRequest
,
pid
int
)
SourcePodManifest
{
return
SourcePodManifest
{
ContainerID
:
params
.
ContainerID
,
PID
:
pid
,
SourceNode
:
params
.
NodeName
,
PodName
:
params
.
PodName
,
PodNamespace
:
params
.
PodNamespace
,
}
}
// DiscoveryClient wraps a containerd client and is used to look up a
// container's process ID and OCI spec.
type DiscoveryClient struct {
	// client is the underlying containerd connection; Close tolerates it being nil.
	client *containerd.Client
}
func
NewDiscoveryClient
()
(
*
DiscoveryClient
,
error
)
{
client
,
err
:=
containerd
.
New
(
ContainerdSocket
)
if
err
!=
nil
{
return
nil
,
fmt
.
Errorf
(
"failed to connect to containerd at %s: %w"
,
ContainerdSocket
,
err
)
}
return
&
DiscoveryClient
{
client
:
client
},
nil
}
func
(
c
*
DiscoveryClient
)
Close
()
error
{
if
c
.
client
!=
nil
{
return
c
.
client
.
Close
()
}
return
nil
}
func
(
c
*
DiscoveryClient
)
ResolveContainer
(
ctx
context
.
Context
,
containerID
string
)
(
int
,
*
specs
.
Spec
,
error
)
{
ctx
=
namespaces
.
WithNamespace
(
ctx
,
K8sNamespace
)
container
,
err
:=
c
.
client
.
LoadContainer
(
ctx
,
containerID
)
if
err
!=
nil
{
return
0
,
nil
,
fmt
.
Errorf
(
"failed to load container %s: %w"
,
containerID
,
err
)
}
task
,
err
:=
container
.
Task
(
ctx
,
nil
)
if
err
!=
nil
{
return
0
,
nil
,
fmt
.
Errorf
(
"failed to get task for container %s: %w"
,
containerID
,
err
)
}
pid
:=
task
.
Pid
()
spec
,
err
:=
container
.
Spec
(
ctx
)
if
err
!=
nil
{
return
0
,
nil
,
fmt
.
Errorf
(
"failed to get spec for container %s: %w"
,
containerID
,
err
)
}
return
int
(
pid
),
spec
,
nil
}
deploy/chrek/pkg/checkpoint/mounts.go
deleted
100644 → 0
View file @
c8423b57
// mounts parses runtime mount state from /proc.
package
checkpoint
import
(
"fmt"
"path"
"path/filepath"
"strings"
specs
"github.com/opencontainers/runtime-spec/specs-go"
"github.com/ai-dynamo/dynamo/deploy/chrek/pkg/common"
)
// MountInfo is one parsed mountinfo entry for a process.
// ReadMountInfoFromHostProcPath fills it field-for-field from the parser's output.
type MountInfo struct {
	// MountID is the parser's MountID value, kept as a string.
	MountID string
	// ParentID is the parser's ParentID value, kept as a string.
	ParentID string
	// MountPoint is the mount target path (the parser's Path field).
	MountPoint string
	// Root is the mount root; BuildMountPolicy treats any value other than "/" or "." as bind-like.
	Root string
	// FSType is the filesystem type, e.g. "tmpfs" or "overlay".
	FSType string
	// Source is the mount source as reported by the parser.
	Source string
	// Options holds the parser's Options value.
	Options string
	// SuperOptions holds the parser's SuperOpts value.
	SuperOptions string
}
// MountPolicy is the classified mount plan for CRIU dump options.
type MountPolicy struct {
	// Externalized lists mount points passed to CRIU as external mounts (extMnt).
	Externalized []string
	// Skipped lists mount points passed to CRIU as skipped mounts (skipMnt).
	Skipped []string
}
// BuildMountPolicy classifies mounts into CRIU extMnt and skipMnt lists.
//
// Rule order and precedence (top to bottom):
// 1. Skip non-OCI proc/sys submounts and non-OCI runtime /run submounts.
// These mounts are typically node/kernel/runtime specific and are the
// highest-risk source of cross-node restore failures, so skip wins.
// 2. Externalize mounts owned by runtime/OCI:
// - "/" (rootfs is recreated by runtime in OCI restore path)
// - OCI mount destinations
// - OCI masked/readonly paths
// 3. Externalize non-OCI bind-like mounts (mount root is not "/" or ".").
// This captures runtime-injected file mounts (for example driver files)
// so CRIU does not try to recreate them from checkpoint data.
// 4. Anything else is left unflagged and handled by CRIU default behavior.
//
// Precedence: skip > externalize. If a path is classified as skipped, it is
// removed from the externalized set.
func
BuildMountPolicy
(
mountInfo
[]
MountInfo
,
ociSpec
*
specs
.
Spec
,
rootFS
string
)
*
MountPolicy
{
ociManagedSet
:=
collectOCIManagedDestinations
(
ociSpec
,
rootFS
)
externalizedSet
:=
make
(
map
[
string
]
struct
{},
len
(
mountInfo
)
+
len
(
ociManagedSet
))
skippedSet
:=
make
(
map
[
string
]
struct
{},
len
(
mountInfo
))
for
_
,
mount
:=
range
mountInfo
{
mp
:=
normalizeMountPath
(
mount
.
MountPoint
)
if
mp
==
""
{
continue
}
source
:=
path
.
Clean
(
strings
.
TrimSpace
(
mount
.
Source
))
root
:=
path
.
Clean
(
strings
.
TrimSpace
(
mount
.
Root
))
isOCIManaged
:=
false
if
_
,
ok
:=
ociManagedSet
[
mp
];
ok
{
isOCIManaged
=
true
}
if
!
isOCIManaged
&&
strings
.
HasPrefix
(
mp
,
"/run/"
)
{
if
_
,
ok
:=
ociManagedSet
[
"/var"
+
mp
];
ok
{
isOCIManaged
=
true
}
}
if
!
isOCIManaged
&&
strings
.
HasPrefix
(
mp
,
"/var/run/"
)
{
if
_
,
ok
:=
ociManagedSet
[
strings
.
TrimPrefix
(
mp
,
"/var"
)];
ok
{
isOCIManaged
=
true
}
}
// Runtime-owned /run mounts are usually ephemeral tmpfs/overlay mounts
// or bind-like mounts sourced from host runtime directories.
// We skip these unless OCI explicitly manages that destination.
isRunRuntimeMount
:=
strings
.
HasPrefix
(
mp
,
"/run/"
)
&&
(
mount
.
FSType
==
"tmpfs"
||
mount
.
FSType
==
"overlay"
||
strings
.
HasPrefix
(
source
,
"/run/"
)
||
strings
.
HasPrefix
(
source
,
"/var/run/"
)
||
strings
.
HasPrefix
(
root
,
"/run/"
)
||
strings
.
HasPrefix
(
root
,
"/var/run/"
))
if
!
isOCIManaged
&&
(
strings
.
HasPrefix
(
mp
,
"/proc/"
)
||
strings
.
HasPrefix
(
mp
,
"/sys/"
)
||
isRunRuntimeMount
)
{
skippedSet
[
mp
]
=
struct
{}{}
delete
(
externalizedSet
,
mp
)
continue
}
if
mp
==
"/"
||
isOCIManaged
||
(
root
!=
"."
&&
root
!=
"/"
)
{
externalizedSet
[
mp
]
=
struct
{}{}
continue
}
}
// Ensure OCI-managed destinations are externalized, even when mountinfo does not
// include a direct entry (e.g., runtime-managed masked/readonly paths).
for
mp
:=
range
ociManagedSet
{
if
_
,
skipped
:=
skippedSet
[
mp
];
skipped
{
continue
}
externalizedSet
[
mp
]
=
struct
{}{}
}
externalized
:=
make
([]
string
,
0
,
len
(
externalizedSet
))
for
mp
:=
range
externalizedSet
{
externalized
=
append
(
externalized
,
mp
)
}
skipped
:=
make
([]
string
,
0
,
len
(
skippedSet
))
for
mp
:=
range
skippedSet
{
skipped
=
append
(
skipped
,
mp
)
}
return
&
MountPolicy
{
Externalized
:
externalized
,
Skipped
:
skipped
,
}
}
// collectOCIManagedDestinations returns the canonical set of OCI-owned mount
// targets. This includes regular OCI mounts plus Linux masked/readonly paths.
// Those masked/readonly paths may not appear as direct mountinfo entries, but
// still need to be treated as runtime-owned and externalized.
func
collectOCIManagedDestinations
(
ociSpec
*
specs
.
Spec
,
rootFS
string
)
map
[
string
]
struct
{}
{
set
:=
map
[
string
]
struct
{}{}
if
ociSpec
==
nil
{
return
set
}
paths
:=
make
([]
string
,
0
,
len
(
ociSpec
.
Mounts
))
for
_
,
mount
:=
range
ociSpec
.
Mounts
{
paths
=
append
(
paths
,
mount
.
Destination
)
}
if
ociSpec
.
Linux
!=
nil
{
paths
=
append
(
paths
,
ociSpec
.
Linux
.
MaskedPaths
...
)
paths
=
append
(
paths
,
ociSpec
.
Linux
.
ReadonlyPaths
...
)
}
for
_
,
raw
:=
range
paths
{
if
p
:=
normalizeOCIDestinationPath
(
raw
,
rootFS
);
p
!=
""
{
set
[
p
]
=
struct
{}{}
}
}
return
set
}
// normalizeMountPath performs purely lexical normalization: surrounding
// whitespace is trimmed, the path is cleaned, and a relative result is
// anchored at "/". Blank input yields "". No filesystem access happens
// because mountinfo paths are already kernel truth for the container
// namespace.
func normalizeMountPath(raw string) string {
	trimmed := strings.TrimSpace(raw)
	if trimmed == "" {
		return ""
	}

	cleaned := path.Clean(trimmed)
	if strings.HasPrefix(cleaned, "/") {
		// Already absolute and clean; path.Clean is idempotent.
		return cleaned
	}
	return path.Clean("/" + cleaned)
}
// normalizeOCIDestinationPath canonicalizes OCI destinations against container
// rootfs symlinks (for example /var/run -> /run) with lexical fallback.
func
normalizeOCIDestinationPath
(
raw
,
rootFS
string
)
string
{
p
:=
normalizeMountPath
(
raw
)
if
p
==
""
||
rootFS
==
""
{
return
p
}
hostPath
:=
filepath
.
Join
(
rootFS
,
strings
.
TrimPrefix
(
p
,
"/"
))
resolved
,
err
:=
filepath
.
EvalSymlinks
(
hostPath
)
if
err
!=
nil
{
return
p
}
rel
,
err
:=
filepath
.
Rel
(
rootFS
,
resolved
)
if
err
!=
nil
{
return
p
}
rel
=
filepath
.
ToSlash
(
rel
)
if
rel
==
"."
{
return
"/"
}
if
strings
.
HasPrefix
(
rel
,
"../"
)
||
rel
==
".."
{
return
p
}
return
normalizeMountPath
(
"/"
+
rel
)
}
func
ReadMountInfoFromHostProcPath
(
pid
int
)
([]
MountInfo
,
error
)
{
mountinfoPath
:=
fmt
.
Sprintf
(
"%s/%d/mountinfo"
,
HostProcPath
,
pid
)
parsedMounts
,
err
:=
common
.
ParseMountInfoFile
(
mountinfoPath
)
if
err
!=
nil
{
return
nil
,
fmt
.
Errorf
(
"failed to parse mountinfo at %s: %w"
,
mountinfoPath
,
err
)
}
mounts
:=
make
([]
MountInfo
,
0
,
len
(
parsedMounts
))
for
_
,
parsed
:=
range
parsedMounts
{
mounts
=
append
(
mounts
,
MountInfo
{
MountID
:
parsed
.
MountID
,
ParentID
:
parsed
.
ParentID
,
MountPoint
:
parsed
.
Path
,
Root
:
parsed
.
Root
,
FSType
:
parsed
.
FSType
,
Source
:
parsed
.
Source
,
Options
:
parsed
.
Options
,
SuperOptions
:
parsed
.
SuperOpts
,
})
}
return
mounts
,
nil
}
deploy/chrek/pkg/checkpoint/namespaces.go
deleted
100644 → 0
View file @
c8423b57
// namespaces provides Linux namespace introspection for CRIU checkpoint.
package
checkpoint
import
(
"fmt"
"golang.org/x/sys/unix"
)
// NamespaceManifestEntry stores namespace information saved in checkpoint manifests.
type NamespaceManifestEntry struct {
	// Type is the namespace kind: net, pid, mnt, etc.
	Type string `yaml:"type"`
	// Inode is the namespace inode number.
	Inode uint64 `yaml:"inode"`
	// IsExternal reports whether the namespace is external (shared).
	IsExternal bool `yaml:"isExternal"`
}
// NamespaceType represents a Linux namespace type. Its string value is used
// directly as the entry name under <proc>/<pid>/ns/ when namespaces are
// inspected.
type NamespaceType string

// Namespace kinds enumerated by GetAllNamespaces.
const (
	NamespaceNet    NamespaceType = "net"
	NamespacePID    NamespaceType = "pid"
	NamespaceMnt    NamespaceType = "mnt"
	NamespaceUTS    NamespaceType = "uts"
	NamespaceIPC    NamespaceType = "ipc"
	NamespaceUser   NamespaceType = "user"
	NamespaceCgroup NamespaceType = "cgroup"
)
// NamespaceInfo holds namespace identification information.
type NamespaceInfo struct {
	// Type is the namespace kind this entry describes.
	Type NamespaceType
	// Inode is the inode number of the namespace file.
	Inode uint64
	// IsExternal reports whether the namespace is external (shared with the
	// pause container). GetNamespaceInfo sets it when the inode differs from
	// the one PID 1 has under HostProcPath.
	IsExternal bool
}
// NewNamespaceManifestEntries constructs namespace manifest entries from introspected namespaces.
func
NewNamespaceManifestEntries
(
namespaces
map
[
NamespaceType
]
*
NamespaceInfo
)
[]
NamespaceManifestEntry
{
if
len
(
namespaces
)
==
0
{
return
nil
}
result
:=
make
([]
NamespaceManifestEntry
,
0
,
len
(
namespaces
))
for
nsType
,
nsInfo
:=
range
namespaces
{
result
=
append
(
result
,
NamespaceManifestEntry
{
Type
:
string
(
nsType
),
Inode
:
nsInfo
.
Inode
,
IsExternal
:
nsInfo
.
IsExternal
,
})
}
return
result
}
// GetNamespaceInode returns the inode number for a namespace
func
GetNamespaceInode
(
pid
int
,
nsType
NamespaceType
)
(
uint64
,
error
)
{
nsPath
:=
fmt
.
Sprintf
(
"%s/%d/ns/%s"
,
HostProcPath
,
pid
,
nsType
)
var
stat
unix
.
Stat_t
if
err
:=
unix
.
Stat
(
nsPath
,
&
stat
);
err
!=
nil
{
return
0
,
fmt
.
Errorf
(
"failed to stat namespace %s: %w"
,
nsPath
,
err
)
}
return
stat
.
Ino
,
nil
}
// GetNamespaceInfo returns detailed namespace information
func
GetNamespaceInfo
(
pid
int
,
nsType
NamespaceType
)
(
*
NamespaceInfo
,
error
)
{
nsPath
:=
fmt
.
Sprintf
(
"%s/%d/ns/%s"
,
HostProcPath
,
pid
,
nsType
)
// Get inode
var
stat
unix
.
Stat_t
if
err
:=
unix
.
Stat
(
nsPath
,
&
stat
);
err
!=
nil
{
return
nil
,
fmt
.
Errorf
(
"failed to stat namespace %s: %w"
,
nsPath
,
err
)
}
// Check if this is different from init's namespace (PID 1)
initNsPath
:=
fmt
.
Sprintf
(
"%s/1/ns/%s"
,
HostProcPath
,
nsType
)
var
initStat
unix
.
Stat_t
isExternal
:=
false
if
err
:=
unix
.
Stat
(
initNsPath
,
&
initStat
);
err
==
nil
{
// If the inode is different from init's, it's an external namespace
isExternal
=
stat
.
Ino
!=
initStat
.
Ino
}
return
&
NamespaceInfo
{
Type
:
nsType
,
Inode
:
stat
.
Ino
,
IsExternal
:
isExternal
,
},
nil
}
// GetAllNamespaces returns information about all namespaces for a process
func
GetAllNamespaces
(
pid
int
)
(
map
[
NamespaceType
]
*
NamespaceInfo
,
error
)
{
nsTypes
:=
[]
NamespaceType
{
NamespaceNet
,
NamespacePID
,
NamespaceMnt
,
NamespaceUTS
,
NamespaceIPC
,
NamespaceUser
,
NamespaceCgroup
,
}
namespaces
:=
make
(
map
[
NamespaceType
]
*
NamespaceInfo
)
for
_
,
nsType
:=
range
nsTypes
{
if
info
,
err
:=
GetNamespaceInfo
(
pid
,
nsType
);
err
==
nil
{
namespaces
[
nsType
]
=
info
}
}
return
namespaces
,
nil
}
deploy/chrek/pkg/checkpoint/shm.go
deleted
100644 → 0
View file @
c8423b57
// Package checkpoint provides CRIU checkpoint (dump) operations.
package
checkpoint
import
(
"fmt"
"io"
"os"
"path/filepath"
"github.com/sirupsen/logrus"
)
// CaptureDevShm captures files from /dev/shm to the checkpoint directory.
// This is needed because /dev/shm is a tmpfs mount that is not part of the
// container's overlay filesystem, so rootfs diff doesn't capture it.
//
// Semaphores (sem.* files) are included so that sem_unlink() calls succeed
// after restore. The semaphore kernel state won't be perfectly restored,
// but the files will exist for cleanup operations.
//
// The files are saved to <checkpointDir>/dev-shm/ and can be restored
// using RestoreDevShm before CRIU restore.
func
CaptureDevShm
(
pid
int
,
checkpointDir
string
,
log
*
logrus
.
Entry
)
error
{
// Access container's /dev/shm via /proc/<pid>/root
shmPath
:=
filepath
.
Join
(
HostProcPath
,
fmt
.
Sprintf
(
"%d/root/dev/shm"
,
pid
))
entries
,
err
:=
os
.
ReadDir
(
shmPath
)
if
err
!=
nil
{
if
os
.
IsNotExist
(
err
)
{
log
.
Debug
(
"Container /dev/shm does not exist, skipping capture"
)
return
nil
}
return
fmt
.
Errorf
(
"failed to read container /dev/shm: %w"
,
err
)
}
// Filter out directories
var
filesToCapture
[]
os
.
DirEntry
for
_
,
entry
:=
range
entries
{
// Skip directories (unlikely in /dev/shm but be safe)
if
entry
.
IsDir
()
{
log
.
WithField
(
"dir"
,
entry
.
Name
())
.
Debug
(
"Skipping directory in /dev/shm"
)
continue
}
filesToCapture
=
append
(
filesToCapture
,
entry
)
}
if
len
(
filesToCapture
)
==
0
{
log
.
Debug
(
"No files to capture from /dev/shm"
)
return
nil
}
// Create destination directory
destDir
:=
filepath
.
Join
(
checkpointDir
,
DevShmDirName
)
if
err
:=
os
.
MkdirAll
(
destDir
,
0755
);
err
!=
nil
{
return
fmt
.
Errorf
(
"failed to create dev-shm directory: %w"
,
err
)
}
var
captured
[]
string
var
totalSize
int64
for
_
,
entry
:=
range
filesToCapture
{
name
:=
entry
.
Name
()
srcPath
:=
filepath
.
Join
(
shmPath
,
name
)
destPath
:=
filepath
.
Join
(
destDir
,
name
)
info
,
err
:=
entry
.
Info
()
if
err
!=
nil
{
log
.
WithError
(
err
)
.
WithField
(
"file"
,
name
)
.
Warn
(
"Failed to get file info, skipping"
)
continue
}
size
:=
info
.
Size
()
// Copy the file
if
err
:=
copyFile
(
srcPath
,
destPath
,
info
.
Mode
());
err
!=
nil
{
log
.
WithError
(
err
)
.
WithField
(
"file"
,
name
)
.
Warn
(
"Failed to copy file, skipping"
)
continue
}
captured
=
append
(
captured
,
name
)
totalSize
+=
size
log
.
WithFields
(
logrus
.
Fields
{
"file"
:
name
,
"size"
:
size
,
})
.
Debug
(
"Captured /dev/shm file"
)
}
if
len
(
captured
)
>
0
{
log
.
WithFields
(
logrus
.
Fields
{
"count"
:
len
(
captured
),
"total_size"
:
totalSize
,
"files"
:
captured
,
})
.
Info
(
"Captured /dev/shm files"
)
}
return
nil
}
// copyFile copies a file from src to dest with the given permissions.
func
copyFile
(
src
,
dest
string
,
mode
os
.
FileMode
)
error
{
srcFile
,
err
:=
os
.
Open
(
src
)
if
err
!=
nil
{
return
fmt
.
Errorf
(
"failed to open source: %w"
,
err
)
}
defer
srcFile
.
Close
()
destFile
,
err
:=
os
.
OpenFile
(
dest
,
os
.
O_CREATE
|
os
.
O_WRONLY
|
os
.
O_TRUNC
,
mode
)
if
err
!=
nil
{
return
fmt
.
Errorf
(
"failed to create destination: %w"
,
err
)
}
defer
destFile
.
Close
()
if
_
,
err
:=
io
.
Copy
(
destFile
,
srcFile
);
err
!=
nil
{
return
fmt
.
Errorf
(
"failed to copy contents: %w"
,
err
)
}
// Sync to ensure durability for checkpoint data
if
err
:=
destFile
.
Sync
();
err
!=
nil
{
return
fmt
.
Errorf
(
"failed to sync destination: %w"
,
err
)
}
return
nil
}
Prev
1
2
3
4
5
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment