# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Chrek - Checkpoint/Restore Infrastructure
#
# This chart deploys the checkpoint storage (PVC) and the CRIU agent (DaemonSet)
# into a single namespace. Install it in each namespace where you want to enable
# checkpoint/restore functionality for DynamoGraphDeployments.

# Storage configuration for checkpoints.
# `type` selects the backend; only the matching sub-section (pvc, s3 or oci)
# applies to that backend.
storage:
  # Storage type: pvc (default), s3, or oci
  type: pvc

  # PVC configuration (when type=pvc)
  # Note: PVC storage requires rbac.namespaceRestricted=true, because PVCs are
  # namespace-scoped.
  pvc:
    # Create a new PVC (set to false if using existing PVC)
    create: true
    # PVC name - must match operator configuration
    name: chrek-pvc
    # Requested PVC size (Kubernetes quantity)
    size: 100Gi
    # Storage class (leave empty for the default storage class)
    storageClass: ""
    # Access mode - ReadWriteMany required for multi-pod access
    accessMode: ReadWriteMany
    # Base path for checkpoints (mounted in pods)
    basePath: /checkpoints

  # S3 configuration (when type=s3)
  s3:
    # S3 URI (e.g., s3://my-bucket/checkpoints)
    uri: ""
    # No credential fields here: credentials are expected via IRSA
    # (see serviceAccount.annotations) or mounted secrets

  # OCI configuration (when type=oci)
  oci:
    # OCI URI (e.g., oci://registry.io/repo/checkpoints)
    uri: ""

# DaemonSet configuration for chrek (checkpoint/restore) agent
daemonset:
  # Container image
  image:
    repository: nvcr.io/nvidia/ai-dynamo/chrek-agent
    # Keep the tag quoted so it is always a string: an unquoted tag such as
    # 1.0 or 2 would be parsed as a number and rendered as a different tag.
    tag: "1.0.0"
    # Always: the image is re-resolved against the registry on every container start
    pullPolicy: Always

  # Image pull secrets (names of existing Secrets used to pull the image)
  imagePullSecrets:
    - name: ngc-secret

  # Resource limits and requests for the agent container
  resources:
    limits:
      cpu: 4
      memory: 4Gi
    requests:
      cpu: 2
      memory: 1Gi

  # Node selector - target GPU nodes (nodes labelled nvidia.com/gpu.present=true)
  nodeSelector:
    nvidia.com/gpu.present: "true"

  # Tolerations for GPU nodes.
  # `operator: Exists` matches the taint key regardless of its value.
  tolerations:
    - key: nvidia.com/gpu
      operator: Exists
      effect: NoSchedule
    - key: dedicated
      operator: Exists
      effect: NoSchedule

  # Extra pod labels (empty by default)
  podLabels: {}

  # Extra pod annotations (empty by default)
  podAnnotations: {}

  # Affinity rules (empty by default)
  affinity: {}

# Seccomp profile configuration
seccomp:
  # Deploy the seccomp profile that blocks io_uring (required for CRIU).
  # NOTE(review): when set to false the profile presumably has to be provided
  # by other means - confirm against the chart templates.
  deploy: true

# Service account configuration
serviceAccount:
  # Create the service account as part of this chart
  create: true
  # Service account name (a name is generated if left empty)
  name: ""
  # Annotations added to the service account (e.g., for IRSA)
  annotations: {}

# RBAC configuration
rbac:
  # Create RBAC resources
  create: true

  # Namespace-scoped RBAC (recommended, required for PVC storage)
  # - true (default): creates Role/RoleBinding; the agent watches pods in the
  #   chart's namespace only
  # - false: creates ClusterRole/ClusterRoleBinding; the agent watches all pods
  #   on its assigned nodes
  # Note: PVC storage (storage.type=pvc) requires namespace-scoped mode (true),
  # as PVCs are namespace-scoped
  namespaceRestricted: true

# Static configuration (loaded from ConfigMap)
# Dynamic values (NODE_NAME, RESTRICTED_NAMESPACE, etc.) come from environment variables
# and are therefore not set here.
config:
  # Handling of the container's overlay filesystem
  overlay:
    # Virtual FS dirs are COW artifacts in the overlay upperdir.
    # NOTE(review): presumably listed here so they are excluded from the
    # checkpoint, like the entries below - confirm against the agent code.
    systemDirs:
      - /proc
      - /sys
      - /dev
    # Cache directories to exclude (reduces checkpoint size)
    cacheDirs:
      - /.cache/huggingface
    # Additional exclusion patterns.
    # Python bytecode is already loaded in memory at restore time and
    # regenerated automatically on cold start.
    additionalExclusions:
      - "*/__pycache__"
      - "*.pyc"

  # Restore-side settings
  restore:
    # Path to the nsrestore binary in the placeholder image
    nsRestorePath: /usr/local/bin/nsrestore
    # Maximum seconds to wait for a restored pod to become Ready (0 = no timeout)
    restoreReadyTimeoutSeconds: 0

  # CRIU settings
  criu:
    # Path to the criu binary
    binaryPath: /usr/local/sbin/criu
    # Ghost file size limit in bytes (536870912 = 512 MiB). Deleted-but-open
    # files smaller than this are saved inline in the checkpoint image as
    # ghost files.
    ghostLimit: 536870912
    # CRIU logging verbosity (0-4); the default of 4 is the most verbose level
    logLevel: 4
    # CRIU work directory for temporary files
    workDir: /var/criu-work

    # K8s-specific options (recommended defaults for containers)
    leaveRunning: true      # Keep process running after checkpoint
    shellJob: true          # Containers are often session leaders
    tcpClose: true          # Pod IPs change on restore/migration
    fileLocks: true         # Applications use file locks
    orphanPtsMaster: true   # Containers with TTYs
    extUnixSk: true         # External Unix sockets
    linkRemap: true         # Required for deleted-but-open files (e.g. POSIX semaphores in /dev/shm)
    extMasters: true        # External bind mount masters
    manageCgroupsMode: soft # CRIU cgroup management mode (ignore/soft/full/strict)

    # Restore-specific options (only apply during CRIU restore, not dump)
    rstSibling: true         # Restore as sibling process (required for go-criu swrk mode)
    mntnsCompatMode: false   # Mount namespace compatibility mode
    evasiveDevices: true     # Use any device path if original is inaccessible
    forceIrmap: true         # Force resolving inotify/fsnotify watch names

    # Advanced options (both disabled by default)
    autoDedup: false        # Auto-deduplication of memory pages
    lazyPages: false        # Lazy page migration (experimental)

    # Config file options (NOT available via RPC - written to criu.conf)
    libDir: ""                     # Keep empty: external CUDA checkpoint/restore only (no CRIU CUDA plugin)
    allowUprobes: true             # Leave enabled for kernel/userspace probe compatibility
    skipInFlight: true             # Skip in-flight TCP connections