"vscode:/vscode.git/clone" did not exist on "94d82a485bcd3b7aa7251ac36f834e5316ec5f2c"
Unverified Commit c0f34b15 authored by Thomas Montfort's avatar Thomas Montfort Committed by GitHub
Browse files

feat(operator): operator config versioning and injected as configmap (#6464)

parent 59c5f6f1
......@@ -44,6 +44,7 @@ import (
"sigs.k8s.io/controller-runtime/pkg/predicate"
sigsyaml "sigs.k8s.io/yaml"
configv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/config/v1alpha1"
nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1"
"github.com/ai-dynamo/dynamo/deploy/operator/internal/consts"
commonController "github.com/ai-dynamo/dynamo/deploy/operator/internal/controller_common"
......@@ -298,8 +299,9 @@ echo "Saved profiling output to ConfigMap {{.ConfigMapName}}"
// DynamoGraphDeploymentRequestReconciler reconciles a DynamoGraphDeploymentRequest object
type DynamoGraphDeploymentRequestReconciler struct {
client.Client
Recorder record.EventRecorder
Config commonController.Config
Recorder record.EventRecorder
Config *configv1alpha1.OperatorConfiguration
RuntimeConfig *commonController.RuntimeConfig
// RBACMgr handles RBAC setup for profiling jobs
RBACManager RBACManager
......@@ -1015,7 +1017,7 @@ func (r *DynamoGraphDeploymentRequestReconciler) validateGPUHardwareInfo(ctx con
logger.Info("GPU discovery not available", "reason", err.Error())
isNamespaceScoped := r.Config.RestrictedNamespace != ""
isNamespaceScoped := r.Config.Namespace.Restricted != ""
if isNamespaceScoped {
tmpl := template.Must(template.New("nsGPUErr").Parse(
`GPU hardware info required but cannot be auto-discovered.` +
......@@ -1073,7 +1075,7 @@ func (r *DynamoGraphDeploymentRequestReconciler) createProfilingJob(ctx context.
}
// Ensure profiling job RBAC exists (only for cluster-wide installation)
if r.Config.RestrictedNamespace == "" {
if r.Config.Namespace.Restricted == "" {
if err := r.RBACManager.EnsureServiceAccountWithRBAC(
ctx,
dgdr.Namespace,
......@@ -1791,7 +1793,9 @@ func (r *DynamoGraphDeploymentRequestReconciler) SetupWithManager(mgr ctrl.Manag
UpdateFunc: func(ue event.UpdateEvent) bool { return true },
GenericFunc: func(ge event.GenericEvent) bool { return true },
}),
). // Watch DGDs created by this controller (via label)
WithEventFilter(commonController.EphemeralDeploymentEventFilter(r.Config)). // set the event filter to ignore resources handled by other controllers in namespace-restricted mode
).
// Watch DGDs created by this controller (via label)
// Set the event filter to ignore resources handled by other controllers in namespace-restricted mode
WithEventFilter(commonController.EphemeralDeploymentEventFilter(r.Config, r.RuntimeConfig)).
Complete(observability.NewObservedReconciler(r, consts.ResourceTypeDynamoGraphDeploymentRequest))
}
......@@ -22,6 +22,7 @@ import (
"encoding/json"
"time"
configv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/config/v1alpha1"
nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1"
commonController "github.com/ai-dynamo/dynamo/deploy/operator/internal/controller_common"
. "github.com/onsi/ginkgo/v2"
......@@ -86,13 +87,16 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() {
reconciler = &DynamoGraphDeploymentRequestReconciler{
Client: k8sClient,
Recorder: recorder,
Config: commonController.Config{
RestrictedNamespace: "",
RBAC: commonController.RBACConfig{
Config: &configv1alpha1.OperatorConfiguration{
Namespace: configv1alpha1.NamespaceConfiguration{
Restricted: "",
},
RBAC: configv1alpha1.RBACConfiguration{
DGDRProfilingClusterRoleName: "test-cluster-role",
},
},
RBACManager: &MockRBACManager{},
RuntimeConfig: &commonController.RuntimeConfig{},
RBACManager: &MockRBACManager{},
}
})
......@@ -956,10 +960,13 @@ var _ = Describe("DGDR Profiler Arguments", func() {
reconciler = &DynamoGraphDeploymentRequestReconciler{
Client: k8sClient,
Recorder: record.NewFakeRecorder(100),
Config: commonController.Config{
RestrictedNamespace: "",
Config: &configv1alpha1.OperatorConfiguration{
Namespace: configv1alpha1.NamespaceConfiguration{
Restricted: "",
},
},
RBACManager: &MockRBACManager{},
RuntimeConfig: &commonController.RuntimeConfig{},
RBACManager: &MockRBACManager{},
}
})
......@@ -1199,10 +1206,13 @@ var _ = Describe("DGDR Error Handling", func() {
reconciler = &DynamoGraphDeploymentRequestReconciler{
Client: k8sClient,
Recorder: recorder,
Config: commonController.Config{
RestrictedNamespace: "",
Config: &configv1alpha1.OperatorConfiguration{
Namespace: configv1alpha1.NamespaceConfiguration{
Restricted: "",
},
},
RBACManager: &MockRBACManager{},
RuntimeConfig: &commonController.RuntimeConfig{},
RBACManager: &MockRBACManager{},
}
})
......
......@@ -36,6 +36,7 @@ import (
"sigs.k8s.io/controller-runtime/pkg/predicate"
"sigs.k8s.io/controller-runtime/pkg/reconcile"
configv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/config/v1alpha1"
nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1"
"github.com/ai-dynamo/dynamo/deploy/operator/internal/consts"
commonController "github.com/ai-dynamo/dynamo/deploy/operator/internal/controller_common"
......@@ -45,9 +46,10 @@ import (
// DynamoGraphDeploymentScalingAdapterReconciler reconciles a DynamoGraphDeploymentScalingAdapter object
type DynamoGraphDeploymentScalingAdapterReconciler struct {
client.Client
Scheme *runtime.Scheme
Recorder record.EventRecorder
Config commonController.Config
Scheme *runtime.Scheme
Recorder record.EventRecorder
Config *configv1alpha1.OperatorConfiguration
RuntimeConfig *commonController.RuntimeConfig
}
// +kubebuilder:rbac:groups=nvidia.com,resources=dynamographdeploymentscalingadapters,verbs=get;list;watch;create;update;patch;delete
......@@ -177,7 +179,7 @@ func (r *DynamoGraphDeploymentScalingAdapterReconciler) SetupWithManager(mgr ctr
GenericFunc: func(ge event.GenericEvent) bool { return false },
}),
).
WithEventFilter(commonController.EphemeralDeploymentEventFilter(r.Config)).
WithEventFilter(commonController.EphemeralDeploymentEventFilter(r.Config, r.RuntimeConfig)).
Complete(observability.NewObservedReconciler(r, consts.ResourceTypeDynamoGraphDeploymentScalingAdapter))
}
......
......@@ -20,8 +20,8 @@ package controller_common
import (
"context"
"strings"
"time"
configv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/config/v1alpha1"
commonconsts "github.com/ai-dynamo/dynamo/deploy/operator/internal/consts"
"k8s.io/apimachinery/pkg/api/meta"
"k8s.io/client-go/discovery"
......@@ -36,153 +36,22 @@ type ExcludedNamespacesInterface interface {
Contains(namespace string) bool
}
// GroveConfig holds the Grove-related operator settings.
type GroveConfig struct {
// Enabled is automatically determined by checking if Grove CRDs are installed in the cluster
Enabled bool
// TerminationDelay configures the termination delay for Grove PodCliqueSets.
// GenerateGrovePodCliqueSet only copies it onto the generated template when it is greater than zero.
TerminationDelay time.Duration
}
// LWSConfig holds the LWS-related operator settings.
type LWSConfig struct {
// Enabled is automatically determined by checking if LWS CRDs are installed in the cluster
Enabled bool
}
// KaiSchedulerConfig holds the Kai-scheduler-related operator settings.
type KaiSchedulerConfig struct {
// Enabled is automatically determined by checking if Kai-scheduler CRDs are installed in the cluster.
// GenerateGrovePodCliqueSet validates the kai-scheduler queue only when both this flag and
// GroveConfig.Enabled are true.
Enabled bool
}
// MpiRunConfig holds the settings used for MPI Run.
type MpiRunConfig struct {
// SecretName is the name of the secret containing the SSH key for MPI Run.
// BackendFactory hands it to the TRT-LLM backend as MpiRunSecretName.
SecretName string
}
// Config is the controller configuration held by the reconcilers and passed to the
// pod-spec generation helpers.
type Config struct {
// Enable resources filtering, only the resources belonging to the given namespace will be handled.
// An empty value means the operator runs cluster-wide.
RestrictedNamespace string
// Grove holds the Grove settings (availability and termination delay).
Grove GroveConfig
// LWS holds the LWS settings (availability).
LWS LWSConfig
// KaiScheduler holds the Kai-scheduler settings (availability).
KaiScheduler KaiSchedulerConfig
// EtcdAddress, when non-empty, is injected into containers as the ETCD_ENDPOINTS env var.
EtcdAddress string
// NatsAddress, when non-empty, is injected into containers as the NATS_SERVER env var.
NatsAddress string
// IngressConfig holds the defaults used by GenerateDefaultIngressSpec.
IngressConfig IngressConfig
// ModelExpressURL is the URL of the Model Express server to inject into all pods
// (exposed as the MODEL_EXPRESS_URL env var when non-empty).
ModelExpressURL string
// PrometheusEndpoint is the URL of the Prometheus endpoint to use for metrics
// (exposed as the PROMETHEUS_ENDPOINT env var when non-empty).
PrometheusEndpoint string
// MpiRun holds the MPI Run settings (SSH key secret name).
MpiRun MpiRunConfig
// RBAC configuration for cross-namespace resource management
RBAC RBACConfig
// ExcludedNamespaces is a thread-safe set of namespaces to exclude (cluster-wide mode only)
ExcludedNamespaces ExcludedNamespacesInterface
// DiscoveryBackend is the discovery backend to use. Default is "kubernetes" for Kubernetes API service discovery. Set to "etcd" to use ETCD for discovery.
// GetDiscoveryBackend lets the commonconsts.KubeAnnotationDynamoDiscoveryBackend annotation override this value.
DiscoveryBackend string
// GPUDiscoveryEnabled indicates whether Helm provisioned node read access for the namespace-scoped operator.
// Only relevant for namespace-scoped operators (RestrictedNamespace != "").
GPUDiscoveryEnabled bool
// Checkpoint configuration for checkpoint/restore functionality
Checkpoint CheckpointConfig
}
// RBACConfig holds configuration for RBAC management.
// Every role name below is documented as applying to cluster-wide mode only.
type RBACConfig struct {
// PlannerClusterRoleName is the name of the ClusterRole for planner (cluster-wide mode only)
PlannerClusterRoleName string
// DGDRProfilingClusterRoleName is the name of the ClusterRole for DGDR profiling jobs (cluster-wide mode only)
DGDRProfilingClusterRoleName string
// EPPClusterRoleName is the name of the ClusterRole for EPP (cluster-wide mode only)
EPPClusterRoleName string
}
// CheckpointConfig holds configuration for checkpoint/restore functionality.
type CheckpointConfig struct {
// Enabled indicates if checkpoint functionality is enabled.
// GenerateBasePodSpec passes this configuration to checkpoint.InjectCheckpointIntoPodSpec
// only when Enabled is true; otherwise it passes a nil configuration.
Enabled bool
// Storage holds storage backend configuration
Storage CheckpointStorageConfig
// ReadyForCheckpointFilePath is the file path used to signal model readiness for checkpoint jobs
ReadyForCheckpointFilePath string
}
// Checkpoint storage type constants.
// These are the values documented for CheckpointStorageConfig.Type.
const (
// CheckpointStorageTypePVC selects the PVC storage backend.
CheckpointStorageTypePVC = "pvc"
// CheckpointStorageTypeS3 selects the S3 storage backend.
CheckpointStorageTypeS3 = "s3"
// CheckpointStorageTypeOCI selects the OCI registry storage backend.
CheckpointStorageTypeOCI = "oci"
)
// CheckpointStorageConfig holds storage backend configuration for checkpoints.
// Type selects the backend; each backend has its own sub-configuration below.
type CheckpointStorageConfig struct {
// Type is the storage backend type: pvc, s3, or oci
// (see CheckpointStorageTypePVC, CheckpointStorageTypeS3 and CheckpointStorageTypeOCI).
Type string
// PVC configuration (used when Type=pvc)
PVC CheckpointPVCConfig
// S3 configuration (used when Type=s3)
S3 CheckpointS3Config
// OCI configuration (used when Type=oci)
OCI CheckpointOCIConfig
}
// CheckpointPVCConfig holds PVC storage configuration
// (used when CheckpointStorageConfig.Type is "pvc").
type CheckpointPVCConfig struct {
// PVCName is the name of the PVC
PVCName string
// BasePath is the base directory within the PVC
BasePath string
}
// CheckpointS3Config holds S3 storage configuration
// (used when CheckpointStorageConfig.Type is "s3").
type CheckpointS3Config struct {
// URI is the S3 URI (s3://[endpoint/]bucket/prefix)
URI string
// CredentialsSecretRef is the name of the credentials secret
// (should contain AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, and optionally AWS_REGION)
CredentialsSecretRef string
}
// CheckpointOCIConfig holds OCI registry storage configuration
// (used when CheckpointStorageConfig.Type is "oci").
type CheckpointOCIConfig struct {
// URI is the OCI URI (oci://registry/repository)
URI string
// CredentialsSecretRef is the name of the docker config secret
CredentialsSecretRef string
}
// IngressConfig holds the ingress defaults consumed by GenerateDefaultIngressSpec.
type IngressConfig struct {
	// VirtualServiceGateway is the gateway to use for VirtualServices; empty means none.
	VirtualServiceGateway string
	// IngressControllerClassName is the ingress controller class; empty means none.
	IngressControllerClassName string
	// IngressControllerTLSSecret is the TLS secret name; empty means no TLS section is generated.
	IngressControllerTLSSecret string
	// IngressHostSuffix is the host suffix; empty means no suffix is set.
	IngressHostSuffix string
}

// UseVirtualService reports whether a VirtualService gateway has been
// configured, i.e. whether VirtualServiceGateway is a non-empty string.
func (ic *IngressConfig) UseVirtualService() bool {
	return len(ic.VirtualServiceGateway) > 0
}
// DetectGroveAvailability reports whether Grove is available. It delegates to
// detectAPIGroupAvailability, asking whether the "grove.io" API group is registered.
func DetectGroveAvailability(ctx context.Context, mgr ctrl.Manager) bool {
	const groveAPIGroup = "grove.io"
	return detectAPIGroupAvailability(ctx, mgr, groveAPIGroup)
}
// DetectLWSAvailability reports whether LWS is available. It delegates to
// detectAPIGroupAvailability, asking whether the "leaderworkerset.x-k8s.io"
// API group is registered.
func DetectLWSAvailability(ctx context.Context, mgr ctrl.Manager) bool {
	const lwsAPIGroup = "leaderworkerset.x-k8s.io"
	return detectAPIGroupAvailability(ctx, mgr, lwsAPIGroup)
}
// DetectVolcanoAvailability reports whether Volcano is available. It delegates to
// detectAPIGroupAvailability, asking whether the "scheduling.volcano.sh"
// API group is registered.
func DetectVolcanoAvailability(ctx context.Context, mgr ctrl.Manager) bool {
	const volcanoAPIGroup = "scheduling.volcano.sh"
	return detectAPIGroupAvailability(ctx, mgr, volcanoAPIGroup)
}
// DetectKaiSchedulerAvailability reports whether Kai-scheduler is available. It
// delegates to detectAPIGroupAvailability, asking whether the "scheduling.run.ai"
// API group is registered.
func DetectKaiSchedulerAvailability(ctx context.Context, mgr ctrl.Manager) bool {
	const kaiSchedulerAPIGroup = "scheduling.run.ai"
	return detectAPIGroupAvailability(ctx, mgr, kaiSchedulerAPIGroup)
}
......@@ -226,20 +95,23 @@ func detectAPIGroupAvailability(ctx context.Context, mgr ctrl.Manager, groupName
return false
}
// IsK8sDiscoveryEnabled reports whether the discovery backend resolved by
// GetDiscoveryBackend for the given annotations is "kubernetes".
// For DGD, pass in the meta annotations; for DCD, pass in the spec annotations.
func (c Config) IsK8sDiscoveryEnabled(annotations map[string]string) bool {
	backend := c.GetDiscoveryBackend(annotations)
	return backend == "kubernetes"
}
func (c Config) GetDiscoveryBackend(annotations map[string]string) string {
// GetDiscoveryBackend returns the discovery backend for the given annotations,
// falling back to the configured default.
// For DGD, pass in the meta annotations; for DCD, pass in the spec annotations.
func GetDiscoveryBackend(discoveryBackend configv1alpha1.DiscoveryBackend, annotations map[string]string) configv1alpha1.DiscoveryBackend {
if dgdDiscoveryBackend, exists := annotations[commonconsts.KubeAnnotationDynamoDiscoveryBackend]; exists {
return dgdDiscoveryBackend
return configv1alpha1.DiscoveryBackend(dgdDiscoveryBackend)
}
return c.DiscoveryBackend
return discoveryBackend
}
// IsK8sDiscoveryEnabled reports whether the backend resolved by GetDiscoveryBackend
// from the configured default and the given annotations is
// configv1alpha1.DiscoveryBackendKubernetes.
func IsK8sDiscoveryEnabled(discoveryBackend configv1alpha1.DiscoveryBackend, annotations map[string]string) bool {
	effectiveBackend := GetDiscoveryBackend(discoveryBackend, annotations)
	return effectiveBackend == configv1alpha1.DiscoveryBackendKubernetes
}
func EphemeralDeploymentEventFilter(config Config) predicate.Predicate {
// EphemeralDeploymentEventFilter returns a predicate that filters events based on namespace configuration.
func EphemeralDeploymentEventFilter(config *configv1alpha1.OperatorConfiguration, runtimeConfig *RuntimeConfig) predicate.Predicate {
return predicate.NewPredicateFuncs(func(o client.Object) bool {
l := log.FromContext(context.Background())
objMeta, err := meta.Accessor(o)
......@@ -247,13 +119,13 @@ func EphemeralDeploymentEventFilter(config Config) predicate.Predicate {
l.Error(err, "Error extracting object metadata")
return false
}
if config.RestrictedNamespace != "" {
if config.Namespace.Restricted != "" {
// in case of a restricted namespace, we only want to process the events that are in the restricted namespace
return objMeta.GetNamespace() == config.RestrictedNamespace
return objMeta.GetNamespace() == config.Namespace.Restricted
}
// Cluster-wide mode: check if namespace is excluded
if config.ExcludedNamespaces != nil && config.ExcludedNamespaces.Contains(objMeta.GetNamespace()) {
if runtimeConfig.ExcludedNamespaces != nil && runtimeConfig.ExcludedNamespaces.Contains(objMeta.GetNamespace()) {
l.V(1).Info("Skipping resource - namespace is excluded",
"namespace", objMeta.GetNamespace(),
"resource", objMeta.GetName(),
......
/*
* SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package controller_common
// RuntimeConfig holds runtime state that is resolved after startup (e.g., auto-detection results).
// This is separate from the static OperatorConfiguration loaded from config files.
// EphemeralDeploymentEventFilter dereferences the *RuntimeConfig it is given without a nil
// check in cluster-wide mode, so callers should pass a non-nil (possibly zero-value) pointer.
type RuntimeConfig struct {
// GroveEnabled is the resolved Grove availability (config override merged with auto-detection)
GroveEnabled bool
// LWSEnabled is the resolved LWS availability (config override merged with auto-detection)
LWSEnabled bool
// KaiSchedulerEnabled is the resolved Kai-scheduler availability (config override merged with auto-detection).
// GenerateGrovePodCliqueSet validates the kai-scheduler queue only when both GroveEnabled and
// KaiSchedulerEnabled are true.
KaiSchedulerEnabled bool
// ExcludedNamespaces for cluster-wide mode namespace filtering.
// EphemeralDeploymentEventFilter skips the exclusion check when this is nil.
ExcludedNamespaces ExcludedNamespacesInterface
}
......@@ -6,6 +6,7 @@
package dynamo
import (
configv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/config/v1alpha1"
"github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1"
commonconsts "github.com/ai-dynamo/dynamo/deploy/operator/internal/consts"
corev1 "k8s.io/api/core/v1"
......@@ -48,7 +49,7 @@ type ComponentContext struct {
ComponentType string
ParentGraphDeploymentName string
ParentGraphDeploymentNamespace string
DiscoveryBackend string
DiscoveryBackend configv1alpha1.DiscoveryBackend
EPPConfig *v1alpha1.EPPConfig
WorkerHashSuffix string
}
......
......@@ -33,6 +33,7 @@ import (
"k8s.io/apimachinery/pkg/util/intstr"
"k8s.io/utils/ptr"
configv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/config/v1alpha1"
"github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1"
"github.com/ai-dynamo/dynamo/deploy/operator/internal/checkpoint"
commonconsts "github.com/ai-dynamo/dynamo/deploy/operator/internal/consts"
......@@ -785,22 +786,22 @@ func GenerateComponentVirtualService(ctx context.Context, componentName, compone
return vs
}
func GenerateDefaultIngressSpec(dynamoDeployment *v1alpha1.DynamoGraphDeployment, ingressConfig controller_common.IngressConfig) v1alpha1.IngressSpec {
func GenerateDefaultIngressSpec(dynamoDeployment *v1alpha1.DynamoGraphDeployment, ingressConfig configv1alpha1.IngressConfiguration) v1alpha1.IngressSpec {
res := v1alpha1.IngressSpec{
Enabled: ingressConfig.VirtualServiceGateway != "" || ingressConfig.IngressControllerClassName != "",
Enabled: ingressConfig.VirtualServiceGateway != "" || ingressConfig.ControllerClassName != "",
Host: dynamoDeployment.Name,
UseVirtualService: ingressConfig.VirtualServiceGateway != "",
}
if ingressConfig.IngressControllerClassName != "" {
res.IngressControllerClassName = &ingressConfig.IngressControllerClassName
if ingressConfig.ControllerClassName != "" {
res.IngressControllerClassName = &ingressConfig.ControllerClassName
}
if ingressConfig.IngressControllerTLSSecret != "" {
if ingressConfig.ControllerTLSSecretName != "" {
res.TLS = &v1alpha1.IngressTLSSpec{
SecretName: ingressConfig.IngressControllerTLSSecret,
SecretName: ingressConfig.ControllerTLSSecretName,
}
}
if ingressConfig.IngressHostSuffix != "" {
res.HostSuffix = &ingressConfig.IngressHostSuffix
if ingressConfig.HostSuffix != "" {
res.HostSuffix = &ingressConfig.HostSuffix
}
if ingressConfig.VirtualServiceGateway != "" {
res.VirtualServiceGateway = &ingressConfig.VirtualServiceGateway
......@@ -889,7 +890,7 @@ type MultinodeDeployer interface {
}
// BackendFactory creates backend instances based on the framework type
func BackendFactory(backendFramework BackendFramework, controllerConfig controller_common.Config) Backend {
func BackendFactory(backendFramework BackendFramework, operatorConfig *configv1alpha1.OperatorConfiguration) Backend {
switch backendFramework {
case BackendFrameworkSGLang:
return &SGLangBackend{}
......@@ -897,7 +898,7 @@ func BackendFactory(backendFramework BackendFramework, controllerConfig controll
return &VLLMBackend{}
case BackendFrameworkTRTLLM:
return &TRTLLMBackend{
MpiRunSecretName: controllerConfig.MpiRun.SecretName,
MpiRunSecretName: operatorConfig.MPI.SSHSecretName,
}
case BackendFrameworkNoop:
return &NoopBackend{}
......@@ -925,32 +926,32 @@ func IsWorkerComponent(componentType string) bool {
}
// addStandardEnvVars adds the standard environment variables that are common to both Grove and Controller
func addStandardEnvVars(container *corev1.Container, controllerConfig controller_common.Config) {
func addStandardEnvVars(container *corev1.Container, operatorConfig *configv1alpha1.OperatorConfiguration) {
standardEnvVars := []corev1.EnvVar{}
if controllerConfig.NatsAddress != "" {
if operatorConfig.Infrastructure.NATSAddress != "" {
standardEnvVars = append(standardEnvVars, corev1.EnvVar{
Name: "NATS_SERVER",
Value: controllerConfig.NatsAddress,
Value: operatorConfig.Infrastructure.NATSAddress,
})
}
if controllerConfig.EtcdAddress != "" {
if operatorConfig.Infrastructure.ETCDAddress != "" {
standardEnvVars = append(standardEnvVars, corev1.EnvVar{
Name: "ETCD_ENDPOINTS",
Value: controllerConfig.EtcdAddress,
Value: operatorConfig.Infrastructure.ETCDAddress,
})
}
if controllerConfig.ModelExpressURL != "" {
if operatorConfig.Infrastructure.ModelExpressURL != "" {
standardEnvVars = append(standardEnvVars, corev1.EnvVar{
Name: "MODEL_EXPRESS_URL",
Value: controllerConfig.ModelExpressURL,
Value: operatorConfig.Infrastructure.ModelExpressURL,
})
}
if controllerConfig.PrometheusEndpoint != "" {
if operatorConfig.Infrastructure.PrometheusEndpoint != "" {
standardEnvVars = append(standardEnvVars, corev1.EnvVar{
Name: "PROMETHEUS_ENDPOINT",
Value: controllerConfig.PrometheusEndpoint,
Value: operatorConfig.Infrastructure.PrometheusEndpoint,
})
}
// merge the env vars to allow users to override the standard env vars
......@@ -989,13 +990,13 @@ func GenerateBasePodSpec(
namespace string,
role Role,
numberOfNodes int32,
controllerConfig controller_common.Config,
operatorConfig *configv1alpha1.OperatorConfiguration,
multinodeDeploymentType commonconsts.MultinodeDeploymentType,
serviceName string,
checkpointInfo *checkpoint.CheckpointInfo, // Optional checkpoint info (resolved by ResolveCheckpointForService)
) (*corev1.PodSpec, error) {
// Start with base container generated per component type
componentContext := generateComponentContext(component, parentGraphDeploymentName, namespace, numberOfNodes, controllerConfig.GetDiscoveryBackend(component.Annotations))
componentContext := generateComponentContext(component, parentGraphDeploymentName, namespace, numberOfNodes, controller_common.GetDiscoveryBackend(operatorConfig.Discovery.Backend, component.Annotations))
componentDefaults := ComponentDefaultsFactory(component.ComponentType)
container, err := componentDefaults.GetBaseContainer(componentContext)
if err != nil {
......@@ -1078,7 +1079,7 @@ func GenerateBasePodSpec(
})
}
addStandardEnvVars(&container, controllerConfig)
addStandardEnvVars(&container, operatorConfig)
volumes := make([]corev1.Volume, 0, len(component.VolumeMounts)+1) // +1 for shared memory volume
......@@ -1124,7 +1125,7 @@ func GenerateBasePodSpec(
if multinodeDeployer == nil {
return nil, fmt.Errorf("unsupported multinode deployment type: %s", multinodeDeploymentType)
}
backend := BackendFactory(backendFramework, controllerConfig)
backend := BackendFactory(backendFramework, operatorConfig)
if backend == nil {
return nil, fmt.Errorf("unsupported backend framework: %s", backendFramework)
}
......@@ -1156,7 +1157,7 @@ func GenerateBasePodSpec(
applyDefaultSecurityContext(&podSpec)
}
if controllerConfig.IsK8sDiscoveryEnabled(component.Annotations) {
if controller_common.IsK8sDiscoveryEnabled(operatorConfig.Discovery.Backend, component.Annotations) {
if podSpec.ServiceAccountName == "" {
podSpec.ServiceAccountName = discovery.GetK8sDiscoveryServiceAccountName(parentGraphDeploymentName)
}
......@@ -1176,9 +1177,9 @@ func GenerateBasePodSpec(
// - Storage configuration (volumes, mounts)
// CheckpointInfo should have been resolved by ResolveCheckpointForService before calling this function
// Checkpoint config comes from the operator's controller config (Helm values)
var checkpointConfig *controller_common.CheckpointConfig
if controllerConfig.Checkpoint.Enabled {
checkpointConfig = &controllerConfig.Checkpoint
var checkpointConfig *configv1alpha1.CheckpointConfiguration
if operatorConfig.Checkpoint.Enabled {
checkpointConfig = &operatorConfig.Checkpoint
}
if err := checkpoint.InjectCheckpointIntoPodSpec(&podSpec, checkpointInfo, checkpointConfig); err != nil {
return nil, fmt.Errorf("failed to inject checkpoint config: %w", err)
......@@ -1198,7 +1199,7 @@ func setMetricsLabels(labels map[string]string, dynamoGraphDeployment *v1alpha1.
labels[commonconsts.KubeLabelMetricsEnabled] = commonconsts.KubeLabelValueTrue
}
func generateComponentContext(component *v1alpha1.DynamoComponentDeploymentSharedSpec, parentGraphDeploymentName string, namespace string, numberOfNodes int32, discoveryBackend string) ComponentContext {
func generateComponentContext(component *v1alpha1.DynamoComponentDeploymentSharedSpec, parentGraphDeploymentName string, namespace string, numberOfNodes int32, discoveryBackend configv1alpha1.DiscoveryBackend) ComponentContext {
dynamoNamespace := v1alpha1.ComputeDynamoNamespace(component.GlobalDynamoNamespace, namespace, parentGraphDeploymentName)
var workerHashSuffix string
......@@ -1227,7 +1228,7 @@ func GeneratePodSpecForComponent(
dynamoDeployment *v1alpha1.DynamoGraphDeployment,
role Role,
numberOfNodes int32,
controllerConfig controller_common.Config,
operatorConfig *configv1alpha1.OperatorConfiguration,
multinodeDeploymentType commonconsts.MultinodeDeploymentType,
serviceName string,
checkpointInfo *checkpoint.CheckpointInfo, // Optional checkpoint info
......@@ -1235,7 +1236,7 @@ func GeneratePodSpecForComponent(
if len(dynamoDeployment.Spec.Envs) > 0 {
component.Envs = MergeEnvs(dynamoDeployment.Spec.Envs, component.Envs)
}
podSpec, err := GenerateBasePodSpec(component, backendFramework, secretsRetriever, dynamoDeployment.Name, dynamoDeployment.Namespace, role, numberOfNodes, controllerConfig, multinodeDeploymentType, serviceName, checkpointInfo)
podSpec, err := GenerateBasePodSpec(component, backendFramework, secretsRetriever, dynamoDeployment.Name, dynamoDeployment.Namespace, role, numberOfNodes, operatorConfig, multinodeDeploymentType, serviceName, checkpointInfo)
if err != nil {
return nil, err
}
......@@ -1246,7 +1247,8 @@ func GeneratePodSpecForComponent(
func GenerateGrovePodCliqueSet(
ctx context.Context,
dynamoDeployment *v1alpha1.DynamoGraphDeployment,
controllerConfig controller_common.Config,
operatorConfig *configv1alpha1.OperatorConfiguration,
runtimeConfig *controller_common.RuntimeConfig,
secretsRetriever SecretsRetriever,
restartState *RestartState,
existingRestartAnnotations map[string]string,
......@@ -1260,13 +1262,13 @@ func GenerateGrovePodCliqueSet(
PublishNotReadyAddresses: true,
}
gangSet.Spec.Template.StartupType = ptr.To(grovev1alpha1.CliqueStartupTypeAnyOrder)
if controllerConfig.Grove.TerminationDelay > 0 {
gangSet.Spec.Template.TerminationDelay = &metav1.Duration{Duration: controllerConfig.Grove.TerminationDelay}
if operatorConfig.Orchestrators.Grove.TerminationDelay.Duration > 0 {
gangSet.Spec.Template.TerminationDelay = &operatorConfig.Orchestrators.Grove.TerminationDelay
}
// Validate kai-scheduler queue once if kai-scheduler is enabled
var validatedQueueName string
if controllerConfig.Grove.Enabled && controllerConfig.KaiScheduler.Enabled {
if runtimeConfig.GroveEnabled && runtimeConfig.KaiSchedulerEnabled {
var err error
validatedQueueName, err = DetermineKaiSchedulerQueue(ctx, dynamoDeployment.Annotations)
if err != nil {
......@@ -1274,7 +1276,7 @@ func GenerateGrovePodCliqueSet(
}
}
discoveryBackend := controllerConfig.GetDiscoveryBackend(dynamoDeployment.Annotations)
discoveryBackend := controller_common.GetDiscoveryBackend(operatorConfig.Discovery.Backend, dynamoDeployment.Annotations)
var scalingGroups []grovev1alpha1.PodCliqueScalingGroupConfig
for serviceName, component := range dynamoDeployment.Spec.Services {
......@@ -1290,7 +1292,7 @@ func GenerateGrovePodCliqueSet(
if component.Annotations == nil {
component.Annotations = make(map[string]string)
}
component.Annotations[commonconsts.KubeAnnotationDynamoDiscoveryBackend] = discoveryBackend
component.Annotations[commonconsts.KubeAnnotationDynamoDiscoveryBackend] = string(discoveryBackend)
}
// Propagate operator origin version for version-gated behavior in backends
......@@ -1320,7 +1322,7 @@ func GenerateGrovePodCliqueSet(
dynamoDeployment,
r.Role,
numberOfNodes,
controllerConfig,
operatorConfig,
commonconsts.MultinodeDeploymentTypeGrove,
serviceName,
checkpointInfo,
......@@ -1367,7 +1369,7 @@ func GenerateGrovePodCliqueSet(
clique.Annotations = annotations
// Inject kai-scheduler settings if enabled
injectKaiSchedulerIfEnabled(clique, controllerConfig, validatedQueueName)
injectKaiSchedulerIfEnabled(clique, runtimeConfig, validatedQueueName)
gangSet.Spec.Template.Cliques = append(gangSet.Spec.Template.Cliques, clique)
cliqueNames = append(cliqueNames, strings.ToLower(r.Name))
......@@ -1604,7 +1606,7 @@ func GetBackendFrameworkFromDynamoComponent(dynComponent *v1alpha1.DynamoCompone
func GenerateBasePodSpecForController(
dynComponent *v1alpha1.DynamoComponentDeployment,
secretsRetriever SecretsRetriever,
controllerConfig controller_common.Config,
operatorConfig *configv1alpha1.OperatorConfiguration,
role Role,
multinodeDeploymentType commonconsts.MultinodeDeploymentType,
checkpointInfo *checkpoint.CheckpointInfo, // Optional checkpoint info (resolved by caller)
......@@ -1631,7 +1633,7 @@ func GenerateBasePodSpecForController(
dynComponent.Namespace,
role,
numberOfNodes,
controllerConfig,
operatorConfig,
multinodeDeploymentType,
serviceName,
checkpointInfo,
......
......@@ -26,6 +26,7 @@ import (
"testing"
"time"
configv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/config/v1alpha1"
"github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1"
commonconsts "github.com/ai-dynamo/dynamo/deploy/operator/internal/consts"
"github.com/ai-dynamo/dynamo/deploy/operator/internal/controller_common"
......@@ -790,7 +791,7 @@ func TestGenerateComponentContext(t *testing.T) {
parentGraphDeploymentName string
namespace string
numberOfNodes int32
discoveryBackend string
discoveryBackend configv1alpha1.DiscoveryBackend
expectedDynamoNamespace string
expectedComponentType string
expectedParentDGDName string
......@@ -806,7 +807,7 @@ func TestGenerateComponentContext(t *testing.T) {
parentGraphDeploymentName: "my-deployment",
namespace: "my-namespace",
numberOfNodes: 1,
discoveryBackend: "kubernetes",
discoveryBackend: configv1alpha1.DiscoveryBackendKubernetes,
expectedDynamoNamespace: "my-namespace-my-deployment",
expectedComponentType: commonconsts.ComponentTypePlanner,
expectedParentDGDName: "my-deployment",
......@@ -822,7 +823,7 @@ func TestGenerateComponentContext(t *testing.T) {
parentGraphDeploymentName: "vllm-disagg",
namespace: "djangoz",
numberOfNodes: 1,
discoveryBackend: "kubernetes",
discoveryBackend: configv1alpha1.DiscoveryBackendKubernetes,
expectedDynamoNamespace: "djangoz-vllm-disagg",
expectedComponentType: commonconsts.ComponentTypeFrontend,
expectedParentDGDName: "vllm-disagg",
......@@ -839,7 +840,7 @@ func TestGenerateComponentContext(t *testing.T) {
parentGraphDeploymentName: "shared-frontend",
namespace: "production",
numberOfNodes: 2,
discoveryBackend: "etcd",
discoveryBackend: configv1alpha1.DiscoveryBackendEtcd,
expectedDynamoNamespace: commonconsts.GlobalDynamoNamespace,
expectedComponentType: commonconsts.ComponentTypeWorker,
expectedParentDGDName: "shared-frontend",
......@@ -854,7 +855,7 @@ func TestGenerateComponentContext(t *testing.T) {
parentGraphDeploymentName: "test-dgd",
namespace: "default",
numberOfNodes: 1,
discoveryBackend: "kubernetes",
discoveryBackend: configv1alpha1.DiscoveryBackendKubernetes,
expectedDynamoNamespace: "default-test-dgd",
expectedComponentType: commonconsts.ComponentTypePlanner,
expectedParentDGDName: "test-dgd",
......@@ -868,7 +869,7 @@ func TestGenerateComponentContext(t *testing.T) {
parentGraphDeploymentName: "llama-70b-prod",
namespace: "ml-inference",
numberOfNodes: 4,
discoveryBackend: "nats",
discoveryBackend: configv1alpha1.DiscoveryBackendEtcd,
expectedDynamoNamespace: "ml-inference-llama-70b-prod",
expectedComponentType: commonconsts.ComponentTypeFrontend,
expectedParentDGDName: "llama-70b-prod",
......@@ -1220,7 +1221,7 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
type args struct {
ctx context.Context
dynamoDeployment *v1alpha1.DynamoGraphDeployment
controllerConfig controller_common.Config
controllerConfig *configv1alpha1.OperatorConfiguration
}
tests := []struct {
name string
......@@ -1232,14 +1233,18 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
name: "test_generate_grove_pod_clique_set_single_node",
args: args{
ctx: context.Background(),
controllerConfig: controller_common.Config{
EtcdAddress: "etcd-address",
NatsAddress: "nats-address",
ModelExpressURL: "model-express-url",
Grove: controller_common.GroveConfig{
TerminationDelay: 15 * time.Minute,
controllerConfig: &configv1alpha1.OperatorConfiguration{
Infrastructure: configv1alpha1.InfrastructureConfiguration{
ETCDAddress: "etcd-address",
NATSAddress: "nats-address",
ModelExpressURL: "model-express-url",
PrometheusEndpoint: "http://localhost:9090",
},
Orchestrators: configv1alpha1.OrchestratorConfiguration{
Grove: configv1alpha1.GroveConfiguration{
TerminationDelay: metav1.Duration{Duration: 15 * time.Minute},
},
},
PrometheusEndpoint: "http://localhost:9090",
},
dynamoDeployment: &v1alpha1.DynamoGraphDeployment{
ObjectMeta: metav1.ObjectMeta{
......@@ -1790,11 +1795,15 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
name: "test_generate_grove_pod_gang_set_multinode sglang",
args: args{
ctx: context.Background(),
controllerConfig: controller_common.Config{
EtcdAddress: "etcd-address",
NatsAddress: "nats-address",
Grove: controller_common.GroveConfig{
TerminationDelay: 15 * time.Minute,
controllerConfig: &configv1alpha1.OperatorConfiguration{
Infrastructure: configv1alpha1.InfrastructureConfiguration{
ETCDAddress: "etcd-address",
NATSAddress: "nats-address",
},
Orchestrators: configv1alpha1.OrchestratorConfiguration{
Grove: configv1alpha1.GroveConfiguration{
TerminationDelay: metav1.Duration{Duration: 15 * time.Minute},
},
},
},
dynamoDeployment: &v1alpha1.DynamoGraphDeployment{
......@@ -2755,11 +2764,15 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
name: "test_generate_grove_pod_gang_set_multinode vllm",
args: args{
ctx: context.Background(),
controllerConfig: controller_common.Config{
EtcdAddress: "etcd-address",
NatsAddress: "nats-address",
Grove: controller_common.GroveConfig{
TerminationDelay: 15 * time.Minute,
controllerConfig: &configv1alpha1.OperatorConfiguration{
Infrastructure: configv1alpha1.InfrastructureConfiguration{
ETCDAddress: "etcd-address",
NATSAddress: "nats-address",
},
Orchestrators: configv1alpha1.OrchestratorConfiguration{
Grove: configv1alpha1.GroveConfiguration{
TerminationDelay: metav1.Duration{Duration: 15 * time.Minute},
},
},
},
dynamoDeployment: &v1alpha1.DynamoGraphDeployment{
......@@ -3736,7 +3749,7 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
got, err := GenerateGrovePodCliqueSet(tt.args.ctx, tt.args.dynamoDeployment, tt.args.controllerConfig, nil, nil, nil, nil)
got, err := GenerateGrovePodCliqueSet(tt.args.ctx, tt.args.dynamoDeployment, tt.args.controllerConfig, &controller_common.RuntimeConfig{}, nil, nil, nil, nil)
if (err != nil) != tt.wantErr {
t.Errorf("GenerateGrovePodCliqueSet() error = %v, wantErr %v", err, tt.wantErr)
return
......@@ -3797,7 +3810,7 @@ func Test_GeneratePodCliqueSetGlobalDynamoNamespace(t *testing.T) {
},
}
got, err := GenerateGrovePodCliqueSet(context.Background(), dynamoDeployment, controller_common.Config{}, nil, nil, nil, nil)
got, err := GenerateGrovePodCliqueSet(context.Background(), dynamoDeployment, &configv1alpha1.OperatorConfiguration{}, &controller_common.RuntimeConfig{}, nil, nil, nil, nil)
if !assert.NoError(t, err) {
return
}
......@@ -3868,7 +3881,7 @@ func TestGeneratePodSpecForComponent_SGLang(t *testing.T) {
Namespace: "default",
},
}
controllerConfig := controller_common.Config{}
controllerConfig := &configv1alpha1.OperatorConfiguration{}
tests := []struct {
name string
......@@ -4015,7 +4028,7 @@ func TestGeneratePodSpecForComponent_VLLM(t *testing.T) {
Namespace: "default",
},
}
controllerConfig := controller_common.Config{}
controllerConfig := &configv1alpha1.OperatorConfiguration{}
tests := []struct {
name string
......@@ -4168,7 +4181,7 @@ func TestGeneratePodSpecForComponent_UnsupportedBackend(t *testing.T) {
Namespace: "default",
},
}
controllerConfig := controller_common.Config{}
controllerConfig := &configv1alpha1.OperatorConfiguration{}
component := &v1alpha1.DynamoComponentDeploymentSharedSpec{
ComponentType: commonconsts.ComponentTypeWorker,
......@@ -4878,12 +4891,14 @@ func TestGenerateGrovePodCliqueSet_StartsAfterDependencies(t *testing.T) {
},
}
controllerConfig := controller_common.Config{
EtcdAddress: "etcd-av1alpha1",
NatsAddress: "nats-address",
controllerConfig := &configv1alpha1.OperatorConfiguration{
Infrastructure: configv1alpha1.InfrastructureConfiguration{
ETCDAddress: "etcd-av1alpha1",
NATSAddress: "nats-address",
},
}
got, err := GenerateGrovePodCliqueSet(context.Background(), dynamoDeployment, controllerConfig, secretsRetriever, nil, nil, nil)
got, err := GenerateGrovePodCliqueSet(context.Background(), dynamoDeployment, controllerConfig, &controller_common.RuntimeConfig{}, secretsRetriever, nil, nil, nil)
if err != nil {
t.Errorf("GenerateGrovePodCliqueSet() error = %v", err)
return
......@@ -4936,7 +4951,7 @@ func TestGenerateGrovePodCliqueSet_StartsAfterDependencies(t *testing.T) {
func TestGenerateBasePodSpec_Frontend(t *testing.T) {
secretsRetriever := &mockSecretsRetriever{}
controllerConfig := controller_common.Config{}
controllerConfig := &configv1alpha1.OperatorConfiguration{}
dynamoDeployment := &v1alpha1.DynamoGraphDeployment{
ObjectMeta: metav1.ObjectMeta{
Name: "test-deployment",
......@@ -5032,7 +5047,7 @@ func TestGenerateBasePodSpec_Frontend(t *testing.T) {
func TestGenerateBasePodSpec_PlannerServiceAccount(t *testing.T) {
secretsRetriever := &mockSecretsRetriever{}
controllerConfig := controller_common.Config{}
controllerConfig := &configv1alpha1.OperatorConfiguration{}
tests := []struct {
name string
......@@ -5178,7 +5193,7 @@ func TestGenerateBasePodSpec_DisableImagePullSecretDiscovery(t *testing.T) {
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
controllerConfig := controller_common.Config{}
controllerConfig := &configv1alpha1.OperatorConfiguration{}
podSpec, err := GenerateBasePodSpec(
tt.component,
......@@ -5211,7 +5226,7 @@ func TestGenerateBasePodSpec_DiscoverBackend(t *testing.T) {
tests := []struct {
name string
component *v1alpha1.DynamoComponentDeploymentSharedSpec
controllerConfig controller_common.Config
controllerConfig *configv1alpha1.OperatorConfiguration
wantEnvVar string
}{
{
......@@ -5221,15 +5236,18 @@ func TestGenerateBasePodSpec_DiscoverBackend(t *testing.T) {
commonconsts.KubeAnnotationDynamoDiscoveryBackend: "kubernetes",
},
},
wantEnvVar: "kubernetes",
controllerConfig: &configv1alpha1.OperatorConfiguration{},
wantEnvVar: "kubernetes",
},
{
name: "Kubernetes discovery from controller config should set env var to kubernetes",
component: &v1alpha1.DynamoComponentDeploymentSharedSpec{
Annotations: map[string]string{},
},
controllerConfig: controller_common.Config{
DiscoveryBackend: "kubernetes",
controllerConfig: &configv1alpha1.OperatorConfiguration{
Discovery: configv1alpha1.DiscoveryConfiguration{
Backend: "kubernetes",
},
},
wantEnvVar: "kubernetes",
},
......@@ -5240,8 +5258,10 @@ func TestGenerateBasePodSpec_DiscoverBackend(t *testing.T) {
commonconsts.KubeAnnotationDynamoDiscoveryBackend: "etcd",
},
},
controllerConfig: controller_common.Config{
DiscoveryBackend: "kubernetes",
controllerConfig: &configv1alpha1.OperatorConfiguration{
Discovery: configv1alpha1.DiscoveryConfiguration{
Backend: "kubernetes",
},
},
wantEnvVar: "", // etcd is the runtime default, no env var needed
},
......@@ -5250,8 +5270,10 @@ func TestGenerateBasePodSpec_DiscoverBackend(t *testing.T) {
component: &v1alpha1.DynamoComponentDeploymentSharedSpec{
Annotations: map[string]string{},
},
controllerConfig: controller_common.Config{
DiscoveryBackend: "etcd",
controllerConfig: &configv1alpha1.OperatorConfiguration{
Discovery: configv1alpha1.DiscoveryConfiguration{
Backend: "etcd",
},
},
wantEnvVar: "", // etcd is the runtime default, no env var needed
},
......@@ -5262,15 +5284,18 @@ func TestGenerateBasePodSpec_DiscoverBackend(t *testing.T) {
commonconsts.KubeAnnotationDynamoDiscoveryBackend: "",
},
},
controllerConfig: controller_common.Config{
DiscoveryBackend: "",
controllerConfig: &configv1alpha1.OperatorConfiguration{
Discovery: configv1alpha1.DiscoveryConfiguration{
Backend: "",
},
},
wantEnvVar: "kubernetes", // empty defaults to kubernetes
},
{
name: "Discovery backend not set defaults to kubernetes",
component: &v1alpha1.DynamoComponentDeploymentSharedSpec{},
wantEnvVar: "kubernetes", // not set defaults to kubernetes
name: "Discovery backend not set defaults to kubernetes",
component: &v1alpha1.DynamoComponentDeploymentSharedSpec{},
controllerConfig: &configv1alpha1.OperatorConfiguration{},
wantEnvVar: "kubernetes", // not set defaults to kubernetes
},
}
secretsRetriever := &mockSecretsRetriever{}
......@@ -5307,7 +5332,7 @@ func TestGenerateBasePodSpec_DiscoverBackend(t *testing.T) {
func TestGenerateBasePodSpec_Worker(t *testing.T) {
secretsRetriever := &mockSecretsRetriever{}
controllerConfig := controller_common.Config{}
controllerConfig := &configv1alpha1.OperatorConfiguration{}
tests := []struct {
name string
......@@ -5474,7 +5499,7 @@ func TestGenerateBasePodSpec_Worker(t *testing.T) {
func TestGenerateBasePodSpec_VolumeMounts(t *testing.T) {
secretsRetriever := &mockSecretsRetriever{}
controllerConfig := controller_common.Config{}
controllerConfig := &configv1alpha1.OperatorConfiguration{}
tests := []struct {
name string
......@@ -5609,7 +5634,7 @@ func TestGenerateBasePodSpec_VolumeMounts(t *testing.T) {
func TestGenerateBasePodSpec_ResourceClaims(t *testing.T) {
secretsRetriever := &mockSecretsRetriever{}
controllerConfig := controller_common.Config{}
controllerConfig := &configv1alpha1.OperatorConfiguration{}
tests := []struct {
name string
......@@ -5869,7 +5894,7 @@ func TestGenerateBasePodSpec_ResourceClaims(t *testing.T) {
func TestGenerateBasePodSpec_UseAsCompilationCache_BackendSupport(t *testing.T) {
secretsRetriever := &mockSecretsRetriever{}
controllerConfig := controller_common.Config{}
controllerConfig := &configv1alpha1.OperatorConfiguration{}
tests := []struct {
name string
......@@ -6055,7 +6080,7 @@ func TestGenerateBasePodSpec_UseAsCompilationCache_BackendSupport(t *testing.T)
func TestGenerateBasePodSpec_SecurityContext(t *testing.T) {
secretsRetriever := &mockSecretsRetriever{}
controllerConfig := controller_common.Config{}
controllerConfig := &configv1alpha1.OperatorConfiguration{}
tests := []struct {
name string
......@@ -6696,12 +6721,14 @@ func TestGenerateGrovePodCliqueSet_RestartAnnotations(t *testing.T) {
},
}
controllerConfig := controller_common.Config{
EtcdAddress: "etcd-address",
NatsAddress: "nats-address",
controllerConfig := &configv1alpha1.OperatorConfiguration{
Infrastructure: configv1alpha1.InfrastructureConfiguration{
ETCDAddress: "etcd-address",
NATSAddress: "nats-address",
},
}
got, err := GenerateGrovePodCliqueSet(context.Background(), dgd, controllerConfig, nil, tt.restartState, nil, nil)
got, err := GenerateGrovePodCliqueSet(context.Background(), dgd, controllerConfig, &controller_common.RuntimeConfig{}, nil, tt.restartState, nil, nil)
if err != nil {
t.Fatalf("GenerateGrovePodCliqueSet() error = %v", err)
}
......
......@@ -305,11 +305,11 @@ func ResolveKaiSchedulerQueue(annotations map[string]string) string {
// injectKaiSchedulerIfEnabled injects kai-scheduler settings into a clique if kai-scheduler is enabled and grove is enabled
func injectKaiSchedulerIfEnabled(
clique *grovev1alpha1.PodCliqueTemplateSpec,
controllerConfig controller_common.Config,
runtimeConfig *controller_common.RuntimeConfig,
validatedQueueName string,
) {
// Only proceed if grove is enabled, kai-scheduler is enabled, and no manual schedulerName is set
if !controllerConfig.Grove.Enabled || !controllerConfig.KaiScheduler.Enabled {
if !runtimeConfig.GroveEnabled || !runtimeConfig.KaiSchedulerEnabled {
return
}
......
......@@ -112,7 +112,7 @@ func TestResolveKaiSchedulerQueue(t *testing.T) {
func TestInjectKaiSchedulerIfEnabled(t *testing.T) {
tests := []struct {
name string
controllerConfig controller_common.Config
runtimeConfig *controller_common.RuntimeConfig
validatedQueueName string
initialClique *grovev1alpha1.PodCliqueTemplateSpec
expectedScheduler string
......@@ -121,9 +121,9 @@ func TestInjectKaiSchedulerIfEnabled(t *testing.T) {
}{
{
name: "grove disabled - no injection",
controllerConfig: controller_common.Config{
Grove: controller_common.GroveConfig{Enabled: false},
KaiScheduler: controller_common.KaiSchedulerConfig{Enabled: true},
runtimeConfig: &controller_common.RuntimeConfig{
GroveEnabled: false,
KaiSchedulerEnabled: true,
},
validatedQueueName: "test-queue",
initialClique: &grovev1alpha1.PodCliqueTemplateSpec{
......@@ -135,9 +135,9 @@ func TestInjectKaiSchedulerIfEnabled(t *testing.T) {
},
{
name: "kai-scheduler disabled - no injection",
controllerConfig: controller_common.Config{
Grove: controller_common.GroveConfig{Enabled: true},
KaiScheduler: controller_common.KaiSchedulerConfig{Enabled: false},
runtimeConfig: &controller_common.RuntimeConfig{
GroveEnabled: true,
KaiSchedulerEnabled: false,
},
validatedQueueName: "test-queue",
initialClique: &grovev1alpha1.PodCliqueTemplateSpec{
......@@ -149,9 +149,9 @@ func TestInjectKaiSchedulerIfEnabled(t *testing.T) {
},
{
name: "manual scheduler set - no injection",
controllerConfig: controller_common.Config{
Grove: controller_common.GroveConfig{Enabled: true},
KaiScheduler: controller_common.KaiSchedulerConfig{Enabled: true},
runtimeConfig: &controller_common.RuntimeConfig{
GroveEnabled: true,
KaiSchedulerEnabled: true,
},
validatedQueueName: "test-queue",
initialClique: &grovev1alpha1.PodCliqueTemplateSpec{
......@@ -165,9 +165,9 @@ func TestInjectKaiSchedulerIfEnabled(t *testing.T) {
},
{
name: "both enabled, no manual scheduler - inject",
controllerConfig: controller_common.Config{
Grove: controller_common.GroveConfig{Enabled: true},
KaiScheduler: controller_common.KaiSchedulerConfig{Enabled: true},
runtimeConfig: &controller_common.RuntimeConfig{
GroveEnabled: true,
KaiSchedulerEnabled: true,
},
validatedQueueName: "test-queue",
initialClique: &grovev1alpha1.PodCliqueTemplateSpec{
......@@ -181,9 +181,9 @@ func TestInjectKaiSchedulerIfEnabled(t *testing.T) {
},
{
name: "inject with existing labels",
controllerConfig: controller_common.Config{
Grove: controller_common.GroveConfig{Enabled: true},
KaiScheduler: controller_common.KaiSchedulerConfig{Enabled: true},
runtimeConfig: &controller_common.RuntimeConfig{
GroveEnabled: true,
KaiSchedulerEnabled: true,
},
validatedQueueName: "custom-queue",
initialClique: &grovev1alpha1.PodCliqueTemplateSpec{
......@@ -206,7 +206,7 @@ func TestInjectKaiSchedulerIfEnabled(t *testing.T) {
clique := tt.initialClique.DeepCopy()
// Call the function
injectKaiSchedulerIfEnabled(clique, tt.controllerConfig, tt.validatedQueueName)
injectKaiSchedulerIfEnabled(clique, tt.runtimeConfig, tt.validatedQueueName)
if tt.shouldInject {
// Verify scheduler name is injected
......
......@@ -11,6 +11,7 @@
## Packages
- [nvidia.com/v1alpha1](#nvidiacomv1alpha1)
- [nvidia.com/v1beta1](#nvidiacomv1beta1)
- [operator.config.dynamo.nvidia.com/v1alpha1](#operatorconfigdynamonvidiacomv1alpha1)
## nvidia.com/v1alpha1
......@@ -1584,6 +1585,480 @@ _Appears in:_
| `requestRate` _float_ | RequestRate is the target request rate (req/s).<br />Required (or Concurrency) when the planner is disabled. | | Optional: \{\} <br /> |
## operator.config.dynamo.nvidia.com/v1alpha1
### Resource Types
- [OperatorConfiguration](#operatorconfiguration)
#### CheckpointConfiguration
CheckpointConfiguration holds checkpoint/restore settings.
_Appears in:_
- [OperatorConfiguration](#operatorconfiguration)
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `enabled` _boolean_ | Enabled indicates if checkpoint functionality is enabled | | |
| `readyForCheckpointFilePath` _string_ | ReadyForCheckpointFilePath signals model readiness for checkpoint jobs | /tmp/ready-for-checkpoint | |
| `storage` _[CheckpointStorageConfiguration](#checkpointstorageconfiguration)_ | Storage holds storage backend configuration | | |
#### CheckpointOCIConfig
CheckpointOCIConfig holds OCI registry storage configuration.
_Appears in:_
- [CheckpointStorageConfiguration](#checkpointstorageconfiguration)
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `uri` _string_ | URI is the OCI URI (oci://registry/repository) | | |
| `credentialsSecretRef` _string_ | CredentialsSecretRef is the name of the docker config secret | | |
#### CheckpointPVCConfig
CheckpointPVCConfig holds PVC storage configuration.
_Appears in:_
- [CheckpointStorageConfiguration](#checkpointstorageconfiguration)
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `pvcName` _string_ | PVCName is the name of the PVC | chrek-pvc | |
| `basePath` _string_ | BasePath is the base directory within the PVC | /checkpoints | |
#### CheckpointS3Config
CheckpointS3Config holds S3 storage configuration.
_Appears in:_
- [CheckpointStorageConfiguration](#checkpointstorageconfiguration)
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `uri` _string_ | URI is the S3 URI (s3://[endpoint/]bucket/prefix) | | |
| `credentialsSecretRef` _string_ | CredentialsSecretRef is the name of the credentials secret | | |
#### CheckpointStorageConfiguration
CheckpointStorageConfiguration holds storage backend configuration for checkpoints.
_Appears in:_
- [CheckpointConfiguration](#checkpointconfiguration)
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `type` _string_ | Type is the storage backend type: pvc, s3, or oci | pvc | |
| `pvc` _[CheckpointPVCConfig](#checkpointpvcconfig)_ | PVC configuration (used when Type=pvc) | | |
| `s3` _[CheckpointS3Config](#checkpoints3config)_ | S3 configuration (used when Type=s3) | | |
| `oci` _[CheckpointOCIConfig](#checkpointociconfig)_ | OCI configuration (used when Type=oci) | | |
#### DiscoveryBackend
_Underlying type:_ _string_
DiscoveryBackend is the type for the discovery backend.
_Appears in:_
- [DiscoveryConfiguration](#discoveryconfiguration)
| Field | Description |
| --- | --- |
| `kubernetes` | DiscoveryBackendKubernetes is the Kubernetes discovery backend<br /> |
| `etcd` | DiscoveryBackendEtcd is the etcd discovery backend<br /> |
#### DiscoveryConfiguration
DiscoveryConfiguration holds discovery backend settings.
_Appears in:_
- [OperatorConfiguration](#operatorconfiguration)
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `backend` _[DiscoveryBackend](#discoverybackend)_ | Backend is the discovery backend: "kubernetes" or "etcd" | kubernetes | |
#### GPUConfiguration
GPUConfiguration holds GPU discovery settings.
_Appears in:_
- [OperatorConfiguration](#operatorconfiguration)
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `discoveryEnabled` _boolean_ | DiscoveryEnabled indicates whether GPU discovery is enabled | true | |
#### GroveConfiguration
GroveConfiguration holds Grove orchestrator settings.
_Appears in:_
- [OrchestratorConfiguration](#orchestratorconfiguration)
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `enabled` _boolean_ | Enabled overrides auto-detection. nil = auto-detect. | | |
| `terminationDelay` _[Duration](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#duration-v1-meta)_ | TerminationDelay configures the termination delay for Grove PodCliqueSets | 15m | |
#### InfrastructureConfiguration
InfrastructureConfiguration holds service mesh and backend addresses.
_Appears in:_
- [OperatorConfiguration](#operatorconfiguration)
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `natsAddress` _string_ | NATSAddress is the address of the NATS server | | |
| `etcdAddress` _string_ | ETCDAddress is the address of the etcd server | | |
| `modelExpressURL` _string_ | ModelExpressURL is the URL of the Model Express server to inject into all pods | | |
| `prometheusEndpoint` _string_ | PrometheusEndpoint is the URL of the Prometheus endpoint to use for metrics | | |
#### IngressConfiguration
IngressConfiguration holds ingress settings.
_Appears in:_
- [OperatorConfiguration](#operatorconfiguration)
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `virtualServiceGateway` _string_ | VirtualServiceGateway is the name of the Istio virtual service gateway | | |
| `controllerClassName` _string_ | ControllerClassName is the ingress controller class name | | |
| `controllerTLSSecretName` _string_ | ControllerTLSSecretName is the TLS secret for the ingress controller | | |
| `hostSuffix` _string_ | HostSuffix is the suffix for ingress hostnames | | |
#### KaiSchedulerConfiguration
KaiSchedulerConfiguration holds Kai-scheduler settings.
_Appears in:_
- [OrchestratorConfiguration](#orchestratorconfiguration)
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `enabled` _boolean_ | Enabled overrides auto-detection. nil = auto-detect. | | |
#### LWSConfiguration
LWSConfiguration holds LWS orchestrator settings.
_Appears in:_
- [OrchestratorConfiguration](#orchestratorconfiguration)
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `enabled` _boolean_ | Enabled overrides auto-detection. nil = auto-detect. | | |
#### LeaderElectionConfiguration
LeaderElectionConfiguration holds leader election settings.
_Appears in:_
- [OperatorConfiguration](#operatorconfiguration)
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `enabled` _boolean_ | Enabled enables leader election for the controller manager | false | |
| `id` _string_ | ID is the leader election resource identity | | |
| `namespace` _string_ | Namespace is the namespace for the leader election resource | | |
#### LoggingConfiguration
LoggingConfiguration holds logging settings.
_Appears in:_
- [OperatorConfiguration](#operatorconfiguration)
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `level` _string_ | Level is the log level (e.g., "info", "debug") | info | |
| `format` _string_ | Format is the log format (e.g., "json", "text") | json | |
#### MPIConfiguration
MPIConfiguration holds MPI SSH secret settings.
_Appears in:_
- [OperatorConfiguration](#operatorconfiguration)
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `sshSecretName` _string_ | SSHSecretName is the name of the secret containing the SSH key for MPI | | |
| `sshSecretNamespace` _string_ | SSHSecretNamespace is the namespace where the MPI SSH secret is located | | |
#### MetricsServer
MetricsServer extends Server with a secure serving option.
_Appears in:_
- [ServerConfiguration](#serverconfiguration)
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `bindAddress` _string_ | BindAddress is the address the server binds to | | |
| `port` _integer_ | Port is the port the server listens on | | |
| `secure` _boolean_ | Secure enables secure serving for the metrics endpoint | | |
#### NamespaceConfiguration
NamespaceConfiguration determines operator namespace mode.
_Appears in:_
- [OperatorConfiguration](#operatorconfiguration)
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `restricted` _string_ | Restricted is the namespace to restrict to. Empty = cluster-wide mode. | | |
| `scope` _[NamespaceScopeConfiguration](#namespacescopeconfiguration)_ | Scope holds namespace scope lease settings (namespace-restricted mode only) | | |
#### NamespaceScopeConfiguration
NamespaceScopeConfiguration holds lease settings for namespace-restricted mode.
_Appears in:_
- [NamespaceConfiguration](#namespaceconfiguration)
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `leaseDuration` _[Duration](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#duration-v1-meta)_ | LeaseDuration is the duration of the namespace scope marker lease before it expires | 30s | |
| `leaseRenewInterval` _[Duration](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#duration-v1-meta)_ | LeaseRenewInterval is the interval for renewing the namespace scope marker lease | 10s | |
#### OperatorConfiguration
OperatorConfiguration is the Schema for the operator configuration.
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `apiVersion` _string_ | `operator.config.dynamo.nvidia.com/v1alpha1` | | |
| `kind` _string_ | `OperatorConfiguration` | | |
| `server` _[ServerConfiguration](#serverconfiguration)_ | Server configuration (metrics, health probes, webhooks) | | |
| `leaderElection` _[LeaderElectionConfiguration](#leaderelectionconfiguration)_ | Leader election configuration | | |
| `namespace` _[NamespaceConfiguration](#namespaceconfiguration)_ | Namespace configuration (restricted vs cluster-wide) | | |
| `orchestrators` _[OrchestratorConfiguration](#orchestratorconfiguration)_ | Orchestrator configuration with optional overrides | | |
| `infrastructure` _[InfrastructureConfiguration](#infrastructureconfiguration)_ | Service mesh and infrastructure addresses | | |
| `ingress` _[IngressConfiguration](#ingressconfiguration)_ | Ingress configuration | | |
| `rbac` _[RBACConfiguration](#rbacconfiguration)_ | RBAC configuration for cross-namespace resource management (cluster-wide mode) | | |
| `mpi` _[MPIConfiguration](#mpiconfiguration)_ | MPI SSH secret configuration | | |
| `checkpoint` _[CheckpointConfiguration](#checkpointconfiguration)_ | Checkpoint/restore configuration | | |
| `discovery` _[DiscoveryConfiguration](#discoveryconfiguration)_ | Discovery backend configuration | | |
| `gpu` _[GPUConfiguration](#gpuconfiguration)_ | GPU discovery configuration | | |
| `logging` _[LoggingConfiguration](#loggingconfiguration)_ | Logging configuration | | |
| `security` _[SecurityConfiguration](#securityconfiguration)_ | HTTP/2 and TLS settings | | |
#### OrchestratorConfiguration
OrchestratorConfiguration holds orchestrator override settings.
_Appears in:_
- [OperatorConfiguration](#operatorconfiguration)
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `grove` _[GroveConfiguration](#groveconfiguration)_ | Grove orchestrator configuration | | |
| `lws` _[LWSConfiguration](#lwsconfiguration)_ | LWS orchestrator configuration | | |
| `kaiScheduler` _[KaiSchedulerConfiguration](#kaischedulerconfiguration)_ | KaiScheduler configuration | | |
#### RBACConfiguration
RBACConfiguration holds RBAC settings for cluster-wide mode.
_Appears in:_
- [OperatorConfiguration](#operatorconfiguration)
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `plannerClusterRoleName` _string_ | PlannerClusterRoleName is the ClusterRole for planner | | |
| `dgdrProfilingClusterRoleName` _string_ | DGDRProfilingClusterRoleName is the ClusterRole for DGDR profiling jobs | | |
| `eppClusterRoleName` _string_ | EPPClusterRoleName is the ClusterRole for EPP | | |
#### SecurityConfiguration
SecurityConfiguration holds HTTP/2 and TLS settings.
_Appears in:_
- [OperatorConfiguration](#operatorconfiguration)
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `enableHTTP2` _boolean_ | EnableHTTP2 enables HTTP/2 for metrics and webhook servers | false | |
#### Server
Server holds a bind address and port.
_Appears in:_
- [MetricsServer](#metricsserver)
- [ServerConfiguration](#serverconfiguration)
- [WebhookServer](#webhookserver)
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `bindAddress` _string_ | BindAddress is the address the server binds to | | |
| `port` _integer_ | Port is the port the server listens on | | |
#### ServerConfiguration
ServerConfiguration holds server bind addresses and ports.
_Appears in:_
- [OperatorConfiguration](#operatorconfiguration)
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `metrics` _[MetricsServer](#metricsserver)_ | Metrics server configuration | \{ bindAddress:127.0.0.1 port:8080 \} | |
| `healthProbe` _[Server](#server)_ | Health probe server configuration | \{ bindAddress:0.0.0.0 port:8081 \} | |
| `webhook` _[WebhookServer](#webhookserver)_ | Webhook server configuration | \{ certDir:/tmp/k8s-webhook-server/serving-certs host:0.0.0.0 port:9443 \} | |
#### WebhookServer
WebhookServer extends Server with host and certificate directory.
_Appears in:_
- [ServerConfiguration](#serverconfiguration)
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `bindAddress` _string_ | BindAddress is the address the server binds to | | |
| `port` _integer_ | Port is the port the server listens on | | |
| `host` _string_ | Host is the address the webhook server binds to | | |
| `certDir` _string_ | CertDir is the directory containing TLS certificates | | |
# Operator Default Values Injection
The Dynamo operator automatically applies default values to various fields when they are not explicitly specified in your deployments. These defaults include:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment