Unverified commit c0f34b15, authored by Thomas Montfort and committed by GitHub
Browse files

feat(operator): operator config versioning and injected as configmap (#6464)

parent 59c5f6f1
...@@ -44,6 +44,7 @@ import ( ...@@ -44,6 +44,7 @@ import (
"sigs.k8s.io/controller-runtime/pkg/predicate" "sigs.k8s.io/controller-runtime/pkg/predicate"
sigsyaml "sigs.k8s.io/yaml" sigsyaml "sigs.k8s.io/yaml"
configv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/config/v1alpha1"
nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1" nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1"
"github.com/ai-dynamo/dynamo/deploy/operator/internal/consts" "github.com/ai-dynamo/dynamo/deploy/operator/internal/consts"
commonController "github.com/ai-dynamo/dynamo/deploy/operator/internal/controller_common" commonController "github.com/ai-dynamo/dynamo/deploy/operator/internal/controller_common"
...@@ -298,8 +299,9 @@ echo "Saved profiling output to ConfigMap {{.ConfigMapName}}" ...@@ -298,8 +299,9 @@ echo "Saved profiling output to ConfigMap {{.ConfigMapName}}"
// DynamoGraphDeploymentRequestReconciler reconciles a DynamoGraphDeploymentRequest object // DynamoGraphDeploymentRequestReconciler reconciles a DynamoGraphDeploymentRequest object
type DynamoGraphDeploymentRequestReconciler struct { type DynamoGraphDeploymentRequestReconciler struct {
client.Client client.Client
Recorder record.EventRecorder Recorder record.EventRecorder
Config commonController.Config Config *configv1alpha1.OperatorConfiguration
RuntimeConfig *commonController.RuntimeConfig
// RBACMgr handles RBAC setup for profiling jobs // RBACMgr handles RBAC setup for profiling jobs
RBACManager RBACManager RBACManager RBACManager
...@@ -1015,7 +1017,7 @@ func (r *DynamoGraphDeploymentRequestReconciler) validateGPUHardwareInfo(ctx con ...@@ -1015,7 +1017,7 @@ func (r *DynamoGraphDeploymentRequestReconciler) validateGPUHardwareInfo(ctx con
logger.Info("GPU discovery not available", "reason", err.Error()) logger.Info("GPU discovery not available", "reason", err.Error())
isNamespaceScoped := r.Config.RestrictedNamespace != "" isNamespaceScoped := r.Config.Namespace.Restricted != ""
if isNamespaceScoped { if isNamespaceScoped {
tmpl := template.Must(template.New("nsGPUErr").Parse( tmpl := template.Must(template.New("nsGPUErr").Parse(
`GPU hardware info required but cannot be auto-discovered.` + `GPU hardware info required but cannot be auto-discovered.` +
...@@ -1073,7 +1075,7 @@ func (r *DynamoGraphDeploymentRequestReconciler) createProfilingJob(ctx context. ...@@ -1073,7 +1075,7 @@ func (r *DynamoGraphDeploymentRequestReconciler) createProfilingJob(ctx context.
} }
// Ensure profiling job RBAC exists (only for cluster-wide installation) // Ensure profiling job RBAC exists (only for cluster-wide installation)
if r.Config.RestrictedNamespace == "" { if r.Config.Namespace.Restricted == "" {
if err := r.RBACManager.EnsureServiceAccountWithRBAC( if err := r.RBACManager.EnsureServiceAccountWithRBAC(
ctx, ctx,
dgdr.Namespace, dgdr.Namespace,
...@@ -1791,7 +1793,9 @@ func (r *DynamoGraphDeploymentRequestReconciler) SetupWithManager(mgr ctrl.Manag ...@@ -1791,7 +1793,9 @@ func (r *DynamoGraphDeploymentRequestReconciler) SetupWithManager(mgr ctrl.Manag
UpdateFunc: func(ue event.UpdateEvent) bool { return true }, UpdateFunc: func(ue event.UpdateEvent) bool { return true },
GenericFunc: func(ge event.GenericEvent) bool { return true }, GenericFunc: func(ge event.GenericEvent) bool { return true },
}), }),
). // Watch DGDs created by this controller (via label) ).
WithEventFilter(commonController.EphemeralDeploymentEventFilter(r.Config)). // set the event filter to ignore resources handled by other controllers in namespace-restricted mode // Watch DGDs created by this controller (via label)
// Set the event filter to ignore resources handled by other controllers in namespace-restricted mode
WithEventFilter(commonController.EphemeralDeploymentEventFilter(r.Config, r.RuntimeConfig)).
Complete(observability.NewObservedReconciler(r, consts.ResourceTypeDynamoGraphDeploymentRequest)) Complete(observability.NewObservedReconciler(r, consts.ResourceTypeDynamoGraphDeploymentRequest))
} }
...@@ -22,6 +22,7 @@ import ( ...@@ -22,6 +22,7 @@ import (
"encoding/json" "encoding/json"
"time" "time"
configv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/config/v1alpha1"
nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1" nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1"
commonController "github.com/ai-dynamo/dynamo/deploy/operator/internal/controller_common" commonController "github.com/ai-dynamo/dynamo/deploy/operator/internal/controller_common"
. "github.com/onsi/ginkgo/v2" . "github.com/onsi/ginkgo/v2"
...@@ -86,13 +87,16 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() { ...@@ -86,13 +87,16 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() {
reconciler = &DynamoGraphDeploymentRequestReconciler{ reconciler = &DynamoGraphDeploymentRequestReconciler{
Client: k8sClient, Client: k8sClient,
Recorder: recorder, Recorder: recorder,
Config: commonController.Config{ Config: &configv1alpha1.OperatorConfiguration{
RestrictedNamespace: "", Namespace: configv1alpha1.NamespaceConfiguration{
RBAC: commonController.RBACConfig{ Restricted: "",
},
RBAC: configv1alpha1.RBACConfiguration{
DGDRProfilingClusterRoleName: "test-cluster-role", DGDRProfilingClusterRoleName: "test-cluster-role",
}, },
}, },
RBACManager: &MockRBACManager{}, RuntimeConfig: &commonController.RuntimeConfig{},
RBACManager: &MockRBACManager{},
} }
}) })
...@@ -956,10 +960,13 @@ var _ = Describe("DGDR Profiler Arguments", func() { ...@@ -956,10 +960,13 @@ var _ = Describe("DGDR Profiler Arguments", func() {
reconciler = &DynamoGraphDeploymentRequestReconciler{ reconciler = &DynamoGraphDeploymentRequestReconciler{
Client: k8sClient, Client: k8sClient,
Recorder: record.NewFakeRecorder(100), Recorder: record.NewFakeRecorder(100),
Config: commonController.Config{ Config: &configv1alpha1.OperatorConfiguration{
RestrictedNamespace: "", Namespace: configv1alpha1.NamespaceConfiguration{
Restricted: "",
},
}, },
RBACManager: &MockRBACManager{}, RuntimeConfig: &commonController.RuntimeConfig{},
RBACManager: &MockRBACManager{},
} }
}) })
...@@ -1199,10 +1206,13 @@ var _ = Describe("DGDR Error Handling", func() { ...@@ -1199,10 +1206,13 @@ var _ = Describe("DGDR Error Handling", func() {
reconciler = &DynamoGraphDeploymentRequestReconciler{ reconciler = &DynamoGraphDeploymentRequestReconciler{
Client: k8sClient, Client: k8sClient,
Recorder: recorder, Recorder: recorder,
Config: commonController.Config{ Config: &configv1alpha1.OperatorConfiguration{
RestrictedNamespace: "", Namespace: configv1alpha1.NamespaceConfiguration{
Restricted: "",
},
}, },
RBACManager: &MockRBACManager{}, RuntimeConfig: &commonController.RuntimeConfig{},
RBACManager: &MockRBACManager{},
} }
}) })
......
...@@ -36,6 +36,7 @@ import ( ...@@ -36,6 +36,7 @@ import (
"sigs.k8s.io/controller-runtime/pkg/predicate" "sigs.k8s.io/controller-runtime/pkg/predicate"
"sigs.k8s.io/controller-runtime/pkg/reconcile" "sigs.k8s.io/controller-runtime/pkg/reconcile"
configv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/config/v1alpha1"
nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1" nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1"
"github.com/ai-dynamo/dynamo/deploy/operator/internal/consts" "github.com/ai-dynamo/dynamo/deploy/operator/internal/consts"
commonController "github.com/ai-dynamo/dynamo/deploy/operator/internal/controller_common" commonController "github.com/ai-dynamo/dynamo/deploy/operator/internal/controller_common"
...@@ -45,9 +46,10 @@ import ( ...@@ -45,9 +46,10 @@ import (
// DynamoGraphDeploymentScalingAdapterReconciler reconciles a DynamoGraphDeploymentScalingAdapter object // DynamoGraphDeploymentScalingAdapterReconciler reconciles a DynamoGraphDeploymentScalingAdapter object
type DynamoGraphDeploymentScalingAdapterReconciler struct { type DynamoGraphDeploymentScalingAdapterReconciler struct {
client.Client client.Client
Scheme *runtime.Scheme Scheme *runtime.Scheme
Recorder record.EventRecorder Recorder record.EventRecorder
Config commonController.Config Config *configv1alpha1.OperatorConfiguration
RuntimeConfig *commonController.RuntimeConfig
} }
// +kubebuilder:rbac:groups=nvidia.com,resources=dynamographdeploymentscalingadapters,verbs=get;list;watch;create;update;patch;delete // +kubebuilder:rbac:groups=nvidia.com,resources=dynamographdeploymentscalingadapters,verbs=get;list;watch;create;update;patch;delete
...@@ -177,7 +179,7 @@ func (r *DynamoGraphDeploymentScalingAdapterReconciler) SetupWithManager(mgr ctr ...@@ -177,7 +179,7 @@ func (r *DynamoGraphDeploymentScalingAdapterReconciler) SetupWithManager(mgr ctr
GenericFunc: func(ge event.GenericEvent) bool { return false }, GenericFunc: func(ge event.GenericEvent) bool { return false },
}), }),
). ).
WithEventFilter(commonController.EphemeralDeploymentEventFilter(r.Config)). WithEventFilter(commonController.EphemeralDeploymentEventFilter(r.Config, r.RuntimeConfig)).
Complete(observability.NewObservedReconciler(r, consts.ResourceTypeDynamoGraphDeploymentScalingAdapter)) Complete(observability.NewObservedReconciler(r, consts.ResourceTypeDynamoGraphDeploymentScalingAdapter))
} }
......
...@@ -20,8 +20,8 @@ package controller_common ...@@ -20,8 +20,8 @@ package controller_common
import ( import (
"context" "context"
"strings" "strings"
"time"
configv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/config/v1alpha1"
commonconsts "github.com/ai-dynamo/dynamo/deploy/operator/internal/consts" commonconsts "github.com/ai-dynamo/dynamo/deploy/operator/internal/consts"
"k8s.io/apimachinery/pkg/api/meta" "k8s.io/apimachinery/pkg/api/meta"
"k8s.io/client-go/discovery" "k8s.io/client-go/discovery"
...@@ -36,153 +36,22 @@ type ExcludedNamespacesInterface interface { ...@@ -36,153 +36,22 @@ type ExcludedNamespacesInterface interface {
Contains(namespace string) bool Contains(namespace string) bool
} }
// GroveConfig groups the Grove-related settings: whether Grove is enabled
// and the termination delay applied to Grove PodCliqueSets.
type GroveConfig struct {
// Enabled is automatically determined by checking if Grove CRDs are installed in the cluster.
Enabled bool
// TerminationDelay configures the termination delay for Grove PodCliqueSets.
TerminationDelay time.Duration
}
// LWSConfig groups the LWS-related settings; it currently only records
// whether LWS is enabled.
type LWSConfig struct {
// Enabled is automatically determined by checking if LWS CRDs are installed in the cluster.
Enabled bool
}
// KaiSchedulerConfig groups the Kai-scheduler-related settings; it currently
// only records whether Kai-scheduler is enabled.
type KaiSchedulerConfig struct {
// Enabled is automatically determined by checking if Kai-scheduler CRDs are installed in the cluster.
Enabled bool
}
// MpiRunConfig holds the settings needed for MPI Run.
type MpiRunConfig struct {
// SecretName is the name of the secret containing the SSH key for MPI Run.
// BackendFactory hands this value to the TRTLLM backend as MpiRunSecretName.
SecretName string
}
// Config is the operator-wide controller configuration. It carries the
// namespace restriction, the optional integration settings (Grove, LWS,
// Kai-scheduler), infrastructure addresses, ingress, RBAC, discovery and
// checkpoint settings.
type Config struct {
// Enable resources filtering, only the resources belonging to the given namespace will be handled.
// An empty value means cluster-wide mode (no namespace restriction).
RestrictedNamespace string
// Grove holds the Grove integration settings.
Grove GroveConfig
// LWS holds the LWS integration settings.
LWS LWSConfig
// KaiScheduler holds the Kai-scheduler integration settings.
KaiScheduler KaiSchedulerConfig
// EtcdAddress, when non-empty, is injected into containers as the
// ETCD_ENDPOINTS environment variable (see addStandardEnvVars).
EtcdAddress string
// NatsAddress, when non-empty, is injected into containers as the
// NATS_SERVER environment variable (see addStandardEnvVars).
NatsAddress string
// IngressConfig holds the defaults used to build the default ingress spec
// (see GenerateDefaultIngressSpec).
IngressConfig IngressConfig
// ModelExpressURL is the URL of the Model Express server to inject into all pods
ModelExpressURL string
// PrometheusEndpoint is the URL of the Prometheus endpoint to use for metrics
PrometheusEndpoint string
// MpiRun holds the MPI Run settings (SSH key secret name).
MpiRun MpiRunConfig
// RBAC configuration for cross-namespace resource management
RBAC RBACConfig
// ExcludedNamespaces is a thread-safe set of namespaces to exclude (cluster-wide mode only)
// A nil value is tolerated by EphemeralDeploymentEventFilter, which nil-checks it before use.
ExcludedNamespaces ExcludedNamespacesInterface
// DiscoveryBackend is the discovery backend to use. Default is "kubernetes" for Kubernetes API service discovery. Set to "etcd" to use ETCD for discovery.
// It is only the fallback: the KubeAnnotationDynamoDiscoveryBackend annotation,
// when present, takes precedence (see GetDiscoveryBackend).
DiscoveryBackend string
// GPUDiscoveryEnabled indicates whether Helm provisioned node read access for the namespace-scoped operator.
// Only relevant for namespace-scoped operators (RestrictedNamespace != "").
GPUDiscoveryEnabled bool
// Checkpoint configuration for checkpoint/restore functionality
Checkpoint CheckpointConfig
}
// RBACConfig holds configuration for RBAC management.
// Every field is the name of a ClusterRole and applies to cluster-wide mode
// only.
type RBACConfig struct {
// PlannerClusterRoleName is the name of the ClusterRole for planner (cluster-wide mode only).
PlannerClusterRoleName string
// DGDRProfilingClusterRoleName is the name of the ClusterRole for DGDR profiling jobs (cluster-wide mode only).
DGDRProfilingClusterRoleName string
// EPPClusterRoleName is the name of the ClusterRole for EPP (cluster-wide mode only).
EPPClusterRoleName string
}
// CheckpointConfig holds configuration for checkpoint/restore functionality.
type CheckpointConfig struct {
// Enabled indicates if checkpoint functionality is enabled.
// GenerateBasePodSpec only forwards this configuration to
// checkpoint.InjectCheckpointIntoPodSpec when Enabled is true; otherwise it
// passes a nil config.
Enabled bool
// Storage holds storage backend configuration.
Storage CheckpointStorageConfig
// ReadyForCheckpointFilePath is the file path used to signal model readiness for checkpoint jobs.
ReadyForCheckpointFilePath string
}
// Checkpoint storage type constants.
// These are the storage backend identifiers (pvc, s3, oci) that
// CheckpointStorageConfig.Type is documented to accept.
const (
CheckpointStorageTypePVC = "pvc"
CheckpointStorageTypeS3 = "s3"
CheckpointStorageTypeOCI = "oci"
)
// CheckpointStorageConfig holds storage backend configuration for checkpoints.
// Type selects which of the PVC, S3 or OCI sub-configurations is used.
type CheckpointStorageConfig struct {
// Type is the storage backend type: pvc, s3, or oci
// (see the CheckpointStorageType* constants).
Type string
// PVC configuration (used when Type=pvc)
PVC CheckpointPVCConfig
// S3 configuration (used when Type=s3)
S3 CheckpointS3Config
// OCI configuration (used when Type=oci)
OCI CheckpointOCIConfig
}
// CheckpointPVCConfig holds PVC storage configuration for checkpoints.
type CheckpointPVCConfig struct {
// PVCName is the name of the PVC.
PVCName string
// BasePath is the base directory within the PVC.
BasePath string
}
// CheckpointS3Config holds S3 storage configuration for checkpoints.
type CheckpointS3Config struct {
// URI is the S3 URI (s3://[endpoint/]bucket/prefix); the endpoint part is optional.
URI string
// CredentialsSecretRef is the name of the credentials secret
// (should contain AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, and optionally AWS_REGION)
CredentialsSecretRef string
}
// CheckpointOCIConfig holds OCI registry storage configuration for checkpoints.
type CheckpointOCIConfig struct {
// URI is the OCI URI (oci://registry/repository).
URI string
// CredentialsSecretRef is the name of the docker config secret.
CredentialsSecretRef string
}
// IngressConfig carries the operator-level ingress defaults that
// GenerateDefaultIngressSpec copies into a deployment's IngressSpec.
type IngressConfig struct {
	// VirtualServiceGateway, when non-empty, switches ingress to virtual-service mode.
	VirtualServiceGateway string
	// IngressControllerClassName, when non-empty, is used as the ingress controller class name.
	IngressControllerClassName string
	// IngressControllerTLSSecret, when non-empty, is used as the TLS secret name.
	IngressControllerTLSSecret string
	// IngressHostSuffix, when non-empty, is used as the host suffix.
	IngressHostSuffix string
}

// UseVirtualService reports whether a virtual service gateway is configured,
// i.e. whether VirtualServiceGateway is a non-empty string.
func (cfg *IngressConfig) UseVirtualService() bool {
	return len(cfg.VirtualServiceGateway) > 0
}
// DetectGroveAvailability checks if Grove is available by checking if the Grove API group is registered // DetectGroveAvailability checks if Grove is available by checking if the Grove API group is registered
// This approach uses the discovery client which is simpler and more reliable
func DetectGroveAvailability(ctx context.Context, mgr ctrl.Manager) bool { func DetectGroveAvailability(ctx context.Context, mgr ctrl.Manager) bool {
return detectAPIGroupAvailability(ctx, mgr, "grove.io") return detectAPIGroupAvailability(ctx, mgr, "grove.io")
} }
// DetectLWSAvailability checks if LWS is available by checking if the LWS API group is registered // DetectLWSAvailability checks if LWS is available by checking if the LWS API group is registered
// This approach uses the discovery client which is simpler and more reliable
func DetectLWSAvailability(ctx context.Context, mgr ctrl.Manager) bool { func DetectLWSAvailability(ctx context.Context, mgr ctrl.Manager) bool {
return detectAPIGroupAvailability(ctx, mgr, "leaderworkerset.x-k8s.io") return detectAPIGroupAvailability(ctx, mgr, "leaderworkerset.x-k8s.io")
} }
// detectVolcanoAvailability checks if Volcano is available by checking if the Volcano API group is registered // DetectVolcanoAvailability checks if Volcano is available by checking if the Volcano API group is registered
// This approach uses the discovery client which is simpler and more reliable
func DetectVolcanoAvailability(ctx context.Context, mgr ctrl.Manager) bool { func DetectVolcanoAvailability(ctx context.Context, mgr ctrl.Manager) bool {
return detectAPIGroupAvailability(ctx, mgr, "scheduling.volcano.sh") return detectAPIGroupAvailability(ctx, mgr, "scheduling.volcano.sh")
} }
// DetectKaiSchedulerAvailability checks if Kai-scheduler is available by checking if the scheduling.run.ai API group is registered // DetectKaiSchedulerAvailability checks if Kai-scheduler is available by checking if the scheduling.run.ai API group is registered
// This approach uses the discovery client which is simpler and more reliable
func DetectKaiSchedulerAvailability(ctx context.Context, mgr ctrl.Manager) bool { func DetectKaiSchedulerAvailability(ctx context.Context, mgr ctrl.Manager) bool {
return detectAPIGroupAvailability(ctx, mgr, "scheduling.run.ai") return detectAPIGroupAvailability(ctx, mgr, "scheduling.run.ai")
} }
...@@ -226,20 +95,23 @@ func detectAPIGroupAvailability(ctx context.Context, mgr ctrl.Manager, groupName ...@@ -226,20 +95,23 @@ func detectAPIGroupAvailability(ctx context.Context, mgr ctrl.Manager, groupName
return false return false
} }
// For DGD, pass in the meta annotations // GetDiscoveryBackend returns the discovery backend for the given annotations,
// For DCD, pass in the spec annotations // falling back to the configured default.
func (c Config) IsK8sDiscoveryEnabled(annotations map[string]string) bool { // For DGD, pass in the meta annotations; for DCD, pass in the spec annotations.
return c.GetDiscoveryBackend(annotations) == "kubernetes" func GetDiscoveryBackend(discoveryBackend configv1alpha1.DiscoveryBackend, annotations map[string]string) configv1alpha1.DiscoveryBackend {
}
func (c Config) GetDiscoveryBackend(annotations map[string]string) string {
if dgdDiscoveryBackend, exists := annotations[commonconsts.KubeAnnotationDynamoDiscoveryBackend]; exists { if dgdDiscoveryBackend, exists := annotations[commonconsts.KubeAnnotationDynamoDiscoveryBackend]; exists {
return dgdDiscoveryBackend return configv1alpha1.DiscoveryBackend(dgdDiscoveryBackend)
} }
return c.DiscoveryBackend return discoveryBackend
}
// IsK8sDiscoveryEnabled returns whether Kubernetes discovery is enabled for the given annotations.
func IsK8sDiscoveryEnabled(discoveryBackend configv1alpha1.DiscoveryBackend, annotations map[string]string) bool {
return GetDiscoveryBackend(discoveryBackend, annotations) == configv1alpha1.DiscoveryBackendKubernetes
} }
func EphemeralDeploymentEventFilter(config Config) predicate.Predicate { // EphemeralDeploymentEventFilter returns a predicate that filters events based on namespace configuration.
func EphemeralDeploymentEventFilter(config *configv1alpha1.OperatorConfiguration, runtimeConfig *RuntimeConfig) predicate.Predicate {
return predicate.NewPredicateFuncs(func(o client.Object) bool { return predicate.NewPredicateFuncs(func(o client.Object) bool {
l := log.FromContext(context.Background()) l := log.FromContext(context.Background())
objMeta, err := meta.Accessor(o) objMeta, err := meta.Accessor(o)
...@@ -247,13 +119,13 @@ func EphemeralDeploymentEventFilter(config Config) predicate.Predicate { ...@@ -247,13 +119,13 @@ func EphemeralDeploymentEventFilter(config Config) predicate.Predicate {
l.Error(err, "Error extracting object metadata") l.Error(err, "Error extracting object metadata")
return false return false
} }
if config.RestrictedNamespace != "" { if config.Namespace.Restricted != "" {
// in case of a restricted namespace, we only want to process the events that are in the restricted namespace // in case of a restricted namespace, we only want to process the events that are in the restricted namespace
return objMeta.GetNamespace() == config.RestrictedNamespace return objMeta.GetNamespace() == config.Namespace.Restricted
} }
// Cluster-wide mode: check if namespace is excluded // Cluster-wide mode: check if namespace is excluded
if config.ExcludedNamespaces != nil && config.ExcludedNamespaces.Contains(objMeta.GetNamespace()) { if runtimeConfig.ExcludedNamespaces != nil && runtimeConfig.ExcludedNamespaces.Contains(objMeta.GetNamespace()) {
l.V(1).Info("Skipping resource - namespace is excluded", l.V(1).Info("Skipping resource - namespace is excluded",
"namespace", objMeta.GetNamespace(), "namespace", objMeta.GetNamespace(),
"resource", objMeta.GetName(), "resource", objMeta.GetName(),
......
/*
* SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package controller_common
// RuntimeConfig holds runtime state that is resolved after startup (e.g., auto-detection results).
// This is separate from the static OperatorConfiguration loaded from config files.
//
// NOTE(review): EphemeralDeploymentEventFilter reads ExcludedNamespaces through
// a *RuntimeConfig without nil-checking the pointer itself in cluster-wide
// mode, so callers appear to be expected to always pass a non-nil value
// (the tests pass &RuntimeConfig{}) — confirm.
type RuntimeConfig struct {
// GroveEnabled is the resolved Grove availability (config override merged with auto-detection).
GroveEnabled bool
// LWSEnabled is the resolved LWS availability (config override merged with auto-detection).
LWSEnabled bool
// KaiSchedulerEnabled is the resolved Kai-scheduler availability (config override merged with auto-detection).
KaiSchedulerEnabled bool
// ExcludedNamespaces for cluster-wide mode namespace filtering.
// May be left nil: EphemeralDeploymentEventFilter checks it for nil before
// calling Contains.
ExcludedNamespaces ExcludedNamespacesInterface
}
...@@ -6,6 +6,7 @@ ...@@ -6,6 +6,7 @@
package dynamo package dynamo
import ( import (
configv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/config/v1alpha1"
"github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1" "github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1"
commonconsts "github.com/ai-dynamo/dynamo/deploy/operator/internal/consts" commonconsts "github.com/ai-dynamo/dynamo/deploy/operator/internal/consts"
corev1 "k8s.io/api/core/v1" corev1 "k8s.io/api/core/v1"
...@@ -48,7 +49,7 @@ type ComponentContext struct { ...@@ -48,7 +49,7 @@ type ComponentContext struct {
ComponentType string ComponentType string
ParentGraphDeploymentName string ParentGraphDeploymentName string
ParentGraphDeploymentNamespace string ParentGraphDeploymentNamespace string
DiscoveryBackend string DiscoveryBackend configv1alpha1.DiscoveryBackend
EPPConfig *v1alpha1.EPPConfig EPPConfig *v1alpha1.EPPConfig
WorkerHashSuffix string WorkerHashSuffix string
} }
......
...@@ -33,6 +33,7 @@ import ( ...@@ -33,6 +33,7 @@ import (
"k8s.io/apimachinery/pkg/util/intstr" "k8s.io/apimachinery/pkg/util/intstr"
"k8s.io/utils/ptr" "k8s.io/utils/ptr"
configv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/config/v1alpha1"
"github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1" "github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1"
"github.com/ai-dynamo/dynamo/deploy/operator/internal/checkpoint" "github.com/ai-dynamo/dynamo/deploy/operator/internal/checkpoint"
commonconsts "github.com/ai-dynamo/dynamo/deploy/operator/internal/consts" commonconsts "github.com/ai-dynamo/dynamo/deploy/operator/internal/consts"
...@@ -785,22 +786,22 @@ func GenerateComponentVirtualService(ctx context.Context, componentName, compone ...@@ -785,22 +786,22 @@ func GenerateComponentVirtualService(ctx context.Context, componentName, compone
return vs return vs
} }
func GenerateDefaultIngressSpec(dynamoDeployment *v1alpha1.DynamoGraphDeployment, ingressConfig controller_common.IngressConfig) v1alpha1.IngressSpec { func GenerateDefaultIngressSpec(dynamoDeployment *v1alpha1.DynamoGraphDeployment, ingressConfig configv1alpha1.IngressConfiguration) v1alpha1.IngressSpec {
res := v1alpha1.IngressSpec{ res := v1alpha1.IngressSpec{
Enabled: ingressConfig.VirtualServiceGateway != "" || ingressConfig.IngressControllerClassName != "", Enabled: ingressConfig.VirtualServiceGateway != "" || ingressConfig.ControllerClassName != "",
Host: dynamoDeployment.Name, Host: dynamoDeployment.Name,
UseVirtualService: ingressConfig.VirtualServiceGateway != "", UseVirtualService: ingressConfig.VirtualServiceGateway != "",
} }
if ingressConfig.IngressControllerClassName != "" { if ingressConfig.ControllerClassName != "" {
res.IngressControllerClassName = &ingressConfig.IngressControllerClassName res.IngressControllerClassName = &ingressConfig.ControllerClassName
} }
if ingressConfig.IngressControllerTLSSecret != "" { if ingressConfig.ControllerTLSSecretName != "" {
res.TLS = &v1alpha1.IngressTLSSpec{ res.TLS = &v1alpha1.IngressTLSSpec{
SecretName: ingressConfig.IngressControllerTLSSecret, SecretName: ingressConfig.ControllerTLSSecretName,
} }
} }
if ingressConfig.IngressHostSuffix != "" { if ingressConfig.HostSuffix != "" {
res.HostSuffix = &ingressConfig.IngressHostSuffix res.HostSuffix = &ingressConfig.HostSuffix
} }
if ingressConfig.VirtualServiceGateway != "" { if ingressConfig.VirtualServiceGateway != "" {
res.VirtualServiceGateway = &ingressConfig.VirtualServiceGateway res.VirtualServiceGateway = &ingressConfig.VirtualServiceGateway
...@@ -889,7 +890,7 @@ type MultinodeDeployer interface { ...@@ -889,7 +890,7 @@ type MultinodeDeployer interface {
} }
// BackendFactory creates backend instances based on the framework type // BackendFactory creates backend instances based on the framework type
func BackendFactory(backendFramework BackendFramework, controllerConfig controller_common.Config) Backend { func BackendFactory(backendFramework BackendFramework, operatorConfig *configv1alpha1.OperatorConfiguration) Backend {
switch backendFramework { switch backendFramework {
case BackendFrameworkSGLang: case BackendFrameworkSGLang:
return &SGLangBackend{} return &SGLangBackend{}
...@@ -897,7 +898,7 @@ func BackendFactory(backendFramework BackendFramework, controllerConfig controll ...@@ -897,7 +898,7 @@ func BackendFactory(backendFramework BackendFramework, controllerConfig controll
return &VLLMBackend{} return &VLLMBackend{}
case BackendFrameworkTRTLLM: case BackendFrameworkTRTLLM:
return &TRTLLMBackend{ return &TRTLLMBackend{
MpiRunSecretName: controllerConfig.MpiRun.SecretName, MpiRunSecretName: operatorConfig.MPI.SSHSecretName,
} }
case BackendFrameworkNoop: case BackendFrameworkNoop:
return &NoopBackend{} return &NoopBackend{}
...@@ -925,32 +926,32 @@ func IsWorkerComponent(componentType string) bool { ...@@ -925,32 +926,32 @@ func IsWorkerComponent(componentType string) bool {
} }
// addStandardEnvVars adds the standard environment variables that are common to both Grove and Controller // addStandardEnvVars adds the standard environment variables that are common to both Grove and Controller
func addStandardEnvVars(container *corev1.Container, controllerConfig controller_common.Config) { func addStandardEnvVars(container *corev1.Container, operatorConfig *configv1alpha1.OperatorConfiguration) {
standardEnvVars := []corev1.EnvVar{} standardEnvVars := []corev1.EnvVar{}
if controllerConfig.NatsAddress != "" { if operatorConfig.Infrastructure.NATSAddress != "" {
standardEnvVars = append(standardEnvVars, corev1.EnvVar{ standardEnvVars = append(standardEnvVars, corev1.EnvVar{
Name: "NATS_SERVER", Name: "NATS_SERVER",
Value: controllerConfig.NatsAddress, Value: operatorConfig.Infrastructure.NATSAddress,
}) })
} }
if controllerConfig.EtcdAddress != "" { if operatorConfig.Infrastructure.ETCDAddress != "" {
standardEnvVars = append(standardEnvVars, corev1.EnvVar{ standardEnvVars = append(standardEnvVars, corev1.EnvVar{
Name: "ETCD_ENDPOINTS", Name: "ETCD_ENDPOINTS",
Value: controllerConfig.EtcdAddress, Value: operatorConfig.Infrastructure.ETCDAddress,
}) })
} }
if controllerConfig.ModelExpressURL != "" { if operatorConfig.Infrastructure.ModelExpressURL != "" {
standardEnvVars = append(standardEnvVars, corev1.EnvVar{ standardEnvVars = append(standardEnvVars, corev1.EnvVar{
Name: "MODEL_EXPRESS_URL", Name: "MODEL_EXPRESS_URL",
Value: controllerConfig.ModelExpressURL, Value: operatorConfig.Infrastructure.ModelExpressURL,
}) })
} }
if controllerConfig.PrometheusEndpoint != "" { if operatorConfig.Infrastructure.PrometheusEndpoint != "" {
standardEnvVars = append(standardEnvVars, corev1.EnvVar{ standardEnvVars = append(standardEnvVars, corev1.EnvVar{
Name: "PROMETHEUS_ENDPOINT", Name: "PROMETHEUS_ENDPOINT",
Value: controllerConfig.PrometheusEndpoint, Value: operatorConfig.Infrastructure.PrometheusEndpoint,
}) })
} }
// merge the env vars to allow users to override the standard env vars // merge the env vars to allow users to override the standard env vars
...@@ -989,13 +990,13 @@ func GenerateBasePodSpec( ...@@ -989,13 +990,13 @@ func GenerateBasePodSpec(
namespace string, namespace string,
role Role, role Role,
numberOfNodes int32, numberOfNodes int32,
controllerConfig controller_common.Config, operatorConfig *configv1alpha1.OperatorConfiguration,
multinodeDeploymentType commonconsts.MultinodeDeploymentType, multinodeDeploymentType commonconsts.MultinodeDeploymentType,
serviceName string, serviceName string,
checkpointInfo *checkpoint.CheckpointInfo, // Optional checkpoint info (resolved by ResolveCheckpointForService) checkpointInfo *checkpoint.CheckpointInfo, // Optional checkpoint info (resolved by ResolveCheckpointForService)
) (*corev1.PodSpec, error) { ) (*corev1.PodSpec, error) {
// Start with base container generated per component type // Start with base container generated per component type
componentContext := generateComponentContext(component, parentGraphDeploymentName, namespace, numberOfNodes, controllerConfig.GetDiscoveryBackend(component.Annotations)) componentContext := generateComponentContext(component, parentGraphDeploymentName, namespace, numberOfNodes, controller_common.GetDiscoveryBackend(operatorConfig.Discovery.Backend, component.Annotations))
componentDefaults := ComponentDefaultsFactory(component.ComponentType) componentDefaults := ComponentDefaultsFactory(component.ComponentType)
container, err := componentDefaults.GetBaseContainer(componentContext) container, err := componentDefaults.GetBaseContainer(componentContext)
if err != nil { if err != nil {
...@@ -1078,7 +1079,7 @@ func GenerateBasePodSpec( ...@@ -1078,7 +1079,7 @@ func GenerateBasePodSpec(
}) })
} }
addStandardEnvVars(&container, controllerConfig) addStandardEnvVars(&container, operatorConfig)
volumes := make([]corev1.Volume, 0, len(component.VolumeMounts)+1) // +1 for shared memory volume volumes := make([]corev1.Volume, 0, len(component.VolumeMounts)+1) // +1 for shared memory volume
...@@ -1124,7 +1125,7 @@ func GenerateBasePodSpec( ...@@ -1124,7 +1125,7 @@ func GenerateBasePodSpec(
if multinodeDeployer == nil { if multinodeDeployer == nil {
return nil, fmt.Errorf("unsupported multinode deployment type: %s", multinodeDeploymentType) return nil, fmt.Errorf("unsupported multinode deployment type: %s", multinodeDeploymentType)
} }
backend := BackendFactory(backendFramework, controllerConfig) backend := BackendFactory(backendFramework, operatorConfig)
if backend == nil { if backend == nil {
return nil, fmt.Errorf("unsupported backend framework: %s", backendFramework) return nil, fmt.Errorf("unsupported backend framework: %s", backendFramework)
} }
...@@ -1156,7 +1157,7 @@ func GenerateBasePodSpec( ...@@ -1156,7 +1157,7 @@ func GenerateBasePodSpec(
applyDefaultSecurityContext(&podSpec) applyDefaultSecurityContext(&podSpec)
} }
if controllerConfig.IsK8sDiscoveryEnabled(component.Annotations) { if controller_common.IsK8sDiscoveryEnabled(operatorConfig.Discovery.Backend, component.Annotations) {
if podSpec.ServiceAccountName == "" { if podSpec.ServiceAccountName == "" {
podSpec.ServiceAccountName = discovery.GetK8sDiscoveryServiceAccountName(parentGraphDeploymentName) podSpec.ServiceAccountName = discovery.GetK8sDiscoveryServiceAccountName(parentGraphDeploymentName)
} }
...@@ -1176,9 +1177,9 @@ func GenerateBasePodSpec( ...@@ -1176,9 +1177,9 @@ func GenerateBasePodSpec(
// - Storage configuration (volumes, mounts) // - Storage configuration (volumes, mounts)
// CheckpointInfo should have been resolved by ResolveCheckpointForService before calling this function // CheckpointInfo should have been resolved by ResolveCheckpointForService before calling this function
// Checkpoint config comes from the operator's controller config (Helm values) // Checkpoint config comes from the operator's controller config (Helm values)
var checkpointConfig *controller_common.CheckpointConfig var checkpointConfig *configv1alpha1.CheckpointConfiguration
if controllerConfig.Checkpoint.Enabled { if operatorConfig.Checkpoint.Enabled {
checkpointConfig = &controllerConfig.Checkpoint checkpointConfig = &operatorConfig.Checkpoint
} }
if err := checkpoint.InjectCheckpointIntoPodSpec(&podSpec, checkpointInfo, checkpointConfig); err != nil { if err := checkpoint.InjectCheckpointIntoPodSpec(&podSpec, checkpointInfo, checkpointConfig); err != nil {
return nil, fmt.Errorf("failed to inject checkpoint config: %w", err) return nil, fmt.Errorf("failed to inject checkpoint config: %w", err)
...@@ -1198,7 +1199,7 @@ func setMetricsLabels(labels map[string]string, dynamoGraphDeployment *v1alpha1. ...@@ -1198,7 +1199,7 @@ func setMetricsLabels(labels map[string]string, dynamoGraphDeployment *v1alpha1.
labels[commonconsts.KubeLabelMetricsEnabled] = commonconsts.KubeLabelValueTrue labels[commonconsts.KubeLabelMetricsEnabled] = commonconsts.KubeLabelValueTrue
} }
func generateComponentContext(component *v1alpha1.DynamoComponentDeploymentSharedSpec, parentGraphDeploymentName string, namespace string, numberOfNodes int32, discoveryBackend string) ComponentContext { func generateComponentContext(component *v1alpha1.DynamoComponentDeploymentSharedSpec, parentGraphDeploymentName string, namespace string, numberOfNodes int32, discoveryBackend configv1alpha1.DiscoveryBackend) ComponentContext {
dynamoNamespace := v1alpha1.ComputeDynamoNamespace(component.GlobalDynamoNamespace, namespace, parentGraphDeploymentName) dynamoNamespace := v1alpha1.ComputeDynamoNamespace(component.GlobalDynamoNamespace, namespace, parentGraphDeploymentName)
var workerHashSuffix string var workerHashSuffix string
...@@ -1227,7 +1228,7 @@ func GeneratePodSpecForComponent( ...@@ -1227,7 +1228,7 @@ func GeneratePodSpecForComponent(
dynamoDeployment *v1alpha1.DynamoGraphDeployment, dynamoDeployment *v1alpha1.DynamoGraphDeployment,
role Role, role Role,
numberOfNodes int32, numberOfNodes int32,
controllerConfig controller_common.Config, operatorConfig *configv1alpha1.OperatorConfiguration,
multinodeDeploymentType commonconsts.MultinodeDeploymentType, multinodeDeploymentType commonconsts.MultinodeDeploymentType,
serviceName string, serviceName string,
checkpointInfo *checkpoint.CheckpointInfo, // Optional checkpoint info checkpointInfo *checkpoint.CheckpointInfo, // Optional checkpoint info
...@@ -1235,7 +1236,7 @@ func GeneratePodSpecForComponent( ...@@ -1235,7 +1236,7 @@ func GeneratePodSpecForComponent(
if len(dynamoDeployment.Spec.Envs) > 0 { if len(dynamoDeployment.Spec.Envs) > 0 {
component.Envs = MergeEnvs(dynamoDeployment.Spec.Envs, component.Envs) component.Envs = MergeEnvs(dynamoDeployment.Spec.Envs, component.Envs)
} }
podSpec, err := GenerateBasePodSpec(component, backendFramework, secretsRetriever, dynamoDeployment.Name, dynamoDeployment.Namespace, role, numberOfNodes, controllerConfig, multinodeDeploymentType, serviceName, checkpointInfo) podSpec, err := GenerateBasePodSpec(component, backendFramework, secretsRetriever, dynamoDeployment.Name, dynamoDeployment.Namespace, role, numberOfNodes, operatorConfig, multinodeDeploymentType, serviceName, checkpointInfo)
if err != nil { if err != nil {
return nil, err return nil, err
} }
...@@ -1246,7 +1247,8 @@ func GeneratePodSpecForComponent( ...@@ -1246,7 +1247,8 @@ func GeneratePodSpecForComponent(
func GenerateGrovePodCliqueSet( func GenerateGrovePodCliqueSet(
ctx context.Context, ctx context.Context,
dynamoDeployment *v1alpha1.DynamoGraphDeployment, dynamoDeployment *v1alpha1.DynamoGraphDeployment,
controllerConfig controller_common.Config, operatorConfig *configv1alpha1.OperatorConfiguration,
runtimeConfig *controller_common.RuntimeConfig,
secretsRetriever SecretsRetriever, secretsRetriever SecretsRetriever,
restartState *RestartState, restartState *RestartState,
existingRestartAnnotations map[string]string, existingRestartAnnotations map[string]string,
...@@ -1260,13 +1262,13 @@ func GenerateGrovePodCliqueSet( ...@@ -1260,13 +1262,13 @@ func GenerateGrovePodCliqueSet(
PublishNotReadyAddresses: true, PublishNotReadyAddresses: true,
} }
gangSet.Spec.Template.StartupType = ptr.To(grovev1alpha1.CliqueStartupTypeAnyOrder) gangSet.Spec.Template.StartupType = ptr.To(grovev1alpha1.CliqueStartupTypeAnyOrder)
if controllerConfig.Grove.TerminationDelay > 0 { if operatorConfig.Orchestrators.Grove.TerminationDelay.Duration > 0 {
gangSet.Spec.Template.TerminationDelay = &metav1.Duration{Duration: controllerConfig.Grove.TerminationDelay} gangSet.Spec.Template.TerminationDelay = &operatorConfig.Orchestrators.Grove.TerminationDelay
} }
// Validate kai-scheduler queue once if kai-scheduler is enabled // Validate kai-scheduler queue once if kai-scheduler is enabled
var validatedQueueName string var validatedQueueName string
if controllerConfig.Grove.Enabled && controllerConfig.KaiScheduler.Enabled { if runtimeConfig.GroveEnabled && runtimeConfig.KaiSchedulerEnabled {
var err error var err error
validatedQueueName, err = DetermineKaiSchedulerQueue(ctx, dynamoDeployment.Annotations) validatedQueueName, err = DetermineKaiSchedulerQueue(ctx, dynamoDeployment.Annotations)
if err != nil { if err != nil {
...@@ -1274,7 +1276,7 @@ func GenerateGrovePodCliqueSet( ...@@ -1274,7 +1276,7 @@ func GenerateGrovePodCliqueSet(
} }
} }
discoveryBackend := controllerConfig.GetDiscoveryBackend(dynamoDeployment.Annotations) discoveryBackend := controller_common.GetDiscoveryBackend(operatorConfig.Discovery.Backend, dynamoDeployment.Annotations)
var scalingGroups []grovev1alpha1.PodCliqueScalingGroupConfig var scalingGroups []grovev1alpha1.PodCliqueScalingGroupConfig
for serviceName, component := range dynamoDeployment.Spec.Services { for serviceName, component := range dynamoDeployment.Spec.Services {
...@@ -1290,7 +1292,7 @@ func GenerateGrovePodCliqueSet( ...@@ -1290,7 +1292,7 @@ func GenerateGrovePodCliqueSet(
if component.Annotations == nil { if component.Annotations == nil {
component.Annotations = make(map[string]string) component.Annotations = make(map[string]string)
} }
component.Annotations[commonconsts.KubeAnnotationDynamoDiscoveryBackend] = discoveryBackend component.Annotations[commonconsts.KubeAnnotationDynamoDiscoveryBackend] = string(discoveryBackend)
} }
// Propagate operator origin version for version-gated behavior in backends // Propagate operator origin version for version-gated behavior in backends
...@@ -1320,7 +1322,7 @@ func GenerateGrovePodCliqueSet( ...@@ -1320,7 +1322,7 @@ func GenerateGrovePodCliqueSet(
dynamoDeployment, dynamoDeployment,
r.Role, r.Role,
numberOfNodes, numberOfNodes,
controllerConfig, operatorConfig,
commonconsts.MultinodeDeploymentTypeGrove, commonconsts.MultinodeDeploymentTypeGrove,
serviceName, serviceName,
checkpointInfo, checkpointInfo,
...@@ -1367,7 +1369,7 @@ func GenerateGrovePodCliqueSet( ...@@ -1367,7 +1369,7 @@ func GenerateGrovePodCliqueSet(
clique.Annotations = annotations clique.Annotations = annotations
// Inject kai-scheduler settings if enabled // Inject kai-scheduler settings if enabled
injectKaiSchedulerIfEnabled(clique, controllerConfig, validatedQueueName) injectKaiSchedulerIfEnabled(clique, runtimeConfig, validatedQueueName)
gangSet.Spec.Template.Cliques = append(gangSet.Spec.Template.Cliques, clique) gangSet.Spec.Template.Cliques = append(gangSet.Spec.Template.Cliques, clique)
cliqueNames = append(cliqueNames, strings.ToLower(r.Name)) cliqueNames = append(cliqueNames, strings.ToLower(r.Name))
...@@ -1604,7 +1606,7 @@ func GetBackendFrameworkFromDynamoComponent(dynComponent *v1alpha1.DynamoCompone ...@@ -1604,7 +1606,7 @@ func GetBackendFrameworkFromDynamoComponent(dynComponent *v1alpha1.DynamoCompone
func GenerateBasePodSpecForController( func GenerateBasePodSpecForController(
dynComponent *v1alpha1.DynamoComponentDeployment, dynComponent *v1alpha1.DynamoComponentDeployment,
secretsRetriever SecretsRetriever, secretsRetriever SecretsRetriever,
controllerConfig controller_common.Config, operatorConfig *configv1alpha1.OperatorConfiguration,
role Role, role Role,
multinodeDeploymentType commonconsts.MultinodeDeploymentType, multinodeDeploymentType commonconsts.MultinodeDeploymentType,
checkpointInfo *checkpoint.CheckpointInfo, // Optional checkpoint info (resolved by caller) checkpointInfo *checkpoint.CheckpointInfo, // Optional checkpoint info (resolved by caller)
...@@ -1631,7 +1633,7 @@ func GenerateBasePodSpecForController( ...@@ -1631,7 +1633,7 @@ func GenerateBasePodSpecForController(
dynComponent.Namespace, dynComponent.Namespace,
role, role,
numberOfNodes, numberOfNodes,
controllerConfig, operatorConfig,
multinodeDeploymentType, multinodeDeploymentType,
serviceName, serviceName,
checkpointInfo, checkpointInfo,
......
...@@ -26,6 +26,7 @@ import ( ...@@ -26,6 +26,7 @@ import (
"testing" "testing"
"time" "time"
configv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/config/v1alpha1"
"github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1" "github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1"
commonconsts "github.com/ai-dynamo/dynamo/deploy/operator/internal/consts" commonconsts "github.com/ai-dynamo/dynamo/deploy/operator/internal/consts"
"github.com/ai-dynamo/dynamo/deploy/operator/internal/controller_common" "github.com/ai-dynamo/dynamo/deploy/operator/internal/controller_common"
...@@ -790,7 +791,7 @@ func TestGenerateComponentContext(t *testing.T) { ...@@ -790,7 +791,7 @@ func TestGenerateComponentContext(t *testing.T) {
parentGraphDeploymentName string parentGraphDeploymentName string
namespace string namespace string
numberOfNodes int32 numberOfNodes int32
discoveryBackend string discoveryBackend configv1alpha1.DiscoveryBackend
expectedDynamoNamespace string expectedDynamoNamespace string
expectedComponentType string expectedComponentType string
expectedParentDGDName string expectedParentDGDName string
...@@ -806,7 +807,7 @@ func TestGenerateComponentContext(t *testing.T) { ...@@ -806,7 +807,7 @@ func TestGenerateComponentContext(t *testing.T) {
parentGraphDeploymentName: "my-deployment", parentGraphDeploymentName: "my-deployment",
namespace: "my-namespace", namespace: "my-namespace",
numberOfNodes: 1, numberOfNodes: 1,
discoveryBackend: "kubernetes", discoveryBackend: configv1alpha1.DiscoveryBackendKubernetes,
expectedDynamoNamespace: "my-namespace-my-deployment", expectedDynamoNamespace: "my-namespace-my-deployment",
expectedComponentType: commonconsts.ComponentTypePlanner, expectedComponentType: commonconsts.ComponentTypePlanner,
expectedParentDGDName: "my-deployment", expectedParentDGDName: "my-deployment",
...@@ -822,7 +823,7 @@ func TestGenerateComponentContext(t *testing.T) { ...@@ -822,7 +823,7 @@ func TestGenerateComponentContext(t *testing.T) {
parentGraphDeploymentName: "vllm-disagg", parentGraphDeploymentName: "vllm-disagg",
namespace: "djangoz", namespace: "djangoz",
numberOfNodes: 1, numberOfNodes: 1,
discoveryBackend: "kubernetes", discoveryBackend: configv1alpha1.DiscoveryBackendKubernetes,
expectedDynamoNamespace: "djangoz-vllm-disagg", expectedDynamoNamespace: "djangoz-vllm-disagg",
expectedComponentType: commonconsts.ComponentTypeFrontend, expectedComponentType: commonconsts.ComponentTypeFrontend,
expectedParentDGDName: "vllm-disagg", expectedParentDGDName: "vllm-disagg",
...@@ -839,7 +840,7 @@ func TestGenerateComponentContext(t *testing.T) { ...@@ -839,7 +840,7 @@ func TestGenerateComponentContext(t *testing.T) {
parentGraphDeploymentName: "shared-frontend", parentGraphDeploymentName: "shared-frontend",
namespace: "production", namespace: "production",
numberOfNodes: 2, numberOfNodes: 2,
discoveryBackend: "etcd", discoveryBackend: configv1alpha1.DiscoveryBackendEtcd,
expectedDynamoNamespace: commonconsts.GlobalDynamoNamespace, expectedDynamoNamespace: commonconsts.GlobalDynamoNamespace,
expectedComponentType: commonconsts.ComponentTypeWorker, expectedComponentType: commonconsts.ComponentTypeWorker,
expectedParentDGDName: "shared-frontend", expectedParentDGDName: "shared-frontend",
...@@ -854,7 +855,7 @@ func TestGenerateComponentContext(t *testing.T) { ...@@ -854,7 +855,7 @@ func TestGenerateComponentContext(t *testing.T) {
parentGraphDeploymentName: "test-dgd", parentGraphDeploymentName: "test-dgd",
namespace: "default", namespace: "default",
numberOfNodes: 1, numberOfNodes: 1,
discoveryBackend: "kubernetes", discoveryBackend: configv1alpha1.DiscoveryBackendKubernetes,
expectedDynamoNamespace: "default-test-dgd", expectedDynamoNamespace: "default-test-dgd",
expectedComponentType: commonconsts.ComponentTypePlanner, expectedComponentType: commonconsts.ComponentTypePlanner,
expectedParentDGDName: "test-dgd", expectedParentDGDName: "test-dgd",
...@@ -868,7 +869,7 @@ func TestGenerateComponentContext(t *testing.T) { ...@@ -868,7 +869,7 @@ func TestGenerateComponentContext(t *testing.T) {
parentGraphDeploymentName: "llama-70b-prod", parentGraphDeploymentName: "llama-70b-prod",
namespace: "ml-inference", namespace: "ml-inference",
numberOfNodes: 4, numberOfNodes: 4,
discoveryBackend: "nats", discoveryBackend: configv1alpha1.DiscoveryBackendEtcd,
expectedDynamoNamespace: "ml-inference-llama-70b-prod", expectedDynamoNamespace: "ml-inference-llama-70b-prod",
expectedComponentType: commonconsts.ComponentTypeFrontend, expectedComponentType: commonconsts.ComponentTypeFrontend,
expectedParentDGDName: "llama-70b-prod", expectedParentDGDName: "llama-70b-prod",
...@@ -1220,7 +1221,7 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) { ...@@ -1220,7 +1221,7 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
type args struct { type args struct {
ctx context.Context ctx context.Context
dynamoDeployment *v1alpha1.DynamoGraphDeployment dynamoDeployment *v1alpha1.DynamoGraphDeployment
controllerConfig controller_common.Config controllerConfig *configv1alpha1.OperatorConfiguration
} }
tests := []struct { tests := []struct {
name string name string
...@@ -1232,14 +1233,18 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) { ...@@ -1232,14 +1233,18 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
name: "test_generate_grove_pod_clique_set_single_node", name: "test_generate_grove_pod_clique_set_single_node",
args: args{ args: args{
ctx: context.Background(), ctx: context.Background(),
controllerConfig: controller_common.Config{ controllerConfig: &configv1alpha1.OperatorConfiguration{
EtcdAddress: "etcd-address", Infrastructure: configv1alpha1.InfrastructureConfiguration{
NatsAddress: "nats-address", ETCDAddress: "etcd-address",
ModelExpressURL: "model-express-url", NATSAddress: "nats-address",
Grove: controller_common.GroveConfig{ ModelExpressURL: "model-express-url",
TerminationDelay: 15 * time.Minute, PrometheusEndpoint: "http://localhost:9090",
},
Orchestrators: configv1alpha1.OrchestratorConfiguration{
Grove: configv1alpha1.GroveConfiguration{
TerminationDelay: metav1.Duration{Duration: 15 * time.Minute},
},
}, },
PrometheusEndpoint: "http://localhost:9090",
}, },
dynamoDeployment: &v1alpha1.DynamoGraphDeployment{ dynamoDeployment: &v1alpha1.DynamoGraphDeployment{
ObjectMeta: metav1.ObjectMeta{ ObjectMeta: metav1.ObjectMeta{
...@@ -1790,11 +1795,15 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) { ...@@ -1790,11 +1795,15 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
name: "test_generate_grove_pod_gang_set_multinode sglang", name: "test_generate_grove_pod_gang_set_multinode sglang",
args: args{ args: args{
ctx: context.Background(), ctx: context.Background(),
controllerConfig: controller_common.Config{ controllerConfig: &configv1alpha1.OperatorConfiguration{
EtcdAddress: "etcd-address", Infrastructure: configv1alpha1.InfrastructureConfiguration{
NatsAddress: "nats-address", ETCDAddress: "etcd-address",
Grove: controller_common.GroveConfig{ NATSAddress: "nats-address",
TerminationDelay: 15 * time.Minute, },
Orchestrators: configv1alpha1.OrchestratorConfiguration{
Grove: configv1alpha1.GroveConfiguration{
TerminationDelay: metav1.Duration{Duration: 15 * time.Minute},
},
}, },
}, },
dynamoDeployment: &v1alpha1.DynamoGraphDeployment{ dynamoDeployment: &v1alpha1.DynamoGraphDeployment{
...@@ -2755,11 +2764,15 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) { ...@@ -2755,11 +2764,15 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
name: "test_generate_grove_pod_gang_set_multinode vllm", name: "test_generate_grove_pod_gang_set_multinode vllm",
args: args{ args: args{
ctx: context.Background(), ctx: context.Background(),
controllerConfig: controller_common.Config{ controllerConfig: &configv1alpha1.OperatorConfiguration{
EtcdAddress: "etcd-address", Infrastructure: configv1alpha1.InfrastructureConfiguration{
NatsAddress: "nats-address", ETCDAddress: "etcd-address",
Grove: controller_common.GroveConfig{ NATSAddress: "nats-address",
TerminationDelay: 15 * time.Minute, },
Orchestrators: configv1alpha1.OrchestratorConfiguration{
Grove: configv1alpha1.GroveConfiguration{
TerminationDelay: metav1.Duration{Duration: 15 * time.Minute},
},
}, },
}, },
dynamoDeployment: &v1alpha1.DynamoGraphDeployment{ dynamoDeployment: &v1alpha1.DynamoGraphDeployment{
...@@ -3736,7 +3749,7 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) { ...@@ -3736,7 +3749,7 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
} }
for _, tt := range tests { for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) { t.Run(tt.name, func(t *testing.T) {
got, err := GenerateGrovePodCliqueSet(tt.args.ctx, tt.args.dynamoDeployment, tt.args.controllerConfig, nil, nil, nil, nil) got, err := GenerateGrovePodCliqueSet(tt.args.ctx, tt.args.dynamoDeployment, tt.args.controllerConfig, &controller_common.RuntimeConfig{}, nil, nil, nil, nil)
if (err != nil) != tt.wantErr { if (err != nil) != tt.wantErr {
t.Errorf("GenerateGrovePodCliqueSet() error = %v, wantErr %v", err, tt.wantErr) t.Errorf("GenerateGrovePodCliqueSet() error = %v, wantErr %v", err, tt.wantErr)
return return
...@@ -3797,7 +3810,7 @@ func Test_GeneratePodCliqueSetGlobalDynamoNamespace(t *testing.T) { ...@@ -3797,7 +3810,7 @@ func Test_GeneratePodCliqueSetGlobalDynamoNamespace(t *testing.T) {
}, },
} }
got, err := GenerateGrovePodCliqueSet(context.Background(), dynamoDeployment, controller_common.Config{}, nil, nil, nil, nil) got, err := GenerateGrovePodCliqueSet(context.Background(), dynamoDeployment, &configv1alpha1.OperatorConfiguration{}, &controller_common.RuntimeConfig{}, nil, nil, nil, nil)
if !assert.NoError(t, err) { if !assert.NoError(t, err) {
return return
} }
...@@ -3868,7 +3881,7 @@ func TestGeneratePodSpecForComponent_SGLang(t *testing.T) { ...@@ -3868,7 +3881,7 @@ func TestGeneratePodSpecForComponent_SGLang(t *testing.T) {
Namespace: "default", Namespace: "default",
}, },
} }
controllerConfig := controller_common.Config{} controllerConfig := &configv1alpha1.OperatorConfiguration{}
tests := []struct { tests := []struct {
name string name string
...@@ -4015,7 +4028,7 @@ func TestGeneratePodSpecForComponent_VLLM(t *testing.T) { ...@@ -4015,7 +4028,7 @@ func TestGeneratePodSpecForComponent_VLLM(t *testing.T) {
Namespace: "default", Namespace: "default",
}, },
} }
controllerConfig := controller_common.Config{} controllerConfig := &configv1alpha1.OperatorConfiguration{}
tests := []struct { tests := []struct {
name string name string
...@@ -4168,7 +4181,7 @@ func TestGeneratePodSpecForComponent_UnsupportedBackend(t *testing.T) { ...@@ -4168,7 +4181,7 @@ func TestGeneratePodSpecForComponent_UnsupportedBackend(t *testing.T) {
Namespace: "default", Namespace: "default",
}, },
} }
controllerConfig := controller_common.Config{} controllerConfig := &configv1alpha1.OperatorConfiguration{}
component := &v1alpha1.DynamoComponentDeploymentSharedSpec{ component := &v1alpha1.DynamoComponentDeploymentSharedSpec{
ComponentType: commonconsts.ComponentTypeWorker, ComponentType: commonconsts.ComponentTypeWorker,
...@@ -4878,12 +4891,14 @@ func TestGenerateGrovePodCliqueSet_StartsAfterDependencies(t *testing.T) { ...@@ -4878,12 +4891,14 @@ func TestGenerateGrovePodCliqueSet_StartsAfterDependencies(t *testing.T) {
}, },
} }
controllerConfig := controller_common.Config{ controllerConfig := &configv1alpha1.OperatorConfiguration{
EtcdAddress: "etcd-av1alpha1", Infrastructure: configv1alpha1.InfrastructureConfiguration{
NatsAddress: "nats-address", ETCDAddress: "etcd-av1alpha1",
NATSAddress: "nats-address",
},
} }
got, err := GenerateGrovePodCliqueSet(context.Background(), dynamoDeployment, controllerConfig, secretsRetriever, nil, nil, nil) got, err := GenerateGrovePodCliqueSet(context.Background(), dynamoDeployment, controllerConfig, &controller_common.RuntimeConfig{}, secretsRetriever, nil, nil, nil)
if err != nil { if err != nil {
t.Errorf("GenerateGrovePodCliqueSet() error = %v", err) t.Errorf("GenerateGrovePodCliqueSet() error = %v", err)
return return
...@@ -4936,7 +4951,7 @@ func TestGenerateGrovePodCliqueSet_StartsAfterDependencies(t *testing.T) { ...@@ -4936,7 +4951,7 @@ func TestGenerateGrovePodCliqueSet_StartsAfterDependencies(t *testing.T) {
func TestGenerateBasePodSpec_Frontend(t *testing.T) { func TestGenerateBasePodSpec_Frontend(t *testing.T) {
secretsRetriever := &mockSecretsRetriever{} secretsRetriever := &mockSecretsRetriever{}
controllerConfig := controller_common.Config{} controllerConfig := &configv1alpha1.OperatorConfiguration{}
dynamoDeployment := &v1alpha1.DynamoGraphDeployment{ dynamoDeployment := &v1alpha1.DynamoGraphDeployment{
ObjectMeta: metav1.ObjectMeta{ ObjectMeta: metav1.ObjectMeta{
Name: "test-deployment", Name: "test-deployment",
...@@ -5032,7 +5047,7 @@ func TestGenerateBasePodSpec_Frontend(t *testing.T) { ...@@ -5032,7 +5047,7 @@ func TestGenerateBasePodSpec_Frontend(t *testing.T) {
func TestGenerateBasePodSpec_PlannerServiceAccount(t *testing.T) { func TestGenerateBasePodSpec_PlannerServiceAccount(t *testing.T) {
secretsRetriever := &mockSecretsRetriever{} secretsRetriever := &mockSecretsRetriever{}
controllerConfig := controller_common.Config{} controllerConfig := &configv1alpha1.OperatorConfiguration{}
tests := []struct { tests := []struct {
name string name string
...@@ -5178,7 +5193,7 @@ func TestGenerateBasePodSpec_DisableImagePullSecretDiscovery(t *testing.T) { ...@@ -5178,7 +5193,7 @@ func TestGenerateBasePodSpec_DisableImagePullSecretDiscovery(t *testing.T) {
for _, tt := range tests { for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) { t.Run(tt.name, func(t *testing.T) {
controllerConfig := controller_common.Config{} controllerConfig := &configv1alpha1.OperatorConfiguration{}
podSpec, err := GenerateBasePodSpec( podSpec, err := GenerateBasePodSpec(
tt.component, tt.component,
...@@ -5211,7 +5226,7 @@ func TestGenerateBasePodSpec_DiscoverBackend(t *testing.T) { ...@@ -5211,7 +5226,7 @@ func TestGenerateBasePodSpec_DiscoverBackend(t *testing.T) {
tests := []struct { tests := []struct {
name string name string
component *v1alpha1.DynamoComponentDeploymentSharedSpec component *v1alpha1.DynamoComponentDeploymentSharedSpec
controllerConfig controller_common.Config controllerConfig *configv1alpha1.OperatorConfiguration
wantEnvVar string wantEnvVar string
}{ }{
{ {
...@@ -5221,15 +5236,18 @@ func TestGenerateBasePodSpec_DiscoverBackend(t *testing.T) { ...@@ -5221,15 +5236,18 @@ func TestGenerateBasePodSpec_DiscoverBackend(t *testing.T) {
commonconsts.KubeAnnotationDynamoDiscoveryBackend: "kubernetes", commonconsts.KubeAnnotationDynamoDiscoveryBackend: "kubernetes",
}, },
}, },
wantEnvVar: "kubernetes", controllerConfig: &configv1alpha1.OperatorConfiguration{},
wantEnvVar: "kubernetes",
}, },
{ {
name: "Kubernetes discovery from controller config should set env var to kubernetes", name: "Kubernetes discovery from controller config should set env var to kubernetes",
component: &v1alpha1.DynamoComponentDeploymentSharedSpec{ component: &v1alpha1.DynamoComponentDeploymentSharedSpec{
Annotations: map[string]string{}, Annotations: map[string]string{},
}, },
controllerConfig: controller_common.Config{ controllerConfig: &configv1alpha1.OperatorConfiguration{
DiscoveryBackend: "kubernetes", Discovery: configv1alpha1.DiscoveryConfiguration{
Backend: "kubernetes",
},
}, },
wantEnvVar: "kubernetes", wantEnvVar: "kubernetes",
}, },
...@@ -5240,8 +5258,10 @@ func TestGenerateBasePodSpec_DiscoverBackend(t *testing.T) { ...@@ -5240,8 +5258,10 @@ func TestGenerateBasePodSpec_DiscoverBackend(t *testing.T) {
commonconsts.KubeAnnotationDynamoDiscoveryBackend: "etcd", commonconsts.KubeAnnotationDynamoDiscoveryBackend: "etcd",
}, },
}, },
controllerConfig: controller_common.Config{ controllerConfig: &configv1alpha1.OperatorConfiguration{
DiscoveryBackend: "kubernetes", Discovery: configv1alpha1.DiscoveryConfiguration{
Backend: "kubernetes",
},
}, },
wantEnvVar: "", // etcd is the runtime default, no env var needed wantEnvVar: "", // etcd is the runtime default, no env var needed
}, },
...@@ -5250,8 +5270,10 @@ func TestGenerateBasePodSpec_DiscoverBackend(t *testing.T) { ...@@ -5250,8 +5270,10 @@ func TestGenerateBasePodSpec_DiscoverBackend(t *testing.T) {
component: &v1alpha1.DynamoComponentDeploymentSharedSpec{ component: &v1alpha1.DynamoComponentDeploymentSharedSpec{
Annotations: map[string]string{}, Annotations: map[string]string{},
}, },
controllerConfig: controller_common.Config{ controllerConfig: &configv1alpha1.OperatorConfiguration{
DiscoveryBackend: "etcd", Discovery: configv1alpha1.DiscoveryConfiguration{
Backend: "etcd",
},
}, },
wantEnvVar: "", // etcd is the runtime default, no env var needed wantEnvVar: "", // etcd is the runtime default, no env var needed
}, },
...@@ -5262,15 +5284,18 @@ func TestGenerateBasePodSpec_DiscoverBackend(t *testing.T) { ...@@ -5262,15 +5284,18 @@ func TestGenerateBasePodSpec_DiscoverBackend(t *testing.T) {
commonconsts.KubeAnnotationDynamoDiscoveryBackend: "", commonconsts.KubeAnnotationDynamoDiscoveryBackend: "",
}, },
}, },
controllerConfig: controller_common.Config{ controllerConfig: &configv1alpha1.OperatorConfiguration{
DiscoveryBackend: "", Discovery: configv1alpha1.DiscoveryConfiguration{
Backend: "",
},
}, },
wantEnvVar: "kubernetes", // empty defaults to kubernetes wantEnvVar: "kubernetes", // empty defaults to kubernetes
}, },
{ {
name: "Discovery backend not set defaults to kubernetes", name: "Discovery backend not set defaults to kubernetes",
component: &v1alpha1.DynamoComponentDeploymentSharedSpec{}, component: &v1alpha1.DynamoComponentDeploymentSharedSpec{},
wantEnvVar: "kubernetes", // not set defaults to kubernetes controllerConfig: &configv1alpha1.OperatorConfiguration{},
wantEnvVar: "kubernetes", // not set defaults to kubernetes
}, },
} }
secretsRetriever := &mockSecretsRetriever{} secretsRetriever := &mockSecretsRetriever{}
...@@ -5307,7 +5332,7 @@ func TestGenerateBasePodSpec_DiscoverBackend(t *testing.T) { ...@@ -5307,7 +5332,7 @@ func TestGenerateBasePodSpec_DiscoverBackend(t *testing.T) {
func TestGenerateBasePodSpec_Worker(t *testing.T) { func TestGenerateBasePodSpec_Worker(t *testing.T) {
secretsRetriever := &mockSecretsRetriever{} secretsRetriever := &mockSecretsRetriever{}
controllerConfig := controller_common.Config{} controllerConfig := &configv1alpha1.OperatorConfiguration{}
tests := []struct { tests := []struct {
name string name string
...@@ -5474,7 +5499,7 @@ func TestGenerateBasePodSpec_Worker(t *testing.T) { ...@@ -5474,7 +5499,7 @@ func TestGenerateBasePodSpec_Worker(t *testing.T) {
func TestGenerateBasePodSpec_VolumeMounts(t *testing.T) { func TestGenerateBasePodSpec_VolumeMounts(t *testing.T) {
secretsRetriever := &mockSecretsRetriever{} secretsRetriever := &mockSecretsRetriever{}
controllerConfig := controller_common.Config{} controllerConfig := &configv1alpha1.OperatorConfiguration{}
tests := []struct { tests := []struct {
name string name string
...@@ -5609,7 +5634,7 @@ func TestGenerateBasePodSpec_VolumeMounts(t *testing.T) { ...@@ -5609,7 +5634,7 @@ func TestGenerateBasePodSpec_VolumeMounts(t *testing.T) {
func TestGenerateBasePodSpec_ResourceClaims(t *testing.T) { func TestGenerateBasePodSpec_ResourceClaims(t *testing.T) {
secretsRetriever := &mockSecretsRetriever{} secretsRetriever := &mockSecretsRetriever{}
controllerConfig := controller_common.Config{} controllerConfig := &configv1alpha1.OperatorConfiguration{}
tests := []struct { tests := []struct {
name string name string
...@@ -5869,7 +5894,7 @@ func TestGenerateBasePodSpec_ResourceClaims(t *testing.T) { ...@@ -5869,7 +5894,7 @@ func TestGenerateBasePodSpec_ResourceClaims(t *testing.T) {
func TestGenerateBasePodSpec_UseAsCompilationCache_BackendSupport(t *testing.T) { func TestGenerateBasePodSpec_UseAsCompilationCache_BackendSupport(t *testing.T) {
secretsRetriever := &mockSecretsRetriever{} secretsRetriever := &mockSecretsRetriever{}
controllerConfig := controller_common.Config{} controllerConfig := &configv1alpha1.OperatorConfiguration{}
tests := []struct { tests := []struct {
name string name string
...@@ -6055,7 +6080,7 @@ func TestGenerateBasePodSpec_UseAsCompilationCache_BackendSupport(t *testing.T) ...@@ -6055,7 +6080,7 @@ func TestGenerateBasePodSpec_UseAsCompilationCache_BackendSupport(t *testing.T)
func TestGenerateBasePodSpec_SecurityContext(t *testing.T) { func TestGenerateBasePodSpec_SecurityContext(t *testing.T) {
secretsRetriever := &mockSecretsRetriever{} secretsRetriever := &mockSecretsRetriever{}
controllerConfig := controller_common.Config{} controllerConfig := &configv1alpha1.OperatorConfiguration{}
tests := []struct { tests := []struct {
name string name string
...@@ -6696,12 +6721,14 @@ func TestGenerateGrovePodCliqueSet_RestartAnnotations(t *testing.T) { ...@@ -6696,12 +6721,14 @@ func TestGenerateGrovePodCliqueSet_RestartAnnotations(t *testing.T) {
}, },
} }
controllerConfig := controller_common.Config{ controllerConfig := &configv1alpha1.OperatorConfiguration{
EtcdAddress: "etcd-address", Infrastructure: configv1alpha1.InfrastructureConfiguration{
NatsAddress: "nats-address", ETCDAddress: "etcd-address",
NATSAddress: "nats-address",
},
} }
got, err := GenerateGrovePodCliqueSet(context.Background(), dgd, controllerConfig, nil, tt.restartState, nil, nil) got, err := GenerateGrovePodCliqueSet(context.Background(), dgd, controllerConfig, &controller_common.RuntimeConfig{}, nil, tt.restartState, nil, nil)
if err != nil { if err != nil {
t.Fatalf("GenerateGrovePodCliqueSet() error = %v", err) t.Fatalf("GenerateGrovePodCliqueSet() error = %v", err)
} }
......
...@@ -305,11 +305,11 @@ func ResolveKaiSchedulerQueue(annotations map[string]string) string { ...@@ -305,11 +305,11 @@ func ResolveKaiSchedulerQueue(annotations map[string]string) string {
// injectKaiSchedulerIfEnabled injects kai-scheduler settings into a clique if kai-scheduler is enabled and grove is enabled // injectKaiSchedulerIfEnabled injects kai-scheduler settings into a clique if kai-scheduler is enabled and grove is enabled
func injectKaiSchedulerIfEnabled( func injectKaiSchedulerIfEnabled(
clique *grovev1alpha1.PodCliqueTemplateSpec, clique *grovev1alpha1.PodCliqueTemplateSpec,
controllerConfig controller_common.Config, runtimeConfig *controller_common.RuntimeConfig,
validatedQueueName string, validatedQueueName string,
) { ) {
// Only proceed if grove is enabled, kai-scheduler is enabled, and no manual schedulerName is set // Only proceed if grove is enabled, kai-scheduler is enabled, and no manual schedulerName is set
if !controllerConfig.Grove.Enabled || !controllerConfig.KaiScheduler.Enabled { if !runtimeConfig.GroveEnabled || !runtimeConfig.KaiSchedulerEnabled {
return return
} }
......
...@@ -112,7 +112,7 @@ func TestResolveKaiSchedulerQueue(t *testing.T) { ...@@ -112,7 +112,7 @@ func TestResolveKaiSchedulerQueue(t *testing.T) {
func TestInjectKaiSchedulerIfEnabled(t *testing.T) { func TestInjectKaiSchedulerIfEnabled(t *testing.T) {
tests := []struct { tests := []struct {
name string name string
controllerConfig controller_common.Config runtimeConfig *controller_common.RuntimeConfig
validatedQueueName string validatedQueueName string
initialClique *grovev1alpha1.PodCliqueTemplateSpec initialClique *grovev1alpha1.PodCliqueTemplateSpec
expectedScheduler string expectedScheduler string
...@@ -121,9 +121,9 @@ func TestInjectKaiSchedulerIfEnabled(t *testing.T) { ...@@ -121,9 +121,9 @@ func TestInjectKaiSchedulerIfEnabled(t *testing.T) {
}{ }{
{ {
name: "grove disabled - no injection", name: "grove disabled - no injection",
controllerConfig: controller_common.Config{ runtimeConfig: &controller_common.RuntimeConfig{
Grove: controller_common.GroveConfig{Enabled: false}, GroveEnabled: false,
KaiScheduler: controller_common.KaiSchedulerConfig{Enabled: true}, KaiSchedulerEnabled: true,
}, },
validatedQueueName: "test-queue", validatedQueueName: "test-queue",
initialClique: &grovev1alpha1.PodCliqueTemplateSpec{ initialClique: &grovev1alpha1.PodCliqueTemplateSpec{
...@@ -135,9 +135,9 @@ func TestInjectKaiSchedulerIfEnabled(t *testing.T) { ...@@ -135,9 +135,9 @@ func TestInjectKaiSchedulerIfEnabled(t *testing.T) {
}, },
{ {
name: "kai-scheduler disabled - no injection", name: "kai-scheduler disabled - no injection",
controllerConfig: controller_common.Config{ runtimeConfig: &controller_common.RuntimeConfig{
Grove: controller_common.GroveConfig{Enabled: true}, GroveEnabled: true,
KaiScheduler: controller_common.KaiSchedulerConfig{Enabled: false}, KaiSchedulerEnabled: false,
}, },
validatedQueueName: "test-queue", validatedQueueName: "test-queue",
initialClique: &grovev1alpha1.PodCliqueTemplateSpec{ initialClique: &grovev1alpha1.PodCliqueTemplateSpec{
...@@ -149,9 +149,9 @@ func TestInjectKaiSchedulerIfEnabled(t *testing.T) { ...@@ -149,9 +149,9 @@ func TestInjectKaiSchedulerIfEnabled(t *testing.T) {
}, },
{ {
name: "manual scheduler set - no injection", name: "manual scheduler set - no injection",
controllerConfig: controller_common.Config{ runtimeConfig: &controller_common.RuntimeConfig{
Grove: controller_common.GroveConfig{Enabled: true}, GroveEnabled: true,
KaiScheduler: controller_common.KaiSchedulerConfig{Enabled: true}, KaiSchedulerEnabled: true,
}, },
validatedQueueName: "test-queue", validatedQueueName: "test-queue",
initialClique: &grovev1alpha1.PodCliqueTemplateSpec{ initialClique: &grovev1alpha1.PodCliqueTemplateSpec{
...@@ -165,9 +165,9 @@ func TestInjectKaiSchedulerIfEnabled(t *testing.T) { ...@@ -165,9 +165,9 @@ func TestInjectKaiSchedulerIfEnabled(t *testing.T) {
}, },
{ {
name: "both enabled, no manual scheduler - inject", name: "both enabled, no manual scheduler - inject",
controllerConfig: controller_common.Config{ runtimeConfig: &controller_common.RuntimeConfig{
Grove: controller_common.GroveConfig{Enabled: true}, GroveEnabled: true,
KaiScheduler: controller_common.KaiSchedulerConfig{Enabled: true}, KaiSchedulerEnabled: true,
}, },
validatedQueueName: "test-queue", validatedQueueName: "test-queue",
initialClique: &grovev1alpha1.PodCliqueTemplateSpec{ initialClique: &grovev1alpha1.PodCliqueTemplateSpec{
...@@ -181,9 +181,9 @@ func TestInjectKaiSchedulerIfEnabled(t *testing.T) { ...@@ -181,9 +181,9 @@ func TestInjectKaiSchedulerIfEnabled(t *testing.T) {
}, },
{ {
name: "inject with existing labels", name: "inject with existing labels",
controllerConfig: controller_common.Config{ runtimeConfig: &controller_common.RuntimeConfig{
Grove: controller_common.GroveConfig{Enabled: true}, GroveEnabled: true,
KaiScheduler: controller_common.KaiSchedulerConfig{Enabled: true}, KaiSchedulerEnabled: true,
}, },
validatedQueueName: "custom-queue", validatedQueueName: "custom-queue",
initialClique: &grovev1alpha1.PodCliqueTemplateSpec{ initialClique: &grovev1alpha1.PodCliqueTemplateSpec{
...@@ -206,7 +206,7 @@ func TestInjectKaiSchedulerIfEnabled(t *testing.T) { ...@@ -206,7 +206,7 @@ func TestInjectKaiSchedulerIfEnabled(t *testing.T) {
clique := tt.initialClique.DeepCopy() clique := tt.initialClique.DeepCopy()
// Call the function // Call the function
injectKaiSchedulerIfEnabled(clique, tt.controllerConfig, tt.validatedQueueName) injectKaiSchedulerIfEnabled(clique, tt.runtimeConfig, tt.validatedQueueName)
if tt.shouldInject { if tt.shouldInject {
// Verify scheduler name is injected // Verify scheduler name is injected
......
...@@ -11,6 +11,7 @@ ...@@ -11,6 +11,7 @@
## Packages ## Packages
- [nvidia.com/v1alpha1](#nvidiacomv1alpha1) - [nvidia.com/v1alpha1](#nvidiacomv1alpha1)
- [nvidia.com/v1beta1](#nvidiacomv1beta1) - [nvidia.com/v1beta1](#nvidiacomv1beta1)
- [operator.config.dynamo.nvidia.com/v1alpha1](#operatorconfigdynamonvidiacomv1alpha1)
## nvidia.com/v1alpha1 ## nvidia.com/v1alpha1
...@@ -1584,6 +1585,480 @@ _Appears in:_ ...@@ -1584,6 +1585,480 @@ _Appears in:_
| `requestRate` _float_ | RequestRate is the target request rate (req/s).<br />Required (or Concurrency) when the planner is disabled. | | Optional: \{\} <br /> | | `requestRate` _float_ | RequestRate is the target request rate (req/s).<br />Required (or Concurrency) when the planner is disabled. | | Optional: \{\} <br /> |
## operator.config.dynamo.nvidia.com/v1alpha1
### Resource Types
- [OperatorConfiguration](#operatorconfiguration)
#### CheckpointConfiguration
CheckpointConfiguration holds checkpoint/restore settings.
_Appears in:_
- [OperatorConfiguration](#operatorconfiguration)
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `enabled` _boolean_ | Enabled indicates if checkpoint functionality is enabled | | |
| `readyForCheckpointFilePath` _string_ | ReadyForCheckpointFilePath is the path of the file that signals model readiness for checkpoint jobs | /tmp/ready-for-checkpoint | |
| `storage` _[CheckpointStorageConfiguration](#checkpointstorageconfiguration)_ | Storage holds storage backend configuration | | |
#### CheckpointOCIConfig
CheckpointOCIConfig holds OCI registry storage configuration.
_Appears in:_
- [CheckpointStorageConfiguration](#checkpointstorageconfiguration)
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `uri` _string_ | URI is the OCI URI (oci://registry/repository) | | |
| `credentialsSecretRef` _string_ | CredentialsSecretRef is the name of the docker config secret | | |
#### CheckpointPVCConfig
CheckpointPVCConfig holds PVC storage configuration.
_Appears in:_
- [CheckpointStorageConfiguration](#checkpointstorageconfiguration)
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `pvcName` _string_ | PVCName is the name of the PVC | chrek-pvc | |
| `basePath` _string_ | BasePath is the base directory within the PVC | /checkpoints | |
#### CheckpointS3Config
CheckpointS3Config holds S3 storage configuration.
_Appears in:_
- [CheckpointStorageConfiguration](#checkpointstorageconfiguration)
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `uri` _string_ | URI is the S3 URI (s3://[endpoint/]bucket/prefix) | | |
| `credentialsSecretRef` _string_ | CredentialsSecretRef is the name of the credentials secret | | |
#### CheckpointStorageConfiguration
CheckpointStorageConfiguration holds storage backend configuration for checkpoints.
_Appears in:_
- [CheckpointConfiguration](#checkpointconfiguration)
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `type` _string_ | Type is the storage backend type: pvc, s3, or oci | pvc | |
| `pvc` _[CheckpointPVCConfig](#checkpointpvcconfig)_ | PVC configuration (used when Type=pvc) | | |
| `s3` _[CheckpointS3Config](#checkpoints3config)_ | S3 configuration (used when Type=s3) | | |
| `oci` _[CheckpointOCIConfig](#checkpointociconfig)_ | OCI configuration (used when Type=oci) | | |
#### DiscoveryBackend
_Underlying type:_ _string_
DiscoveryBackend is the type for the discovery backend.
_Appears in:_
- [DiscoveryConfiguration](#discoveryconfiguration)
| Field | Description |
| --- | --- |
| `kubernetes` | DiscoveryBackendKubernetes is the Kubernetes discovery backend<br /> |
| `etcd` | DiscoveryBackendEtcd is the etcd discovery backend<br /> |
#### DiscoveryConfiguration
DiscoveryConfiguration holds discovery backend settings.
_Appears in:_
- [OperatorConfiguration](#operatorconfiguration)
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `backend` _[DiscoveryBackend](#discoverybackend)_ | Backend is the discovery backend: "kubernetes" or "etcd" | kubernetes | |
#### GPUConfiguration
GPUConfiguration holds GPU discovery settings.
_Appears in:_
- [OperatorConfiguration](#operatorconfiguration)
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `discoveryEnabled` _boolean_ | DiscoveryEnabled indicates whether GPU discovery is enabled | true | |
#### GroveConfiguration
GroveConfiguration holds Grove orchestrator settings.
_Appears in:_
- [OrchestratorConfiguration](#orchestratorconfiguration)
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `enabled` _boolean_ | Enabled overrides auto-detection. When unset (nil), auto-detection is used. | | |
| `terminationDelay` _[Duration](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#duration-v1-meta)_ | TerminationDelay configures the termination delay for Grove PodCliqueSets | 15m | |
#### InfrastructureConfiguration
InfrastructureConfiguration holds service mesh and backend addresses.
_Appears in:_
- [OperatorConfiguration](#operatorconfiguration)
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `natsAddress` _string_ | NATSAddress is the address of the NATS server | | |
| `etcdAddress` _string_ | ETCDAddress is the address of the etcd server | | |
| `modelExpressURL` _string_ | ModelExpressURL is the URL of the Model Express server to inject into all pods | | |
| `prometheusEndpoint` _string_ | PrometheusEndpoint is the URL of the Prometheus endpoint to use for metrics | | |
#### IngressConfiguration
IngressConfiguration holds ingress settings.
_Appears in:_
- [OperatorConfiguration](#operatorconfiguration)
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `virtualServiceGateway` _string_ | VirtualServiceGateway is the name of the Istio virtual service gateway | | |
| `controllerClassName` _string_ | ControllerClassName is the ingress controller class name | | |
| `controllerTLSSecretName` _string_ | ControllerTLSSecretName is the name of the TLS secret for the ingress controller | | |
| `hostSuffix` _string_ | HostSuffix is the suffix for ingress hostnames | | |
#### KaiSchedulerConfiguration
KaiSchedulerConfiguration holds Kai-scheduler settings.
_Appears in:_
- [OrchestratorConfiguration](#orchestratorconfiguration)
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `enabled` _boolean_ | Enabled overrides auto-detection. When unset (nil), auto-detection is used. | | |
#### LWSConfiguration
LWSConfiguration holds LWS orchestrator settings.
_Appears in:_
- [OrchestratorConfiguration](#orchestratorconfiguration)
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `enabled` _boolean_ | Enabled overrides auto-detection. When unset (nil), auto-detection is used. | | |
#### LeaderElectionConfiguration
LeaderElectionConfiguration holds leader election settings.
_Appears in:_
- [OperatorConfiguration](#operatorconfiguration)
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `enabled` _boolean_ | Enabled enables leader election for the controller manager | false | |
| `id` _string_ | ID is the leader election resource identity | | |
| `namespace` _string_ | Namespace is the namespace for the leader election resource | | |
#### LoggingConfiguration
LoggingConfiguration holds logging settings.
_Appears in:_
- [OperatorConfiguration](#operatorconfiguration)
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `level` _string_ | Level is the log level (e.g., "info", "debug") | info | |
| `format` _string_ | Format is the log format (e.g., "json", "text") | json | |
#### MPIConfiguration
MPIConfiguration holds MPI SSH secret settings.
_Appears in:_
- [OperatorConfiguration](#operatorconfiguration)
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `sshSecretName` _string_ | SSHSecretName is the name of the secret containing the SSH key for MPI | | |
| `sshSecretNamespace` _string_ | SSHSecretNamespace is the namespace where the MPI SSH secret is located | | |
#### MetricsServer
MetricsServer extends Server with a secure serving option.
_Appears in:_
- [ServerConfiguration](#serverconfiguration)
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `bindAddress` _string_ | BindAddress is the address the server binds to | | |
| `port` _integer_ | Port is the port the server listens on | | |
| `secure` _boolean_ | Secure enables secure serving for the metrics endpoint | | |
#### NamespaceConfiguration
NamespaceConfiguration determines the operator's namespace mode.
_Appears in:_
- [OperatorConfiguration](#operatorconfiguration)
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `restricted` _string_ | Restricted is the namespace the operator is restricted to. An empty value means cluster-wide mode. | | |
| `scope` _[NamespaceScopeConfiguration](#namespacescopeconfiguration)_ | Scope holds namespace scope lease settings (namespace-restricted mode only) | | |
#### NamespaceScopeConfiguration
NamespaceScopeConfiguration holds lease settings for namespace-restricted mode.
_Appears in:_
- [NamespaceConfiguration](#namespaceconfiguration)
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `leaseDuration` _[Duration](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#duration-v1-meta)_ | LeaseDuration is the duration of the namespace scope marker lease before expiration | 30s | |
| `leaseRenewInterval` _[Duration](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#duration-v1-meta)_ | LeaseRenewInterval is the interval for renewing the namespace scope marker lease | 10s | |
#### OperatorConfiguration
OperatorConfiguration is the Schema for the operator configuration.
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `apiVersion` _string_ | `operator.config.dynamo.nvidia.com/v1alpha1` | | |
| `kind` _string_ | `OperatorConfiguration` | | |
| `server` _[ServerConfiguration](#serverconfiguration)_ | Server configuration (metrics, health probes, webhooks) | | |
| `leaderElection` _[LeaderElectionConfiguration](#leaderelectionconfiguration)_ | Leader election configuration | | |
| `namespace` _[NamespaceConfiguration](#namespaceconfiguration)_ | Namespace configuration (restricted vs cluster-wide) | | |
| `orchestrators` _[OrchestratorConfiguration](#orchestratorconfiguration)_ | Orchestrator configuration with optional overrides | | |
| `infrastructure` _[InfrastructureConfiguration](#infrastructureconfiguration)_ | Service mesh and infrastructure addresses | | |
| `ingress` _[IngressConfiguration](#ingressconfiguration)_ | Ingress configuration | | |
| `rbac` _[RBACConfiguration](#rbacconfiguration)_ | RBAC configuration for cross-namespace resource management (cluster-wide mode) | | |
| `mpi` _[MPIConfiguration](#mpiconfiguration)_ | MPI SSH secret configuration | | |
| `checkpoint` _[CheckpointConfiguration](#checkpointconfiguration)_ | Checkpoint/restore configuration | | |
| `discovery` _[DiscoveryConfiguration](#discoveryconfiguration)_ | Discovery backend configuration | | |
| `gpu` _[GPUConfiguration](#gpuconfiguration)_ | GPU discovery configuration | | |
| `logging` _[LoggingConfiguration](#loggingconfiguration)_ | Logging configuration | | |
| `security` _[SecurityConfiguration](#securityconfiguration)_ | HTTP/2 and TLS settings | | |
#### OrchestratorConfiguration
OrchestratorConfiguration holds orchestrator override settings.
_Appears in:_
- [OperatorConfiguration](#operatorconfiguration)
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `grove` _[GroveConfiguration](#groveconfiguration)_ | Grove orchestrator configuration | | |
| `lws` _[LWSConfiguration](#lwsconfiguration)_ | LWS orchestrator configuration | | |
| `kaiScheduler` _[KaiSchedulerConfiguration](#kaischedulerconfiguration)_ | KaiScheduler configuration | | |
#### RBACConfiguration
RBACConfiguration holds RBAC settings for cluster-wide mode.
_Appears in:_
- [OperatorConfiguration](#operatorconfiguration)
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `plannerClusterRoleName` _string_ | PlannerClusterRoleName is the name of the ClusterRole for the planner | | |
| `dgdrProfilingClusterRoleName` _string_ | DGDRProfilingClusterRoleName is the name of the ClusterRole for DGDR profiling jobs | | |
| `eppClusterRoleName` _string_ | EPPClusterRoleName is the name of the ClusterRole for EPP | | |
#### SecurityConfiguration
SecurityConfiguration holds HTTP/2 and TLS settings.
_Appears in:_
- [OperatorConfiguration](#operatorconfiguration)
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `enableHTTP2` _boolean_ | EnableHTTP2 enables HTTP/2 for metrics and webhook servers | false | |
#### Server
Server holds a bind address and port.
_Appears in:_
- [MetricsServer](#metricsserver)
- [ServerConfiguration](#serverconfiguration)
- [WebhookServer](#webhookserver)
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `bindAddress` _string_ | BindAddress is the address the server binds to | | |
| `port` _integer_ | Port is the port the server listens on | | |
#### ServerConfiguration
ServerConfiguration holds server bind addresses and ports.
_Appears in:_
- [OperatorConfiguration](#operatorconfiguration)
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `metrics` _[MetricsServer](#metricsserver)_ | Metrics server configuration | \{ bindAddress:127.0.0.1 port:8080 \} | |
| `healthProbe` _[Server](#server)_ | Health probe server configuration | \{ bindAddress:0.0.0.0 port:8081 \} | |
| `webhook` _[WebhookServer](#webhookserver)_ | Webhook server configuration | \{ certDir:/tmp/k8s-webhook-server/serving-certs host:0.0.0.0 port:9443 \} | |
#### WebhookServer
WebhookServer extends Server with host and certificate directory.
_Appears in:_
- [ServerConfiguration](#serverconfiguration)
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `bindAddress` _string_ | BindAddress is the address the server binds to | | |
| `port` _integer_ | Port is the port the server listens on | | |
| `host` _string_ | Host is the address the webhook server binds to | | |
| `certDir` _string_ | CertDir is the directory containing TLS certificates | | |
# Operator Default Values Injection # Operator Default Values Injection
The Dynamo operator automatically applies default values to various fields when they are not explicitly specified in your deployments. These defaults include: The Dynamo operator automatically applies default values to various fields when they are not explicitly specified in your deployments. These defaults include:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment