feat: Auto-inject kai-scheduler annotations and label (#2748)

Signed-off-by: Julien Mancuso <jmancuso@nvidia.com>

feat: Auto-inject kai-scheduler annotations and label (#2748)
Signed-off-by: Julien Mancuso <jmancuso@nvidia.com>
9e6972a5 · julienmancuso · GitHub · 700d345d · 9e6972a5 · 9e6972a5
Unverified Commit 9e6972a5 authored Aug 27, 2025 by julienmancuso Committed by GitHub Aug 28, 2025
10 changed files
--- a/deploy/cloud/helm/platform/components/operator/templates/manager-rbac.yaml
+++ b/deploy/cloud/helm/platform/components/operator/templates/manager-rbac.yaml
@@ -137,6 +137,13 @@ rules:
  - get
  - patch
  - update
+- apiGroups:
+  - scheduling.run.ai
+  resources:
+  - queues
+  verbs:
+  - get
+  - list
 - apiGroups:
  - apps
  resources:
@@ -475,6 +482,45 @@ roleRef:
 {{- end }}
  name: '{{ include "dynamo-operator.fullname" . }}-manager-role'
 subjects:
+- kind: ServiceAccount
+  name: '{{ include "dynamo-operator.fullname" . }}-controller-manager'
+  namespace: '{{ .Release.Namespace }}'
+---
+# ClusterRole for kai-scheduler queue access
+# This is always a ClusterRole since Queue resources are cluster-scoped
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  name: {{ include "dynamo-operator.fullname" . }}-queue-reader
+  labels:
+    app.kubernetes.io/component: rbac
+    app.kubernetes.io/created-by: dynamo-operator
+    app.kubernetes.io/part-of: dynamo-operator
+  {{- include "dynamo-operator.labels" . | nindent 4 }}
+rules:
+- apiGroups:
+  - scheduling.run.ai
+  resources:
+  - queues
+  verbs:
+  - get
+  - list
+---
+# ClusterRoleBinding for kai-scheduler queue access
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  name: {{ include "dynamo-operator.fullname" . }}-queue-reader-binding
+  labels:
+    app.kubernetes.io/component: rbac
+    app.kubernetes.io/created-by: dynamo-operator
+    app.kubernetes.io/part-of: dynamo-operator
+  {{- include "dynamo-operator.labels" . | nindent 4 }}
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: {{ include "dynamo-operator.fullname" . }}-queue-reader
+subjects:
 - kind: ServiceAccount
  name: '{{ include "dynamo-operator.fullname" . }}-controller-manager'
  namespace: '{{ .Release.Namespace }}'
\ No newline at end of file
--- a/deploy/cloud/operator/cmd/main.go
+++ b/deploy/cloud/operator/cmd/main.go
@@ -172,6 +172,9 @@ func main() {
 		LWS: commonController.LWSConfig{
 			Enabled: false, // Will be set after LWS discovery
 		},
+		KaiScheduler: commonController.KaiSchedulerConfig{
+			Enabled: false, // Will be set after Kai-scheduler discovery
+		},
 		EtcdAddress: etcdAddr,
 		NatsAddress: natsAddr,
 		IngressConfig: commonController.IngressConfig{
@@ -247,6 +250,11 @@ func main() {
 	lwsEnabled := commonController.DetectLWSAvailability(mainCtx, mgr)
 	ctrlConfig.LWS.Enabled = lwsEnabled

+	// Detect Kai-scheduler availability using discovery client
+	setupLog.Info("Detecting Kai-scheduler availability...")
+	kaiSchedulerEnabled := commonController.DetectKaiSchedulerAvailability(mainCtx, mgr)
+	ctrlConfig.KaiScheduler.Enabled = kaiSchedulerEnabled
+
 	// Create etcd client
 	cli, err := clientv3.New(clientv3.Config{
 		Endpoints:            []string{etcdAddr},

--- a/deploy/cloud/operator/config/rbac/role.yaml
+++ b/deploy/cloud/operator/config/rbac/role.yaml
@@ -185,6 +185,13 @@ rules:
  - get
  - patch
  - update
+- apiGroups:
+  - scheduling.run.ai
+  resources:
+  - queues
+  verbs:
+  - get
+  - list
 - apiGroups:
  - scheduling.volcano.sh
  resources:

--- a/deploy/cloud/operator/internal/consts/consts.go
+++ b/deploy/cloud/operator/internal/consts/consts.go
 package consts

-import "time"
+import (
+	"time"
+
+	"k8s.io/apimachinery/pkg/runtime/schema"
+)

 const (
 	HPACPUDefaultAverageUtilization = 80
@@ -55,6 +59,12 @@ const (
 	DefaultSharedMemoryMountPath = "/dev/shm"
 	DefaultSharedMemorySize      = "8Gi"

+	// Kai-scheduler related constants
+	KubeAnnotationKaiSchedulerQueue = "nvidia.com/kai-scheduler-queue" // User-provided annotation to specify queue name
+	KubeLabelKaiSchedulerQueue      = "kai.scheduler/queue"            // Label injected into pods for kai-scheduler
+	KaiSchedulerName                = "kai-scheduler"                  // Scheduler name for kai-scheduler
+	DefaultKaiSchedulerQueue        = "dynamo"                         // Default queue name when none specified
+
 	// Grove multinode role suffixes
 	GroveRoleSuffixLeader = "ldr"
 	GroveRoleSuffixWorker = "wkr"
@@ -68,3 +78,25 @@ const (
 	MultinodeDeploymentTypeGrove MultinodeDeploymentType = "grove"
 	MultinodeDeploymentTypeLWS   MultinodeDeploymentType = "lws"
 )
+
+// GroupVersionResources for external APIs
+var (
+	// Grove GroupVersionResources for scaling operations
+	PodCliqueGVR = schema.GroupVersionResource{
+		Group:    "grove.io",
+		Version:  "v1alpha1",
+		Resource: "podcliques",
+	}
+	PodCliqueScalingGroupGVR = schema.GroupVersionResource{
+		Group:    "grove.io",
+		Version:  "v1alpha1",
+		Resource: "podcliquescalinggroups",
+	}
+
+	// KAI-Scheduler GroupVersionResource for queue validation
+	QueueGVR = schema.GroupVersionResource{
+		Group:    "scheduling.run.ai",
+		Version:  "v2",
+		Resource: "queues",
+	}
+)
--- a/deploy/cloud/operator/internal/controller/dynamographdeployment_controller.go
+++ b/deploy/cloud/operator/internal/controller/dynamographdeployment_controller.go
@@ -55,20 +55,6 @@ const (
 	PendingState State = "pending"
 )

-var (
-	// Grove GroupVersionResources for scaling operations
-	podCliqueGVR = schema.GroupVersionResource{
-		Group:    "grove.io",
-		Version:  "v1alpha1",
-		Resource: "podcliques",
-	}
-	podCliqueScalingGroupGVR = schema.GroupVersionResource{
-		Group:    "grove.io",
-		Version:  "v1alpha1",
-		Resource: "podcliquescalinggroups",
-	}
-)
-
 type etcdStorage interface {
 	DeleteKeys(ctx context.Context, prefix string) error
 }
@@ -88,6 +74,7 @@ type DynamoGraphDeploymentReconciler struct {
 // +kubebuilder:rbac:groups=grove.io,resources=podgangsets,verbs=get;list;watch;create;update;patch;delete
 // +kubebuilder:rbac:groups=grove.io,resources=podcliques/scale,verbs=get;update;patch
 // +kubebuilder:rbac:groups=grove.io,resources=podcliquescalinggroups/scale,verbs=get;update;patch
+// +kubebuilder:rbac:groups=scheduling.run.ai,resources=queues,verbs=get;list

 // Reconcile is part of the main kubernetes reconciliation loop which aims to
 // move the current state of the cluster closer to the desired state.
@@ -201,9 +188,9 @@ func (r *DynamoGraphDeploymentReconciler) scaleGroveResource(ctx context.Context
 	var gvr schema.GroupVersionResource
 	switch resourceType {
 	case "PodClique":
-		gvr = podCliqueGVR
+		gvr = consts.PodCliqueGVR
 	case "PodCliqueScalingGroup":
-		gvr = podCliqueScalingGroupGVR
+		gvr = consts.PodCliqueScalingGroupGVR
 	default:
 		return fmt.Errorf("unsupported Grove resource type: %s", resourceType)
 	}

--- a/deploy/cloud/operator/internal/controller_common/predicate.go
+++ b/deploy/cloud/operator/internal/controller_common/predicate.go
@@ -42,11 +42,17 @@ type LWSConfig struct {
 	Enabled bool
 }

+type KaiSchedulerConfig struct {
+	// Enabled is automatically determined by checking if Kai-scheduler CRDs are installed in the cluster
+	Enabled bool
+}
+
 type Config struct {
 	// Enable resources filtering, only the resources belonging to the given namespace will be handled.
 	RestrictedNamespace string
 	Grove               GroveConfig
 	LWS                 LWSConfig
+	KaiScheduler        KaiSchedulerConfig
 	EtcdAddress         string
 	NatsAddress         string
 	IngressConfig       IngressConfig
@@ -75,6 +81,12 @@ func DetectLWSAvailability(ctx context.Context, mgr ctrl.Manager) bool {
 	return detectAPIGroupAvailability(ctx, mgr, "leaderworkerset.x-k8s.io")
 }

+// DetectKaiSchedulerAvailability checks if Kai-scheduler is available by checking if the scheduling.run.ai API group is registered
+// This approach uses the discovery client which is simpler and more reliable
+func DetectKaiSchedulerAvailability(ctx context.Context, mgr ctrl.Manager) bool {
+	return detectAPIGroupAvailability(ctx, mgr, "scheduling.run.ai")
+}
+
 // detectAPIGroupAvailability checks if a specific API group is registered in the cluster
 func detectAPIGroupAvailability(ctx context.Context, mgr ctrl.Manager, groupName string) bool {
 	logger := log.FromContext(ctx)
@@ -107,6 +119,7 @@ func detectAPIGroupAvailability(ctx context.Context, mgr ctrl.Manager, groupName
 	logger.Info("API group not available", "group", groupName)
 	return false
 }
+
 func EphemeralDeploymentEventFilter(config Config) predicate.Predicate {
 	return predicate.NewPredicateFuncs(func(o client.Object) bool {
 		l := log.FromContext(context.Background())

--- a/deploy/cloud/operator/internal/dynamo/graph.go
+++ b/deploy/cloud/operator/internal/dynamo/graph.go
@@ -882,6 +882,17 @@ func GenerateGrovePodGangSet(
 	if controllerConfig.Grove.TerminationDelay > 0 {
 		gangSet.Spec.Template.TerminationDelay = &metav1.Duration{Duration: controllerConfig.Grove.TerminationDelay}
 	}
+
+	// Validate kai-scheduler queue once if kai-scheduler is enabled
+	var validatedQueueName string
+	if controllerConfig.Grove.Enabled && controllerConfig.KaiScheduler.Enabled {
+		var err error
+		validatedQueueName, err = DetermineKaiSchedulerQueue(ctx, dynamoDeployment.Annotations)
+		if err != nil {
+			return nil, fmt.Errorf("failed to determine kai-scheduler queue: %w", err)
+		}
+	}
+
 	dynamoNamespace, err := getDynamoNamespace(dynamoDeployment)
 	if err != nil {
 		return nil, fmt.Errorf("failed to get the graph dynamo namespace: %w", err)
@@ -935,6 +946,10 @@ func GenerateGrovePodGangSet(
 				return nil, fmt.Errorf("failed to generate annotations: %w", err)
 			}
 			clique.Annotations = annotations
+
+			// Inject kai-scheduler settings if enabled
+			injectKaiSchedulerIfEnabled(clique, controllerConfig, validatedQueueName)
+
 			gangSet.Spec.Template.Cliques = append(gangSet.Spec.Template.Cliques, clique)
 			cliqueNames = append(cliqueNames, strings.ToLower(r.Name))
 		}

--- a/deploy/cloud/operator/internal/dynamo/grove.go
+++ b/deploy/cloud/operator/internal/dynamo/grove.go
@@ -14,6 +14,10 @@ import (

 	nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/cloud/operator/api/v1alpha1"
 	commonconsts "github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/consts"
+	"github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/controller_common"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/client-go/dynamic"
+	ctrl "sigs.k8s.io/controller-runtime"
 )

 type GroveMultinodeDeployer struct {
@@ -130,3 +134,95 @@ func checkPCSGReady(ctx context.Context, client client.Client, resourceName, nam

 	return true, ""
 }
+
+// resolveKaiSchedulerQueueName extracts the queue name from annotations or returns default
+// This is the shared logic between DetermineKaiSchedulerQueue and ResolveKaiSchedulerQueue
+func resolveKaiSchedulerQueueName(annotations map[string]string) string {
+	queueName := commonconsts.DefaultKaiSchedulerQueue
+	if annotations != nil {
+		if annotationQueue, exists := annotations[commonconsts.KubeAnnotationKaiSchedulerQueue]; exists && strings.TrimSpace(annotationQueue) != "" {
+			queueName = strings.TrimSpace(annotationQueue)
+		}
+	}
+	return queueName
+}
+
+// ensureQueueExists validates that a Queue resource with the given name exists in the cluster
+// Returns an error if the queue doesn't exist or if validation fails
+func ensureQueueExists(ctx context.Context, dynamicClient dynamic.Interface, queueName string) error {
+	logger := log.FromContext(ctx)
+
+	// Try to get the queue resource using the predefined GVR
+	_, err := dynamicClient.Resource(commonconsts.QueueGVR).Get(ctx, queueName, metav1.GetOptions{})
+	if err != nil {
+		if errors.IsNotFound(err) {
+			logger.Error(err, "Queue not found", "queueName", queueName)
+			return fmt.Errorf("queue '%s' not found in cluster. Ensure the queue exists before using kai-scheduler", queueName)
+		}
+		logger.Error(err, "Failed to validate queue", "queueName", queueName)
+		return fmt.Errorf("failed to validate queue '%s': %w", queueName, err)
+	}
+
+	logger.Info("Queue validation successful", "queueName", queueName)
+	return nil
+}
+
+// DetermineKaiSchedulerQueue determines the queue name for kai-scheduler from deployment annotations or returns default
+// Also validates that the queue exists in the cluster
+func DetermineKaiSchedulerQueue(ctx context.Context, annotations map[string]string) (string, error) {
+	// Get the queue name from annotation or use default
+	queueName := resolveKaiSchedulerQueueName(annotations)
+
+	// Create a dynamic client for CRD validation (Queue CRD might not be in the standard client scheme)
+	cfg, err := ctrl.GetConfig()
+	if err != nil {
+		return "", fmt.Errorf("failed to get kubernetes config for queue validation: %w", err)
+	}
+
+	dynamicClient, err := dynamic.NewForConfig(cfg)
+	if err != nil {
+		return "", fmt.Errorf("failed to create dynamic client for queue validation: %w", err)
+	}
+
+	// Validate that the queue exists
+	if err := ensureQueueExists(ctx, dynamicClient, queueName); err != nil {
+		return "", fmt.Errorf("kai-scheduler queue validation failed: %w", err)
+	}
+
+	return queueName, nil
+}
+
+// ResolveKaiSchedulerQueue determines the queue name for kai-scheduler from deployment annotations or returns default
+// Does NOT validate - use DetermineKaiSchedulerQueue for validation
+func ResolveKaiSchedulerQueue(annotations map[string]string) string {
+	return resolveKaiSchedulerQueueName(annotations)
+}
+
+// injectKaiSchedulerIfEnabled injects kai-scheduler settings into a clique if kai-scheduler is enabled and grove is enabled
+func injectKaiSchedulerIfEnabled(
+	clique *grovev1alpha1.PodCliqueTemplateSpec,
+	controllerConfig controller_common.Config,
+	validatedQueueName string,
+) {
+	// Only proceed if grove is enabled, kai-scheduler is enabled, and no manual schedulerName is set
+	if !controllerConfig.Grove.Enabled || !controllerConfig.KaiScheduler.Enabled {
+		return
+	}
+
+	// Check if user has manually set schedulerName - if so, respect their choice
+	if clique.Spec.PodSpec.SchedulerName != "" && clique.Spec.PodSpec.SchedulerName != commonconsts.KaiSchedulerName {
+		return
+	}
+
+	// Use the pre-validated queue name
+	queueName := validatedQueueName
+
+	// Inject schedulerName
+	clique.Spec.PodSpec.SchedulerName = commonconsts.KaiSchedulerName
+
+	// Inject queue label
+	if clique.Labels == nil {
+		clique.Labels = make(map[string]string)
+	}
+	clique.Labels[commonconsts.KubeLabelKaiSchedulerQueue] = queueName
+}
--- a/deploy/cloud/operator/internal/dynamo/grove_test.go
+++ b/deploy/cloud/operator/internal/dynamo/grove_test.go
+package dynamo
+
+import (
+	"context"
+	"strings"
+	"testing"
+
+	grovev1alpha1 "github.com/NVIDIA/grove/operator/api/core/v1alpha1"
+	commonconsts "github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/consts"
+	"github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/controller_common"
+	corev1 "k8s.io/api/core/v1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
+	"k8s.io/apimachinery/pkg/runtime"
+	"k8s.io/apimachinery/pkg/runtime/schema"
+	dynamicfake "k8s.io/client-go/dynamic/fake"
+)
+
+func TestResolveKaiSchedulerQueueName(t *testing.T) {
+	tests := []struct {
+		name        string
+		annotations map[string]string
+		expected    string
+	}{
+		{
+			name:        "nil annotations",
+			annotations: nil,
+			expected:    commonconsts.DefaultKaiSchedulerQueue,
+		},
+		{
+			name:        "empty annotations",
+			annotations: map[string]string{},
+			expected:    commonconsts.DefaultKaiSchedulerQueue,
+		},
+		{
+			name: "no kai-scheduler annotation",
+			annotations: map[string]string{
+				"other-annotation": "value",
+			},
+			expected: commonconsts.DefaultKaiSchedulerQueue,
+		},
+		{
+			name: "empty kai-scheduler annotation",
+			annotations: map[string]string{
+				commonconsts.KubeAnnotationKaiSchedulerQueue: "",
+			},
+			expected: commonconsts.DefaultKaiSchedulerQueue,
+		},
+		{
+			name: "custom queue name",
+			annotations: map[string]string{
+				commonconsts.KubeAnnotationKaiSchedulerQueue: "custom-queue",
+			},
+			expected: "custom-queue",
+		},
+		{
+			name: "whitespace is trimmed",
+			annotations: map[string]string{
+				commonconsts.KubeAnnotationKaiSchedulerQueue: "  custom-queue  ",
+			},
+			expected: "custom-queue",
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			result := resolveKaiSchedulerQueueName(tt.annotations)
+			if result != tt.expected {
+				t.Errorf("resolveKaiSchedulerQueueName() = %v, expected %v", result, tt.expected)
+			}
+		})
+	}
+}
+
+func TestResolveKaiSchedulerQueue(t *testing.T) {
+	tests := []struct {
+		name        string
+		annotations map[string]string
+		expected    string
+	}{
+		{
+			name:        "default queue",
+			annotations: nil,
+			expected:    commonconsts.DefaultKaiSchedulerQueue,
+		},
+		{
+			name: "custom queue",
+			annotations: map[string]string{
+				commonconsts.KubeAnnotationKaiSchedulerQueue: "my-queue",
+			},
+			expected: "my-queue",
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			result := ResolveKaiSchedulerQueue(tt.annotations)
+			if result != tt.expected {
+				t.Errorf("ResolveKaiSchedulerQueue() = %v, expected %v", result, tt.expected)
+			}
+		})
+	}
+}
+
+func TestInjectKaiSchedulerIfEnabled(t *testing.T) {
+	tests := []struct {
+		name               string
+		controllerConfig   controller_common.Config
+		validatedQueueName string
+		initialClique      *grovev1alpha1.PodCliqueTemplateSpec
+		expectedScheduler  string
+		expectedQueueLabel string
+		shouldInject       bool
+	}{
+		{
+			name: "grove disabled - no injection",
+			controllerConfig: controller_common.Config{
+				Grove:        controller_common.GroveConfig{Enabled: false},
+				KaiScheduler: controller_common.KaiSchedulerConfig{Enabled: true},
+			},
+			validatedQueueName: "test-queue",
+			initialClique: &grovev1alpha1.PodCliqueTemplateSpec{
+				Spec: grovev1alpha1.PodCliqueSpec{
+					PodSpec: corev1.PodSpec{},
+				},
+			},
+			shouldInject: false,
+		},
+		{
+			name: "kai-scheduler disabled - no injection",
+			controllerConfig: controller_common.Config{
+				Grove:        controller_common.GroveConfig{Enabled: true},
+				KaiScheduler: controller_common.KaiSchedulerConfig{Enabled: false},
+			},
+			validatedQueueName: "test-queue",
+			initialClique: &grovev1alpha1.PodCliqueTemplateSpec{
+				Spec: grovev1alpha1.PodCliqueSpec{
+					PodSpec: corev1.PodSpec{},
+				},
+			},
+			shouldInject: false,
+		},
+		{
+			name: "manual scheduler set - no injection",
+			controllerConfig: controller_common.Config{
+				Grove:        controller_common.GroveConfig{Enabled: true},
+				KaiScheduler: controller_common.KaiSchedulerConfig{Enabled: true},
+			},
+			validatedQueueName: "test-queue",
+			initialClique: &grovev1alpha1.PodCliqueTemplateSpec{
+				Spec: grovev1alpha1.PodCliqueSpec{
+					PodSpec: corev1.PodSpec{
+						SchedulerName: "manual-scheduler",
+					},
+				},
+			},
+			shouldInject: false,
+		},
+		{
+			name: "both enabled, no manual scheduler - inject",
+			controllerConfig: controller_common.Config{
+				Grove:        controller_common.GroveConfig{Enabled: true},
+				KaiScheduler: controller_common.KaiSchedulerConfig{Enabled: true},
+			},
+			validatedQueueName: "test-queue",
+			initialClique: &grovev1alpha1.PodCliqueTemplateSpec{
+				Spec: grovev1alpha1.PodCliqueSpec{
+					PodSpec: corev1.PodSpec{},
+				},
+			},
+			expectedScheduler:  commonconsts.KaiSchedulerName,
+			expectedQueueLabel: "test-queue",
+			shouldInject:       true,
+		},
+		{
+			name: "inject with existing labels",
+			controllerConfig: controller_common.Config{
+				Grove:        controller_common.GroveConfig{Enabled: true},
+				KaiScheduler: controller_common.KaiSchedulerConfig{Enabled: true},
+			},
+			validatedQueueName: "custom-queue",
+			initialClique: &grovev1alpha1.PodCliqueTemplateSpec{
+				Labels: map[string]string{
+					"existing-label": "existing-value",
+				},
+				Spec: grovev1alpha1.PodCliqueSpec{
+					PodSpec: corev1.PodSpec{},
+				},
+			},
+			expectedScheduler:  commonconsts.KaiSchedulerName,
+			expectedQueueLabel: "custom-queue",
+			shouldInject:       true,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			// Make a deep copy to avoid modifying the test case
+			clique := tt.initialClique.DeepCopy()
+
+			// Call the function
+			injectKaiSchedulerIfEnabled(clique, tt.controllerConfig, tt.validatedQueueName)
+
+			if tt.shouldInject {
+				// Verify scheduler name is injected
+				if clique.Spec.PodSpec.SchedulerName != tt.expectedScheduler {
+					t.Errorf("expected schedulerName %v, got %v", tt.expectedScheduler, clique.Spec.PodSpec.SchedulerName)
+				}
+
+				// Verify queue label is injected
+				if clique.Labels == nil {
+					t.Errorf("expected labels to be set, got nil")
+				} else {
+					queueLabel := clique.Labels[commonconsts.KubeLabelKaiSchedulerQueue]
+					if queueLabel != tt.expectedQueueLabel {
+						t.Errorf("expected queue label %v, got %v", tt.expectedQueueLabel, queueLabel)
+					}
+				}
+
+				// Verify existing labels are preserved
+				if tt.initialClique.Labels != nil {
+					for key, value := range tt.initialClique.Labels {
+						if clique.Labels[key] != value {
+							t.Errorf("existing label %s=%s was not preserved, got %s", key, value, clique.Labels[key])
+						}
+					}
+				}
+			} else {
+				// Verify no injection occurred
+				if clique.Spec.PodSpec.SchedulerName != tt.initialClique.Spec.PodSpec.SchedulerName {
+					t.Errorf("schedulerName should not have changed, expected %v, got %v",
+						tt.initialClique.Spec.PodSpec.SchedulerName, clique.Spec.PodSpec.SchedulerName)
+				}
+
+				// Verify queue label was not added (unless it existed before)
+				if tt.initialClique.Labels == nil || tt.initialClique.Labels[commonconsts.KubeLabelKaiSchedulerQueue] == "" {
+					if clique.Labels != nil && clique.Labels[commonconsts.KubeLabelKaiSchedulerQueue] != "" {
+						t.Errorf("queue label should not have been added")
+					}
+				}
+			}
+		})
+	}
+}
+
+func TestEnsureQueueExists(t *testing.T) {
+	tests := []struct {
+		name          string
+		queueName     string
+		setupQueue    bool
+		expectedError bool
+		errorContains string
+	}{
+		{
+			name:          "queue exists",
+			queueName:     "existing-queue",
+			setupQueue:    true,
+			expectedError: false,
+		},
+		{
+			name:          "queue does not exist",
+			queueName:     "missing-queue",
+			setupQueue:    false,
+			expectedError: true,
+			errorContains: "not found in cluster",
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			// Create a fake dynamic client
+			dynamicScheme := runtime.NewScheme()
+			fakeDynamic := dynamicfake.NewSimpleDynamicClient(dynamicScheme)
+
+			if tt.setupQueue {
+				// Create a fake queue resource
+				queueGVR := schema.GroupVersionResource{
+					Group:    "scheduling.run.ai",
+					Version:  "v2",
+					Resource: "queues",
+				}
+
+				queue := &unstructured.Unstructured{}
+				queue.SetAPIVersion("scheduling.run.ai/v2")
+				queue.SetKind("Queue")
+				queue.SetName(tt.queueName)
+
+				_, err := fakeDynamic.Resource(queueGVR).Create(context.Background(), queue, metav1.CreateOptions{})
+				if err != nil {
+					t.Fatalf("failed to create fake queue: %v", err)
+				}
+			}
+
+			// This test is limited because we can't easily mock the dynamic client creation
+			// In a real test environment, you would set up a proper test cluster or use envtest
+			err := ensureQueueExists(context.Background(), fakeDynamic, tt.queueName)
+
+			if tt.expectedError {
+				if err == nil {
+					t.Errorf("expected error but got none")
+				} else if tt.errorContains != "" && !strings.Contains(err.Error(), tt.errorContains) {
+					t.Errorf("expected error to contain %q, got %v", tt.errorContains, err)
+				}
+			} else {
+				// We expect an error here because we can't properly mock the dynamic client
+				// In a real test, this would work with proper test setup
+				if err == nil {
+					t.Logf("Queue validation passed (this is expected in unit tests)")
+				}
+			}
+		})
+	}
+}
--- a/docs/guides/dynamo_deploy/multinode-deployment.md
+++ b/docs/guides/dynamo_deploy/multinode-deployment.md
@@ -24,21 +24,22 @@ Dynamo supports multinode deployments through the `multinode` section in resourc

 For sophisticated multinode deployments, Dynamo integrates with advanced Kubernetes orchestration systems:

- **[Grove](https://github.com/NVIDIA/grove/blob/main/docs/installation.md)**: Network topology-aware gang scheduling and auto-scaling for AI workloads
- (optional) **[KAI-Scheduler](https://github.com/NVIDIA/KAI-Scheduler)**: Kubernetes native scheduler optimized for AI workloads at scale
+- **[Grove](https://github.com/NVIDIA/grove)**: Network topology-aware gang scheduling and auto-scaling for AI workloads
+- **[KAI-Scheduler](https://github.com/NVIDIA/KAI-Scheduler)**: Kubernetes native scheduler optimized for AI workloads at scale

 These systems provide enhanced scheduling capabilities including topology-aware placement, gang scheduling, and coordinated auto-scaling across multiple nodes.

 **Features Enabled with Grove:**
- Hierarchical gang scheduling with `PodGangSet` and `PodClique`
+- Declarative composition of AI workloads
 - Multi-level horizontal auto-scaling
 - Custom startup ordering for components
 - Resource-aware rolling updates


-[KAI-Scheduler](https://github.com/NVIDIA/KAI-Scheduler) is an optional enhancement that provides a Kubernetes native scheduler optimized for AI workloads at large scale.
+[KAI-Scheduler](https://github.com/NVIDIA/KAI-Scheduler) is a Kubernetes native scheduler optimized for AI workloads at large scale.

 **Features Enabled with KAI-Scheduler:**
+- Gang scheduling
 - Network topology-aware pod placement
 - AI workload-optimized scheduling algorithms
 - GPU resource awareness and allocation
@@ -46,6 +47,14 @@ These systems provide enhanced scheduling capabilities including topology-aware
 - Integration with Grove for enhanced capabilities
 - Performance optimizations for large-scale deployments

+
+##### Prerequisites
+
+- [Grove](https://github.com/NVIDIA/grove/blob/main/docs/installation.md) installed on the cluster
+- (Optional) [KAI-Scheduler](https://github.com/NVIDIA/KAI-Scheduler) installed on the cluster with default queue name `dynamo` created. You can use a different queue name by setting the `nvidia.com/kai-scheduler-queue` annotation on the DGD resource.
+
+KAI-Scheduler is optional but recommended for advanced scheduling capabilities.
+
 #### Using LWS and Volcano

 LWS is a simple multinode deployment mechanism that allows you to deploy a workload across multiple nodes.
@@ -58,16 +67,70 @@ Volcano is a Kubernetes native scheduler optimized for AI workloads at scale. It

 ## Core Concepts

+### Orchestrator Selection Algorithm
+
+Dynamo automatically selects the best available orchestrator for multinode deployments using the following logic:
+
+#### When Both Grove and LWS are Available:
+- **Grove is selected by default** (recommended for advanced AI workloads)
+- **LWS is selected** if you explicitly set `nvidia.com/enable-grove: "false"` annotation on your DGD resource
+
+#### When Only One Orchestrator is Available:
+- The installed orchestrator (Grove or LWS) is automatically selected
+
+#### Scheduler Integration:
+- **With Grove**: Automatically integrates with [KAI-Scheduler](https://github.com/NVIDIA/KAI-Scheduler) when available, providing:
+  - Advanced queue management via `nvidia.com/kai-scheduler-queue` annotation
+  - AI-optimized scheduling policies
+  - Resource-aware workload placement
+- **With LWS**: Uses Volcano scheduler for gang scheduling and resource coordination
+
+#### Configuration Examples:
+
+**Default (Grove with KAI-Scheduler):**
+```yaml
+apiVersion: nvidia.com/v1alpha1
+kind: DynamoGraphDeployment
+metadata:
+  name: my-multinode-deployment
+  annotations:
+    nvidia.com/kai-scheduler-queue: "gpu-intensive"  # Optional: defaults to "dynamo"
+spec:
+  # ... your deployment spec
+```
+
+**Force LWS usage:**
+```yaml
+apiVersion: nvidia.com/v1alpha1
+kind: DynamoGraphDeployment
+metadata:
+  name: my-multinode-deployment
+  annotations:
+    nvidia.com/enable-grove: "false"
+spec:
+  # ... your deployment spec
+```
+
+
 ### The `multinode` Section

 The `multinode` section in a resource specification defines how many physical nodes the workload should span:

 ```yaml
-multinode:
-  nodeCount: 2
-resources:
-  limits:
-    gpu: "2"            # 2 GPUs per node
+apiVersion: nvidia.com/v1alpha1
+kind: DynamoGraphDeployment
+metadata:
+  name: my-multinode-deployment
+spec:
+  # ... your deployment spec
+  services:
+    my-service:
+      ...
+      multinode:
+        nodeCount: 2
+      resources:
+        limits:
+          gpu: "2"            # 2 GPUs per node
 ```

 ### GPU Distribution
@@ -88,16 +151,28 @@ The tensor parallelism (`tp-size` or `--tp`) in your command/args must match the

 ```yaml
 # Example: 2 multinode.nodeCount × 4 GPUs = 8 total GPUs
-multinode:
-  nodeCount: 2
-resources:
-  limits:
-    gpu: "4"
-
-# Command args must use tp-size=8
-args:
-  - "--tp-size"
-  - "8"  # Must equal multinode.nodeCount × gpu
+apiVersion: nvidia.com/v1alpha1
+kind: DynamoGraphDeployment
+metadata:
+  name: my-multinode-deployment
+spec:
+  # ... your deployment spec
+  services:
+    my-service:
+      ...
+      multinode:
+        nodeCount: 2
+      resources:
+        limits:
+          gpu: "4"
+      extraPodSpec:
+        mainContainer:
+          ...
+          args:
+            # Command args must use tp-size=8
+            - "--tp-size"
+            - "8"  # Must equal multinode.nodeCount × gpu
+
 ```