dynamographdeployment_controller.go 9.95 KB
Newer Older
Neelay Shah's avatar
Neelay Shah committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
/*
 * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package controller

import (
	"context"
22
	"fmt"
Neelay Shah's avatar
Neelay Shah committed
23
24
25
26
27
28
29
30
31
32

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/tools/record"
	ctrl "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/builder"
	"sigs.k8s.io/controller-runtime/pkg/client"
	"sigs.k8s.io/controller-runtime/pkg/event"
	"sigs.k8s.io/controller-runtime/pkg/log"
	"sigs.k8s.io/controller-runtime/pkg/predicate"

33
34
35
	nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/cloud/operator/api/v1alpha1"
	commonController "github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/controller_common"
	"github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/dynamo"
Neelay Shah's avatar
Neelay Shah committed
36
37
38
39
40
41
42
43
)

// State values that the reconciler in this file records on a
// DynamoGraphDeployment through SetState.
const (
	FailedState  = "failed"     // set when reconciliation ends with an error
	ReadyState   = "successful" // set when no DynamoComponentDeployment is reported as not ready
	PendingState = "pending"    // set while at least one DynamoComponentDeployment is not ready
)

44
45
46
47
// etcdStorage is a small storage abstraction exposing a single prefix-based
// delete operation.
// NOTE(review): nothing in the visible part of this file references this
// interface — confirm it is still used elsewhere in the package.
type etcdStorage interface {
	// DeleteKeys takes a context and a key prefix and reports failure through
	// the returned error. Presumably it removes every key stored under
	// prefix — verify against the implementation.
	DeleteKeys(ctx context.Context, prefix string) error
}

48
49
// DynamoGraphDeploymentReconciler reconciles a DynamoGraphDeployment object
type DynamoGraphDeploymentReconciler struct {
Neelay Shah's avatar
Neelay Shah committed
50
	client.Client
51
52
53
54
55
56
	Config                     commonController.Config
	Recorder                   record.EventRecorder
	VirtualServiceGateway      string
	IngressControllerClassName string
	IngressControllerTLSSecret string
	IngressHostSuffix          string
Neelay Shah's avatar
Neelay Shah committed
57
58
}

59
60
61
// +kubebuilder:rbac:groups=nvidia.com,resources=dynamographdeployments,verbs=get;list;watch;create;update;patch;delete
// +kubebuilder:rbac:groups=nvidia.com,resources=dynamographdeployments/status,verbs=get;update;patch
// +kubebuilder:rbac:groups=nvidia.com,resources=dynamographdeployments/finalizers,verbs=update
Neelay Shah's avatar
Neelay Shah committed
62
63
64
65

// Reconcile is part of the main kubernetes reconciliation loop which aims to
// move the current state of the cluster closer to the desired state.
// TODO(user): Modify the Reconcile function to compare the state specified by
66
// the DynamoGraphDeployment object against the actual cluster state, and then
Neelay Shah's avatar
Neelay Shah committed
67
68
69
70
71
// perform operations to make the cluster state reflect the state specified by
// the user.
//
// For more details, check Reconcile and its Result here:
// - https://pkg.go.dev/sigs.k8s.io/controller-runtime@v0.19.1/pkg/reconcile
72
func (r *DynamoGraphDeploymentReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
Neelay Shah's avatar
Neelay Shah committed
73
74
75
76
	logger := log.FromContext(ctx)

	var err error
	reason := "undefined"
77
	message := ""
Neelay Shah's avatar
Neelay Shah committed
78
79
	readyStatus := metav1.ConditionFalse
	// retrieve the CRD
80
	dynamoDeployment := &nvidiacomv1alpha1.DynamoGraphDeployment{}
Neelay Shah's avatar
Neelay Shah committed
81
82
83
84
85
86
87
88
89
90
91
92
	if err = r.Get(ctx, req.NamespacedName, dynamoDeployment); err != nil {
		return ctrl.Result{}, client.IgnoreNotFound(err)
	}
	if err != nil {
		// not found, nothing to do
		return ctrl.Result{}, nil
	}

	defer func() {
		if err != nil {
			dynamoDeployment.SetState(FailedState)
			message = err.Error()
93
			logger.Error(err, "Reconciliation failed")
Neelay Shah's avatar
Neelay Shah committed
94
95
		}
		// update the CRD status condition
96
97
98
99
100
101
102
		dynamoDeployment.AddStatusCondition(metav1.Condition{
			Type:               "Ready",
			Status:             readyStatus,
			Reason:             reason,
			Message:            message,
			LastTransitionTime: metav1.Now(),
		})
Neelay Shah's avatar
Neelay Shah committed
103
104
105
106
107
108
109
		err = r.Status().Update(ctx, dynamoDeployment)
		if err != nil {
			logger.Error(err, "Unable to update the CRD status", "crd", req.NamespacedName)
		}
		logger.Info("Reconciliation done")
	}()

110
111
	deleted, err := commonController.HandleFinalizer(ctx, dynamoDeployment, r.Client, r)
	if err != nil {
112
		logger.Error(err, "failed to handle the finalizer")
113
114
115
116
117
118
119
		reason = "failed_to_handle_the_finalizer"
		return ctrl.Result{}, err
	}
	if deleted {
		return ctrl.Result{}, nil
	}

120
121
	// fetch the dynamoGraphConfig
	dynamoGraphConfig, err := dynamo.GetDynamoGraphConfig(ctx, dynamoDeployment, r.Recorder)
Neelay Shah's avatar
Neelay Shah committed
122
	if err != nil {
123
		logger.Error(err, "failed to get the DynamoGraphConfig")
124
		reason = "failed_to_get_the_DynamoGraphConfig"
Neelay Shah's avatar
Neelay Shah committed
125
126
127
		return ctrl.Result{}, err
	}

128
129
	// generate the dynamoComponentsDeployments from the config
	dynamoComponentsDeployments, err := dynamo.GenerateDynamoComponentsDeployments(ctx, dynamoDeployment, dynamoGraphConfig, r.generateDefaultIngressSpec(dynamoDeployment))
Neelay Shah's avatar
Neelay Shah committed
130
	if err != nil {
131
		logger.Error(err, "failed to generate the DynamoComponentsDeployments")
132
		reason = "failed_to_generate_the_DynamoComponentsDeployments"
Neelay Shah's avatar
Neelay Shah committed
133
134
135
		return ctrl.Result{}, err
	}

136
	// merge the dynamoComponentsDeployments with the dynamoComponentsDeployments from the CRD
137
	for _, deployment := range dynamoComponentsDeployments {
138
		if deployment.Spec.Ingress.Enabled {
139
			dynamoDeployment.SetEndpointStatus(r.isEndpointSecured(), getIngressHost(deployment.Spec.Ingress))
140
		}
Neelay Shah's avatar
Neelay Shah committed
141
142
	}

143
	// reconcile the dynamoComponent
144
	// for now we use the same component for all the services and we differentiate them by the service name when launching the component
145
	dynamoComponent := &nvidiacomv1alpha1.DynamoComponent{
Neelay Shah's avatar
Neelay Shah committed
146
		ObjectMeta: metav1.ObjectMeta{
147
			Name:      getK8sName(dynamoDeployment.Spec.DynamoGraph),
Neelay Shah's avatar
Neelay Shah committed
148
149
			Namespace: dynamoDeployment.Namespace,
		},
150
		Spec: nvidiacomv1alpha1.DynamoComponentSpec{
151
			DynamoComponent: dynamoDeployment.Spec.DynamoGraph,
Neelay Shah's avatar
Neelay Shah committed
152
153
		},
	}
154
155
156
	_, dynamoComponent, err = commonController.SyncResource(ctx, r, dynamoDeployment, func(ctx context.Context) (*nvidiacomv1alpha1.DynamoComponent, bool, error) {
		return dynamoComponent, false, nil
	})
Neelay Shah's avatar
Neelay Shah committed
157
	if err != nil {
158
		logger.Error(err, "failed to sync the DynamoComponent")
159
		reason = "failed_to_sync_the_DynamoComponent"
Neelay Shah's avatar
Neelay Shah committed
160
161
		return ctrl.Result{}, err
	}
162
163
164
165
166
167
168
	if !dynamoComponent.IsReady() {
		logger.Info("The DynamoComponent is not ready")
		reason = "dynamoComponent_is_not_ready"
		message = "The DynamoComponent is not ready"
		readyStatus = metav1.ConditionFalse
		return ctrl.Result{}, nil
	}
Neelay Shah's avatar
Neelay Shah committed
169

170
	notReadyDeployments := []string{}
171
172
	// reconcile the dynamoComponentsDeployments
	for serviceName, dynamoComponentDeployment := range dynamoComponentsDeployments {
173
		logger.Info("Reconciling the DynamoComponentDeployment", "serviceName", serviceName, "dynamoComponentDeployment", dynamoComponentDeployment)
174
175
176
		_, dynamoComponentDeployment, err = commonController.SyncResource(ctx, r, dynamoDeployment, func(ctx context.Context) (*nvidiacomv1alpha1.DynamoComponentDeployment, bool, error) {
			return dynamoComponentDeployment, false, nil
		})
Neelay Shah's avatar
Neelay Shah committed
177
		if err != nil {
178
			logger.Error(err, "failed to sync the DynamoComponentDeployment")
179
			reason = "failed_to_sync_the_DynamoComponentDeployment"
Neelay Shah's avatar
Neelay Shah committed
180
181
			return ctrl.Result{}, err
		}
182
183
		if !dynamoComponentDeployment.Status.IsReady() {
			notReadyDeployments = append(notReadyDeployments, dynamoComponentDeployment.Name)
Neelay Shah's avatar
Neelay Shah committed
184
185
		}
	}
186
	if len(notReadyDeployments) == 0 {
Neelay Shah's avatar
Neelay Shah committed
187
		dynamoDeployment.SetState(ReadyState)
188
189
		reason = "all_deployments_are_ready"
		message = "All deployments are ready"
Neelay Shah's avatar
Neelay Shah committed
190
191
		readyStatus = metav1.ConditionTrue
	} else {
192
193
		reason = "some_deployments_are_not_ready"
		message = fmt.Sprintf("The following deployments are not ready: %v", notReadyDeployments)
Neelay Shah's avatar
Neelay Shah committed
194
195
196
197
198
199
200
		dynamoDeployment.SetState(PendingState)
	}

	return ctrl.Result{}, nil

}

201
// generateDefaultIngressSpec builds the ingress spec derived from the
// reconciler's ingress settings for the given deployment.
//
// Ingress is enabled when either a virtual service gateway or an ingress
// controller class is configured; the host is the deployment's name. Optional
// settings (ingress class, TLS secret, host suffix, gateway) are only set on
// the result when the corresponding reconciler field is non-empty.
func (r *DynamoGraphDeploymentReconciler) generateDefaultIngressSpec(dynamoDeployment *nvidiacomv1alpha1.DynamoGraphDeployment) *nvidiacomv1alpha1.IngressSpec {
	hasGateway := r.VirtualServiceGateway != ""
	hasIngressClass := r.IngressControllerClassName != ""

	spec := &nvidiacomv1alpha1.IngressSpec{}
	spec.Enabled = hasGateway || hasIngressClass
	spec.Host = dynamoDeployment.Name
	spec.UseVirtualService = hasGateway

	// The pointer fields reference the reconciler's own fields, not copies.
	if hasIngressClass {
		spec.IngressControllerClassName = &r.IngressControllerClassName
	}
	if secret := r.IngressControllerTLSSecret; secret != "" {
		spec.TLS = &nvidiacomv1alpha1.IngressTLSSpec{SecretName: secret}
	}
	if r.IngressHostSuffix != "" {
		spec.HostSuffix = &r.IngressHostSuffix
	}
	if hasGateway {
		spec.VirtualServiceGateway = &r.VirtualServiceGateway
	}
	return spec
}

224
// isEndpointSecured reports whether the exposed endpoint is considered
// secured: either a virtual service gateway is configured and the operator
// config says virtual services support HTTPS, or an ingress TLS secret is
// configured.
func (r *DynamoGraphDeploymentReconciler) isEndpointSecured() bool {
	viaVirtualService := r.VirtualServiceGateway != "" && r.Config.VirtualServiceSupportsHTTPS
	return viaVirtualService || r.IngressControllerTLSSecret != ""
}

231
// FinalizeResource is the cleanup hook of the reconciler for a
// DynamoGraphDeployment. Reconcile passes the reconciler itself to
// commonController.HandleFinalizer, which presumably invokes this method
// before the finalizer is released — verify against that helper.
//
// There is currently nothing to clean up, so it always returns nil.
func (r *DynamoGraphDeploymentReconciler) FinalizeResource(ctx context.Context, dynamoDeployment *nvidiacomv1alpha1.DynamoGraphDeployment) error {
	// no external resources to release yet
	return nil
}

Neelay Shah's avatar
Neelay Shah committed
236
// SetupWithManager sets up the controller with the Manager.
237
func (r *DynamoGraphDeploymentReconciler) SetupWithManager(mgr ctrl.Manager) error {
Neelay Shah's avatar
Neelay Shah committed
238
	return ctrl.NewControllerManagedBy(mgr).
239
		For(&nvidiacomv1alpha1.DynamoGraphDeployment{}, builder.WithPredicates(
240
241
			predicate.GenerationChangedPredicate{},
		)).
242
243
		Named("dynamographdeployment").
		Owns(&nvidiacomv1alpha1.DynamoComponentDeployment{}, builder.WithPredicates(predicate.Funcs{
Neelay Shah's avatar
Neelay Shah committed
244
245
246
247
248
249
			// ignore creation cause we don't want to be called again after we create the deployment
			CreateFunc:  func(ce event.CreateEvent) bool { return false },
			DeleteFunc:  func(de event.DeleteEvent) bool { return true },
			UpdateFunc:  func(de event.UpdateEvent) bool { return true },
			GenericFunc: func(ge event.GenericEvent) bool { return true },
		})).
250
251
252
253
254
255
256
		Owns(&nvidiacomv1alpha1.DynamoComponent{}, builder.WithPredicates(predicate.Funcs{
			// ignore creation cause we don't want to be called again after we create the deployment
			CreateFunc:  func(ce event.CreateEvent) bool { return false },
			DeleteFunc:  func(de event.DeleteEvent) bool { return true },
			UpdateFunc:  func(de event.UpdateEvent) bool { return true },
			GenericFunc: func(ge event.GenericEvent) bool { return true },
		})).
Neelay Shah's avatar
Neelay Shah committed
257
258
259
		WithEventFilter(commonController.EphemeralDeploymentEventFilter(r.Config)).
		Complete(r)
}
260
261
262
263

// GetRecorder returns the event recorder stored in the reconciler's Recorder
// field.
func (r *DynamoGraphDeploymentReconciler) GetRecorder() record.EventRecorder {
	return r.Recorder
}