dynamographdeployment_controller.go 8.2 KB
Newer Older
Neelay Shah's avatar
Neelay Shah committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
/*
 * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package controller

import (
	"context"
	"fmt"
	"sort"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/tools/record"
	ctrl "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/builder"
	"sigs.k8s.io/controller-runtime/pkg/client"
	"sigs.k8s.io/controller-runtime/pkg/event"
	"sigs.k8s.io/controller-runtime/pkg/log"
	"sigs.k8s.io/controller-runtime/pkg/predicate"

	nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/cloud/operator/api/v1alpha1"
	commonController "github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/controller_common"
	"github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/dynamo"
)

// State values written to a DynamoGraphDeployment through SetState.
const (
	// FailedState is recorded when a reconciliation ends with an error.
	FailedState = "failed"
	// ReadyState is recorded when every component deployment is ready.
	ReadyState = "successful"
	// PendingState is recorded while at least one component deployment is
	// not ready yet.
	PendingState = "pending"
)

44
45
46
47
// etcdStorage is the minimal key-deletion capability expected from a
// storage backend.
type etcdStorage interface {
	// DeleteKeys takes a key prefix and reports any failure as an error.
	// NOTE(review): presumably removes every key under prefix — confirm
	// against the concrete implementation.
	DeleteKeys(ctx context.Context, prefix string) error
}

48
49
// DynamoGraphDeploymentReconciler reconciles a DynamoGraphDeployment object
type DynamoGraphDeploymentReconciler struct {
Neelay Shah's avatar
Neelay Shah committed
50
	client.Client
51
52
53
54
55
56
	Config                     commonController.Config
	Recorder                   record.EventRecorder
	VirtualServiceGateway      string
	IngressControllerClassName string
	IngressControllerTLSSecret string
	IngressHostSuffix          string
Neelay Shah's avatar
Neelay Shah committed
57
58
}

59
60
61
// +kubebuilder:rbac:groups=nvidia.com,resources=dynamographdeployments,verbs=get;list;watch;create;update;patch;delete
// +kubebuilder:rbac:groups=nvidia.com,resources=dynamographdeployments/status,verbs=get;update;patch
// +kubebuilder:rbac:groups=nvidia.com,resources=dynamographdeployments/finalizers,verbs=update
Neelay Shah's avatar
Neelay Shah committed
62
63
64
65

// Reconcile is part of the main kubernetes reconciliation loop which aims to
// move the current state of the cluster closer to the desired state.
// TODO(user): Modify the Reconcile function to compare the state specified by
66
// the DynamoGraphDeployment object against the actual cluster state, and then
Neelay Shah's avatar
Neelay Shah committed
67
68
69
70
71
// perform operations to make the cluster state reflect the state specified by
// the user.
//
// For more details, check Reconcile and its Result here:
// - https://pkg.go.dev/sigs.k8s.io/controller-runtime@v0.19.1/pkg/reconcile
72
func (r *DynamoGraphDeploymentReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
Neelay Shah's avatar
Neelay Shah committed
73
74
75
76
	logger := log.FromContext(ctx)

	var err error
	reason := "undefined"
77
	message := ""
Neelay Shah's avatar
Neelay Shah committed
78
79
	readyStatus := metav1.ConditionFalse
	// retrieve the CRD
80
	dynamoDeployment := &nvidiacomv1alpha1.DynamoGraphDeployment{}
Neelay Shah's avatar
Neelay Shah committed
81
82
83
84
85
86
87
88
89
90
91
92
	if err = r.Get(ctx, req.NamespacedName, dynamoDeployment); err != nil {
		return ctrl.Result{}, client.IgnoreNotFound(err)
	}
	if err != nil {
		// not found, nothing to do
		return ctrl.Result{}, nil
	}

	defer func() {
		if err != nil {
			dynamoDeployment.SetState(FailedState)
			message = err.Error()
93
			logger.Error(err, "Reconciliation failed")
Neelay Shah's avatar
Neelay Shah committed
94
95
		}
		// update the CRD status condition
96
97
98
99
100
101
102
		dynamoDeployment.AddStatusCondition(metav1.Condition{
			Type:               "Ready",
			Status:             readyStatus,
			Reason:             reason,
			Message:            message,
			LastTransitionTime: metav1.Now(),
		})
Neelay Shah's avatar
Neelay Shah committed
103
104
105
106
107
108
109
		err = r.Status().Update(ctx, dynamoDeployment)
		if err != nil {
			logger.Error(err, "Unable to update the CRD status", "crd", req.NamespacedName)
		}
		logger.Info("Reconciliation done")
	}()

110
111
	deleted, err := commonController.HandleFinalizer(ctx, dynamoDeployment, r.Client, r)
	if err != nil {
112
		logger.Error(err, "failed to handle the finalizer")
113
114
115
116
117
118
119
		reason = "failed_to_handle_the_finalizer"
		return ctrl.Result{}, err
	}
	if deleted {
		return ctrl.Result{}, nil
	}

120
	// generate the dynamoComponentsDeployments from the config
121
	dynamoComponentsDeployments, err := dynamo.GenerateDynamoComponentsDeployments(ctx, dynamoDeployment, r.generateDefaultIngressSpec(dynamoDeployment))
Neelay Shah's avatar
Neelay Shah committed
122
	if err != nil {
123
		logger.Error(err, "failed to generate the DynamoComponentsDeployments and DynamoComponents")
124
		reason = "failed_to_generate_the_DynamoComponentsDeployments"
Neelay Shah's avatar
Neelay Shah committed
125
126
127
		return ctrl.Result{}, err
	}

128
	// merge the dynamoComponentsDeployments with the dynamoComponentsDeployments from the CRD
129
	for _, deployment := range dynamoComponentsDeployments {
130
		if deployment.Spec.Ingress.Enabled {
131
			dynamoDeployment.SetEndpointStatus(r.isEndpointSecured(), getIngressHost(deployment.Spec.Ingress))
132
		}
Neelay Shah's avatar
Neelay Shah committed
133
134
	}

135
	notReadyDeployments := []string{}
136
137
	// reconcile the dynamoComponentsDeployments
	for serviceName, dynamoComponentDeployment := range dynamoComponentsDeployments {
138
		logger.Info("Reconciling the DynamoComponentDeployment", "serviceName", serviceName, "dynamoComponentDeployment", dynamoComponentDeployment)
139
140
141
		_, dynamoComponentDeployment, err = commonController.SyncResource(ctx, r, dynamoDeployment, func(ctx context.Context) (*nvidiacomv1alpha1.DynamoComponentDeployment, bool, error) {
			return dynamoComponentDeployment, false, nil
		})
Neelay Shah's avatar
Neelay Shah committed
142
		if err != nil {
143
			logger.Error(err, "failed to sync the DynamoComponentDeployment")
144
			reason = "failed_to_sync_the_DynamoComponentDeployment"
Neelay Shah's avatar
Neelay Shah committed
145
146
			return ctrl.Result{}, err
		}
147
148
		if !dynamoComponentDeployment.Status.IsReady() {
			notReadyDeployments = append(notReadyDeployments, dynamoComponentDeployment.Name)
Neelay Shah's avatar
Neelay Shah committed
149
150
		}
	}
151
	if len(notReadyDeployments) == 0 {
Neelay Shah's avatar
Neelay Shah committed
152
		dynamoDeployment.SetState(ReadyState)
153
154
		reason = "all_deployments_are_ready"
		message = "All deployments are ready"
Neelay Shah's avatar
Neelay Shah committed
155
156
		readyStatus = metav1.ConditionTrue
	} else {
157
158
		reason = "some_deployments_are_not_ready"
		message = fmt.Sprintf("The following deployments are not ready: %v", notReadyDeployments)
Neelay Shah's avatar
Neelay Shah committed
159
160
161
162
163
164
165
		dynamoDeployment.SetState(PendingState)
	}

	return ctrl.Result{}, nil

}

166
// generateDefaultIngressSpec builds the default ingress spec for
// dynamoDeployment from the reconciler's ingress settings.
//
// Ingress is enabled when a virtual service gateway or an ingress controller
// class is configured, and the host is the deployment name. Optional settings
// are only set on the spec when the matching reconciler field is non-empty;
// the pointer fields reference the reconciler's own fields.
func (r *DynamoGraphDeploymentReconciler) generateDefaultIngressSpec(dynamoDeployment *nvidiacomv1alpha1.DynamoGraphDeployment) *nvidiacomv1alpha1.IngressSpec {
	hasGateway := r.VirtualServiceGateway != ""
	hasIngressClass := r.IngressControllerClassName != ""

	spec := &nvidiacomv1alpha1.IngressSpec{
		Enabled:           hasGateway || hasIngressClass,
		Host:              dynamoDeployment.Name,
		UseVirtualService: hasGateway,
	}

	if hasGateway {
		spec.VirtualServiceGateway = &r.VirtualServiceGateway
	}
	if hasIngressClass {
		spec.IngressControllerClassName = &r.IngressControllerClassName
	}
	if secret := r.IngressControllerTLSSecret; secret != "" {
		spec.TLS = &nvidiacomv1alpha1.IngressTLSSpec{SecretName: secret}
	}
	if r.IngressHostSuffix != "" {
		spec.HostSuffix = &r.IngressHostSuffix
	}
	return spec
}

189
// isEndpointSecured reports whether the exposed endpoint is considered
// secured: either a virtual service gateway is configured and the config
// says it supports HTTPS, or an ingress TLS secret is configured.
func (r *DynamoGraphDeploymentReconciler) isEndpointSecured() bool {
	viaHTTPSGateway := r.VirtualServiceGateway != "" && r.Config.VirtualServiceSupportsHTTPS
	return viaHTTPSGateway || r.IngressControllerTLSSecret != ""
}

196
// FinalizeResource is the cleanup hook for a DynamoGraphDeployment. It
// currently performs no cleanup and always returns nil.
//
// NOTE(review): presumably invoked by commonController.HandleFinalizer, which
// receives the reconciler in Reconcile — confirm against that helper.
func (r *DynamoGraphDeploymentReconciler) FinalizeResource(ctx context.Context, dynamoDeployment *nvidiacomv1alpha1.DynamoGraphDeployment) error {
	// for now doing nothing
	return nil
}

Neelay Shah's avatar
Neelay Shah committed
201
// SetupWithManager sets up the controller with the Manager.
202
func (r *DynamoGraphDeploymentReconciler) SetupWithManager(mgr ctrl.Manager) error {
Neelay Shah's avatar
Neelay Shah committed
203
	return ctrl.NewControllerManagedBy(mgr).
204
		For(&nvidiacomv1alpha1.DynamoGraphDeployment{}, builder.WithPredicates(
205
206
			predicate.GenerationChangedPredicate{},
		)).
207
208
		Named("dynamographdeployment").
		Owns(&nvidiacomv1alpha1.DynamoComponentDeployment{}, builder.WithPredicates(predicate.Funcs{
Neelay Shah's avatar
Neelay Shah committed
209
210
211
212
213
214
215
216
217
			// ignore creation cause we don't want to be called again after we create the deployment
			CreateFunc:  func(ce event.CreateEvent) bool { return false },
			DeleteFunc:  func(de event.DeleteEvent) bool { return true },
			UpdateFunc:  func(de event.UpdateEvent) bool { return true },
			GenericFunc: func(ge event.GenericEvent) bool { return true },
		})).
		WithEventFilter(commonController.EphemeralDeploymentEventFilter(r.Config)).
		Complete(r)
}
218
219
220
221

// GetRecorder returns the event recorder configured on the reconciler.
func (r *DynamoGraphDeploymentReconciler) GetRecorder() record.EventRecorder {
	recorder := r.Recorder
	return recorder
}