cert.go 11.3 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
/*
 * SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: Apache-2.0
 */

package cert

import (
	"context"
	"fmt"
	"os"
	"strings"

	configv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/config/v1alpha1"
	"github.com/go-logr/logr"
	certrotator "github.com/open-policy-agent/cert-controller/pkg/rotator"
	admissionregistrationv1 "k8s.io/api/admissionregistration/v1"
	corev1 "k8s.io/api/core/v1"
	apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1"
	apierrors "k8s.io/apimachinery/pkg/api/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/types"
	ctrl "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/client"
)

// Identity stamped on the CA that the certificate rotator is configured with.
const (
	certificateAuthorityName         = "Dynamo-Webhook-CA"
	certificateAuthorityOrganization = "NVIDIA"
)

// namespaceFile is the file the operator reads to learn its own namespace.
const namespaceFile = "/var/run/secrets/kubernetes.io/serviceaccount/namespace"

// dgdrCRDName is the CRD that receives the conversion webhook configuration.
const dgdrCRDName = "dynamographdeploymentrequests.nvidia.com"

// Label keys and values used to tag the webhook TLS secret and to select the
// webhook configurations that belong to this operator instance.
const (
	partOfLabel            = "app.kubernetes.io/part-of"
	partOfValue            = "dynamo-operator"
	operatorNamespaceLabel = "nvidia.com/dynamo-operator-namespace"
)

// CertProvisioner abstracts the mechanism that adds a certificate rotator to
// the controller-runtime manager. The default implementation delegates to the
// OPA cert-controller; tests can substitute a stub.
type CertProvisioner interface {
	// AddRotator registers rotator with mgr and reports any error from doing
	// so. CertManager invokes it from its auto-provisioning setup path.
	AddRotator(mgr ctrl.Manager, rotator *certrotator.CertRotator) error
}

// opaCertProvisioner is the production implementation backed by the OPA
// cert-controller library. It holds no state; NewCertManager installs it as
// the default provisioner.
type opaCertProvisioner struct{}

// AddRotator forwards mgr and rotator unchanged to certrotator.AddRotator and
// returns its result.
func (opaCertProvisioner) AddRotator(mgr ctrl.Manager, rotator *certrotator.CertRotator) error {
	return certrotator.AddRotator(mgr, rotator)
}

// CertManager manages webhook TLS certificate lifecycle.
// In auto mode it uses a CertProvisioner for generation and rotation.
// In manual mode it expects externally provided certificates and signals
// readiness immediately.
type CertManager struct {
	// client is used to look up and create the placeholder TLS secret.
	client client.Client
	// cfg supplies the provisioning mode, cert directory, secret name and
	// service name.
	cfg *configv1alpha1.WebhookServer
	// namespace is the operator's own namespace, resolved at construction.
	namespace string
	// ready is closed by Setup in manual mode, or handed to the rotator as
	// its IsReady channel in auto mode; WaitReady blocks on it.
	ready chan struct{}
	// logger is the controller-runtime logger named "cert-manager".
	logger logr.Logger
	// provisioner registers the rotator with the manager in auto mode.
	provisioner CertProvisioner
}

// NewCertManager builds a CertManager for the operator's own namespace.
//
// cl should be a direct (non-cached) client: Setup runs before the manager's
// cache has started. It is only used to create the placeholder secret in auto
// mode; RBAC is the actual access boundary. cfg is stored as-is and read
// later by Setup.
//
// An error is returned when the operator namespace cannot be determined.
func NewCertManager(cl client.Client, cfg *configv1alpha1.WebhookServer) (*CertManager, error) {
	namespace, nsErr := getOperatorNamespace()
	if nsErr != nil {
		return nil, fmt.Errorf("reading operator namespace: %w", nsErr)
	}

	manager := new(CertManager)
	manager.client = cl
	manager.cfg = cfg
	manager.namespace = namespace
	manager.ready = make(chan struct{})
	manager.logger = ctrl.Log.WithName("cert-manager")
	manager.provisioner = opaCertProvisioner{}
	return manager, nil
}

// Setup prepares certificate handling according to the configured provision
// mode:
//
//   - auto: delegates to setupAutoProvisioning, which registers the
//     cert-controller rotator with mgr;
//   - manual: logs the externally managed cert location and closes the ready
//     channel right away;
//   - anything else: returns an "unsupported cert provision mode" error.
//
// Because manual mode closes the ready channel, Setup is meant to be called
// once per CertManager.
func (cm *CertManager) Setup(ctx context.Context, mgr ctrl.Manager) error {
	mode := cm.cfg.CertProvisionMode

	if mode == configv1alpha1.CertProvisionModeAuto {
		return cm.setupAutoProvisioning(ctx, mgr)
	}
	if mode != configv1alpha1.CertProvisionModeManual {
		return fmt.Errorf("unsupported cert provision mode: %q", mode)
	}

	// Manual mode: certificates are supplied from outside, nothing to wait for.
	cm.logger.Info("Using externally provided certificates (manual mode)",
		"certDir", cm.cfg.CertDir, "secretName", cm.cfg.SecretName)
	close(cm.ready)
	return nil
}

// WaitReady blocks until the ready channel is closed, logging before and
// after the wait. In manual mode Setup closes the channel itself; in auto mode
// the channel is handed to the certificate rotator as its IsReady signal.
// There is no timeout: the call does not return until the channel is closed.
func (cm *CertManager) WaitReady() {
	log := cm.logger

	log.Info("Waiting for webhook certificates to be ready")
	<-cm.ready
	log.Info("Webhook certificates are ready")
}

// setupAutoProvisioning wires up cert-controller based certificate rotation.
// It first makes sure the webhook TLS secret exists, then registers a
// CertRotator for that secret through cm.provisioner. The rotator is given
// cm.ready as its IsReady channel, which is what WaitReady blocks on, and is
// configured with the service's DNS names in all four common forms.
func (cm *CertManager) setupAutoProvisioning(ctx context.Context, mgr ctrl.Manager) error {
	if err := cm.createPlaceholderSecretIfNotExists(ctx); err != nil {
		return fmt.Errorf("ensuring webhook TLS secret exists: %w", err)
	}

	// Service DNS forms, from shortest to fully qualified.
	shortName := cm.cfg.ServiceName
	withNamespace := shortName + "." + cm.namespace
	svcName := withNamespace + ".svc"
	clusterLocalName := svcName + ".cluster.local"

	cm.logger.Info("Auto-provisioning certificates using cert-controller",
		"secretName", cm.cfg.SecretName, "dnsName", svcName)

	secretKey := types.NamespacedName{Namespace: cm.namespace, Name: cm.cfg.SecretName}
	return cm.provisioner.AddRotator(mgr, &certrotator.CertRotator{
		SecretKey:              secretKey,
		CertDir:                cm.cfg.CertDir,
		CAName:                 certificateAuthorityName,
		CAOrganization:         certificateAuthorityOrganization,
		IsReady:                cm.ready,
		DNSName:                svcName,
		ExtraDNSNames:          []string{shortName, withNamespace, clusterLocalName},
		EnableReadinessCheck:   true,
		RestartOnSecretRefresh: true,
	})
}

// createPlaceholderSecretIfNotExists makes sure the webhook TLS secret is
// present so the OPA cert-controller, which can only Update existing secrets
// and not Create them, has something to write into.
//
// An existing secret is left untouched. When the secret is missing, an empty
// kubernetes.io/tls secret carrying the operator's labels is created; losing a
// creation race (AlreadyExists) counts as success. Lookup errors other than
// NotFound are returned unchanged; creation errors are wrapped.
func (cm *CertManager) createPlaceholderSecretIfNotExists(ctx context.Context) error {
	key := types.NamespacedName{Namespace: cm.namespace, Name: cm.cfg.SecretName}

	getErr := cm.client.Get(ctx, key, &corev1.Secret{})
	if getErr == nil {
		// Secret already exists; nothing to do.
		return nil
	}
	if !apierrors.IsNotFound(getErr) {
		return getErr
	}

	labels := map[string]string{
		"app.kubernetes.io/managed-by": "dynamo-operator",
		"app.kubernetes.io/component":  "webhook",
		partOfLabel:                    partOfValue,
	}
	emptyData := map[string][]byte{
		"tls.crt": {},
		"tls.key": {},
		"ca.crt":  {},
	}
	placeholder := &corev1.Secret{
		ObjectMeta: metav1.ObjectMeta{
			Namespace: key.Namespace,
			Name:      key.Name,
			Labels:    labels,
		},
		Type: corev1.SecretTypeTLS,
		Data: emptyData,
	}

	createErr := cm.client.Create(ctx, placeholder)
	switch {
	case createErr == nil:
		cm.logger.Info("Created webhook TLS secret", "namespace", cm.namespace, "name", cm.cfg.SecretName)
		return nil
	case apierrors.IsAlreadyExists(createErr):
		// Someone else created it between our Get and Create.
		return nil
	default:
		return fmt.Errorf("creating webhook TLS secret: %w", createErr)
	}
}

// CABundleInjector discovers webhook configurations owned by this operator
// instance and patches them with the CA bundle from the cert secret.
type CABundleInjector struct {
	// client reads the cert secret and lists/patches webhook configurations
	// and the DGDR CRD.
	client client.Client
	// cfg supplies the webhook secret name and service name
	// (cfg.Server.Webhook).
	cfg *configv1alpha1.OperatorConfiguration
	// namespace is the operator's own namespace; it locates the secret and
	// is the value matched against the operator-namespace label.
	namespace string
	// logger is the controller-runtime logger named "ca-bundle-injector".
	logger logr.Logger
}

// NewCABundleInjector builds a CABundleInjector bound to the operator's own
// namespace. cl should be the manager's cached client (available after
// mgr.Start). cfg is stored as-is; its Server.Webhook section is read on each
// injection.
//
// An error is returned when the operator namespace cannot be determined.
func NewCABundleInjector(cl client.Client, cfg *configv1alpha1.OperatorConfiguration) (*CABundleInjector, error) {
	namespace, nsErr := getOperatorNamespace()
	if nsErr != nil {
		return nil, fmt.Errorf("reading operator namespace: %w", nsErr)
	}

	injector := new(CABundleInjector)
	injector.client = cl
	injector.cfg = cfg
	injector.namespace = namespace
	injector.logger = ctrl.Log.WithName("ca-bundle-injector")
	return injector, nil
}

// InjectAll loads the CA bundle from the cert secret and pushes it to every
// consumer, in order: validating webhook configurations, mutating webhook
// configurations (both selected by this operator instance's labels), and the
// DGDR CRD conversion webhook. It stops at the first failing step and returns
// that step's error; a failure to read the bundle is wrapped with the secret's
// namespace/name.
func (i *CABundleInjector) InjectAll(ctx context.Context) error {
	caBundle, err := i.readCABundle(ctx)
	if err != nil {
		return fmt.Errorf("reading CA bundle from secret %s/%s: %w", i.namespace, i.cfg.Server.Webhook.SecretName, err)
	}

	// Order matters only in that the first error aborts the remaining steps.
	steps := []func(context.Context, []byte) error{
		i.injectIntoValidatingWebhooks,
		i.injectIntoMutatingWebhooks,
		i.ensureCRDConversion,
	}
	for _, step := range steps {
		if stepErr := step(ctx, caBundle); stepErr != nil {
			return stepErr
		}
	}

	i.logger.Info("CA bundle injected into all webhook configurations")
	return nil
}

// readCABundle fetches the webhook cert secret from the operator namespace
// and returns the bytes stored under "ca.crt". A lookup failure is returned
// unchanged; a missing or zero-length "ca.crt" entry yields an error naming
// the secret.
func (i *CABundleInjector) readCABundle(ctx context.Context) ([]byte, error) {
	secretName := i.cfg.Server.Webhook.SecretName
	key := types.NamespacedName{Namespace: i.namespace, Name: secretName}

	var certSecret corev1.Secret
	if err := i.client.Get(ctx, key, &certSecret); err != nil {
		return nil, err
	}

	// Indexing a missing key yields a nil slice, so one length check covers
	// both "absent" and "present but empty".
	bundle := certSecret.Data["ca.crt"]
	if len(bundle) == 0 {
		return nil, fmt.Errorf("ca.crt not found or empty in secret %s/%s", i.namespace, i.cfg.Server.Webhook.SecretName)
	}
	return bundle, nil
}

// webhookLabels returns the label selector identifying webhook configurations
// that belong to this operator instance: the part-of label plus the
// operator-namespace label set to the injector's namespace.
func (i *CABundleInjector) webhookLabels() client.MatchingLabels {
	selector := make(client.MatchingLabels, 2)
	selector[partOfLabel] = partOfValue
	selector[operatorNamespaceLabel] = i.namespace
	return selector
}

// injectIntoValidatingWebhooks sets caBundle on every webhook entry of each
// ValidatingWebhookConfiguration selected by webhookLabels.
//
// A configuration is patched (merge patch against its pre-modification copy)
// only when at least one of its webhooks carries a different bundle.
// Configurations that are already up to date are skipped, so repeated calls
// do not issue no-op writes or report an injection that did not happen.
//
// It returns a wrapped error if listing fails or on the first failing patch;
// configurations patched before the failure stay patched.
func (i *CABundleInjector) injectIntoValidatingWebhooks(ctx context.Context, caBundle []byte) error {
	list := &admissionregistrationv1.ValidatingWebhookConfigurationList{}
	if err := i.client.List(ctx, list, i.webhookLabels()); err != nil {
		return fmt.Errorf("listing validating webhook configurations: %w", err)
	}
	for idx := range list.Items {
		wc := &list.Items[idx]
		original := wc.DeepCopy()

		changed := false
		for j := range wc.Webhooks {
			hook := &wc.Webhooks[j]
			// Byte-wise comparison; the string conversions do not allocate
			// when used directly in a comparison.
			if string(hook.ClientConfig.CABundle) == string(caBundle) {
				continue
			}
			hook.ClientConfig.CABundle = caBundle
			changed = true
		}
		if !changed {
			i.logger.V(1).Info("CA bundle already up to date in ValidatingWebhookConfiguration", "name", wc.Name)
			continue
		}

		if err := i.client.Patch(ctx, wc, client.MergeFrom(original)); err != nil {
			return fmt.Errorf("patching validating webhook config %s: %w", wc.Name, err)
		}
		i.logger.Info("Injected CA bundle into ValidatingWebhookConfiguration", "name", wc.Name)
	}
	return nil
}

// injectIntoMutatingWebhooks sets caBundle on every webhook entry of each
// MutatingWebhookConfiguration selected by webhookLabels.
//
// A configuration is patched (merge patch against its pre-modification copy)
// only when at least one of its webhooks carries a different bundle.
// Configurations that are already up to date are skipped, so repeated calls
// do not issue no-op writes or report an injection that did not happen.
//
// It returns a wrapped error if listing fails or on the first failing patch;
// configurations patched before the failure stay patched.
func (i *CABundleInjector) injectIntoMutatingWebhooks(ctx context.Context, caBundle []byte) error {
	list := &admissionregistrationv1.MutatingWebhookConfigurationList{}
	if err := i.client.List(ctx, list, i.webhookLabels()); err != nil {
		return fmt.Errorf("listing mutating webhook configurations: %w", err)
	}
	for idx := range list.Items {
		wc := &list.Items[idx]
		original := wc.DeepCopy()

		changed := false
		for j := range wc.Webhooks {
			hook := &wc.Webhooks[j]
			// Byte-wise comparison; the string conversions do not allocate
			// when used directly in a comparison.
			if string(hook.ClientConfig.CABundle) == string(caBundle) {
				continue
			}
			hook.ClientConfig.CABundle = caBundle
			changed = true
		}
		if !changed {
			i.logger.V(1).Info("CA bundle already up to date in MutatingWebhookConfiguration", "name", wc.Name)
			continue
		}

		if err := i.client.Patch(ctx, wc, client.MergeFrom(original)); err != nil {
			return fmt.Errorf("patching mutating webhook config %s: %w", wc.Name, err)
		}
		i.logger.Info("Injected CA bundle into MutatingWebhookConfiguration", "name", wc.Name)
	}
	return nil
}

// ensureCRDConversion points the DGDR CRD's conversion at this operator's
// webhook service. The CRD's whole spec.conversion is replaced with a Webhook
// strategy whose client config references the configured service in the
// operator namespace at path "/convert", carries caBundle, and accepts "v1"
// conversion reviews. The change is sent as a merge patch against the CRD as
// it was fetched.
//
// A missing CRD is not an error: the step is logged and skipped. Other lookup
// failures and patch failures are returned wrapped with the CRD name.
func (i *CABundleInjector) ensureCRDConversion(ctx context.Context, caBundle []byte) error {
	crd := new(apiextensionsv1.CustomResourceDefinition)
	getErr := i.client.Get(ctx, types.NamespacedName{Name: dgdrCRDName}, crd)
	if apierrors.IsNotFound(getErr) {
		i.logger.Info("DGDR CRD not found, skipping conversion webhook setup")
		return nil
	}
	if getErr != nil {
		return fmt.Errorf("getting CRD %s: %w", dgdrCRDName, getErr)
	}

	// Snapshot taken before mutation; it is the base of the merge patch.
	before := crd.DeepCopy()

	convertPath := "/convert"
	service := &apiextensionsv1.ServiceReference{
		Name:      i.cfg.Server.Webhook.ServiceName,
		Namespace: i.namespace,
		Path:      &convertPath,
	}
	webhook := &apiextensionsv1.WebhookConversion{
		ClientConfig: &apiextensionsv1.WebhookClientConfig{
			Service:  service,
			CABundle: caBundle,
		},
		ConversionReviewVersions: []string{"v1"},
	}
	crd.Spec.Conversion = &apiextensionsv1.CustomResourceConversion{
		Strategy: apiextensionsv1.WebhookConverter,
		Webhook:  webhook,
	}

	if patchErr := i.client.Patch(ctx, crd, client.MergeFrom(before)); patchErr != nil {
		return fmt.Errorf("patching CRD %s conversion config: %w", dgdrCRDName, patchErr)
	}
	i.logger.Info("Configured CRD conversion webhook", "crd", dgdrCRDName)
	return nil
}

// getOperatorNamespace returns the namespace recorded in namespaceFile with
// surrounding whitespace removed. It fails when the file cannot be read or
// when its trimmed content is empty.
func getOperatorNamespace() (string, error) {
	raw, readErr := os.ReadFile(namespaceFile)
	if readErr != nil {
		return "", fmt.Errorf("reading namespace from %s: %w", namespaceFile, readErr)
	}

	if namespace := strings.TrimSpace(string(raw)); namespace != "" {
		return namespace, nil
	}
	return "", fmt.Errorf("operator namespace is empty")
}