predicate.go 4.95 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
/*
 * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package controller_common

import (
	"context"
	"strings"
23
	"time"
24
25

	"k8s.io/apimachinery/pkg/api/meta"
26
27
	"k8s.io/client-go/discovery"
	ctrl "sigs.k8s.io/controller-runtime"
28
29
30
31
32
	"sigs.k8s.io/controller-runtime/pkg/client"
	"sigs.k8s.io/controller-runtime/pkg/log"
	"sigs.k8s.io/controller-runtime/pkg/predicate"
)

33
34
35
// GroveConfig holds the settings for the Grove integration.
type GroveConfig struct {
	// Enabled is automatically determined by checking if Grove CRDs are installed in the cluster.
	Enabled bool
	// TerminationDelay configures the termination delay for Grove PodCliqueSets.
	TerminationDelay time.Duration
}

40
41
42
43
44
// LWSConfig holds the settings for the LWS integration.
type LWSConfig struct {
	// Enabled is not meant to be set by hand: it is derived automatically
	// from whether the LWS CRDs are installed in the cluster.
	Enabled bool
}

45
46
47
48
49
// KaiSchedulerConfig holds the settings for the Kai-scheduler integration.
type KaiSchedulerConfig struct {
	// Enabled is not meant to be set by hand: it is derived automatically
	// from whether the Kai-scheduler CRDs are installed in the cluster.
	Enabled bool
}

50
51
52
type Config struct {
	// Enable resources filtering, only the resources belonging to the given namespace will be handled.
	RestrictedNamespace string
53
	Grove               GroveConfig
54
	LWS                 LWSConfig
55
	KaiScheduler        KaiSchedulerConfig
56
57
58
	EtcdAddress         string
	NatsAddress         string
	IngressConfig       IngressConfig
59
60
	// ModelExpressURL is the URL of the Model Express server to inject into all pods
	ModelExpressURL string
61
62
	// PrometheusEndpoint is the URL of the Prometheus endpoint to use for metrics
	PrometheusEndpoint string
63
64
65
66
67
68
69
70
71
72
73
}

// IngressConfig groups the ingress-related settings: a virtual service gateway
// plus the ingress controller class name, TLS secret and host suffix.
type IngressConfig struct {
	VirtualServiceGateway      string
	IngressControllerClassName string
	IngressControllerTLSSecret string
	IngressHostSuffix          string
}

// UseVirtualService reports whether a virtual service gateway is configured,
// i.e. whether VirtualServiceGateway is non-empty.
func (i *IngressConfig) UseVirtualService() bool {
	return i.VirtualServiceGateway != ""
}

76
77
78
// DetectGroveAvailability checks if Grove is available by checking if the
// "grove.io" API group is registered in the cluster.
// The lookup is delegated to detectAPIGroupAvailability, which uses the
// discovery client; any detection failure is reported as false.
func DetectGroveAvailability(ctx context.Context, mgr ctrl.Manager) bool {
	return detectAPIGroupAvailability(ctx, mgr, "grove.io")
}

// DetectLWSAvailability reports whether LWS is available, judged by whether
// the LWS API group is registered in the cluster.
// The lookup is delegated to detectAPIGroupAvailability.
func DetectLWSAvailability(ctx context.Context, mgr ctrl.Manager) bool {
	const lwsAPIGroup = "leaderworkerset.x-k8s.io"
	available := detectAPIGroupAvailability(ctx, mgr, lwsAPIGroup)
	return available
}

88
89
90
91
92
93
// DetectKaiSchedulerAvailability reports whether Kai-scheduler is available,
// judged by whether the scheduling.run.ai API group is registered in the cluster.
// The lookup is delegated to detectAPIGroupAvailability.
func DetectKaiSchedulerAvailability(ctx context.Context, mgr ctrl.Manager) bool {
	const kaiSchedulerAPIGroup = "scheduling.run.ai"
	available := detectAPIGroupAvailability(ctx, mgr, kaiSchedulerAPIGroup)
	return available
}

94
95
// detectAPIGroupAvailability checks if a specific API group is registered in the cluster.
//
// It builds a discovery client from the manager's REST config, lists the server
// groups and looks for an exact match on groupName. ctx is only used to obtain
// the logger. Every failure (nil config, client construction error, listing
// error) is logged and reported as false, so callers cannot distinguish a
// failed detection from a group that is genuinely absent.
func detectAPIGroupAvailability(ctx context.Context, mgr ctrl.Manager, groupName string) bool {
	logger := log.FromContext(ctx)

	cfg := mgr.GetConfig()
	if cfg == nil {
		logger.Info("detection failed, no discovery client available", "group", groupName)
		return false
	}

	discoveryClient, err := discovery.NewDiscoveryClientForConfig(cfg)
	if err != nil {
		logger.Error(err, "detection failed, could not create discovery client", "group", groupName)
		return false
	}

	apiGroups, err := discoveryClient.ServerGroups()
	if err != nil {
		logger.Error(err, "detection failed, could not list server groups", "group", groupName)
		return false
	}

	// Exact name comparison: the group is available only if it is listed verbatim.
	for _, group := range apiGroups.Groups {
		if group.Name == groupName {
			logger.Info("API group is available", "group", groupName)
			return true
		}
	}

	logger.Info("API group not available", "group", groupName)
	return false
}
126

127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
// EphemeralDeploymentEventFilter builds a predicate that decides, from an
// object's namespace, whether its events should be processed.
//
//   - If config.RestrictedNamespace is set, only objects in exactly that
//     namespace are accepted.
//   - Otherwise, objects whose namespace contains "ephemeral" are rejected and
//     everything else is accepted.
//
// Objects whose metadata cannot be read are rejected and the error is logged.
func EphemeralDeploymentEventFilter(config Config) predicate.Predicate {
	accept := func(obj client.Object) bool {
		logger := log.FromContext(context.Background())

		accessor, err := meta.Accessor(obj)
		if err != nil {
			logger.Error(err, "Error extracting object metadata")
			return false
		}

		namespace := accessor.GetNamespace()
		if restricted := config.RestrictedNamespace; restricted != "" {
			// Restricted mode: the namespace must match exactly.
			return namespace == restricted
		}

		// Unrestricted mode: drop anything destined to an ephemeral deployment.
		return !strings.Contains(namespace, "ephemeral")
	}

	return predicate.NewPredicateFuncs(accept)
}