Unverified Commit 7d78fdad authored by Julien Mancuso's avatar Julien Mancuso Committed by GitHub
Browse files

fix: autodetect Volcano installation (#3956)


Signed-off-by: Julien Mancuso <jmancuso@nvidia.com>
parent ce576c0c
......@@ -295,6 +295,9 @@ func main() {
mgrOpts.Cache.DefaultNamespaces = map[string]cache.Config{
restrictedNamespace: {},
}
setupLog.Info("Restricted namespace configured, launching in restricted mode", "namespace", restrictedNamespace)
} else {
setupLog.Info("No restricted namespace configured, launching in cluster-wide mode")
}
mgr, err := ctrl.NewManager(ctrl.GetConfigOrDie(), mgrOpts)
if err != nil {
......@@ -308,13 +311,22 @@ func main() {
ctrlConfig.Grove.Enabled = groveEnabled
setupLog.Info("Detecting LWS availability...")
lwsEnabled := commonController.DetectLWSAvailability(mainCtx, mgr)
ctrlConfig.LWS.Enabled = lwsEnabled
setupLog.Info("Detecting Volcano availability...")
volcanoEnabled := commonController.DetectVolcanoAvailability(mainCtx, mgr)
// LWS for multinode deployment usage depends on both LWS and Volcano availability
ctrlConfig.LWS.Enabled = lwsEnabled && volcanoEnabled
// Detect Kai-scheduler availability using discovery client
setupLog.Info("Detecting Kai-scheduler availability...")
kaiSchedulerEnabled := commonController.DetectKaiSchedulerAvailability(mainCtx, mgr)
ctrlConfig.KaiScheduler.Enabled = kaiSchedulerEnabled
setupLog.Info("Detected orchestrators availability",
"grove", groveEnabled,
"lws", lwsEnabled,
"volcano", volcanoEnabled,
"kai-scheduler", kaiSchedulerEnabled,
)
// Create etcd client
cli, err := clientv3.New(clientv3.Config{
Endpoints: []string{etcdAddr},
......
......@@ -101,6 +101,12 @@ func DetectLWSAvailability(ctx context.Context, mgr ctrl.Manager) bool {
return detectAPIGroupAvailability(ctx, mgr, "leaderworkerset.x-k8s.io")
}
// DetectVolcanoAvailability reports whether Volcano appears to be installed.
// It delegates to detectAPIGroupAvailability, asking whether the
// "scheduling.volcano.sh" API group is registered, and returns that result
// unchanged.
func DetectVolcanoAvailability(ctx context.Context, mgr ctrl.Manager) bool {
	// API group used as the marker for a Volcano installation.
	const volcanoAPIGroup = "scheduling.volcano.sh"

	available := detectAPIGroupAvailability(ctx, mgr, volcanoAPIGroup)
	return available
}
// DetectKaiSchedulerAvailability checks if Kai-scheduler is available by checking if the scheduling.run.ai API group is registered
// This approach uses the discovery client which is simpler and more reliable
func DetectKaiSchedulerAvailability(ctx context.Context, mgr ctrl.Manager) bool {
......
......@@ -131,14 +131,24 @@ Found existing namespace-restricted Dynamo operators in namespaces: ...
```
> [!TIP]
> For multinode deployments, you need to enable Grove and KAI Scheduler.
> You might choose to install them manually or through the dynamo-platform helm install command.
> When using the dynamo-platform helm install command, Grove and KAI Scheduler are NOT installed by default. You can enable their installation by setting the following flags in the helm install command:
```bash
--set "grove.enabled=true"
--set "kai-scheduler.enabled=true"
```
> For multinode deployments, you need to install multinode orchestration components:
>
> **Option 1 (Recommended): Grove + KAI Scheduler**
> - Grove and KAI Scheduler can be installed manually or through the dynamo-platform helm install command.
> - When using the dynamo-platform helm install command, Grove and KAI Scheduler are NOT installed by default. You can enable their installation by setting the following flags:
>
> ```bash
> --set "grove.enabled=true"
> --set "kai-scheduler.enabled=true"
> ```
>
> **Option 2: LeaderWorkerSet (LWS) + Volcano**
> - If using LWS for multinode deployments, you must also install Volcano (required dependency):
> - [LWS Installation](https://github.com/kubernetes-sigs/lws#installation)
> - [Volcano Installation](https://volcano.sh/en/docs/installation/) (required for gang scheduling with LWS)
> - These must be installed manually before deploying multinode workloads with LWS.
>
> See the [Multinode Deployment Guide](./deployment/multinode-deployment.md) for details on orchestrator selection.
> [!TIP]
> By default, Model Express Server is not used.
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment