Unverified Commit c3820050 authored by Julien Mancuso, committed by GitHub
Browse files

feat: use official Grove 0.1.0-alpha release (#3030)


Signed-off-by: Julien Mancuso <jmancuso@nvidia.com>
parent 2a61e29e
......@@ -35,11 +35,11 @@ dependencies:
repository: "https://charts.bitnami.com/bitnami"
condition: etcd.enabled
- name: kai-scheduler
version: v0.8.4
version: v0.9.2
repository: oci://ghcr.io/nvidia/kai-scheduler
condition: kai-scheduler.enabled
- name: grove-charts
alias: grove
version: v0.0.0-6e30275
version: v0.1.0-alpha.1
repository: oci://ghcr.io/nvidia/grove
condition: grove.enabled
......@@ -119,7 +119,7 @@ rules:
- apiGroups:
- grove.io
resources:
- podgangsets
- podcliquesets
verbs:
- create
- delete
......
......@@ -159,7 +159,7 @@ func main() {
flag.StringVar(&ingressHostSuffix, "ingress-host-suffix", "",
"The suffix to use for the ingress host")
flag.DurationVar(&groveTerminationDelay, "grove-termination-delay", consts.DefaultGroveTerminationDelay,
"The termination delay for Grove PodGangSets")
"The termination delay for Grove PodCliqueSets")
flag.StringVar(&modelExpressURL, "model-express-url", "",
"URL of the Model Express server to inject into all pods")
flag.StringVar(&prometheusEndpoint, "prometheus-endpoint", "",
......
......@@ -110,7 +110,7 @@ rules:
- apiGroups:
- grove.io
resources:
- podgangsets
- podcliquesets
verbs:
- create
- delete
......
......@@ -6,8 +6,9 @@ toolchain go1.24.3
require (
emperror.dev/errors v0.8.1
github.com/NVIDIA/grove/operator/api v0.0.0-20250825164137-da01400261a6
github.com/NVIDIA/grove/operator/api v0.1.0-alpha.1
github.com/bsm/gomega v1.27.10
github.com/go-logr/logr v1.4.2
github.com/google/go-cmp v0.7.0
github.com/imdario/mergo v0.3.6
github.com/onsi/ginkgo/v2 v2.23.4
......@@ -39,7 +40,6 @@ require (
github.com/evanphx/json-patch/v5 v5.9.11 // indirect
github.com/fsnotify/fsnotify v1.7.0 // indirect
github.com/fxamacker/cbor/v2 v2.7.0 // indirect
github.com/go-logr/logr v1.4.2 // indirect
github.com/go-logr/zapr v1.3.0 // indirect
github.com/go-openapi/jsonpointer v0.21.0 // indirect
github.com/go-openapi/jsonreference v0.21.0 // indirect
......
emperror.dev/errors v0.8.1 h1:UavXZ5cSX/4u9iyvH6aDcuGkVjeexUGJ7Ij7G4VfQT0=
emperror.dev/errors v0.8.1/go.mod h1:YcRvLPh626Ubn2xqtoprejnA5nFha+TJ+2vew48kWuE=
github.com/NVIDIA/grove/operator/api v0.0.0-20250825164137-da01400261a6 h1:JkW8LeRVsQH/YkRTz80T/JxlDgfk0URKgTUKyYKxbso=
github.com/NVIDIA/grove/operator/api v0.0.0-20250825164137-da01400261a6/go.mod h1:QlsR2wQLj9m/zVEqv5SsCPzyjN2ykYZ0r/NEnDf4WB4=
github.com/NVIDIA/grove/operator/api v0.1.0-alpha.1 h1:4DE6ZGa/3muBa5gk1GtJskMVss6GjeCPpn+xTnR1h9w=
github.com/NVIDIA/grove/operator/api v0.1.0-alpha.1/go.mod h1:QlsR2wQLj9m/zVEqv5SsCPzyjN2ykYZ0r/NEnDf4WB4=
github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM=
github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw=
github.com/blang/semver/v4 v4.0.0 h1:1PFHFE6yCCTv8C1TeyNNarDzntLi7wMI5i/pzqYIsAM=
......
......@@ -71,7 +71,7 @@ type DynamoGraphDeploymentReconciler struct {
// +kubebuilder:rbac:groups=nvidia.com,resources=dynamographdeployments,verbs=get;list;watch;create;update;patch;delete
// +kubebuilder:rbac:groups=nvidia.com,resources=dynamographdeployments/status,verbs=get;update;patch
// +kubebuilder:rbac:groups=nvidia.com,resources=dynamographdeployments/finalizers,verbs=update
// +kubebuilder:rbac:groups=grove.io,resources=podgangsets,verbs=get;list;watch;create;update;patch;delete
// +kubebuilder:rbac:groups=grove.io,resources=podcliquesets,verbs=get;list;watch;create;update;patch;delete
// +kubebuilder:rbac:groups=grove.io,resources=podcliques/scale,verbs=get;update;patch
// +kubebuilder:rbac:groups=grove.io,resources=podcliquescalinggroups/scale,verbs=get;update;patch
// +kubebuilder:rbac:groups=scheduling.run.ai,resources=queues,verbs=get;list
......@@ -258,12 +258,12 @@ func (r *DynamoGraphDeploymentReconciler) reconcileGroveScaling(ctx context.Cont
func (r *DynamoGraphDeploymentReconciler) reconcileGroveResources(ctx context.Context, dynamoDeployment *nvidiacomv1alpha1.DynamoGraphDeployment) (State, Reason, Message, error) {
logger := log.FromContext(ctx)
// generate the Grove PodCliqueSet from the deployment and the controller config
groveGangSet, err := dynamo.GenerateGrovePodGangSet(ctx, dynamoDeployment, r.Config, r.DockerSecretRetriever)
groveGangSet, err := dynamo.GenerateGrovePodCliqueSet(ctx, dynamoDeployment, r.Config, r.DockerSecretRetriever)
if err != nil {
logger.Error(err, "failed to generate the Grove GangSet")
return "", "", "", fmt.Errorf("failed to generate the Grove GangSet: %w", err)
}
_, syncedGroveGangSet, err := commonController.SyncResource(ctx, r, dynamoDeployment, func(ctx context.Context) (*grovev1alpha1.PodGangSet, bool, error) {
_, syncedGroveGangSet, err := commonController.SyncResource(ctx, r, dynamoDeployment, func(ctx context.Context) (*grovev1alpha1.PodCliqueSet, bool, error) {
return groveGangSet, false, nil
})
if err != nil {
......@@ -421,7 +421,7 @@ func (r *DynamoGraphDeploymentReconciler) SetupWithManager(mgr ctrl.Manager) err
})).
WithEventFilter(commonController.EphemeralDeploymentEventFilter(r.Config))
if r.Config.Grove.Enabled {
ctrlBuilder = ctrlBuilder.Owns(&grovev1alpha1.PodGangSet{}, builder.WithPredicates(predicate.Funcs{
ctrlBuilder = ctrlBuilder.Owns(&grovev1alpha1.PodCliqueSet{}, builder.WithPredicates(predicate.Funcs{
// ignore creation because we don't want to be called again after we create the PodCliqueSet
CreateFunc: func(ce event.CreateEvent) bool { return false },
DeleteFunc: func(de event.DeleteEvent) bool { return true },
......
......@@ -6,7 +6,7 @@ import (
grovev1alpha1 "github.com/NVIDIA/grove/operator/api/core/v1alpha1"
)
func CanonicalizePodGangSet(gangSet *grovev1alpha1.PodGangSet) *grovev1alpha1.PodGangSet {
func CanonicalizePodCliqueSet(gangSet *grovev1alpha1.PodCliqueSet) *grovev1alpha1.PodCliqueSet {
// sort cliques by name
sort.Slice(gangSet.Spec.Template.Cliques, func(i, j int) bool {
return gangSet.Spec.Template.Cliques[i].Name < gangSet.Spec.Template.Cliques[j].Name
......
......@@ -33,7 +33,7 @@ import (
type GroveConfig struct {
// Enabled is automatically determined by checking if Grove CRDs are installed in the cluster
Enabled bool
// TerminationDelay configures the termination delay for Grove PodGangSets
// TerminationDelay configures the termination delay for Grove PodCliqueSets
TerminationDelay time.Duration
}
......
......@@ -317,16 +317,16 @@ type SecretsRetriever interface {
GetSecrets(namespace, registry string) ([]string, error)
}
// applyCliqueStartupDependencies configures StartsAfter dependencies for cliques in a PodGangSet
// applyCliqueStartupDependencies configures StartsAfter dependencies for cliques in a PodCliqueSet
// based on the backend framework and multinode deployment patterns.
//
// Rules:
// - For VLLM and SGLang: worker cliques start after leader clique
// - For TRTLLM: leader clique starts after worker cliques
// - Only applies to multinode deployments (numberOfNodes > 1)
// - Sets the PodGangSet StartupType to Explicit if any dependencies are configured
// - Sets the PodCliqueSet StartupType to Explicit if any dependencies are configured
func applyCliqueStartupDependencies(
gangSet *grovev1alpha1.PodGangSet,
gangSet *grovev1alpha1.PodCliqueSet,
roles []ServiceRole,
backendFramework BackendFramework,
numberOfNodes int32,
......@@ -880,14 +880,14 @@ func GeneratePodSpecForComponent(
return podSpec, nil
}
// GenerateGrovePodGangSet generates a Grove PodGangSet for the given deployment, supporting both single-node and multinode cases.
func GenerateGrovePodGangSet(
// GenerateGrovePodCliqueSet generates a Grove PodCliqueSet for the given deployment, supporting both single-node and multinode cases.
func GenerateGrovePodCliqueSet(
ctx context.Context,
dynamoDeployment *v1alpha1.DynamoGraphDeployment,
controllerConfig controller_common.Config,
secretsRetriever SecretsRetriever,
) (*grovev1alpha1.PodGangSet, error) {
gangSet := &grovev1alpha1.PodGangSet{}
) (*grovev1alpha1.PodCliqueSet, error) {
gangSet := &grovev1alpha1.PodCliqueSet{}
gangSet.Name = dynamoDeployment.Name
gangSet.Namespace = dynamoDeployment.Namespace
gangSet.Spec.Replicas = 1
......@@ -986,7 +986,7 @@ func GenerateGrovePodGangSet(
gangSet.Spec.Template.PodCliqueScalingGroupConfigs = scalingGroups
}
return controller_common.CanonicalizePodGangSet(gangSet), nil
return controller_common.CanonicalizePodCliqueSet(gangSet), nil
}
func generateLabels(component *v1alpha1.DynamoComponentDeploymentOverridesSpec, dynamoDeployment *v1alpha1.DynamoGraphDeployment, componentName string) (map[string]string, error) {
......
......@@ -1048,7 +1048,7 @@ func sortEnvVars(envs []corev1.EnvVar) []corev1.EnvVar {
return sorted
}
func TestGenerateGrovePodGangSet(t *testing.T) {
func TestGenerateGrovePodCliqueSet(t *testing.T) {
type args struct {
ctx context.Context
dynamoDeployment *v1alpha1.DynamoGraphDeployment
......@@ -1057,11 +1057,11 @@ func TestGenerateGrovePodGangSet(t *testing.T) {
tests := []struct {
name string
args args
want *grovev1alpha1.PodGangSet
want *grovev1alpha1.PodCliqueSet
wantErr bool
}{
{
name: "test_generate_grove_pod_gang_set_single_node",
name: "test_generate_grove_pod_clique_set_single_node",
args: args{
ctx: context.Background(),
controllerConfig: controller_common.Config{
......@@ -1220,14 +1220,14 @@ func TestGenerateGrovePodGangSet(t *testing.T) {
},
},
},
want: &grovev1alpha1.PodGangSet{
want: &grovev1alpha1.PodCliqueSet{
ObjectMeta: metav1.ObjectMeta{
Name: "test-dynamo-graph-deployment",
Namespace: "test-namespace",
},
Spec: grovev1alpha1.PodGangSetSpec{
Spec: grovev1alpha1.PodCliqueSetSpec{
Replicas: 1,
Template: grovev1alpha1.PodGangSetTemplateSpec{
Template: grovev1alpha1.PodCliqueSetTemplateSpec{
StartupType: ptr.To(grovev1alpha1.CliqueStartupTypeAnyOrder),
HeadlessServiceConfig: &grovev1alpha1.HeadlessServiceConfig{
PublishNotReadyAddresses: true,
......@@ -1737,14 +1737,14 @@ func TestGenerateGrovePodGangSet(t *testing.T) {
},
},
},
want: &grovev1alpha1.PodGangSet{
want: &grovev1alpha1.PodCliqueSet{
ObjectMeta: metav1.ObjectMeta{
Name: "test-dynamo-graph-deployment",
Namespace: "test-namespace",
},
Spec: grovev1alpha1.PodGangSetSpec{
Spec: grovev1alpha1.PodCliqueSetSpec{
Replicas: 1,
Template: grovev1alpha1.PodGangSetTemplateSpec{
Template: grovev1alpha1.PodCliqueSetTemplateSpec{
HeadlessServiceConfig: &grovev1alpha1.HeadlessServiceConfig{
PublishNotReadyAddresses: true,
},
......@@ -2533,14 +2533,14 @@ func TestGenerateGrovePodGangSet(t *testing.T) {
},
},
},
want: &grovev1alpha1.PodGangSet{
want: &grovev1alpha1.PodCliqueSet{
ObjectMeta: metav1.ObjectMeta{
Name: "test-dynamo-graph-deployment",
Namespace: "test-namespace",
},
Spec: grovev1alpha1.PodGangSetSpec{
Spec: grovev1alpha1.PodCliqueSetSpec{
Replicas: 1,
Template: grovev1alpha1.PodGangSetTemplateSpec{
Template: grovev1alpha1.PodCliqueSetTemplateSpec{
StartupType: ptr.To(grovev1alpha1.CliqueStartupTypeAnyOrder),
HeadlessServiceConfig: &grovev1alpha1.HeadlessServiceConfig{
PublishNotReadyAddresses: true,
......@@ -3099,9 +3099,9 @@ func TestGenerateGrovePodGangSet(t *testing.T) {
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
got, err := GenerateGrovePodGangSet(tt.args.ctx, tt.args.dynamoDeployment, tt.args.controllerConfig, nil)
got, err := GenerateGrovePodCliqueSet(tt.args.ctx, tt.args.dynamoDeployment, tt.args.controllerConfig, nil)
if (err != nil) != tt.wantErr {
t.Errorf("GenerateGrovePodGangSet() error = %v, wantErr %v", err, tt.wantErr)
t.Errorf("GenerateGrovePodCliqueSet() error = %v, wantErr %v", err, tt.wantErr)
return
}
sort.Slice(got.Spec.Template.Cliques, func(i, j int) bool {
......@@ -3124,7 +3124,7 @@ func TestGenerateGrovePodGangSet(t *testing.T) {
}
if diff := cmp.Diff(got, tt.want); diff != "" {
t.Errorf("GenerateGrovePodGangSet() mismatch (-want +got):\n%s", diff)
t.Errorf("GenerateGrovePodCliqueSet() mismatch (-want +got):\n%s", diff)
}
})
}
......@@ -4072,10 +4072,10 @@ func XTestApplyCliqueStartupDependencies(t *testing.T) {
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
// Create a PodGangSet with cliques matching the roles
gangSet := &grovev1alpha1.PodGangSet{
Spec: grovev1alpha1.PodGangSetSpec{
Template: grovev1alpha1.PodGangSetTemplateSpec{
// Create a PodCliqueSet with cliques matching the roles
gangSet := &grovev1alpha1.PodCliqueSet{
Spec: grovev1alpha1.PodCliqueSetSpec{
Template: grovev1alpha1.PodCliqueSetTemplateSpec{
Cliques: []*grovev1alpha1.PodCliqueTemplateSpec{},
},
},
......@@ -4234,7 +4234,7 @@ func XTestGetCliqueStartupDependencies(t *testing.T) {
// deactivated for now.
// TODO: reactivate this when we have a better way to handle the readiness probe for the leader.
func XTestGenerateGrovePodGangSet_StartsAfterDependencies(t *testing.T) {
func XTestGenerateGrovePodCliqueSet_StartsAfterDependencies(t *testing.T) {
secretsRetriever := &mockSecretsRetriever{}
tests := []struct {
......@@ -4301,9 +4301,9 @@ func XTestGenerateGrovePodGangSet_StartsAfterDependencies(t *testing.T) {
NatsAddress: "nats-address",
}
got, err := GenerateGrovePodGangSet(context.Background(), dynamoDeployment, controllerConfig, secretsRetriever)
got, err := GenerateGrovePodCliqueSet(context.Background(), dynamoDeployment, controllerConfig, secretsRetriever)
if err != nil {
t.Errorf("GenerateGrovePodGangSet() error = %v", err)
t.Errorf("GenerateGrovePodCliqueSet() error = %v", err)
return
}
......
......@@ -38,7 +38,7 @@ helm upgrade --install dynamo-graph ./deploy/helm/chart -n dynamo-cloud -f ./com
### Installation using Grove
Same example as above, but using Grove PodGangSet resources.
Same example as above, but using Grove PodCliqueSet resources.
```bash
helm upgrade --install dynamo-graph ./deploy/helm/chart -n dynamo-cloud -f ./components/backends/vllm/deploy/agg.yaml --set deploymentType=grove
......@@ -72,10 +72,10 @@ The following table shows which deployment features are supported by the **Helm
| Feature | Helm Chart | Operator | Description |
|---------|------------|----------|-------------|
| **Singlenode** (k8sDeployments) | ✅ Supported | ✅ Supported | Single-node deployments using standard Kubernetes Deployments |
| **Singlenode** (Grove PodGangSet) | ✅ Supported | ✅ Supported | Single-node deployments using Grove PodGangSet resources |
| **Multinode** (Grove PodGangSet and LWS) | ❌ Not Supported | ✅ Supported | Multi-node deployments requiring Grove PodGangSet and LeaderWorkerSet (LWS) |
| **Singlenode** (Grove PodCliqueSet) | ✅ Supported | ✅ Supported | Single-node deployments using Grove PodCliqueSet resources |
| **Multinode** (Grove PodCliqueSet and LWS) | ❌ Not Supported | ✅ Supported | Multi-node deployments requiring Grove PodCliqueSet and LeaderWorkerSet (LWS) |
**Key Differences:**
- **Helm Chart**: Best for simple single-node deployments and quick testing. Supports both basic Kubernetes deployments and Grove PodGangSet resources.
- **Helm Chart**: Best for simple single-node deployments and quick testing. Supports both basic Kubernetes deployments and Grove PodCliqueSet resources.
- **Operator**: Required for advanced multi-node deployments. Provides full feature support including complex distributed inference configurations.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment