Unverified Commit ee3a8e42 authored by julienmancuso's avatar julienmancuso Committed by GitHub
Browse files

feat: add initial Grove support (#2012)

parent 19a77ae7
......@@ -79,5 +79,8 @@ spec:
mainContainer:
image: nvcr.io/nvidian/nim-llm-dev/vllm_v1-runtime:dep-216.4
workingDir: /workspace/components/backends/vllm
command:
- /bin/sh
- -c
args:
- "python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --enforce-eager 2>&1 | tee /tmp/vllm.log"
- python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --enforce-eager 2>&1 | tee /tmp/vllm.log
......@@ -79,5 +79,8 @@ spec:
mainContainer:
image: nvcr.io/nvidian/nim-llm-dev/vllm_v1-runtime:dep-216.4
workingDir: /workspace/components/backends/vllm
command:
- /bin/sh
- -c
args:
- "python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --enforce-eager 2>&1 | tee /tmp/vllm.log"
- python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --enforce-eager 2>&1 | tee /tmp/vllm.log
......@@ -79,6 +79,9 @@ spec:
mainContainer:
image: nvcr.io/nvidian/nim-llm-dev/vllm_v1-runtime:dep-216.4
workingDir: /workspace/components/backends/vllm
command:
- /bin/sh
- -c
args:
- "python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --enforce-eager 2>&1 | tee /tmp/vllm.log"
VllmPrefillWorker:
......@@ -118,5 +121,8 @@ spec:
mainContainer:
image: nvcr.io/nvidian/nim-llm-dev/vllm_v1-runtime:dep-216.4
workingDir: /workspace/components/backends/vllm
command:
- /bin/sh
- -c
args:
- "python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --enforce-eager --is-prefill-worker 2>&1 | tee /tmp/vllm.log"
......@@ -118,5 +118,8 @@ spec:
mainContainer:
image: nvcr.io/nvidian/nim-llm-dev/vllm_v1-runtime:dep-216.4
workingDir: /workspace/components/backends/vllm
command:
- /bin/sh
- -c
args:
- "python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --enforce-eager --is-prefill-worker 2>&1 | tee /tmp/vllm.log"
- python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --enforce-eager --is-prefill-worker 2>&1 | tee /tmp/vllm.log
......@@ -79,8 +79,11 @@ spec:
mainContainer:
image: nvcr.io/nvidian/nim-llm-dev/vllm_v1-runtime:dep-216.4
workingDir: /workspace/components/backends/vllm
command:
- /bin/sh
- -c
args:
- "python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --enforce-eager 2>&1 | tee /tmp/vllm.log"
- python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --enforce-eager 2>&1 | tee /tmp/vllm.log
VllmPrefillWorker:
dynamoNamespace: vllm-v1-disagg-router
envFromSecret: hf-token-secret
......@@ -118,5 +121,8 @@ spec:
mainContainer:
image: nvcr.io/nvidian/nim-llm-dev/vllm_v1-runtime:dep-216.4
workingDir: /workspace/components/backends/vllm
command:
- /bin/sh
- -c
args:
- "python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --enforce-eager --is-prefill-worker 2>&1 | tee /tmp/vllm.log"
- python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --enforce-eager --is-prefill-worker 2>&1 | tee /tmp/vllm.log
......@@ -89,6 +89,12 @@ spec:
stabilizationWindowSeconds:
format: int32
type: integer
tolerance:
anyOf:
- type: integer
- type: string
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
x-kubernetes-int-or-string: true
type: object
scaleUp:
properties:
......@@ -115,6 +121,12 @@ spec:
stabilizationWindowSeconds:
format: int32
type: integer
tolerance:
anyOf:
- type: integer
- type: string
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
x-kubernetes-int-or-string: true
type: object
type: object
enabled:
......@@ -1163,6 +1175,8 @@ spec:
- port
type: object
type: object
stopSignal:
type: string
type: object
livenessProbe:
properties:
......@@ -1842,6 +1856,8 @@ spec:
- port
type: object
type: object
stopSignal:
type: string
type: object
livenessProbe:
properties:
......
......@@ -148,6 +148,12 @@ spec:
stabilizationWindowSeconds:
format: int32
type: integer
tolerance:
anyOf:
- type: integer
- type: string
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
x-kubernetes-int-or-string: true
type: object
scaleUp:
properties:
......@@ -174,6 +180,12 @@ spec:
stabilizationWindowSeconds:
format: int32
type: integer
tolerance:
anyOf:
- type: integer
- type: string
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
x-kubernetes-int-or-string: true
type: object
type: object
enabled:
......@@ -1218,6 +1230,8 @@ spec:
- port
type: object
type: object
stopSignal:
type: string
type: object
livenessProbe:
properties:
......@@ -1897,6 +1911,8 @@ spec:
- port
type: object
type: object
stopSignal:
type: string
type: object
livenessProbe:
properties:
......
......@@ -49,6 +49,7 @@ export ISTIO_GATEWAY="${ISTIO_GATEWAY:=istio-system/istio-ingressgateway}"
export INGRESS_CLASS="${INGRESS_CLASS:=nginx}"
export VIRTUAL_SERVICE_SUPPORTS_HTTPS="${VIRTUAL_SERVICE_SUPPORTS_HTTPS:=false}"
export ENABLE_LWS="${ENABLE_LWS:=false}"
export ENABLE_GROVE="${ENABLE_GROVE:=false}"
# Add command line options
INTERACTIVE=false
......@@ -164,7 +165,7 @@ echo "DYNAMO_INGRESS_SUFFIX: $DYNAMO_INGRESS_SUFFIX"
echo "VIRTUAL_SERVICE_SUPPORTS_HTTPS: $VIRTUAL_SERVICE_SUPPORTS_HTTPS"
echo "INSTALL_CRDS: $INSTALL_CRDS"
envsubst '${NAMESPACE} ${RELEASE_NAME} ${DOCKER_USERNAME} ${DOCKER_PASSWORD} ${DOCKER_SERVER} ${IMAGE_TAG} ${DYNAMO_INGRESS_SUFFIX} ${PIPELINES_DOCKER_SERVER} ${PIPELINES_DOCKER_USERNAME} ${PIPELINES_DOCKER_PASSWORD} ${DOCKER_SECRET_NAME} ${INGRESS_ENABLED} ${ISTIO_ENABLED} ${INGRESS_CLASS} ${ISTIO_GATEWAY} ${VIRTUAL_SERVICE_SUPPORTS_HTTPS} ${ENABLE_LWS}' < dynamo-platform-values.yaml > generated-values.yaml
envsubst '${NAMESPACE} ${RELEASE_NAME} ${DOCKER_USERNAME} ${DOCKER_PASSWORD} ${DOCKER_SERVER} ${IMAGE_TAG} ${DYNAMO_INGRESS_SUFFIX} ${PIPELINES_DOCKER_SERVER} ${PIPELINES_DOCKER_USERNAME} ${PIPELINES_DOCKER_PASSWORD} ${DOCKER_SECRET_NAME} ${INGRESS_ENABLED} ${ISTIO_ENABLED} ${INGRESS_CLASS} ${ISTIO_GATEWAY} ${VIRTUAL_SERVICE_SUPPORTS_HTTPS} ${ENABLE_LWS} ${ENABLE_GROVE}' < dynamo-platform-values.yaml > generated-values.yaml
echo "generated file contents:"
cat generated-values.yaml
......@@ -197,5 +198,6 @@ helm upgrade --install dynamo-platform ./platform/ \
--namespace ${NAMESPACE} \
--set "dynamo-operator.controllerManager.manager.image.repository=${DOCKER_SERVER}/dynamo-operator" \
--set "dynamo-operator.controllerManager.manager.image.tag=${IMAGE_TAG}" \
--set "dynamo-operator.imagePullSecrets[0].name=docker-imagepullsecret"
--set "dynamo-operator.imagePullSecrets[0].name=docker-imagepullsecret" \
-f generated-values.yaml
echo "Helm chart deployment complete"
......@@ -24,6 +24,7 @@ dynamo-operator:
dynamo:
enableLWS: ${ENABLE_LWS}
enableGrove: ${ENABLE_GROVE}
ingress:
enabled: ${INGRESS_ENABLED}
className: ${INGRESS_CLASS}
......
......@@ -19,11 +19,11 @@ maintainers:
url: https://www.nvidia.com
description: A Helm chart for NVIDIA Dynamo Platform.
type: application
version: 0.3.2
version: 0.4.0
home: https://nvidia.com
dependencies:
- name: dynamo-operator
version: 0.3.2
version: 0.4.0
repository: file://components/operator
condition: dynamo-operator.enabled
- name: nats
......
......@@ -27,9 +27,9 @@ type: application
# This is the chart version. This version number should be incremented each time you make changes
# to the chart and its templates, including the app version.
# Versions are expected to follow Semantic Versioning (https://semver.org/)
version: 0.3.2
version: 0.4.0
# This is the version number of the application being deployed. This version number should be
# incremented each time you make changes to the application. Versions are not expected to
# follow Semantic Versioning. They should reflect the version the application is using.
# It is recommended to use it with quotes.
appVersion: "0.3.2"
appVersion: "0.4.0"
......@@ -100,6 +100,9 @@ spec:
{{- if .Values.dynamo.enableLWS }}
- --enable-lws
{{- end }}
{{- if .Values.dynamo.enableGrove }}
- --enable-grove
{{- end }}
command:
- /manager
env:
......
......@@ -116,6 +116,20 @@ rules:
- patch
- update
- watch
{{- if .Values.dynamo.enableGrove }}
- apiGroups:
- grove.io
resources:
- podgangsets
verbs:
- create
- delete
- get
- list
- patch
- update
- watch
{{- end }}
- apiGroups:
- apps
resources:
......
......@@ -82,6 +82,7 @@ dynamo:
annotations: {}
enableLWS: false
enableGrove: false
internalImages:
debugger: python:3.12-slim
......
......@@ -34,6 +34,7 @@ dynamo-operator:
imagePullSecrets: []
dynamo:
enableLWS: false
enableGrove: false
internalImages:
debugger: python:3.12-slim
enableRestrictedSecurityContext: false
......
......@@ -72,7 +72,7 @@ type DynamoComponentDeploymentSharedSpec struct {
RunMode *RunMode `json:"runMode,omitempty"`
ExternalServices map[string]ExternalService `json:"externalServices,omitempty"`
Ingress IngressSpec `json:"ingress,omitempty"`
Ingress *IngressSpec `json:"ingress,omitempty"`
// +optional
ExtraPodMetadata *dynamoCommon.ExtraPodMetadata `json:"extraPodMetadata,omitempty"`
......@@ -149,6 +149,10 @@ func init() {
SchemeBuilder.Register(&DynamoComponentDeployment{}, &DynamoComponentDeploymentList{})
}
// IsReady reports whether the deployment is ready. It holds no logic of its
// own and simply returns the result of IsReady on the deployment's Status.
func (s *DynamoComponentDeployment) IsReady() bool {
return s.Status.IsReady()
}
func (s *DynamoComponentDeploymentStatus) IsReady() bool {
for _, condition := range s.Conditions {
if condition.Type == DynamoGraphDeploymentConditionTypeAvailable && condition.Status == metav1.ConditionTrue {
......
......@@ -20,8 +20,6 @@
package v1alpha1
import (
"fmt"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)
......@@ -87,20 +85,6 @@ func (s *DynamoGraphDeployment) SetSpec(spec any) {
s.Spec = spec.(DynamoGraphDeploymentSpec)
}
// SetEndpointStatus adds an "EndpointExposed" status condition whose message
// is the endpoint URL built from endpointHost.
//
// The URL scheme is "https" when isSecured is true and "http" otherwise. The
// condition is created with Status set to metav1.ConditionTrue and
// LastTransitionTime set to the current time, then handed to
// AddStatusCondition.
func (s *DynamoGraphDeployment) SetEndpointStatus(isSecured bool, endpointHost string) {
	var scheme string
	if isSecured {
		scheme = "https"
	} else {
		scheme = "http"
	}

	endpointURL := fmt.Sprintf("%s://%s", scheme, endpointHost)

	exposed := metav1.Condition{
		Type:               "EndpointExposed",
		Status:             metav1.ConditionTrue,
		Reason:             "EndpointExposed",
		Message:            endpointURL,
		LastTransitionTime: metav1.Now(),
	}
	s.AddStatusCondition(exposed)
}
func (s *DynamoGraphDeployment) AddStatusCondition(condition metav1.Condition) {
if s.Status.Conditions == nil {
s.Status.Conditions = []metav1.Condition{}
......
......@@ -250,7 +250,11 @@ func (in *DynamoComponentDeploymentSharedSpec) DeepCopyInto(out *DynamoComponent
(*out)[key] = val
}
}
in.Ingress.DeepCopyInto(&out.Ingress)
if in.Ingress != nil {
in, out := &in.Ingress, &out.Ingress
*out = new(IngressSpec)
(*in).DeepCopyInto(*out)
}
if in.ExtraPodMetadata != nil {
in, out := &in.ExtraPodMetadata, &out.ExtraPodMetadata
*out = new(common.ExtraPodMetadata)
......
......@@ -48,6 +48,7 @@ import (
lwsscheme "sigs.k8s.io/lws/client-go/clientset/versioned/scheme"
volcanoscheme "volcano.sh/apis/pkg/client/clientset/versioned/scheme"
grovev1alpha1 "github.com/NVIDIA/grove/operator/api/core/v1alpha1"
nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/cloud/operator/api/v1alpha1"
"github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/controller"
commonController "github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/controller_common"
......@@ -70,6 +71,8 @@ func init() {
utilruntime.Must(lwsscheme.AddToScheme(scheme))
utilruntime.Must(volcanoscheme.AddToScheme(scheme))
utilruntime.Must(grovev1alpha1.AddToScheme(scheme))
//+kubebuilder:scaffold:scheme
}
......@@ -89,6 +92,7 @@ func main() {
var ingressControllerTLSSecretName string
var ingressHostSuffix string
var enableLWS bool
var enableGrove bool
flag.StringVar(&metricsAddr, "metrics-bind-address", ":8080", "The address the metric endpoint binds to.")
flag.StringVar(&probeAddr, "health-probe-bind-address", ":8081", "The address the probe endpoint binds to.")
flag.BoolVar(&enableLeaderElection, "leader-elect", false,
......@@ -116,6 +120,8 @@ func main() {
"The suffix to use for the ingress host")
flag.BoolVar(&enableLWS, "enable-lws", false,
"If set, enable leader worker set")
flag.BoolVar(&enableGrove, "enable-grove", false,
"If set, enable grove")
opts := zap.Options{
Development: true,
}
......@@ -125,9 +131,17 @@ func main() {
utilruntime.Must(istioclientsetscheme.AddToScheme(scheme))
ctrlConfig := commonController.Config{
RestrictedNamespace: restrictedNamespace,
VirtualServiceSupportsHTTPS: virtualServiceSupportsHTTPS,
EnableLWS: enableLWS,
RestrictedNamespace: restrictedNamespace,
EnableLWS: enableLWS,
EnableGrove: enableGrove,
EtcdAddress: etcdAddr,
NatsAddress: natsAddr,
IngressConfig: commonController.IngressConfig{
VirtualServiceGateway: istioVirtualServiceGateway,
IngressControllerClassName: ingressControllerClassName,
IngressControllerTLSSecret: ingressControllerTLSSecretName,
IngressHostSuffix: ingressHostSuffix,
},
}
mainCtx := ctrl.SetupSignalHandler()
......@@ -289,23 +303,17 @@ func main() {
Client: mgr.GetClient(),
Recorder: mgr.GetEventRecorderFor("dynamocomponentdeployment"),
Config: ctrlConfig,
NatsAddr: natsAddr,
EtcdAddr: etcdAddr,
EtcdStorage: etcd.NewStorage(cli),
UseVirtualService: istioVirtualServiceGateway != "",
DockerSecretRetriever: dockerSecretRetriever,
}).SetupWithManager(mgr); err != nil {
setupLog.Error(err, "unable to create controller", "controller", "DynamoComponentDeployment")
os.Exit(1)
}
if err = (&controller.DynamoGraphDeploymentReconciler{
Client: mgr.GetClient(),
Recorder: mgr.GetEventRecorderFor("dynamographdeployment"),
Config: ctrlConfig,
VirtualServiceGateway: istioVirtualServiceGateway,
IngressControllerClassName: ingressControllerClassName,
IngressControllerTLSSecret: ingressControllerTLSSecretName,
IngressHostSuffix: ingressHostSuffix,
Client: mgr.GetClient(),
Recorder: mgr.GetEventRecorderFor("dynamographdeployment"),
Config: ctrlConfig,
DockerSecretRetriever: dockerSecretRetriever,
}).SetupWithManager(mgr); err != nil {
setupLog.Error(err, "unable to create controller", "controller", "DynamoGraphDeployment")
os.Exit(1)
......
......@@ -89,6 +89,12 @@ spec:
stabilizationWindowSeconds:
format: int32
type: integer
tolerance:
anyOf:
- type: integer
- type: string
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
x-kubernetes-int-or-string: true
type: object
scaleUp:
properties:
......@@ -115,6 +121,12 @@ spec:
stabilizationWindowSeconds:
format: int32
type: integer
tolerance:
anyOf:
- type: integer
- type: string
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
x-kubernetes-int-or-string: true
type: object
type: object
enabled:
......@@ -1163,6 +1175,8 @@ spec:
- port
type: object
type: object
stopSignal:
type: string
type: object
livenessProbe:
properties:
......@@ -1842,6 +1856,8 @@ spec:
- port
type: object
type: object
stopSignal:
type: string
type: object
livenessProbe:
properties:
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment