Unverified commit 81c27803, authored by mohammedabdulwahhab and committed by GitHub
Browse files

fix: operator defaults (#2398)


Signed-off-by: mohammedabdulwahhab <furkhan324@berkeley.edu>
parent 9ddb3efd
...@@ -8,26 +8,8 @@ metadata: ...@@ -8,26 +8,8 @@ metadata:
spec: spec:
services: services:
Frontend: Frontend:
livenessProbe:
httpGet:
path: /health
port: 8000
initialDelaySeconds: 20
periodSeconds: 5
timeoutSeconds: 5
failureThreshold: 3
readinessProbe:
exec:
command:
- /bin/sh
- -c
- 'curl -s http://localhost:8000/health | jq -e ".status == \"healthy\""'
initialDelaySeconds: 60
periodSeconds: 60
timeoutSeconds: 30
failureThreshold: 10
dynamoNamespace: sglang-agg dynamoNamespace: sglang-agg
componentType: main componentType: frontend
replicas: 1 replicas: 1
resources: resources:
requests: requests:
...@@ -45,21 +27,6 @@ spec: ...@@ -45,21 +27,6 @@ spec:
- "python3 -m dynamo.sglang.utils.clear_namespace --namespace sglang-agg && python3 -m dynamo.frontend --http-port=8000" - "python3 -m dynamo.sglang.utils.clear_namespace --namespace sglang-agg && python3 -m dynamo.frontend --http-port=8000"
SGLangDecodeWorker: SGLangDecodeWorker:
envFromSecret: hf-token-secret envFromSecret: hf-token-secret
livenessProbe:
httpGet:
path: /live
port: 9090
periodSeconds: 5
timeoutSeconds: 30
failureThreshold: 1
readinessProbe:
exec:
httpGet:
path: /health
port: 9090
periodSeconds: 10
timeoutSeconds: 30
failureThreshold: 60
dynamoNamespace: sglang-agg dynamoNamespace: sglang-agg
componentType: worker componentType: worker
replicas: 1 replicas: 1
...@@ -72,21 +39,8 @@ spec: ...@@ -72,21 +39,8 @@ spec:
cpu: "32" cpu: "32"
memory: "80Gi" memory: "80Gi"
gpu: "1" gpu: "1"
envs:
- name: DYN_SYSTEM_ENABLED
value: "true"
- name: DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS
value: "[\"generate\"]"
- name: DYN_SYSTEM_PORT
value: "9090"
extraPodSpec: extraPodSpec:
mainContainer: mainContainer:
startupProbe:
httpGet:
path: /live
port: 9090
periodSeconds: 10
failureThreshold: 60
image: my-registry/sglang-runtime:my-tag image: my-registry/sglang-runtime:my-tag
workingDir: /workspace/components/backends/sglang workingDir: /workspace/components/backends/sglang
command: command:
......
...@@ -8,26 +8,8 @@ metadata: ...@@ -8,26 +8,8 @@ metadata:
spec: spec:
services: services:
Frontend: Frontend:
livenessProbe:
httpGet:
path: /health
port: 8000
initialDelaySeconds: 20
periodSeconds: 5
timeoutSeconds: 5
failureThreshold: 3
readinessProbe:
exec:
command:
- /bin/sh
- -c
- 'curl -s http://localhost:8000/health | jq -e ".status == \"healthy\""'
initialDelaySeconds: 60
periodSeconds: 60
timeoutSeconds: 30
failureThreshold: 10
dynamoNamespace: sglang-agg-router dynamoNamespace: sglang-agg-router
componentType: main componentType: frontend
replicas: 1 replicas: 1
resources: resources:
requests: requests:
...@@ -45,21 +27,6 @@ spec: ...@@ -45,21 +27,6 @@ spec:
- "python3 -m dynamo.sglang.utils.clear_namespace --namespace sglang-agg-router && python3 -m dynamo.frontend --http-port=8000 --router-mode kv" - "python3 -m dynamo.sglang.utils.clear_namespace --namespace sglang-agg-router && python3 -m dynamo.frontend --http-port=8000 --router-mode kv"
SGLangDecodeWorker: SGLangDecodeWorker:
envFromSecret: hf-token-secret envFromSecret: hf-token-secret
livenessProbe:
httpGet:
path: /live
port: 9090
periodSeconds: 5
timeoutSeconds: 30
failureThreshold: 1
readinessProbe:
exec:
httpGet:
path: /health
port: 9090
periodSeconds: 10
timeoutSeconds: 30
failureThreshold: 60
dynamoNamespace: sglang-agg-router dynamoNamespace: sglang-agg-router
componentType: worker componentType: worker
replicas: 1 replicas: 1
...@@ -72,21 +39,8 @@ spec: ...@@ -72,21 +39,8 @@ spec:
cpu: "32" cpu: "32"
memory: "80Gi" memory: "80Gi"
gpu: "1" gpu: "1"
envs:
- name: DYN_SYSTEM_ENABLED
value: "true"
- name: DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS
value: "[\"generate\"]"
- name: DYN_SYSTEM_PORT
value: "9090"
extraPodSpec: extraPodSpec:
mainContainer: mainContainer:
startupProbe:
httpGet:
path: /live
port: 9090
periodSeconds: 10
failureThreshold: 60
image: my-registry/sglang-runtime:my-tag image: my-registry/sglang-runtime:my-tag
workingDir: /workspace/components/backends/sglang workingDir: /workspace/components/backends/sglang
command: command:
......
...@@ -8,26 +8,8 @@ metadata: ...@@ -8,26 +8,8 @@ metadata:
spec: spec:
services: services:
Frontend: Frontend:
livenessProbe:
httpGet:
path: /health
port: 8000
initialDelaySeconds: 20
periodSeconds: 5
timeoutSeconds: 5
failureThreshold: 3
readinessProbe:
exec:
command:
- /bin/sh
- -c
- 'curl -s http://localhost:8000/health | jq -e ".status == \"healthy\""'
initialDelaySeconds: 60
periodSeconds: 60
timeoutSeconds: 30
failureThreshold: 10
dynamoNamespace: sglang-disagg dynamoNamespace: sglang-disagg
componentType: main componentType: frontend
replicas: 1 replicas: 1
resources: resources:
requests: requests:
...@@ -45,21 +27,6 @@ spec: ...@@ -45,21 +27,6 @@ spec:
- "python3 -m dynamo.sglang.utils.clear_namespace --namespace sglang-disagg && python3 -m dynamo.frontend --http-port=8000" - "python3 -m dynamo.sglang.utils.clear_namespace --namespace sglang-disagg && python3 -m dynamo.frontend --http-port=8000"
SGLangDecodeWorker: SGLangDecodeWorker:
envFromSecret: hf-token-secret envFromSecret: hf-token-secret
livenessProbe:
httpGet:
path: /live
port: 9090
periodSeconds: 5
timeoutSeconds: 30
failureThreshold: 1
readinessProbe:
exec:
httpGet:
path: /health
port: 9090
periodSeconds: 10
timeoutSeconds: 30
failureThreshold: 60
dynamoNamespace: sglang-disagg dynamoNamespace: sglang-disagg
componentType: worker componentType: worker
replicas: 1 replicas: 1
...@@ -72,21 +39,8 @@ spec: ...@@ -72,21 +39,8 @@ spec:
cpu: "32" cpu: "32"
memory: "80Gi" memory: "80Gi"
gpu: "1" gpu: "1"
envs:
- name: DYN_SYSTEM_ENABLED
value: "true"
- name: DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS
value: "[\"generate\"]"
- name: DYN_SYSTEM_PORT
value: "9090"
extraPodSpec: extraPodSpec:
mainContainer: mainContainer:
startupProbe:
httpGet:
path: /live
port: 9090
periodSeconds: 10
failureThreshold: 60
image: nvcr.io/nvidian/nim-llm-dev/sglang-runtime:hzhou-0808-07 image: nvcr.io/nvidian/nim-llm-dev/sglang-runtime:hzhou-0808-07
workingDir: /workspace/components/backends/sglang workingDir: /workspace/components/backends/sglang
command: command:
...@@ -112,21 +66,6 @@ spec: ...@@ -112,21 +66,6 @@ spec:
- "nixl" - "nixl"
SGLangPrefillWorker: SGLangPrefillWorker:
envFromSecret: hf-token-secret envFromSecret: hf-token-secret
livenessProbe:
httpGet:
path: /live
port: 9090
periodSeconds: 5
timeoutSeconds: 30
failureThreshold: 1
readinessProbe:
exec:
httpGet:
path: /health
port: 9090
periodSeconds: 10
timeoutSeconds: 30
failureThreshold: 60
dynamoNamespace: sglang-disagg dynamoNamespace: sglang-disagg
componentType: worker componentType: worker
replicas: 1 replicas: 1
...@@ -139,21 +78,8 @@ spec: ...@@ -139,21 +78,8 @@ spec:
cpu: "32" cpu: "32"
memory: "80Gi" memory: "80Gi"
gpu: "1" gpu: "1"
envs:
- name: DYN_SYSTEM_ENABLED
value: "true"
- name: DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS
value: "[\"generate\"]"
- name: DYN_SYSTEM_PORT
value: "9090"
extraPodSpec: extraPodSpec:
mainContainer: mainContainer:
startupProbe:
httpGet:
path: /health
port: 9090
periodSeconds: 10
failureThreshold: 60
image: nvcr.io/nvidian/nim-llm-dev/sglang-runtime:hzhou-0808-07 image: nvcr.io/nvidian/nim-llm-dev/sglang-runtime:hzhou-0808-07
workingDir: /workspace/components/backends/sglang workingDir: /workspace/components/backends/sglang
command: command:
......
...@@ -16,25 +16,7 @@ spec: ...@@ -16,25 +16,7 @@ spec:
services: services:
Frontend: Frontend:
dynamoNamespace: dynamo dynamoNamespace: dynamo
livenessProbe: componentType: frontend
httpGet:
path: /health
port: 8000
initialDelaySeconds: 20
periodSeconds: 5
timeoutSeconds: 5
failureThreshold: 3
readinessProbe:
exec:
command:
- /bin/sh
- -c
- 'curl -s http://localhost:8000/health | jq -e ".status == \"healthy\""'
initialDelaySeconds: 60
periodSeconds: 60
timeoutSeconds: 30
failureThreshold: 10
componentType: main
replicas: 1 replicas: 1
resources: resources:
requests: requests:
...@@ -97,9 +79,9 @@ spec: ...@@ -97,9 +79,9 @@ spec:
- --backend=sglang - --backend=sglang
- --adjustment-interval=60 - --adjustment-interval=60
- --profile-results-dir=/workspace/profiling_results - --profile-results-dir=/workspace/profiling_results
Prometheus: Prometheus: # NOTE: this is set on Prometheus to ensure a service is created for the Prometheus component. This is a workaround and should be managed differently.
dynamoNamespace: dynamo dynamoNamespace: dynamo
componentType: main componentType: frontend
replicas: 1 replicas: 1
envs: envs:
- name: PYTHONPATH - name: PYTHONPATH
...@@ -142,20 +124,6 @@ spec: ...@@ -142,20 +124,6 @@ spec:
SGLangDecodeWorker: SGLangDecodeWorker:
dynamoNamespace: dynamo dynamoNamespace: dynamo
envFromSecret: hf-token-secret envFromSecret: hf-token-secret
livenessProbe:
httpGet:
path: /live
port: 9090
periodSeconds: 5
timeoutSeconds: 30
failureThreshold: 1
readinessProbe:
httpGet:
path: /health
port: 9090
periodSeconds: 10
timeoutSeconds: 30
failureThreshold: 60
componentType: worker componentType: worker
replicas: 2 replicas: 2
resources: resources:
...@@ -167,21 +135,8 @@ spec: ...@@ -167,21 +135,8 @@ spec:
cpu: "32" cpu: "32"
memory: "80Gi" memory: "80Gi"
gpu: "1" gpu: "1"
envs:
- name: DYN_SYSTEM_ENABLED
value: "true"
- name: DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS
value: "[\"generate\"]"
- name: DYN_SYSTEM_PORT
value: "9090"
extraPodSpec: extraPodSpec:
mainContainer: mainContainer:
startupProbe:
httpGet:
path: /live
port: 9090
periodSeconds: 10
failureThreshold: 60
image: nvcr.io/nvidian/nim-llm-dev/sglang-runtime:hzhou-0811-1 image: nvcr.io/nvidian/nim-llm-dev/sglang-runtime:hzhou-0811-1
workingDir: /workspace/components/backends/sglang workingDir: /workspace/components/backends/sglang
args: args:
...@@ -205,20 +160,6 @@ spec: ...@@ -205,20 +160,6 @@ spec:
SGLangPrefillWorker: SGLangPrefillWorker:
dynamoNamespace: dynamo dynamoNamespace: dynamo
envFromSecret: hf-token-secret envFromSecret: hf-token-secret
livenessProbe:
httpGet:
path: /live
port: 9090
periodSeconds: 5
timeoutSeconds: 30
failureThreshold: 1
readinessProbe:
httpGet:
path: /health
port: 9090
periodSeconds: 10
timeoutSeconds: 30
failureThreshold: 60
componentType: worker componentType: worker
replicas: 2 replicas: 2
resources: resources:
...@@ -230,21 +171,8 @@ spec: ...@@ -230,21 +171,8 @@ spec:
cpu: "32" cpu: "32"
memory: "80Gi" memory: "80Gi"
gpu: "1" gpu: "1"
envs:
- name: DYN_SYSTEM_ENABLED
value: "true"
- name: DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS
value: "[\"generate\"]"
- name: DYN_SYSTEM_PORT
value: "9090"
extraPodSpec: extraPodSpec:
mainContainer: mainContainer:
startupProbe:
httpGet:
path: /health
port: 9090
periodSeconds: 10
failureThreshold: 60
image: nvcr.io/nvidian/nim-llm-dev/sglang-runtime:hzhou-0811-1 image: nvcr.io/nvidian/nim-llm-dev/sglang-runtime:hzhou-0811-1
workingDir: /workspace/components/backends/sglang workingDir: /workspace/components/backends/sglang
args: args:
......
...@@ -9,26 +9,7 @@ spec: ...@@ -9,26 +9,7 @@ spec:
services: services:
Frontend: Frontend:
dynamoNamespace: trtllm-agg dynamoNamespace: trtllm-agg
componentType: main componentType: frontend
livenessProbe:
exec:
command:
- /bin/sh
- -c
- 'curl -s http://localhost:8000/health | jq -e ".status == \"healthy\""'
periodSeconds: 5
timeoutSeconds: 3
failureThreshold: 3
readinessProbe:
exec:
command:
- /bin/sh
- -c
- 'curl -s http://localhost:8000/health | jq -e ".status == \"healthy\""'
initialDelaySeconds: 60
periodSeconds: 60
timeoutSeconds: 3
failureThreshold: 10
replicas: 1 replicas: 1
resources: resources:
requests: requests:
...@@ -48,20 +29,6 @@ spec: ...@@ -48,20 +29,6 @@ spec:
- "python3 -m dynamo.frontend --http-port 8000" - "python3 -m dynamo.frontend --http-port 8000"
TRTLLMWorker: TRTLLMWorker:
envFromSecret: hf-token-secret envFromSecret: hf-token-secret
livenessProbe:
httpGet:
path: /live
port: 9090
periodSeconds: 5
timeoutSeconds: 3
failureThreshold: 3
readinessProbe:
httpGet:
path: /health
port: 9090
periodSeconds: 10
timeoutSeconds: 3
failureThreshold: 60
dynamoNamespace: trtllm-agg dynamoNamespace: trtllm-agg
componentType: worker componentType: worker
replicas: 1 replicas: 1
...@@ -74,22 +41,8 @@ spec: ...@@ -74,22 +41,8 @@ spec:
cpu: "10" cpu: "10"
memory: "20Gi" memory: "20Gi"
gpu: "1" gpu: "1"
envs:
- name: DYN_SYSTEM_ENABLED
value: "true"
- name: DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS
value: "[\"generate\"]"
- name: DYN_SYSTEM_PORT
value: "9090"
extraPodSpec: extraPodSpec:
mainContainer: mainContainer:
startupProbe:
httpGet:
path: /health
port: 9090
periodSeconds: 10
timeoutSeconds: 3
failureThreshold: 60
image: nvcr.io/nvidian/nim-llm-dev/trtllm-runtime:dep-233.17 image: nvcr.io/nvidian/nim-llm-dev/trtllm-runtime:dep-233.17
workingDir: /workspace/components/backends/trtllm workingDir: /workspace/components/backends/trtllm
args: args:
......
...@@ -8,35 +8,9 @@ metadata: ...@@ -8,35 +8,9 @@ metadata:
spec: spec:
services: services:
Frontend: Frontend:
livenessProbe:
exec:
command:
- /bin/sh
- -c
- 'curl -s http://localhost:8000/health | jq -e ".status == \"healthy\""'
periodSeconds: 5
timeoutSeconds: 3
failureThreshold: 3
readinessProbe:
exec:
command:
- /bin/sh
- -c
- 'curl -s http://localhost:8000/health | jq -e ".status == \"healthy\""'
initialDelaySeconds: 60
periodSeconds: 60
timeoutSeconds: 3
failureThreshold: 5
dynamoNamespace: trtllm-agg-router dynamoNamespace: trtllm-agg-router
componentType: main componentType: frontend
replicas: 1 replicas: 1
resources:
requests:
cpu: "1"
memory: "2Gi"
limits:
cpu: "1"
memory: "2Gi"
extraPodSpec: extraPodSpec:
mainContainer: mainContainer:
image: nvcr.io/nvidian/nim-llm-dev/trtllm-runtime:dep-233.17 image: nvcr.io/nvidian/nim-llm-dev/trtllm-runtime:dep-233.17
...@@ -48,20 +22,6 @@ spec: ...@@ -48,20 +22,6 @@ spec:
- "python3 -m dynamo.frontend --http-port 8000 --router-mode kv" - "python3 -m dynamo.frontend --http-port 8000 --router-mode kv"
TRTLLMWorker: TRTLLMWorker:
envFromSecret: hf-token-secret envFromSecret: hf-token-secret
livenessProbe:
httpGet:
path: /live
port: 9090
periodSeconds: 5
timeoutSeconds: 3
failureThreshold: 3
readinessProbe:
httpGet:
path: /health
port: 9090
periodSeconds: 10
timeoutSeconds: 3
failureThreshold: 60
dynamoNamespace: trtllm-agg-router dynamoNamespace: trtllm-agg-router
componentType: worker componentType: worker
replicas: 2 replicas: 2
...@@ -74,22 +34,8 @@ spec: ...@@ -74,22 +34,8 @@ spec:
cpu: "10" cpu: "10"
memory: "20Gi" memory: "20Gi"
gpu: "1" gpu: "1"
envs:
- name: DYN_SYSTEM_ENABLED
value: "true"
- name: DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS
value: "[\"generate\"]"
- name: DYN_SYSTEM_PORT
value: "9090"
extraPodSpec: extraPodSpec:
mainContainer: mainContainer:
startupProbe:
httpGet:
path: /health
port: 9090
periodSeconds: 10
timeoutSeconds: 3
failureThreshold: 60
image: nvcr.io/nvidian/nim-llm-dev/trtllm-runtime:dep-233.17 image: nvcr.io/nvidian/nim-llm-dev/trtllm-runtime:dep-233.17
workingDir: /workspace/components/backends/trtllm workingDir: /workspace/components/backends/trtllm
args: args:
......
...@@ -9,26 +9,7 @@ spec: ...@@ -9,26 +9,7 @@ spec:
services: services:
Frontend: Frontend:
dynamoNamespace: trtllm-disagg dynamoNamespace: trtllm-disagg
componentType: main componentType: frontend
livenessProbe:
exec:
command:
- /bin/sh
- -c
- 'curl -s http://localhost:8000/health | jq -e ".status == \"healthy\""'
periodSeconds: 5
timeoutSeconds: 3
failureThreshold: 3
readinessProbe:
exec:
command:
- /bin/sh
- -c
- 'curl -s http://localhost:8000/health | jq -e ".status == \"healthy\""'
initialDelaySeconds: 60
periodSeconds: 60
timeoutSeconds: 3
failureThreshold: 10
replicas: 1 replicas: 1
resources: resources:
requests: requests:
...@@ -51,20 +32,6 @@ spec: ...@@ -51,20 +32,6 @@ spec:
envFromSecret: hf-token-secret envFromSecret: hf-token-secret
componentType: worker componentType: worker
replicas: 1 replicas: 1
livenessProbe:
httpGet:
path: /live
port: 9090
periodSeconds: 5
timeoutSeconds: 3
failureThreshold: 3
readinessProbe:
httpGet:
path: /health
port: 9090
periodSeconds: 10
timeoutSeconds: 3
failureThreshold: 60
resources: resources:
requests: requests:
cpu: "10" cpu: "10"
...@@ -76,46 +43,18 @@ spec: ...@@ -76,46 +43,18 @@ spec:
gpu: "1" gpu: "1"
extraPodSpec: extraPodSpec:
mainContainer: mainContainer:
startupProbe:
httpGet:
path: /health
port: 9090
periodSeconds: 10
timeoutSeconds: 3
failureThreshold: 60
image: nvcr.io/nvidian/nim-llm-dev/trtllm-runtime:dep-233.17 image: nvcr.io/nvidian/nim-llm-dev/trtllm-runtime:dep-233.17
workingDir: /workspace/components/backends/trtllm workingDir: /workspace/components/backends/trtllm
command: command:
- /bin/sh - /bin/sh
- -c - -c
args: args:
- "python3 -m dynamo.trtllm --model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B --served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B --extra-engine-args engine_configs/prefill.yaml --disaggregation-mode prefill --disaggregation-strategy decode_first 2>&1 | tee /tmp/trtllm.log" - "python3 -m dynamo.trtllm --model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B --served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B --extra-engine-args engine_configs/prefill.yaml --disaggregation-mode prefill --disaggregation-strategy decode_first"
envs:
- name: DYN_SYSTEM_ENABLED
value: "true"
- name: DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS
value: "[\"generate\"]"
- name: DYN_SYSTEM_PORT
value: "9090"
TRTLLMDecodeWorker: TRTLLMDecodeWorker:
dynamoNamespace: trtllm-disagg dynamoNamespace: trtllm-disagg
envFromSecret: hf-token-secret envFromSecret: hf-token-secret
componentType: worker componentType: worker
replicas: 1 replicas: 1
livenessProbe:
httpGet:
path: /live
port: 9090
periodSeconds: 5
timeoutSeconds: 3
failureThreshold: 3
readinessProbe:
httpGet:
path: /health
port: 9090
periodSeconds: 10
timeoutSeconds: 3
failureThreshold: 60
resources: resources:
requests: requests:
cpu: "10" cpu: "10"
...@@ -127,24 +66,10 @@ spec: ...@@ -127,24 +66,10 @@ spec:
gpu: "1" gpu: "1"
extraPodSpec: extraPodSpec:
mainContainer: mainContainer:
startupProbe:
httpGet:
path: /health
port: 9090
periodSeconds: 10
timeoutSeconds: 3
failureThreshold: 60
image: nvcr.io/nvidian/nim-llm-dev/trtllm-runtime:dep-233.17 image: nvcr.io/nvidian/nim-llm-dev/trtllm-runtime:dep-233.17
workingDir: /workspace/components/backends/trtllm workingDir: /workspace/components/backends/trtllm
command: command:
- /bin/sh - /bin/sh
- -c - -c
args: args:
- "python3 -m dynamo.trtllm --model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B --served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B --extra-engine-args engine_configs/decode.yaml --disaggregation-mode decode --disaggregation-strategy decode_first 2>&1 | tee /tmp/trtllm.log" - "python3 -m dynamo.trtllm --model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B --served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B --extra-engine-args engine_configs/decode.yaml --disaggregation-mode decode --disaggregation-strategy decode_first"
envs:
- name: DYN_SYSTEM_ENABLED
value: "true"
- name: DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS
value: "[\"generate\"]"
- name: DYN_SYSTEM_PORT
value: "9090"
...@@ -9,26 +9,7 @@ spec: ...@@ -9,26 +9,7 @@ spec:
services: services:
Frontend: Frontend:
dynamoNamespace: trtllm-v1-disagg-router dynamoNamespace: trtllm-v1-disagg-router
componentType: main componentType: frontend
livenessProbe:
exec:
command:
- /bin/sh
- -c
- 'curl -s http://localhost:8000/health | jq -e ".status == \"healthy\""'
periodSeconds: 5
timeoutSeconds: 3
failureThreshold: 3
readinessProbe:
exec:
command:
- /bin/sh
- -c
- 'curl -s http://localhost:8000/health | jq -e ".status == \"healthy\""'
initialDelaySeconds: 60
periodSeconds: 60
timeoutSeconds: 3
failureThreshold: 10
replicas: 1 replicas: 1
resources: resources:
requests: requests:
...@@ -51,20 +32,6 @@ spec: ...@@ -51,20 +32,6 @@ spec:
envFromSecret: hf-token-secret envFromSecret: hf-token-secret
componentType: worker componentType: worker
replicas: 2 replicas: 2
livenessProbe:
httpGet:
path: /live
port: 9090
periodSeconds: 5
timeoutSeconds: 3
failureThreshold: 3
readinessProbe:
httpGet:
path: /health
port: 9090
periodSeconds: 10
timeoutSeconds: 3
failureThreshold: 60
resources: resources:
requests: requests:
cpu: "10" cpu: "10"
...@@ -76,46 +43,18 @@ spec: ...@@ -76,46 +43,18 @@ spec:
gpu: "1" gpu: "1"
extraPodSpec: extraPodSpec:
mainContainer: mainContainer:
startupProbe:
httpGet:
path: /health
port: 9090
periodSeconds: 10
timeoutSeconds: 3
failureThreshold: 60
image: nvcr.io/nvidian/nim-llm-dev/trtllm-runtime:dep-233.17 image: nvcr.io/nvidian/nim-llm-dev/trtllm-runtime:dep-233.17
workingDir: /workspace/components/backends/trtllm workingDir: /workspace/components/backends/trtllm
command: command:
- /bin/sh - /bin/sh
- -c - -c
args: args:
- "python3 -m dynamo.trtllm --model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B --served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B --extra-engine-args engine_configs/prefill.yaml --disaggregation-mode prefill --disaggregation-strategy prefill_first --publish-events-and-metrics 2>&1 | tee /tmp/trtllm.log" - "python3 -m dynamo.trtllm --model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B --served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B --extra-engine-args engine_configs/prefill.yaml --disaggregation-mode prefill --disaggregation-strategy prefill_first --publish-events-and-metrics"
envs:
- name: DYN_SYSTEM_ENABLED
value: "true"
- name: DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS
value: "[\"generate\"]"
- name: DYN_SYSTEM_PORT
value: "9090"
TRTLLMDecodeWorker: TRTLLMDecodeWorker:
dynamoNamespace: trtllm-v1-disagg-router dynamoNamespace: trtllm-v1-disagg-router
envFromSecret: hf-token-secret envFromSecret: hf-token-secret
componentType: worker componentType: worker
replicas: 1 replicas: 1
livenessProbe:
httpGet:
path: /live
port: 9090
periodSeconds: 5
timeoutSeconds: 3
failureThreshold: 3
readinessProbe:
httpGet:
path: /health
port: 9090
periodSeconds: 10
timeoutSeconds: 3
failureThreshold: 60
resources: resources:
requests: requests:
cpu: "10" cpu: "10"
...@@ -127,24 +66,10 @@ spec: ...@@ -127,24 +66,10 @@ spec:
gpu: "1" gpu: "1"
extraPodSpec: extraPodSpec:
mainContainer: mainContainer:
startupProbe:
httpGet:
path: /health
port: 9090
periodSeconds: 10
timeoutSeconds: 3
failureThreshold: 60
image: nvcr.io/nvidian/nim-llm-dev/trtllm-runtime:dep-233.17 image: nvcr.io/nvidian/nim-llm-dev/trtllm-runtime:dep-233.17
workingDir: /workspace/components/backends/trtllm workingDir: /workspace/components/backends/trtllm
command: command:
- /bin/sh - /bin/sh
- -c - -c
args: args:
- "python3 -m dynamo.trtllm --model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B --served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B --extra-engine-args engine_configs/decode.yaml --disaggregation-mode decode --disaggregation-strategy prefill_first 2>&1 | tee /tmp/trtllm.log" - "python3 -m dynamo.trtllm --model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B --served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B --extra-engine-args engine_configs/decode.yaml --disaggregation-mode decode --disaggregation-strategy prefill_first"
envs:
- name: DYN_SYSTEM_ENABLED
value: "true"
- name: DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS
value: "[\"generate\"]"
- name: DYN_SYSTEM_PORT
value: "9090"
...@@ -8,34 +8,9 @@ metadata: ...@@ -8,34 +8,9 @@ metadata:
spec: spec:
services: services:
Frontend: Frontend:
livenessProbe:
httpGet:
path: /health
port: 8000
initialDelaySeconds: 20
periodSeconds: 5
timeoutSeconds: 5
failureThreshold: 3
readinessProbe:
exec:
command:
- /bin/sh
- -c
- 'curl -s http://localhost:8000/health | jq -e ".status == \"healthy\""'
initialDelaySeconds: 10
periodSeconds: 5
timeoutSeconds: 5
failureThreshold: 3
dynamoNamespace: vllm-agg dynamoNamespace: vllm-agg
componentType: main componentType: frontend
replicas: 1 replicas: 1
resources:
requests:
cpu: "1"
memory: "2Gi"
limits:
cpu: "1"
memory: "2Gi"
extraPodSpec: extraPodSpec:
mainContainer: mainContainer:
image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-233.17 image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-233.17
...@@ -47,55 +22,15 @@ spec: ...@@ -47,55 +22,15 @@ spec:
- "python3 -m dynamo.frontend --http-port 8000" - "python3 -m dynamo.frontend --http-port 8000"
VllmDecodeWorker: VllmDecodeWorker:
envFromSecret: hf-token-secret envFromSecret: hf-token-secret
livenessProbe:
httpGet:
path: /live
port: 9090
initialDelaySeconds: 30
periodSeconds: 10
timeoutSeconds: 5
failureThreshold: 3
readinessProbe:
httpGet:
path: /health
port: 9090
initialDelaySeconds: 30
periodSeconds: 10
timeoutSeconds: 5
failureThreshold: 60
dynamoNamespace: vllm-agg dynamoNamespace: vllm-agg
componentType: worker componentType: worker
replicas: 1 replicas: 1
resources:
requests:
cpu: "10"
memory: "20Gi"
gpu: "1"
limits:
cpu: "10"
memory: "20Gi"
gpu: "1"
envs:
- name: DYN_SYSTEM_ENABLED
value: "true"
- name: DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS
value: "[\"generate\"]"
- name: DYN_SYSTEM_PORT
value: "9090"
extraPodSpec: extraPodSpec:
mainContainer: mainContainer:
startupProbe:
httpGet:
path: /health
port: 9090
initialDelaySeconds: 10
periodSeconds: 10
timeoutSeconds: 5
failureThreshold: 60
image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-233.17 image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-233.17
workingDir: /workspace/components/backends/vllm workingDir: /workspace/components/backends/vllm
command: command:
- /bin/sh - /bin/sh
- -c - -c
args: args:
- python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B 2>&1 | tee /tmp/vllm.log - python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B
...@@ -8,34 +8,9 @@ metadata: ...@@ -8,34 +8,9 @@ metadata:
spec: spec:
services: services:
Frontend: Frontend:
livenessProbe:
httpGet:
path: /health
port: 8000
initialDelaySeconds: 20
periodSeconds: 5
timeoutSeconds: 5
failureThreshold: 3
readinessProbe:
exec:
command:
- /bin/sh
- -c
- 'curl -s http://localhost:8000/health | jq -e ".status == \"healthy\""'
initialDelaySeconds: 60
periodSeconds: 60
timeoutSeconds: 30
failureThreshold: 10
dynamoNamespace: vllm-agg-router dynamoNamespace: vllm-agg-router
componentType: main componentType: frontend
replicas: 1 replicas: 1
resources:
requests:
cpu: "1"
memory: "2Gi"
limits:
cpu: "1"
memory: "2Gi"
extraPodSpec: extraPodSpec:
mainContainer: mainContainer:
image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-233.17 image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-233.17
...@@ -47,51 +22,15 @@ spec: ...@@ -47,51 +22,15 @@ spec:
- "python3 -m dynamo.frontend --http-port 8000 --router-mode kv" - "python3 -m dynamo.frontend --http-port 8000 --router-mode kv"
VllmDecodeWorker: VllmDecodeWorker:
envFromSecret: hf-token-secret envFromSecret: hf-token-secret
livenessProbe:
httpGet:
path: /live
port: 9090
periodSeconds: 5
timeoutSeconds: 30
failureThreshold: 1
readinessProbe:
httpGet:
path: /health
port: 9090
periodSeconds: 10
timeoutSeconds: 30
failureThreshold: 60
dynamoNamespace: vllm-agg-router dynamoNamespace: vllm-agg-router
componentType: worker componentType: worker
replicas: 2 replicas: 2
resources:
requests:
cpu: "10"
memory: "20Gi"
gpu: "1"
limits:
cpu: "10"
memory: "20Gi"
gpu: "1"
envs:
- name: DYN_SYSTEM_ENABLED
value: "true"
- name: DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS
value: "[\"generate\"]"
- name: DYN_SYSTEM_PORT
value: "9090"
extraPodSpec: extraPodSpec:
mainContainer: mainContainer:
startupProbe:
httpGet:
path: /health
port: 9090
periodSeconds: 10
failureThreshold: 60
image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-233.17 image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-233.17
workingDir: /workspace/components/backends/vllm workingDir: /workspace/components/backends/vllm
command: command:
- /bin/sh - /bin/sh
- -c - -c
args: args:
- python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B 2>&1 | tee /tmp/vllm.log - python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B
...@@ -9,26 +9,8 @@ spec: ...@@ -9,26 +9,8 @@ spec:
services: services:
Frontend: Frontend:
dynamoNamespace: vllm-disagg dynamoNamespace: vllm-disagg
componentType: main componentType: frontend
replicas: 1 replicas: 1
livenessProbe:
httpGet:
path: /health
port: 8000
initialDelaySeconds: 20
periodSeconds: 5
timeoutSeconds: 5
failureThreshold: 3
readinessProbe:
exec:
command:
- /bin/sh
- -c
- 'curl -s http://localhost:8000/health | jq -e ".status == \"healthy\""'
initialDelaySeconds: 60
periodSeconds: 60
timeoutSeconds: 30
failureThreshold: 10
resources: resources:
requests: requests:
cpu: "32" cpu: "32"
...@@ -50,20 +32,6 @@ spec: ...@@ -50,20 +32,6 @@ spec:
envFromSecret: hf-token-secret envFromSecret: hf-token-secret
componentType: worker componentType: worker
replicas: 1 replicas: 1
livenessProbe:
httpGet:
path: /live
port: 9090
periodSeconds: 5
timeoutSeconds: 30
failureThreshold: 1
readinessProbe:
httpGet:
path: /health
port: 9090
periodSeconds: 10
timeoutSeconds: 30
failureThreshold: 60
resources: resources:
requests: requests:
cpu: "32" cpu: "32"
...@@ -73,47 +41,20 @@ spec: ...@@ -73,47 +41,20 @@ spec:
cpu: "32" cpu: "32"
memory: "40Gi" memory: "40Gi"
gpu: "1" gpu: "1"
envs:
- name: DYN_SYSTEM_ENABLED
value: "true"
- name: DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS
value: "[\"generate\"]"
- name: DYN_SYSTEM_PORT
value: "9090"
extraPodSpec: extraPodSpec:
mainContainer: mainContainer:
startupProbe:
httpGet:
path: /health
port: 9090
periodSeconds: 10
failureThreshold: 60
image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-233.17 image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-233.17
workingDir: /workspace/components/backends/vllm workingDir: /workspace/components/backends/vllm
command: command:
- /bin/sh - /bin/sh
- -c - -c
args: args:
- "python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B 2>&1 | tee /tmp/vllm.log" - "python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B"
VllmPrefillWorker: VllmPrefillWorker:
dynamoNamespace: vllm-disagg dynamoNamespace: vllm-disagg
envFromSecret: hf-token-secret envFromSecret: hf-token-secret
componentType: worker componentType: worker
replicas: 1 replicas: 1
livenessProbe:
httpGet:
path: /live
port: 9090
periodSeconds: 5
timeoutSeconds: 30
failureThreshold: 1
readinessProbe:
httpGet:
path: /health
port: 9090
periodSeconds: 10
timeoutSeconds: 30
failureThreshold: 60
resources: resources:
requests: requests:
cpu: "32" cpu: "32"
...@@ -123,25 +64,12 @@ spec: ...@@ -123,25 +64,12 @@ spec:
cpu: "32" cpu: "32"
memory: "40Gi" memory: "40Gi"
gpu: "1" gpu: "1"
envs:
- name: DYN_SYSTEM_ENABLED
value: "true"
- name: DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS
value: "[\"generate\"]"
- name: DYN_SYSTEM_PORT
value: "9090"
extraPodSpec: extraPodSpec:
mainContainer: mainContainer:
startupProbe:
httpGet:
path: /health
port: 9090
periodSeconds: 10
failureThreshold: 60
image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-233.17 image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-233.17
workingDir: /workspace/components/backends/vllm workingDir: /workspace/components/backends/vllm
command: command:
- /bin/sh - /bin/sh
- -c - -c
args: args:
- "python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --is-prefill-worker 2>&1 | tee /tmp/vllm.log" - "python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --is-prefill-worker"
...@@ -16,26 +16,8 @@ spec: ...@@ -16,26 +16,8 @@ spec:
services: services:
Frontend: Frontend:
dynamoNamespace: vllm-disagg-planner dynamoNamespace: vllm-disagg-planner
componentType: main componentType: frontend
replicas: 1 replicas: 1
livenessProbe:
httpGet:
path: /health
port: 8000
initialDelaySeconds: 20
periodSeconds: 5
timeoutSeconds: 5
failureThreshold: 3
readinessProbe:
exec:
command:
- /bin/sh
- -c
- 'curl -s http://localhost:8000/health | jq -e ".status == \"healthy\""'
initialDelaySeconds: 60
periodSeconds: 60
timeoutSeconds: 30
failureThreshold: 10
resources: resources:
requests: requests:
cpu: "32" cpu: "32"
...@@ -99,9 +81,9 @@ spec: ...@@ -99,9 +81,9 @@ spec:
- --backend=vllm - --backend=vllm
- --adjustment-interval=60 - --adjustment-interval=60
- --profile-results-dir=/workspace/profiling_results - --profile-results-dir=/workspace/profiling_results
Prometheus: Prometheus: # NOTE: this is set on Prometheus to ensure a service is created for the Prometheus component. This is a workaround and should be managed differently.
dynamoNamespace: vllm-disagg-planner dynamoNamespace: vllm-disagg-planner
componentType: main componentType: frontend
replicas: 1 replicas: 1
envs: envs:
- name: PYTHONPATH - name: PYTHONPATH
...@@ -146,20 +128,6 @@ spec: ...@@ -146,20 +128,6 @@ spec:
envFromSecret: hf-token-secret envFromSecret: hf-token-secret
componentType: worker componentType: worker
replicas: 2 replicas: 2
livenessProbe:
httpGet:
path: /live
port: 9090
periodSeconds: 5
timeoutSeconds: 30
failureThreshold: 1
readinessProbe:
httpGet:
path: /health
port: 9090
periodSeconds: 10
timeoutSeconds: 30
failureThreshold: 60
resources: resources:
requests: requests:
cpu: "8" cpu: "8"
...@@ -169,13 +137,6 @@ spec: ...@@ -169,13 +137,6 @@ spec:
cpu: "8" cpu: "8"
memory: "16Gi" memory: "16Gi"
gpu: "1" gpu: "1"
envs:
- name: DYN_SYSTEM_ENABLED
value: "true"
- name: DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS
value: "[\"generate\"]"
- name: DYN_SYSTEM_PORT
value: "9090"
extraPodSpec: extraPodSpec:
mainContainer: mainContainer:
startupProbe: startupProbe:
...@@ -190,26 +151,12 @@ spec: ...@@ -190,26 +151,12 @@ spec:
- /bin/sh - /bin/sh
- -c - -c
args: args:
- "python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --migration-limit=3 2>&1 | tee /tmp/vllm.log" - "python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --migration-limit=3"
VllmPrefillWorker: VllmPrefillWorker:
dynamoNamespace: vllm-disagg-planner dynamoNamespace: vllm-disagg-planner
envFromSecret: hf-token-secret envFromSecret: hf-token-secret
componentType: worker componentType: worker
replicas: 2 replicas: 2
livenessProbe:
httpGet:
path: /health
port: 9090
periodSeconds: 5
timeoutSeconds: 30
failureThreshold: 1
readinessProbe:
httpGet:
path: /health
port: 9090
periodSeconds: 10
timeoutSeconds: 30
failureThreshold: 60
resources: resources:
requests: requests:
cpu: "8" cpu: "8"
...@@ -219,13 +166,6 @@ spec: ...@@ -219,13 +166,6 @@ spec:
cpu: "8" cpu: "8"
memory: "16Gi" memory: "16Gi"
gpu: "1" gpu: "1"
envs:
- name: DYN_SYSTEM_ENABLED
value: "true"
- name: DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS
value: "[\"generate\"]"
- name: DYN_SYSTEM_PORT
value: "9090"
extraPodSpec: extraPodSpec:
mainContainer: mainContainer:
startupProbe: startupProbe:
...@@ -240,4 +180,4 @@ spec: ...@@ -240,4 +180,4 @@ spec:
- /bin/sh - /bin/sh
- -c - -c
args: args:
- python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --is-prefill-worker --migration-limit=3 2>&1 | tee /tmp/vllm.log - python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --is-prefill-worker --migration-limit=3
...@@ -9,33 +9,8 @@ spec: ...@@ -9,33 +9,8 @@ spec:
services: services:
Frontend: Frontend:
dynamoNamespace: vllm-v1-disagg-router dynamoNamespace: vllm-v1-disagg-router
componentType: main componentType: frontend
replicas: 1 replicas: 1
livenessProbe:
httpGet:
path: /health
port: 8000
initialDelaySeconds: 20
periodSeconds: 5
timeoutSeconds: 5
failureThreshold: 3
readinessProbe:
exec:
command:
- /bin/sh
- -c
- 'curl -s http://localhost:8000/health | jq -e ".status == \"healthy\""'
initialDelaySeconds: 60
periodSeconds: 60
timeoutSeconds: 30
failureThreshold: 10
resources:
requests:
cpu: "1"
memory: "2Gi"
limits:
cpu: "1"
memory: "2Gi"
extraPodSpec: extraPodSpec:
mainContainer: mainContainer:
image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-233.17 image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-233.17
...@@ -50,96 +25,26 @@ spec: ...@@ -50,96 +25,26 @@ spec:
envFromSecret: hf-token-secret envFromSecret: hf-token-secret
componentType: worker componentType: worker
replicas: 2 replicas: 2
livenessProbe:
httpGet:
path: /live
port: 9090
periodSeconds: 5
timeoutSeconds: 30
failureThreshold: 1
readinessProbe:
httpGet:
path: /health
port: 9090
periodSeconds: 10
timeoutSeconds: 30
failureThreshold: 60
resources:
requests:
cpu: "10"
memory: "20Gi"
gpu: "1"
limits:
cpu: "10"
memory: "20Gi"
gpu: "1"
envs:
- name: DYN_SYSTEM_ENABLED
value: "true"
- name: DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS
value: "[\"generate\"]"
extraPodSpec: extraPodSpec:
mainContainer: mainContainer:
startupProbe:
httpGet:
path: /health
port: 9090
periodSeconds: 10
failureThreshold: 60
image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-233.17 image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-233.17
workingDir: /workspace/components/backends/vllm workingDir: /workspace/components/backends/vllm
command: command:
- /bin/sh - /bin/sh
- -c - -c
args: args:
- python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B 2>&1 | tee /tmp/vllm.log - python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B
VllmPrefillWorker: VllmPrefillWorker:
dynamoNamespace: vllm-v1-disagg-router dynamoNamespace: vllm-v1-disagg-router
envFromSecret: hf-token-secret envFromSecret: hf-token-secret
componentType: worker componentType: worker
replicas: 1 replicas: 1
livenessProbe:
httpGet:
path: /live
port: 9090
periodSeconds: 5
timeoutSeconds: 30
failureThreshold: 1
readinessProbe:
httpGet:
path: /health
port: 9090
periodSeconds: 10
timeoutSeconds: 30
failureThreshold: 60
resources:
requests:
cpu: "10"
memory: "20Gi"
gpu: "1"
limits:
cpu: "10"
memory: "20Gi"
gpu: "1"
envs:
- name: DYN_SYSTEM_ENABLED
value: "true"
- name: DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS
value: "[\"generate\"]"
- name: DYN_SYSTEM_PORT
value: "9090"
extraPodSpec: extraPodSpec:
mainContainer: mainContainer:
startupProbe:
httpGet:
path: /health
port: 9090
periodSeconds: 10
failureThreshold: 60
image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-233.17 image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-233.17
workingDir: /workspace/components/backends/vllm workingDir: /workspace/components/backends/vllm
command: command:
- /bin/sh - /bin/sh
- -c - -c
args: args:
- python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --is-prefill-worker 2>&1 | tee /tmp/vllm.log - python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --is-prefill-worker
...@@ -181,8 +181,8 @@ func (s *DynamoComponentDeployment) SetSpec(spec any) { ...@@ -181,8 +181,8 @@ func (s *DynamoComponentDeployment) SetSpec(spec any) {
s.Spec = spec.(DynamoComponentDeploymentSpec) s.Spec = spec.(DynamoComponentDeploymentSpec)
} }
func (s *DynamoComponentDeployment) IsMainComponent() bool { func (s *DynamoComponentDeployment) IsFrontendComponent() bool {
return strings.HasSuffix(s.Spec.DynamoTag, s.Spec.ServiceName) || s.Spec.ComponentType == commonconsts.ComponentTypeMain return strings.HasSuffix(s.Spec.DynamoTag, s.Spec.ServiceName) || s.Spec.ComponentType == commonconsts.ComponentTypeFrontend
} }
func (s *DynamoComponentDeployment) GetDynamoDeploymentConfig() []byte { func (s *DynamoComponentDeployment) GetDynamoDeploymentConfig() []byte {
......
...@@ -28,7 +28,7 @@ import ( ...@@ -28,7 +28,7 @@ import (
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
) )
func TestDynamoComponentDeployment_IsMainComponent(t *testing.T) { func TestDynamoComponentDeployment_IsFrontendComponent(t *testing.T) {
type fields struct { type fields struct {
TypeMeta metav1.TypeMeta TypeMeta metav1.TypeMeta
ObjectMeta metav1.ObjectMeta ObjectMeta metav1.ObjectMeta
...@@ -73,8 +73,8 @@ func TestDynamoComponentDeployment_IsMainComponent(t *testing.T) { ...@@ -73,8 +73,8 @@ func TestDynamoComponentDeployment_IsMainComponent(t *testing.T) {
Spec: tt.fields.Spec, Spec: tt.fields.Spec,
Status: tt.fields.Status, Status: tt.fields.Status,
} }
if got := s.IsMainComponent(); got != tt.want { if got := s.IsFrontendComponent(); got != tt.want {
t.Errorf("DynamoComponentDeployment.IsMainComponent() = %v, want %v", got, tt.want) t.Errorf("DynamoComponentDeployment.IsFrontendComponent() = %v, want %v", got, tt.want)
} }
}) })
} }
......
...@@ -38,8 +38,9 @@ const ( ...@@ -38,8 +38,9 @@ const (
DynamoDeploymentConfigEnvVar = "DYN_DEPLOYMENT_CONFIG" DynamoDeploymentConfigEnvVar = "DYN_DEPLOYMENT_CONFIG"
ComponentTypePlanner = "planner" ComponentTypePlanner = "planner"
ComponentTypeMain = "main" ComponentTypeFrontend = "frontend"
ComponentTypeWorker = "worker" ComponentTypeWorker = "worker"
ComponentTypeDefault = "default"
PlannerServiceAccountName = "planner-serviceaccount" PlannerServiceAccountName = "planner-serviceaccount"
DefaultIngressSuffix = "local" DefaultIngressSuffix = "local"
......
...@@ -1341,7 +1341,7 @@ func (r *DynamoComponentDeploymentReconciler) generateService(opt generateResour ...@@ -1341,7 +1341,7 @@ func (r *DynamoComponentDeploymentReconciler) generateService(opt generateResour
}, },
} }
if !opt.dynamoComponentDeployment.IsMainComponent() || (!opt.isGenericService && !opt.containsStealingTrafficDebugModeEnabled) { if !opt.dynamoComponentDeployment.IsFrontendComponent() || (!opt.isGenericService && !opt.containsStealingTrafficDebugModeEnabled) {
// if it's not the main component or if it's not a generic service and not contains stealing traffic debug mode enabled, we don't need to create the service // if it's not the main component or if it's not a generic service and not contains stealing traffic debug mode enabled, we don't need to create the service
return kubeService, true, nil return kubeService, true, nil
} }
......
...@@ -940,12 +940,8 @@ func TestDynamoComponentDeploymentReconciler_generateLeaderWorkerSet(t *testing. ...@@ -940,12 +940,8 @@ func TestDynamoComponentDeploymentReconciler_generateLeaderWorkerSet(t *testing.
Image: "test-image:latest", Image: "test-image:latest",
Command: []string{"sh", "-c"}, Command: []string{"sh", "-c"},
Args: []string{"ray start --head --port=6379 && some dynamo command"}, Args: []string{"ray start --head --port=6379 && some dynamo command"},
Env: []corev1.EnvVar{{Name: "TEST_ENV_FROM_DYNAMO_COMPONENT_DEPLOYMENT_SPEC", Value: "test_value_from_dynamo_component_deployment_spec"}, {Name: "TEST_ENV_FROM_EXTRA_POD_SPEC", Value: "test_value_from_extra_pod_spec"}, {Name: "DYNAMO_PORT", Value: fmt.Sprintf("%d", commonconsts.DynamoServicePort)}}, Env: []corev1.EnvVar{{Name: "TEST_ENV_FROM_DYNAMO_COMPONENT_DEPLOYMENT_SPEC", Value: "test_value_from_dynamo_component_deployment_spec"}, {Name: "TEST_ENV_FROM_EXTRA_POD_SPEC", Value: "test_value_from_extra_pod_spec"}},
Ports: []corev1.ContainerPort{ Ports: nil,
{
Protocol: corev1.ProtocolTCP, Name: commonconsts.DynamoServicePortName, ContainerPort: commonconsts.DynamoServicePort,
},
},
VolumeMounts: []corev1.VolumeMount{ VolumeMounts: []corev1.VolumeMount{
{ {
Name: "shared-memory", Name: "shared-memory",
...@@ -1000,8 +996,8 @@ func TestDynamoComponentDeploymentReconciler_generateLeaderWorkerSet(t *testing. ...@@ -1000,8 +996,8 @@ func TestDynamoComponentDeploymentReconciler_generateLeaderWorkerSet(t *testing.
Image: "test-image:latest", Image: "test-image:latest",
Command: []string{"sh", "-c"}, Command: []string{"sh", "-c"},
Args: []string{"ray start --address=$(LWS_LEADER_ADDRESS):6379 --block"}, Args: []string{"ray start --address=$(LWS_LEADER_ADDRESS):6379 --block"},
Env: []corev1.EnvVar{{Name: "TEST_ENV_FROM_DYNAMO_COMPONENT_DEPLOYMENT_SPEC", Value: "test_value_from_dynamo_component_deployment_spec"}, {Name: "TEST_ENV_FROM_EXTRA_POD_SPEC", Value: "test_value_from_extra_pod_spec"}, {Name: "DYNAMO_PORT", Value: fmt.Sprintf("%d", commonconsts.DynamoServicePort)}}, Env: []corev1.EnvVar{{Name: "TEST_ENV_FROM_DYNAMO_COMPONENT_DEPLOYMENT_SPEC", Value: "test_value_from_dynamo_component_deployment_spec"}, {Name: "TEST_ENV_FROM_EXTRA_POD_SPEC", Value: "test_value_from_extra_pod_spec"}},
Ports: []corev1.ContainerPort{{Protocol: corev1.ProtocolTCP, Name: commonconsts.DynamoServicePortName, ContainerPort: commonconsts.DynamoServicePort}}, Ports: nil,
VolumeMounts: []corev1.VolumeMount{ VolumeMounts: []corev1.VolumeMount{
{ {
Name: "shared-memory", Name: "shared-memory",
......
...@@ -179,7 +179,7 @@ func (r *DynamoGraphDeploymentReconciler) reconcileGroveResources(ctx context.Co ...@@ -179,7 +179,7 @@ func (r *DynamoGraphDeploymentReconciler) reconcileGroveResources(ctx context.Co
}) })
resources := []Resource{groveGangSetAsResource} resources := []Resource{groveGangSetAsResource}
for componentName, component := range dynamoDeployment.Spec.Services { for componentName, component := range dynamoDeployment.Spec.Services {
if component.ComponentType == consts.ComponentTypeMain { if component.ComponentType == consts.ComponentTypeFrontend {
// generate the main component service // generate the main component service
mainComponentService, err := dynamo.GenerateComponentService(ctx, dynamo.GetDynamoComponentName(dynamoDeployment, componentName), dynamoDeployment.Namespace) mainComponentService, err := dynamo.GenerateComponentService(ctx, dynamo.GetDynamoComponentName(dynamoDeployment, componentName), dynamoDeployment.Namespace)
if err != nil { if err != nil {
......
/*
* SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*/
package dynamo
import (
commonconsts "github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/consts"
corev1 "k8s.io/api/core/v1"
)
// ComponentDefaults describes how a component type supplies its default
// container and pod spec configuration.
type ComponentDefaults interface {
// GetBaseContainer returns the base container configuration for this component type.
// The numberOfNodes parameter indicates the total number of nodes in the deployment.
// A non-nil error means the base container could not be produced.
GetBaseContainer(numberOfNodes int32) (corev1.Container, error)
// GetBasePodSpec returns the base pod spec configuration for this component type.
// The numberOfNodes parameter indicates the total number of nodes in the deployment.
// A non-nil error means the base pod spec could not be produced.
GetBasePodSpec(numberOfNodes int32) (corev1.PodSpec, error)
}
// ComponentDefaultsFactory selects the ComponentDefaults implementation that
// matches componentType.
//
// The frontend, worker and planner component types each map to their own
// constructor; every other value (including the empty string) falls back to
// a plain *BaseComponentDefaults.
//
// numberOfNodes is accepted as part of the signature but is not consulted
// when choosing the implementation.
func ComponentDefaultsFactory(componentType string, numberOfNodes int32) ComponentDefaults {
	var defaults ComponentDefaults

	switch componentType {
	case commonconsts.ComponentTypeFrontend:
		defaults = NewFrontendDefaults()
	case commonconsts.ComponentTypeWorker:
		defaults = NewWorkerDefaults()
	case commonconsts.ComponentTypePlanner:
		defaults = NewPlannerDefaults()
	default:
		// Unrecognized component types get only the common base defaults.
		defaults = &BaseComponentDefaults{}
	}

	return defaults
}
// BaseComponentDefaults provides common defaults shared by all components.
// It carries no state; ComponentDefaultsFactory returns it for any component
// type that has no dedicated case.
type BaseComponentDefaults struct{}
// GetBaseContainer returns the common container produced by
// getCommonContainer. numberOfNodes is ignored and the returned error is
// always nil.
func (b *BaseComponentDefaults) GetBaseContainer(numberOfNodes int32) (corev1.Container, error) {
	baseContainer := b.getCommonContainer()
	return baseContainer, nil
}
// GetBasePodSpec returns a zero-value pod spec: the base defaults contribute
// no pod-level settings. numberOfNodes is ignored and the returned error is
// always nil.
func (b *BaseComponentDefaults) GetBasePodSpec(numberOfNodes int32) (corev1.PodSpec, error) {
	var emptySpec corev1.PodSpec
	return emptySpec, nil
}
// getCommonContainer builds the minimal container used by GetBaseContainer:
// it is named "main" and its command is the shell invocation "/bin/sh -c".
// All other container fields are left at their zero values.
func (b *BaseComponentDefaults) getCommonContainer() corev1.Container {
	return corev1.Container{
		Name:    "main",
		Command: []string{"/bin/sh", "-c"},
	}
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment