"docs/vscode:/vscode.git/clone" did not exist on "c09ac69702ec9cd2772f5a5a837bee3488723d08"
Unverified Commit 81c27803 authored by mohammedabdulwahhab's avatar mohammedabdulwahhab Committed by GitHub
Browse files

fix: operator defaults (#2398)


Signed-off-by: mohammedabdulwahhab <furkhan324@berkeley.edu>
parent 9ddb3efd
......@@ -8,26 +8,8 @@ metadata:
spec:
services:
Frontend:
livenessProbe:
httpGet:
path: /health
port: 8000
initialDelaySeconds: 20
periodSeconds: 5
timeoutSeconds: 5
failureThreshold: 3
readinessProbe:
exec:
command:
- /bin/sh
- -c
- 'curl -s http://localhost:8000/health | jq -e ".status == \"healthy\""'
initialDelaySeconds: 60
periodSeconds: 60
timeoutSeconds: 30
failureThreshold: 10
dynamoNamespace: sglang-agg
componentType: main
componentType: frontend
replicas: 1
resources:
requests:
......@@ -45,21 +27,6 @@ spec:
- "python3 -m dynamo.sglang.utils.clear_namespace --namespace sglang-agg && python3 -m dynamo.frontend --http-port=8000"
SGLangDecodeWorker:
envFromSecret: hf-token-secret
livenessProbe:
httpGet:
path: /live
port: 9090
periodSeconds: 5
timeoutSeconds: 30
failureThreshold: 1
readinessProbe:
exec:
httpGet:
path: /health
port: 9090
periodSeconds: 10
timeoutSeconds: 30
failureThreshold: 60
dynamoNamespace: sglang-agg
componentType: worker
replicas: 1
......@@ -72,21 +39,8 @@ spec:
cpu: "32"
memory: "80Gi"
gpu: "1"
envs:
- name: DYN_SYSTEM_ENABLED
value: "true"
- name: DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS
value: "[\"generate\"]"
- name: DYN_SYSTEM_PORT
value: "9090"
extraPodSpec:
mainContainer:
startupProbe:
httpGet:
path: /live
port: 9090
periodSeconds: 10
failureThreshold: 60
image: my-registry/sglang-runtime:my-tag
workingDir: /workspace/components/backends/sglang
command:
......
......@@ -8,26 +8,8 @@ metadata:
spec:
services:
Frontend:
livenessProbe:
httpGet:
path: /health
port: 8000
initialDelaySeconds: 20
periodSeconds: 5
timeoutSeconds: 5
failureThreshold: 3
readinessProbe:
exec:
command:
- /bin/sh
- -c
- 'curl -s http://localhost:8000/health | jq -e ".status == \"healthy\""'
initialDelaySeconds: 60
periodSeconds: 60
timeoutSeconds: 30
failureThreshold: 10
dynamoNamespace: sglang-agg-router
componentType: main
componentType: frontend
replicas: 1
resources:
requests:
......@@ -45,21 +27,6 @@ spec:
- "python3 -m dynamo.sglang.utils.clear_namespace --namespace sglang-agg-router && python3 -m dynamo.frontend --http-port=8000 --router-mode kv"
SGLangDecodeWorker:
envFromSecret: hf-token-secret
livenessProbe:
httpGet:
path: /live
port: 9090
periodSeconds: 5
timeoutSeconds: 30
failureThreshold: 1
readinessProbe:
exec:
httpGet:
path: /health
port: 9090
periodSeconds: 10
timeoutSeconds: 30
failureThreshold: 60
dynamoNamespace: sglang-agg-router
componentType: worker
replicas: 1
......@@ -72,21 +39,8 @@ spec:
cpu: "32"
memory: "80Gi"
gpu: "1"
envs:
- name: DYN_SYSTEM_ENABLED
value: "true"
- name: DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS
value: "[\"generate\"]"
- name: DYN_SYSTEM_PORT
value: "9090"
extraPodSpec:
mainContainer:
startupProbe:
httpGet:
path: /live
port: 9090
periodSeconds: 10
failureThreshold: 60
image: my-registry/sglang-runtime:my-tag
workingDir: /workspace/components/backends/sglang
command:
......
......@@ -8,26 +8,8 @@ metadata:
spec:
services:
Frontend:
livenessProbe:
httpGet:
path: /health
port: 8000
initialDelaySeconds: 20
periodSeconds: 5
timeoutSeconds: 5
failureThreshold: 3
readinessProbe:
exec:
command:
- /bin/sh
- -c
- 'curl -s http://localhost:8000/health | jq -e ".status == \"healthy\""'
initialDelaySeconds: 60
periodSeconds: 60
timeoutSeconds: 30
failureThreshold: 10
dynamoNamespace: sglang-disagg
componentType: main
componentType: frontend
replicas: 1
resources:
requests:
......@@ -45,21 +27,6 @@ spec:
- "python3 -m dynamo.sglang.utils.clear_namespace --namespace sglang-disagg && python3 -m dynamo.frontend --http-port=8000"
SGLangDecodeWorker:
envFromSecret: hf-token-secret
livenessProbe:
httpGet:
path: /live
port: 9090
periodSeconds: 5
timeoutSeconds: 30
failureThreshold: 1
readinessProbe:
exec:
httpGet:
path: /health
port: 9090
periodSeconds: 10
timeoutSeconds: 30
failureThreshold: 60
dynamoNamespace: sglang-disagg
componentType: worker
replicas: 1
......@@ -72,21 +39,8 @@ spec:
cpu: "32"
memory: "80Gi"
gpu: "1"
envs:
- name: DYN_SYSTEM_ENABLED
value: "true"
- name: DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS
value: "[\"generate\"]"
- name: DYN_SYSTEM_PORT
value: "9090"
extraPodSpec:
mainContainer:
startupProbe:
httpGet:
path: /live
port: 9090
periodSeconds: 10
failureThreshold: 60
image: nvcr.io/nvidian/nim-llm-dev/sglang-runtime:hzhou-0808-07
workingDir: /workspace/components/backends/sglang
command:
......@@ -112,21 +66,6 @@ spec:
- "nixl"
SGLangPrefillWorker:
envFromSecret: hf-token-secret
livenessProbe:
httpGet:
path: /live
port: 9090
periodSeconds: 5
timeoutSeconds: 30
failureThreshold: 1
readinessProbe:
exec:
httpGet:
path: /health
port: 9090
periodSeconds: 10
timeoutSeconds: 30
failureThreshold: 60
dynamoNamespace: sglang-disagg
componentType: worker
replicas: 1
......@@ -139,21 +78,8 @@ spec:
cpu: "32"
memory: "80Gi"
gpu: "1"
envs:
- name: DYN_SYSTEM_ENABLED
value: "true"
- name: DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS
value: "[\"generate\"]"
- name: DYN_SYSTEM_PORT
value: "9090"
extraPodSpec:
mainContainer:
startupProbe:
httpGet:
path: /health
port: 9090
periodSeconds: 10
failureThreshold: 60
image: nvcr.io/nvidian/nim-llm-dev/sglang-runtime:hzhou-0808-07
workingDir: /workspace/components/backends/sglang
command:
......
......@@ -16,25 +16,7 @@ spec:
services:
Frontend:
dynamoNamespace: dynamo
livenessProbe:
httpGet:
path: /health
port: 8000
initialDelaySeconds: 20
periodSeconds: 5
timeoutSeconds: 5
failureThreshold: 3
readinessProbe:
exec:
command:
- /bin/sh
- -c
- 'curl -s http://localhost:8000/health | jq -e ".status == \"healthy\""'
initialDelaySeconds: 60
periodSeconds: 60
timeoutSeconds: 30
failureThreshold: 10
componentType: main
componentType: frontend
replicas: 1
resources:
requests:
......@@ -97,9 +79,9 @@ spec:
- --backend=sglang
- --adjustment-interval=60
- --profile-results-dir=/workspace/profiling_results
Prometheus:
Prometheus: # NOTE: this is set on Prometheus to ensure a service is created for the Prometheus component. This is a workaround and should be managed differently.
dynamoNamespace: dynamo
componentType: main
componentType: frontend
replicas: 1
envs:
- name: PYTHONPATH
......@@ -142,20 +124,6 @@ spec:
SGLangDecodeWorker:
dynamoNamespace: dynamo
envFromSecret: hf-token-secret
livenessProbe:
httpGet:
path: /live
port: 9090
periodSeconds: 5
timeoutSeconds: 30
failureThreshold: 1
readinessProbe:
httpGet:
path: /health
port: 9090
periodSeconds: 10
timeoutSeconds: 30
failureThreshold: 60
componentType: worker
replicas: 2
resources:
......@@ -167,21 +135,8 @@ spec:
cpu: "32"
memory: "80Gi"
gpu: "1"
envs:
- name: DYN_SYSTEM_ENABLED
value: "true"
- name: DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS
value: "[\"generate\"]"
- name: DYN_SYSTEM_PORT
value: "9090"
extraPodSpec:
mainContainer:
startupProbe:
httpGet:
path: /live
port: 9090
periodSeconds: 10
failureThreshold: 60
image: nvcr.io/nvidian/nim-llm-dev/sglang-runtime:hzhou-0811-1
workingDir: /workspace/components/backends/sglang
args:
......@@ -205,20 +160,6 @@ spec:
SGLangPrefillWorker:
dynamoNamespace: dynamo
envFromSecret: hf-token-secret
livenessProbe:
httpGet:
path: /live
port: 9090
periodSeconds: 5
timeoutSeconds: 30
failureThreshold: 1
readinessProbe:
httpGet:
path: /health
port: 9090
periodSeconds: 10
timeoutSeconds: 30
failureThreshold: 60
componentType: worker
replicas: 2
resources:
......@@ -230,21 +171,8 @@ spec:
cpu: "32"
memory: "80Gi"
gpu: "1"
envs:
- name: DYN_SYSTEM_ENABLED
value: "true"
- name: DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS
value: "[\"generate\"]"
- name: DYN_SYSTEM_PORT
value: "9090"
extraPodSpec:
mainContainer:
startupProbe:
httpGet:
path: /health
port: 9090
periodSeconds: 10
failureThreshold: 60
image: nvcr.io/nvidian/nim-llm-dev/sglang-runtime:hzhou-0811-1
workingDir: /workspace/components/backends/sglang
args:
......
......@@ -9,26 +9,7 @@ spec:
services:
Frontend:
dynamoNamespace: trtllm-agg
componentType: main
livenessProbe:
exec:
command:
- /bin/sh
- -c
- 'curl -s http://localhost:8000/health | jq -e ".status == \"healthy\""'
periodSeconds: 5
timeoutSeconds: 3
failureThreshold: 3
readinessProbe:
exec:
command:
- /bin/sh
- -c
- 'curl -s http://localhost:8000/health | jq -e ".status == \"healthy\""'
initialDelaySeconds: 60
periodSeconds: 60
timeoutSeconds: 3
failureThreshold: 10
componentType: frontend
replicas: 1
resources:
requests:
......@@ -48,20 +29,6 @@ spec:
- "python3 -m dynamo.frontend --http-port 8000"
TRTLLMWorker:
envFromSecret: hf-token-secret
livenessProbe:
httpGet:
path: /live
port: 9090
periodSeconds: 5
timeoutSeconds: 3
failureThreshold: 3
readinessProbe:
httpGet:
path: /health
port: 9090
periodSeconds: 10
timeoutSeconds: 3
failureThreshold: 60
dynamoNamespace: trtllm-agg
componentType: worker
replicas: 1
......@@ -74,22 +41,8 @@ spec:
cpu: "10"
memory: "20Gi"
gpu: "1"
envs:
- name: DYN_SYSTEM_ENABLED
value: "true"
- name: DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS
value: "[\"generate\"]"
- name: DYN_SYSTEM_PORT
value: "9090"
extraPodSpec:
mainContainer:
startupProbe:
httpGet:
path: /health
port: 9090
periodSeconds: 10
timeoutSeconds: 3
failureThreshold: 60
image: nvcr.io/nvidian/nim-llm-dev/trtllm-runtime:dep-233.17
workingDir: /workspace/components/backends/trtllm
args:
......
......@@ -8,35 +8,9 @@ metadata:
spec:
services:
Frontend:
livenessProbe:
exec:
command:
- /bin/sh
- -c
- 'curl -s http://localhost:8000/health | jq -e ".status == \"healthy\""'
periodSeconds: 5
timeoutSeconds: 3
failureThreshold: 3
readinessProbe:
exec:
command:
- /bin/sh
- -c
- 'curl -s http://localhost:8000/health | jq -e ".status == \"healthy\""'
initialDelaySeconds: 60
periodSeconds: 60
timeoutSeconds: 3
failureThreshold: 5
dynamoNamespace: trtllm-agg-router
componentType: main
componentType: frontend
replicas: 1
resources:
requests:
cpu: "1"
memory: "2Gi"
limits:
cpu: "1"
memory: "2Gi"
extraPodSpec:
mainContainer:
image: nvcr.io/nvidian/nim-llm-dev/trtllm-runtime:dep-233.17
......@@ -48,20 +22,6 @@ spec:
- "python3 -m dynamo.frontend --http-port 8000 --router-mode kv"
TRTLLMWorker:
envFromSecret: hf-token-secret
livenessProbe:
httpGet:
path: /live
port: 9090
periodSeconds: 5
timeoutSeconds: 3
failureThreshold: 3
readinessProbe:
httpGet:
path: /health
port: 9090
periodSeconds: 10
timeoutSeconds: 3
failureThreshold: 60
dynamoNamespace: trtllm-agg-router
componentType: worker
replicas: 2
......@@ -74,22 +34,8 @@ spec:
cpu: "10"
memory: "20Gi"
gpu: "1"
envs:
- name: DYN_SYSTEM_ENABLED
value: "true"
- name: DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS
value: "[\"generate\"]"
- name: DYN_SYSTEM_PORT
value: "9090"
extraPodSpec:
mainContainer:
startupProbe:
httpGet:
path: /health
port: 9090
periodSeconds: 10
timeoutSeconds: 3
failureThreshold: 60
image: nvcr.io/nvidian/nim-llm-dev/trtllm-runtime:dep-233.17
workingDir: /workspace/components/backends/trtllm
args:
......
......@@ -9,26 +9,7 @@ spec:
services:
Frontend:
dynamoNamespace: trtllm-disagg
componentType: main
livenessProbe:
exec:
command:
- /bin/sh
- -c
- 'curl -s http://localhost:8000/health | jq -e ".status == \"healthy\""'
periodSeconds: 5
timeoutSeconds: 3
failureThreshold: 3
readinessProbe:
exec:
command:
- /bin/sh
- -c
- 'curl -s http://localhost:8000/health | jq -e ".status == \"healthy\""'
initialDelaySeconds: 60
periodSeconds: 60
timeoutSeconds: 3
failureThreshold: 10
componentType: frontend
replicas: 1
resources:
requests:
......@@ -51,20 +32,6 @@ spec:
envFromSecret: hf-token-secret
componentType: worker
replicas: 1
livenessProbe:
httpGet:
path: /live
port: 9090
periodSeconds: 5
timeoutSeconds: 3
failureThreshold: 3
readinessProbe:
httpGet:
path: /health
port: 9090
periodSeconds: 10
timeoutSeconds: 3
failureThreshold: 60
resources:
requests:
cpu: "10"
......@@ -76,46 +43,18 @@ spec:
gpu: "1"
extraPodSpec:
mainContainer:
startupProbe:
httpGet:
path: /health
port: 9090
periodSeconds: 10
timeoutSeconds: 3
failureThreshold: 60
image: nvcr.io/nvidian/nim-llm-dev/trtllm-runtime:dep-233.17
workingDir: /workspace/components/backends/trtllm
command:
- /bin/sh
- -c
args:
- "python3 -m dynamo.trtllm --model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B --served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B --extra-engine-args engine_configs/prefill.yaml --disaggregation-mode prefill --disaggregation-strategy decode_first 2>&1 | tee /tmp/trtllm.log"
envs:
- name: DYN_SYSTEM_ENABLED
value: "true"
- name: DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS
value: "[\"generate\"]"
- name: DYN_SYSTEM_PORT
value: "9090"
- "python3 -m dynamo.trtllm --model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B --served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B --extra-engine-args engine_configs/prefill.yaml --disaggregation-mode prefill --disaggregation-strategy decode_first"
TRTLLMDecodeWorker:
dynamoNamespace: trtllm-disagg
envFromSecret: hf-token-secret
componentType: worker
replicas: 1
livenessProbe:
httpGet:
path: /live
port: 9090
periodSeconds: 5
timeoutSeconds: 3
failureThreshold: 3
readinessProbe:
httpGet:
path: /health
port: 9090
periodSeconds: 10
timeoutSeconds: 3
failureThreshold: 60
resources:
requests:
cpu: "10"
......@@ -127,24 +66,10 @@ spec:
gpu: "1"
extraPodSpec:
mainContainer:
startupProbe:
httpGet:
path: /health
port: 9090
periodSeconds: 10
timeoutSeconds: 3
failureThreshold: 60
image: nvcr.io/nvidian/nim-llm-dev/trtllm-runtime:dep-233.17
workingDir: /workspace/components/backends/trtllm
command:
- /bin/sh
- -c
args:
- "python3 -m dynamo.trtllm --model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B --served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B --extra-engine-args engine_configs/decode.yaml --disaggregation-mode decode --disaggregation-strategy decode_first 2>&1 | tee /tmp/trtllm.log"
envs:
- name: DYN_SYSTEM_ENABLED
value: "true"
- name: DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS
value: "[\"generate\"]"
- name: DYN_SYSTEM_PORT
value: "9090"
- "python3 -m dynamo.trtllm --model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B --served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B --extra-engine-args engine_configs/decode.yaml --disaggregation-mode decode --disaggregation-strategy decode_first"
......@@ -9,26 +9,7 @@ spec:
services:
Frontend:
dynamoNamespace: trtllm-v1-disagg-router
componentType: main
livenessProbe:
exec:
command:
- /bin/sh
- -c
- 'curl -s http://localhost:8000/health | jq -e ".status == \"healthy\""'
periodSeconds: 5
timeoutSeconds: 3
failureThreshold: 3
readinessProbe:
exec:
command:
- /bin/sh
- -c
- 'curl -s http://localhost:8000/health | jq -e ".status == \"healthy\""'
initialDelaySeconds: 60
periodSeconds: 60
timeoutSeconds: 3
failureThreshold: 10
componentType: frontend
replicas: 1
resources:
requests:
......@@ -51,20 +32,6 @@ spec:
envFromSecret: hf-token-secret
componentType: worker
replicas: 2
livenessProbe:
httpGet:
path: /live
port: 9090
periodSeconds: 5
timeoutSeconds: 3
failureThreshold: 3
readinessProbe:
httpGet:
path: /health
port: 9090
periodSeconds: 10
timeoutSeconds: 3
failureThreshold: 60
resources:
requests:
cpu: "10"
......@@ -76,46 +43,18 @@ spec:
gpu: "1"
extraPodSpec:
mainContainer:
startupProbe:
httpGet:
path: /health
port: 9090
periodSeconds: 10
timeoutSeconds: 3
failureThreshold: 60
image: nvcr.io/nvidian/nim-llm-dev/trtllm-runtime:dep-233.17
workingDir: /workspace/components/backends/trtllm
command:
- /bin/sh
- -c
args:
- "python3 -m dynamo.trtllm --model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B --served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B --extra-engine-args engine_configs/prefill.yaml --disaggregation-mode prefill --disaggregation-strategy prefill_first --publish-events-and-metrics 2>&1 | tee /tmp/trtllm.log"
envs:
- name: DYN_SYSTEM_ENABLED
value: "true"
- name: DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS
value: "[\"generate\"]"
- name: DYN_SYSTEM_PORT
value: "9090"
- "python3 -m dynamo.trtllm --model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B --served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B --extra-engine-args engine_configs/prefill.yaml --disaggregation-mode prefill --disaggregation-strategy prefill_first --publish-events-and-metrics"
TRTLLMDecodeWorker:
dynamoNamespace: trtllm-v1-disagg-router
envFromSecret: hf-token-secret
componentType: worker
replicas: 1
livenessProbe:
httpGet:
path: /live
port: 9090
periodSeconds: 5
timeoutSeconds: 3
failureThreshold: 3
readinessProbe:
httpGet:
path: /health
port: 9090
periodSeconds: 10
timeoutSeconds: 3
failureThreshold: 60
resources:
requests:
cpu: "10"
......@@ -127,24 +66,10 @@ spec:
gpu: "1"
extraPodSpec:
mainContainer:
startupProbe:
httpGet:
path: /health
port: 9090
periodSeconds: 10
timeoutSeconds: 3
failureThreshold: 60
image: nvcr.io/nvidian/nim-llm-dev/trtllm-runtime:dep-233.17
workingDir: /workspace/components/backends/trtllm
command:
- /bin/sh
- -c
args:
- "python3 -m dynamo.trtllm --model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B --served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B --extra-engine-args engine_configs/decode.yaml --disaggregation-mode decode --disaggregation-strategy prefill_first 2>&1 | tee /tmp/trtllm.log"
envs:
- name: DYN_SYSTEM_ENABLED
value: "true"
- name: DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS
value: "[\"generate\"]"
- name: DYN_SYSTEM_PORT
value: "9090"
- "python3 -m dynamo.trtllm --model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B --served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B --extra-engine-args engine_configs/decode.yaml --disaggregation-mode decode --disaggregation-strategy prefill_first"
......@@ -8,34 +8,9 @@ metadata:
spec:
services:
Frontend:
livenessProbe:
httpGet:
path: /health
port: 8000
initialDelaySeconds: 20
periodSeconds: 5
timeoutSeconds: 5
failureThreshold: 3
readinessProbe:
exec:
command:
- /bin/sh
- -c
- 'curl -s http://localhost:8000/health | jq -e ".status == \"healthy\""'
initialDelaySeconds: 10
periodSeconds: 5
timeoutSeconds: 5
failureThreshold: 3
dynamoNamespace: vllm-agg
componentType: main
componentType: frontend
replicas: 1
resources:
requests:
cpu: "1"
memory: "2Gi"
limits:
cpu: "1"
memory: "2Gi"
extraPodSpec:
mainContainer:
image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-233.17
......@@ -47,55 +22,15 @@ spec:
- "python3 -m dynamo.frontend --http-port 8000"
VllmDecodeWorker:
envFromSecret: hf-token-secret
livenessProbe:
httpGet:
path: /live
port: 9090
initialDelaySeconds: 30
periodSeconds: 10
timeoutSeconds: 5
failureThreshold: 3
readinessProbe:
httpGet:
path: /health
port: 9090
initialDelaySeconds: 30
periodSeconds: 10
timeoutSeconds: 5
failureThreshold: 60
dynamoNamespace: vllm-agg
componentType: worker
replicas: 1
resources:
requests:
cpu: "10"
memory: "20Gi"
gpu: "1"
limits:
cpu: "10"
memory: "20Gi"
gpu: "1"
envs:
- name: DYN_SYSTEM_ENABLED
value: "true"
- name: DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS
value: "[\"generate\"]"
- name: DYN_SYSTEM_PORT
value: "9090"
extraPodSpec:
mainContainer:
startupProbe:
httpGet:
path: /health
port: 9090
initialDelaySeconds: 10
periodSeconds: 10
timeoutSeconds: 5
failureThreshold: 60
image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-233.17
workingDir: /workspace/components/backends/vllm
command:
- /bin/sh
- -c
args:
- python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B 2>&1 | tee /tmp/vllm.log
- python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B
......@@ -8,34 +8,9 @@ metadata:
spec:
services:
Frontend:
livenessProbe:
httpGet:
path: /health
port: 8000
initialDelaySeconds: 20
periodSeconds: 5
timeoutSeconds: 5
failureThreshold: 3
readinessProbe:
exec:
command:
- /bin/sh
- -c
- 'curl -s http://localhost:8000/health | jq -e ".status == \"healthy\""'
initialDelaySeconds: 60
periodSeconds: 60
timeoutSeconds: 30
failureThreshold: 10
dynamoNamespace: vllm-agg-router
componentType: main
componentType: frontend
replicas: 1
resources:
requests:
cpu: "1"
memory: "2Gi"
limits:
cpu: "1"
memory: "2Gi"
extraPodSpec:
mainContainer:
image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-233.17
......@@ -47,51 +22,15 @@ spec:
- "python3 -m dynamo.frontend --http-port 8000 --router-mode kv"
VllmDecodeWorker:
envFromSecret: hf-token-secret
livenessProbe:
httpGet:
path: /live
port: 9090
periodSeconds: 5
timeoutSeconds: 30
failureThreshold: 1
readinessProbe:
httpGet:
path: /health
port: 9090
periodSeconds: 10
timeoutSeconds: 30
failureThreshold: 60
dynamoNamespace: vllm-agg-router
componentType: worker
replicas: 2
resources:
requests:
cpu: "10"
memory: "20Gi"
gpu: "1"
limits:
cpu: "10"
memory: "20Gi"
gpu: "1"
envs:
- name: DYN_SYSTEM_ENABLED
value: "true"
- name: DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS
value: "[\"generate\"]"
- name: DYN_SYSTEM_PORT
value: "9090"
extraPodSpec:
mainContainer:
startupProbe:
httpGet:
path: /health
port: 9090
periodSeconds: 10
failureThreshold: 60
image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-233.17
workingDir: /workspace/components/backends/vllm
command:
- /bin/sh
- -c
args:
- python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B 2>&1 | tee /tmp/vllm.log
- python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B
......@@ -9,26 +9,8 @@ spec:
services:
Frontend:
dynamoNamespace: vllm-disagg
componentType: main
componentType: frontend
replicas: 1
livenessProbe:
httpGet:
path: /health
port: 8000
initialDelaySeconds: 20
periodSeconds: 5
timeoutSeconds: 5
failureThreshold: 3
readinessProbe:
exec:
command:
- /bin/sh
- -c
- 'curl -s http://localhost:8000/health | jq -e ".status == \"healthy\""'
initialDelaySeconds: 60
periodSeconds: 60
timeoutSeconds: 30
failureThreshold: 10
resources:
requests:
cpu: "32"
......@@ -50,20 +32,6 @@ spec:
envFromSecret: hf-token-secret
componentType: worker
replicas: 1
livenessProbe:
httpGet:
path: /live
port: 9090
periodSeconds: 5
timeoutSeconds: 30
failureThreshold: 1
readinessProbe:
httpGet:
path: /health
port: 9090
periodSeconds: 10
timeoutSeconds: 30
failureThreshold: 60
resources:
requests:
cpu: "32"
......@@ -73,47 +41,20 @@ spec:
cpu: "32"
memory: "40Gi"
gpu: "1"
envs:
- name: DYN_SYSTEM_ENABLED
value: "true"
- name: DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS
value: "[\"generate\"]"
- name: DYN_SYSTEM_PORT
value: "9090"
extraPodSpec:
mainContainer:
startupProbe:
httpGet:
path: /health
port: 9090
periodSeconds: 10
failureThreshold: 60
image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-233.17
workingDir: /workspace/components/backends/vllm
command:
- /bin/sh
- -c
args:
- "python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B 2>&1 | tee /tmp/vllm.log"
- "python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B"
VllmPrefillWorker:
dynamoNamespace: vllm-disagg
envFromSecret: hf-token-secret
componentType: worker
replicas: 1
livenessProbe:
httpGet:
path: /live
port: 9090
periodSeconds: 5
timeoutSeconds: 30
failureThreshold: 1
readinessProbe:
httpGet:
path: /health
port: 9090
periodSeconds: 10
timeoutSeconds: 30
failureThreshold: 60
resources:
requests:
cpu: "32"
......@@ -123,25 +64,12 @@ spec:
cpu: "32"
memory: "40Gi"
gpu: "1"
envs:
- name: DYN_SYSTEM_ENABLED
value: "true"
- name: DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS
value: "[\"generate\"]"
- name: DYN_SYSTEM_PORT
value: "9090"
extraPodSpec:
mainContainer:
startupProbe:
httpGet:
path: /health
port: 9090
periodSeconds: 10
failureThreshold: 60
image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-233.17
workingDir: /workspace/components/backends/vllm
command:
- /bin/sh
- -c
args:
- "python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --is-prefill-worker 2>&1 | tee /tmp/vllm.log"
- "python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --is-prefill-worker"
......@@ -16,26 +16,8 @@ spec:
services:
Frontend:
dynamoNamespace: vllm-disagg-planner
componentType: main
componentType: frontend
replicas: 1
livenessProbe:
httpGet:
path: /health
port: 8000
initialDelaySeconds: 20
periodSeconds: 5
timeoutSeconds: 5
failureThreshold: 3
readinessProbe:
exec:
command:
- /bin/sh
- -c
- 'curl -s http://localhost:8000/health | jq -e ".status == \"healthy\""'
initialDelaySeconds: 60
periodSeconds: 60
timeoutSeconds: 30
failureThreshold: 10
resources:
requests:
cpu: "32"
......@@ -99,9 +81,9 @@ spec:
- --backend=vllm
- --adjustment-interval=60
- --profile-results-dir=/workspace/profiling_results
Prometheus:
Prometheus: # NOTE: this is set on Prometheus to ensure a service is created for the Prometheus component. This is a workaround and should be managed differently.
dynamoNamespace: vllm-disagg-planner
componentType: main
componentType: frontend
replicas: 1
envs:
- name: PYTHONPATH
......@@ -146,20 +128,6 @@ spec:
envFromSecret: hf-token-secret
componentType: worker
replicas: 2
livenessProbe:
httpGet:
path: /live
port: 9090
periodSeconds: 5
timeoutSeconds: 30
failureThreshold: 1
readinessProbe:
httpGet:
path: /health
port: 9090
periodSeconds: 10
timeoutSeconds: 30
failureThreshold: 60
resources:
requests:
cpu: "8"
......@@ -169,13 +137,6 @@ spec:
cpu: "8"
memory: "16Gi"
gpu: "1"
envs:
- name: DYN_SYSTEM_ENABLED
value: "true"
- name: DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS
value: "[\"generate\"]"
- name: DYN_SYSTEM_PORT
value: "9090"
extraPodSpec:
mainContainer:
startupProbe:
......@@ -190,26 +151,12 @@ spec:
- /bin/sh
- -c
args:
- "python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --migration-limit=3 2>&1 | tee /tmp/vllm.log"
- "python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --migration-limit=3"
VllmPrefillWorker:
dynamoNamespace: vllm-disagg-planner
envFromSecret: hf-token-secret
componentType: worker
replicas: 2
livenessProbe:
httpGet:
path: /health
port: 9090
periodSeconds: 5
timeoutSeconds: 30
failureThreshold: 1
readinessProbe:
httpGet:
path: /health
port: 9090
periodSeconds: 10
timeoutSeconds: 30
failureThreshold: 60
resources:
requests:
cpu: "8"
......@@ -219,13 +166,6 @@ spec:
cpu: "8"
memory: "16Gi"
gpu: "1"
envs:
- name: DYN_SYSTEM_ENABLED
value: "true"
- name: DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS
value: "[\"generate\"]"
- name: DYN_SYSTEM_PORT
value: "9090"
extraPodSpec:
mainContainer:
startupProbe:
......@@ -240,4 +180,4 @@ spec:
- /bin/sh
- -c
args:
- python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --is-prefill-worker --migration-limit=3 2>&1 | tee /tmp/vllm.log
- python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --is-prefill-worker --migration-limit=3
......@@ -9,33 +9,8 @@ spec:
services:
Frontend:
dynamoNamespace: vllm-v1-disagg-router
componentType: main
componentType: frontend
replicas: 1
livenessProbe:
httpGet:
path: /health
port: 8000
initialDelaySeconds: 20
periodSeconds: 5
timeoutSeconds: 5
failureThreshold: 3
readinessProbe:
exec:
command:
- /bin/sh
- -c
- 'curl -s http://localhost:8000/health | jq -e ".status == \"healthy\""'
initialDelaySeconds: 60
periodSeconds: 60
timeoutSeconds: 30
failureThreshold: 10
resources:
requests:
cpu: "1"
memory: "2Gi"
limits:
cpu: "1"
memory: "2Gi"
extraPodSpec:
mainContainer:
image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-233.17
......@@ -50,96 +25,26 @@ spec:
envFromSecret: hf-token-secret
componentType: worker
replicas: 2
livenessProbe:
httpGet:
path: /live
port: 9090
periodSeconds: 5
timeoutSeconds: 30
failureThreshold: 1
readinessProbe:
httpGet:
path: /health
port: 9090
periodSeconds: 10
timeoutSeconds: 30
failureThreshold: 60
resources:
requests:
cpu: "10"
memory: "20Gi"
gpu: "1"
limits:
cpu: "10"
memory: "20Gi"
gpu: "1"
envs:
- name: DYN_SYSTEM_ENABLED
value: "true"
- name: DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS
value: "[\"generate\"]"
extraPodSpec:
mainContainer:
startupProbe:
httpGet:
path: /health
port: 9090
periodSeconds: 10
failureThreshold: 60
image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-233.17
workingDir: /workspace/components/backends/vllm
command:
- /bin/sh
- -c
args:
- python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B 2>&1 | tee /tmp/vllm.log
- python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B
VllmPrefillWorker:
dynamoNamespace: vllm-v1-disagg-router
envFromSecret: hf-token-secret
componentType: worker
replicas: 1
livenessProbe:
httpGet:
path: /live
port: 9090
periodSeconds: 5
timeoutSeconds: 30
failureThreshold: 1
readinessProbe:
httpGet:
path: /health
port: 9090
periodSeconds: 10
timeoutSeconds: 30
failureThreshold: 60
resources:
requests:
cpu: "10"
memory: "20Gi"
gpu: "1"
limits:
cpu: "10"
memory: "20Gi"
gpu: "1"
envs:
- name: DYN_SYSTEM_ENABLED
value: "true"
- name: DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS
value: "[\"generate\"]"
- name: DYN_SYSTEM_PORT
value: "9090"
extraPodSpec:
mainContainer:
startupProbe:
httpGet:
path: /health
port: 9090
periodSeconds: 10
failureThreshold: 60
image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-233.17
workingDir: /workspace/components/backends/vllm
command:
- /bin/sh
- -c
args:
- python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --is-prefill-worker 2>&1 | tee /tmp/vllm.log
- python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --is-prefill-worker
......@@ -181,8 +181,8 @@ func (s *DynamoComponentDeployment) SetSpec(spec any) {
s.Spec = spec.(DynamoComponentDeploymentSpec)
}
func (s *DynamoComponentDeployment) IsMainComponent() bool {
return strings.HasSuffix(s.Spec.DynamoTag, s.Spec.ServiceName) || s.Spec.ComponentType == commonconsts.ComponentTypeMain
func (s *DynamoComponentDeployment) IsFrontendComponent() bool {
return strings.HasSuffix(s.Spec.DynamoTag, s.Spec.ServiceName) || s.Spec.ComponentType == commonconsts.ComponentTypeFrontend
}
func (s *DynamoComponentDeployment) GetDynamoDeploymentConfig() []byte {
......
......@@ -28,7 +28,7 @@ import (
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)
func TestDynamoComponentDeployment_IsMainComponent(t *testing.T) {
func TestDynamoComponentDeployment_IsFrontendComponent(t *testing.T) {
type fields struct {
TypeMeta metav1.TypeMeta
ObjectMeta metav1.ObjectMeta
......@@ -73,8 +73,8 @@ func TestDynamoComponentDeployment_IsMainComponent(t *testing.T) {
Spec: tt.fields.Spec,
Status: tt.fields.Status,
}
if got := s.IsMainComponent(); got != tt.want {
t.Errorf("DynamoComponentDeployment.IsMainComponent() = %v, want %v", got, tt.want)
if got := s.IsFrontendComponent(); got != tt.want {
t.Errorf("DynamoComponentDeployment.IsFrontendComponent() = %v, want %v", got, tt.want)
}
})
}
......
......@@ -38,8 +38,9 @@ const (
DynamoDeploymentConfigEnvVar = "DYN_DEPLOYMENT_CONFIG"
ComponentTypePlanner = "planner"
ComponentTypeMain = "main"
ComponentTypeFrontend = "frontend"
ComponentTypeWorker = "worker"
ComponentTypeDefault = "default"
PlannerServiceAccountName = "planner-serviceaccount"
DefaultIngressSuffix = "local"
......
......@@ -1341,7 +1341,7 @@ func (r *DynamoComponentDeploymentReconciler) generateService(opt generateResour
},
}
if !opt.dynamoComponentDeployment.IsMainComponent() || (!opt.isGenericService && !opt.containsStealingTrafficDebugModeEnabled) {
if !opt.dynamoComponentDeployment.IsFrontendComponent() || (!opt.isGenericService && !opt.containsStealingTrafficDebugModeEnabled) {
// if it's not the main component or if it's not a generic service and not contains stealing traffic debug mode enabled, we don't need to create the service
return kubeService, true, nil
}
......
......@@ -940,12 +940,8 @@ func TestDynamoComponentDeploymentReconciler_generateLeaderWorkerSet(t *testing.
Image: "test-image:latest",
Command: []string{"sh", "-c"},
Args: []string{"ray start --head --port=6379 && some dynamo command"},
Env: []corev1.EnvVar{{Name: "TEST_ENV_FROM_DYNAMO_COMPONENT_DEPLOYMENT_SPEC", Value: "test_value_from_dynamo_component_deployment_spec"}, {Name: "TEST_ENV_FROM_EXTRA_POD_SPEC", Value: "test_value_from_extra_pod_spec"}, {Name: "DYNAMO_PORT", Value: fmt.Sprintf("%d", commonconsts.DynamoServicePort)}},
Ports: []corev1.ContainerPort{
{
Protocol: corev1.ProtocolTCP, Name: commonconsts.DynamoServicePortName, ContainerPort: commonconsts.DynamoServicePort,
},
},
Env: []corev1.EnvVar{{Name: "TEST_ENV_FROM_DYNAMO_COMPONENT_DEPLOYMENT_SPEC", Value: "test_value_from_dynamo_component_deployment_spec"}, {Name: "TEST_ENV_FROM_EXTRA_POD_SPEC", Value: "test_value_from_extra_pod_spec"}},
Ports: nil,
VolumeMounts: []corev1.VolumeMount{
{
Name: "shared-memory",
......@@ -1000,8 +996,8 @@ func TestDynamoComponentDeploymentReconciler_generateLeaderWorkerSet(t *testing.
Image: "test-image:latest",
Command: []string{"sh", "-c"},
Args: []string{"ray start --address=$(LWS_LEADER_ADDRESS):6379 --block"},
Env: []corev1.EnvVar{{Name: "TEST_ENV_FROM_DYNAMO_COMPONENT_DEPLOYMENT_SPEC", Value: "test_value_from_dynamo_component_deployment_spec"}, {Name: "TEST_ENV_FROM_EXTRA_POD_SPEC", Value: "test_value_from_extra_pod_spec"}, {Name: "DYNAMO_PORT", Value: fmt.Sprintf("%d", commonconsts.DynamoServicePort)}},
Ports: []corev1.ContainerPort{{Protocol: corev1.ProtocolTCP, Name: commonconsts.DynamoServicePortName, ContainerPort: commonconsts.DynamoServicePort}},
Env: []corev1.EnvVar{{Name: "TEST_ENV_FROM_DYNAMO_COMPONENT_DEPLOYMENT_SPEC", Value: "test_value_from_dynamo_component_deployment_spec"}, {Name: "TEST_ENV_FROM_EXTRA_POD_SPEC", Value: "test_value_from_extra_pod_spec"}},
Ports: nil,
VolumeMounts: []corev1.VolumeMount{
{
Name: "shared-memory",
......
......@@ -179,7 +179,7 @@ func (r *DynamoGraphDeploymentReconciler) reconcileGroveResources(ctx context.Co
})
resources := []Resource{groveGangSetAsResource}
for componentName, component := range dynamoDeployment.Spec.Services {
if component.ComponentType == consts.ComponentTypeMain {
if component.ComponentType == consts.ComponentTypeFrontend {
// generate the main component service
mainComponentService, err := dynamo.GenerateComponentService(ctx, dynamo.GetDynamoComponentName(dynamoDeployment, componentName), dynamoDeployment.Namespace)
if err != nil {
......
/*
* SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*/
package dynamo
import (
commonconsts "github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/consts"
corev1 "k8s.io/api/core/v1"
)
// ComponentDefaults describes how a component type supplies its default
// container and pod spec. Implementations are chosen by
// ComponentDefaultsFactory based on the component type string.
type ComponentDefaults interface {
// GetBaseContainer returns the base container configuration for this component type.
// The numberOfNodes parameter indicates the total number of nodes in the deployment.
// A non-nil error means the defaults could not be produced.
GetBaseContainer(numberOfNodes int32) (corev1.Container, error)
// GetBasePodSpec returns the base pod spec configuration for this component type.
// The numberOfNodes parameter indicates the total number of nodes in the deployment.
// A non-nil error means the defaults could not be produced.
GetBasePodSpec(numberOfNodes int32) (corev1.PodSpec, error)
}
// ComponentDefaultsFactory picks the ComponentDefaults implementation that
// matches componentType: frontend, worker and planner each get their dedicated
// defaults, and every other value falls back to BaseComponentDefaults.
//
// numberOfNodes is part of the signature but is not consulted when choosing
// the implementation.
func ComponentDefaultsFactory(componentType string, numberOfNodes int32) ComponentDefaults {
	var defaults ComponentDefaults

	switch componentType {
	case commonconsts.ComponentTypeFrontend:
		defaults = NewFrontendDefaults()
	case commonconsts.ComponentTypeWorker:
		defaults = NewWorkerDefaults()
	case commonconsts.ComponentTypePlanner:
		defaults = NewPlannerDefaults()
	default:
		// Unknown or unset component types only receive the shared base defaults.
		defaults = &BaseComponentDefaults{}
	}

	return defaults
}
// BaseComponentDefaults provides the common defaults shared by all components.
// It carries no state; ComponentDefaultsFactory returns it for any component
// type that has no dedicated defaults.
type BaseComponentDefaults struct{}
// GetBaseContainer returns the common container built by getCommonContainer.
// numberOfNodes is not used by the base defaults, and the returned error is
// always nil.
func (b *BaseComponentDefaults) GetBaseContainer(numberOfNodes int32) (corev1.Container, error) {
	base := b.getCommonContainer()
	return base, nil
}
// GetBasePodSpec returns an empty pod spec: the base defaults contribute no
// pod-level settings. numberOfNodes is not used, and the returned error is
// always nil.
func (b *BaseComponentDefaults) GetBasePodSpec(numberOfNodes int32) (corev1.PodSpec, error) {
	var emptySpec corev1.PodSpec
	return emptySpec, nil
}
// getCommonContainer builds the container every component starts from: it is
// named "main" and its command is "/bin/sh -c", so the container args are run
// as a shell command line.
func (b *BaseComponentDefaults) getCommonContainer() corev1.Container {
	return corev1.Container{
		Name:    "main",
		Command: []string{"/bin/sh", "-c"},
	}
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment