Unverified commit c8770464, authored by hhzhang16 and committed by GitHub
Browse files

feat: normalize dynamo namespace computation (#5231)


Signed-off-by: Hannah Zhang <hannahz@nvidia.com>
parent abd4b5d9
...@@ -13,7 +13,6 @@ spec: ...@@ -13,7 +13,6 @@ spec:
services: services:
Frontend: Frontend:
componentType: frontend componentType: frontend
dynamoNamespace: disagg-router-6p-2d
envs: envs:
- name: HF_HOME - name: HF_HOME
value: /home/dynamo/.cache/huggingface value: /home/dynamo/.cache/huggingface
...@@ -38,7 +37,6 @@ spec: ...@@ -38,7 +37,6 @@ spec:
subComponentType: null subComponentType: null
VllmDecodeWorker: VllmDecodeWorker:
componentType: worker componentType: worker
dynamoNamespace: disagg-router-6p-2d
envFromSecret: hf-token-secret envFromSecret: hf-token-secret
extraPodSpec: extraPodSpec:
mainContainer: mainContainer:
...@@ -86,7 +84,6 @@ spec: ...@@ -86,7 +84,6 @@ spec:
useAsCompilationCache: true useAsCompilationCache: true
VllmPrefillWorker: VllmPrefillWorker:
componentType: worker componentType: worker
dynamoNamespace: disagg-router-6p-2d
envFromSecret: hf-token-secret envFromSecret: hf-token-secret
extraPodMetadata: extraPodMetadata:
annotations: annotations:
......
...@@ -8,7 +8,6 @@ metadata: ...@@ -8,7 +8,6 @@ metadata:
spec: spec:
services: services:
Frontend: Frontend:
dynamoNamespace: vllm-moe-agg
componentType: frontend componentType: frontend
replicas: 1 replicas: 1
extraPodSpec: extraPodSpec:
...@@ -16,7 +15,6 @@ spec: ...@@ -16,7 +15,6 @@ spec:
image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
VllmDecodeWorker: VllmDecodeWorker:
envFromSecret: hf-token-secret envFromSecret: hf-token-secret
dynamoNamespace: vllm-moe-agg
componentType: worker componentType: worker
replicas: 1 replicas: 1
resources: resources:
......
...@@ -8,7 +8,6 @@ metadata: ...@@ -8,7 +8,6 @@ metadata:
spec: spec:
services: services:
Frontend: Frontend:
dynamoNamespace: vllm-moe-disagg
componentType: frontend componentType: frontend
replicas: 1 replicas: 1
extraPodSpec: extraPodSpec:
...@@ -17,7 +16,6 @@ spec: ...@@ -17,7 +16,6 @@ spec:
mainContainer: mainContainer:
image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
VllmDecodeWorker: VllmDecodeWorker:
dynamoNamespace: vllm-moe-disagg
envFromSecret: hf-token-secret envFromSecret: hf-token-secret
componentType: worker componentType: worker
subComponentType: decode subComponentType: decode
...@@ -83,7 +81,6 @@ spec: ...@@ -83,7 +81,6 @@ spec:
- --no-enable-prefix-caching - --no-enable-prefix-caching
- --enforce-eager - --enforce-eager
VllmPrefillWorker: VllmPrefillWorker:
dynamoNamespace: vllm-moe-disagg
envFromSecret: hf-token-secret envFromSecret: hf-token-secret
componentType: worker componentType: worker
subComponentType: prefill subComponentType: prefill
......
...@@ -8,7 +8,6 @@ metadata: ...@@ -8,7 +8,6 @@ metadata:
spec: spec:
services: services:
Frontend: Frontend:
dynamoNamespace: vllm-agg
componentType: main componentType: main
replicas: 1 replicas: 1
livenessProbe: livenessProbe:
...@@ -46,7 +45,6 @@ spec: ...@@ -46,7 +45,6 @@ spec:
args: args:
- "python3 -m dynamo.frontend --http-port 8000" - "python3 -m dynamo.frontend --http-port 8000"
VllmDecodeWorker: VllmDecodeWorker:
dynamoNamespace: vllm-agg
envFromSecret: hf-token-secret envFromSecret: hf-token-secret
componentType: worker componentType: worker
replicas: 1 replicas: 1
......
...@@ -8,7 +8,6 @@ metadata: ...@@ -8,7 +8,6 @@ metadata:
spec: spec:
services: services:
Frontend: Frontend:
dynamoNamespace: vllm-disagg
componentType: main componentType: main
replicas: 1 replicas: 1
livenessProbe: livenessProbe:
...@@ -46,7 +45,6 @@ spec: ...@@ -46,7 +45,6 @@ spec:
args: args:
- "python3 -m dynamo.frontend --http-port 8000 --kv-cache-block-size 128" - "python3 -m dynamo.frontend --http-port 8000 --kv-cache-block-size 128"
VllmDecodeWorker: VllmDecodeWorker:
dynamoNamespace: vllm-disagg
envFromSecret: hf-token-secret envFromSecret: hf-token-secret
componentType: worker componentType: worker
replicas: 2 replicas: 2
...@@ -94,7 +92,6 @@ spec: ...@@ -94,7 +92,6 @@ spec:
args: args:
- "python3 -m dynamo.vllm --model nvidia/Llama-3.1-8B-Instruct-FP8 --no-enable-prefix-caching --block-size 128 2>&1 | tee /tmp/vllm.log" - "python3 -m dynamo.vllm --model nvidia/Llama-3.1-8B-Instruct-FP8 --no-enable-prefix-caching --block-size 128 2>&1 | tee /tmp/vllm.log"
VllmPrefillWorker: VllmPrefillWorker:
dynamoNamespace: vllm-disagg
envFromSecret: hf-token-secret envFromSecret: hf-token-secret
componentType: worker componentType: worker
replicas: 2 replicas: 2
......
...@@ -8,7 +8,6 @@ metadata: ...@@ -8,7 +8,6 @@ metadata:
spec: spec:
services: services:
Frontend: Frontend:
dynamoNamespace: vllm-disagg
componentType: main componentType: main
replicas: 1 replicas: 1
livenessProbe: livenessProbe:
...@@ -46,7 +45,6 @@ spec: ...@@ -46,7 +45,6 @@ spec:
args: args:
- "python3 -m dynamo.frontend --http-port 8000 --kv-cache-block-size 128" - "python3 -m dynamo.frontend --http-port 8000 --kv-cache-block-size 128"
VllmDecodeWorker: VllmDecodeWorker:
dynamoNamespace: vllm-disagg
envFromSecret: hf-token-secret envFromSecret: hf-token-secret
componentType: worker componentType: worker
replicas: 1 replicas: 1
...@@ -94,7 +92,6 @@ spec: ...@@ -94,7 +92,6 @@ spec:
args: args:
- "python3 -m dynamo.vllm --model nvidia/Llama-3.1-8B-Instruct-FP8 --no-enable-prefix-caching --block-size 128 2>&1 | tee /tmp/vllm.log" - "python3 -m dynamo.vllm --model nvidia/Llama-3.1-8B-Instruct-FP8 --no-enable-prefix-caching --block-size 128 2>&1 | tee /tmp/vllm.log"
VllmPrefillWorker: VllmPrefillWorker:
dynamoNamespace: vllm-disagg
envFromSecret: hf-token-secret envFromSecret: hf-token-secret
componentType: worker componentType: worker
replicas: 3 replicas: 3
......
...@@ -11,7 +11,6 @@ spec: ...@@ -11,7 +11,6 @@ spec:
value: '{"Prometheus":{"global":{"scrape_interval":"5s"},"scrape_configs":[{"job_name":"prometheus","static_configs":[{"targets":["localhost:8000"]}]},{"job_name":"frontend","static_configs":[{"targets":["vllm-disagg-planner-frontend:8000"]}]}]}}' value: '{"Prometheus":{"global":{"scrape_interval":"5s"},"scrape_configs":[{"job_name":"prometheus","static_configs":[{"targets":["localhost:8000"]}]},{"job_name":"frontend","static_configs":[{"targets":["vllm-disagg-planner-frontend:8000"]}]}]}}'
services: services:
Frontend: Frontend:
dynamoNamespace: vllm-disagg-planner
componentType: main componentType: main
replicas: 1 replicas: 1
livenessProbe: livenessProbe:
...@@ -49,7 +48,6 @@ spec: ...@@ -49,7 +48,6 @@ spec:
args: args:
- "python3 -m dynamo.frontend --http-port 8000 --kv-cache-block-size 128 --router-mode kv --kv-overlap-score-weight 0.0 --router-temperature 0.0 --no-kv-events" - "python3 -m dynamo.frontend --http-port 8000 --kv-cache-block-size 128 --router-mode kv --kv-overlap-score-weight 0.0 --router-temperature 0.0 --no-kv-events"
Planner: Planner:
dynamoNamespace: vllm-disagg-planner
envFromSecret: hf-token-secret envFromSecret: hf-token-secret
componentType: planner componentType: planner
replicas: 1 replicas: 1
...@@ -94,7 +92,6 @@ spec: ...@@ -94,7 +92,6 @@ spec:
--prometheus-port=9085 --prometheus-port=9085
--no-correction --no-correction
VllmDecodeWorker: VllmDecodeWorker:
dynamoNamespace: vllm-disagg-planner
envFromSecret: hf-token-secret envFromSecret: hf-token-secret
componentType: worker componentType: worker
subComponentType: decode subComponentType: decode
...@@ -149,7 +146,6 @@ spec: ...@@ -149,7 +146,6 @@ spec:
- --block-size - --block-size
- "128" - "128"
VllmPrefillWorker: VllmPrefillWorker:
dynamoNamespace: vllm-disagg-planner
envFromSecret: hf-token-secret envFromSecret: hf-token-secret
componentType: worker componentType: worker
subComponentType: prefill subComponentType: prefill
......
...@@ -8,7 +8,6 @@ metadata: ...@@ -8,7 +8,6 @@ metadata:
spec: spec:
services: services:
Frontend: Frontend:
dynamoNamespace: vllm-disagg
componentType: main componentType: main
replicas: 1 replicas: 1
livenessProbe: livenessProbe:
...@@ -46,7 +45,6 @@ spec: ...@@ -46,7 +45,6 @@ spec:
args: args:
- "python3 -m dynamo.frontend --http-port 8000 --kv-cache-block-size 128" - "python3 -m dynamo.frontend --http-port 8000 --kv-cache-block-size 128"
VllmDecodeWorker: VllmDecodeWorker:
dynamoNamespace: vllm-disagg
envFromSecret: hf-token-secret envFromSecret: hf-token-secret
componentType: worker componentType: worker
replicas: 1 replicas: 1
...@@ -94,7 +92,6 @@ spec: ...@@ -94,7 +92,6 @@ spec:
args: args:
- "python3 -m dynamo.vllm --model nvidia/Llama-3.1-8B-Instruct-FP8 --no-enable-prefix-caching --block-size 128 --tensor-parallel-size 2 2>&1 | tee /tmp/vllm.log" - "python3 -m dynamo.vllm --model nvidia/Llama-3.1-8B-Instruct-FP8 --no-enable-prefix-caching --block-size 128 --tensor-parallel-size 2 2>&1 | tee /tmp/vllm.log"
VllmPrefillWorker: VllmPrefillWorker:
dynamoNamespace: vllm-disagg
envFromSecret: hf-token-secret envFromSecret: hf-token-secret
componentType: worker componentType: worker
replicas: 1 replicas: 1
......
...@@ -8,7 +8,6 @@ metadata: ...@@ -8,7 +8,6 @@ metadata:
spec: spec:
services: services:
Frontend: Frontend:
dynamoNamespace: vllm-disagg
componentType: main componentType: main
replicas: 1 replicas: 1
livenessProbe: livenessProbe:
...@@ -46,7 +45,6 @@ spec: ...@@ -46,7 +45,6 @@ spec:
args: args:
- "python3 -m dynamo.frontend --http-port 8000" - "python3 -m dynamo.frontend --http-port 8000"
VllmDecodeWorker: VllmDecodeWorker:
dynamoNamespace: vllm-disagg
envFromSecret: hf-token-secret envFromSecret: hf-token-secret
componentType: worker componentType: worker
replicas: 1 replicas: 1
...@@ -94,7 +92,6 @@ spec: ...@@ -94,7 +92,6 @@ spec:
args: args:
- "python3 -m dynamo.vllm --model nvidia/Llama-3.1-8B-Instruct-FP8 2>&1 | tee /tmp/vllm.log" - "python3 -m dynamo.vllm --model nvidia/Llama-3.1-8B-Instruct-FP8 2>&1 | tee /tmp/vllm.log"
VllmPrefillWorker: VllmPrefillWorker:
dynamoNamespace: vllm-disagg
envFromSecret: hf-token-secret envFromSecret: hf-token-secret
componentType: worker componentType: worker
replicas: 1 replicas: 1
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment