"components/vscode:/vscode.git/clone" did not exist on "2be83be2d8b6236860bfb0611ea5782fba5255c4"
Unverified Commit 5bf23d54 authored by hhzhang16's avatar hhzhang16 Committed by GitHub
Browse files

feat: update DynamoGraphDeployments for vllm_v1 (#1890)


Co-authored-by: mohammedabdulwahhab <furkhan324@berkeley.edu>
parent 9e76590f
...@@ -116,6 +116,40 @@ bash launch/dep.sh ...@@ -116,6 +116,40 @@ bash launch/dep.sh
> [!TIP]
> Run a disaggregated example and try adding another prefill worker once the setup is running! The system will automatically discover and utilize the new worker.
### Kubernetes Deployment
For Kubernetes deployment, YAML manifests are provided in the `deploy/` directory. These define DynamoGraphDeployment resources for various configurations:
- `agg.yaml` - Aggregated serving
- `agg_router.yaml` - Aggregated serving with KV routing
- `disagg.yaml` - Disaggregated serving
- `disagg_router.yaml` - Disaggregated serving with KV routing
#### Prerequisites
- **Dynamo Cloud**: Follow the [Quickstart Guide](../../docs/guides/dynamo_deploy/quickstart.md) to deploy Dynamo Cloud first.
- **Container Images**: The deployment files currently require access to `nvcr.io/nvidian/nim-llm-dev/vllm_v1-runtime`. If you don't have access, build and push your own image:
```bash
./container/build.sh --framework VLLM_V1
# Tag and push to your container registry
# Update the image references in the YAML files
```
- **Port Forwarding**: After deployment, forward the frontend service to access the API:
```bash
kubectl port-forward deployment/vllm-v1-disagg-frontend-<pod-uuid-info> 8080:8000
```
#### Deploy to Kubernetes
For example, to deploy the disaggregated configuration:
```bash
cd ~/dynamo/examples/vllm/deploy
kubectl apply -f disagg.yaml
```
### Testing the Deployment
Send a test request to verify your deployment:
......
...@@ -15,10 +15,28 @@ ...@@ -15,10 +15,28 @@
apiVersion: nvidia.com/v1alpha1 apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment kind: DynamoGraphDeployment
metadata: metadata:
name: agg name: vllm-v1-agg
spec: spec:
services: services:
Frontend: Frontend:
livenessProbe:
httpGet:
path: /health
port: 8000
initialDelaySeconds: 60
periodSeconds: 60
timeoutSeconds: 30
failureThreshold: 10
readinessProbe:
exec:
command:
- /bin/sh
- -c
- 'curl -s http://localhost:8000/health | jq -e ".status == \"healthy\""'
initialDelaySeconds: 60
periodSeconds: 60
timeoutSeconds: 30
failureThreshold: 10
dynamoNamespace: vllm-v1-agg dynamoNamespace: vllm-v1-agg
componentType: main componentType: main
replicas: 1 replicas: 1
...@@ -31,50 +49,38 @@ spec: ...@@ -31,50 +49,38 @@ spec:
memory: "2Gi" memory: "2Gi"
extraPodSpec: extraPodSpec:
mainContainer: mainContainer:
image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 image: nvcr.io/nvidian/nim-llm-dev/vllm_v1-runtime:dep-216.4
workingDir: /workspace/examples/vllm_v1 workingDir: /workspace/examples/vllm
args: args:
- dynamo - dynamo
- serve - run
- graphs.agg:Frontend - in=http
- --system-app-port - out=dyn
- "5000" - --http-port
- --enable-system-app - "8000"
- --use-default-health-checks
- --service-name
- Frontend
- -f
- ./configs/agg.yaml
SimpleLoadBalancer:
envFromSecret: hf-token-secret
dynamoNamespace: vllm-v1-agg
replicas: 1
resources:
requests:
cpu: "1"
memory: "20Gi"
limits:
cpu: "1"
memory: "20Gi"
extraPodSpec:
mainContainer:
image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
workingDir: /workspace/examples/vllm_v1
args:
- dynamo
- serve
- graphs.agg:SimpleLoadBalancer
- --system-app-port
- "5000"
- --enable-system-app
- --use-default-health-checks
- --service-name
- SimpleLoadBalancer
- -f
- ./configs/agg.yaml
VllmDecodeWorker: VllmDecodeWorker:
envFromSecret: hf-token-secret envFromSecret: hf-token-secret
livenessProbe:
exec:
command:
- /bin/sh
- -c
- "exit 0"
periodSeconds: 60
timeoutSeconds: 30
failureThreshold: 10
readinessProbe:
exec:
command:
- /bin/sh
- -c
- 'grep "VllmWorker.*has been initialized" /tmp/vllm.log'
initialDelaySeconds: 60
periodSeconds: 60
timeoutSeconds: 30
failureThreshold: 10
dynamoNamespace: vllm-v1-agg dynamoNamespace: vllm-v1-agg
componentType: worker
replicas: 1 replicas: 1
resources: resources:
requests: requests:
...@@ -87,17 +93,7 @@ spec: ...@@ -87,17 +93,7 @@ spec:
gpu: "1" gpu: "1"
extraPodSpec: extraPodSpec:
mainContainer: mainContainer:
image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 image: nvcr.io/nvidian/nim-llm-dev/vllm_v1-runtime:dep-216.4
workingDir: /workspace/examples/vllm_v1 workingDir: /workspace/examples/vllm
args: args:
- dynamo - "python3 components/main.py --model Qwen/Qwen3-0.6B --enforce-eager 2>&1 | tee /tmp/vllm.log"
- serve
- graphs.agg:VllmDecodeWorker
- --system-app-port
- "5000"
- --enable-system-app
- --use-default-health-checks
- --service-name
- VllmDecodeWorker
- -f
- ./configs/agg.yaml
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# DynamoGraphDeployment with one HTTP Frontend and two VllmDecodeWorker
# replicas serving Qwen/Qwen3-0.6B.
# NOTE(review): metadata.name and dynamoNamespace are both "vllm-v1-agg",
# the same values used by the single-replica aggregated manifest; applying
# both manifests to one Kubernetes namespace would target the same
# resource - confirm this is intended.
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
  name: vllm-v1-agg
spec:
  services:
    Frontend:
      # Liveness: HTTP GET /health on the port the frontend is started
      # with (--http-port 8000 in args below).
      livenessProbe:
        httpGet:
          path: /health
          port: 8000
        initialDelaySeconds: 60
        periodSeconds: 60
        timeoutSeconds: 30
        failureThreshold: 10
      # Readiness: succeeds only when the /health JSON body has
      # .status == "healthy" (jq -e exits non-zero otherwise).
      # Requires curl and jq to be present in the container image.
      readinessProbe:
        exec:
          command:
            - /bin/sh
            - -c
            - 'curl -s http://localhost:8000/health | jq -e ".status == \"healthy\""'
        initialDelaySeconds: 60
        periodSeconds: 60
        timeoutSeconds: 30
        failureThreshold: 10
      dynamoNamespace: vllm-v1-agg
      componentType: main
      replicas: 1
      resources:
        requests:
          cpu: "1"
          memory: "2Gi"
        limits:
          cpu: "1"
          memory: "2Gi"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidian/nim-llm-dev/vllm_v1-runtime:dep-216.4
          workingDir: /workspace/examples/vllm
          # NOTE(review): no router-specific flag is passed here; confirm
          # where KV routing is enabled for this variant.
          args:
            - dynamo
            - run
            - in=http
            - out=dyn
            - --http-port
            - "8000"
    VllmDecodeWorker:
      # presumably supplies the Hugging Face token used to download the
      # model - TODO confirm the secret's contents
      envFromSecret: hf-token-secret
      # Liveness: the command always exits 0, so this probe never fails
      # and cannot detect a hung worker.
      livenessProbe:
        exec:
          command:
            - /bin/sh
            - -c
            - "exit 0"
        periodSeconds: 60
        timeoutSeconds: 30
        failureThreshold: 10
      # Readiness: looks for the initialization message in /tmp/vllm.log,
      # the file the worker's output is tee'd into by args below.
      readinessProbe:
        exec:
          command:
            - /bin/sh
            - -c
            - 'grep "VllmWorker.*has been initialized" /tmp/vllm.log'
        initialDelaySeconds: 60
        periodSeconds: 60
        timeoutSeconds: 30
        failureThreshold: 10
      dynamoNamespace: vllm-v1-agg
      componentType: worker
      replicas: 2
      resources:
        requests:
          cpu: "10"
          memory: "20Gi"
          gpu: "1"
        limits:
          cpu: "10"
          memory: "20Gi"
          gpu: "1"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidian/nim-llm-dev/vllm_v1-runtime:dep-216.4
          workingDir: /workspace/examples/vllm
          # Single string containing a shell redirect and pipe.
          # NOTE(review): this only works if the container command runs
          # it through a shell (e.g. `sh -c`) - confirm.
          args:
            - "python3 components/main.py --model Qwen/Qwen3-0.6B --enforce-eager 2>&1 | tee /tmp/vllm.log"
...@@ -15,13 +15,31 @@ ...@@ -15,13 +15,31 @@
apiVersion: nvidia.com/v1alpha1 apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment kind: DynamoGraphDeployment
metadata: metadata:
name: disagg name: vllm-v1-disagg
spec: spec:
services: services:
Frontend: Frontend:
dynamoNamespace: vllm-v1-disagg dynamoNamespace: vllm-v1-disagg
componentType: main componentType: main
replicas: 1 replicas: 1
livenessProbe:
httpGet:
path: /health
port: 8000
initialDelaySeconds: 60
periodSeconds: 60
timeoutSeconds: 30
failureThreshold: 10
readinessProbe:
exec:
command:
- /bin/sh
- -c
- 'curl -s http://localhost:8000/health | jq -e ".status == \"healthy\""'
initialDelaySeconds: 60
periodSeconds: 60
timeoutSeconds: 30
failureThreshold: 10
resources: resources:
requests: requests:
cpu: "1" cpu: "1"
...@@ -31,51 +49,39 @@ spec: ...@@ -31,51 +49,39 @@ spec:
memory: "2Gi" memory: "2Gi"
extraPodSpec: extraPodSpec:
mainContainer: mainContainer:
image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 image: nvcr.io/nvidian/nim-llm-dev/vllm_v1-runtime:dep-216.4
workingDir: /workspace/examples/vllm_v1 workingDir: /workspace/examples/vllm
args: args:
- dynamo - dynamo
- serve - run
- graphs.disagg:Frontend - in=http
- --system-app-port - out=dyn
- "5000" - --http-port
- --enable-system-app - "8000"
- --use-default-health-checks
- --service-name
- Frontend
- -f
- ./configs/disagg.yaml
SimpleLoadBalancer:
envFromSecret: hf-token-secret
dynamoNamespace: vllm-v1-disagg
replicas: 1
resources:
requests:
cpu: "1"
memory: "20Gi"
limits:
cpu: "1"
memory: "20Gi"
extraPodSpec:
mainContainer:
image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
workingDir: /workspace/examples/vllm_v1
args:
- dynamo
- serve
- graphs.disagg:SimpleLoadBalancer
- --system-app-port
- "5000"
- --enable-system-app
- --use-default-health-checks
- --service-name
- SimpleLoadBalancer
- -f
- ./configs/disagg.yaml
VllmDecodeWorker: VllmDecodeWorker:
dynamoNamespace: vllm-v1-disagg dynamoNamespace: vllm-v1-disagg
envFromSecret: hf-token-secret envFromSecret: hf-token-secret
componentType: worker
replicas: 1 replicas: 1
livenessProbe:
exec:
command:
- /bin/sh
- -c
- "exit 0"
periodSeconds: 60
timeoutSeconds: 30
failureThreshold: 10
readinessProbe:
exec:
command:
- /bin/sh
- -c
- 'grep "VllmWorker.*has been initialized" /tmp/vllm.log'
initialDelaySeconds: 60
periodSeconds: 60
timeoutSeconds: 30
failureThreshold: 10
resources: resources:
requests: requests:
cpu: "10" cpu: "10"
...@@ -87,24 +93,34 @@ spec: ...@@ -87,24 +93,34 @@ spec:
gpu: "1" gpu: "1"
extraPodSpec: extraPodSpec:
mainContainer: mainContainer:
image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 image: nvcr.io/nvidian/nim-llm-dev/vllm_v1-runtime:dep-216.4
workingDir: /workspace/examples/vllm_v1 workingDir: /workspace/examples/vllm
args: args:
- dynamo - "python3 components/main.py --model Qwen/Qwen3-0.6B --enforce-eager 2>&1 | tee /tmp/vllm.log"
- serve
- graphs.disagg:VllmDecodeWorker
- --system-app-port
- "5000"
- --enable-system-app
- --use-default-health-checks
- --service-name
- VllmDecodeWorker
- -f
- ./configs/disagg.yaml
VllmPrefillWorker: VllmPrefillWorker:
dynamoNamespace: vllm-v1-disagg dynamoNamespace: vllm-v1-disagg
envFromSecret: hf-token-secret envFromSecret: hf-token-secret
componentType: worker
replicas: 1 replicas: 1
livenessProbe:
exec:
command:
- /bin/sh
- -c
- "exit 0"
periodSeconds: 60
timeoutSeconds: 30
failureThreshold: 10
readinessProbe:
exec:
command:
- /bin/sh
- -c
- 'grep "VllmWorker.*has been initialized" /tmp/vllm.log'
initialDelaySeconds: 60
periodSeconds: 60
timeoutSeconds: 30
failureThreshold: 10
resources: resources:
requests: requests:
cpu: "10" cpu: "10"
...@@ -116,17 +132,7 @@ spec: ...@@ -116,17 +132,7 @@ spec:
gpu: "1" gpu: "1"
extraPodSpec: extraPodSpec:
mainContainer: mainContainer:
image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 image: nvcr.io/nvidian/nim-llm-dev/vllm_v1-runtime:dep-216.4
workingDir: /workspace/examples/vllm_v1 workingDir: /workspace/examples/vllm
args: args:
- dynamo - "python3 components/main.py --model Qwen/Qwen3-0.6B --enforce-eager --is-prefill-worker 2>&1 | tee /tmp/vllm.log"
- serve
- graphs.disagg:VllmPrefillWorker
- --system-app-port
- "5000"
- --enable-system-app
- --use-default-health-checks
- --service-name
- VllmPrefillWorker
- -f
- ./configs/disagg.yaml
...@@ -15,172 +15,124 @@ ...@@ -15,172 +15,124 @@
apiVersion: nvidia.com/v1alpha1 apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment kind: DynamoGraphDeployment
metadata: metadata:
name: disagg-planner name: vllm-v1-disagg-planner
spec: spec:
services: services:
Frontend: Frontend:
dynamoNamespace: vllm-v1-disagg-planner dynamoNamespace: vllm-v1-disagg-planner
componentType: main componentType: main
replicas: 1 replicas: 1
resources: livenessProbe:
requests: httpGet:
cpu: "2" path: /health
memory: "4Gi" port: 8000
limits: initialDelaySeconds: 60
cpu: "2" periodSeconds: 60
memory: "4Gi" timeoutSeconds: 30
extraPodSpec: failureThreshold: 10
mainContainer: readinessProbe:
image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 exec:
workingDir: /workspace/examples/vllm_v1 command:
args: - /bin/sh
- dynamo - -c
- serve - 'curl -s http://localhost:8000/health | jq -e ".status == \"healthy\""'
- graphs.disagg_planner:Frontend initialDelaySeconds: 60
- --system-app-port periodSeconds: 60
- "5000" timeoutSeconds: 30
- --enable-system-app failureThreshold: 10
- --use-default-health-checks
- --service-name
- Frontend
- -f
- ./configs/disagg_planner.yaml
SimpleLoadBalancer:
envFromSecret: hf-token-secret
dynamoNamespace: vllm-v1-disagg-planner
replicas: 1
resources: resources:
requests: requests:
cpu: "1" cpu: "1"
memory: "20Gi" memory: "2Gi"
limits: limits:
cpu: "1" cpu: "1"
memory: "20Gi" memory: "2Gi"
extraPodSpec: extraPodSpec:
mainContainer: mainContainer:
image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 image: nvcr.io/nvidian/nim-llm-dev/vllm_v1-runtime:dep-216.4
workingDir: /workspace/examples/vllm_v1 workingDir: /workspace/examples/vllm
args: args:
- dynamo - dynamo
- serve - run
- graphs.disagg_planner:SimpleLoadBalancer - in=http
- --system-app-port - out=dyn
- "5000" - --http-port
- --enable-system-app - "8000"
- --use-default-health-checks
- --service-name
- SimpleLoadBalancer
- -f
- ./configs/disagg_planner.yaml
VllmDecodeWorker: VllmDecodeWorker:
dynamoNamespace: vllm-v1-disagg-planner dynamoNamespace: vllm-v1-disagg-planner
envFromSecret: hf-token-secret envFromSecret: hf-token-secret
componentType: worker
replicas: 1 replicas: 1
livenessProbe:
exec:
command:
- /bin/sh
- -c
- "exit 0"
periodSeconds: 60
timeoutSeconds: 30
failureThreshold: 10
readinessProbe:
exec:
command:
- /bin/sh
- -c
- 'grep "VllmWorker.*has been initialized" /tmp/vllm.log'
initialDelaySeconds: 60
periodSeconds: 60
timeoutSeconds: 30
failureThreshold: 10
resources: resources:
requests: requests:
cpu: "20" cpu: "10"
memory: "40Gi" memory: "20Gi"
gpu: "2" gpu: "1"
limits: limits:
cpu: "20" cpu: "10"
memory: "40Gi" memory: "20Gi"
gpu: "2" gpu: "1"
extraPodSpec: extraPodSpec:
mainContainer: mainContainer:
image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 image: nvcr.io/nvidian/nim-llm-dev/vllm_v1-runtime:dep-216.4
workingDir: /workspace/examples/vllm_v1 workingDir: /workspace/examples/vllm
args: args:
- dynamo - "python3 components/main.py --model Qwen/Qwen3-0.6B --enforce-eager 2>&1 | tee /tmp/vllm.log"
- serve
- graphs.disagg_planner:VllmDecodeWorker
- --system-app-port
- "5000"
- --enable-system-app
- --use-default-health-checks
- --service-name
- VllmDecodeWorker
- -f
- ./configs/disagg_planner.yaml
VllmPrefillWorker: VllmPrefillWorker:
dynamoNamespace: vllm-v1-disagg-planner dynamoNamespace: vllm-v1-disagg-planner
envFromSecret: hf-token-secret envFromSecret: hf-token-secret
componentType: worker
replicas: 1 replicas: 1
livenessProbe:
exec:
command:
- /bin/sh
- -c
- "exit 0"
periodSeconds: 60
timeoutSeconds: 30
failureThreshold: 10
readinessProbe:
exec:
command:
- /bin/sh
- -c
- 'grep "VllmWorker.*has been initialized" /tmp/vllm.log'
initialDelaySeconds: 60
periodSeconds: 60
timeoutSeconds: 30
failureThreshold: 10
resources: resources:
requests: requests:
cpu: "20" cpu: "10"
memory: "40Gi" memory: "20Gi"
gpu: "2" gpu: "1"
limits:
cpu: "20"
memory: "40Gi"
gpu: "2"
extraPodSpec:
mainContainer:
image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
workingDir: /workspace/examples/vllm_v1
args:
- dynamo
- serve
- graphs.disagg_planner:VllmPrefillWorker
- --system-app-port
- "5000"
- --enable-system-app
- --use-default-health-checks
- --service-name
- VllmPrefillWorker
- -f
- ./configs/disagg_planner.yaml
Planner:
dynamoNamespace: vllm-v1-disagg-planner
replicas: 1
componentType: planner
resources:
requests:
cpu: "2"
memory: "2Gi"
limits:
cpu: "2"
memory: "2Gi"
extraPodSpec:
mainContainer:
image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
workingDir: /workspace/examples/vllm_v1
args:
- dynamo
- serve
- graphs.disagg_planner:Planner
- --system-app-port
- "5000"
- --enable-system-app
- --use-default-health-checks
- --service-name
- Planner
- --Planner.environment=kubernetes
- -f
- ./configs/disagg_planner.yaml
Prometheus:
dynamoNamespace: vllm-v1-disagg-planner
replicas: 1
resources:
requests:
cpu: "1000m"
memory: "1000Mi"
limits: limits:
cpu: "1000m" cpu: "10"
memory: "1000Mi" memory: "20Gi"
gpu: "1"
extraPodSpec: extraPodSpec:
mainContainer: mainContainer:
image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 image: nvcr.io/nvidian/nim-llm-dev/vllm_v1-runtime:dep-216.4
workingDir: /workspace/examples/vllm_v1 workingDir: /workspace/examples/vllm
args: args:
- dynamo - "python3 components/main.py --model Qwen/Qwen3-0.6B --enforce-eager --is-prefill-worker 2>&1 | tee /tmp/vllm.log"
- serve
- graphs.disagg_planner:Prometheus
- --system-app-port
- "5000"
- --enable-system-app
- --use-default-health-checks
- --service-name
- Prometheus
- -f
- ./configs/disagg_planner.yaml
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# DynamoGraphDeployment with one HTTP Frontend, two VllmDecodeWorker
# replicas and one VllmPrefillWorker replica serving Qwen/Qwen3-0.6B.
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
  name: vllm-v1-disagg-router
spec:
  services:
    Frontend:
      dynamoNamespace: vllm-v1-disagg-router
      componentType: main
      replicas: 1
      # Liveness: HTTP GET /health on the port the frontend is started
      # with (--http-port 8000 in args below).
      livenessProbe:
        httpGet:
          path: /health
          port: 8000
        initialDelaySeconds: 60
        periodSeconds: 60
        timeoutSeconds: 30
        failureThreshold: 10
      # Readiness: succeeds only when the /health JSON body has
      # .status == "healthy" (jq -e exits non-zero otherwise).
      # Requires curl and jq to be present in the container image.
      readinessProbe:
        exec:
          command:
            - /bin/sh
            - -c
            - 'curl -s http://localhost:8000/health | jq -e ".status == \"healthy\""'
        initialDelaySeconds: 60
        periodSeconds: 60
        timeoutSeconds: 30
        failureThreshold: 10
      resources:
        requests:
          cpu: "1"
          memory: "2Gi"
        limits:
          cpu: "1"
          memory: "2Gi"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidian/nim-llm-dev/vllm_v1-runtime:dep-216.4
          workingDir: /workspace/examples/vllm
          # NOTE(review): these args are identical to the Frontend args of
          # the plain vllm-v1-disagg deployment; no router-specific flag
          # is passed - confirm where KV routing is enabled.
          args:
            - dynamo
            - run
            - in=http
            - out=dyn
            - --http-port
            - "8000"
    VllmDecodeWorker:
      dynamoNamespace: vllm-v1-disagg-router
      # presumably supplies the Hugging Face token used to download the
      # model - TODO confirm the secret's contents
      envFromSecret: hf-token-secret
      componentType: worker
      replicas: 2
      # Liveness: the command always exits 0, so this probe never fails
      # and cannot detect a hung worker.
      livenessProbe:
        exec:
          command:
            - /bin/sh
            - -c
            - "exit 0"
        periodSeconds: 60
        timeoutSeconds: 30
        failureThreshold: 10
      # Readiness: looks for the initialization message in /tmp/vllm.log,
      # the file the worker's output is tee'd into by args below.
      readinessProbe:
        exec:
          command:
            - /bin/sh
            - -c
            - 'grep "VllmWorker.*has been initialized" /tmp/vllm.log'
        initialDelaySeconds: 60
        periodSeconds: 60
        timeoutSeconds: 30
        failureThreshold: 10
      resources:
        requests:
          cpu: "10"
          memory: "20Gi"
          gpu: "1"
        limits:
          cpu: "10"
          memory: "20Gi"
          gpu: "1"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidian/nim-llm-dev/vllm_v1-runtime:dep-216.4
          workingDir: /workspace/examples/vllm
          # Single string containing a shell redirect and pipe.
          # NOTE(review): this only works if the container command runs
          # it through a shell (e.g. `sh -c`) - confirm.
          args:
            - "python3 components/main.py --model Qwen/Qwen3-0.6B --enforce-eager 2>&1 | tee /tmp/vllm.log"
    VllmPrefillWorker:
      dynamoNamespace: vllm-v1-disagg-router
      envFromSecret: hf-token-secret
      componentType: worker
      replicas: 1
      # Liveness: always exits 0 (same no-op check as the decode worker).
      livenessProbe:
        exec:
          command:
            - /bin/sh
            - -c
            - "exit 0"
        periodSeconds: 60
        timeoutSeconds: 30
        failureThreshold: 10
      # Readiness: same log-grep check as the decode worker; the prefill
      # process also tees its output to /tmp/vllm.log.
      readinessProbe:
        exec:
          command:
            - /bin/sh
            - -c
            - 'grep "VllmWorker.*has been initialized" /tmp/vllm.log'
        initialDelaySeconds: 60
        periodSeconds: 60
        timeoutSeconds: 30
        failureThreshold: 10
      resources:
        requests:
          cpu: "10"
          memory: "20Gi"
          gpu: "1"
        limits:
          cpu: "10"
          memory: "20Gi"
          gpu: "1"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidian/nim-llm-dev/vllm_v1-runtime:dep-216.4
          workingDir: /workspace/examples/vllm
          # Same launch command as the decode worker plus
          # --is-prefill-worker.
          args:
            - "python3 components/main.py --model Qwen/Qwen3-0.6B --enforce-eager --is-prefill-worker 2>&1 | tee /tmp/vllm.log"
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment