Unverified commit 5505507b, authored by mohammedabdulwahhab and committed by GitHub
Browse files

feat: add crds for vllm and llm examples (#1766)


Signed-off-by: mohammedabdulwahhab <furkhan324@berkeley.edu>
Co-authored-by: Hannah Zhang <hannahz@nvidia.com>
Co-authored-by: hhzhang16 <54051230+hhzhang16@users.noreply.github.com>
parent 439e977d
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
---
# DynamoGraphDeployment "llm-agg": runs the graphs.agg graph from
# /workspace/examples/llm as three services (Frontend, Processor, VllmWorker).
# Every service is started with `dynamo serve graphs.agg:<Service>` and exposes
# the system app on port 5000 with the default health checks enabled.
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
  name: llm-agg
spec:
  envs:
    # JSON document with a "Common" section plus one section per component.
    # Kept as a single-quoted scalar so the JSON is passed through verbatim.
    - name: DYN_DEPLOYMENT_CONFIG
      value: '{"Common":{"model":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","block-size":64,"max-model-len":16384},"Frontend":{"served_model_name":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","endpoint":"dynamo.Processor.chat/completions","port":8000},"Processor":{"router":"round-robin","router-num-threads":4,"common-configs":["model","block-size","max-model-len"]},"VllmWorker":{"enforce-eager":true,"max-num-batched-tokens":16384,"enable-prefix-caching":true,"ServiceArgs":{"workers":1,"resources":{"gpu":"1"}},"common-configs":["model","block-size","max-model-len"]},"Planner":{"environment":"kubernetes","no-operation":true}}'
  services:
    # CPU-only service; requests equal limits.
    Frontend:
      dynamoNamespace: llm-agg
      componentType: main
      replicas: 1
      resources:
        requests:
          cpu: "1"
          memory: "2Gi"
        limits:
          cpu: "1"
          memory: "2Gi"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
          workingDir: /workspace/examples/llm
          args:
            - dynamo
            - serve
            - graphs.agg:Frontend
            - --system-app-port
            - "5000"
            - --enable-system-app
            - --use-default-health-checks
            - --service-name
            - Frontend
    # CPU-only service; requests equal limits.
    Processor:
      dynamoNamespace: llm-agg
      componentType: worker
      replicas: 1
      resources:
        requests:
          cpu: "1"
          memory: "2Gi"
        limits:
          cpu: "1"
          memory: "2Gi"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
          workingDir: /workspace/examples/llm
          args:
            - dynamo
            - serve
            - graphs.agg:Processor
            - --system-app-port
            - "5000"
            - --enable-system-app
            - --use-default-health-checks
            - --service-name
            - Processor
    # GPU service. Environment comes from the "hf-token-secret" Secret
    # (presumably a Hugging Face access token -- TODO confirm).
    VllmWorker:
      envFromSecret: hf-token-secret
      dynamoNamespace: llm-agg
      replicas: 1
      resources:
        # NOTE(review): confirm the CRD resources schema accepts the
        # "nvidia.com/gpu" key here (as opposed to a plain "gpu" field).
        requests:
          cpu: "10"
          memory: "20Gi"
          nvidia.com/gpu: "1"
        limits:
          cpu: "10"
          memory: "20Gi"
          nvidia.com/gpu: "1"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
          workingDir: /workspace/examples/llm
          args:
            - dynamo
            - serve
            - graphs.agg:VllmWorker
            - --system-app-port
            - "5000"
            - --enable-system-app
            - --use-default-health-checks
            - --service-name
            - VllmWorker
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
---
# DynamoGraphDeployment "agg-router": runs the graphs.agg_router graph from
# /workspace/examples/llm as four services (Frontend, Processor, Router,
# VllmWorker). Every service is started with
# `dynamo serve graphs.agg_router:<Service>` and exposes the system app on
# port 5000 with the default health checks enabled.
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
  name: agg-router
spec:
  envs:
    # JSON document with a "Common" section plus one section per component.
    # Single-quoted so the embedded \" escapes reach the JSON parser verbatim.
    - name: DYN_DEPLOYMENT_CONFIG
      value: '{"Common":{"model":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","router":"kv","block-size":64,"max-model-len":16384,"kv-transfer-config":"{\"kv_connector\":\"DynamoNixlConnector\"}"},"Frontend":{"served_model_name":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","endpoint":"dynamo.Processor.chat/completions","port":8000},"Processor":{"common-configs":["model","block-size","max-model-len","router"]},"Router":{"min-workers":1,"softmax-sample":true,"common-configs":["model","block-size","router"]},"VllmWorker":{"enforce-eager":true,"max-num-batched-tokens":16384,"enable-prefix-caching":true,"tensor-parallel-size":1,"ServiceArgs":{"workers":1,"resources":{"gpu":"1"}},"common-configs":["model","block-size","max-model-len","router","kv-transfer-config"]},"Planner":{"environment":"kubernetes","no-operation":true}}'
  services:
    # CPU-only service; requests equal limits.
    Frontend:
      dynamoNamespace: llm-agg-router
      componentType: main
      replicas: 1
      resources:
        requests:
          cpu: "1"
          memory: "2Gi"
        limits:
          cpu: "1"
          memory: "2Gi"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
          workingDir: /workspace/examples/llm
          args:
            - dynamo
            - serve
            - graphs.agg_router:Frontend
            - --system-app-port
            - "5000"
            - --enable-system-app
            - --use-default-health-checks
            - --service-name
            - Frontend
    # CPU-only service; requests equal limits.
    Processor:
      dynamoNamespace: llm-agg-router
      componentType: worker
      replicas: 1
      resources:
        requests:
          cpu: "1"
          memory: "2Gi"
        limits:
          cpu: "1"
          memory: "2Gi"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
          workingDir: /workspace/examples/llm
          args:
            - dynamo
            - serve
            - graphs.agg_router:Processor
            - --system-app-port
            - "5000"
            - --enable-system-app
            - --use-default-health-checks
            - --service-name
            - Processor
    # CPU-only service; requests equal limits.
    Router:
      dynamoNamespace: llm-agg-router
      componentType: worker
      replicas: 1
      resources:
        requests:
          cpu: "1"
          memory: "2Gi"
        limits:
          cpu: "1"
          memory: "2Gi"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
          workingDir: /workspace/examples/llm
          args:
            - dynamo
            - serve
            - graphs.agg_router:Router
            - --system-app-port
            - "5000"
            - --enable-system-app
            - --use-default-health-checks
            - --service-name
            - Router
    # GPU service. Environment comes from the "hf-token-secret" Secret
    # (presumably a Hugging Face access token -- TODO confirm).
    VllmWorker:
      envFromSecret: hf-token-secret
      dynamoNamespace: llm-agg-router
      replicas: 1
      resources:
        # NOTE(review): confirm the CRD resources schema accepts the
        # "nvidia.com/gpu" key here (as opposed to a plain "gpu" field).
        requests:
          cpu: "10"
          memory: "20Gi"
          nvidia.com/gpu: "1"
        limits:
          cpu: "10"
          memory: "20Gi"
          nvidia.com/gpu: "1"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
          workingDir: /workspace/examples/llm
          args:
            - dynamo
            - serve
            - graphs.agg_router:VllmWorker
            - --system-app-port
            - "5000"
            - --enable-system-app
            - --use-default-health-checks
            - --service-name
            - VllmWorker
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
---
# DynamoGraphDeployment "llm-disagg": runs the graphs.disagg graph from
# /workspace/examples/llm as four services (Frontend, Processor, VllmWorker,
# PrefillWorker). Every service is started with
# `dynamo serve graphs.disagg:<Service>` and exposes the system app on
# port 5000 with the default health checks enabled.
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
  name: llm-disagg
spec:
  envs:
    # JSON document with a "Common" section plus one section per component.
    # Single-quoted so the embedded \" escapes reach the JSON parser verbatim.
    - name: DYN_DEPLOYMENT_CONFIG
      value: '{"Common":{"model":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","block-size":64,"max-model-len":16384,"kv-transfer-config":"{\"kv_connector\":\"DynamoNixlConnector\"}"},"Frontend":{"served_model_name":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","endpoint":"dynamo.Processor.chat/completions","port":8000},"Processor":{"router":"round-robin","common-configs":["model","block-size"]},"VllmWorker":{"remote-prefill":true,"conditional-disagg":true,"max-local-prefill-length":10,"max-prefill-queue-size":2,"ServiceArgs":{"workers":1,"resources":{"gpu":"1"}},"common-configs":["model","block-size","max-model-len","kv-transfer-config"]},"PrefillWorker":{"max-num-batched-tokens":16384,"ServiceArgs":{"workers":1,"resources":{"gpu":"1"}},"common-configs":["model","block-size","max-model-len","kv-transfer-config"]},"Planner":{"environment":"kubernetes","no-operation":true}}'
  services:
    # CPU-only service; requests equal limits.
    Frontend:
      dynamoNamespace: llm-disagg
      componentType: main
      replicas: 1
      resources:
        requests:
          cpu: "1"
          memory: "2Gi"
        limits:
          cpu: "1"
          memory: "2Gi"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
          workingDir: /workspace/examples/llm
          args:
            - dynamo
            - serve
            - graphs.disagg:Frontend
            - --system-app-port
            - "5000"
            - --enable-system-app
            - --use-default-health-checks
            - --service-name
            - Frontend
    # CPU-only service; requests equal limits.
    Processor:
      dynamoNamespace: llm-disagg
      componentType: worker
      replicas: 1
      resources:
        requests:
          cpu: "1"
          memory: "2Gi"
        limits:
          cpu: "1"
          memory: "2Gi"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
          workingDir: /workspace/examples/llm
          args:
            - dynamo
            - serve
            - graphs.disagg:Processor
            - --system-app-port
            - "5000"
            - --enable-system-app
            - --use-default-health-checks
            - --service-name
            - Processor
    # GPU service. Environment comes from the "hf-token-secret" Secret
    # (presumably a Hugging Face access token -- TODO confirm).
    VllmWorker:
      envFromSecret: hf-token-secret
      dynamoNamespace: llm-disagg
      replicas: 1
      resources:
        # NOTE(review): confirm the CRD resources schema accepts the
        # "nvidia.com/gpu" key here (as opposed to a plain "gpu" field).
        requests:
          cpu: "10"
          memory: "20Gi"
          nvidia.com/gpu: "1"
        limits:
          cpu: "10"
          memory: "20Gi"
          nvidia.com/gpu: "1"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
          workingDir: /workspace/examples/llm
          args:
            - dynamo
            - serve
            - graphs.disagg:VllmWorker
            - --system-app-port
            - "5000"
            - --enable-system-app
            - --use-default-health-checks
            - --service-name
            - VllmWorker
    # GPU service with the same resources and secret as VllmWorker.
    PrefillWorker:
      envFromSecret: hf-token-secret
      dynamoNamespace: llm-disagg
      replicas: 1
      resources:
        requests:
          cpu: "10"
          memory: "20Gi"
          nvidia.com/gpu: "1"
        limits:
          cpu: "10"
          memory: "20Gi"
          nvidia.com/gpu: "1"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
          workingDir: /workspace/examples/llm
          args:
            - dynamo
            - serve
            - graphs.disagg:PrefillWorker
            - --system-app-port
            - "5000"
            - --enable-system-app
            - --use-default-health-checks
            - --service-name
            - PrefillWorker
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
---
# DynamoGraphDeployment "disagg-router": runs the graphs.disagg_router graph
# from /workspace/examples/llm as five services (Frontend, Processor, Router,
# VllmWorker, PrefillWorker). Every service is started with
# `dynamo serve graphs.disagg_router:<Service>` and exposes the system app on
# port 5000 with the default health checks enabled.
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
  name: disagg-router
spec:
  envs:
    # JSON document with a "Common" section plus one section per component.
    # Single-quoted so the embedded \" escapes reach the JSON parser verbatim.
    - name: DYN_DEPLOYMENT_CONFIG
      value: '{"Common":{"model":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","block-size":64,"max-model-len":16384,"router":"kv","kv-transfer-config":"{\"kv_connector\":\"DynamoNixlConnector\"}"},"Frontend":{"served_model_name":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","endpoint":"dynamo.Processor.chat/completions","port":8000},"Processor":{"common-configs":["model","block-size","max-model-len","router"]},"Router":{"min-workers":1,"common-configs":["model","block-size","router"]},"VllmWorker":{"max-num-batched-tokens":16384,"remote-prefill":true,"conditional-disagg":true,"max-local-prefill-length":10,"max-prefill-queue-size":2,"tensor-parallel-size":1,"enable-prefix-caching":true,"ServiceArgs":{"workers":1,"resources":{"gpu":"1"}},"common-configs":["model","block-size","max-model-len","router","kv-transfer-config"]},"PrefillWorker":{"max-num-batched-tokens":16384,"ServiceArgs":{"workers":1,"resources":{"gpu":"1"}},"common-configs":["model","block-size","max-model-len","kv-transfer-config"]},"Planner":{"environment":"kubernetes","no-operation":true}}'
  services:
    # CPU-only service; requests equal limits.
    Frontend:
      dynamoNamespace: llm-disagg-router
      componentType: main
      replicas: 1
      resources:
        requests:
          cpu: "1"
          memory: "2Gi"
        limits:
          cpu: "1"
          memory: "2Gi"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
          workingDir: /workspace/examples/llm
          args:
            - dynamo
            - serve
            - graphs.disagg_router:Frontend
            - --system-app-port
            - "5000"
            - --enable-system-app
            - --use-default-health-checks
            - --service-name
            - Frontend
    # CPU-only service; requests equal limits.
    Processor:
      dynamoNamespace: llm-disagg-router
      componentType: worker
      replicas: 1
      resources:
        requests:
          cpu: "1"
          memory: "2Gi"
        limits:
          cpu: "1"
          memory: "2Gi"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
          workingDir: /workspace/examples/llm
          args:
            - dynamo
            - serve
            - graphs.disagg_router:Processor
            - --system-app-port
            - "5000"
            - --enable-system-app
            - --use-default-health-checks
            - --service-name
            - Processor
    # CPU-only service; requests equal limits.
    Router:
      dynamoNamespace: llm-disagg-router
      componentType: worker
      replicas: 1
      resources:
        requests:
          cpu: "1"
          memory: "2Gi"
        limits:
          cpu: "1"
          memory: "2Gi"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
          workingDir: /workspace/examples/llm
          args:
            - dynamo
            - serve
            - graphs.disagg_router:Router
            - --system-app-port
            - "5000"
            - --enable-system-app
            - --use-default-health-checks
            - --service-name
            - Router
    # GPU service. Environment comes from the "hf-token-secret" Secret
    # (presumably a Hugging Face access token -- TODO confirm).
    VllmWorker:
      envFromSecret: hf-token-secret
      dynamoNamespace: llm-disagg-router
      replicas: 1
      resources:
        # NOTE(review): confirm the CRD resources schema accepts the
        # "nvidia.com/gpu" key here (as opposed to a plain "gpu" field).
        requests:
          cpu: "10"
          memory: "20Gi"
          nvidia.com/gpu: "1"
        limits:
          cpu: "10"
          memory: "20Gi"
          nvidia.com/gpu: "1"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
          workingDir: /workspace/examples/llm
          args:
            - dynamo
            - serve
            - graphs.disagg_router:VllmWorker
            - --system-app-port
            - "5000"
            - --enable-system-app
            - --use-default-health-checks
            - --service-name
            - VllmWorker
    # GPU service with the same resources and secret as VllmWorker.
    PrefillWorker:
      envFromSecret: hf-token-secret
      dynamoNamespace: llm-disagg-router
      replicas: 1
      resources:
        requests:
          cpu: "10"
          memory: "20Gi"
          nvidia.com/gpu: "1"
        limits:
          cpu: "10"
          memory: "20Gi"
          nvidia.com/gpu: "1"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
          workingDir: /workspace/examples/llm
          args:
            - dynamo
            - serve
            - graphs.disagg_router:PrefillWorker
            - --system-app-port
            - "5000"
            - --enable-system-app
            - --use-default-health-checks
            - --service-name
            - PrefillWorker
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
---
# DynamoGraphDeployment "agg" (dynamoNamespace vllm-v0-agg): runs the
# graphs.agg graph from /workspace/examples/vllm_v0 as two services (Frontend,
# VllmWorker). Every service is started with `dynamo serve graphs.agg:<Service>`
# and exposes the system app on port 5000 with the default health checks
# enabled.
# NOTE(review): the vllm_v1 aggregated manifest also uses metadata.name "agg";
# the two would collide if applied to the same Kubernetes namespace -- confirm
# this is intended.
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
  name: agg
spec:
  envs:
    # JSON document with a "Common" section plus one section per component.
    # Kept as a single-quoted scalar so the JSON is passed through verbatim.
    - name: DYN_DEPLOYMENT_CONFIG
      value: '{"Common":{"model":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","block-size":64,"max-model-len":16384},"Frontend":{"served_model_name":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","endpoint":"dynamo.VllmWorker.generate","port":8000,"router":"round-robin","common-configs":["block-size"]},"VllmWorker":{"enforce-eager":true,"max-num-batched-tokens":16384,"enable-prefix-caching":true,"common-configs":["model","block-size","max-model-len"]}}'
  services:
    # CPU-only service; requests equal limits.
    Frontend:
      dynamoNamespace: vllm-v0-agg
      componentType: main
      replicas: 1
      resources:
        requests:
          cpu: "1"
          memory: "2Gi"
        limits:
          cpu: "1"
          memory: "2Gi"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
          workingDir: /workspace/examples/vllm_v0
          args:
            - dynamo
            - serve
            - graphs.agg:Frontend
            - --system-app-port
            - "5000"
            - --enable-system-app
            - --use-default-health-checks
            - --service-name
            - Frontend
    # GPU service. Environment comes from the "hf-token-secret" Secret
    # (presumably a Hugging Face access token -- TODO confirm).
    VllmWorker:
      envFromSecret: hf-token-secret
      dynamoNamespace: vllm-v0-agg
      replicas: 1
      resources:
        # NOTE(review): confirm the CRD resources schema accepts the
        # "nvidia.com/gpu" key here (as opposed to a plain "gpu" field).
        requests:
          cpu: "10"
          memory: "20Gi"
          nvidia.com/gpu: "1"
        limits:
          cpu: "10"
          memory: "20Gi"
          nvidia.com/gpu: "1"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
          workingDir: /workspace/examples/vllm_v0
          args:
            - dynamo
            - serve
            - graphs.agg:VllmWorker
            - --system-app-port
            - "5000"
            - --enable-system-app
            - --use-default-health-checks
            - --service-name
            - VllmWorker
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
---
# DynamoGraphDeployment "disagg" (dynamoNamespace vllm-v0-disagg): runs the
# graphs.disagg graph from /workspace/examples/vllm_v0 as three services
# (Frontend, VllmWorker, PrefillWorker). Every service is started with
# `dynamo serve graphs.disagg:<Service>` and exposes the system app on
# port 5000 with the default health checks enabled.
# NOTE(review): the vllm_v1 disaggregated manifest also uses metadata.name
# "disagg"; the two would collide if applied to the same Kubernetes namespace
# -- confirm this is intended.
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
  name: disagg
spec:
  envs:
    # JSON document with a "Common" section plus one section per component.
    # Single-quoted so the embedded \" escapes reach the JSON parser verbatim.
    - name: DYN_DEPLOYMENT_CONFIG
      value: '{"Common":{"model":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","block-size":64,"max-model-len":16384,"kv-transfer-config":"{\"kv_connector\":\"DynamoNixlConnector\"}"},"Frontend":{"served_model_name":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","endpoint":"dynamo.VllmWorker.generate","port":8000,"router":"round-robin","common-configs":["block-size"]},"VllmWorker":{"remote-prefill":true,"conditional-disagg":true,"max-local-prefill-length":10,"max-prefill-queue-size":2,"enable-prefix-caching":true,"common-configs":["model","block-size","max-model-len","kv-transfer-config"]},"PrefillWorker":{"max-num-batched-tokens":16384,"common-configs":["model","block-size","max-model-len","kv-transfer-config"]}}'
  services:
    # CPU-only service; requests equal limits.
    Frontend:
      dynamoNamespace: vllm-v0-disagg
      componentType: main
      replicas: 1
      resources:
        requests:
          cpu: "1"
          memory: "2Gi"
        limits:
          cpu: "1"
          memory: "2Gi"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
          workingDir: /workspace/examples/vllm_v0
          args:
            - dynamo
            - serve
            - graphs.disagg:Frontend
            - --system-app-port
            - "5000"
            - --enable-system-app
            - --use-default-health-checks
            - --service-name
            - Frontend
    # GPU service. Environment comes from the "hf-token-secret" Secret
    # (presumably a Hugging Face access token -- TODO confirm).
    VllmWorker:
      dynamoNamespace: vllm-v0-disagg
      envFromSecret: hf-token-secret
      replicas: 1
      resources:
        # NOTE(review): confirm the CRD resources schema accepts the
        # "nvidia.com/gpu" key here (as opposed to a plain "gpu" field).
        requests:
          cpu: "10"
          memory: "20Gi"
          nvidia.com/gpu: "1"
        limits:
          cpu: "10"
          memory: "20Gi"
          nvidia.com/gpu: "1"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
          workingDir: /workspace/examples/vllm_v0
          args:
            - dynamo
            - serve
            - graphs.disagg:VllmWorker
            - --system-app-port
            - "5000"
            - --enable-system-app
            - --use-default-health-checks
            - --service-name
            - VllmWorker
    # GPU service with the same resources and secret as VllmWorker.
    PrefillWorker:
      dynamoNamespace: vllm-v0-disagg
      envFromSecret: hf-token-secret
      replicas: 1
      resources:
        requests:
          cpu: "10"
          memory: "20Gi"
          nvidia.com/gpu: "1"
        limits:
          cpu: "10"
          memory: "20Gi"
          nvidia.com/gpu: "1"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
          workingDir: /workspace/examples/vllm_v0
          args:
            - dynamo
            - serve
            - graphs.disagg:PrefillWorker
            - --system-app-port
            - "5000"
            - --enable-system-app
            - --use-default-health-checks
            - --service-name
            - PrefillWorker
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
---
# DynamoGraphDeployment "disagg-planner" (dynamoNamespace
# vllm-v0-disagg-planner): runs the graphs.disagg_planner graph from
# /workspace/examples/vllm_v0 as five services (Frontend, VllmWorker,
# PrefillWorker, Planner, Prometheus). Every service is started with
# `dynamo serve graphs.disagg_planner:<Service>` and exposes the system app on
# port 5000 with the default health checks enabled.
# NOTE(review): the vllm_v1 planner manifest also uses metadata.name
# "disagg-planner"; the two would collide if applied to the same Kubernetes
# namespace -- confirm this is intended.
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
  name: disagg-planner
spec:
  envs:
    # JSON document with a "Common" section plus one section per component,
    # including the Prometheus scrape configuration and the Planner settings.
    # Single-quoted so the embedded \" escapes reach the JSON parser verbatim.
    - name: DYN_DEPLOYMENT_CONFIG
      value: '{"Common":{"model":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","block-size":64,"max-model-len":16384,"kv-transfer-config":"{\"kv_connector\":\"DynamoNixlConnector\"}"},"Frontend":{"served_model_name":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","endpoint":"dynamo.VllmWorker.generate","port":8000,"router":"round-robin","common-configs":["block-size"]},"VllmWorker":{"remote-prefill":true,"conditional-disagg":true,"max-local-prefill-length":10,"max-prefill-queue-size":2,"enable-prefix-caching":true,"common-configs":["model","block-size","max-model-len","kv-transfer-config"]},"PrefillWorker":{"max-num-batched-tokens":16384,"common-configs":["model","block-size","max-model-len","kv-transfer-config"]},"Prometheus":{"global":{"scrape_interval":"5s"},"scrape_configs":[{"job_name":"prometheus","static_configs":[{"targets":["localhost:9090"]}]},{"job_name":"frontend","static_configs":[{"targets":["localhost:8000"]}]}]},"Planner":{"adjustment-interval":180,"profile-results-dir":"/workspace/examples/profiling_results","isl":3000,"osl":150,"ttft":0.5,"itl":0.05,"load-predictor":"arima"}}'
  services:
    # CPU-only service; requests equal limits.
    Frontend:
      dynamoNamespace: vllm-v0-disagg-planner
      componentType: main
      replicas: 1
      resources:
        requests:
          cpu: "2"
          memory: "4Gi"
        limits:
          cpu: "2"
          memory: "4Gi"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
          workingDir: /workspace/examples/vllm_v0
          args:
            - dynamo
            - serve
            - graphs.disagg_planner:Frontend
            - --system-app-port
            - "5000"
            - --enable-system-app
            - --use-default-health-checks
            - --service-name
            - Frontend
    # GPU service (two GPUs). Environment comes from the "hf-token-secret"
    # Secret (presumably a Hugging Face access token -- TODO confirm).
    VllmWorker:
      dynamoNamespace: vllm-v0-disagg-planner
      envFromSecret: hf-token-secret
      replicas: 1
      resources:
        # NOTE(review): confirm the CRD resources schema accepts the
        # "nvidia.com/gpu" key here (as opposed to a plain "gpu" field).
        requests:
          cpu: "20"
          memory: "40Gi"
          nvidia.com/gpu: "2"
        limits:
          cpu: "20"
          memory: "40Gi"
          nvidia.com/gpu: "2"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
          workingDir: /workspace/examples/vllm_v0
          args:
            - dynamo
            - serve
            - graphs.disagg_planner:VllmWorker
            - --system-app-port
            - "5000"
            - --enable-system-app
            - --use-default-health-checks
            - --service-name
            - VllmWorker
    # GPU service with the same resources and secret as VllmWorker.
    PrefillWorker:
      dynamoNamespace: vllm-v0-disagg-planner
      envFromSecret: hf-token-secret
      replicas: 1
      resources:
        requests:
          cpu: "20"
          memory: "40Gi"
          nvidia.com/gpu: "2"
        limits:
          cpu: "20"
          memory: "40Gi"
          nvidia.com/gpu: "2"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
          workingDir: /workspace/examples/vllm_v0
          args:
            - dynamo
            - serve
            - graphs.disagg_planner:PrefillWorker
            - --system-app-port
            - "5000"
            - --enable-system-app
            - --use-default-health-checks
            - --service-name
            - PrefillWorker
    # CPU-only service; the only one given an extra command-line override
    # (--Planner.environment=kubernetes).
    Planner:
      dynamoNamespace: vllm-v0-disagg-planner
      replicas: 1
      componentType: planner
      resources:
        requests:
          cpu: "2"
          memory: "2Gi"
        limits:
          cpu: "2"
          memory: "2Gi"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
          workingDir: /workspace/examples/vllm_v0
          args:
            - dynamo
            - serve
            - graphs.disagg_planner:Planner
            - --system-app-port
            - "5000"
            - --enable-system-app
            - --use-default-health-checks
            - --service-name
            - Planner
            - --Planner.environment=kubernetes
    # CPU-only service; requests equal limits.
    Prometheus:
      dynamoNamespace: vllm-v0-disagg-planner
      replicas: 1
      resources:
        requests:
          cpu: "1000m"
          memory: "1000Mi"
        limits:
          cpu: "1000m"
          memory: "1000Mi"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
          workingDir: /workspace/examples/vllm_v0
          args:
            - dynamo
            - serve
            - graphs.disagg_planner:Prometheus
            - --system-app-port
            - "5000"
            - --enable-system-app
            - --use-default-health-checks
            - --service-name
            - Prometheus
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
---
# DynamoGraphDeployment "agg" (dynamoNamespace vllm-v1-agg): runs the
# graphs.agg graph from /workspace/examples/vllm_v1 as three services
# (Frontend, SimpleLoadBalancer, VllmDecodeWorker). Every service is started
# with `dynamo serve graphs.agg:<Service>` and exposes the system app on
# port 5000 with the default health checks enabled.
# NOTE(review): the vllm_v0 aggregated manifest also uses metadata.name "agg";
# the two would collide if applied to the same Kubernetes namespace -- confirm
# this is intended.
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
  name: agg
spec:
  envs:
    # JSON document with a "Common" section plus per-component sections.
    # NOTE(review): this value is identical to the vllm_v0 "agg" config; it
    # has a "VllmWorker" section and a "dynamo.VllmWorker.generate" endpoint,
    # while the services declared below are SimpleLoadBalancer and
    # VllmDecodeWorker. Confirm the section names match what the vllm_v1
    # graph reads.
    - name: DYN_DEPLOYMENT_CONFIG
      value: '{"Common":{"model":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","block-size":64,"max-model-len":16384},"Frontend":{"served_model_name":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","endpoint":"dynamo.VllmWorker.generate","port":8000,"router":"round-robin","common-configs":["block-size"]},"VllmWorker":{"enforce-eager":true,"max-num-batched-tokens":16384,"enable-prefix-caching":true,"common-configs":["model","block-size","max-model-len"]}}'
  services:
    # CPU-only service; requests equal limits.
    Frontend:
      dynamoNamespace: vllm-v1-agg
      componentType: main
      replicas: 1
      resources:
        requests:
          cpu: "1"
          memory: "2Gi"
        limits:
          cpu: "1"
          memory: "2Gi"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
          workingDir: /workspace/examples/vllm_v1
          args:
            - dynamo
            - serve
            - graphs.agg:Frontend
            - --system-app-port
            - "5000"
            - --enable-system-app
            - --use-default-health-checks
            - --service-name
            - Frontend
    # CPU-only service (no GPU requested) that also reads "hf-token-secret".
    SimpleLoadBalancer:
      envFromSecret: hf-token-secret
      dynamoNamespace: vllm-v1-agg
      replicas: 1
      resources:
        requests:
          cpu: "1"
          memory: "20Gi"
        limits:
          cpu: "1"
          memory: "20Gi"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
          workingDir: /workspace/examples/vllm_v1
          args:
            - dynamo
            - serve
            - graphs.agg:SimpleLoadBalancer
            - --system-app-port
            - "5000"
            - --enable-system-app
            - --use-default-health-checks
            - --service-name
            - SimpleLoadBalancer
    # GPU service. Environment comes from the "hf-token-secret" Secret
    # (presumably a Hugging Face access token -- TODO confirm).
    VllmDecodeWorker:
      envFromSecret: hf-token-secret
      dynamoNamespace: vllm-v1-agg
      replicas: 1
      resources:
        # NOTE(review): confirm the CRD resources schema accepts the
        # "nvidia.com/gpu" key here (as opposed to a plain "gpu" field).
        requests:
          cpu: "10"
          memory: "20Gi"
          nvidia.com/gpu: "1"
        limits:
          cpu: "10"
          memory: "20Gi"
          nvidia.com/gpu: "1"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
          workingDir: /workspace/examples/vllm_v1
          args:
            - dynamo
            - serve
            - graphs.agg:VllmDecodeWorker
            - --system-app-port
            - "5000"
            - --enable-system-app
            - --use-default-health-checks
            - --service-name
            - VllmDecodeWorker
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
---
# DynamoGraphDeployment "disagg" (dynamoNamespace vllm-v1-disagg): runs the
# graphs.disagg graph from /workspace/examples/vllm_v1 as four services
# (Frontend, SimpleLoadBalancer, VllmDecodeWorker, VllmPrefillWorker). Every
# service is started with `dynamo serve graphs.disagg:<Service>` and exposes
# the system app on port 5000 with the default health checks enabled.
# NOTE(review): the vllm_v0 disaggregated manifest also uses metadata.name
# "disagg"; the two would collide if applied to the same Kubernetes namespace
# -- confirm this is intended.
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
  name: disagg
spec:
  envs:
    # JSON document with a "Common" section plus per-component sections.
    # Single-quoted so the embedded \" escapes reach the JSON parser verbatim.
    # NOTE(review): this value is identical to the vllm_v0 "disagg" config; it
    # has "VllmWorker" and "PrefillWorker" sections, while the services
    # declared below are SimpleLoadBalancer, VllmDecodeWorker and
    # VllmPrefillWorker. Confirm the section names match what the vllm_v1
    # graph reads.
    - name: DYN_DEPLOYMENT_CONFIG
      value: '{"Common":{"model":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","block-size":64,"max-model-len":16384,"kv-transfer-config":"{\"kv_connector\":\"DynamoNixlConnector\"}"},"Frontend":{"served_model_name":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","endpoint":"dynamo.VllmWorker.generate","port":8000,"router":"round-robin","common-configs":["block-size"]},"VllmWorker":{"remote-prefill":true,"conditional-disagg":true,"max-local-prefill-length":10,"max-prefill-queue-size":2,"enable-prefix-caching":true,"common-configs":["model","block-size","max-model-len","kv-transfer-config"]},"PrefillWorker":{"max-num-batched-tokens":16384,"common-configs":["model","block-size","max-model-len","kv-transfer-config"]}}'
  services:
    # CPU-only service; requests equal limits.
    Frontend:
      dynamoNamespace: vllm-v1-disagg
      componentType: main
      replicas: 1
      resources:
        requests:
          cpu: "1"
          memory: "2Gi"
        limits:
          cpu: "1"
          memory: "2Gi"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
          workingDir: /workspace/examples/vllm_v1
          args:
            - dynamo
            - serve
            - graphs.disagg:Frontend
            - --system-app-port
            - "5000"
            - --enable-system-app
            - --use-default-health-checks
            - --service-name
            - Frontend
    # CPU-only service (no GPU requested) that also reads "hf-token-secret".
    SimpleLoadBalancer:
      envFromSecret: hf-token-secret
      dynamoNamespace: vllm-v1-disagg
      replicas: 1
      resources:
        requests:
          cpu: "1"
          memory: "20Gi"
        limits:
          cpu: "1"
          memory: "20Gi"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
          workingDir: /workspace/examples/vllm_v1
          args:
            - dynamo
            - serve
            - graphs.disagg:SimpleLoadBalancer
            - --system-app-port
            - "5000"
            - --enable-system-app
            - --use-default-health-checks
            - --service-name
            - SimpleLoadBalancer
    # GPU service. Environment comes from the "hf-token-secret" Secret
    # (presumably a Hugging Face access token -- TODO confirm).
    VllmDecodeWorker:
      dynamoNamespace: vllm-v1-disagg
      envFromSecret: hf-token-secret
      replicas: 1
      resources:
        # NOTE(review): confirm the CRD resources schema accepts the
        # "nvidia.com/gpu" key here (as opposed to a plain "gpu" field).
        requests:
          cpu: "10"
          memory: "20Gi"
          nvidia.com/gpu: "1"
        limits:
          cpu: "10"
          memory: "20Gi"
          nvidia.com/gpu: "1"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
          workingDir: /workspace/examples/vllm_v1
          args:
            - dynamo
            - serve
            - graphs.disagg:VllmDecodeWorker
            - --system-app-port
            - "5000"
            - --enable-system-app
            - --use-default-health-checks
            - --service-name
            - VllmDecodeWorker
    # GPU service with the same resources and secret as VllmDecodeWorker.
    VllmPrefillWorker:
      dynamoNamespace: vllm-v1-disagg
      envFromSecret: hf-token-secret
      replicas: 1
      resources:
        requests:
          cpu: "10"
          memory: "20Gi"
          nvidia.com/gpu: "1"
        limits:
          cpu: "10"
          memory: "20Gi"
          nvidia.com/gpu: "1"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
          workingDir: /workspace/examples/vllm_v1
          args:
            - dynamo
            - serve
            - graphs.disagg:VllmPrefillWorker
            - --system-app-port
            - "5000"
            - --enable-system-app
            - --use-default-health-checks
            - --service-name
            - VllmPrefillWorker
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
---
# DynamoGraphDeployment "disagg-planner" (dynamoNamespace
# vllm-v1-disagg-planner): runs the graphs.disagg_planner graph from
# /workspace/examples/vllm_v1 as six services (Frontend, SimpleLoadBalancer,
# VllmDecodeWorker, VllmPrefillWorker, Planner, Prometheus). Every service is
# started with `dynamo serve graphs.disagg_planner:<Service>` and exposes the
# system app on port 5000 with the default health checks enabled.
# NOTE(review): the vllm_v0 planner manifest also uses metadata.name
# "disagg-planner"; the two would collide if applied to the same Kubernetes
# namespace -- confirm this is intended.
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
  name: disagg-planner
spec:
  envs:
    # JSON document with a "Common" section plus per-component sections,
    # including the Prometheus scrape configuration and the Planner settings.
    # Single-quoted so the embedded \" escapes reach the JSON parser verbatim.
    # NOTE(review): this value is identical to the vllm_v0 "disagg-planner"
    # config; it has "VllmWorker" and "PrefillWorker" sections, while the
    # services declared below are SimpleLoadBalancer, VllmDecodeWorker and
    # VllmPrefillWorker. Confirm the section names match what the vllm_v1
    # graph reads.
    - name: DYN_DEPLOYMENT_CONFIG
      value: '{"Common":{"model":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","block-size":64,"max-model-len":16384,"kv-transfer-config":"{\"kv_connector\":\"DynamoNixlConnector\"}"},"Frontend":{"served_model_name":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","endpoint":"dynamo.VllmWorker.generate","port":8000,"router":"round-robin","common-configs":["block-size"]},"VllmWorker":{"remote-prefill":true,"conditional-disagg":true,"max-local-prefill-length":10,"max-prefill-queue-size":2,"enable-prefix-caching":true,"common-configs":["model","block-size","max-model-len","kv-transfer-config"]},"PrefillWorker":{"max-num-batched-tokens":16384,"common-configs":["model","block-size","max-model-len","kv-transfer-config"]},"Prometheus":{"global":{"scrape_interval":"5s"},"scrape_configs":[{"job_name":"prometheus","static_configs":[{"targets":["localhost:9090"]}]},{"job_name":"frontend","static_configs":[{"targets":["localhost:8000"]}]}]},"Planner":{"adjustment-interval":180,"profile-results-dir":"/workspace/examples/profiling_results","isl":3000,"osl":150,"ttft":0.5,"itl":0.05,"load-predictor":"arima"}}'
  services:
    # CPU-only service; requests equal limits.
    Frontend:
      dynamoNamespace: vllm-v1-disagg-planner
      componentType: main
      replicas: 1
      resources:
        requests:
          cpu: "2"
          memory: "4Gi"
        limits:
          cpu: "2"
          memory: "4Gi"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
          workingDir: /workspace/examples/vllm_v1
          args:
            - dynamo
            - serve
            - graphs.disagg_planner:Frontend
            - --system-app-port
            - "5000"
            - --enable-system-app
            - --use-default-health-checks
            - --service-name
            - Frontend
    # CPU-only service (no GPU requested) that also reads "hf-token-secret".
    SimpleLoadBalancer:
      envFromSecret: hf-token-secret
      dynamoNamespace: vllm-v1-disagg-planner
      replicas: 1
      resources:
        requests:
          cpu: "1"
          memory: "20Gi"
        limits:
          cpu: "1"
          memory: "20Gi"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
          workingDir: /workspace/examples/vllm_v1
          args:
            - dynamo
            - serve
            - graphs.disagg_planner:SimpleLoadBalancer
            - --system-app-port
            - "5000"
            - --enable-system-app
            - --use-default-health-checks
            - --service-name
            - SimpleLoadBalancer
    # GPU service (two GPUs). Environment comes from the "hf-token-secret"
    # Secret (presumably a Hugging Face access token -- TODO confirm).
    VllmDecodeWorker:
      dynamoNamespace: vllm-v1-disagg-planner
      envFromSecret: hf-token-secret
      replicas: 1
      resources:
        # NOTE(review): confirm the CRD resources schema accepts the
        # "nvidia.com/gpu" key here (as opposed to a plain "gpu" field).
        requests:
          cpu: "20"
          memory: "40Gi"
          nvidia.com/gpu: "2"
        limits:
          cpu: "20"
          memory: "40Gi"
          nvidia.com/gpu: "2"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
          workingDir: /workspace/examples/vllm_v1
          args:
            - dynamo
            - serve
            - graphs.disagg_planner:VllmDecodeWorker
            - --system-app-port
            - "5000"
            - --enable-system-app
            - --use-default-health-checks
            - --service-name
            - VllmDecodeWorker
    # GPU service with the same resources and secret as VllmDecodeWorker.
    VllmPrefillWorker:
      dynamoNamespace: vllm-v1-disagg-planner
      envFromSecret: hf-token-secret
      replicas: 1
      resources:
        requests:
          cpu: "20"
          memory: "40Gi"
          nvidia.com/gpu: "2"
        limits:
          cpu: "20"
          memory: "40Gi"
          nvidia.com/gpu: "2"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
          workingDir: /workspace/examples/vllm_v1
          args:
            - dynamo
            - serve
            - graphs.disagg_planner:VllmPrefillWorker
            - --system-app-port
            - "5000"
            - --enable-system-app
            - --use-default-health-checks
            - --service-name
            - VllmPrefillWorker
    # CPU-only service; the only one given an extra command-line override
    # (--Planner.environment=kubernetes).
    Planner:
      dynamoNamespace: vllm-v1-disagg-planner
      replicas: 1
      componentType: planner
      resources:
        requests:
          cpu: "2"
          memory: "2Gi"
        limits:
          cpu: "2"
          memory: "2Gi"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
          workingDir: /workspace/examples/vllm_v1
          args:
            - dynamo
            - serve
            - graphs.disagg_planner:Planner
            - --system-app-port
            - "5000"
            - --enable-system-app
            - --use-default-health-checks
            - --service-name
            - Planner
            - --Planner.environment=kubernetes
    # CPU-only service; requests equal limits.
    Prometheus:
      dynamoNamespace: vllm-v1-disagg-planner
      replicas: 1
      resources:
        requests:
          cpu: "1000m"
          memory: "1000Mi"
        limits:
          cpu: "1000m"
          memory: "1000Mi"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
          workingDir: /workspace/examples/vllm_v1
          args:
            - dynamo
            - serve
            - graphs.disagg_planner:Prometheus
            - --system-app-port
            - "5000"
            - --enable-system-app
            - --use-default-health-checks
            - --service-name
            - Prometheus
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment