Unverified commit 5f7c1f7e, authored by Ziqi Fan and committed by GitHub
Browse files

chore: add sample KVBM related k8s deployment manifests (#3523)


Signed-off-by: Ziqi Fan <ziqif@nvidia.com>
parent e9a71009
---
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

# Aggregated vLLM sample: one Frontend plus a single worker that launches
# `python3 -m dynamo.vllm` with `--connector kvbm`.
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
  name: vllm-agg-kvbm
spec:
  services:
    Frontend:
      dynamoNamespace: vllm-agg-kvbm
      componentType: frontend
      replicas: 1
      extraPodSpec:
        mainContainer:
          # NOTE(review): "my-tag" looks like a placeholder tag; replace it
          # with a published image tag before applying.
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
    VllmDecodeWorker:
      # Presumably the secret holding the Hugging Face token; it must exist
      # in the target namespace. TODO confirm.
      envFromSecret: hf-token-secret
      dynamoNamespace: vllm-agg-kvbm
      componentType: worker
      replicas: 1
      resources:
        requests:
          gpu: "1"
          memory: "200Gi"
        limits:
          gpu: "1"
          memory: "250Gi"
      envs:
        # Presumably the size (in GB) of the KVBM CPU cache; the memory
        # request above is sized larger than this value. TODO confirm units.
        - name: DYN_KVBM_CPU_CACHE_GB
          value: "100"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
          workingDir: /workspace/components/backends/vllm
          command:
            - python3
            - -m
            - dynamo.vllm
          args:
            - --model
            - Qwen/Qwen3-8B
            - --gpu-memory-utilization
            - "0.45"
            - --disable-log-requests
            - --max-model-len
            - "32000"
            - --enforce-eager
            - --connector
            - kvbm
---
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

# Disaggregated vLLM sample: one Frontend, one decode worker and one prefill
# worker. Only the prefill worker is started with `--connector kvbm nixl`.
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
  name: vllm-disagg-kvbm
spec:
  services:
    Frontend:
      dynamoNamespace: vllm-disagg-kvbm
      componentType: frontend
      replicas: 1
      extraPodSpec:
        mainContainer:
          # NOTE(review): "my-tag" looks like a placeholder tag; replace it
          # with a published image tag before applying.
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
    VllmDecodeWorker:
      dynamoNamespace: vllm-disagg-kvbm
      # Presumably the secret holding the Hugging Face token; it must exist
      # in the target namespace. TODO confirm.
      envFromSecret: hf-token-secret
      componentType: worker
      replicas: 1
      resources:
        limits:
          gpu: "1"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
          workingDir: /workspace/components/backends/vllm
          command:
            - python3
            - -m
            - dynamo.vllm
          # No --connector flag is passed to the decode worker.
          args:
            - --model
            - Qwen/Qwen3-8B
            - --gpu-memory-utilization
            - "0.3"
            - --disable-log-requests
            - --max-model-len
            - "32000"
            - --enforce-eager
    VllmPrefillWorker:
      dynamoNamespace: vllm-disagg-kvbm
      envFromSecret: hf-token-secret
      componentType: worker
      replicas: 1
      resources:
        requests:
          gpu: "1"
          memory: "200Gi"
        limits:
          gpu: "1"
          memory: "250Gi"
      envs:
        # Presumably the size (in GB) of the KVBM CPU cache; the memory
        # request above is sized larger than this value. TODO confirm units.
        - name: DYN_KVBM_CPU_CACHE_GB
          value: "100"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
          workingDir: /workspace/components/backends/vllm
          command:
            - python3
            - -m
            - dynamo.vllm
          args:
            - --model
            - Qwen/Qwen3-8B
            - --is-prefill-worker
            - --gpu-memory-utilization
            - "0.3"
            - --disable-log-requests
            - --max-model-len
            - "32000"
            - --enforce-eager
            # Two values follow --connector: kvbm and nixl.
            - --connector
            - kvbm
            - nixl
---
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

# Disaggregated vLLM sample with two decode replicas and two prefill
# replicas. Only the prefill workers are started with `--connector kvbm nixl`.
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
  name: vllm-disagg-kvbm-2p2d
spec:
  services:
    Frontend:
      dynamoNamespace: vllm-disagg-kvbm-2p2d
      componentType: frontend
      replicas: 1
      extraPodSpec:
        mainContainer:
          # NOTE(review): "my-tag" looks like a placeholder tag; replace it
          # with a published image tag before applying.
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
    VllmDecodeWorker:
      dynamoNamespace: vllm-disagg-kvbm-2p2d
      # Presumably the secret holding the Hugging Face token; it must exist
      # in the target namespace. TODO confirm.
      envFromSecret: hf-token-secret
      componentType: worker
      replicas: 2
      resources:
        limits:
          gpu: "1"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
          workingDir: /workspace/components/backends/vllm
          command:
            - python3
            - -m
            - dynamo.vllm
          # No --connector flag is passed to the decode workers.
          args:
            - --model
            - Qwen/Qwen3-8B
            - --gpu-memory-utilization
            - "0.3"
            - --disable-log-requests
            - --max-model-len
            - "32000"
            - --enforce-eager
    VllmPrefillWorker:
      dynamoNamespace: vllm-disagg-kvbm-2p2d
      envFromSecret: hf-token-secret
      componentType: worker
      replicas: 2
      resources:
        requests:
          gpu: "1"
          memory: "200Gi"
        limits:
          gpu: "1"
          memory: "250Gi"
      envs:
        # Presumably the size (in GB) of the KVBM CPU cache; the memory
        # request above is sized larger than this value. TODO confirm units.
        - name: DYN_KVBM_CPU_CACHE_GB
          value: "100"
        # Populated from the object's own metadata.name via fieldRef;
        # presumably this gives each prefill replica a distinct barrier ID
        # prefix. TODO confirm against the KVBM documentation.
        - name: DYN_KVBM_BARRIER_ID_PREFIX
          valueFrom:
            fieldRef:
              fieldPath: metadata.name
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
          workingDir: /workspace/components/backends/vllm
          command:
            - python3
            - -m
            - dynamo.vllm
          args:
            - --model
            - Qwen/Qwen3-8B
            - --is-prefill-worker
            - --gpu-memory-utilization
            - "0.3"
            - --disable-log-requests
            - --max-model-len
            - "32000"
            - --enforce-eager
            # Two values follow --connector: kvbm and nixl.
            - --connector
            - kvbm
            - nixl
---
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

# Disaggregated vLLM sample in which both workers request two GPUs and run
# with `--tensor-parallel-size 2`. Only the prefill worker is started with
# `--connector kvbm nixl`.
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
  name: vllm-disagg-kvbm-tp2
spec:
  services:
    Frontend:
      dynamoNamespace: vllm-disagg-kvbm-tp2
      componentType: frontend
      replicas: 1
      extraPodSpec:
        mainContainer:
          # NOTE(review): "my-tag" looks like a placeholder tag; replace it
          # with a published image tag before applying.
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
    VllmDecodeWorker:
      dynamoNamespace: vllm-disagg-kvbm-tp2
      # Presumably the secret holding the Hugging Face token; it must exist
      # in the target namespace. TODO confirm.
      envFromSecret: hf-token-secret
      componentType: worker
      replicas: 1
      resources:
        # GPU count matches the --tensor-parallel-size value passed below.
        requests:
          gpu: "2"
        limits:
          gpu: "2"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
          workingDir: /workspace/components/backends/vllm
          command:
            - python3
            - -m
            - dynamo.vllm
          # No --connector flag is passed to the decode worker.
          args:
            - --model
            - Qwen/Qwen3-8B
            - --gpu-memory-utilization
            - "0.23"
            - --disable-log-requests
            - --max-model-len
            - "32000"
            - --enforce-eager
            - --tensor-parallel-size
            - "2"
    VllmPrefillWorker:
      dynamoNamespace: vllm-disagg-kvbm-tp2
      envFromSecret: hf-token-secret
      componentType: worker
      replicas: 1
      resources:
        # GPU count matches the --tensor-parallel-size value passed below.
        requests:
          gpu: "2"
          memory: "200Gi"
        limits:
          gpu: "2"
          memory: "250Gi"
      envs:
        # Presumably the size (in GB) of the KVBM CPU cache; the memory
        # request above is sized larger than this value. TODO confirm units.
        - name: DYN_KVBM_CPU_CACHE_GB
          value: "100"
        # Populated from the object's own metadata.name via fieldRef;
        # presumably used as a barrier ID prefix unique to this pod.
        # TODO confirm against the KVBM documentation.
        - name: DYN_KVBM_BARRIER_ID_PREFIX
          valueFrom:
            fieldRef:
              fieldPath: metadata.name
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
          workingDir: /workspace/components/backends/vllm
          command:
            - python3
            - -m
            - dynamo.vllm
          args:
            - --model
            - Qwen/Qwen3-8B
            - --is-prefill-worker
            - --gpu-memory-utilization
            - "0.23"
            - --disable-log-requests
            - --max-model-len
            - "32000"
            - --enforce-eager
            # Two values follow --connector: kvbm and nixl.
            - --connector
            - kvbm
            - nixl
            - --tensor-parallel-size
            - "2"
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment