# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

# DynamoGraphDeployment "vllm-dsr1": a vLLM-backed graph made of one Frontend
# plus separate multinode `decode` and `prefill` worker services.
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
  name: vllm-dsr1
spec:
  backendFramework: vllm
  pvcs:
    # Volume that every service below mounts at /model-cache.
    # NOTE(review): `create: false` presumably means the PVC must already
    # exist in the namespace — confirm against the operator's CRD docs.
    - name: model-cache
      create: false
  services:
    # Frontend component (single replica). It mounts the shared model-cache
    # volume and is considered started once GET /health on port 8000 succeeds.
    Frontend:
      componentType: frontend
      replicas: 1
      volumeMounts:
        - name: model-cache
          mountPoint: /model-cache
      extraPodSpec:
        mainContainer:
          # NOTE(review): timeoutSeconds (1800) is far larger than
          # periodSeconds (10); the worker services in this file use
          # timeoutSeconds: 10 with failureThreshold: 600 instead.
          # Confirm the 1800 s per-probe timeout is intentional.
          startupProbe:
            httpGet:
              path: /health
              port: 8000
            periodSeconds: 10
            timeoutSeconds: 1800
            failureThreshold: 60
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.8.0
    # Decode worker: one replica spanning 2 nodes with 8 GPUs (and 8 rdma/ib
    # resources) each, i.e. 16 GPUs in total, matching --data-parallel-size 16
    # with --tensor-parallel-size 1 in the args below.
    decode:
      componentType: worker
      subComponentType: decode
      replicas: 1
      multinode:
        nodeCount: 2
      resources:
        limits:
          gpu: "8"
          custom:
            rdma/ib: "8"
      volumeMounts:
        - name: model-cache
          mountPoint: /model-cache
      sharedMemory:
        size: 80Gi
      extraPodSpec:
        mainContainer:
          # Polls GET /health on port 9090 every 10 s and tolerates up to 600
          # failures, so startup may take up to ~100 minutes before the probe
          # gives up.
          startupProbe:
            httpGet:
              path: /health
              port: 9090
            periodSeconds: 10
            timeoutSeconds: 10
            failureThreshold: 600
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.8.0
          workingDir: /workspace/dynamo
          # Environment for the vLLM process; all values are quoted strings.
          # VLLM_MOE_DP_CHUNK_SIZE is set here but not on the prefill worker.
          env:
            - name: VLLM_USE_DEEP_GEMM
              value: "1"
            - name: VLLM_MOE_DP_CHUNK_SIZE
              value: "384"
            - name: VLLM_SKIP_P2P_CHECK
              value: "1"
            - name: VLLM_RANDOMIZE_DP_DUMMY_INPUTS
              value: "1"
            - name: NVIDIA_GDRCOPY
              value: enabled
            - name: GLOO_SOCKET_IFNAME
              value: eth0
          # Runs `python3 -m dynamo.vllm` with the args listed below.
          command:
            - python3
            - -m
            - dynamo.vllm
          # NOTE(review): unlike the prefill worker, no --disaggregation-mode
          # flag is passed here — confirm the default is what decode needs.
          args:
            # Weights are read from the mounted model-cache volume and served
            # under the name deepseek-ai/DeepSeek-R1.
            - --model
            - /model-cache/deepseek-r1
            - --served-model-name
            - deepseek-ai/DeepSeek-R1
            # Decode uses deepep_low_latency; prefill uses
            # deepep_high_throughput.
            - --all2all-backend
            - deepep_low_latency
            - --data-parallel-hybrid-lb
            - --tensor-parallel-size
            - "1"
            - --data-parallel-size
            - "16"
            - --enable-expert-parallel
            - --max-model-len
            - "16384"
            - --enable-dbo
            - --dbo-decode-token-threshold
            - "32"
            - --async-scheduling
            - --enable-eplb
            # NOTE(review): numeric and boolean fields are passed as JSON
            # strings ("1000", "False") — confirm the consumer coerces them.
            - --eplb-config
            - '{"window_size":"1000","step_interval":"3000","num_redundant_experts":"32","log_balancedness":"False"}'
            - --max-num-seqs
            - "512"
            # Only the decode worker passes a compilation config.
            - --compilation_config
            - '{"pass_config":{"fuse_norm_quant":true,"eliminate_noops":true},"cudagraph_mode":"FULL_DECODE_ONLY"}'
            # Same KV-transfer connector settings as the prefill worker.
            - --kv-transfer-config
            - '{"kv_connector":"NixlConnector","kv_role":"kv_both"}'
    # Prefill worker: one replica spanning 2 nodes with 8 GPUs (and 8 rdma/ib
    # resources) each, i.e. 16 GPUs in total, matching --data-parallel-size 16
    # with --tensor-parallel-size 1 in the args below.
    prefill:
      componentType: worker
      subComponentType: prefill
      replicas: 1
      multinode:
        nodeCount: 2
      resources:
        limits:
          gpu: "8"
          custom:
            rdma/ib: "8"
      volumeMounts:
        - name: model-cache
          mountPoint: /model-cache
      sharedMemory:
        size: 80Gi
      extraPodSpec:
        mainContainer:
          # Polls GET /health on port 9090 every 10 s and tolerates up to 600
          # failures, so startup may take up to ~100 minutes before the probe
          # gives up.
          startupProbe:
            httpGet:
              path: /health
              port: 9090
            periodSeconds: 10
            timeoutSeconds: 10
            failureThreshold: 600
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.8.0
          workingDir: /workspace/dynamo
          # Environment for the vLLM process; all values are quoted strings.
          # Same set as the decode worker minus VLLM_MOE_DP_CHUNK_SIZE.
          env:
            - name: VLLM_USE_DEEP_GEMM
              value: "1"
            - name: VLLM_SKIP_P2P_CHECK
              value: "1"
            - name: VLLM_RANDOMIZE_DP_DUMMY_INPUTS
              value: "1"
            - name: NVIDIA_GDRCOPY
              value: enabled
            - name: GLOO_SOCKET_IFNAME
              value: eth0
          # Runs `python3 -m dynamo.vllm` with the args listed below.
          command:
            - python3
            - -m
            - dynamo.vllm
          args:
            # Weights are read from the mounted model-cache volume.
            - --model
            - /model-cache/deepseek-r1
            # Explicitly selects prefill mode for this worker.
            - --disaggregation-mode
            - prefill
            # Same KV-transfer connector settings as the decode worker.
            - --kv-transfer-config
            - '{"kv_connector":"NixlConnector","kv_role":"kv_both"}'
            - --served-model-name
            - deepseek-ai/DeepSeek-R1
            # Prefill uses deepep_high_throughput; decode uses
            # deepep_low_latency.
            - --all2all-backend
            - deepep_high_throughput
            - --data-parallel-hybrid-lb
            - --tensor-parallel-size
            - "1"
            - --data-parallel-size
            - "16"
            - --enable-expert-parallel
            - --max-model-len
            - "16384"
            - --enable-dbo
            - --dbo-decode-token-threshold
            - "32"
            - --async-scheduling
            - --enable-eplb
            # NOTE(review): numeric and boolean fields are passed as JSON
            # strings ("1000", "False") — confirm the consumer coerces them.
            - --eplb-config
            - '{"window_size":"1000","step_interval":"3000","num_redundant_experts":"32","log_balancedness":"False"}'
            - --max-num-seqs
            - "512"