"lib/bindings/kvbm/Cargo.toml" did not exist on "334ce5511863d8bfa0dfe85f238163129556a8bb"
disagg_router.yaml 3.39 KB
Newer Older
1
2
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
Alec's avatar
Alec committed
3

4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
  name: vllm-v1-disagg-router
spec:
  services:
    Frontend:
      dynamoNamespace: vllm-v1-disagg-router
      componentType: main
      replicas: 1
      livenessProbe:
        httpGet:
          path: /health
          port: 8000
        initialDelaySeconds: 60
        periodSeconds: 60
        timeoutSeconds: 30
        failureThreshold: 10
      readinessProbe:
        exec:
          command:
            - /bin/sh
            - -c
            - 'curl -s http://localhost:8000/health | jq -e ".status == \"healthy\""'
        initialDelaySeconds: 60
        periodSeconds: 60
        timeoutSeconds: 30
        failureThreshold: 10
      resources:
        requests:
          cpu: "1"
          memory: "2Gi"
        limits:
          cpu: "1"
          memory: "2Gi"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidian/nim-llm-dev/vllm_v1-runtime:dep-216.4
Alec's avatar
Alec committed
42
          workingDir: /workspace/components/backends/vllm
43
          args:
Alec's avatar
Alec committed
44
            - "python3 -m dynamo.frontend --http-port 8080 --router-mode kv"
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
    VllmDecodeWorker:
      dynamoNamespace: vllm-v1-disagg-router
      envFromSecret: hf-token-secret
      componentType: worker
      replicas: 2
      livenessProbe:
        exec:
          command:
            - /bin/sh
            - -c
            - "exit 0"
        periodSeconds: 60
        timeoutSeconds: 30
        failureThreshold: 10
      readinessProbe:
        exec:
          command:
            - /bin/sh
            - -c
            - 'grep "VllmWorker.*has been initialized" /tmp/vllm.log'
        initialDelaySeconds: 60
        periodSeconds: 60
        timeoutSeconds: 30
        failureThreshold: 10
      resources:
        requests:
          cpu: "10"
          memory: "20Gi"
          gpu: "1"
        limits:
          cpu: "10"
          memory: "20Gi"
          gpu: "1"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidian/nim-llm-dev/vllm_v1-runtime:dep-216.4
Alec's avatar
Alec committed
81
          workingDir: /workspace/components/backends/vllm
82
          args:
Alec's avatar
Alec committed
83
            - "python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --enforce-eager 2>&1 | tee /tmp/vllm.log"
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
    VllmPrefillWorker:
      dynamoNamespace: vllm-v1-disagg-router
      envFromSecret: hf-token-secret
      componentType: worker
      replicas: 1
      livenessProbe:
        exec:
          command:
            - /bin/sh
            - -c
            - "exit 0"
        periodSeconds: 60
        timeoutSeconds: 30
        failureThreshold: 10
      readinessProbe:
        exec:
          command:
            - /bin/sh
            - -c
            - 'grep "VllmWorker.*has been initialized" /tmp/vllm.log'
        initialDelaySeconds: 60
        periodSeconds: 60
        timeoutSeconds: 30
        failureThreshold: 10
      resources:
        requests:
          cpu: "10"
          memory: "20Gi"
          gpu: "1"
        limits:
          cpu: "10"
          memory: "20Gi"
          gpu: "1"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidian/nim-llm-dev/vllm_v1-runtime:dep-216.4
Alec's avatar
Alec committed
120
          workingDir: /workspace/components/backends/vllm
121
          args:
Alec's avatar
Alec committed
122
            - "python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --enforce-eager --is-prefill-worker 2>&1 | tee /tmp/vllm.log"