# shared_frontend.yaml
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  # The worker services in this file reference the claim by this name
  # (`pvc.name`) and mount it at /root/.cache/huggingface.
  name: dynamo-model-cache
spec:
  # Requested capacity. No storageClassName is set in this manifest.
  resources:
    requests:
      storage: 100Gi
  # NOTE(review): ReadWriteOnce restricts the volume to a single node, yet
  # several worker services in this file mount this claim. Confirm that they
  # are scheduled on one node or that the storage backend tolerates this.
  accessModes:
    - ReadWriteOnce
---
# Graph containing a single `Frontend` service and no worker services.
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
  name: frontend
spec:
  services:
    Frontend:
      # NOTE(review): this namespace (`dynamo`) differs from the ones used by
      # the worker graphs in this file (`vllm-agg`, `agg-qwen`); presumably
      # that is what lets one frontend be shared between them. Confirm
      # against the operator documentation.
      dynamoNamespace: dynamo
      componentType: frontend
      replicas: 1
      extraPodSpec:
        mainContainer:
          # Same runtime image and tag as the worker services in this file.
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.5.0
---
# Single-worker vLLM graph: one `VllmDecodeWorker` service that launches
# `python3 -m dynamo.vllm` with the Qwen/Qwen3-0.6B model.
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
  name: vllm-agg
spec:
  services:
    VllmDecodeWorker:
      # Mounts the `dynamo-model-cache` claim declared in this file;
      # `create: false` means this service does not ask for a new claim.
      pvc:
        create: false
        name: dynamo-model-cache
        mountPoint: /root/.cache/huggingface
      # Name of a Secret that is not defined in this file.
      # NOTE(review): presumably holds the Hugging Face token; it must exist
      # in the target namespace before deploying.
      envFromSecret: hf-token-secret
      dynamoNamespace: vllm-agg
      componentType: worker
      replicas: 1
      resources:
        limits:
          # Quoted so the value stays a string rather than an integer.
          gpu: "1"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.5.0
          workingDir: /workspace/examples/backends/vllm
          # The single `args` entry is the script string handed to
          # `/bin/sh -c`.
          command:
            - /bin/sh
            - -c
          args:
            - python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B
---
# Multimodal graph with three worker services (EncodeWorker, VLMWorker,
# Processor). Each one runs a script from `components/` under
# /workspace/examples/multimodal with the Qwen/Qwen2.5-VL-7B-Instruct model.
# All three share the same namespace, image, Secret reference and PVC mount.
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
  name: agg-qwen
spec:
  backendFramework: vllm
  services:
    # Runs components/encode_worker.py.
    EncodeWorker:
      componentType: worker
      dynamoNamespace: agg-qwen
      replicas: 1
      # Name of a Secret that is not defined in this file.
      envFromSecret: hf-token-secret
      # Mounts the existing `dynamo-model-cache` claim (no new claim requested).
      pvc:
        name: dynamo-model-cache
        mountPoint: /root/.cache/huggingface
        create: false
      resources:
        limits:
          gpu: "1"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.5.0
          workingDir: /workspace/examples/multimodal
          # The single `args` entry is the script string for `/bin/sh -c`.
          command:
            - /bin/sh
            - -c
          args:
            - python3 components/encode_worker.py --model Qwen/Qwen2.5-VL-7B-Instruct
    # Runs components/worker.py.
    VLMWorker:
      componentType: worker
      dynamoNamespace: agg-qwen
      replicas: 1
      envFromSecret: hf-token-secret
      pvc:
        name: dynamo-model-cache
        mountPoint: /root/.cache/huggingface
        create: false
      resources:
        limits:
          gpu: "1"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.5.0
          workingDir: /workspace/examples/multimodal
          command:
            - /bin/sh
            - -c
          # NOTE(review): started with `--worker-type prefill` although this
          # graph defines no separate decode worker; confirm this is intended.
          args:
            - python3 components/worker.py --model Qwen/Qwen2.5-VL-7B-Instruct --worker-type prefill
    # Runs components/processor.py with an explicit prompt template.
    Processor:
      componentType: worker
      dynamoNamespace: agg-qwen
      replicas: 1
      envFromSecret: hf-token-secret
      pvc:
        name: dynamo-model-cache
        mountPoint: /root/.cache/huggingface
        create: false
      resources:
        limits:
          gpu: "1"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.5.0
          workingDir: /workspace/examples/multimodal
          command:
            - /bin/sh
            - -c
          # Single-quoted YAML scalar: the backslash sequences (`\n`) and the
          # inner double quotes reach the shell exactly as written.
          # NOTE(review): how /bin/sh and processor.py then interpret `\n` is
          # not visible in this file.
          args:
            - 'python3 components/processor.py --model Qwen/Qwen2.5-VL-7B-Instruct --prompt-template "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|><prompt><|im_end|>\n<|im_start|>assistant\n"'