# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
---
# Engine configuration files for the prefill and decode workers.
# Both workers in the DynamoGraphDeployment below mount this ConfigMap at
# /workspace/components/backends/trtllm/engine_configs and select one file
# through --extra-engine-args (engine_configs/prefill.yaml or
# engine_configs/decode.yaml).
# NOTE: everything under each `|` is file content delivered to the container;
# comments added there become part of the mounted file.
apiVersion: v1
kind: ConfigMap
metadata:
  name: nvidia-config
data:
  prefill.yaml: |
    # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
    # SPDX-License-Identifier: Apache-2.0
    #
    # Licensed under the Apache License, Version 2.0 (the "License");
    # you may not use this file except in compliance with the License.
    # You may obtain a copy of the License at
    #
    # http://www.apache.org/licenses/LICENSE-2.0
    #
    # Unless required by applicable law or agreed to in writing, software
    # distributed under the License is distributed on an "AS IS" BASIS,
    # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    # See the License for the specific language governing permissions and
    # limitations under the License.
    tensor_parallel_size: 8
    moe_expert_parallel_size: 1
    enable_attention_dp: false
    max_num_tokens: 8192
    trust_remote_code: true
    backend: pytorch
    enable_chunked_prefill: true
    # Overlap scheduler not currently supported in prefill only workers.
    disable_overlap_scheduler: true
    kv_cache_config:
      free_gpu_memory_fraction: 0.80
    cache_transceiver_config:
      backend: default
  decode.yaml: |
    # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
    # SPDX-License-Identifier: Apache-2.0
    #
    # Licensed under the Apache License, Version 2.0 (the "License");
    # you may not use this file except in compliance with the License.
    # You may obtain a copy of the License at
    #
    # http://www.apache.org/licenses/LICENSE-2.0
    #
    # Unless required by applicable law or agreed to in writing, software
    # distributed under the License is distributed on an "AS IS" BASIS,
    # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    # See the License for the specific language governing permissions and
    # limitations under the License.
    tensor_parallel_size: 8
    moe_expert_parallel_size: 1
    enable_attention_dp: false
    max_num_tokens: 8192
    trust_remote_code: true
    backend: pytorch
    enable_chunked_prefill: true
    disable_overlap_scheduler: false
    kv_cache_config:
      free_gpu_memory_fraction: 0.80
    cache_transceiver_config:
      backend: default
---
# Shared volume claim referenced by the prefill and decode services
# (pvc.name: models) and mounted at /models, which is also the HF_HOME value
# set in the deployment's envs.
# No storageClassName is set, so the cluster default applies; ReadWriteMany
# must be supported by that class.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: models
spec:
  accessModes:
    - ReadWriteMany
  resources:
    requests:
      storage: 100Gi
---
# Disaggregated deployment: one frontend plus separate prefill and decode
# worker services, all in the trtllm-disagg namespace.
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
  name: trtllm-disagg-tp8
spec:
  backendFramework: trtllm
  # Environment variables declared once at spec level.
  envs:
    - name: OMPI_ALLOW_RUN_AS_ROOT
      value: "1"
    - name: OMPI_ALLOW_RUN_AS_ROOT_CONFIRM
      value: "1"
    - name: HF_HOME
      value: "/models"
  services:
    Frontend:
      dynamoNamespace: trtllm-disagg
      componentType: frontend
      replicas: 1
      extraPodSpec:
        mainContainer:
          # NOTE(review): image looks like a placeholder; replace with a real
          # registry/tag before applying.
          image: my-registry/trtllm-runtime:my-tag
          workingDir: /workspace/components/backends/trtllm
          command:
            - python3
            - -m
            - dynamo.frontend
          args:
            - --http-port
            - "8000"
    prefill:
      pvc:
        name: models
        mountPoint: /models
      dynamoNamespace: trtllm-disagg
      envFromSecret: hf-token-secret
      componentType: worker
      replicas: 1
      # NOTE(review): presumably 2 nodes x 4 GPUs each, matching
      # tensor_parallel_size: 8 in prefill.yaml; confirm against the
      # operator's multinode semantics.
      multinode:
        nodeCount: 2
      resources:
        limits:
          gpu: "4"
      extraPodSpec:
        volumes:
          - name: nvidia-config
            configMap:
              name: nvidia-config
        mainContainer:
          volumeMounts:
            # Exposes prefill.yaml / decode.yaml under <workingDir>/engine_configs.
            - name: nvidia-config
              mountPath: /workspace/components/backends/trtllm/engine_configs
              readOnly: true
          image: my-registry/trtllm-runtime:my-tag
          workingDir: /workspace/components/backends/trtllm
          command:
            - python3
            - -m
            - dynamo.trtllm
          args:
            - --model-path
            - Qwen/Qwen3-0.6B
            - --served-model-name
            - Qwen/Qwen3-0.6B
            # Relative to workingDir; resolves to the mounted ConfigMap file.
            - --extra-engine-args
            - engine_configs/prefill.yaml
            - --disaggregation-mode
            - prefill
            - --disaggregation-strategy
            - decode_first
    decode:
      pvc:
        name: models
        mountPoint: /models
      dynamoNamespace: trtllm-disagg
      envFromSecret: hf-token-secret
      componentType: worker
      replicas: 1
      # NOTE(review): presumably 2 nodes x 4 GPUs each, matching
      # tensor_parallel_size: 8 in decode.yaml; confirm against the
      # operator's multinode semantics.
      multinode:
        nodeCount: 2
      resources:
        limits:
          gpu: "4"
      extraPodSpec:
        volumes:
          - name: nvidia-config
            configMap:
              name: nvidia-config
        mainContainer:
          volumeMounts:
            # Exposes prefill.yaml / decode.yaml under <workingDir>/engine_configs.
            - name: nvidia-config
              mountPath: /workspace/components/backends/trtllm/engine_configs
              readOnly: true
          image: my-registry/trtllm-runtime:my-tag
          workingDir: /workspace/components/backends/trtllm
          command:
            - python3
            - -m
            - dynamo.trtllm
          args:
            - --model-path
            - Qwen/Qwen3-0.6B
            - --served-model-name
            - Qwen/Qwen3-0.6B
            # Relative to workingDir; resolves to the mounted ConfigMap file.
            - --extra-engine-args
            - engine_configs/decode.yaml
            - --disaggregation-mode
            - decode
            - --disaggregation-strategy
            - decode_first