Commit deb6c7e8 authored by ishandhanani, committed by GitHub

feat(llm): adding initial TRTLLM disaggregation support


Co-authored-by: nnshah1 <neelays@nvidia.com>
Co-authored-by: Ryan McCormick <rmccormick@nvidia.com>
parent 425be8ad
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import numpy
import triton_python_backend_utils as pb_utils
from transformers import AutoTokenizer, T5Tokenizer
class TritonPythonModel:
"""
    This model allows Triton to act as an API server for T3 ICP
"""
@staticmethod
def auto_complete_config(auto_complete_model_config):
inputs = [
{"name": "query", "data_type": "TYPE_STRING", "dims": [1]},
]
outputs = [
{"name": "start_ids", "data_type": "TYPE_INT32", "dims": [-1]},
{"name": "start_lengths", "data_type": "TYPE_INT32", "dims": [-1]},
]
# Store the model configuration as a dictionary.
config = auto_complete_model_config.as_dict()
        input_names = [model_input["name"] for model_input in config["input"]]
        output_names = [model_output["name"] for model_output in config["output"]]
        # Add only missing inputs and outputs to the model configuration.
        for model_input in inputs:
            if model_input["name"] not in input_names:
                auto_complete_model_config.add_input(model_input)
        for model_output in outputs:
            if model_output["name"] not in output_names:
                auto_complete_model_config.add_output(model_output)
return auto_complete_model_config
def initialize(self, args):
model_config = json.loads(args["model_config"])
self.logger = pb_utils.Logger
tokenizer_dir = model_config["parameters"]["tokenizer_dir"]["string_value"]
self._tokenizer = AutoTokenizer.from_pretrained(
tokenizer_dir, legacy=False, padding_side="left", trust_remote_code=True
)
if isinstance(self._tokenizer, T5Tokenizer):
self._tokenizer_bos_id = self._tokenizer.sp_model.bos_id()
if not self._tokenizer.pad_token:
self._tokenizer.pad_token = self._tokenizer.eos_token
self._tokenizer_end_id = self._tokenizer.encode(
self._tokenizer.eos_token, add_special_tokens=False
)[0]
self._tokenizer_pad_id = self._tokenizer.encode(
self._tokenizer.pad_token, add_special_tokens=False
)[0]
self._vocab_size = self._tokenizer.vocab_size
for output_name in ["start_ids", "start_lengths"]:
setattr(
self,
output_name.lower() + "_dtype",
pb_utils.triton_string_to_numpy(
pb_utils.get_output_config_by_name(model_config, output_name)[
"data_type"
]
),
)
def execute(self, requests):
responses = []
for request in requests:
query = pb_utils.get_input_tensor_by_name(request, "query").as_numpy()
# Preprocessing input data.
if isinstance(self._tokenizer, T5Tokenizer):
start_ids = [
numpy.array(
[self._tokenizer_bos_id]
+ self._tokenizer.encode(
s[0].decode(), add_special_tokens=False
)
).astype(numpy.int32)
for s in query
]
else:
start_ids = [
numpy.array(
self._tokenizer.encode(s[0].decode(), add_special_tokens=False)
).astype(numpy.int32)
for s in query
]
start_lengths = numpy.array([[len(ids)] for ids in start_ids]).astype(
numpy.int32
)
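            # Right-pad every tokenized sequence with the pad token id so the
            # batch stacks into one rectangular int32 array.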
            max_len = max((seq.shape[0] for seq in start_ids), default=0)
start_ids = numpy.stack(
[
numpy.pad(
seq,
(0, max_len - seq.shape[0]),
"constant",
constant_values=(0, self._tokenizer_pad_id),
)
for seq in start_ids
]
)
start_ids_tensor = pb_utils.Tensor(
"start_ids", numpy.array(start_ids).astype(self.start_ids_dtype)
)
start_lengths_tensor = pb_utils.Tensor(
"start_lengths",
numpy.array(start_lengths).astype(self.start_lengths_dtype),
)
outputs = [start_ids_tensor, start_lengths_tensor]
inference_response = pb_utils.InferenceResponse(output_tensors=outputs)
responses.append(inference_response)
return responses
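As a usage sketch, the preprocessing model above can be exercised with a plain Triton client. The endpoint and model name below are assumptions for illustration, not part of this commit:

```python
# Minimal client sketch, assuming tritonclient[http] is installed and the model
# is served locally under the hypothetical name "preprocessing".
import numpy as np
import tritonclient.http as httpclient

client = httpclient.InferenceServerClient(url="localhost:8000")

# "query" is a TYPE_STRING input with dims [1]; with batching it becomes [batch, 1].
query = np.array([["What is the capital of France?"]], dtype=object)
inp = httpclient.InferInput("query", [1, 1], "BYTES")
inp.set_data_from_numpy(query)

result = client.infer("preprocessing", inputs=[inp])
print(result.as_numpy("start_ids"))      # token ids, right-padded per batch
print(result.as_numpy("start_lengths"))  # unpadded length of each sequence
```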
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
backend: "python"
# TODO: Tune dynamic batcher
max_batch_size: 1
parameters {
key: "tokenizer_dir"
value: {
string_value: "/workspace/examples/llm/tensorrtllm/operators/hf_downloads/llama-3.1-8b-instruct"
}
}
instance_group [
{
count: 10
    kind: KIND_CPU
}
]
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import subprocess
def get_gpu_product_name():
env = os.environ.copy()
env["CUDA_VISIBLE_DEVICES"] = "0"
try:
result = subprocess.run(
[
"nvidia-smi",
"--query-gpu",
"name",
"--format",
"csv",
],
capture_output=True,
text=True,
env=env,
)
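        # stdout is CSV with a header row ("name") followed by one row per GPU;
        # a driver with no attached devices prints "No devices were found".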
result_values = [
x.replace(", ", ",").split(",") for x in result.stdout.split("\n") if x
]
if result_values[0][0] == "No devices were found":
return None
return result_values[1][0].strip().replace(" ", "_")
except FileNotFoundError:
return None
def number_of_gpus():
try:
result = subprocess.run(
["nvidia-smi", "--list-gpus"], capture_output=True, text=True
)
return len(result.stdout.strip().split("\n"))
except FileNotFoundError:
return 0
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
KNOWN_MODELS = {
"mock": {
"hf_id": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"download_patterns": ["*.json"],
"max_num_tokens": 2048,
"max_batch_size": 512,
"templates": [
"preprocessing",
"postprocessing",
"ensemble",
(
"/workspace/examples/llm/tensorrtllm/operators/triton_core_models/mock",
"context",
),
(
"/workspace/examples/llm/tensorrtllm/operators/triton_core_models/mock",
"generate",
),
(
"/workspace/examples/llm/tensorrtllm/operators/triton_core_models/mock",
"tensorrt_llm",
),
],
"template_arguments": {
"tokenizer_dir": "{args.hf_download}",
"triton_max_batch_size": "{args.max_batch_size}",
"preprocessing_instance_count": "{args.preprocessing_instance_count}",
"postprocessing_instance_count": "{args.postprocessing_instance_count}",
"context_token_latency_ms": "0.1",
"generate_token_latency_ms": "0.5",
},
},
"llama-3.1-70b-instruct": {
"hf_id": "meta-llama/Meta-Llama-3.1-70B-Instruct",
"download_model_name": "llama-3.1-70b-instruct",
"convert": [
"quantization/quantize.py",
"--dtype",
"float16",
"--qformat",
"fp8",
"--calib_size",
"512",
"--kv_cache_dtype",
"fp8",
],
"build": [
"--gpt_attention_plugin",
"float16",
"--max_seq_len",
"131072",
"--use_fused_mlp",
"enable",
"--reduce_fusion",
"disable",
"--multiple_profiles",
"enable",
"--use_paged_context_fmha",
"enable",
],
"max_num_tokens": 2048,
"max_batch_size": 512,
"templates": [
"preprocessing",
"postprocessing",
"ensemble",
("tensorrt_llm", "context"),
("tensorrt_llm", "generate"),
"tensorrt_llm",
],
"template_arguments": {
"triton_max_batch_size": "{args.max_batch_size}",
"decoupled_mode": "True",
"preprocessing_instance_count": "{args.preprocessing_instance_count}",
"postprocessing_instance_count": "{args.postprocessing_instance_count}",
"triton_backend": "tensorrtllm",
"enable_chunked_context": "{args.enable_chunked_context}",
"max_beam_width": "1",
"engine_dir": "{args.tensorrtllm_engine}",
"exclude_input_in_output": "True",
"enable_kv_cache_reuse": "False",
"batching_strategy": "inflight_fused_batching",
"max_queue_delay_microseconds": "0",
"max_queue_size": "0",
"participant_ids": "{args.participant_ids}",
"tokenizer_dir": "{args.hf_download}",
"encoder_input_features_data_type": "TYPE_FP16",
},
},
"llama-3.1-8b-instruct": {
"hf_id": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"download_model_name": "llama-3.1-8b-instruct",
"convert": ["llama/convert_checkpoint.py", "--dtype", "float16"],
"build": [
"--remove_input_padding",
"enable",
"--gpt_attention_plugin",
"float16",
"--context_fmha",
"enable",
"--gemm_plugin",
"float16",
"--paged_kv_cache",
"enable",
],
"max_num_tokens": 16384,
"max_batch_size": 64,
"templates": [
"preprocessing",
"postprocessing",
"ensemble",
("tensorrt_llm", "context"),
("tensorrt_llm", "generate"),
"tensorrt_llm",
],
"template_arguments": {
"triton_max_batch_size": "{args.max_batch_size}",
"decoupled_mode": "True",
"preprocessing_instance_count": "{args.preprocessing_instance_count}",
"postprocessing_instance_count": "{args.postprocessing_instance_count}",
"triton_backend": "tensorrtllm",
"max_beam_width": "1",
"engine_dir": "{args.tensorrtllm_engine}",
"exclude_input_in_output": "True",
"enable_kv_cache_reuse": "False",
"batching_strategy": "inflight_fused_batching",
"max_queue_delay_microseconds": "0",
"max_queue_size": "0",
"participant_ids": "0",
"tokenizer_dir": "{args.hf_download}",
"encoder_input_features_data_type": "TYPE_FP16",
},
},
"llama-3-8b-instruct-generate": {
"hf_id": "meta-llama/Meta-Llama-3-8B-Instruct",
"max_batch_size": 256,
"model_repo_name": "llama-3-8b-instruct-disaggregated",
"download_model_name": "llama-3-8b-instruct",
"convert": [
"quantization/quantize.py",
"--dtype",
"float16",
"--qformat",
"fp8",
"--calib_size",
"512",
"--kv_cache_dtype",
"fp8",
],
"build": [
"--gpt_attention_plugin",
"float16",
"--workers",
"{args.tp_size}",
"--max_seq_len",
"1024",
"--use_fused_mlp",
"enable",
"--multiple_profiles",
"enable",
],
"max_num_tokens": 256,
"templates": [
("tensorrt_llm", "generate"),
"postprocessing",
],
"template_arguments": {
"triton_max_batch_size": "{args.max_batch_size}",
"decoupled_mode": "True",
"preprocessing_instance_count": "{args.preprocessing_instance_count}",
"postprocessing_instance_count": "{args.postprocessing_instance_count}",
"triton_backend": "tensorrtllm",
"max_beam_width": "1",
"engine_dir": "{args.tensorrtllm_engine}",
"exclude_input_in_output": "True",
"enable_kv_cache_reuse": "False",
"batching_strategy": "inflight_fused_batching",
"max_queue_delay_microseconds": "0",
"max_queue_size": "0",
"participant_ids": "0",
"tokenizer_dir": "{args.hf_download}",
"encoder_input_features_data_type": "TYPE_FP16",
},
},
"llama-3-8b-instruct-context": {
"hf_id": "meta-llama/Meta-Llama-3-8B-Instruct",
"max_batch_size": 256,
"model_repo_name": "llama-3-8b-instruct-disaggregated",
"download_model_name": "llama-3-8b-instruct",
"convert": [
"quantization/quantize.py",
"--dtype",
"float16",
"--qformat",
"fp8",
"--calib_size",
"512",
"--kv_cache_dtype",
"fp8",
],
"build": [
"--gpt_attention_plugin",
"float16",
"--workers",
"{args.tp_size}",
"--max_seq_len",
"8192",
"--use_fused_mlp",
"enable",
"--multiple_profiles",
"enable",
],
"max_num_tokens": 8192,
"templates": [
"/workspace/examples/disaggregated_serving/tensorrtllm_templates/context",
"preprocessing",
],
"template_arguments": {
"triton_max_batch_size": "{args.max_batch_size}",
"decoupled_mode": "False",
"preprocessing_instance_count": "{args.preprocessing_instance_count}",
"postprocessing_instance_count": "{args.postprocessing_instance_count}",
"triton_backend": "tensorrtllm",
"max_beam_width": "1",
"engine_dir": "{args.tensorrtllm_engine}",
"exclude_input_in_output": "True",
"enable_kv_cache_reuse": "False",
"batching_strategy": "inflight_fused_batching",
"max_queue_delay_microseconds": "0",
"max_queue_size": "0",
"participant_ids": "0",
"tokenizer_dir": "{args.hf_download}",
"encoder_input_features_data_type": "TYPE_FP16",
},
},
"llama-3-8b-instruct": {
"hf_id": "meta-llama/Meta-Llama-3-8B-Instruct",
"convert": [
"quantization/quantize.py",
"--dtype",
"float16",
"--qformat",
"fp8",
"--calib_size",
"512",
"--kv_cache_dtype",
"fp8",
],
"build": [
"--gpt_attention_plugin",
"float16",
"--workers",
"{args.tp_size}",
"--max_seq_len",
"8192",
"--use_fused_mlp",
"enable",
"--multiple_profiles",
"enable",
"--reduce_fusion",
"{args.reduce_fusion}",
],
"max_num_tokens": 16384,
"max_batch_size": 512,
"templates": [
"preprocessing",
"postprocessing",
"ensemble",
("tensorrt_llm", "context"),
("tensorrt_llm", "generate"),
"tensorrt_llm",
],
"template_arguments": {
"triton_max_batch_size": "{args.max_batch_size}",
"decoupled_mode": "True",
"preprocessing_instance_count": "{args.preprocessing_instance_count}",
"postprocessing_instance_count": "{args.postprocessing_instance_count}",
"triton_backend": "tensorrtllm",
"max_beam_width": "1",
"engine_dir": "{args.tensorrtllm_engine}",
"exclude_input_in_output": "True",
"enable_kv_cache_reuse": "False",
"batching_strategy": "inflight_fused_batching",
"max_queue_delay_microseconds": "0",
"max_queue_size": "0",
"participant_ids": "0",
"tokenizer_dir": "{args.hf_download}",
"encoder_input_features_data_type": "TYPE_FP16",
},
},
"llama-3-8b-instruct-default": {
"hf_id": "meta-llama/Meta-Llama-3-8B-Instruct",
"download_model_name": "llama-3-8b-instruct",
"convert": ["llama/convert_checkpoint.py", "--dtype", "float16"],
"build": [
"--remove_input_padding",
"enable",
"--gpt_attention_plugin",
"float16",
"--context_fmha",
"enable",
"--gemm_plugin",
"float16",
"--paged_kv_cache",
"enable",
],
"max_batch_size": 64,
"templates": [
"preprocessing",
"postprocessing",
"ensemble",
("tensorrt_llm", "context"),
("tensorrt_llm", "generate"),
"tensorrt_llm",
],
"template_arguments": {
"triton_max_batch_size": "{args.max_batch_size}",
"decoupled_mode": "True",
"preprocessing_instance_count": "{args.preprocessing_instance_count}",
"postprocessing_instance_count": "{args.postprocessing_instance_count}",
"triton_backend": "tensorrtllm",
"max_beam_width": "1",
"engine_dir": "{args.tensorrtllm_engine}",
"exclude_input_in_output": "True",
"enable_kv_cache_reuse": "False",
"batching_strategy": "inflight_fused_batching",
"max_queue_delay_microseconds": "0",
"max_queue_size": "0",
"participant_ids": "0",
"tokenizer_dir": "{args.hf_download}",
"encoder_input_features_data_type": "TYPE_FP16",
},
},
"llama-3-70b-instruct-context": {
"hf_id": "meta-llama/Meta-Llama-3-70B-Instruct",
"download_model_name": "llama-3-70b-instruct",
"model_repo_name": "llama-3-70b-disaggegated",
"max_batch_size": 128,
"convert": [
"quantization/quantize.py",
"--dtype",
"float16",
"--qformat",
"fp8",
"--calib_size",
"512",
"--kv_cache_dtype",
"fp8",
],
"build": [
"--gpt_attention_plugin",
"float16",
"--workers",
"{args.tp_size}",
"--max_seq_len",
"8192",
"--use_fused_mlp",
"enable",
"--reduce_fusion",
"{args.reduce_fusion}",
"--multiple_profiles",
"enable",
],
"max_num_tokens": 8192,
"templates": [
"preprocessing",
"/workspace/examples/disaggregated_serving/tensorrtllm_templates/context",
],
"template_arguments": {
"triton_max_batch_size": "{args.max_batch_size}",
"decoupled_mode": "True",
"preprocessing_instance_count": "{args.preprocessing_instance_count}",
"postprocessing_instance_count": "{args.postprocessing_instance_count}",
"triton_backend": "tensorrtllm",
"max_beam_width": "1",
"engine_dir": "{args.tensorrtllm_engine}",
"exclude_input_in_output": "True",
"enable_kv_cache_reuse": "False",
"batching_strategy": "inflight_fused_batching",
"max_queue_delay_microseconds": "0",
"max_queue_size": "0",
"participant_ids": "{args.participant_ids}",
"tokenizer_dir": "{args.hf_download}",
"encoder_input_features_data_type": "TYPE_FP16",
},
},
"llama-3-70b-instruct-generate": {
"hf_id": "meta-llama/Meta-Llama-3-70B-Instruct",
"download_model_name": "llama-3-70b-instruct",
"model_repo_name": "llama-3-70b-disaggegated",
"max_batch_size": 128,
"convert": [
"quantization/quantize.py",
"--dtype",
"float16",
"--qformat",
"fp8",
"--calib_size",
"512",
"--kv_cache_dtype",
"fp8",
],
"build": [
"--gpt_attention_plugin",
"float16",
"--workers",
"{args.tp_size}",
"--max_seq_len",
"1024",
"--use_fused_mlp",
"enable",
"--reduce_fusion",
"{args.reduce_fusion}",
"--multiple_profiles",
"enable",
],
"max_num_tokens": 128,
"templates": [
"postprocessing",
"/workspace/examples/disaggregated_serving/tensorrtllm_templates/generate",
],
"template_arguments": {
"triton_max_batch_size": "{args.max_batch_size}",
"decoupled_mode": "True",
"preprocessing_instance_count": "{args.preprocessing_instance_count}",
"postprocessing_instance_count": "{args.postprocessing_instance_count}",
"triton_backend": "tensorrtllm",
"max_beam_width": "1",
"engine_dir": "{args.tensorrtllm_engine}",
"exclude_input_in_output": "True",
"enable_kv_cache_reuse": "False",
"batching_strategy": "inflight_fused_batching",
"max_queue_delay_microseconds": "0",
"max_queue_size": "0",
"participant_ids": "{args.participant_ids}",
"tokenizer_dir": "{args.hf_download}",
"encoder_input_features_data_type": "TYPE_FP16",
},
},
"llama-3-70b-instruct": {
"hf_id": "meta-llama/Meta-Llama-3-70B-Instruct",
"max_batch_size": 512,
"convert": [
"quantization/quantize.py",
"--dtype",
"float16",
"--qformat",
"fp8",
"--calib_size",
"512",
"--kv_cache_dtype",
"fp8",
],
"build": [
"--gpt_attention_plugin",
"float16",
"--workers",
"{args.tp_size}",
"--max_seq_len",
"8192",
"--use_fused_mlp",
"enable",
"--reduce_fusion",
"{args.reduce_fusion}",
"--multiple_profiles",
"enable",
],
"max_num_tokens": 16384,
"templates": [
"preprocessing",
"postprocessing",
"ensemble",
"tensorrt_llm",
],
"template_arguments": {
"triton_max_batch_size": "{args.max_batch_size}",
"decoupled_mode": "True",
"preprocessing_instance_count": "{args.preprocessing_instance_count}",
"postprocessing_instance_count": "{args.postprocessing_instance_count}",
"triton_backend": "tensorrtllm",
"max_beam_width": "1",
"engine_dir": "{args.tensorrtllm_engine}",
"exclude_input_in_output": "True",
"enable_kv_cache_reuse": "False",
"batching_strategy": "inflight_fused_batching",
"max_queue_delay_microseconds": "0",
"max_queue_size": "0",
"participant_ids": "{args.participant_ids}",
"tokenizer_dir": "{args.hf_download}",
"encoder_input_features_data_type": "TYPE_FP16",
},
},
}
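The `{args.*}` placeholders above are ordinary `str.format` attribute references that `_prepare` (in the script below) expands against the parsed argument namespace before substituting them into the `config.pbtxt` templates. A minimal sketch of that expansion, with hypothetical values:

```python
# Minimal sketch of the placeholder expansion performed by _prepare below;
# the namespace values here are hypothetical.
from types import SimpleNamespace

args = SimpleNamespace(
    max_batch_size="64",
    hf_download="/models/hf/llama-3-8b-instruct",  # hypothetical path
)
template_arguments = {
    "triton_max_batch_size": "{args.max_batch_size}",
    "tokenizer_dir": "{args.hf_download}",
}
expanded = {key: value.format(args=args) for key, value in template_arguments.items()}
print(expanded["triton_max_batch_size"])  # -> 64
```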
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
import shutil
import subprocess
from string import Template
from gpu_info import get_gpu_product_name
from huggingface_hub import snapshot_download
from known_models import KNOWN_MODELS
TARGET_DIR = "/workspace/examples/llm/tensorrtllm/operators"
TENSORRTLLM_EXAMPLE_DIR = "/tensorrtllm_backend/tensorrt_llm/examples"
TENSORRTLLM_BACKEND_DIR = "/tensorrtllm_backend"
def _prepare(args):
templates = KNOWN_MODELS[args.model]["templates"]
template_arguments = KNOWN_MODELS[args.model]["template_arguments"]
model_name = (
KNOWN_MODELS[args.model]["model_repo_name"]
if "model_repo_name" in KNOWN_MODELS[args.model]
else None
)
_existing_dir(
args,
"tensorrtllm_model",
args.force_model_repo,
"model repo",
suffix=[args.hw_name, f"TP_{args.tp_size}"],
model_name=model_name,
)
for argument, value in template_arguments.items():
template_arguments[argument] = value.format(args=args)
template_arguments["request_stats_max_iterations"] = 1000
print(template_arguments)
for template in templates:
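        # A template may be a (source_template, target_model_name) tuple, e.g.
        # ("tensorrt_llm", "context"): the source config is written out under the
        # target name, so one template can back several models.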
if isinstance(template, tuple):
template_basename = template[1]
template = template[0]
else:
template_basename = os.path.basename(template)
template_path = os.path.join(
TENSORRTLLM_BACKEND_DIR,
"all_models",
"inflight_batcher_llm",
template,
"config.pbtxt",
)
if template == "ensemble":
target_path = os.path.join(
args.tensorrtllm_model, args.model, "config.pbtxt"
)
else:
target_path = os.path.join(
args.tensorrtllm_model, template_basename, "config.pbtxt"
)
if not args.force_model_repo and os.path.exists(target_path):
continue
print(template_path, os.path.exists(template_path), target_path)
with open(template_path) as f:
pbtxt_template = Template(f.read())
pbtxt = pbtxt_template.safe_substitute(template_arguments)
pbtxt = pbtxt.replace(f'name: "{os.path.basename(template)}"', "")
if not args.dry_run:
os.makedirs(os.path.dirname(target_path), exist_ok=True)
with open(target_path, "w") as f:
f.write(pbtxt)
model_asset_path = os.path.join(os.path.dirname(template_path), "1")
if os.path.exists(model_asset_path):
shutil.copytree(
model_asset_path,
os.path.join(
os.path.dirname(target_path), os.path.basename(model_asset_path)
),
)
def _call(args, command):
print(" ".join(command))
if args.dry_run:
return 0
else:
return subprocess.call(command)
def _existing_dir(args, directory_type, force, command, suffix=[], model_name=None):
model_name = args.model if model_name is None else model_name
target_dir = os.path.join(
args.target_dir, directory_type + "s", model_name, *suffix
)
setattr(args, directory_type, target_dir)
if force:
if not args.dry_run:
shutil.rmtree(target_dir, ignore_errors=True)
if os.path.exists(target_dir):
print(f"Skipping {command} Found {target_dir}")
return True
if not args.dry_run:
os.makedirs(target_dir, exist_ok=True)
return False
def _download(args):
if "hf_id" not in KNOWN_MODELS[args.model]:
print("Skipping Download")
return
if "download_patterns" in KNOWN_MODELS[args.model]:
patterns = KNOWN_MODELS[args.model]["download_patterns"]
else:
patterns = ["*.safetensors", "*.json"]
model_name = (
KNOWN_MODELS[args.model]["download_model_name"]
if "download_model_name" in KNOWN_MODELS[args.model]
else None
)
if _existing_dir(
args, "hf_download", args.force_download, "download", model_name=model_name
):
return
print(f"Downloading {KNOWN_MODELS[args.model]['hf_id']} to {args.hf_download}")
if args.dry_run:
return
snapshot_download(
KNOWN_MODELS[args.model]["hf_id"],
allow_patterns=patterns,
token=True,
local_dir=args.hf_download,
)
def _convert(args):
if "convert" not in KNOWN_MODELS[args.model]:
return
if _existing_dir(
args,
"tensorrtllm_checkpoint",
args.force_convert,
"convert",
suffix=[args.gpu_name, f"TP_{args.tp_size}"],
):
return
convert_command = ["python3"]
convert_command.extend(KNOWN_MODELS[args.model]["convert"])
convert_command[1] = os.path.join(args.tensorrtllm_example_dir, convert_command[1])
convert_command.extend(["--model_dir", "{args.hf_download}"])
convert_command.extend(["--output_dir", "{args.tensorrtllm_checkpoint}"])
convert_command.extend(["--tp_size", "{args.tp_size}"])
convert_command = [x.format(args=args) for x in convert_command]
_call(args, convert_command)
def _build(args):
if "build" not in KNOWN_MODELS[args.model]:
return
if _existing_dir(
args,
"tensorrtllm_engine",
args.force_build,
"build",
suffix=[args.gpu_name, f"TP_{args.tp_size}"],
):
return
build_command = [
"python3",
"-m",
"tensorrt_llm.commands.build",
"--checkpoint_dir",
"{args.tensorrtllm_checkpoint}",
"--output_dir",
"{args.tensorrtllm_engine}",
"--max_batch_size",
args.max_batch_size,
"--max_num_tokens",
args.max_num_tokens,
]
build_command.extend(KNOWN_MODELS[args.model]["build"])
build_command = [x.format(args=args) for x in build_command]
_call(args, build_command)
def _parse_args():
parser = argparse.ArgumentParser(description="Prepare Models")
parser.add_argument(
"--model",
type=str,
choices=list(KNOWN_MODELS.keys()),
default="llama-3.1-8b-instruct",
help="model",
)
parser.add_argument(
"--force-download",
action="store_true",
default=False,
)
parser.add_argument(
"--force-build",
action="store_true",
default=False,
)
parser.add_argument(
"--force-model-repo",
action="store_true",
default=False,
)
parser.add_argument(
"--force-convert",
action="store_true",
default=False,
)
parser.add_argument(
"--target_dir",
default=TARGET_DIR,
)
parser.add_argument(
"--tensorrtllm_example_dir",
default=TENSORRTLLM_EXAMPLE_DIR,
)
parser.add_argument("--reduce_fusion", default=None, choices=["enable", "disable"])
parser.add_argument(
"--enable_chunked_context", default="true", choices=["true", "false"]
)
parser.add_argument("--dry-run", action="store_true", default=False)
parser.add_argument("--tp-size", type=int, default=1)
parser.add_argument("--max-batch-size", type=int, default=None)
parser.add_argument("--max-num-tokens", type=int, default=None)
parser.add_argument("--postprocessing-instance-count", type=int, default=10)
parser.add_argument("--preprocessing-instance-count", type=int, default=1)
args = parser.parse_args()
args.gpu_name = get_gpu_product_name()
args.hw_name = args.gpu_name
if args.hw_name is None:
args.hw_name = "CPU"
max_batch_size = (
str(KNOWN_MODELS[args.model]["max_batch_size"])
if not args.max_batch_size
else str(args.max_batch_size)
)
args.max_batch_size = max_batch_size
max_num_tokens = (
str(KNOWN_MODELS[args.model]["max_num_tokens"])
if not args.max_num_tokens
else str(args.max_num_tokens)
)
args.max_num_tokens = max_num_tokens
args.participant_ids = ",".join([str(index) for index in range(args.tp_size)])
if args.reduce_fusion is None:
args.reduce_fusion = "enable" if args.tp_size > 1 else "disable"
return args
if __name__ == "__main__":
args = _parse_args()
print(args)
_download(args)
_convert(args)
_build(args)
_prepare(args)
print("Your models under GPU type: ", args.gpu_name)
@@ -15,7 +15,7 @@ See the License for the specific language governing permissions and
limitations under the License.
-->
-# Disaggregated Serving
+# Disaggregated Serving with vLLM
This example demonstrates **disaggregated serving** [^1] using Triton Distributed together with vLLM engines. Disaggregated serving decouples the prefill (prompt encoding) and the decode (token generation) stages of large language model (LLM) inference into separate processes. This separation allows you to independently scale, optimize, and distribute resources for each stage.
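As a purely illustrative sketch (the names and interfaces here are hypothetical, not the example's actual API), the split can be pictured as a prefill engine that produces the KV cache and a decode engine that consumes it:

```python
# Conceptual sketch only; these engine interfaces are hypothetical and do not
# reflect the actual Triton Distributed or vLLM APIs.
def disaggregated_generate(prompt, prefill_engine, decode_engine, max_new_tokens):
    # Prefill: encode the full prompt once and hand off the KV cache.
    kv_cache, first_token = prefill_engine.prefill(prompt)
    tokens = [first_token]
    # Decode: generate the remaining tokens against the transferred cache.
    for _ in range(max_new_tokens - 1):
        token = decode_engine.step(kv_cache, tokens[-1])
        if token is None:  # end-of-sequence
            break
        tokens.append(token)
    return tokens
```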
......