Commit 0bfd9a76 authored by Neelay Shah, committed by GitHub

refactor: remove python native runtime

parent 8f741f14
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import numpy as np
import triton_python_backend_utils as pb_utils
from transformers import AutoTokenizer
class TritonPythonModel:
"""
This model allows Triton to act like a api server for T3 ICP
"""
@staticmethod
def auto_complete_config(auto_complete_model_config):
inputs = [
{"name": "tokens_batch", "data_type": "TYPE_INT32", "dims": [-1, -1]},
{"name": "sequence_lengths", "data_type": "TYPE_INT32", "dims": [-1]},
]
outputs = [
{"name": "output", "data_type": "TYPE_STRING", "dims": [-1]},
]
# Store the model configuration as a dictionary.
config = auto_complete_model_config.as_dict()
input_names = []
output_names = []
for input in config["input"]:
input_names.append(input["name"])
for output in config["output"]:
output_names.append(output["name"])
# Add only missing inputs and output to the model configuration.
for input in inputs:
if input["name"] not in input_names:
auto_complete_model_config.add_input(input)
for output in outputs:
if output["name"] not in output_names:
auto_complete_model_config.add_output(output)
return auto_complete_model_config
    def initialize(self, args):
        self.logger = pb_utils.Logger
        # Parse the model configuration.
        model_config = json.loads(args["model_config"])
tokenizer_dir = model_config["parameters"]["tokenizer_dir"]["string_value"]
skip_special_tokens = model_config["parameters"].get("skip_special_tokens")
if skip_special_tokens is not None:
skip_special_tokens_str = skip_special_tokens["string_value"].lower()
if skip_special_tokens_str in [
"true",
"false",
"1",
"0",
"t",
"f",
"y",
"n",
"yes",
"no",
]:
self.skip_special_tokens = skip_special_tokens_str in [
"true",
"1",
"t",
"y",
"yes",
]
            else:
                self.logger.log_warn(
                    f"[TensorRT-LLM][WARNING] 'skip_special_tokens' is not set correctly (value is {skip_special_tokens['string_value']}). Defaulting to True."
                )
                self.skip_special_tokens = True
        else:
            self.logger.log_warn(
                "[TensorRT-LLM][WARNING] 'skip_special_tokens' is not set. Defaulting to True."
            )
            self.skip_special_tokens = True
self.tokenizer = AutoTokenizer.from_pretrained(
tokenizer_dir, legacy=False, padding_side="left", trust_remote_code=True
)
if not self.tokenizer.pad_token:
self.tokenizer.pad_token = self.tokenizer.eos_token
for output_name in ["output"]:
setattr(
self,
output_name.lower() + "_dtype",
pb_utils.triton_string_to_numpy(
pb_utils.get_output_config_by_name(model_config, output_name)[
"data_type"
]
),
)
def execute(self, requests):
tokens_batch = []
sequence_lengths = []
for idx, request in enumerate(requests):
for input_tensor in request.inputs():
if input_tensor.name() == "tokens_batch":
tokens_batch.append(input_tensor.as_numpy())
elif input_tensor.name() == "sequence_lengths":
sequence_lengths.append(input_tensor.as_numpy())
else:
                    raise ValueError(f"unknown input {input_tensor.name()}")
# batch decode
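        # req_idx_offsets records where each request's rows start and end in the
        # flattened token list so the batched decode output can be split back out.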
list_of_tokens = []
req_idx_offset = 0
req_idx_offsets = [req_idx_offset]
for idx, token_batch in enumerate(tokens_batch):
for batch_idx, beam_tokens in enumerate(token_batch):
for beam_idx, tokens in enumerate(beam_tokens):
seq_len = sequence_lengths[idx][batch_idx][beam_idx]
list_of_tokens.append(tokens[:seq_len])
req_idx_offset += 1
req_idx_offsets.append(req_idx_offset)
all_outputs = self.tokenizer.batch_decode(
list_of_tokens, skip_special_tokens=self.skip_special_tokens
)
# construct responses
responses = []
for idx, request in enumerate(requests):
req_outputs = [
x.encode("utf8")
for x in all_outputs[req_idx_offsets[idx] : req_idx_offsets[idx + 1]]
]
output_tensor = pb_utils.Tensor(
"output", np.array(req_outputs).astype(self.output_dtype)
)
outputs = [output_tensor]
inference_response = pb_utils.InferenceResponse(output_tensors=outputs)
responses.append(inference_response)
return responses
def finalize(self):
"""`finalize` is called only once when the model is being unloaded.
Implementing `finalize` function is optional. This function allows
the model to perform any necessary clean ups before exit.
"""
print("Cleaning up...")
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
backend: "python"
# TODO: Tune dynamic batcher
max_batch_size: 64
dynamic_batching {}
parameters {
key: "tokenizer_dir"
value: {
string_value: "/workspace/examples/python/llm/tensorrtllm/operators/hf_downloads/llama-3.1-8b-instruct"
}
}
#parameters {
# key: "skip_special_tokens"
# value: {
# string_value: "${skip_special_tokens}"
# }
#}
instance_group [
{
count: 10
kind: KIND_CPU
}
]
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import numpy
import triton_python_backend_utils as pb_utils
from transformers import AutoTokenizer, T5Tokenizer
class TritonPythonModel:
"""
This model allows Triton to act like a api server for T3 ICP
"""
@staticmethod
def auto_complete_config(auto_complete_model_config):
inputs = [
{"name": "query", "data_type": "TYPE_STRING", "dims": [1]},
]
outputs = [
{"name": "start_ids", "data_type": "TYPE_INT32", "dims": [-1]},
{"name": "start_lengths", "data_type": "TYPE_INT32", "dims": [-1]},
]
# Store the model configuration as a dictionary.
config = auto_complete_model_config.as_dict()
input_names = []
output_names = []
for input in config["input"]:
input_names.append(input["name"])
for output in config["output"]:
output_names.append(output["name"])
# Add only missing inputs and output to the model configuration.
for input in inputs:
if input["name"] not in input_names:
auto_complete_model_config.add_input(input)
for output in outputs:
if output["name"] not in output_names:
auto_complete_model_config.add_output(output)
return auto_complete_model_config
def initialize(self, args):
model_config = json.loads(args["model_config"])
self.logger = pb_utils.Logger
tokenizer_dir = model_config["parameters"]["tokenizer_dir"]["string_value"]
self._tokenizer = AutoTokenizer.from_pretrained(
tokenizer_dir, legacy=False, padding_side="left", trust_remote_code=True
)
if isinstance(self._tokenizer, T5Tokenizer):
self._tokenizer_bos_id = self._tokenizer.sp_model.bos_id()
if not self._tokenizer.pad_token:
self._tokenizer.pad_token = self._tokenizer.eos_token
self._tokenizer_end_id = self._tokenizer.encode(
self._tokenizer.eos_token, add_special_tokens=False
)[0]
self._tokenizer_pad_id = self._tokenizer.encode(
self._tokenizer.pad_token, add_special_tokens=False
)[0]
self._vocab_size = self._tokenizer.vocab_size
for output_name in ["start_ids", "start_lengths"]:
setattr(
self,
output_name.lower() + "_dtype",
pb_utils.triton_string_to_numpy(
pb_utils.get_output_config_by_name(model_config, output_name)[
"data_type"
]
),
)
def execute(self, requests):
responses = []
for request in requests:
query = pb_utils.get_input_tensor_by_name(request, "query").as_numpy()
# Preprocessing input data.
if isinstance(self._tokenizer, T5Tokenizer):
start_ids = [
numpy.array(
[self._tokenizer_bos_id]
+ self._tokenizer.encode(
s[0].decode(), add_special_tokens=False
)
).astype(numpy.int32)
for s in query
]
else:
start_ids = [
numpy.array(
self._tokenizer.encode(s[0].decode(), add_special_tokens=False)
).astype(numpy.int32)
for s in query
]
start_lengths = numpy.array([[len(ids)] for ids in start_ids]).astype(
numpy.int32
)
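            # Right-pad each sequence to the batch max length with the pad id
            # so the ragged token lists can be stacked into one int32 matrix.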
max_len = 0
for seq in start_ids:
max_len = max(max_len, seq.shape[0])
start_ids = numpy.stack(
[
numpy.pad(
seq,
(0, max_len - seq.shape[0]),
"constant",
constant_values=(0, self._tokenizer_pad_id),
)
for seq in start_ids
]
)
start_ids_tensor = pb_utils.Tensor(
"start_ids", numpy.array(start_ids).astype(self.start_ids_dtype)
)
start_lengths_tensor = pb_utils.Tensor(
"start_lengths",
numpy.array(start_lengths).astype(self.start_lengths_dtype),
)
outputs = [start_ids_tensor, start_lengths_tensor]
inference_response = pb_utils.InferenceResponse(output_tensors=outputs)
responses.append(inference_response)
return responses
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
backend: "python"
# TODO: Tune dynamic batcher
max_batch_size: 1
parameters {
key: "tokenizer_dir"
value: {
string_value: "/workspace/examples/python/llm/tensorrtllm/operators/hf_downloads/llama-3.1-8b-instruct"
}
}
instance_group [
{
count: 10
kind: KIND_CPU
}
]
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import subprocess
def get_gpu_product_name():
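    # Query nvidia-smi for the name of GPU 0; returns None when nvidia-smi is
    # unavailable or reports no devices. Spaces are replaced with underscores.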
env = os.environ.copy()
env["CUDA_VISIBLE_DEVICES"] = "0"
try:
result = subprocess.run(
[
"nvidia-smi",
"--query-gpu",
"name",
"--format",
"csv",
],
capture_output=True,
text=True,
env=env,
)
result_values = [
x.replace(", ", ",").split(",") for x in result.stdout.split("\n") if x
]
if result_values[0][0] == "No devices were found":
return None
return result_values[1][0].strip().replace(" ", "_")
except FileNotFoundError:
return None
def number_of_gpus():
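    # Count the lines printed by `nvidia-smi --list-gpus`; returns 0 when
    # nvidia-smi is not installed.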
try:
result = subprocess.run(
["nvidia-smi", "--list-gpus"], capture_output=True, text=True
)
        gpu_lines = [line for line in result.stdout.strip().split("\n") if line]
        return len(gpu_lines)
except FileNotFoundError:
return 0
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
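# Registry of supported models: each entry describes how to download the model
# (hf_id, download_patterns), how to convert and build a TensorRT-LLM engine,
# and which config.pbtxt templates to render with which arguments.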
KNOWN_MODELS = {
"mock": {
"hf_id": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"download_patterns": ["*.json"],
"max_num_tokens": 2048,
"max_batch_size": 512,
"templates": [
"preprocessing",
"postprocessing",
"ensemble",
(
"/workspace/examples/python/llm/tensorrtllm/operators/triton_core_models/mock",
"context",
),
(
"/workspace/examples/python/llm/tensorrtllm/operators/triton_core_models/mock",
"generate",
),
(
"/workspace/examples/python/llm/tensorrtllm/operators/triton_core_models/mock",
"tensorrt_llm",
),
],
"template_arguments": {
"tokenizer_dir": "{args.hf_download}",
"triton_max_batch_size": "{args.max_batch_size}",
"preprocessing_instance_count": "{args.preprocessing_instance_count}",
"postprocessing_instance_count": "{args.postprocessing_instance_count}",
"context_token_latency_ms": "0.1",
"generate_token_latency_ms": "0.5",
},
},
"llama-3.1-70b-instruct": {
"hf_id": "meta-llama/Meta-Llama-3.1-70B-Instruct",
"download_model_name": "llama-3.1-70b-instruct",
"convert": [
"quantization/quantize.py",
"--dtype",
"float16",
"--qformat",
"fp8",
"--calib_size",
"512",
"--kv_cache_dtype",
"fp8",
],
"build": [
"--gpt_attention_plugin",
"float16",
"--max_seq_len",
"131072",
"--use_fused_mlp",
"enable",
"--reduce_fusion",
"disable",
"--multiple_profiles",
"enable",
"--use_paged_context_fmha",
"enable",
],
"max_num_tokens": 2048,
"max_batch_size": 512,
"templates": [
"preprocessing",
"postprocessing",
"ensemble",
("tensorrt_llm", "context"),
("tensorrt_llm", "generate"),
"tensorrt_llm",
],
"template_arguments": {
"triton_max_batch_size": "{args.max_batch_size}",
"decoupled_mode": "True",
"preprocessing_instance_count": "{args.preprocessing_instance_count}",
"postprocessing_instance_count": "{args.postprocessing_instance_count}",
"triton_backend": "tensorrtllm",
"enable_chunked_context": "{args.enable_chunked_context}",
"max_beam_width": "1",
"engine_dir": "{args.tensorrtllm_engine}",
"exclude_input_in_output": "True",
"enable_kv_cache_reuse": "True",
"batching_strategy": "inflight_fused_batching",
"max_queue_delay_microseconds": "0",
"max_queue_size": "0",
"participant_ids": "{args.participant_ids}",
"tokenizer_dir": "{args.hf_download}",
"encoder_input_features_data_type": "TYPE_FP16",
},
},
"llama-3.1-8b-instruct": {
"hf_id": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"download_model_name": "llama-3.1-8b-instruct",
"convert": ["llama/convert_checkpoint.py", "--dtype", "float16"],
"build": [
"--remove_input_padding",
"enable",
"--gpt_attention_plugin",
"float16",
"--context_fmha",
"enable",
"--gemm_plugin",
"float16",
"--paged_kv_cache",
"enable",
"--use_paged_context_fmha",
"enable",
],
"max_num_tokens": 16384,
"max_batch_size": 64,
"templates": [
"preprocessing",
"postprocessing",
"ensemble",
("tensorrt_llm", "context"),
("tensorrt_llm", "generate"),
"tensorrt_llm",
],
"template_arguments": {
"triton_max_batch_size": "{args.max_batch_size}",
"decoupled_mode": "True",
"preprocessing_instance_count": "{args.preprocessing_instance_count}",
"postprocessing_instance_count": "{args.postprocessing_instance_count}",
"triton_backend": "tensorrtllm",
"max_beam_width": "1",
"engine_dir": "{args.tensorrtllm_engine}",
"exclude_input_in_output": "True",
"enable_kv_cache_reuse": "True",
"batching_strategy": "inflight_fused_batching",
"max_queue_delay_microseconds": "0",
"max_queue_size": "0",
"participant_ids": "0",
"tokenizer_dir": "{args.hf_download}",
"encoder_input_features_data_type": "TYPE_FP16",
},
},
"llama-3-8b-instruct-generate": {
"hf_id": "meta-llama/Meta-Llama-3-8B-Instruct",
"max_batch_size": 256,
"model_repo_name": "llama-3-8b-instruct-disaggregated",
"download_model_name": "llama-3-8b-instruct",
"convert": [
"quantization/quantize.py",
"--dtype",
"float16",
"--qformat",
"fp8",
"--calib_size",
"512",
"--kv_cache_dtype",
"fp8",
],
"build": [
"--gpt_attention_plugin",
"float16",
"--workers",
"{args.tp_size}",
"--max_seq_len",
"1024",
"--use_fused_mlp",
"enable",
"--multiple_profiles",
"enable",
"--use_paged_context_fmha",
"enable",
],
"max_num_tokens": 256,
"templates": [
("tensorrt_llm", "generate"),
"postprocessing",
],
"template_arguments": {
"triton_max_batch_size": "{args.max_batch_size}",
"decoupled_mode": "True",
"preprocessing_instance_count": "{args.preprocessing_instance_count}",
"postprocessing_instance_count": "{args.postprocessing_instance_count}",
"triton_backend": "tensorrtllm",
"max_beam_width": "1",
"engine_dir": "{args.tensorrtllm_engine}",
"exclude_input_in_output": "True",
"enable_kv_cache_reuse": "True",
"batching_strategy": "inflight_fused_batching",
"max_queue_delay_microseconds": "0",
"max_queue_size": "0",
"participant_ids": "0",
"tokenizer_dir": "{args.hf_download}",
"encoder_input_features_data_type": "TYPE_FP16",
},
},
"llama-3-8b-instruct-context": {
"hf_id": "meta-llama/Meta-Llama-3-8B-Instruct",
"max_batch_size": 256,
"model_repo_name": "llama-3-8b-instruct-disaggregated",
"download_model_name": "llama-3-8b-instruct",
"convert": [
"quantization/quantize.py",
"--dtype",
"float16",
"--qformat",
"fp8",
"--calib_size",
"512",
"--kv_cache_dtype",
"fp8",
],
"build": [
"--gpt_attention_plugin",
"float16",
"--workers",
"{args.tp_size}",
"--max_seq_len",
"8192",
"--use_fused_mlp",
"enable",
"--multiple_profiles",
"enable",
"--use_paged_context_fmha",
"enable",
],
"max_num_tokens": 8192,
"templates": [
"/workspace/examples/disaggregated_serving/tensorrtllm_templates/context",
"preprocessing",
],
"template_arguments": {
"triton_max_batch_size": "{args.max_batch_size}",
"decoupled_mode": "False",
"preprocessing_instance_count": "{args.preprocessing_instance_count}",
"postprocessing_instance_count": "{args.postprocessing_instance_count}",
"triton_backend": "tensorrtllm",
"max_beam_width": "1",
"engine_dir": "{args.tensorrtllm_engine}",
"exclude_input_in_output": "True",
"enable_kv_cache_reuse": "True",
"batching_strategy": "inflight_fused_batching",
"max_queue_delay_microseconds": "0",
"max_queue_size": "0",
"participant_ids": "0",
"tokenizer_dir": "{args.hf_download}",
"encoder_input_features_data_type": "TYPE_FP16",
},
},
"llama-3-8b-instruct": {
"hf_id": "meta-llama/Meta-Llama-3-8B-Instruct",
"convert": [
"quantization/quantize.py",
"--dtype",
"float16",
"--qformat",
"fp8",
"--calib_size",
"512",
"--kv_cache_dtype",
"fp8",
],
"build": [
"--gpt_attention_plugin",
"float16",
"--workers",
"{args.tp_size}",
"--max_seq_len",
"8192",
"--use_fused_mlp",
"enable",
"--multiple_profiles",
"enable",
"--reduce_fusion",
"{args.reduce_fusion}",
"--use_paged_context_fmha",
"enable",
],
"max_num_tokens": 16384,
"max_batch_size": 512,
"templates": [
"preprocessing",
"postprocessing",
"ensemble",
("tensorrt_llm", "context"),
("tensorrt_llm", "generate"),
"tensorrt_llm",
],
"template_arguments": {
"triton_max_batch_size": "{args.max_batch_size}",
"decoupled_mode": "True",
"preprocessing_instance_count": "{args.preprocessing_instance_count}",
"postprocessing_instance_count": "{args.postprocessing_instance_count}",
"triton_backend": "tensorrtllm",
"max_beam_width": "1",
"engine_dir": "{args.tensorrtllm_engine}",
"exclude_input_in_output": "True",
"enable_kv_cache_reuse": "True",
"batching_strategy": "inflight_fused_batching",
"max_queue_delay_microseconds": "0",
"max_queue_size": "0",
"participant_ids": "0",
"tokenizer_dir": "{args.hf_download}",
"encoder_input_features_data_type": "TYPE_FP16",
},
},
"llama-3-8b-instruct-default": {
"hf_id": "meta-llama/Meta-Llama-3-8B-Instruct",
"download_model_name": "llama-3-8b-instruct",
"convert": ["llama/convert_checkpoint.py", "--dtype", "float16"],
"build": [
"--remove_input_padding",
"enable",
"--gpt_attention_plugin",
"float16",
"--context_fmha",
"enable",
"--gemm_plugin",
"float16",
"--paged_kv_cache",
"enable",
"--use_paged_context_fmha",
"enable",
],
"max_batch_size": 64,
"templates": [
"preprocessing",
"postprocessing",
"ensemble",
("tensorrt_llm", "context"),
("tensorrt_llm", "generate"),
"tensorrt_llm",
],
"template_arguments": {
"triton_max_batch_size": "{args.max_batch_size}",
"decoupled_mode": "True",
"preprocessing_instance_count": "{args.preprocessing_instance_count}",
"postprocessing_instance_count": "{args.postprocessing_instance_count}",
"triton_backend": "tensorrtllm",
"max_beam_width": "1",
"engine_dir": "{args.tensorrtllm_engine}",
"exclude_input_in_output": "True",
"enable_kv_cache_reuse": "True",
"batching_strategy": "inflight_fused_batching",
"max_queue_delay_microseconds": "0",
"max_queue_size": "0",
"participant_ids": "0",
"tokenizer_dir": "{args.hf_download}",
"encoder_input_features_data_type": "TYPE_FP16",
},
},
"llama-3-70b-instruct-context": {
"hf_id": "meta-llama/Meta-Llama-3-70B-Instruct",
"download_model_name": "llama-3-70b-instruct",
"model_repo_name": "llama-3-70b-disaggegated",
"max_batch_size": 128,
"convert": [
"quantization/quantize.py",
"--dtype",
"float16",
"--qformat",
"fp8",
"--calib_size",
"512",
"--kv_cache_dtype",
"fp8",
],
"build": [
"--gpt_attention_plugin",
"float16",
"--workers",
"{args.tp_size}",
"--max_seq_len",
"8192",
"--use_fused_mlp",
"enable",
"--reduce_fusion",
"{args.reduce_fusion}",
"--multiple_profiles",
"enable",
"--use_paged_context_fmha",
"enable",
],
"max_num_tokens": 8192,
"templates": [
"preprocessing",
"/workspace/examples/disaggregated_serving/tensorrtllm_templates/context",
],
"template_arguments": {
"triton_max_batch_size": "{args.max_batch_size}",
"decoupled_mode": "True",
"preprocessing_instance_count": "{args.preprocessing_instance_count}",
"postprocessing_instance_count": "{args.postprocessing_instance_count}",
"triton_backend": "tensorrtllm",
"max_beam_width": "1",
"engine_dir": "{args.tensorrtllm_engine}",
"exclude_input_in_output": "True",
"enable_kv_cache_reuse": "True",
"batching_strategy": "inflight_fused_batching",
"max_queue_delay_microseconds": "0",
"max_queue_size": "0",
"participant_ids": "{args.participant_ids}",
"tokenizer_dir": "{args.hf_download}",
"encoder_input_features_data_type": "TYPE_FP16",
},
},
"llama-3-70b-instruct-generate": {
"hf_id": "meta-llama/Meta-Llama-3-70B-Instruct",
"download_model_name": "llama-3-70b-instruct",
"model_repo_name": "llama-3-70b-disaggegated",
"max_batch_size": 128,
"convert": [
"quantization/quantize.py",
"--dtype",
"float16",
"--qformat",
"fp8",
"--calib_size",
"512",
"--kv_cache_dtype",
"fp8",
],
"build": [
"--gpt_attention_plugin",
"float16",
"--workers",
"{args.tp_size}",
"--max_seq_len",
"1024",
"--use_fused_mlp",
"enable",
"--reduce_fusion",
"{args.reduce_fusion}",
"--multiple_profiles",
"enable",
"--use_paged_context_fmha",
"enable",
],
"max_num_tokens": 128,
"templates": [
"postprocessing",
"/workspace/examples/disaggregated_serving/tensorrtllm_templates/generate",
],
"template_arguments": {
"triton_max_batch_size": "{args.max_batch_size}",
"decoupled_mode": "True",
"preprocessing_instance_count": "{args.preprocessing_instance_count}",
"postprocessing_instance_count": "{args.postprocessing_instance_count}",
"triton_backend": "tensorrtllm",
"max_beam_width": "1",
"engine_dir": "{args.tensorrtllm_engine}",
"exclude_input_in_output": "True",
"enable_kv_cache_reuse": "True",
"batching_strategy": "inflight_fused_batching",
"max_queue_delay_microseconds": "0",
"max_queue_size": "0",
"participant_ids": "{args.participant_ids}",
"tokenizer_dir": "{args.hf_download}",
"encoder_input_features_data_type": "TYPE_FP16",
},
},
"llama-3-70b-instruct": {
"hf_id": "meta-llama/Meta-Llama-3-70B-Instruct",
"max_batch_size": 512,
"convert": [
"quantization/quantize.py",
"--dtype",
"float16",
"--qformat",
"fp8",
"--calib_size",
"512",
"--kv_cache_dtype",
"fp8",
],
"build": [
"--gpt_attention_plugin",
"float16",
"--workers",
"{args.tp_size}",
"--max_seq_len",
"8192",
"--use_fused_mlp",
"enable",
"--reduce_fusion",
"{args.reduce_fusion}",
"--multiple_profiles",
"enable",
"--use_paged_context_fmha",
"enable",
],
"max_num_tokens": 16384,
"templates": [
"preprocessing",
"postprocessing",
"ensemble",
"tensorrt_llm",
],
"template_arguments": {
"triton_max_batch_size": "{args.max_batch_size}",
"decoupled_mode": "True",
"preprocessing_instance_count": "{args.preprocessing_instance_count}",
"postprocessing_instance_count": "{args.postprocessing_instance_count}",
"triton_backend": "tensorrtllm",
"max_beam_width": "1",
"engine_dir": "{args.tensorrtllm_engine}",
"exclude_input_in_output": "True",
"enable_kv_cache_reuse": "True",
"batching_strategy": "inflight_fused_batching",
"max_queue_delay_microseconds": "0",
"max_queue_size": "0",
"participant_ids": "{args.participant_ids}",
"tokenizer_dir": "{args.hf_download}",
"encoder_input_features_data_type": "TYPE_FP16",
},
},
}
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
import shutil
import subprocess
from string import Template
from gpu_info import get_gpu_product_name
from huggingface_hub import snapshot_download
from known_models import KNOWN_MODELS
TARGET_DIR = "/workspace/examples/python/llm/tensorrtllm/operators"
TENSORRTLLM_EXAMPLE_DIR = "/tensorrtllm_backend/tensorrt_llm/examples"
TENSORRTLLM_BACKEND_DIR = "/tensorrtllm_backend"
def _prepare(args):
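    # Render the config.pbtxt templates shipped with the TensorRT-LLM backend
    # and copy them (plus any versioned model assets) into the model repository.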
templates = KNOWN_MODELS[args.model]["templates"]
template_arguments = KNOWN_MODELS[args.model]["template_arguments"]
model_name = (
KNOWN_MODELS[args.model]["model_repo_name"]
if "model_repo_name" in KNOWN_MODELS[args.model]
else None
)
_existing_dir(
args,
"tensorrtllm_model",
args.force_model_repo,
"model repo",
suffix=[args.hw_name, f"TP_{args.tp_size}"],
model_name=model_name,
)
for argument, value in template_arguments.items():
template_arguments[argument] = value.format(args=args)
template_arguments["request_stats_max_iterations"] = 1000
print(template_arguments)
for template in templates:
if isinstance(template, tuple):
template_basename = template[1]
template = template[0]
else:
template_basename = os.path.basename(template)
template_path = os.path.join(
TENSORRTLLM_BACKEND_DIR,
"all_models",
"inflight_batcher_llm",
template,
"config.pbtxt",
)
if template == "ensemble":
target_path = os.path.join(
args.tensorrtllm_model, args.model, "config.pbtxt"
)
else:
target_path = os.path.join(
args.tensorrtllm_model, template_basename, "config.pbtxt"
)
if not args.force_model_repo and os.path.exists(target_path):
continue
print(template_path, os.path.exists(template_path), target_path)
with open(template_path) as f:
pbtxt_template = Template(f.read())
pbtxt = pbtxt_template.safe_substitute(template_arguments)
pbtxt = pbtxt.replace(f'name: "{os.path.basename(template)}"', "")
if not args.dry_run:
os.makedirs(os.path.dirname(target_path), exist_ok=True)
with open(target_path, "w") as f:
f.write(pbtxt)
model_asset_path = os.path.join(os.path.dirname(template_path), "1")
if os.path.exists(model_asset_path):
shutil.copytree(
model_asset_path,
os.path.join(
os.path.dirname(target_path), os.path.basename(model_asset_path)
),
)
def _call(args, command):
print(" ".join(command))
if args.dry_run:
return 0
else:
return subprocess.call(command)
def _existing_dir(args, directory_type, force, command, suffix=(), model_name=None):
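    # Resolve the target directory for this artifact type, store it on args,
    # and return True (caller skips the step) when the directory already
    # exists and force is not set.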
model_name = args.model if model_name is None else model_name
target_dir = os.path.join(
args.target_dir, directory_type + "s", model_name, *suffix
)
setattr(args, directory_type, target_dir)
if force:
if not args.dry_run:
shutil.rmtree(target_dir, ignore_errors=True)
if os.path.exists(target_dir):
print(f"Skipping {command} Found {target_dir}")
return True
if not args.dry_run:
os.makedirs(target_dir, exist_ok=True)
return False
def _download(args):
if "hf_id" not in KNOWN_MODELS[args.model]:
print("Skipping Download")
return
if "download_patterns" in KNOWN_MODELS[args.model]:
patterns = KNOWN_MODELS[args.model]["download_patterns"]
else:
patterns = ["*.safetensors", "*.json"]
model_name = (
KNOWN_MODELS[args.model]["download_model_name"]
if "download_model_name" in KNOWN_MODELS[args.model]
else None
)
if _existing_dir(
args, "hf_download", args.force_download, "download", model_name=model_name
):
return
print(f"Downloading {KNOWN_MODELS[args.model]['hf_id']} to {args.hf_download}")
if args.dry_run:
return
snapshot_download(
KNOWN_MODELS[args.model]["hf_id"],
allow_patterns=patterns,
token=True,
local_dir=args.hf_download,
)
def _convert(args):
if "convert" not in KNOWN_MODELS[args.model]:
return
if _existing_dir(
args,
"tensorrtllm_checkpoint",
args.force_convert,
"convert",
suffix=[args.gpu_name, f"TP_{args.tp_size}"],
):
return
convert_command = ["python3"]
convert_command.extend(KNOWN_MODELS[args.model]["convert"])
convert_command[1] = os.path.join(args.tensorrtllm_example_dir, convert_command[1])
convert_command.extend(["--model_dir", "{args.hf_download}"])
convert_command.extend(["--output_dir", "{args.tensorrtllm_checkpoint}"])
convert_command.extend(["--tp_size", "{args.tp_size}"])
convert_command = [x.format(args=args) for x in convert_command]
_call(args, convert_command)
def _build(args):
if "build" not in KNOWN_MODELS[args.model]:
return
if _existing_dir(
args,
"tensorrtllm_engine",
args.force_build,
"build",
suffix=[args.gpu_name, f"TP_{args.tp_size}"],
):
return
build_command = [
"python3",
"-m",
"tensorrt_llm.commands.build",
"--checkpoint_dir",
"{args.tensorrtllm_checkpoint}",
"--output_dir",
"{args.tensorrtllm_engine}",
"--max_batch_size",
args.max_batch_size,
"--max_num_tokens",
args.max_num_tokens,
]
build_command.extend(KNOWN_MODELS[args.model]["build"])
build_command = [x.format(args=args) for x in build_command]
_call(args, build_command)
def _parse_args():
parser = argparse.ArgumentParser(description="Prepare Models")
parser.add_argument(
"--model",
type=str,
choices=list(KNOWN_MODELS.keys()),
default="llama-3.1-8b-instruct",
help="model",
)
parser.add_argument(
"--force-download",
action="store_true",
default=False,
)
parser.add_argument(
"--force-build",
action="store_true",
default=False,
)
parser.add_argument(
"--force-model-repo",
action="store_true",
default=False,
)
parser.add_argument(
"--force-convert",
action="store_true",
default=False,
)
parser.add_argument(
"--target_dir",
default=TARGET_DIR,
)
parser.add_argument(
"--tensorrtllm_example_dir",
default=TENSORRTLLM_EXAMPLE_DIR,
)
parser.add_argument("--reduce_fusion", default=None, choices=["enable", "disable"])
parser.add_argument(
"--enable_chunked_context", default="true", choices=["true", "false"]
)
parser.add_argument("--dry-run", action="store_true", default=False)
parser.add_argument("--tp-size", type=int, default=1)
parser.add_argument("--max-batch-size", type=int, default=None)
parser.add_argument("--max-num-tokens", type=int, default=None)
parser.add_argument("--postprocessing-instance-count", type=int, default=10)
parser.add_argument("--preprocessing-instance-count", type=int, default=1)
args = parser.parse_args()
args.gpu_name = get_gpu_product_name()
args.hw_name = args.gpu_name
if args.hw_name is None:
args.hw_name = "CPU"
max_batch_size = (
str(KNOWN_MODELS[args.model]["max_batch_size"])
if not args.max_batch_size
else str(args.max_batch_size)
)
args.max_batch_size = max_batch_size
max_num_tokens = (
str(KNOWN_MODELS[args.model]["max_num_tokens"])
if not args.max_num_tokens
else str(args.max_num_tokens)
)
args.max_num_tokens = max_num_tokens
args.participant_ids = ",".join([str(index) for index in range(args.tp_size)])
if args.reduce_fusion is None:
args.reduce_fusion = "enable" if args.tp_size > 1 else "disable"
# args.participant_ids = ""
return args
if __name__ == "__main__":
args = _parse_args()
print(args)
_download(args)
_convert(args)
_build(args)
_prepare(args)
print("Your models under GPU type: ", args.gpu_name)
<!--
SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
SPDX-License-Identifier: Apache-2.0
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
# Disaggregated Serving with vLLM
> **Warning**
> This example is currently not tested and might not work as expected. For working disaggregated serving examples, please see the [vLLM example](/examples/python_rs/llm/vllm/).
This example demonstrates **disaggregated serving** [^1] using Triton Distributed together with vLLM engines. Disaggregated serving decouples the prefill (prompt encoding) and the decode (token generation) stages of large language model (LLM) inference into separate processes. This separation allows you to independently scale, optimize, and distribute resources for each stage.
In this example, you will deploy:
- An **OpenAI-compatible API server** (which receives requests and streams responses).
- One or more **prefill workers** (for encoding the prompt).
- One or more **decode workers** (for generating tokens based on the encoded prompt).
![Overview of disaggregated serving deployment architecture](assets/vllm_disagg_architecture_overview.jpg)
For more details on the basics of Triton Distributed, please see the [Hello World example](../../hello_world/).
---
## 1. Prerequisites
1. **GPU Availability**
This setup requires at least two GPUs:
- One GPU is typically used by the **prefill** process.
- Another GPU is used by the **decode** process.
In production systems with heavier loads, you will typically allocate more GPUs across multiple prefill and decode workers.
2. **NATS or Another Coordination Service**
Triton Distributed uses NATS by default for coordination and message passing. Make sure your environment has a running NATS service accessible via a valid `nats://<address>:<port>` endpoint. By default, the examples assume `nats://localhost:4223` (a minimal startup sketch follows this list).
3. **vLLM Patch**
This example requires some features that are not yet in the main vLLM release. A patch is automatically applied inside the provided container. Details of the patch can be found [here](../../../container/deps/vllm/). The current patch is compatible with **vLLM 0.6.3post1**.
4. **Supported GPUs**
- For FP8 usage, GPUs with **Compute Capability >= 8.9** are required.
- If you have older GPUs, consider BF16/FP16 precision variants instead of `FP8`. (See [below](#model-precision-variants).)
5. **HuggingFace**
- You need a HuggingFace account to download the model, and you must set the `HF_TOKEN` environment variable.
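If you need a standalone NATS service for local testing, the following is a minimal sketch; the port and JetStream store directory match the defaults used by the deployment scripts later in this example and are assumptions you can change:
```bash
# Start a JetStream-enabled NATS server for local coordination.
# Port 4223 matches the default request-plane URI in this example; the
# store directory is a throwaway temp dir (an assumption for local runs).
NATS_STORE="$(mktemp -d)"
nats-server -p 4223 --jetstream --store_dir "${NATS_STORE}" &
```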
---
## 2. Building the Environment
The example is designed to run in a containerized environment using Triton Distributed, vLLM, and associated dependencies. To build the container:
```bash
./container/build.sh --framework vllm
```
This command pulls necessary dependencies and patches vLLM in the container image.
---
## 3. Starting the Deployment
Below is a minimal example of how to start each component of a disaggregated serving setup. The typical sequence is:
1. **Start the Context Worker(s) and Request Plane**
2. **Start the Generate Worker(s)**
3. **Start the API Server** (handles incoming requests and coordinates workers)
All components must be able to connect to the same request plane to coordinate.
### 3.1 HuggingFace Token
```bash
export HF_TOKEN=<YOUR TOKEN>
```
### 3.2 Launch Interactive Environment
```bash
./container/run.sh --framework vllm -it
```
Note: all subsequent commands are run in the same container for simplicity.
Note: by default this command makes all GPU devices visible. Use the `--gpus` flag to selectively expose GPU devices.
### 3.3 Launch Context Worker and Request Plane
The context stage encodes incoming prompts. By default, vLLM uses GPU resources to tokenize and prepare the model’s key-value (KV) caches.
Within the container start the context worker and the request plane:
```bash
CUDA_VISIBLE_DEVICES=0 \
VLLM_WORKER_ID=0 \
python3 -m llm.vllm.deploy \
--context-worker-count 1 \
--request-plane-uri ${HOSTNAME}:4223 \
--model-name neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8 \
--kv-cache-dtype fp8 \
--dtype auto \
--worker-name llama \
--disable-async-output-proc \
--disable-log-stats \
--max-model-len 3500 \
--max-batch-size 10000 \
--gpu-memory-utilization 0.9 \
--context-tp-size 1 \
--generate-tp-size 1 \
--initialize-request-plane &
```
**Key flags**:
- `--context-worker-count`: Launches only context (prefill) workers.
- `--kv-cache-dtype fp8`: Using FP8 for caching (requires CC >= 8.9).
- `CUDA_VISIBLE_DEVICES=0`: Binds worker to GPU `0`.
#### Expected Output
```
<SNIP>
Workers started ... press Ctrl-C to Exit
[168] 2025/01/24 09:17:38.879908 [INF] Starting nats-server
[168] 2025/01/24 09:17:38.879982 [INF] Version: 2.10.24
[168] 2025/01/24 09:17:38.879987 [INF] Git: [1d6f7ea]
[168] 2025/01/24 09:17:38.879989 [INF] Name: NDBCCXARM6D2BMMRJOKZCJD4TGVXXPCJKQRXALJOPHLA5W7ISCW4VHU5
[168] 2025/01/24 09:17:38.879992 [INF] Node: S4g51H7K
[168] 2025/01/24 09:17:38.879995 [INF] ID: NDBCCXARM6D2BMMRJOKZCJD4TGVXXPCJKQRXALJOPHLA5W7ISCW4VHU5
[168] 2025/01/24 09:17:38.880339 [INF] Starting JetStream
<SNIP>
INFO 01-24 09:17:49 parallel_state.py:942] Stage: PREFILL
```
### 3.4 Launch Generate (Decode) Worker
The generate stage consumes the KV cache produced in the context step and generates output tokens.
Within the container start the generate worker:
```bash
CUDA_VISIBLE_DEVICES=1 \
VLLM_WORKER_ID=1 \
python3 -m llm.vllm.deploy \
--generate-worker-count 1 \
--request-plane-uri ${HOSTNAME}:4223 \
--model-name neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8 \
--kv-cache-dtype fp8 \
--dtype auto \
--worker-name llama \
--disable-async-output-proc \
--disable-log-stats \
--max-model-len 3500 \
--max-batch-size 10000 \
--gpu-memory-utilization 0.9 \
--context-tp-size 1 \
--generate-tp-size 1 &
```
> [!NOTE]
> - The first run in a newly launched container will
> include the model download. Please wait until you see the
> llama handler started before sending requests.
**Key flags**:
- `--generate-worker-count`: Launches decode worker(s).
- `CUDA_VISIBLE_DEVICES=1`: Binds worker to GPU `1`.
#### Expected Output
```
<SNIP>
model-00002-of-00002.safetensors: 100% 4.08G/4.08G [01:36<00:00, 42.2MB/s]
model-00001-of-00002.safetensors: 100% 4.71G/5.00G [01:51<00:06, 41.9MB/s]
<SNIP>
INFO 01-24 09:21:22 model_runner.py:1406] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 01-24 09:21:22 model_runner.py:1410] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
<SNIP>
09:22:10 worker.py:266[Triton Worker] INFO: Worker started...
09:22:10 worker.py:241[Triton Worker] INFO: Starting generate handler...
09:22:10 worker.py:266[Triton Worker] INFO: Worker started...
09:22:10 worker.py:241[Triton Worker] INFO: Starting llama handler...
```
> [!NOTE]
> - You can run multiple prefill and decode workers for higher throughput.
> - For large models, ensure you have enough GPU memory (or GPUs).
### 3.5 API Server
The API server in a vLLM-disaggregated setup listens for OpenAI-compatible requests on a chosen port (default 8005). Below is an example command:
```bash
python3 -m llm.api_server \
--tokenizer neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8 \
--request-plane-uri ${HOSTNAME}:4223 \
--api-server-host ${HOSTNAME} \
--model-name llama \
--api-server-port 8005 &
```
#### Expected Output
```
[WARNING] Adding CORS for the following origins: ['http://localhost']
INFO: Started server process [498]
INFO: Waiting for application startup.
TRACE: ASGI [1] Started scope={'type': 'lifespan', 'asgi': {'version': '3.0', 'spec_version': '2.0'}, 'state': {}}
TRACE: ASGI [1] Receive {'type': 'lifespan.startup'}
TRACE: ASGI [1] Send {'type': 'lifespan.startup.complete'}
INFO: Application startup complete.
INFO: Uvicorn running on http://2u2g-gen-0349:8005 (Press CTRL+C to quit)
```
## 4. Sending Requests
Once the API server is running (by default on `localhost:8005`), you can send OpenAI-compatible requests. For example:
```bash
curl ${HOSTNAME}:8005/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "llama",
"messages": [
{"role": "user", "content": "What is the capital of France?"}
],
"temperature": 0,
"top_p": 0.95,
"max_tokens": 25,
"stream": true,
"n": 1,
"frequency_penalty": 0.0,
"stop": []
}'
```
The above request will return a streamed response with the model’s answer.
#### Expected Output
```
INFO 01-24 09:33:05 async_llm_engine.py:207] Added request 052eabe0-fc54-4f7c-9be8-4926523b26fc___0.
INFO 01-24 09:33:05 kv_cache.py:378] Fetching source address for worker 0 by key worker_0_rank_0
TRACE: 127.0.0.1:49878 - ASGI [2] Send {'type': 'http.response.body', 'body': '<290 bytes>', 'more_body': True}
data: {"id":"052eabe0-fc54-4f7c-9be8-4926523b26fc","choices":[{"delta":{"content":"\n\n","role":"assistant"},"logprobs":null,"finish_reason":null,"index":0}],"created":1737711185,"model":"llama","system_fingerprint":"052eabe0-fc54-4f7c-9be8-4926523b26fc","object":"chat.completion.chunk"}
INFO 01-24 09:33:05 async_llm_engine.py:175] Finished request 052eabe0-fc54-4f7c-9be8-4926523b26fc___0.
TRACE: 127.0.0.1:49878 - ASGI [2] Send {'type': 'http.response.body', 'body': '<317 bytes>', 'more_body': True}
TRACE: 127.0.0.1:49878 - ASGI [2] Send {'type': 'http.response.body', 'body': '<14 bytes>', 'more_body': True}
TRACE: 127.0.0.1:49878 - ASGI [2] Send {'type': 'http.response.body', 'body': '<0 bytes>', 'more_body': False}
data: {"id":"052eabe0-fc54-4f7c-9be8-4926523b26fc","choices":[{"delta":{"content":"The capital of France is Paris.","role":"assistant"},"logprobs":null,"finish_reason":null,"index":0}],"created":1737711185,"model":"llama","system_fingerprint":"052eabe0-fc54-4f7c-9be8-4926523b26fc","object":"chat.completion.chunk"}
TRACE: 127.0.0.1:49878 - ASGI [2] Receive {'type': 'http.disconnect'}
data: [DONE]
```
## 5. Benchmarking
You can benchmark this setup using [**GenAI-Perf**](https://github.com/triton-inference-server/perf_analyzer/blob/main/genai-perf/README.md), which supports OpenAI endpoints for chat or completion requests.
```bash
genai-perf profile \
-m llama \
--url ${HOSTNAME}:8005 \
--endpoint-type chat \
--streaming \
--num-dataset-entries 1000 \
--service-kind openai \
--endpoint v1/chat/completions \
--warmup-request-count 10 \
--random-seed 123 \
--synthetic-input-tokens-stddev 0 \
--output-tokens-stddev 0 \
--tokenizer neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8 \
--synthetic-input-tokens-mean 300 \
--output-tokens-mean 3000 \
--extra-inputs seed:100 \
--extra-inputs min_tokens:150 \
--extra-inputs max_tokens:150 \
--profile-export-file my_profile_export.json \
--artifact-dir artifacts/ \
--concurrency 32 \
--request-count 320 \
-- -v \
--async
```
**Key Parameters**:
- **`-m llama`**: Your model name (must match the name used in your server).
- **`--url <API_SERVER_HOST>:8005`**: The location of your API server.
- **`--endpoint v1/chat/completions`**: Using the OpenAI chat endpoint.
- **`--streaming`**: Ensures tokens are streamed back for chat-like usage.
## 6. Teardown
To tear down a deployment during local development, you can either kill the
container or kill the relevant processes involved in the deployment.
To kill the processes running inside the container, you can run:
```bash
pkill -9 -f python3
pkill -9 -f nats-server
```
You will generally want to make sure you have a clean slate between
deployments to avoid any unexpected errors.
NOTE: If you have other unrelated processes in the environment with `python3`
in the name, the `pkill` command above will terminate them as well. In this
scenario, you could select specific process IDs and use the following command
instead for each process ID, replacing `<pid>` below:
```bash
kill -9 <pid>
```
## 7. Model Precision Variants
In the commands above, we used the FP8 variant `neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8` because it significantly reduces KV cache size, which helps with network transfer and memory usage. However, if your GPU is older or does not support FP8, try using the standard BF16/FP16 precision variant, for example:
```bash
--model-name meta-llama/Meta-Llama-3.1-8B-Instruct
--kv-cache-dtype bf16
```
## 8. Multi-node Deployment
To deploy the solution in a multi-node environment, please refer to the [deploy_llama_8b_disaggregated_multinode.sh](examples/llm/vllm/deploy/deploy_llama_8b_disaggregated_multinode.sh) script. On the head node, run the NATS server, API server, and context worker with:
```
./examples/llm/vllm/deploy/deploy_llama_8b_disaggregated.sh context --head-url <head url>
```
On the second node, run the generate worker:
```
./examples/llm/vllm/deploy/deploy_llama_8b_disaggregated.sh generate --head-url <head url>
```
The example script defaults to launching one context worker with TP 1 on the head node and one generate worker with TP 1 on the secondary node. This can be changed for other configurations; see the script for details.
## 9. Known Issues & Limitations
1. **Fixed Worker Count**
Currently, the number of prefill and decode workers must be fixed at the start of deployment. Dynamically adding or removing workers is not yet supported.
2. **KV Transfer OOM**
During heavy loads, KV cache transfers between prefill and decode processes may cause out-of-memory errors if there is insufficient GPU memory.
3. **KV Cache Preemption**
Cache preemption (evicting old prompts to free memory) is not supported in the current patch.
4. **Experimental Patch**
The required vLLM patch is experimental and not yet merged into upstream vLLM. Future releases may remove the need for a custom patch.
5. **Single generate worker**
Only one generate worker can be used in a single deployment.
6. **Streaming**
When streaming is enabled, only two responses will be returned in the stream: the first token and the complete response.
## 10. References
[^1]: Yinmin Zhong, Shengyu Liu, Junda Chen, Jianbo Hu, Yibo Zhu, Xuanzhe Liu, Xin Jin, and Hao
Zhang. Distserve: Disaggregating prefill and decoding for goodput-optimized large language
model serving. *arXiv:2401.09670v3 [cs.DC]*, 2024.
For more details on Triton Distributed and additional examples, please consult the official [Hello World example](../../hello_world/) and the [Triton Inference Server documentation](https://github.com/triton-inference-server/server).
<!--
SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
SPDX-License-Identifier: Apache-2.0
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
# Tuning and Benchmarking Disaggregated Serving
**Disaggregated Serving** [^1] enables developers and teams deploying
LLMs to tune their deployment based on input and output sequence
lengths to achieve a targeted SLA with the right mix of context and
generation workers. In particular, disaggregated serving enables teams
to choose different parallelization strategies for each phase and to
balance throughput (tokens/sec/GPU) against latency (tokens/sec/user).
## Example
### 50 tokens per sec SLA with Input (3000) / Output (150) Sequence Length Tuning
To determine the best mix of context and generate workers for a
target latency and given input and output sequence lengths, we generally
perform "sweeps" comparing different strategies to find the best
throughput within the SLA.
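As an illustration, such a sweep can be scripted as a loop over concurrency levels, reusing the `genai-perf` command from the Benchmark section below; the concurrency values and artifact directory naming here are illustrative assumptions:
```bash
# Sweep several concurrency levels with fixed ISL/OSL (3000/150) and keep
# one artifact directory per run for later comparison. The concurrency
# range is an assumption; pick values that bracket your target SLA.
for concurrency in 8 16 32 48 64; do
  genai-perf profile \
    -m llama \
    --url <api server url> \
    --endpoint-type chat \
    --streaming \
    --num-dataset-entries 100 \
    --service-kind openai \
    --endpoint v1/chat/completions \
    --tokenizer neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8 \
    --synthetic-input-tokens-mean 3000 \
    --output-tokens-mean 150 \
    --extra-inputs min_tokens:150 \
    --extra-inputs max_tokens:150 \
    --artifact-dir "artifacts/concurrency_${concurrency}/" \
    --concurrency "${concurrency}" \
    --request-count "$((10 * concurrency))" \
    -- -v \
    --async
done
```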
For example, for input sequence length 3000 and output sequence length
150, after sweeping different tensor parallelism strategies on two
8 x H100 GPU nodes, we found that using 4 instances of TP 2 for
context (on one node) and 1 instance of TP 8 for generate (on
the second node) gives the best throughput at a latency target of 50
tokens per sec per user.
At that latency target, our early measurements show disaggregated
serving outperforming traditional aggregated LLM serving by more than 1.5x
(with throughput normalized per GPU).
### Reproducing Results
To reproduce similar results on a two-node 8 x H100 GPU system, we
provide sample scripts.
### Launch Context Workers on First Node
On the first (head) node:
```
bash deploy_llama_70b_context_tp2dp4.sh --head-url <head url>
```
### Launch Generate Worker on Second Node
On the second node:
```
bash deploy_llama_70b_generate_tp8dp1.sh --head-url <head url>
```
### Benchmark
The following `genai-perf` command simulates traffic with 3000 input and 150 output sequence lengths.
```
genai-perf profile \
-m llama \
--url <api server url> \
--endpoint-type chat \
--streaming \
--num-dataset-entries 100 \
--service-kind openai \
--endpoint v1/chat/completions \
--warmup-request-count 10 \
--random-seed 123 \
--synthetic-input-tokens-stddev 0 \
--output-tokens-stddev 0 \
--tokenizer neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8 \
--synthetic-input-tokens-mean 3000 \
--output-tokens-mean 150 \
--extra-inputs seed:100 \
--extra-inputs min_tokens:150 \
--extra-inputs max_tokens:150 \
--profile-export-file my_profile_export.json \
--artifact-dir artifacts/ \
--concurrency <N> \
--request-count <10*N> \
-- -v \
--async
```
### Example Results
The following results are given as an example; they are not fully
optimized and do not indicate what you may get locally.
| label    | configuration                  | concurrency | output token throughput per request (tokens/s) | output token throughput per GPU (tokens/s) | time to first token (ms) | inter-token latency (ms) |
|----------|--------------------------------|-------------|------------------------------------------------|--------------------------------------------|--------------------------|--------------------------|
| disagg   | context tp2dp4 generate tp8dp1 | 48          | 49.18                                          | 87.56                                      | 1157.49                  | 15.94                    |
| baseline | baseline tp4dp1                | 4           | 50.27                                          | 56.26                                      | 709.25                   | 15.27                    |
### Baseline Comparison
On a single node you can run an aggregated baseline for comparison. With
aggregated workers, we found the best throughput at the target SLA and
sequence lengths using 2 instances with tensor parallelism 4.
```
bash deploy_llama_70b_baseline_tp4dp2.sh --head-url <head url>
```
To see the results use the same `genai-perf` command used to benchmark
the disaggregated setup.
### Stopping the Deployment
```
pkill -SIGINT -f python3
pkill -SIGINT -f nats
```
## Known Issue
Sometimes during the first run there are NATS errors. In that case, just restart the deployment.
## References
[^1]: Yinmin Zhong, Shengyu Liu, Junda Chen, Jianbo Hu, Yibo Zhu, Xuanzhe Liu, Xin Jin, and Hao
Zhang. Distserve: Disaggregating prefill and decoding for goodput-optimized large language
model serving. *arXiv:2401.09670v3 [cs.DC]*, 2024.
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -e
set -x
export VLLM_ATTENTION_BACKEND=FLASHINFER
export VLLM_WORKER_MULTIPROC_METHOD=spawn
export VLLM_TORCH_PORT=36183
export VLLM_CONTEXT_WORKERS=4
export VLLM_CONTEXT_TP_SIZE=2
export VLLM_GENERATE_WORKERS=1
export VLLM_GENERATE_TP_SIZE=8
export VLLM_LOGGING_LEVEL=INFO
export VLLM_DATA_PLANE_BACKEND=nccl
export PYTHONUNBUFFERED=1
export NATS_PORT=4223
export NATS_STORE="$(mktemp -d)"
export API_SERVER_PORT=8005
if [ "$1" != "--head-url" ] || [ -z "$2" ]; then
echo "Usage: $0 --head-url <head url>"
exit 1
fi
head_url=$2
export NATS_HOST="$head_url"
export VLLM_TORCH_HOST="$head_url"
export API_SERVER_HOST="$head_url"
# Start NATS Server
echo "Flushing NATS store: ${NATS_STORE}..."
rm -r "${NATS_STORE}"
echo "Starting NATS Server..."
nats-server -p ${NATS_PORT} --jetstream --store_dir "${NATS_STORE}" &
# Start API Server
echo "Starting LLM API Server..."
python3 -m llm.api_server \
--tokenizer neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8 \
--request-plane-uri ${NATS_HOST}:${NATS_PORT} \
--api-server-host ${API_SERVER_HOST} \
--model-name "llama" \
--api-server-port ${API_SERVER_PORT} &
# Empty --log-dir will dump logs to stdout
echo "Starting vLLM baseline workers..."
gpu_configs=(
"0,1"
"2,3"
"4,5"
"6,7"
)
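# Launch one TP-2 context worker per GPU pair listed above.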
for i in "${!gpu_configs[@]}"; do
CUDA_VISIBLE_DEVICES="${gpu_configs[$i]}" \
VLLM_WORKER_ID=$i \
python3 -m llm.vllm.deploy \
--context-worker-count 1 \
--context-tp-size ${VLLM_CONTEXT_TP_SIZE} \
--generate-tp-size ${VLLM_GENERATE_TP_SIZE} \
--request-plane-uri ${NATS_HOST}:${NATS_PORT} \
--model-name neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8 \
--worker-name llama \
--kv-cache-dtype fp8 \
--dtype auto \
--disable-async-output-proc \
--disable-log-stats \
--max-model-len 3500 \
--max-batch-size 10000 \
--gpu-memory-utilization 0.5 &
done
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -e
set -x
export VLLM_ATTENTION_BACKEND=FLASHINFER
export VLLM_WORKER_MULTIPROC_METHOD=spawn
export VLLM_TORCH_PORT=36183
export VLLM_CONTEXT_WORKERS=4
export VLLM_CONTEXT_TP_SIZE=2
export VLLM_GENERATE_WORKERS=1
export VLLM_GENERATE_TP_SIZE=8
export VLLM_LOGGING_LEVEL=INFO
export VLLM_DATA_PLANE_BACKEND=nccl
export PYTHONUNBUFFERED=1
export NATS_PORT=4223
export NATS_STORE="$(mktemp -d)"
export API_SERVER_PORT=8005
if [ "$1" != "--head-url" ] || [ -z "$2" ]; then
echo "Usage: $0 --head-url <head url>"
exit 1
fi
head_url=$2
export NATS_HOST="$head_url"
export VLLM_TORCH_HOST="$head_url"
export API_SERVER_HOST="$head_url"
# Empty --log-dir will dump logs to stdout
echo "Starting vLLM generate workers..."
CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \
VLLM_WORKER_ID=${VLLM_CONTEXT_WORKERS} \
python3 -m llm.vllm.deploy \
--generate-worker-count 1 \
--context-tp-size ${VLLM_CONTEXT_TP_SIZE} \
--generate-tp-size ${VLLM_GENERATE_TP_SIZE} \
--request-plane-uri ${NATS_HOST}:${NATS_PORT} \
--model-name neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8 \
--worker-name llama \
--kv-cache-dtype fp8 \
--dtype auto \
--disable-async-output-proc \
--disable-log-stats \
--max-model-len 3500 \
--max-batch-size 10000 \
--gpu-memory-utilization 0.9 &
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import signal
import sys
import time
from pathlib import Path
from llm.vllm.operators.vllm import (
VllmContextOperator,
VllmGenerateOperator,
VllmOperator,
)
from triton_distributed.runtime import Deployment, OperatorConfig, WorkerConfig
from .parser import parse_args
deployment = None
def handler(signum, frame):
exit_code = 0
if deployment:
print("Stopping Workers")
exit_code = deployment.stop()
print(f"Workers stopped, exit code: {exit_code}")
sys.exit(exit_code)
signals = (signal.SIGHUP, signal.SIGTERM, signal.SIGINT)
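# Not every signal can be trapped on every platform (e.g., SIGHUP is
# unavailable on Windows), so skip any that cannot be registered.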
for sig in signals:
try:
signal.signal(sig, handler)
except Exception:
pass
def _create_context_op(name, args, max_inflight_requests):
return OperatorConfig(
name=name,
implementation=VllmContextOperator,
max_inflight_requests=int(max_inflight_requests),
parameters=vars(args),
)
def _create_generate_op(name, args, max_inflight_requests):
return OperatorConfig(
name=name,
implementation=VllmGenerateOperator,
max_inflight_requests=int(max_inflight_requests),
parameters=vars(args),
)
def _create_baseline_op(name, args, max_inflight_requests):
return OperatorConfig(
name=name,
implementation=VllmOperator,
max_inflight_requests=int(max_inflight_requests),
parameters=vars(args),
)
def main(args):
global deployment
if args.log_dir:
log_dir = Path(args.log_dir)
log_dir.mkdir(exist_ok=True)
worker_configs = []
# Context/Generate workers used for disaggregated serving.
# NOTE: this entry point launches at most one worker of each type per
# process; *-worker-count values other than 1 are ignored here.
if args.context_worker_count == 1:
context_op = _create_context_op(args.worker_name, args, 1000)
context = WorkerConfig(
operators=[context_op],
# Context worker gets --worker-name as it is the model that will
# be hit first in a disaggregated setting.
name=args.worker_name,
)
worker_configs.append((context, 1))
if args.generate_worker_count == 1:
generate_op = _create_generate_op("generate", args, 1000)
generate = WorkerConfig(
operators=[generate_op],
# Generate worker gets a hard-coded name "generate" as the context
# worker will talk directly to it.
name="generate",
)
worker_configs.append((generate, 1))
# NOTE: Launching baseline worker and context/generate workers at
# the same time is not currently supported.
if args.baseline_worker_count == 1:
# Baseline worker has a hard-coded name just for testing purposes
baseline_op = _create_baseline_op("baseline", args, 1000)
baseline = WorkerConfig(
operators=[baseline_op],
name="baseline",
)
worker_configs.append((baseline, 1))
deployment = Deployment(
worker_configs,
initialize_request_plane=args.initialize_request_plane,
log_dir=args.log_dir,
log_level=args.log_level,
starting_metrics_port=args.starting_metrics_port,
request_plane_args=([], {"request_plane_uri": args.request_plane_uri}),
)
deployment.start()
print("Workers started ... press Ctrl-C to exit")
while True:
time.sleep(10)
if __name__ == "__main__":
args = parse_args()
main(args)
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# FIXME: Convert this script to README steps
export VLLM_ATTENTION_BACKEND=FLASHINFER
export VLLM_WORKER_MULTIPROC_METHOD=spawn
export VLLM_TORCH_HOST=localhost
export VLLM_TORCH_PORT=36183
export VLLM_BASELINE_WORKERS=1
export VLLM_BASELINE_TP_SIZE=1
export VLLM_LOGGING_LEVEL=INFO
export VLLM_DATA_PLANE_BACKEND=nccl
export PYTHONUNBUFFERED=1
export NATS_HOST=localhost
export NATS_PORT=4223
export NATS_STORE="$(mktemp -d)"
export API_SERVER_HOST=localhost
export API_SERVER_PORT=8005
# Start NATS Server
echo "Flushing NATS store: ${NATS_STORE}..."
rm -r "${NATS_STORE}"
echo "Starting NATS Server..."
nats-server -p ${NATS_PORT} --jetstream --store_dir "${NATS_STORE}" &
# Start API Server
echo "Starting LLM API Server..."
python3 -m llm.api_server \
--tokenizer neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8 \
--request-plane-uri ${NATS_HOST}:${NATS_PORT} \
--api-server-host ${API_SERVER_HOST} \
--model-name "baseline" \
--api-server-port ${API_SERVER_PORT} &
# Empty --log-dir will dump logs to stdout
echo "Starting vLLM baseline workers..."
CUDA_VISIBLE_DEVICES=0 \
VLLM_WORKER_ID=0 \
python3 -m llm.vllm.deploy \
--baseline-worker-count ${VLLM_BASELINE_WORKERS} \
--request-plane-uri ${NATS_HOST}:${NATS_PORT} \
--model-name neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8 \
--kv-cache-dtype fp8 \
--dtype auto \
--disable-async-output-proc \
--disable-log-stats \
--max-model-len 1000 \
--max-batch-size 10000 \
--gpu-memory-utilization 0.9 \
--baseline-tp-size ${VLLM_BASELINE_TP_SIZE} \
--log-dir ""
# NOTE: It may take more than a minute for the vllm worker to start up
# if the model weights aren't cached and need to be downloaded.
echo "Waiting for deployment to finish startup..."
sleep 60
# Make a Chat Completion Request
echo "Sending chat completions request..."
curl ${API_SERVER_HOST}:${API_SERVER_PORT}/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "baseline",
"messages": [
{"role": "user", "content": "What is the capital of France?"}
],
"temperature": 0,
"top_p": 0.95,
"max_tokens": 25,
"stream": true,
"n": 1,
"frequency_penalty": 0.0,
"stop": []
}'
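# A non-streaming variant of the same request (sketch; assumes the API
# server also implements the standard OpenAI-compatible non-streaming path):
# curl ${API_SERVER_HOST}:${API_SERVER_PORT}/v1/chat/completions \
#   -H "Content-Type: application/json" \
#   -d '{"model": "baseline", "messages": [{"role": "user", "content": "What is the capital of France?"}], "max_tokens": 25, "stream": false}'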
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# FIXME: Convert this script to README steps
export VLLM_ATTENTION_BACKEND=FLASHINFER
export VLLM_WORKER_MULTIPROC_METHOD=spawn
export VLLM_TORCH_HOST=localhost
export VLLM_TORCH_PORT=36183
export VLLM_BASELINE_WORKERS=0
export VLLM_CONTEXT_WORKERS=1
export VLLM_GENERATE_WORKERS=1
export VLLM_BASELINE_TP_SIZE=1
export VLLM_CONTEXT_TP_SIZE=1
export VLLM_GENERATE_TP_SIZE=1
export VLLM_LOGGING_LEVEL=INFO
export VLLM_DATA_PLANE_BACKEND=nccl
export PYTHONUNBUFFERED=1
export NATS_HOST=localhost
export NATS_PORT=4223
export NATS_STORE="$(mktemp -d)"
export API_SERVER_HOST=localhost
export API_SERVER_PORT=8005
# Start NATS Server
echo "Flushing NATS store: ${NATS_STORE}..."
rm -r "${NATS_STORE}"
echo "Starting NATS Server..."
nats-server -p ${NATS_PORT} --jetstream --store_dir "${NATS_STORE}" &
# Start API Server
echo "Starting LLM API Server..."
python3 -m llm.api_server \
--tokenizer neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8 \
--request-plane-uri ${NATS_HOST}:${NATS_PORT} \
--api-server-host ${API_SERVER_HOST} \
--model-name llama \
--api-server-port ${API_SERVER_PORT} &
# Start VLLM Worker 0
echo "Starting vLLM context workers..."
CUDA_VISIBLE_DEVICES=0 \
VLLM_WORKER_ID=0 \
python3 -m llm.vllm.deploy \
--context-worker-count ${VLLM_CONTEXT_WORKERS} \
--request-plane-uri ${NATS_HOST}:${NATS_PORT} \
--model-name neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8 \
--kv-cache-dtype fp8 \
--dtype auto \
--worker-name llama \
--disable-async-output-proc \
--disable-log-stats \
--max-model-len 1000 \
--max-batch-size 10000 \
--gpu-memory-utilization 0.9 \
--context-tp-size ${VLLM_CONTEXT_TP_SIZE} \
--generate-tp-size ${VLLM_GENERATE_TP_SIZE} \
--log-dir "/tmp/vllm_logs" &
# Start VLLM Worker 1
echo "Starting vLLM generate workers..."
CUDA_VISIBLE_DEVICES=1 \
VLLM_WORKER_ID=1 \
python3 -m llm.vllm.deploy \
--generate-worker-count ${VLLM_GENERATE_WORKERS} \
--request-plane-uri ${NATS_HOST}:${NATS_PORT} \
--model-name neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8 \
--kv-cache-dtype fp8 \
--dtype auto \
--worker-name llama \
--disable-async-output-proc \
--disable-log-stats \
--max-model-len 1000 \
--max-batch-size 10000 \
--gpu-memory-utilization 0.9 \
--context-tp-size ${VLLM_CONTEXT_TP_SIZE} \
--generate-tp-size ${VLLM_GENERATE_TP_SIZE} \
--log-dir "/tmp/vllm_logs" &
# NOTE: It may take more than a minute for the vllm worker to start up
# if the model weights aren't cached and need to be downloaded.
echo "Waiting for deployment to finish startup..."
echo "Once you see all ranks connected to the server, it should be ready..."
echo "Example output:"
echo -e "\tRank 0 connected to the server"
echo -e "\t..."
echo -e "\tRank 1 connected to the server"
sleep 120
# Make a Chat Completion Request
echo "Sending chat completions request..."
curl ${API_SERVER_HOST}:${API_SERVER_PORT}/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "llama",
"messages": [
{"role": "user", "content": "What is the capital of France?"}
],
"temperature": 0,
"top_p": 0.95,
"max_tokens": 25,
"stream": true,
"n": 1,
"frequency_penalty": 0.0,
"stop": []
}'
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
export VLLM_ATTENTION_BACKEND=FLASHINFER
export VLLM_WORKER_MULTIPROC_METHOD=spawn
export VLLM_TORCH_HOST=""
export VLLM_TORCH_PORT=36183
export VLLM_BASELINE_WORKERS=0
export VLLM_CONTEXT_WORKERS=1
export VLLM_GENERATE_WORKERS=1
export VLLM_BASELINE_TP_SIZE=1
export VLLM_CONTEXT_TP_SIZE=1
export VLLM_GENERATE_TP_SIZE=1
export VLLM_LOGGING_LEVEL=INFO
export VLLM_DATA_PLANE_BACKEND=nccl
export PYTHONUNBUFFERED=1
export NATS_HOST=""
export NATS_PORT=4223
export NATS_STORE="$(mktemp -d)"
export API_SERVER_HOST=""
export API_SERVER_PORT=8005
start_nats_server() {
local head_url=$1
export NATS_HOST="$head_url"
echo "Flushing NATS store: ${NATS_STORE}..."
rm -r "${NATS_STORE}"
echo "Starting NATS Server..."
nats-server -p ${NATS_PORT} --jetstream --store_dir "${NATS_STORE}" &
}
start_api_server() {
local head_url=$1
export VLLM_TORCH_HOST="$head_url"
echo "Starting LLM API Server..."
python3 -m llm.api_server \
--tokenizer neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8 \
--request-plane-uri ${head_url}:${NATS_PORT} \
--api-server-host ${API_SERVER_HOST} \
--model-name llama \
--api-server-port ${API_SERVER_PORT} &
}
start_context_worker() {
local head_url=$1
export VLLM_TORCH_HOST="$head_url"
echo "Starting vLLM context workers..."
CUDA_VISIBLE_DEVICES=0 \
VLLM_WORKER_ID=0 \
python3 -m llm.vllm.deploy \
--context-worker-count ${VLLM_CONTEXT_WORKERS} \
--request-plane-uri ${head_url}:${NATS_PORT} \
--model-name neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8 \
--kv-cache-dtype fp8 \
--dtype auto \
--worker-name llama \
--disable-async-output-proc \
--disable-log-stats \
--max-model-len 1000 \
--max-batch-size 10000 \
--gpu-memory-utilization 0.9 \
--context-tp-size ${VLLM_CONTEXT_TP_SIZE} \
--generate-tp-size ${VLLM_GENERATE_TP_SIZE} \
--log-dir "/tmp/vllm_logs" &
}
start_generate_worker() {
local head_url=$1
export VLLM_TORCH_HOST="$head_url"
echo "Starting vLLM generate workers..."
CUDA_VISIBLE_DEVICES=1 \
VLLM_WORKER_ID=1 \
python3 -m llm.vllm.deploy \
--generate-worker-count ${VLLM_GENERATE_WORKERS} \
--request-plane-uri ${head_url}:${NATS_PORT} \
--model-name neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8 \
--kv-cache-dtype fp8 \
--dtype auto \
--worker-name llama \
--disable-async-output-proc \
--disable-log-stats \
--max-model-len 1000 \
--max-batch-size 10000 \
--gpu-memory-utilization 0.9 \
--context-tp-size ${VLLM_CONTEXT_TP_SIZE} \
--generate-tp-size ${VLLM_GENERATE_TP_SIZE} \
--log-dir "/tmp/vllm_logs" &
}
case "$1" in
context)
if [ "$2" != "--head-url" ] || [ -z "$3" ]; then
echo "Usage: $0 context --head-url <head url>"
exit 1
fi
head_url=$3
export API_SERVER_HOST="$head_url"
start_nats_server "$head_url"
start_api_server "$head_url"
start_context_worker "$head_url"
;;
generate)
if [ "$2" != "--head-url" ] || [ -z "$3" ]; then
echo "Usage: $0 generate --head-url <head url>"
exit 1
fi
head_url=$3
export API_SERVER_HOST="$head_url"
start_generate_worker "$head_url"
;;
*)
echo "Usage: $0 {context|generate} --head-url <head url>"
exit 1
;;
esac
echo "Waiting for deployment to finish startup..."
echo "Once you see all ranks connected to the server, it should be ready..."
echo "Example output:"
echo -e "\tRank 0 connected to the server"
echo -e "\t..."
echo -e "\tRank 1 connected to the server"
sleep 120
echo "Sending chat completions request..."
curl ${API_SERVER_HOST}:${API_SERVER_PORT}/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "llama",
"messages": [
{"role": "user", "content": "What is the capital of France?"}
],
"temperature": 0,
"top_p": 0.95,
"max_tokens": 25,
"stream": true,
"n": 1,
"frequency_penalty": 0.0,
"stop": []
}'
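# Usage sketch (two nodes): run "$0 context --head-url <head url>" on the
# head node first, then "$0 generate --head-url <head url>" on the second
# node; both sides must reach the same NATS request plane on ${NATS_PORT}.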
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
# FIXME: Remove unused args if any
def parse_args():
parser = argparse.ArgumentParser(description="Run an example of the VLLM pipeline.")
default_log_dir = ""
parser.add_argument(
"--log-dir",
type=str,
default=str(default_log_dir),
help="Log directory; an empty string dumps logs to stdout",
)
parser.add_argument(
"--request-plane-uri",
type=str,
default="nats://localhost:4223",
help="URI of request plane",
)
parser.add_argument(
"--initialize-request-plane",
default=False,
action="store_true",
help="Initialize the request plane, should only be done once per deployment",
)
parser.add_argument(
"--starting-metrics-port",
type=int,
default=0,
help="Metrics port for the first worker; each subsequent worker exposes metrics on the next port (e.g., starting at 50000: worker 1: 50000, worker 2: 50001, worker 3: 50002)",
)
parser.add_argument(
"--context-worker-count",
type=int,
required=False,
default=0,
help="Number of context workers",
)
parser.add_argument(
"--dummy-worker-count",
type=int,
required=False,
default=0,
help="Number of dummy workers",
)
parser.add_argument(
"--baseline-worker-count",
type=int,
required=False,
default=0,
help="Number of baseline workers",
)
parser.add_argument(
"--generate-worker-count",
type=int,
required=False,
default=0,
help="Number of generate workers",
)
parser.add_argument(
"--nats-url",
type=str,
required=False,
default="nats://localhost:4223",
help="URL of NATS server",
)
parser.add_argument(
"--model-name",
type=str,
required=False,
default="meta-llama/Meta-Llama-3.1-8B-Instruct",
help="Model name",
)
parser.add_argument(
"--worker-name",
type=str,
required=False,
default="llama",
help="Worker name",
)
parser.add_argument(
"--max-model-len",
type=int,
required=False,
default=None,
help="Maximum model sequence length (input plus output tokens).",
)
parser.add_argument(
"--max-batch-size",
type=int,
required=False,
default=10000,
help="Max batch size",
)
parser.add_argument(
"--gpu-memory-utilization",
type=float,
required=False,
default=0.45,
help="GPU memory utilization (fraction of memory from 0.0 to 1.0)",
)
parser.add_argument(
"--dtype",
type=str,
required=False,
default="float16",
help="Model data type (float16, TODO: fp8)",
)
parser.add_argument(
"--kv-cache-dtype",
type=str,
required=False,
default="auto",
help="Key-value cache data type",
)
# FIXME: Support string values like 'debug', 'info', etc.
parser.add_argument(
"--log-level",
type=int,
required=False,
choices=[0, 1, 2],
default=1,
help="Logging level: 2=debug, 1=info, 0=error (default=1)",
)
## Boolean flags forwarded to the vLLM engine
parser.add_argument(
"--enable-prefix-caching",
action=argparse.BooleanOptionalAction,
required=False,
default=False,
help="Enable prefix caching",
)
parser.add_argument(
"--enable-chunked-prefill",
action=argparse.BooleanOptionalAction,
required=False,
default=False,
help="Enable chunked prefill",
)
parser.add_argument(
"--enforce-eager",
action=argparse.BooleanOptionalAction,
required=False,
default=False,
help="Enforce eager execution",
)
parser.add_argument(
"--ignore-eos",
action=argparse.BooleanOptionalAction,
required=False,
default=False,
help="Ignore EOS token when generating",
)
parser.add_argument(
"--baseline-tp-size",
type=int,
default=1,
help="Tensor parallel size of a baseline worker.",
)
parser.add_argument(
"--context-tp-size",
type=int,
default=1,
help="Tensor parallel size of a context worker.",
)
parser.add_argument(
"--generate-tp-size",
type=int,
default=1,
help="Tensor parallel size of a generate worker.",
)
parser.add_argument(
"--max-num-seqs",
type=int,
default=None,
help="Maximum number of sequences per iteration",
)
parser.add_argument(
"--disable-async-output-proc",
action="store_true",
help="Disable async output processing",
)
parser.add_argument(
"--disable-log-stats",
action="store_true",
help="Disable logging statistics",
)
return parser.parse_args()
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import abc
import inspect
import os
import time
from typing import Any, AsyncGenerator, Dict, Optional
import numpy as np
import vllm.engine.arg_utils
import vllm.engine.async_llm_engine
import vllm.inputs.data
import vllm.logger
LOGGER = vllm.logger.init_logger(__name__)
# TODO ptarasiewicz: remove after verifying that streaming works efficiently.
# FIXME: streaming every token is currently not efficient. With a large
# RETURN_EVERY_N we return only the first token and the whole sequence at
# the end; the current value of 1 streams every token.
RETURN_EVERY_N = 1
class Stage(abc.ABC):
@abc.abstractmethod
async def __call__(
self, input_payload: Dict[str, Any]
) -> AsyncGenerator[Dict[str, Any], None]:
yield {}
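# A minimal sketch of a concrete Stage (hypothetical, for illustration only):
# each stage is an async generator that consumes a payload dict and yields
# response dicts carrying "outputs", "error", "final", and any stage
# "parameters", mirroring the stages defined below.
class EchoStage(Stage):
    async def __call__(
        self, input_payload: Dict[str, Any]
    ) -> AsyncGenerator[Dict[str, Any], None]:
        # Echo the prompt back as the generated text in a single response.
        prompt = input_payload.get("parameters", {}).get("prompt", "")
        yield {
            "outputs": {},
            "error": None,
            "final": True,
            "parameters": {"text": str(prompt)},
        }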
class AggregatedStage(Stage):
def __init__(
self,
**kwargs,
):
self._ignore_eos = kwargs.pop("ignore_eos", False)
engine_args = vllm.engine.arg_utils.AsyncEngineArgs(**kwargs)
LOGGER.info(f"Creating engine with args: {engine_args}")
self._engine = vllm.engine.async_llm_engine.AsyncLLMEngine.from_engine_args(
engine_args
)
LOGGER.info(f"Created engine: {self._engine}")
async def __call__(
self, input_payload: Dict[str, Any]
) -> AsyncGenerator[Dict[str, Any], None]:
try:
vllm_input = input_payload["parameters"]["prompt"]
sampling_params = vllm.SamplingParams(
**input_payload["parameters"].get("sampling_params", {}),
ignore_eos=self._ignore_eos,
)
LOGGER.debug(f"sampling_params: {sampling_params}")
request_id = input_payload["parameters"].get("request_id", None)
results_generator = self._engine.generate(
vllm_input, sampling_params, request_id
)
LOGGER.debug("results_generator started")
counter = 0
async for result in results_generator:
if counter % RETURN_EVERY_N == 0 or result.finished:
tokens_ids = np.stack(
[output_row.token_ids for output_row in result.outputs]
).astype(np.int64)
LOGGER.debug(f"tokens_ids: {tokens_ids.shape}")
yield {
"outputs": {},
"error": None,
"final": result.finished,
"parameters": {
"text": result.outputs[0].text,
},
}
counter += 1
LOGGER.debug("results_generator finished")
except Exception as e:
LOGGER.error(f"Exception in AggregatedStage: {e}")
yield {"outputs": {}, "error": str(e), "final": True}
class PrefillStage(Stage):
def __init__(
self,
generate_tensor_parallel_size: Optional[int] = None,
**kwargs,
):
context_tensor_parallel_size = kwargs.get("tensor_parallel_size", 1)
generate_tensor_parallel_size = (
generate_tensor_parallel_size or context_tensor_parallel_size
)
assert (
generate_tensor_parallel_size % context_tensor_parallel_size == 0
), "generate_tensor_parallel_size must be multiple of context_tensor_parallel_size"
LOGGER.debug(f"context_tensor_parallel_size: {context_tensor_parallel_size}")
LOGGER.debug(f"generate_tensor_parallel_size: {generate_tensor_parallel_size}")
os.environ["VLLM_DISAGG_STAGE"] = "PREFILL"
os.environ["VLLM_CONTEXT_TP_SIZE"] = str(context_tensor_parallel_size)
os.environ["VLLM_GENERATE_TP_SIZE"] = str(generate_tensor_parallel_size)
LOGGER.info(f"Env VLLM_DISAGG_STAGE set to {os.environ['VLLM_DISAGG_STAGE']}")
# Prefill must run eagerly because of the variable input sequence length (ISL).
kwargs["enforce_eager"] = True
self._ignore_eos = kwargs.pop("ignore_eos", False)
engine_args = vllm.engine.arg_utils.AsyncEngineArgs(**kwargs)
LOGGER.info(f"Creating engine with args: {engine_args}")
self._engine = vllm.engine.async_llm_engine.AsyncLLMEngine.from_engine_args(
engine_args
)
LOGGER.info("Prefill stage initialized")
async def __call__(
self, input_payload: Dict[str, Any]
) -> AsyncGenerator[Dict[str, Any], None]:
try:
vllm_input = input_payload["parameters"]["prompt"]
request_id = input_payload["parameters"].get("request_id", None)
assert request_id is not None, "request_id is required for prefill"
sampling_params = vllm.SamplingParams(
**input_payload["parameters"].get("sampling_params", {}),
ignore_eos=self._ignore_eos,
)
old_my_max_tokens = sampling_params.max_tokens
old_my_min_tokens = sampling_params.min_tokens
sampling_params.max_tokens = 1
sampling_params.min_tokens = 1
LOGGER.debug(f"sampling_params: {sampling_params}")
start_time_ns = time.monotonic_ns()
results_generator = self._engine.generate(
vllm_input, sampling_params, request_id
)
LOGGER.debug("results_generator started")
async for result in results_generator:
taken_ms = (time.monotonic_ns() - start_time_ns) / 1_000_000
LOGGER.info(
"==== Prefill completed kv cache taken %0.3fms ====", taken_ms
)
# TODO: prompt, request_id, and sampling_params need to be passed to the
# next stage, as there is no pipeline concept in the online scenario.
sampling_params.max_tokens = old_my_max_tokens
sampling_params.min_tokens = old_my_min_tokens
sampling_params_init_names = inspect.signature(
vllm.SamplingParams
).parameters.keys()
sampling_params = {
k: v
for k, v in sampling_params.__dict__.items()
if k in sampling_params_init_names
}
LOGGER.debug(
f"Yield response {input_payload['inputs'].keys()} parameters {input_payload['parameters']}"
)
yield {
"outputs": {},
"error": None,
"parameters": {
"context_worker_id": os.environ["VLLM_WORKER_ID"],
"first_token": result.outputs[0].token_ids[0],
"seq_len": len(result.prompt_token_ids),
},
"final": True,
}
LOGGER.debug("Results generator for prefill finishes")
except Exception as e:
LOGGER.error(f"Exception in PrefillStage: {e}")
yield {"outputs": {}, "error": str(e), "final": True}
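# Handoff sketch (inferred from the prefill yield above and GenerateStage
# below): the "parameters" passed along to the generate worker must carry at
# least request_id, sampling_params, context_worker_id, first_token, and
# seq_len so it can allocate kv-cache space of the right shape and pull the
# cache from the correct context worker, e.g.:
#
#     {"parameters": {"request_id": "req-0",
#                     "sampling_params": {"max_tokens": 25},
#                     "context_worker_id": "0",
#                     "first_token": 1234,
#                     "seq_len": 128}}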
class GenerateStage(Stage):
def __init__(
self,
**kwargs,
):
os.environ["VLLM_DISAGG_STAGE"] = "GENERATE"
LOGGER.info(f"Env VLLM_DISAGG_STAGE set to {os.environ['VLLM_DISAGG_STAGE']}")
self._ignore_eos = kwargs.pop("ignore_eos", False)
engine_args = vllm.engine.arg_utils.AsyncEngineArgs(**kwargs)
LOGGER.info(f"Creating engine with args: {engine_args}")
self._engine = vllm.engine.async_llm_engine.AsyncLLMEngine.from_engine_args(
engine_args
)
LOGGER.info("Generation stage initialized")
async def __call__(
self, input_payload: Dict[str, Any]
) -> AsyncGenerator[Dict[str, Any], None]:
seq_len = input_payload["parameters"]["seq_len"]
LOGGER.debug(f"input sequence length: {seq_len}")
# We can use any token ids here: the first token was already sampled by the
# context worker, and we only need the correct shape to allocate kv-cache space.
vllm_input = vllm.inputs.data.TokensPrompt(prompt_token_ids=[0] * seq_len)
sampling_params = vllm.SamplingParams(
**input_payload["parameters"].get("sampling_params", {}),
ignore_eos=self._ignore_eos,
)
LOGGER.debug(f"sampling_params: {sampling_params}")
request_id = input_payload["parameters"].get("request_id", None)
assert request_id is not None, "request_id is required for generate"
context_worker_id = input_payload["parameters"]["context_worker_id"]
new_request_id = f"{request_id}___{context_worker_id}"
first_token = input_payload["parameters"]["first_token"]
self._engine.engine.model_executor.driver_worker.model_runner.set_first_token(
new_request_id, first_token
)
# TODO ptarasiewicz: this is only a temporary way to pass the worker id to
# the engine so that it can pull the correct kv cache.
results_generator = self._engine.generate(
vllm_input,
sampling_params,
new_request_id,
)
LOGGER.debug("results_generator started")
counter = 0
async for result in results_generator:
if counter % RETURN_EVERY_N == 0 or result.finished:
yield {
"outputs": {},
"error": None,
"final": result.finished,
"parameters": {
"text": result.outputs[0].text,
},
}
counter += 1
LOGGER.debug("results_generator finished for generate")