Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
f1f29171
Commit
f1f29171
authored
Jan 14, 2025
by
Neelay Shah
Committed by
GitHub
Jan 14, 2025
Browse files
feat: initial worker
parent
b0195f54
Changes
39
Hide whitespace changes
Inline
Side-by-side
Showing
19 changed files
with
1637 additions
and
0 deletions
+1637
-0
worker/tests/python/integration/operators/models/add/1/model.py
.../tests/python/integration/operators/models/add/1/model.py
+128
-0
worker/tests/python/integration/operators/models/add/config.pbtxt
...ests/python/integration/operators/models/add/config.pbtxt
+16
-0
worker/tests/python/integration/operators/models/context/1/model.py
...ts/python/integration/operators/models/context/1/model.py
+87
-0
worker/tests/python/integration/operators/models/context/config.pbtxt
.../python/integration/operators/models/context/config.pbtxt
+90
-0
worker/tests/python/integration/operators/models/divide/1/model.py
...sts/python/integration/operators/models/divide/1/model.py
+119
-0
worker/tests/python/integration/operators/models/divide/config.pbtxt
...s/python/integration/operators/models/divide/config.pbtxt
+16
-0
worker/tests/python/integration/operators/models/generation/1/model.py
...python/integration/operators/models/generation/1/model.py
+119
-0
worker/tests/python/integration/operators/models/generation/config.pbtxt
...thon/integration/operators/models/generation/config.pbtxt
+75
-0
worker/tests/python/integration/operators/models/identity/1/model.py
...s/python/integration/operators/models/identity/1/model.py
+126
-0
worker/tests/python/integration/operators/models/identity/config.pbtxt
...python/integration/operators/models/identity/config.pbtxt
+16
-0
worker/tests/python/integration/operators/models/multiply/1/model.py
...s/python/integration/operators/models/multiply/1/model.py
+104
-0
worker/tests/python/integration/operators/models/multiply/config.pbtxt
...python/integration/operators/models/multiply/config.pbtxt
+16
-0
worker/tests/python/integration/operators/models/postprocessing/1/model.py
...on/integration/operators/models/postprocessing/1/model.py
+65
-0
worker/tests/python/integration/operators/models/postprocessing/config.pbtxt
.../integration/operators/models/postprocessing/config.pbtxt
+55
-0
worker/tests/python/integration/operators/models/preprocessing/1/model.py
...hon/integration/operators/models/preprocessing/1/model.py
+85
-0
worker/tests/python/integration/operators/models/preprocessing/config.pbtxt
...n/integration/operators/models/preprocessing/config.pbtxt
+68
-0
worker/tests/python/integration/test_add_multiply_divide.py
worker/tests/python/integration/test_add_multiply_divide.py
+255
-0
worker/tests/python/unit/test_args.py
worker/tests/python/unit/test_args.py
+126
-0
worker/tests/python/unit/test_logger.py
worker/tests/python/unit/test_logger.py
+71
-0
No files found.
worker/tests/python/integration/operators/models/add/1/model.py
0 → 100644
View file @
f1f29171
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
json
import
numpy
as
np
import
triton_python_backend_utils
as
pb_utils
try
:
import
cupy
except
Exception
:
cupy
=
None
class TritonPythonModel:
    """Test operator that sums its input tensors.

    For the int64 input ``int64_input`` the model emits
    ``int64_output_partial`` (the sum of each sub-array along the first axis)
    and ``int64_output_total`` (the sum of every element), and echoes the
    request parameters as a JSON string in ``output_parameters``.  When the
    ``request_gpu_memory`` parameter is "True", outputs are moved to GPU
    memory via cupy and exposed through DLPack.
    """

    @staticmethod
    def auto_complete_config(auto_complete_model_config):
        """Declare the optional int64 input and its sum outputs on the config."""
        dims = [-1, -1]
        pending_inputs = []
        pending_outputs = []
        for data_type in ["type_int64"]:
            # "type_int64" -> "int64", used as the tensor-name prefix.
            prefix = data_type.split("_")[1].lower()
            pending_inputs.append(
                {
                    "name": f"{prefix}_input",
                    "data_type": data_type,
                    "dims": dims,
                    "optional": True,
                }
            )
            pending_outputs.append(
                {"name": f"{prefix}_output_total", "data_type": data_type, "dims": dims}
            )
            pending_outputs.append(
                {"name": f"{prefix}_output_partial", "data_type": data_type, "dims": dims}
            )
        pending_outputs.append(
            {"name": "output_parameters", "data_type": "TYPE_STRING", "dims": [1]}
        )
        for spec in pending_inputs:
            auto_complete_model_config.add_input(spec)
        for spec in pending_outputs:
            auto_complete_model_config.add_output(spec)
        auto_complete_model_config.set_max_batch_size(0)
        return auto_complete_model_config

    def initialize(self, args):
        """Parse the model config and read the request_gpu_memory flag."""
        self._model_config = json.loads(args["model_config"])
        params = self._model_config.get("parameters", {})
        flag = params.get("request_gpu_memory", {}).get("string_value")
        self._request_gpu_memory = flag == "True"

    def execute(self, requests):
        """Return one response per request with partial/total sums per input."""
        responses = []
        for request in requests:
            collected = []
            for input_tensor in request.inputs():
                values = input_tensor.as_numpy()
                partial = np.array([[chunk.sum() for chunk in values]])
                total = np.array([[values.sum()]])
                # Partial sums first, then the grand total (matches the
                # original output ordering).
                for suffix, payload in (("output_partial", partial), ("output_total", total)):
                    out_name = input_tensor.name().replace("input", suffix)
                    if self._request_gpu_memory:
                        # Move to GPU and hand the buffer over via DLPack.
                        collected.append(
                            pb_utils.Tensor.from_dlpack(out_name, cupy.array(payload))
                        )
                    else:
                        collected.append(pb_utils.Tensor(out_name, payload))
            # Echo the request parameters (a JSON string) as a string tensor.
            params_array = np.array([request.parameters()]).astype(np.object_)
            collected.append(pb_utils.Tensor("output_parameters", params_array))
            responses.append(pb_utils.InferenceResponse(output_tensors=collected))
        return responses
worker/tests/python/integration/operators/models/add/config.pbtxt
0 → 100644
View file @
f1f29171
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
backend: "python"
worker/tests/python/integration/operators/models/context/1/model.py
0 → 100644
View file @
f1f29171
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
json
import
time
import
triton_python_backend_utils
as
pb_utils
class TritonPythonModel:
    """Emulates the context (prefill) stage of an LLM-style pipeline.

    Sleeps for ``context_delay_ms`` per request, then echoes the input ids as
    both KV_CACHE and OUTPUT_IDS, the input length as SEQUENCE_LENGTH, and
    passes REQUEST_OUTPUT_LEN through unchanged.
    """

    def initialize(self, args):
        """Read the delay parameter and cache output dtypes from the config."""
        cfg = json.loads(args["model_config"])
        # context_delay_ms is a string parameter; convert to seconds.
        delay_ms = int(cfg["parameters"]["context_delay_ms"]["string_value"])
        self._context_delay = delay_ms / 1000
        # Cache the numpy dtype of each output as e.g. self.kv_cache_dtype.
        for name in ["KV_CACHE", "OUTPUT_IDS", "SEQUENCE_LENGTH", "REQUEST_OUTPUT_LEN"]:
            dtype = pb_utils.triton_string_to_numpy(
                pb_utils.get_output_config_by_name(cfg, name)["data_type"]
            )
            setattr(self, name.lower() + "_dtype", dtype)

    def execute(self, requests):
        """Return one response per request after the configured prefill delay."""
        responses = []
        for request in requests:
            # Fetch the request inputs.
            input_ids = pb_utils.get_input_tensor_by_name(request, "INPUT_IDS").as_numpy()
            input_lengths = pb_utils.get_input_tensor_by_name(request, "INPUT_LENGTH").as_numpy()
            request_output_len = pb_utils.get_input_tensor_by_name(
                request, "REQUEST_OUTPUT_LEN"
            ).as_numpy()
            # Emulate prefill latency.
            time.sleep(self._context_delay)
            # Build pb_utils.Tensor objects for the InferenceResponse.
            produced = [
                pb_utils.Tensor("KV_CACHE", input_ids.astype(self.kv_cache_dtype)),
                pb_utils.Tensor("OUTPUT_IDS", input_ids.astype(self.output_ids_dtype)),
                pb_utils.Tensor(
                    "SEQUENCE_LENGTH", input_lengths.astype(self.sequence_length_dtype)
                ),
                pb_utils.Tensor("REQUEST_OUTPUT_LEN", request_output_len),
            ]
            responses.append(pb_utils.InferenceResponse(output_tensors=produced))
        return responses
worker/tests/python/integration/operators/models/context/config.pbtxt
0 → 100644
View file @
f1f29171
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Emulates the tensorrt_llm config from:
# https://github.com/triton-inference-server/tensorrtllm_backend/blob/main/all_models/inflight_batcher_llm/tensorrt_llm/config.pbtxt
name: "context"
backend: "python"
max_batch_size: 0
# Artificial prefill delay, in milliseconds, applied once per request by 1/model.py.
parameters: {
  key: "context_delay_ms"
  value: {
    string_value: "1000"
  }
}
input [
  {
    name: "INPUT_IDS"
    data_type: TYPE_INT32
    dims: [ -1 ]
  },
  {
    name: "INPUT_LENGTH"
    data_type: TYPE_INT32
    dims: [ 1 ]
  },
  {
    name: "REQUEST_OUTPUT_LEN"
    data_type: TYPE_INT32
    dims: [ 1 ]
  }
  # Add more inputs as per requirement.
  # For simplicity only sticking with these
  # inputs for preprocessing.
]
output [
  # Section of the first request that returns the first token.
  # These will be handed over directly to the post-processor
  {
    name: "OUTPUT_IDS"
    data_type: TYPE_INT32
    dims: [ -1 ]
  },
  {
    name: "SEQUENCE_LENGTH"
    data_type: TYPE_INT32
    dims: [ 1 ]
  },
  {
    name: "REQUEST_OUTPUT_LEN"
    data_type: TYPE_INT32
    dims: [ 1 ]
  },
  # Section of the second part of handover to the generate stage
  {
    # TODO: revisit how kv cache is being exposed to generate worker.
    name: "KV_CACHE"
    data_type: TYPE_INT32
    dims: [ -1 ]
  }
  # Add more outputs as per requirement.
  # For simplicity only sticking with these
  # outputs for preprocessing.
]
# Add more parameters as per requirement
instance_group [
  {
    count: 1
    kind : KIND_CPU
  }
]
worker/tests/python/integration/operators/models/divide/1/model.py
0 → 100644
View file @
f1f29171
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
json
import
numpy
as
np
import
triton_python_backend_utils
as
pb_utils
try
:
import
cupy
except
Exception
:
cupy
=
None
class TritonPythonModel:
    """Test operator that divides one input by a scalar divisor.

    Divides ``int64_input`` element-wise by the first element of
    ``int64_input_divisor`` and emits the quotient as ``fp64_output_partial``,
    plus the request parameters echoed in ``output_parameters``.  When the
    ``request_gpu_memory`` parameter is "True", the quotient is moved to GPU
    memory via cupy and exposed through DLPack.
    """

    @staticmethod
    def auto_complete_config(auto_complete_model_config):
        """Declare the dividend/divisor inputs and the quotient output."""
        dims = [-1, -1]
        pending_inputs = []
        pending_outputs = []
        for data_type in ["type_int64"]:
            # "type_int64" -> "int64", used as the tensor-name prefix.
            prefix = data_type.split("_")[1].lower()
            pending_inputs.append(
                {
                    "name": f"{prefix}_input",
                    "data_type": data_type,
                    "dims": dims,
                    "optional": True,
                }
            )
            # np.divide produces floats, hence the fp64-named output.
            pending_outputs.append(
                {"name": "fp64_output_partial", "data_type": data_type, "dims": dims}
            )
            pending_inputs.append(
                {
                    "name": f"{prefix}_input_divisor",
                    "data_type": data_type,
                    "dims": dims,
                    "optional": True,
                }
            )
        pending_outputs.append(
            {"name": "output_parameters", "data_type": "TYPE_STRING", "dims": [1]}
        )
        for spec in pending_inputs:
            auto_complete_model_config.add_input(spec)
        for spec in pending_outputs:
            auto_complete_model_config.add_output(spec)
        auto_complete_model_config.set_max_batch_size(0)
        return auto_complete_model_config

    def initialize(self, args):
        """Parse the model config and read the request_gpu_memory flag."""
        self._model_config = json.loads(args["model_config"])
        params = self._model_config.get("parameters", {})
        flag = params.get("request_gpu_memory", {}).get("string_value")
        self._request_gpu_memory = flag == "True"

    def execute(self, requests):
        """Return one response per request containing the quotient tensor."""
        responses = []
        for request in requests:
            produced = []
            # The divisor is the first element of the (nested) divisor input.
            divisor_tensor = pb_utils.get_input_tensor_by_name(request, "int64_input_divisor")
            scalar_divisor = divisor_tensor.as_numpy()[0][0]
            dividend_tensor = pb_utils.get_input_tensor_by_name(request, "int64_input")
            dividends = dividend_tensor.as_numpy()
            quotient = np.array([np.divide(dividends, scalar_divisor)])
            if self._request_gpu_memory:
                # Move to GPU and hand the buffer over via DLPack.
                produced.append(
                    pb_utils.Tensor.from_dlpack("fp64_output_partial", cupy.array(quotient))
                )
            else:
                produced.append(pb_utils.Tensor("fp64_output_partial", quotient))
            # Echo the request parameters (a JSON string) as a string tensor.
            params_array = np.array([request.parameters()]).astype(np.object_)
            produced.append(pb_utils.Tensor("output_parameters", params_array))
            responses.append(pb_utils.InferenceResponse(output_tensors=produced))
        return responses
worker/tests/python/integration/operators/models/divide/config.pbtxt
0 → 100644
View file @
f1f29171
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
backend: "python"
worker/tests/python/integration/operators/models/generation/1/model.py
0 → 100644
View file @
f1f29171
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
json
import
threading
import
time
import
numpy
as
np
import
triton_python_backend_utils
as
pb_utils
class TritonPythonModel:
    """Emulates the generation (decode) stage of an LLM-style pipeline.

    For each request, a background thread streams ``REQUEST_OUTPUT_LEN``
    responses (one per emulated token), sleeping ``inter_token_latency_ms``
    between responses, over the decoupled response-sender API.
    """

    def initialize(self, args):
        """Parse the model config, require decoupled mode, cache output dtypes.

        Args:
            args: Triton-supplied dict; ``args["model_config"]`` holds the
                JSON-serialized model configuration.

        Raises:
            pb_utils.TritonModelException: if the model is not configured with
                the decoupled transaction policy.
        """
        model_config = json.loads(args["model_config"])
        # inter_token_latency_ms is a string parameter; convert to seconds.
        self._output_token_latency = (
            int(model_config["parameters"]["inter_token_latency_ms"]["string_value"]) / 1000
        )
        # Keep the parsed config around (previously parsed a second,
        # redundant time).
        self.model_config = model_config
        # This model sends a variable number of responses per request, which
        # only works under the decoupled transaction policy.
        using_decoupled = pb_utils.using_decoupled_model_transaction_policy(model_config)
        if not using_decoupled:
            raise pb_utils.TritonModelException(
                """the model `{}` can generate any number of responses per request,
                enable decoupled transaction policy in model configuration to
                serve this model""".format(args["model_name"])
            )
        # Cache the numpy dtype of each output as e.g. self.output_ids_dtype.
        for output_name in ["OUTPUT_IDS", "SEQUENCE_LENGTH"]:
            setattr(
                self,
                output_name.lower() + "_dtype",
                pb_utils.triton_string_to_numpy(
                    pb_utils.get_output_config_by_name(model_config, output_name)["data_type"]
                ),
            )
        # To keep track of response threads so that we can delay
        # the finalizing the model until all response threads
        # have completed.
        self.inflight_thread_count = 0
        self.inflight_thread_count_lck = threading.Lock()

    def response_thread(self, response_sender, kv_cache, request_output_len):
        """Stream request_output_len responses, then close the sender.

        Args:
            response_sender: the decoupled sender of the originating request.
            kv_cache: numpy array echoed back as OUTPUT_IDS each iteration.
            request_output_len: number of responses ("tokens") to emit.
        """
        for _ in range(request_output_len):
            time.sleep(self._output_token_latency)  # emulate per-token latency
            output_ids_tensor = pb_utils.Tensor(
                "OUTPUT_IDS", kv_cache.astype(self.output_ids_dtype)
            )
            sequence_length = np.array([kv_cache.size])
            sequence_length_tensor = pb_utils.Tensor(
                "SEQUENCE_LENGTH", sequence_length.astype(self.sequence_length_dtype)
            )
            response = pb_utils.InferenceResponse(
                output_tensors=[output_ids_tensor, sequence_length_tensor]
            )
            response_sender.send(response)
        # We must close the response sender to indicate to Triton that we are
        # done sending responses for the corresponding request. We can't use the
        # response sender after closing it. The response sender is closed by
        # setting the TRITONSERVER_RESPONSE_COMPLETE_FINAL.
        response_sender.send(flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)
        with self.inflight_thread_count_lck:
            self.inflight_thread_count -= 1

    def execute(self, requests):
        """Spawn one response thread per request; returns None (decoupled)."""
        for request in requests:
            # Get input tensors
            kv_cache = pb_utils.get_input_tensor_by_name(request, "KV_CACHE").as_numpy()
            request_output_len = pb_utils.get_input_tensor_by_name(
                request, "REQUEST_OUTPUT_LEN"
            ).as_numpy()
            # Start a separate thread to send the responses for the request.
            # BUG FIX: use *this* request's sender, not requests[0]'s — the old
            # code streamed every request's responses to the first client and
            # never closed the remaining senders, hanging those clients.
            thread = threading.Thread(
                target=self.response_thread,
                args=(
                    request.get_response_sender(),
                    kv_cache,
                    request_output_len[0],
                ),
            )
            # A model using decoupled transaction policy is not required to send all
            # responses for the current request before returning from the execute.
            # The response thread runs entirely independent of the execute thread.
            thread.daemon = True
            with self.inflight_thread_count_lck:
                self.inflight_thread_count += 1
            thread.start()
        return None
worker/tests/python/integration/operators/models/generation/config.pbtxt
0 → 100644
View file @
f1f29171
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Emulates the tensorrt_llm config from:
# https://github.com/triton-inference-server/tensorrtllm_backend/blob/main/all_models/inflight_batcher_llm/tensorrt_llm/config.pbtxt
name: "generation"
backend: "python"
max_batch_size: 0
# The model streams a variable number of responses per request, so it must run
# decoupled (1/model.py enforces this at initialize()).
model_transaction_policy {
  decoupled: true
}
# Delay, in milliseconds, between consecutive streamed responses.
parameters: {
  key: "inter_token_latency_ms"
  value: {
    string_value: "1000"
  }
}
input [
  {
    # TODO: revisit how kv cache is being exposed to generate worker.
    name: "KV_CACHE"
    data_type: TYPE_INT32
    dims: [ -1 ]
  },
  {
    name: "REQUEST_OUTPUT_LEN"
    data_type: TYPE_INT32
    dims: [ 1 ]
  }
  # Add more inputs as per requirement.
  # For simplicity only sticking with these
  # inputs for preprocessing.
]
output [
  {
    name: "OUTPUT_IDS"
    data_type: TYPE_INT32
    dims: [ -1 ]
  },
  {
    name: "SEQUENCE_LENGTH"
    data_type: TYPE_INT32
    dims: [ 1 ]
  }
  # Add more outputs as per requirement.
  # For simplicity only sticking with these
  # outputs for preprocessing.
]
# Add more parameters as per requirement
instance_group [
  {
    count: 1
    kind : KIND_CPU
  }
]
worker/tests/python/integration/operators/models/identity/1/model.py
0 → 100644
View file @
f1f29171
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
json
import
numpy
as
np
import
triton_python_backend_utils
as
pb_utils
try
:
import
cupy
except
Exception
:
cupy
=
None
class TritonPythonModel:
    """Test operator that echoes every input back as a matching output.

    For each configured data type, ``<type>_input`` is copied to
    ``<type>_output``; request parameters are echoed in ``output_parameters``
    (non-decoupled mode only).  Supports an optional decoupled mode and an
    optional GPU-memory mode (cupy + DLPack), both driven by config
    parameters.
    """

    @staticmethod
    def auto_complete_config(auto_complete_model_config):
        """Declare an input/output pair for every Triton data type."""
        dims = [-1, -1]
        cfg = auto_complete_model_config.as_dict()
        pending_inputs = []
        pending_outputs = []
        for data_type in pb_utils.TRITON_STRING_TO_NUMPY.keys():
            # "TYPE_FP32" -> "fp32", used as the tensor-name prefix.
            prefix = data_type.split("_")[1].lower()
            pending_inputs.append(
                {
                    "name": f"{prefix}_input",
                    "data_type": data_type,
                    "dims": dims,
                    "optional": True,
                }
            )
            pending_outputs.append(
                {"name": f"{prefix}_output", "data_type": data_type, "dims": dims}
            )
        pending_outputs.append(
            {"name": "output_parameters", "data_type": "TYPE_STRING", "dims": [1]}
        )
        for spec in pending_inputs:
            auto_complete_model_config.add_input(spec)
        for spec in pending_outputs:
            auto_complete_model_config.add_output(spec)
        auto_complete_model_config.set_max_batch_size(0)
        # Opt into decoupled mode when the config parameter requests it.
        if "decoupled" in cfg["parameters"]:
            if cfg["parameters"]["decoupled"]["string_value"] == "True":
                auto_complete_model_config.set_model_transaction_policy({"decoupled": True})
        return auto_complete_model_config

    def initialize(self, args):
        """Parse the model config; read decoupled and GPU-memory flags."""
        self._model_config = json.loads(args["model_config"])
        txn_policy = self._model_config.get("model_transaction_policy", {})
        self._decoupled = txn_policy.get("decoupled")
        params = self._model_config.get("parameters", {})
        flag = params.get("request_gpu_memory", {}).get("string_value")
        self._request_gpu_memory = flag == "True"

    def execute_decoupled(self, requests):
        """Echo inputs over each request's response sender, then close it."""
        for request in requests:
            sender = request.get_response_sender()
            echoed = []
            for input_tensor in request.inputs():
                out_name = input_tensor.name().replace("input", "output")
                echoed.append(pb_utils.Tensor(out_name, input_tensor.as_numpy()))
            sender.send(pb_utils.InferenceResponse(output_tensors=echoed))
            # Closing flag: no further responses for this request.
            sender.send(flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)
        return None

    def execute(self, requests):
        """Return echoed outputs, delegating to the decoupled path if enabled."""
        if self._decoupled:
            return self.execute_decoupled(requests)
        responses = []
        for request in requests:
            echoed = []
            for input_tensor in request.inputs():
                values = input_tensor.as_numpy()
                out_name = input_tensor.name().replace("input", "output")
                if self._request_gpu_memory:
                    # Move to GPU and hand the buffer over via DLPack.
                    echoed.append(pb_utils.Tensor.from_dlpack(out_name, cupy.array(values)))
                else:
                    echoed.append(pb_utils.Tensor(out_name, values))
            # Echo the request parameters (a JSON string) as a string tensor.
            params_array = np.array([request.parameters()]).astype(np.object_)
            echoed.append(pb_utils.Tensor("output_parameters", params_array))
            responses.append(pb_utils.InferenceResponse(output_tensors=echoed))
        return responses
worker/tests/python/integration/operators/models/identity/config.pbtxt
0 → 100644
View file @
f1f29171
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
backend: "python"
worker/tests/python/integration/operators/models/multiply/1/model.py
0 → 100644
View file @
f1f29171
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
json
import
numpy
as
np
import
triton_python_backend_utils
as
pb_utils
try
:
import
cupy
except
Exception
:
cupy
=
None
class TritonPythonModel:
    """Test operator that multiplies values along each row of its inputs.

    For every input tensor named ``<type>_input`` it produces a tensor named
    ``<type>_output_total`` holding the per-row product of the input values.
    The request parameters are echoed back in an ``output_parameters``
    string tensor.
    """

    @staticmethod
    def auto_complete_config(auto_complete_model_config):
        """Register inputs/outputs so no hand-written config is required.

        Declares one optional 2-D int64 input (``int64_input``), the matching
        product output (``int64_output_total``), and the string tensor used to
        echo request parameters. Batching is disabled (max_batch_size 0).
        """
        inputs = []
        outputs = []
        dims = [-1, -1]
        optional = True
        for data_type in ["type_int64"]:
            # "type_int64" -> "int64" used as the tensor-name prefix.
            type_name = data_type.split("_")[1].lower()
            input_name = f"{type_name}_input"
            output_name = f"{type_name}_output_total"
            inputs.append(
                {
                    "name": input_name,
                    "data_type": data_type,
                    "dims": dims,
                    "optional": optional,
                }
            )
            outputs.append(
                {"name": output_name, "data_type": data_type, "dims": dims}
            )
        outputs.append(
            {"name": "output_parameters", "data_type": "TYPE_STRING", "dims": [1]}
        )
        for input_ in inputs:
            auto_complete_model_config.add_input(input_)
        for output in outputs:
            auto_complete_model_config.add_output(output)
        auto_complete_model_config.set_max_batch_size(0)
        return auto_complete_model_config

    def initialize(self, args):
        """Parse the model config and cache the GPU-output flag."""
        self._model_config = json.loads(args["model_config"])
        self._request_gpu_memory = False
        parameters = self._model_config.get("parameters", {})
        if (
            "request_gpu_memory" in parameters
            and parameters["request_gpu_memory"]["string_value"] == "True"
        ):
            self._request_gpu_memory = True

    def execute(self, requests):
        """Compute the per-row product for every input tensor of each request.

        Returns one InferenceResponse per request. When ``request_gpu_memory``
        is enabled the outputs are moved to GPU memory via cupy and exposed
        through DLPack.

        Raises:
            RuntimeError: if GPU outputs were requested but cupy is missing.
        """
        if self._request_gpu_memory and cupy is None:
            # Fail fast with a clear message instead of an AttributeError on
            # ``None`` the first time a request arrives.
            raise RuntimeError(
                "request_gpu_memory is enabled but cupy is not available"
            )
        responses = []
        for request in requests:
            output_tensors = []
            for input_tensor in request.inputs():
                input_value = input_tensor.as_numpy()
                # Row-wise product; keeps a leading dimension of 1.
                output_value = np.array([[x.prod() for x in input_value]])
                if self._request_gpu_memory:
                    output_value = cupy.array(output_value)
                    output_tensor = pb_utils.Tensor.from_dlpack(
                        input_tensor.name().replace("input", "output_total"),
                        output_value,
                    )
                else:
                    output_tensor = pb_utils.Tensor(
                        input_tensor.name().replace("input", "output_total"),
                        output_value,
                    )
                output_tensors.append(output_tensor)
            # Echo the request parameters back as a string tensor.
            output_parameters = np.array([request.parameters()]).astype(np.object_)
            output_tensors.append(
                pb_utils.Tensor("output_parameters", output_parameters)
            )
            responses.append(
                pb_utils.InferenceResponse(output_tensors=output_tensors)
            )
        return responses
worker/tests/python/integration/operators/models/multiply/config.pbtxt
0 → 100644
View file @
f1f29171
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
backend: "python"
worker/tests/python/integration/operators/models/postprocessing/1/model.py
0 → 100644
View file @
f1f29171
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
json
import
numpy
as
np
import
triton_python_backend_utils
as
pb_utils
# from transformers import LlamaTokenizer
# llama_tokenizer = LlamaTokenizer.from_pretrained("/path/to/hfmodel")
from
transformers
import
XLNetTokenizer
class TritonPythonModel:
    """Mock post-processing model: decodes token ids back into token strings."""

    def initialize(self, args):
        """Cache the numpy dtype of each declared output and load a tokenizer."""
        model_config = json.loads(args["model_config"])
        for output_name in ["OUTPUT"]:
            # e.g. stores self.output_dtype for the "OUTPUT" tensor.
            setattr(
                self,
                output_name.lower() + "_dtype",
                pb_utils.triton_string_to_numpy(
                    pb_utils.get_output_config_by_name(model_config, output_name)[
                        "data_type"
                    ]
                ),
            )
        # Using a mock hard coded auto-tokenizer
        self.tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")

    def execute(self, requests):
        """Convert each request's OUTPUT_IDS tensor into an OUTPUT token tensor."""
        responses = []
        # NOTE(review): the enumerate() index was unused and has been dropped.
        for request in requests:
            output_ids = pb_utils.get_input_tensor_by_name(
                request, "OUTPUT_IDS"
            ).as_numpy()
            output_result = np.array(
                self.tokenizer.convert_ids_to_tokens(output_ids.tolist())
            )
            print(f"Output Result\n\n{output_result}", flush=True)
            output_tensor = pb_utils.Tensor(
                "OUTPUT", output_result.astype(self.output_dtype)
            )
            responses.append(
                pb_utils.InferenceResponse(output_tensors=[output_tensor])
            )
        return responses
worker/tests/python/integration/operators/models/postprocessing/config.pbtxt
0 → 100644
View file @
f1f29171
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Emulates the pre-processing config from:
# https://github.com/triton-inference-server/tensorrtllm_backend/blob/main/all_models/inflight_batcher_llm/postprocessing/config.pbtxt
name: "postprocessing"
backend: "python"
max_batch_size: 0
input [
{
name: "OUTPUT_IDS"
data_type: TYPE_INT32
dims: [ -1 ]
},
{
name: "SEQUENCE_LENGTH"
data_type: TYPE_INT32
dims: [ 1 ]
}
# Add more inputs as per requirement.
# For simplicity only sticking with these
# inputs for preprocessing.
]
output [
{
name: "OUTPUT"
data_type: TYPE_STRING
dims: [ -1 ]
}
# Add more outputs as per requirement.
# For simplicity only sticking with these
# outputs for preprocessing.
]
instance_group [
{
count: 4
kind: KIND_CPU
}
]
worker/tests/python/integration/operators/models/preprocessing/1/model.py
0 → 100644
View file @
f1f29171
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
json
import
numpy
as
np
import
triton_python_backend_utils
as
pb_utils
# from transformers import LlamaTokenizer
# llama_tokenizer = LlamaTokenizer.from_pretrained("/path/to/hfmodel")
from
transformers
import
XLNetTokenizer
class TritonPythonModel:
    """
    This is a mock disaggregated serving pre-processing model.
    """

    def initialize(self, args):
        """Cache the numpy dtype of each declared output and load a tokenizer."""
        model_config = json.loads(args["model_config"])
        for output_name in ["INPUT_IDS", "INPUT_LENGTH", "REQUEST_OUTPUT_LEN"]:
            # e.g. stores self.input_ids_dtype for the "INPUT_IDS" tensor.
            setattr(
                self,
                output_name.lower() + "_dtype",
                pb_utils.triton_string_to_numpy(
                    pb_utils.get_output_config_by_name(model_config, output_name)[
                        "data_type"
                    ]
                ),
            )
        # Using a mock hard coded auto-tokenizer
        self.tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")

    def execute(self, requests):
        """Tokenize each request's query and forward the requested output length."""
        print("In preprocessing execute!", flush=True)
        responses = []
        # NOTE(review): the enumerate() index was unused and has been dropped.
        for request in requests:
            query = pb_utils.get_input_tensor_by_name(request, "query").as_numpy()
            request_output_len = pb_utils.get_input_tensor_by_name(
                request, "request_output_len"
            ).as_numpy()
            print(f"query(pre-proc) {query}", flush=True)
            tokenize = np.array(self.tokenizer.encode(query[0].decode()))
            print(f"tokenize(pre-proc) {tokenize.size}", flush=True)
            input_length = np.array([tokenize.size])
            # Just forwarding query to the pre-processed input_ids
            input_id_tensor = pb_utils.Tensor(
                "INPUT_IDS", tokenize.astype(self.input_ids_dtype)
            )
            input_length_tensor = pb_utils.Tensor(
                "INPUT_LENGTH", input_length.astype(self.input_length_dtype)
            )
            request_output_len_tensor = pb_utils.Tensor(
                "REQUEST_OUTPUT_LEN", request_output_len
            )
            responses.append(
                pb_utils.InferenceResponse(
                    output_tensors=[
                        input_id_tensor,
                        input_length_tensor,
                        request_output_len_tensor,
                    ]
                )
            )
        return responses
worker/tests/python/integration/operators/models/preprocessing/config.pbtxt
0 → 100644
View file @
f1f29171
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Emulates the pre-processing config from:
# https://github.com/triton-inference-server/tensorrtllm_backend/blob/main/all_models/inflight_batcher_llm/preprocessing/config.pbtxt
name: "preprocessing"
backend: "python"
max_batch_size: 0
input [
{
name: "query"
data_type: TYPE_STRING
dims: [ 1 ]
},
{
name: "request_output_len"
data_type: TYPE_INT32
dims: [ 1 ]
}
# Add more inputs as per requirement.
# For simplicity only sticking with these
# inputs for preprocessing.
]
output [
{
name: "INPUT_IDS"
data_type: TYPE_INT32
dims: [ -1 ]
},
{
name: "INPUT_LENGTH"
data_type: TYPE_INT32
dims: [ 1 ]
},
{
name: "REQUEST_OUTPUT_LEN"
data_type: TYPE_INT32
dims: [ 1 ]
}
# Add more outputs as per requirement.
# For simplicity only sticking with these
# outputs for preprocessing.
]
instance_group [
{
count: 4
kind: KIND_CPU
}
]
worker/tests/python/integration/test_add_multiply_divide.py
0 → 100644
View file @
f1f29171
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
asyncio
import
logging
import
sys
from
multiprocessing
import
Manager
,
Process
import
cupy
import
numpy
import
pytest
import
ucp
from
cupy_backends.cuda.api.runtime
import
CUDARuntimeError
from
triton_distributed.icp.nats_request_plane
import
NatsRequestPlane
from
triton_distributed.icp.ucp_data_plane
import
UcpDataPlane
from
triton_distributed.worker.log_formatter
import
LOGGER_NAME
from
triton_distributed.worker.operator
import
OperatorConfig
from
triton_distributed.worker.remote_operator
import
RemoteOperator
from
triton_distributed.worker.triton_core_operator
import
TritonCoreOperator
from
triton_distributed.worker.worker
import
WorkerConfig
# Test-wide constants: the NATS port the fixtures start the server on and
# the on-disk repositories the operators are loaded from.
NATS_PORT = 4223
MODEL_REPOSITORY = "/workspace/worker/tests/python/integration/operators/models"
WORKFLOW_REPOSITORY = "/workspace/worker/tests/python/integration/operators"
TRITON_LOG_LEVEL = 6

logger = logging.getLogger(LOGGER_NAME)

# Run cupy's cuda.is_available once to
# avoid the exception hitting runtime code.
# (Rewritten from the `if ...: pass / else:` form to the direct negation.)
try:
    if not cupy.cuda.is_available():
        print("CUDA not available.")
except CUDARuntimeError:
    print("CUDA not available")

# TODO
# Decide if this should be
# pre merge, nightly, or weekly
pytestmark = pytest.mark.pre_merge
@pytest.fixture
def workers(worker_manager, request):
    """Start one worker per operator (add, multiply, divide, workflow).

    Yields the worker handles once every worker has reported READY and
    cleans them up afterwards.

    Raises:
        Exception: if any worker process fails to report READY.
    """
    worker_config = WorkerConfig(
        request_plane=NatsRequestPlane,
        data_plane=UcpDataPlane,
        request_plane_args=([], {"request_plane_uri": f"nats://localhost:{NATS_PORT}"}),
        log_level=TRITON_LOG_LEVEL,
    )
    store_outputs_in_response = request.getfixturevalue("store_outputs_in_response")
    parameters = {"store_outputs_in_response": store_outputs_in_response}

    # The three arithmetic operators differ only by name, so build them in a
    # loop instead of repeating the same OperatorConfig three times.
    operator_configs = {
        name: OperatorConfig(
            name=name,
            implementation=TritonCoreOperator,
            version=1,
            max_inflight_requests=10,
            parameters=parameters,
            repository=MODEL_REPOSITORY,
        )
        for name in ("add", "multiply", "divide")
    }
    # The workflow operator is implemented as a Python module, not a Triton
    # core model, and lives in its own repository.
    operator_configs["add_multiply_divide"] = OperatorConfig(
        name="add_multiply_divide",
        implementation="add_multiply_divide:AddMultiplyDivide",
        version=1,
        max_inflight_requests=10,
        parameters=parameters,
        repository=WORKFLOW_REPOSITORY,
    )

    with Manager() as manager:
        workers = []
        queues = []
        for name, config in operator_configs.items():
            # Each worker reports its startup status through its own queue.
            queues.append(manager.Queue(maxsize=1))
            workers.append(
                worker_manager.setup_worker_process(
                    [config], name, queues[-1], worker_config
                )
            )

        # Block until every worker reports a startup status.
        status_list = [queue.get() for queue in queues]
        if any(status != "READY" for status in status_list):
            worker_manager.cleanup_workers(workers, check_status=False)
            raise Exception(f"Failed to start worker processes: {status_list}")
        yield workers
        worker_manager.cleanup_workers(workers)
def
_create_inputs
(
number
,
size
):
inputs
=
[]
outputs
=
[]
for
index
in
range
(
number
):
input_
=
numpy
.
random
.
randint
(
low
=
1
,
high
=
100
,
size
=
[
2
,
size
])
expected_
=
{}
expected_
[
"add_int64_output_total"
]
=
numpy
.
array
([[
input_
.
sum
()]])
expected_
[
"add_int64_output_partial"
]
=
numpy
.
array
([[
x
.
sum
()
for
x
in
input_
]])
expected_
[
"multiply_int64_output_total"
]
=
numpy
.
array
(
[[
x
.
prod
()
for
x
in
expected_
[
"add_int64_output_partial"
]]]
)
divisor
=
expected_
[
"add_int64_output_total"
][
0
][
0
]
dividends
=
expected_
[
"add_int64_output_partial"
]
expected_
[
"divide_fp64_output_partial"
]
=
numpy
.
array
(
[
numpy
.
divide
(
dividends
,
divisor
)]
)
inputs
.
append
(
input_
)
outputs
.
append
(
expected_
)
return
inputs
,
outputs
async def post_requests(num_requests, store_inputs_in_request):
    """Send `num_requests` requests to the add_multiply_divide workflow and
    verify every response against the locally computed expected outputs.

    Connects its own request/data planes so it can run in a fresh process.
    """
    ucp.reset()
    data_plane = UcpDataPlane()
    data_plane.connect()
    request_plane = NatsRequestPlane(f"nats://localhost:{NATS_PORT}")
    await request_plane.connect()

    add_multiply_divide_model = RemoteOperator(
        "add_multiply_divide", 1, request_plane, data_plane
    )
    results = []
    expected_results = {}
    inputs, outputs = _create_inputs(num_requests, 40)
    for i, input_ in enumerate(inputs):
        request_id = str(i)
        request = add_multiply_divide_model.create_request(
            inputs={"int64_input": input_}, request_id=request_id
        )
        if store_inputs_in_request:
            request.store_inputs_in_request.add("int64_input")
        print(request)
        results.append(add_multiply_divide_model.async_infer(request))
        expected_results[request_id] = outputs[i]

    for result in asyncio.as_completed(results):
        responses = await result
        async for response in responses:
            print(response)
            for output_name, expected_value in expected_results[
                response.request_id
            ].items():
                output = response.outputs[output_name]
                output_value = numpy.from_dlpack(output.to_host())
                numpy.testing.assert_equal(output_value, expected_value)
                # Release the remote buffer reference promptly.
                del output
            print(expected_results[response.request_id])
            del response

    # NOTE(review): a duplicate, unused ``timeout = 5`` assignment earlier in
    # this function was dead code and has been removed.
    timeout = 5
    data_plane.close(timeout)
    await request_plane.close()
def run(num_requests, store_inputs_in_request=False):
    """Entry point for the client subprocess; exits with the coroutine's status."""
    outcome = asyncio.run(
        post_requests(
            num_requests=num_requests,
            store_inputs_in_request=store_inputs_in_request,
        )
    )
    sys.exit(outcome)
@pytest.mark.skipif(
    "(not os.path.exists('/usr/local/bin/nats-server'))",
    reason="NATS.io not present",
)
@pytest.mark.timeout(30)
@pytest.mark.parametrize(
    ["store_inputs_in_request", "store_outputs_in_response"],
    [(False, False), (True, True)],
)
def test_add_multiply_divide(
    request,
    nats_server,
    workers,
    store_inputs_in_request,
    store_outputs_in_response,
):
    """End-to-end check of the add/multiply/divide workflow with 2 requests."""
    # Using a separate process to use data plane across multiple tests.
    client = Process(target=run, args=(2, store_inputs_in_request))
    client.start()
    client.join()
    assert client.exitcode == 0
worker/tests/python/unit/test_args.py
0 → 100644
View file @
f1f29171
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
pytest
from
triton_distributed.worker.parser
import
Parser
"""
Tests for parsing the arguments by command line parser
"""
@pytest.fixture
def default_values():
    """Default CLI values the parser is expected to fall back to."""
    # Add default values for the command-line interface
    # TODO: Add the default options for the worker executable here
    return dict(
        request_plane_uri="nats://localhost:4222",
        log_level=0,
    )
def test_parse_args_default(default_values):
    """Parser with no arguments must fall back to the documented defaults."""
    args, parser = Parser.parse_args([])
    assert args.request_plane_uri == default_values["request_plane_uri"]
    assert args.log_level == default_values["log_level"]
    # pytest-idiomatic asserts replace the manual `raise Exception` checks;
    # pytest reports either as a test failure, but asserts show the values.
    assert not args.operators, f"Expected no operators by default, got {args.operators}"
    assert (
        not args.operator_configs
    ), f"Expected no operators by default, got {args.operator_configs}"
@pytest.mark.parametrize(
    "valid_request_plane_uri",
    [
        "https://example.com",
        # Add valid request plane uri values
    ],
)
def test_parse_args_valid_request_plane_uri(valid_request_plane_uri):
    """A user-supplied --request-plane-uri must be parsed through unchanged."""
    parsed_args, _ = Parser.parse_args(
        ["--request-plane-uri", valid_request_plane_uri]
    )
    assert parsed_args.request_plane_uri == valid_request_plane_uri
def clean_argument_list(args_list):
    """Return a copy of *args_list* with every None entry removed."""
    cleaned = []
    for item in args_list:
        if item is not None:
            cleaned.append(item)
    return cleaned
@pytest.mark.parametrize(
    "first_arg, second_arg, third_arg",
    [
        ("name:abc", "version:1", "max_inflight_requests:5"),
        ("name:abc", "max_inflight_requests:5", None),
        ("name:abc", "version:1", None),
        ("name:abc", None, None),
        # Add valid cases
    ],
)
def test_parse_args_valid_model(first_arg, second_arg, third_arg, tmp_path):
    """--operator with a valid model spec must round-trip through the parser."""
    # Create an on-disk repository containing the model directory "abc".
    model_repo_path = tmp_path / "model_repo"
    model_repo_path.mkdir()
    (model_repo_path / "abc").mkdir()
    # Tests with valid arguments
    model_args = clean_argument_list(
        [
            first_arg,
            second_arg,
            third_arg,
            f"repository:{model_repo_path}",
            "module:worker.triton_core_operator:TritonCoreOperator",
        ]
    )
    print(model_args)
    args, _ = Parser.parse_args(["--operator"] + model_args)
    assert args.operators[0] == model_args
def test_parse_args_invalid_operator(capsys):
    """--operator without any value must exit and print a usage error."""
    with pytest.raises(SystemExit):
        Parser.parse_args(["--operator"])
    stderr_text = capsys.readouterr().err
    assert "expected at least one argument" in stderr_text
@pytest.mark.parametrize(
    "first_arg, second_arg, third_arg",
    [
        ("name:abc", "version:1", "max_inflight_requests:5"),
        ("name:abc", "max_inflight_requests:5", None),
        ("name:abc", "version:1", None),
        # TODO: Revisit can be uncommented once the operator module can be inferred automatically.
        # ("abc", None, None),
        # Add valid cases
    ],
)
def test_parse_args_valid_operator(first_arg, second_arg, third_arg, tmp_path):
    """--operator with an explicit module spec must round-trip through the parser."""
    repo_path = tmp_path / "worker_repo"
    repo_path.mkdir()
    (repo_path / "abc").mkdir()
    # Tests with valid arguments
    operator_args = clean_argument_list([first_arg, second_arg, third_arg])
    full_args = operator_args + ["module:dummyworkflow:Workflow"]
    args, _ = Parser.parse_args(["--operator"] + full_args)
    assert args.operators[0] == full_args
worker/tests/python/unit/test_logger.py
0 → 100644
View file @
f1f29171
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
logging
import
pytest
from
triton_distributed.worker.log_formatter
import
LOGGER_NAME
,
setup_logger
logger
=
logging
.
getLogger
(
LOGGER_NAME
)
# Shared message logged at every level by logging_function.
MSG = "This is a sample message"

"""
Tests for Logging module
"""


def logging_function(logger):
    """Emit one record per level on *logger*.

    Produces, in order: info, warning, exception (recorded at ERROR level,
    with traceback), error, debug.
    """
    logger.info(MSG)
    logger.warning(MSG)
    try:
        raise Exception("This is an exception")
    except Exception:
        logger.exception(MSG)
    logger.error(MSG)
    logger.debug(MSG)
@pytest.fixture
def reset_logger(caplog):
    """Reset every known logger to a clean state before a log-level test.

    Detaches and closes all handlers, resets levels to NOTSET, re-enables
    propagation, and clears any records caplog captured so far.
    """
    all_loggers = [
        logging.getLogger(name) for name in logging.root.manager.loggerDict
    ]
    all_loggers.append(logging.getLogger())
    for current in all_loggers:
        # Copy the handler list since removeHandler mutates it.
        for handler in current.handlers[:]:
            current.removeHandler(handler)
            handler.close()
        current.setLevel(logging.NOTSET)
        current.propagate = True
    caplog.clear()
@pytest.mark.parametrize(
    "log_level, expected_record_counts",
    [
        # For log-level 0 only error and exception should be recorded
        (0, 2),
        # For log-level 1 only info, error, exception and warning should be recorded
        (1, 4),
        # All logs(error, exception, info, debug and warning) should be printed for log-level 2
        (2, 5),
    ],
)
def test_logging(reset_logger, caplog, log_level, expected_record_counts):
    """Each configured log level must record exactly the expected count."""
    caplog.set_level(log_level)
    setup_logger(log_level=log_level)
    logging_function(logger)
    assert len(caplog.records) == expected_record_counts
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment