feat: hello world

Co-authored-by: Piotr Marcinkiewicz <piotrm@nvidia.com>
Co-authored-by: Tanmay Verma <tanmay2592@gmail.com>

feat: hello world
Co-authored-by: Piotr Marcinkiewicz <piotrm@nvidia.com>
Co-authored-by: Tanmay Verma <tanmay2592@gmail.com>
4698c0f4 · Neelay Shah · GitHub · e6c12674 · 4698c0f4 · 4698c0f4
Commit 4698c0f4 authored Jan 17, 2025 by Neelay Shah Committed by GitHub Jan 17, 2025
10 changed files
--- a/worker/src/python/triton_distributed/worker/remote_operator.py
+++ b/worker/src/python/triton_distributed/worker/remote_operator.py
@@ -29,14 +29,17 @@ from tritonserver import InvalidArgumentError
 class RemoteOperator:
    def __init__(
        self,
-        name: str,
-        version: int,
+        operator: str | tuple[str, int],
        request_plane: RequestPlane,
        data_plane: DataPlane,
        component_id: Optional[uuid.UUID] = None,
    ):
-        self.name = name
-        self.version = version
+        if isinstance(operator, str):
+            self.name = operator
+            self.version = 1
+        else:
+            self.name = operator[0]
+            self.version = operator[1]
        self._request_plane = request_plane
        self._data_plane = data_plane
        self.component_id = component_id

--- a/worker/src/python/triton_distributed/worker/worker.py
+++ b/worker/src/python/triton_distributed/worker/worker.py
@@ -16,7 +16,6 @@
 import asyncio
 import importlib
 import logging
-import multiprocessing
 import os
 import pathlib
 import signal
@@ -50,7 +49,7 @@ class WorkerConfig:
    data_plane: Type[DataPlane] = UcpDataPlane
    request_plane_args: tuple[list, dict] = field(default_factory=lambda: ([], {}))
    data_plane_args: tuple[list, dict] = field(default_factory=lambda: ([], {}))
-    log_level: int = 0
+    log_level: Optional[int] = None
    operators: list[OperatorConfig] = field(default_factory=list)
    triton_log_path: Optional[str] = None
    name: str = str(uuid.uuid1())
@@ -75,6 +74,8 @@ class Worker:
        self._triton_log_path = config.triton_log_path
        self._name = config.name
        self._log_level = config.log_level
+        if self._log_level is None:
+            self._log_level = 0
        self._operator_configs = config.operators
        self._log_dir = config.log_dir

@@ -87,6 +88,7 @@ class Worker:
        self._operators: dict[tuple[str, int], Operator] = {}
        self._metrics_port = config.metrics_port
        self._metrics_server: Optional[uvicorn.Server] = None
+        self._component_id = self._request_plane.component_id

    def _import_operators(self):
        for operator_config in self._operator_configs:
@@ -225,6 +227,7 @@ class Worker:
        await asyncio.gather(*handlers)

    async def serve(self):
+        error = None
        self._triton_core = tritonserver.Server(
            model_repository=".",
            log_error=True,
@@ -258,6 +261,7 @@ class Worker:
        except Exception as e:
            logger.exception("Encountered an error in worker: %s", e)
            self._stop_requested = True
+            error = e
        logger.info("worker store: %s", list(self._data_plane._tensor_store.keys()))
        logger.info("Worker stopped...")
        logger.info(
@@ -272,6 +276,7 @@ class Worker:
        if self._metrics_server:
            self._metrics_server.should_exit = True
            await self._metrics_server.shutdown()
+        return error

    async def shutdown(self, signal):
        logger.info("Received exit signal %s...", signal.name)
@@ -326,13 +331,20 @@ class Worker:
        loop.stop()

    def start(self):
+        exit_condition = None
+
        if self._log_dir:
+            pid = os.getpid()
            os.makedirs(self._log_dir, exist_ok=True)
-            stdout_path = os.path.join(self._log_dir, f"{self._name}.stdout.log")
-            stderr_path = os.path.join(self._log_dir, f"{self._name}.stderr.log")
+            stdout_path = os.path.join(
+                self._log_dir, f"{self._name}.{self._component_id}.{pid}.stdout.log"
+            )
+            stderr_path = os.path.join(
+                self._log_dir, f"{self._name}.{self._component_id}.{pid}.stderr.log"
+            )
            if not self._triton_log_path:
                self._triton_log_path = os.path.join(
-                    self._log_dir, f"{self._name}.triton.log"
+                    self._log_dir, f"{self._name}.{self._component_id}.{pid}.triton.log"
                )
            sys.stdout = open(stdout_path, "w", buffering=1)
            sys.stderr = open(stderr_path, "w", buffering=1)
@@ -349,55 +361,34 @@ class Worker:
            loop.add_signal_handler(
                sig, lambda s=sig: asyncio.create_task(self.shutdown(s))  # type: ignore
            )
+        serve_result = None
        try:
            if self._metrics_port:
-                loop.create_task(self.serve())
+                serve_result = loop.create_task(self.serve())
                self._metrics_server = self._setup_metrics_server()
                assert self._metrics_server, "Unable to start metrics server"
                loop.run_until_complete(self._metrics_server.serve())
            else:
-                loop.run_until_complete(self.serve())
+                serve_result = loop.run_until_complete(self.serve())
        except asyncio.CancelledError:
-            pass
            logger.info("Worker cancelled!")
        finally:
            loop.run_until_complete(self._wait_for_tasks(loop))
            loop.close()
            logger.info("Successfully shutdown worker.")
+            if isinstance(serve_result, asyncio.Task):
+                exit_condition = serve_result.result()
+            else:
+                exit_condition = serve_result
+
            sys.stdout.flush()
            sys.stderr.flush()
+
            if self._log_dir:
                sys.stdout.close()
                sys.stderr.close()

-
-class Deployment:
-    def __init__(self, worker_configs: list[WorkerConfig]):
-        self._process_context = multiprocessing.get_context("spawn")
-        self._worker_configs = worker_configs
-        self._workers: list[multiprocessing.context.SpawnProcess] = []
-
-    @staticmethod
-    def _start_worker(worker_config):
-        Worker(worker_config).start()
-
-    def start(self):
-        for worker_config in self._worker_configs:
-            self._workers.append(
-                self._process_context.Process(
-                    target=Deployment._start_worker,
-                    name=worker_config.name,
-                    args=[worker_config],
-                )
-            )
-
-    def shutdown(self, join=True, timeout=10):
-        for worker in self._workers:
-            worker.terminate()
-        if join:
-            for worker in self._workers:
-                worker.join(timeout)
-            for worker in self._workers:
-                if worker.is_alive():
-                    worker.kill()
-                    worker.join(timeout)
+        if exit_condition is not None:
+            sys.exit(1)
+        else:
+            sys.exit(0)
--- a/worker/tests/python/integration/api_server/models/mock_disaggregated_serving/1/model.py
+++ b/worker/tests/python/integration/api_server/models/mock_disaggregated_serving/1/model.py
-# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions
-# are met:
-#  * Redistributions of source code must retain the above copyright
-#    notice, this list of conditions and the following disclaimer.
-#  * Redistributions in binary form must reproduce the above copyright
-#    notice, this list of conditions and the following disclaimer in the
-#    documentation and/or other materials provided with the distribution.
-#  * Neither the name of NVIDIA CORPORATION nor the names of its
-#    contributors may be used to endorse or promote products derived
-#    from this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
-# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
-# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
-# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.

 import asyncio
 import gc
@@ -162,7 +151,7 @@ class TritonPythonModel:
            "string_value"
        ]
        self._remote_operator = RemoteOperator(
-            self._remote_worker_name, 1, self._request_plane, self._data_plane
+            self._remote_worker_name, self._request_plane, self._data_plane
        )

        # Starting the response thread. It allows API Server to keep making progress while

--- a/worker/tests/python/integration/api_server/models/mock_disaggregated_serving/config.pbtxt
+++ b/worker/tests/python/integration/api_server/models/mock_disaggregated_serving/config.pbtxt
-# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions
-# are met:
-#  * Redistributions of source code must retain the above copyright
-#    notice, this list of conditions and the following disclaimer.
-#  * Redistributions in binary form must reproduce the above copyright
-#    notice, this list of conditions and the following disclaimer in the
-#    documentation and/or other materials provided with the distribution.
-#  * Neither the name of NVIDIA CORPORATION nor the names of its
-#    contributors may be used to endorse or promote products derived
-#    from this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
-# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
-# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
-# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.

 name: "mock_disaggregated_serving"
 backend: "python"

--- a/worker/tests/python/integration/operators/add_multiply_divide.py
+++ b/worker/tests/python/integration/operators/add_multiply_divide.py
@@ -35,14 +35,12 @@ class AddMultiplyDivide(Operator):
        self._request_plane = request_plane
        self._data_plane = data_plane
        self._parameters = parameters
-        self._add_model = RemoteOperator(
-            "add", 1, self._request_plane, self._data_plane
-        )
+        self._add_model = RemoteOperator("add", self._request_plane, self._data_plane)
        self._multiply_model = RemoteOperator(
-            "multiply", 1, self._request_plane, self._data_plane
+            "multiply", self._request_plane, self._data_plane
        )
        self._divide_model = RemoteOperator(
-            "divide", 1, self._request_plane, self._data_plane
+            "divide", self._request_plane, self._data_plane
        )

    async def execute(self, requests: list[RemoteInferenceRequest]):

--- a/worker/tests/python/integration/operators/mock_disaggregated_serving.py
+++ b/worker/tests/python/integration/operators/mock_disaggregated_serving.py
@@ -37,16 +37,16 @@ class MockDisaggregatedServing(Operator):
        self._data_plane = data_plane
        self._params = params
        self._preprocessing_model = RemoteOperator(
-            "preprocessing", 1, self._request_plane, self._data_plane
+            "preprocessing", self._request_plane, self._data_plane
        )
        self._context_model = RemoteOperator(
-            "context", 1, self._request_plane, self._data_plane
+            "context", self._request_plane, self._data_plane
        )
        self._generate_model = RemoteOperator(
-            "generation", 1, self._request_plane, self._data_plane
+            "generation", self._request_plane, self._data_plane
        )
        self._postprocessing_model = RemoteOperator(
-            "postprocessing", 1, self._request_plane, self._data_plane
+            "postprocessing", self._request_plane, self._data_plane
        )
        self._logger = logger


--- a/worker/tests/python/integration/test_add_multiply_divide.py
+++ b/worker/tests/python/integration/test_add_multiply_divide.py
@@ -160,7 +160,7 @@ async def post_requests(num_requests, store_inputs_in_request):
    await request_plane.connect()

    add_multiply_divide_operator = RemoteOperator(
-        "add_multiply_divide", 1, request_plane, data_plane
+        "add_multiply_divide", request_plane, data_plane
    )

    results = []

--- a/worker/tests/python/integration/test_direct.py
+++ b/worker/tests/python/integration/test_direct.py
@@ -115,7 +115,7 @@ async def post_requests(num_requests, num_targets):
    request_plane = NatsRequestPlane(f"nats://localhost:{NATS_PORT}")
    await request_plane.connect()

-    identity_operator = RemoteOperator("identity", 1, request_plane, data_plane)
+    identity_operator = RemoteOperator("identity", request_plane, data_plane)

    target_components = set()
    target_component_list: list[uuid.UUID] = []

--- a/worker/tests/python/integration/test_mock_disaggregated_serving.py
+++ b/worker/tests/python/integration/test_mock_disaggregated_serving.py
@@ -156,7 +156,7 @@ async def post_requests(num_requests):
    await request_plane.connect()

    mock_disaggregated_serving_operator = RemoteOperator(
-        "mock_disaggregated_serving", 1, request_plane, data_plane
+        "mock_disaggregated_serving", request_plane, data_plane
    )

    expected_results = {}

--- a/worker/tests/python/integration/test_perf_benchmark.py
+++ b/worker/tests/python/integration/test_perf_benchmark.py
@@ -133,7 +133,7 @@ def run(
    asyncio.get_event_loop().run_until_complete(request_plane.connect())

    identity_operator = RemoteOperator(
-        "identity", 1, request_plane, data_plane_tracker._data_plane
+        "identity", request_plane, data_plane_tracker._data_plane
    )

    inputs, outputs = _create_inputs(1, tensor_size_in_kb)