Commit 03b0101e authored by Ryan Olson's avatar Ryan Olson Committed by GitHub
Browse files

feat: add python bindings + wheel build (#94)


Co-authored-by: default avatarRyan McCormick <rmccormick@nvidia.com>
Co-authored-by: default avatarNeelay Shah <neelays@nvidia.com>
parent ffbc06cc
...@@ -55,3 +55,21 @@ api_server_models/ ...@@ -55,3 +55,21 @@ api_server_models/
server/ server/
**/*backups* **/*backups*
### Rust ###
# Generated by Cargo
# will have compiled files and executables
debug/
target/
### Virtual Environment ###
.venv/
### Ruff ###
.ruff_cache/
### Python ###
__pycache__/
*.py[cod]
*$py.class
*.so
...@@ -52,8 +52,18 @@ tmp_path_retention_policy = "failed" ...@@ -52,8 +52,18 @@ tmp_path_retention_policy = "failed"
# NOTE # NOTE
# We ignore model.py explicitly here to avoid mypy errors with duplicate modules # We ignore model.py explicitly here to avoid mypy errors with duplicate modules
# pytest overrides the default mypy exclude configuration and so we exclude here as well # pytest overrides the default mypy exclude configuration and so we exclude here as well
addopts = [
addopts = ["-ra", "--showlocals", "--strict-markers", "--strict-config", "--mypy", "--ignore-glob=*model.py"] "-ra",
"--showlocals",
"--strict-markers",
"--strict-config",
"--mypy",
"--ignore-glob=*model.py",
# FIXME: Get relative/generic blob paths to work here
# Ignore rust<->python bindings until python package is built/installed in environment
"--ignore-glob=/workspace/runtime/rust/python-wheel/python/triton_distributed_rs/*.py",
"--ignore-glob=/workspace/runtime/rust/python-wheel/python/triton_distributed_rs/*.pyi",
]
xfail_strict = true xfail_strict = true
log_cli_level = "INFO" log_cli_level = "INFO"
filterwarnings = [ filterwarnings = [
...@@ -83,10 +93,6 @@ indent-width = 4 ...@@ -83,10 +93,6 @@ indent-width = 4
# throughout project such as launch_workers.py # throughout project such as launch_workers.py
# explicit_package_bases = true # explicit_package_bases = true
# NOTE
# We ignore model.py explicitly here to avoid mypy errors with duplicate modules
exclude = ["model.py"]
# --ignore-missing-imports: WAR too many errors when developing outside # --ignore-missing-imports: WAR too many errors when developing outside
# of container environment with PYTHONPATH set and packages installed. # of container environment with PYTHONPATH set and packages installed.
# NOTE: Can possibly move mypy from pre-commit to a github action run only in # NOTE: Can possibly move mypy from pre-commit to a github action run only in
......
...@@ -3,7 +3,9 @@ name = "triton-distributed" ...@@ -3,7 +3,9 @@ name = "triton-distributed"
version = "0.1.1" version = "0.1.1"
edition = "2021" edition = "2021"
authors = ["NVIDIA"] authors = ["NVIDIA"]
license = "Apache-2.0"
homepage = "https://github.com/triton-inference-server/triton_distributed" homepage = "https://github.com/triton-inference-server/triton_distributed"
repository = "https://github.com/triton-inference-server/triton_distributed"
[dependencies] [dependencies]
# workspace - when we expand to multiple crates; put these in the workspace # workspace - when we expand to multiple crates; put these in the workspace
......
/target
python/triton_distributed/*.so
This diff is collapsed.
[package]
name = "triton_distributed_py3"
version = "0.1.1"
edition = "2021"
authors = ["NVIDIA"]
license = "Apache-2.0"
homepage = "https://github.com/triton-inference-server/triton_distributed"
repository = "https://github.com/triton-inference-server/triton_distributed"
[lib]
path = "rust/lib.rs"
name = "_core"
# "cdylib" is necessary to produce a shared library for Python to import from.
crate-type = ["cdylib"]
[dependencies]
triton-distributed = { version = "0.1.1", path = "../" }
futures = "0.3"
once_cell = "1"
serde = "1"
serde_json = "1.0.138"
tokio = { version = "1", features = ["full"] }
tokio-stream = "0"
tracing = "0"
# "extension-module" tells pyo3 we want to build an extension module (skips linking against libpython.so)
# "abi3-py39" tells pyo3 (and maturin) to build using the stable ABI with minimum Python version 3.9
pyo3 = { version = "0.23.4", default-features = false, features = [
"macros",
"experimental-async",
"experimental-inspect",
"extension-module",
"py-clone",
] }
pyo3-async-runtimes = { version = "0.23.0", default-features = false, features = [
"attributes",
"testing",
"tokio-runtime",
"unstable-streams",
] }
pythonize = "0.23"
import asyncio
import uvloop
from triton_distributed_rs import DistributedRuntime, triton_worker
uvloop.install()
class RequestHandler:
async def generate(self, request):
for char in request:
yield char
yield char
@triton_worker()
async def worker(runtime: DistributedRuntime):
component = runtime.namespace("examples/bls").component("bar")
await component.create_service()
endpoint = component.endpoint("generate")
await endpoint.serve_endpoint(RequestHandler().generate)
asyncio.run(worker())
import asyncio
import uvloop
from triton_distributed_rs import DistributedRuntime, triton_worker
uvloop.install()
@triton_worker()
async def worker(runtime: DistributedRuntime):
foo = (
await runtime.namespace("examples/bls")
.component("foo")
.endpoint("generate")
.client()
)
bar = (
await runtime.namespace("examples/bls")
.component("bar")
.endpoint("generate")
.client()
)
# hello world showed us the client has a .generate, which uses the default load balancer
# however, you can explicitly opt-in to client side load balancing by using the `round_robin`
# or `random` methods on client. note - there is a direct method as well, but that is for a
# router example
async for char in await foo.round_robin("hello world"):
# the responses are sse-style responses, so we extract the data key
async for x in await bar.random(char.get("data")):
print(x)
asyncio.run(worker())
import asyncio
import uvloop
from triton_distributed_rs import DistributedRuntime, triton_worker
uvloop.install()
class RequestHandler:
async def generate(self, request):
for char in request:
yield char
@triton_worker()
async def worker(runtime: DistributedRuntime):
component = runtime.namespace("examples/bls").component("foo")
await component.create_service()
endpoint = component.endpoint("generate")
await endpoint.serve_endpoint(RequestHandler().generate)
asyncio.run(worker())
import asyncio
import uvloop
from triton_distributed_rs import DistributedRuntime, triton_worker
@triton_worker()
async def worker(runtime: DistributedRuntime):
await init(runtime, "triton-init")
async def init(runtime: DistributedRuntime, ns: str):
"""
Instantiate a `backend` client and call the `generate` endpoint
"""
# get endpoint
endpoint = runtime.namespace(ns).component("backend").endpoint("generate")
# create client
client = await endpoint.client()
# wait for an endpoint to be ready
await client.wait_for_endpoints()
# issue request
stream = await client.generate("hello world")
error_count = 0
try:
# process response
async for char in stream:
print(char)
except ValueError:
error_count += 1
pass
finally:
assert error_count == 1
stream = await client.generate("hello earth", annotated=False)
async for char in stream:
print(char)
if __name__ == "__main__":
uvloop.install()
asyncio.run(worker())
import asyncio
import random
import string
import uvloop
from client import init as client_init
from server import init as server_init
from triton_distributed_rs import DistributedRuntime, triton_worker
def random_string(length=10):
chars = string.ascii_letters + string.digits # a-z, A-Z, 0-9
return "".join(random.choices(chars, k=length))
@triton_worker()
async def worker(runtime: DistributedRuntime):
ns = random_string()
task = asyncio.create_task(server_init(runtime, ns))
await client_init(runtime, ns)
runtime.shutdown()
await task
if __name__ == "__main__":
uvloop.install()
asyncio.run(worker())
import asyncio
import uvloop
from triton_distributed_rs import DistributedRuntime, triton_worker
class RequestHandler:
"""
Request handler for the generate endpoint
"""
async def generate(self, request):
print(f"Received request: {request}")
for char in request:
if char == "w":
raise ValueError("w is not allowed")
yield char
@triton_worker()
async def worker(runtime: DistributedRuntime):
await init(runtime, "triton-init")
async def init(runtime: DistributedRuntime, ns: str):
"""
Instantiate a `backend` component and serve the `generate` endpoint
A `Component` can serve multiple endpoints
"""
component = runtime.namespace(ns).component("backend")
await component.create_service()
endpoint = component.endpoint("generate")
print("Started server instance")
await endpoint.serve_endpoint(RequestHandler().generate)
if __name__ == "__main__":
uvloop.install()
asyncio.run(worker())
import asyncio
import uvloop
from triton_distributed_rs import DistributedRuntime, triton_worker
@triton_worker()
async def worker(runtime: DistributedRuntime):
await init(runtime, "triton-init")
async def init(runtime: DistributedRuntime, ns: str):
"""
Instantiate a `backend` client and call the `generate` endpoint
"""
# get endpoint
endpoint = runtime.namespace(ns).component("backend").endpoint("generate")
# create client
client = await endpoint.client()
# wait for an endpoint to be ready
await client.wait_for_endpoints()
# issue request
stream = await client.generate("hello world")
# process the stream
async for char in stream:
print(char)
if __name__ == "__main__":
uvloop.install()
asyncio.run(worker())
import asyncio
import random
import string
import uvloop
from client import init as client_init
from server import init as server_init
from triton_distributed_rs import DistributedRuntime, triton_worker
def random_string(length=10):
chars = string.ascii_letters + string.digits # a-z, A-Z, 0-9
return "".join(random.choices(chars, k=length))
@triton_worker()
async def worker(runtime: DistributedRuntime):
ns = random_string()
task = asyncio.create_task(server_init(runtime, ns))
await client_init(runtime, ns)
runtime.shutdown()
await task
if __name__ == "__main__":
uvloop.install()
asyncio.run(worker())
import asyncio
import uvloop
from triton_distributed_rs import DistributedRuntime, triton_worker
class RequestHandler:
"""
Request handler for the generate endpoint
"""
async def generate(self, request):
print(f"Received request: {request}")
for char in request:
yield char
@triton_worker()
async def worker(runtime: DistributedRuntime):
await init(runtime, "triton-init")
async def init(runtime: DistributedRuntime, ns: str):
"""
Instantiate a `backend` component and serve the `generate` endpoint
A `Component` can serve multiple endpoints
"""
component = runtime.namespace(ns).component("backend")
await component.create_service()
endpoint = component.endpoint("generate")
print("Started server instance")
await endpoint.serve_endpoint(RequestHandler().generate)
if __name__ == "__main__":
uvloop.install()
asyncio.run(worker())
import asyncio
import uvloop
from triton_distributed_rs import DistributedRuntime, triton_worker
uvloop.install()
class RequestHandler:
async def generate(self, request):
request = f"{request}-back"
for char in request:
yield char
@triton_worker()
async def worker(runtime: DistributedRuntime):
component = runtime.namespace("examples/pipeline").component("backend")
await component.create_service()
endpoint = component.endpoint("generate")
await endpoint.serve_endpoint(RequestHandler().generate)
asyncio.run(worker())
import asyncio
import uvloop
from triton_distributed_rs import DistributedRuntime, triton_worker
uvloop.install()
class RequestHandler:
def __init__(self, next):
self.next = next
async def generate(self, request):
request = f"{request} front"
async for output in await self.next.round_robin(request):
yield output.get("data")
@triton_worker()
async def worker(runtime: DistributedRuntime):
# client to the next component - in this case the middle component
next = (
await runtime.namespace("examples/pipeline")
.component("middle")
.endpoint("generate")
.client()
)
# create endpoint service for frontend component
component = runtime.namespace("examples/pipeline").component("frontend")
await component.create_service()
endpoint = component.endpoint("generate")
handler = RequestHandler(next)
await endpoint.serve_endpoint(handler.generate)
asyncio.run(worker())
import asyncio
import uvloop
from triton_distributed_rs import DistributedRuntime, triton_worker
uvloop.install()
class RequestHandler:
def __init__(self, backend):
self.backend = backend
async def generate(self, request):
request = f"{request}-mid"
async for output in await self.backend.random(request):
yield output.get("data")
@triton_worker()
async def worker(runtime: DistributedRuntime):
# client to backend
backend = (
await runtime.namespace("examples/pipeline")
.component("backend")
.endpoint("generate")
.client()
)
# create endpoint service for middle component
component = runtime.namespace("examples/pipeline").component("middle")
await component.create_service()
endpoint = component.endpoint("generate")
await endpoint.serve_endpoint(RequestHandler(backend).generate)
asyncio.run(worker())
import asyncio
import uvloop
from triton_distributed_rs import DistributedRuntime, triton_worker
uvloop.install()
@triton_worker()
async def worker(runtime: DistributedRuntime):
"""
# Pipeline Example
This example demonstrates how to create a pipeline of components:
- `frontend` call `middle` which calls `backend`
- each component transforms the request before passing it to the backend
"""
pipeline = (
await runtime.namespace("examples/pipeline")
.component("frontend")
.endpoint("generate")
.client()
)
async for char in await pipeline.round_robin("hello from"):
print(char)
asyncio.run(worker())
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment