Commit 03b0101e authored by Ryan Olson's avatar Ryan Olson Committed by GitHub
Browse files

feat: add python bindings + wheel build (#94)


Co-authored-by: default avatarRyan McCormick <rmccormick@nvidia.com>
Co-authored-by: default avatarNeelay Shah <neelays@nvidia.com>
parent ffbc06cc
...@@ -55,3 +55,21 @@ api_server_models/ ...@@ -55,3 +55,21 @@ api_server_models/
server/ server/
**/*backups* **/*backups*
### Rust ###
# Generated by Cargo
# will have compiled files and executables
debug/
target/
### Virtual Environment ###
.venv/
### Ruff ###
.ruff_cache/
### Python ###
__pycache__/
*.py[cod]
*$py.class
*.so
...@@ -52,8 +52,18 @@ tmp_path_retention_policy = "failed" ...@@ -52,8 +52,18 @@ tmp_path_retention_policy = "failed"
# NOTE # NOTE
# We ignore model.py explicitly here to avoid mypy errors with duplicate modules # We ignore model.py explicitly here to avoid mypy errors with duplicate modules
# pytest overrides the default mypy exclude configuration and so we exclude here as well # pytest overrides the default mypy exclude configuration and so we exclude here as well
addopts = [
addopts = ["-ra", "--showlocals", "--strict-markers", "--strict-config", "--mypy", "--ignore-glob=*model.py"] "-ra",
"--showlocals",
"--strict-markers",
"--strict-config",
"--mypy",
"--ignore-glob=*model.py",
# FIXME: Get relative/generic blob paths to work here
# Ignore rust<->python bindings until python package is built/installed in environment
"--ignore-glob=/workspace/runtime/rust/python-wheel/python/triton_distributed_rs/*.py",
"--ignore-glob=/workspace/runtime/rust/python-wheel/python/triton_distributed_rs/*.pyi",
]
xfail_strict = true xfail_strict = true
log_cli_level = "INFO" log_cli_level = "INFO"
filterwarnings = [ filterwarnings = [
...@@ -83,10 +93,6 @@ indent-width = 4 ...@@ -83,10 +93,6 @@ indent-width = 4
# throughout project such as launch_workers.py # throughout project such as launch_workers.py
# explicit_package_bases = true # explicit_package_bases = true
# NOTE
# We ignore model.py explicitly here to avoid mypy errors with duplicate modules
exclude = ["model.py"]
# --ignore-missing-imports: WAR too many errors when developing outside # --ignore-missing-imports: WAR too many errors when developing outside
# of container environment with PYTHONPATH set and packages installed. # of container environment with PYTHONPATH set and packages installed.
# NOTE: Can possibly move mypy from pre-commit to a github action run only in # NOTE: Can possibly move mypy from pre-commit to a github action run only in
......
...@@ -3,7 +3,9 @@ name = "triton-distributed" ...@@ -3,7 +3,9 @@ name = "triton-distributed"
version = "0.1.1" version = "0.1.1"
edition = "2021" edition = "2021"
authors = ["NVIDIA"] authors = ["NVIDIA"]
license = "Apache-2.0"
homepage = "https://github.com/triton-inference-server/triton_distributed" homepage = "https://github.com/triton-inference-server/triton_distributed"
repository = "https://github.com/triton-inference-server/triton_distributed"
[dependencies] [dependencies]
# workspace - when we expand to multiple crates; put these in the workspace # workspace - when we expand to multiple crates; put these in the workspace
......
/target
python/triton_distributed/*.so
This diff is collapsed.
[package]
name = "triton_distributed_py3"
version = "0.1.1"
edition = "2021"
authors = ["NVIDIA"]
license = "Apache-2.0"
homepage = "https://github.com/triton-inference-server/triton_distributed"
repository = "https://github.com/triton-inference-server/triton_distributed"
[lib]
path = "rust/lib.rs"
name = "_core"
# "cdylib" is necessary to produce a shared library for Python to import from.
crate-type = ["cdylib"]
[dependencies]
triton-distributed = { version = "0.1.1", path = "../" }
futures = "0.3"
once_cell = "1"
serde = "1"
serde_json = "1.0.138"
tokio = { version = "1", features = ["full"] }
tokio-stream = "0"
tracing = "0"
# "extension-module" tells pyo3 we want to build an extension module (skips linking against libpython.so)
# "abi3-py39" tells pyo3 (and maturin) to build using the stable ABI with minimum Python version 3.9
pyo3 = { version = "0.23.4", default-features = false, features = [
"macros",
"experimental-async",
"experimental-inspect",
"extension-module",
"py-clone",
] }
pyo3-async-runtimes = { version = "0.23.0", default-features = false, features = [
"attributes",
"testing",
"tokio-runtime",
"unstable-streams",
] }
pythonize = "0.23"
import asyncio
import uvloop
from triton_distributed_rs import DistributedRuntime, triton_worker
uvloop.install()
class RequestHandler:
async def generate(self, request):
for char in request:
yield char
yield char
@triton_worker()
async def worker(runtime: DistributedRuntime):
component = runtime.namespace("examples/bls").component("bar")
await component.create_service()
endpoint = component.endpoint("generate")
await endpoint.serve_endpoint(RequestHandler().generate)
asyncio.run(worker())
import asyncio
import uvloop
from triton_distributed_rs import DistributedRuntime, triton_worker
uvloop.install()
@triton_worker()
async def worker(runtime: DistributedRuntime):
foo = (
await runtime.namespace("examples/bls")
.component("foo")
.endpoint("generate")
.client()
)
bar = (
await runtime.namespace("examples/bls")
.component("bar")
.endpoint("generate")
.client()
)
# hello world showed us the client has a .generate, which uses the default load balancer
# however, you can explicitly opt-in to client side load balancing by using the `round_robin`
# or `random` methods on client. note - there is a direct method as well, but that is for a
# router example
async for char in await foo.round_robin("hello world"):
# the responses are sse-style responses, so we extract the data key
async for x in await bar.random(char.get("data")):
print(x)
asyncio.run(worker())
import asyncio
import uvloop
from triton_distributed_rs import DistributedRuntime, triton_worker
uvloop.install()
class RequestHandler:
async def generate(self, request):
for char in request:
yield char
@triton_worker()
async def worker(runtime: DistributedRuntime):
component = runtime.namespace("examples/bls").component("foo")
await component.create_service()
endpoint = component.endpoint("generate")
await endpoint.serve_endpoint(RequestHandler().generate)
asyncio.run(worker())
import asyncio
import uvloop
from triton_distributed_rs import DistributedRuntime, triton_worker
@triton_worker()
async def worker(runtime: DistributedRuntime):
await init(runtime, "triton-init")
async def init(runtime: DistributedRuntime, ns: str):
"""
Instantiate a `backend` client and call the `generate` endpoint
"""
# get endpoint
endpoint = runtime.namespace(ns).component("backend").endpoint("generate")
# create client
client = await endpoint.client()
# wait for an endpoint to be ready
await client.wait_for_endpoints()
# issue request
stream = await client.generate("hello world")
error_count = 0
try:
# process response
async for char in stream:
print(char)
except ValueError:
error_count += 1
pass
finally:
assert error_count == 1
stream = await client.generate("hello earth", annotated=False)
async for char in stream:
print(char)
if __name__ == "__main__":
uvloop.install()
asyncio.run(worker())
import asyncio
import random
import string
import uvloop
from client import init as client_init
from server import init as server_init
from triton_distributed_rs import DistributedRuntime, triton_worker
def random_string(length=10):
chars = string.ascii_letters + string.digits # a-z, A-Z, 0-9
return "".join(random.choices(chars, k=length))
@triton_worker()
async def worker(runtime: DistributedRuntime):
ns = random_string()
task = asyncio.create_task(server_init(runtime, ns))
await client_init(runtime, ns)
runtime.shutdown()
await task
if __name__ == "__main__":
uvloop.install()
asyncio.run(worker())
import asyncio
import uvloop
from triton_distributed_rs import DistributedRuntime, triton_worker
class RequestHandler:
"""
Request handler for the generate endpoint
"""
async def generate(self, request):
print(f"Received request: {request}")
for char in request:
if char == "w":
raise ValueError("w is not allowed")
yield char
@triton_worker()
async def worker(runtime: DistributedRuntime):
await init(runtime, "triton-init")
async def init(runtime: DistributedRuntime, ns: str):
"""
Instantiate a `backend` component and serve the `generate` endpoint
A `Component` can serve multiple endpoints
"""
component = runtime.namespace(ns).component("backend")
await component.create_service()
endpoint = component.endpoint("generate")
print("Started server instance")
await endpoint.serve_endpoint(RequestHandler().generate)
if __name__ == "__main__":
uvloop.install()
asyncio.run(worker())
import asyncio
import uvloop
from triton_distributed_rs import DistributedRuntime, triton_worker
@triton_worker()
async def worker(runtime: DistributedRuntime):
await init(runtime, "triton-init")
async def init(runtime: DistributedRuntime, ns: str):
"""
Instantiate a `backend` client and call the `generate` endpoint
"""
# get endpoint
endpoint = runtime.namespace(ns).component("backend").endpoint("generate")
# create client
client = await endpoint.client()
# wait for an endpoint to be ready
await client.wait_for_endpoints()
# issue request
stream = await client.generate("hello world")
# process the stream
async for char in stream:
print(char)
if __name__ == "__main__":
uvloop.install()
asyncio.run(worker())
import asyncio
import random
import string
import uvloop
from client import init as client_init
from server import init as server_init
from triton_distributed_rs import DistributedRuntime, triton_worker
def random_string(length=10):
chars = string.ascii_letters + string.digits # a-z, A-Z, 0-9
return "".join(random.choices(chars, k=length))
@triton_worker()
async def worker(runtime: DistributedRuntime):
ns = random_string()
task = asyncio.create_task(server_init(runtime, ns))
await client_init(runtime, ns)
runtime.shutdown()
await task
if __name__ == "__main__":
uvloop.install()
asyncio.run(worker())
import asyncio
import uvloop
from triton_distributed_rs import DistributedRuntime, triton_worker
class RequestHandler:
"""
Request handler for the generate endpoint
"""
async def generate(self, request):
print(f"Received request: {request}")
for char in request:
yield char
@triton_worker()
async def worker(runtime: DistributedRuntime):
await init(runtime, "triton-init")
async def init(runtime: DistributedRuntime, ns: str):
"""
Instantiate a `backend` component and serve the `generate` endpoint
A `Component` can serve multiple endpoints
"""
component = runtime.namespace(ns).component("backend")
await component.create_service()
endpoint = component.endpoint("generate")
print("Started server instance")
await endpoint.serve_endpoint(RequestHandler().generate)
if __name__ == "__main__":
uvloop.install()
asyncio.run(worker())
import asyncio
import uvloop
from triton_distributed_rs import DistributedRuntime, triton_worker
uvloop.install()
class RequestHandler:
async def generate(self, request):
request = f"{request}-back"
for char in request:
yield char
@triton_worker()
async def worker(runtime: DistributedRuntime):
component = runtime.namespace("examples/pipeline").component("backend")
await component.create_service()
endpoint = component.endpoint("generate")
await endpoint.serve_endpoint(RequestHandler().generate)
asyncio.run(worker())
import asyncio
import uvloop
from triton_distributed_rs import DistributedRuntime, triton_worker
uvloop.install()
class RequestHandler:
def __init__(self, next):
self.next = next
async def generate(self, request):
request = f"{request} front"
async for output in await self.next.round_robin(request):
yield output.get("data")
@triton_worker()
async def worker(runtime: DistributedRuntime):
# client to the next component - in this case the middle component
next = (
await runtime.namespace("examples/pipeline")
.component("middle")
.endpoint("generate")
.client()
)
# create endpoint service for frontend component
component = runtime.namespace("examples/pipeline").component("frontend")
await component.create_service()
endpoint = component.endpoint("generate")
handler = RequestHandler(next)
await endpoint.serve_endpoint(handler.generate)
asyncio.run(worker())
import asyncio
import uvloop
from triton_distributed_rs import DistributedRuntime, triton_worker
uvloop.install()
class RequestHandler:
def __init__(self, backend):
self.backend = backend
async def generate(self, request):
request = f"{request}-mid"
async for output in await self.backend.random(request):
yield output.get("data")
@triton_worker()
async def worker(runtime: DistributedRuntime):
# client to backend
backend = (
await runtime.namespace("examples/pipeline")
.component("backend")
.endpoint("generate")
.client()
)
# create endpoint service for middle component
component = runtime.namespace("examples/pipeline").component("middle")
await component.create_service()
endpoint = component.endpoint("generate")
await endpoint.serve_endpoint(RequestHandler(backend).generate)
asyncio.run(worker())
import asyncio
import uvloop
from triton_distributed_rs import DistributedRuntime, triton_worker
uvloop.install()
@triton_worker()
async def worker(runtime: DistributedRuntime):
"""
# Pipeline Example
This example demonstrates how to create a pipeline of components:
- `frontend` call `middle` which calls `backend`
- each component transforms the request before passing it to the backend
"""
pipeline = (
await runtime.namespace("examples/pipeline")
.component("frontend")
.endpoint("generate")
.client()
)
async for char in await pipeline.round_robin("hello from"):
print(char)
asyncio.run(worker())
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment