Commit ccfe101e authored by Neelay Shah's avatar Neelay Shah Committed by GitHub
Browse files

build: pip based installation of icp and runtime. Also make tritonserver optional


Signed-off-by: Neelay Shah <neelays@nvidia.com>
Co-authored-by: Ryan McCormick <rmccormick@nvidia.com>
parent 5701753a
...@@ -154,8 +154,20 @@ COPY . /workspace ...@@ -154,8 +154,20 @@ COPY . /workspace
RUN cd runtime/rust && cargo build --release --locked && cargo doc --no-deps RUN cd runtime/rust && cargo build --release --locked && cargo doc --no-deps
RUN /workspace/icp/protos/gen_python.sh RUN /workspace/icp/protos/gen_python.sh
# Install python packages
ARG PYTHON_PACKAGE_VERSION=0.0.1.dev+unknown
# SETUPTOOLS_SCM_PRETEND_VERSION allows dynamically setting the package versions during build/install.
# This allows having versioned packages during development between releases, such as commit IDs.
#
# Normally SCM version is taken directly from .git but this is not available in the Dockerfile
# and so we pass in via a buildarg
RUN SETUPTOOLS_SCM_PRETEND_VERSION_FOR_TRITON_DISTRIBUTED_ICP=${PYTHON_PACKAGE_VERSION} pip install -e /workspace/icp/python
RUN SETUPTOOLS_SCM_PRETEND_VERSION_FOR_TRITON_DISTRIBUTED_RUNTIME=${PYTHON_PACKAGE_VERSION} pip install -e /workspace/runtime/python
# Sets pythonpath for python modules # Sets pythonpath for python modules
ENV PYTHONPATH="${PYTHONPATH}:/workspace/icp/python/src:/workspace/runtime/python/src:/workspace/examples/python:/opt/tritonserver/python/openai/openai_frontend" ENV PYTHONPATH="${PYTHONPATH}:/workspace/examples/python:/opt/tritonserver/python/openai/openai_frontend"
# Enable system UCX # Enable system UCX
ENV RAPIDS_LIBUCX_PREFER_SYSTEM_LIBRARY=true ENV RAPIDS_LIBUCX_PREFER_SYSTEM_LIBRARY=true
......
...@@ -22,12 +22,16 @@ PLATFORM=linux/amd64 ...@@ -22,12 +22,16 @@ PLATFORM=linux/amd64
# Get short commit hash # Get short commit hash
commit_id=$(git rev-parse --short HEAD) commit_id=$(git rev-parse --short HEAD)
# Attempt to get current tag # if COMMIT_ID matches a TAG use that
current_tag=$(git describe --tags --exact-match 2>/dev/null) || true current_tag=$(git describe --tags --exact-match 2>/dev/null | sed 's/^v//') || true
# Use tag if available, otherwise use commit hash # Get latest TAG and add COMMIT_ID for dev
VERSION=${current_tag:-$commit_id} latest_tag=$(git describe --tags --abbrev=0 $(git rev-list --tags --max-count=1 main) | sed 's/^v//') || true
# Use tag if available, otherwise use latest_tag.dev.commit_id
VERSION=v${current_tag:-$latest_tag.dev.$commit_id}
PYTHON_PACKAGE_VERSION=${current_tag:-$latest_tag.dev+$commit_id}
# Frameworks # Frameworks
# #
...@@ -244,7 +248,7 @@ get_options "$@" ...@@ -244,7 +248,7 @@ get_options "$@"
# BUILD DEV IMAGE # BUILD DEV IMAGE
BUILD_ARGS+=" --build-arg BASE_IMAGE=$BASE_IMAGE --build-arg BASE_IMAGE_TAG=$BASE_IMAGE_TAG --build-arg FRAMEWORK=$FRAMEWORK --build-arg ${FRAMEWORK}_FRAMEWORK=1" BUILD_ARGS+=" --build-arg BASE_IMAGE=$BASE_IMAGE --build-arg BASE_IMAGE_TAG=$BASE_IMAGE_TAG --build-arg FRAMEWORK=$FRAMEWORK --build-arg ${FRAMEWORK}_FRAMEWORK=1 --build-arg VERSION=$VERSION --build-arg PYTHON_PACKAGE_VERSION=$PYTHON_PACKAGE_VERSION"
if [ ! -z ${GITHUB_TOKEN} ]; then if [ ! -z ${GITHUB_TOKEN} ]; then
BUILD_ARGS+=" --build-arg GITHUB_TOKEN=${GITHUB_TOKEN} " BUILD_ARGS+=" --build-arg GITHUB_TOKEN=${GITHUB_TOKEN} "
......
...@@ -8,12 +8,12 @@ class EncodeDecodeOperator(Operator): ...@@ -8,12 +8,12 @@ class EncodeDecodeOperator(Operator):
self, self,
name, name,
version, version,
triton_core,
request_plane, request_plane,
data_plane, data_plane,
parameters, parameters,
repository, repository,
logger, logger,
triton_core,
): ):
self._encoder = RemoteOperator("encoder", request_plane, data_plane) self._encoder = RemoteOperator("encoder", request_plane, data_plane)
self._decoder = RemoteOperator("decoder", request_plane, data_plane) self._decoder = RemoteOperator("decoder", request_plane, data_plane)
......
...@@ -41,12 +41,12 @@ class EncodeDecodeOperator(Operator): ...@@ -41,12 +41,12 @@ class EncodeDecodeOperator(Operator):
self, self,
name, name,
version, version,
triton_core,
request_plane, request_plane,
data_plane, data_plane,
parameters, parameters,
repository, repository,
logger, logger,
triton_core,
): ):
self._encoder = RemoteOperator("encoder", request_plane, data_plane) self._encoder = RemoteOperator("encoder", request_plane, data_plane)
self._decoder = RemoteOperator("decoder", request_plane, data_plane) self._decoder = RemoteOperator("decoder", request_plane, data_plane)
......
...@@ -2,7 +2,7 @@ import argparse ...@@ -2,7 +2,7 @@ import argparse
import json import json
import logging import logging
from dataclasses import field from dataclasses import field
from typing import AsyncGenerator, List, Optional from typing import Any, AsyncGenerator, List, Optional
import numpy as np import numpy as np
...@@ -23,7 +23,6 @@ class VllmOperator(Operator): ...@@ -23,7 +23,6 @@ class VllmOperator(Operator):
self, self,
name: str, name: str,
version: int, version: int,
triton_core,
request_plane: RequestPlane, request_plane: RequestPlane,
data_plane: DataPlane, data_plane: DataPlane,
parameters: Optional[dict[str, str | int | bool | bytes]] = field( parameters: Optional[dict[str, str | int | bool | bytes]] = field(
...@@ -31,6 +30,7 @@ class VllmOperator(Operator): ...@@ -31,6 +30,7 @@ class VllmOperator(Operator):
), ),
repository: Optional[str] = None, repository: Optional[str] = None,
logger: Optional[logging.Logger] = None, logger: Optional[logging.Logger] = None,
triton_core: Optional[Any] = None,
): ):
self.name = name self.name = name
self.version = version self.version = version
......
[build-system]
requires = ["setuptools>=65.0", "setuptools-scm>=8"]
build-backend = "setuptools.build_meta"
[project]
name = "triton_distributed.icp"
dynamic = ["version"]
[tool.setuptools_scm]
version_file = "src/triton_distributed/icp/_version.py"
root = "../.."
[tool.setuptools.packages.find]
where = ["src"]
include = ["triton_distributed.icp*"]
namespaces = true
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Type
class CustomKeyErrorDict(dict):
    """Dict that raises a configurable, descriptive exception on missing keys.

    Used for conversion tables (e.g. dtype maps): a failed lookup raises
    ``exception`` (default ``ValueError``) naming the source and target
    domains instead of a bare ``KeyError``.
    """

    def __init__(
        self,
        from_name: str,
        to_name: str,
        *args,
        exception: Type[Exception] = ValueError,
        **kwargs,
    ):
        super().__init__(*args, **kwargs)
        self._from_name = from_name
        self._to_name = to_name
        self._exception = exception

    def __getitem__(self, key):
        # Guard-clause lookup: report an unsupported key with a domain-aware
        # message; `from None` keeps the traceback free of lookup noise.
        if key not in self:
            raise self._exception(
                f"Unsupported {self._from_name}. Can't convert {key} to {self._to_name}"
            ) from None
        return super().__getitem__(key)
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
################################################################################
# This file contains the DLPack API wrapped in Python style (see
# 'dlpack.h' for detail) and the utilities for Triton client to interact
# with DLPack
#
# Ref:
# https://github.com/dmlc/dlpack/blob/main/include/dlpack/dlpack.h
# https://github.com/dmlc/dlpack/blob/main/apps/numpy_dlpack/dlpack/from_numpy.py
################################################################################
import ctypes
from typing import Union
from triton_distributed.icp._custom_key_error_dict import CustomKeyErrorDict
from triton_distributed.icp.data_type import DataType
from triton_distributed.icp.memory_type import MemoryType, string_to_memory_type
try:
import cupy
except ImportError:
cupy = None
# Need to explicit set the res / arg types for pythonapi functions to
# work properly (ctypes defaults everything to c_int otherwise, which
# truncates 64-bit pointers).
ctypes.pythonapi.PyMem_RawMalloc.restype = ctypes.c_void_p
ctypes.pythonapi.PyMem_RawFree.argtypes = [ctypes.c_void_p]
ctypes.pythonapi.PyCapsule_New.restype = ctypes.py_object
ctypes.pythonapi.PyCapsule_New.argtypes = [
    ctypes.c_void_p,
    ctypes.c_char_p,
    ctypes.c_void_p,
]
ctypes.pythonapi.PyCapsule_GetPointer.restype = ctypes.c_void_p
ctypes.pythonapi.PyCapsule_GetPointer.argtypes = [ctypes.py_object, ctypes.c_char_p]

# Canonical capsule name used by the DLPack exchange protocol.
c_str_dltensor = b"dltensor"
class DLDeviceType(ctypes.c_int):
    """Device type codes mirroring the DLDeviceType enum in dlpack.h.

    Values must stay identical to the upstream header; note the class
    attributes are plain ints, not DLDeviceType instances.
    """

    kDLCPU = 1
    kDLCUDA = 2
    kDLCUDAHost = 3
    kDLOpenCL = 4
    kDLVulkan = 7
    kDLMetal = 8
    kDLVPI = 9
    kDLROCM = 10
    kDLROCMHost = 11
    kDLExtDev = 12
    kDLCUDAManaged = 13
    kDLOneAPI = 14
    kDLWebGPU = 15
    kDLHexagon = 16
# Accepted spellings for a target location; normalized by
# parse_device_or_memory_type below.
DeviceOrMemoryType = Union[
    tuple[MemoryType, int], MemoryType, tuple[DLDeviceType, int], str
]
class DLDevice(ctypes.Structure):
    """ctypes mirror of dlpack.h's DLDevice: (device_type, device_id)."""

    _fields_ = [
        ("device_type", ctypes.c_int),
        ("device_id", ctypes.c_int),
    ]
class DLDataTypeCode(ctypes.c_uint8):
    """Type-code constants mirroring dlpack.h's DLDataTypeCode enum."""

    kDLInt = 0
    kDLUInt = 1
    kDLFloat = 2
    kDLOpaquePointer = 3
    kDLBfloat = 4
    kDLComplex = 5
    kDLBool = 6
class DLDataType(ctypes.Structure):
    """ctypes mirror of dlpack.h's DLDataType: code, bits per lane, lanes."""

    _fields_ = [
        ("type_code", ctypes.c_uint8),
        ("bits", ctypes.c_uint8),
        ("lanes", ctypes.c_uint16),
    ]
class DLTensor(ctypes.Structure):
    """ctypes mirror of dlpack.h's DLTensor (the plain, unmanaged view)."""

    _fields_ = [
        ("data", ctypes.c_void_p),
        ("device", DLDevice),
        ("ndim", ctypes.c_int),
        ("dtype", DLDataType),
        # shape/strides are borrowed pointers; something else (e.g.
        # DataViewContext) must keep the backing arrays alive.
        ("shape", ctypes.POINTER(ctypes.c_int64)),
        ("strides", ctypes.POINTER(ctypes.c_int64)),
        ("byte_offset", ctypes.c_uint64),
    ]
class DLManagedTensor(ctypes.Structure):
    """ctypes mirror of dlpack.h's DLManagedTensor: a DLTensor plus the
    opaque manager context and the deleter the consumer must invoke."""

    _fields_ = [
        ("dl_tensor", DLTensor),
        ("manager_ctx", ctypes.c_void_p),
        ("deleter", ctypes.CFUNCTYPE(None, ctypes.c_void_p)),
    ]
# Utilities
def _raise_error(msg):
"""
Raise error with the provided message
"""
raise Exception(msg) from None
# Use as managed context in DLPack that doesn't hold ownership of the
# data content.
class DataViewContext:
    """Manager context for an exported DLTensor view.

    Holds the ctypes shape/stride storage alive (DLTensor stores only raw
    pointers into it) without owning the tensor data itself.
    """

    def __init__(self, shape) -> None:
        # Convert the Python object to ctypes objects expected by
        # DLPack
        self._shape = (ctypes.c_int64 * len(shape))(*shape)
        # No strides: compact and row-major
        self._strides = ctypes.POINTER(ctypes.c_int64)()

    def as_manager_ctx(self) -> ctypes.c_void_p:
        # Returns a void* suitable for DLManagedTensor.manager_ctx: boxes
        # `self` behind a py_object pointer and increfs both the context and
        # the box. managed_tensor_deleter performs the matching decrefs.
        py_obj = ctypes.py_object(self)
        py_obj_ptr = ctypes.pointer(py_obj)
        ctypes.pythonapi.Py_IncRef(py_obj)
        ctypes.pythonapi.Py_IncRef(ctypes.py_object(py_obj_ptr))
        return ctypes.cast(py_obj_ptr, ctypes.c_void_p)
@ctypes.CFUNCTYPE(None, ctypes.c_void_p)
def managed_tensor_deleter(handle: ctypes.c_void_p) -> None:
    # DLManagedTensor.deleter callback: undoes the two increfs taken in
    # DataViewContext.as_manager_ctx, then frees the raw struct that was
    # allocated with PyMem_RawMalloc.
    dl_managed_tensor = DLManagedTensor.from_address(handle)  # type: ignore
    py_obj_ptr = ctypes.cast(
        dl_managed_tensor.manager_ctx, ctypes.POINTER(ctypes.py_object)
    )
    py_obj = py_obj_ptr.contents
    ctypes.pythonapi.Py_DecRef(py_obj)
    ctypes.pythonapi.Py_DecRef(ctypes.py_object(py_obj_ptr))
    ctypes.pythonapi.PyMem_RawFree(handle)
@ctypes.CFUNCTYPE(None, ctypes.c_void_p)
def pycapsule_deleter(handle: ctypes.c_void_p) -> None:
    # Capsule destructor. Per DLPack convention a consumer renames the
    # capsule when it takes ownership; if the "dltensor" name is still
    # valid the tensor was never consumed, so we must free it ourselves
    # and clear the destructor to avoid double deletion.
    pycapsule: ctypes.py_object = ctypes.cast(handle, ctypes.py_object)
    if ctypes.pythonapi.PyCapsule_IsValid(pycapsule, c_str_dltensor):
        dl_managed_tensor = ctypes.pythonapi.PyCapsule_GetPointer(
            pycapsule, c_str_dltensor
        )
        managed_tensor_deleter(dl_managed_tensor)
        ctypes.pythonapi.PyCapsule_SetDestructor(pycapsule, None)
def is_contiguous_data(
    ndim: ctypes.c_int,
    shape: ctypes.POINTER(ctypes.c_int64),  # type: ignore
    stride: ctypes.POINTER(ctypes.c_int64),  # type: ignore
):
    """Return True when (shape, stride) describe compact row-major memory.

    A missing (None) or NULL stride means the producer declared a compact
    layout, which counts as contiguous.
    """
    if (stride is None) or (not bool(stride)):
        return True
    # Walk dimensions innermost-to-outermost, accumulating the stride a
    # compact row-major layout would require at each axis.
    expected = 1
    for axis in range(ndim - 1, -1, -1):  # type: ignore
        if stride[axis] != expected:
            return False
        expected *= shape[axis]
    return True
def get_byte_size(
    dtype: DLDataType, ndim: ctypes.c_int, shape: ctypes.POINTER(ctypes.c_int64)  # type: ignore
):
    """Return the tensor payload size in bytes.

    Element size is bits * lanes / 8 (assumes 8-bit bytes), multiplied by
    every dimension of `shape`.
    """
    total = dtype.bits * dtype.lanes // 8  # Assume 8 bits in a byte
    for axis in range(ndim):  # type: ignore
        total *= shape[axis]
    return total
def get_dlpack_capsule(dlpack_obj, stream=None):
    """Return the "dltensor" PyCapsule for *dlpack_obj*.

    Objects implementing the modern protocol (``__dlpack__``) produce the
    capsule; CUDA-resident producers are additionally handed *stream* so
    they can synchronize. Legacy objects are assumed to already be a
    PyCapsule and are returned unchanged.

    Raises (via ``_raise_error``) when ``__dlpack__`` exists without the
    required ``__dlpack_device__``.
    """
    if hasattr(dlpack_obj, "__dlpack__"):
        if not hasattr(dlpack_obj, "__dlpack_device__"):
            _raise_error(
                "DLPack expects '__dlpack_device__' if '__dlpack__' has been defined"
            )
        # BUG FIX: __dlpack_device__ returns a (device_type, device_id)
        # tuple (see DLPackObject.__init__, which unpacks it). The previous
        # code compared the whole tuple against kDLCUDA — never equal to an
        # int — so the stream was silently dropped for CUDA tensors.
        device_type, _ = dlpack_obj.__dlpack_device__()
        # Have to condition on the device type as, using numpy as example,
        # some DLPack implementation doesn't accept 'stream' as arguments
        if device_type != DLDeviceType.kDLCUDA:
            return dlpack_obj.__dlpack__()
        return dlpack_obj.__dlpack__(stream)
    # Old interface where PyCapsule object is passed directly
    return dlpack_obj
def get_dlpack_device(dlpack_obj):
    """Return the (device_type, device_id) pair reported by the object's
    ``__dlpack_device__``, or None when the object does not implement it."""
    device_getter = getattr(dlpack_obj, "__dlpack_device__", None)
    if device_getter is None:
        return None
    return device_getter()
def get_managed_tensor(dlcapsule):
    """Extract the DLManagedTensor struct from a "dltensor" PyCapsule.

    The returned struct borrows the capsule's memory; keep the capsule
    alive while using it.
    """
    ptr = ctypes.pythonapi.PyCapsule_GetPointer(dlcapsule, c_str_dltensor)
    return DLManagedTensor.from_address(ptr)
class DLPackObject:
    """Parsed view of a DLPack-capable object.

    Wraps the object's "dltensor" capsule and exposes the underlying
    DLTensor's device, dtype, shape and data pointer as properties. The
    capsule is held for the lifetime of this object, keeping the
    producer's memory alive.
    """

    def __init__(self, value) -> None:
        try:
            stream = None
            # __dlpack_device__ returns a (device_type, device_id) tuple.
            device, device_id = value.__dlpack_device__()
            if device == DLDeviceType.kDLCUDA:
                # CUDA producers need a stream to synchronize against;
                # without cupy there is no way to select the device.
                if cupy is None:
                    raise ValueError(
                        f"DLPack synchronization on device {device,device_id} not supported"
                    )
                with cupy.cuda.Device(device_id):
                    stream = 1  # legacy default stream
                self._capsule = get_dlpack_capsule(value, stream)
                self._tensor = get_managed_tensor(self._capsule).dl_tensor
            else:
                self._capsule = get_dlpack_capsule(value)
                self._tensor = get_managed_tensor(self._capsule).dl_tensor
        except Exception as e:
            # Collapse every failure mode into one caller-friendly error.
            raise ValueError(f"Object does not support DLPack protocol: {e}") from None

    def __eq__(self, other) -> bool:
        # Field-wise comparison of all exposed properties.
        # NOTE: defining __eq__ without __hash__ makes instances unhashable.
        if not isinstance(other, DLPackObject):
            return False
        if self.byte_size != other.byte_size:
            return False
        if self.memory_type != other.memory_type:
            return False
        if self.memory_type_id != other.memory_type_id:
            return False
        if self.shape != other.shape:
            return False
        if self.data_ptr != other.data_ptr:
            return False
        if self.contiguous != other.contiguous:
            return False
        if self.data_type != other.data_type:
            return False
        return True

    @property
    def byte_size(self) -> int:
        """Total payload size in bytes (element size times all dims)."""
        return get_byte_size(self._tensor.dtype, self._tensor.ndim, self._tensor.shape)

    @property
    def memory_type(self) -> MemoryType:
        """MemoryType mapped from the DLPack device type (CPU or GPU only)."""
        return DLPACK_DEVICE_TYPE_TO_MEMORY_TYPE[self._tensor.device.device_type]

    @property
    def memory_type_id(self) -> int:
        """Device id (e.g. CUDA ordinal) of the tensor's device."""
        return self._tensor.device.device_id

    @property
    def shape(self) -> list[int]:
        """Tensor dimensions as a plain Python list."""
        return [self._tensor.shape[i] for i in range(self._tensor.ndim)]

    @property
    def data_type(self) -> DataType:
        """DataType mapped from the DLPack (type_code, bits) pair."""
        return DLPACK_TO_DATA_TYPE[self.dlpack_data_type]

    @property
    def dlpack_data_type(self) -> tuple[DLDataTypeCode, int]:
        """Raw DLPack dtype key: (type_code, bits)."""
        return (self._tensor.dtype.type_code, self._tensor.dtype.bits)

    @property
    def data_ptr(self) -> ctypes.c_void_p:
        """Start of the payload: base pointer plus byte offset."""
        return self._tensor.data + self._tensor.byte_offset

    @property
    def contiguous(self) -> bool:
        """True when the tensor is compact and row-major."""
        return is_contiguous_data(
            self._tensor.ndim, self._tensor.shape, self._tensor.strides
        )
# Forward map: DLPack device type -> MemoryType. Only CPU and CUDA
# devices are representable; anything else raises via CustomKeyErrorDict.
DLPACK_DEVICE_TYPE_TO_MEMORY_TYPE: dict[DLDeviceType, MemoryType] = CustomKeyErrorDict(
    "DLPack device type",
    "Memory type",
    {
        DLDeviceType.kDLCUDA: MemoryType.GPU,
        DLDeviceType.kDLCPU: MemoryType.CPU,
    },
)

# Inverse map, with CPU_PINNED explicitly exported to DLPack as plain
# host memory (kDLCPU).
MEMORY_TYPE_TO_DLPACK_DEVICE_TYPE: dict[MemoryType, DLDeviceType] = CustomKeyErrorDict(
    "Memory type",
    "DLPack device type",
    {
        **{value: key for key, value in DLPACK_DEVICE_TYPE_TO_MEMORY_TYPE.items()},
        **{MemoryType.CPU_PINNED: DLDeviceType.kDLCPU},
    },
)
def parse_device_or_memory_type(
    device_or_memory_type: DeviceOrMemoryType,
) -> tuple[MemoryType, int]:
    """Normalize the flexible DeviceOrMemoryType spellings to (MemoryType, id).

    Accepts a (MemoryType, id) or (DLDeviceType, id) tuple, a bare
    MemoryType (id defaults to 0), or a string "TYPE" / "TYPE:ID"
    (type part is upper-cased before lookup).

    Raises ValueError for unrecognized tuples or malformed strings.
    """
    memory_type = None
    memory_type_id = 0
    if isinstance(device_or_memory_type, tuple):
        if isinstance(device_or_memory_type[0], MemoryType):
            memory_type = device_or_memory_type[0]
            memory_type_id = device_or_memory_type[1]
        elif isinstance(device_or_memory_type[0], DLDeviceType):
            # NOTE(review): DLDeviceType's kDL* class attributes are plain
            # ints, so this branch only matches actual DLDeviceType
            # instances — confirm callers pass instances, not the class
            # attributes.
            memory_type = DLPACK_DEVICE_TYPE_TO_MEMORY_TYPE[device_or_memory_type[0]]
            memory_type_id = device_or_memory_type[1]
        else:
            raise ValueError(f"Invalid memory type {device_or_memory_type}")
    elif isinstance(device_or_memory_type, MemoryType):
        memory_type = device_or_memory_type
        memory_type_id = 0
    elif isinstance(device_or_memory_type, str):
        # "TYPE[:ID]" — more than one ':' is rejected.
        memory_str_tuple = device_or_memory_type.split(":")
        if len(memory_str_tuple) > 2:
            raise ValueError(f"Invalid memory type string {device_or_memory_type}")
        memory_type = string_to_memory_type(memory_str_tuple[0].upper())
        if len(memory_str_tuple) == 2:
            try:
                memory_type_id = int(memory_str_tuple[1])
            except ValueError:
                raise ValueError(
                    f"Invalid memory type string {device_or_memory_type}"
                ) from None
        else:
            memory_type_id = 0
    # NOTE(review): an argument matching none of the branches falls through
    # and returns (None, 0) — confirm whether that is intended.
    return (memory_type, memory_type_id)
# Forward map: DLPack (type_code, bits) -> DataType. INVALID and BYTES
# have no DLPack equivalent and are absent; unknown keys raise.
DLPACK_TO_DATA_TYPE: dict[tuple[DLDataTypeCode, int], DataType] = CustomKeyErrorDict(
    "DLPack data type",
    "Data type",
    {
        (DLDataTypeCode.kDLBool, 8): DataType.BOOL,
        (DLDataTypeCode.kDLInt, 8): DataType.INT8,
        (DLDataTypeCode.kDLInt, 16): DataType.INT16,
        (DLDataTypeCode.kDLInt, 32): DataType.INT32,
        (DLDataTypeCode.kDLInt, 64): DataType.INT64,
        (DLDataTypeCode.kDLUInt, 8): DataType.UINT8,
        (DLDataTypeCode.kDLUInt, 16): DataType.UINT16,
        (DLDataTypeCode.kDLUInt, 32): DataType.UINT32,
        (DLDataTypeCode.kDLUInt, 64): DataType.UINT64,
        (DLDataTypeCode.kDLFloat, 16): DataType.FP16,
        (DLDataTypeCode.kDLFloat, 32): DataType.FP32,
        (DLDataTypeCode.kDLFloat, 64): DataType.FP64,
        (DLDataTypeCode.kDLBfloat, 16): DataType.BF16,
    },
)

# Inverse map, derived from the forward table so the two stay in sync;
# every entry is single-lane (lanes=1).
DATA_TYPE_TO_DLPACK_DTYPE: dict[DataType, DLDataType] = CustomKeyErrorDict(
    "Data type",
    "DLPack data type",
    {
        value: DLDataType(type_code=key[0], bits=key[1], lanes=1)
        for key, value in DLPACK_TO_DATA_TYPE.items()
    },
)
...@@ -13,7 +13,7 @@ ...@@ -13,7 +13,7 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
"""Abstract Class for interacting with Triton Inference Serving Platform Inter-Component Protocol Data Plane""" """Abstract Class for interacting with Triton Distributed Inter-Component Protocol Data Plane"""
import abc import abc
import uuid import uuid
...@@ -21,28 +21,16 @@ from typing import Optional, Sequence ...@@ -21,28 +21,16 @@ from typing import Optional, Sequence
import cupy import cupy
import numpy import numpy
from tritonserver import (
from triton_distributed.icp.data_type import (
DATA_TYPE_TO_NUMPY_DTYPE,
DataType, DataType,
InvalidArgumentError, string_to_data_type,
MemoryBuffer,
MemoryType,
Tensor,
)
from tritonserver._api._datautils import (
STRING_TO_TRITON_MEMORY_TYPE,
TRITON_TO_NUMPY_DTYPE,
)
from tritonserver._c.triton_bindings import (
TRITONSERVER_DataTypeString as DataTypeString,
)
from tritonserver._c.triton_bindings import (
TRITONSERVER_MemoryTypeString as MemoryTypeString,
) )
from tritonserver._c.triton_bindings import ( from triton_distributed.icp.memory_buffer import MemoryBuffer
TRITONSERVER_StringToDataType as StringToDataType, from triton_distributed.icp.memory_type import MemoryType, string_to_memory_type
)
from triton_distributed.icp.protos.icp_pb2 import ModelInferRequest, ModelInferResponse from triton_distributed.icp.protos.icp_pb2 import ModelInferRequest, ModelInferResponse
from triton_distributed.icp.tensor import Tensor
class DataPlaneError(Exception): class DataPlaneError(Exception):
...@@ -73,13 +61,13 @@ def set_icp_data_type( ...@@ -73,13 +61,13 @@ def set_icp_data_type(
message: ModelInferRequest.InferInputTensor | ModelInferResponse.InferOutputTensor, message: ModelInferRequest.InferInputTensor | ModelInferResponse.InferOutputTensor,
value: DataType, value: DataType,
) -> None: ) -> None:
message.datatype = DataTypeString(value) message.datatype = value.name
def get_icp_data_type( def get_icp_data_type(
message: ModelInferRequest.InferInputTensor | ModelInferResponse.InferOutputTensor, message: ModelInferRequest.InferInputTensor | ModelInferResponse.InferOutputTensor,
) -> DataType: ) -> DataType:
return StringToDataType(message.datatype) return string_to_data_type(message.datatype)
def set_icp_tensor_uri( def set_icp_tensor_uri(
...@@ -116,7 +104,7 @@ def set_icp_memory_type( ...@@ -116,7 +104,7 @@ def set_icp_memory_type(
message: ModelInferRequest.InferInputTensor | ModelInferResponse.InferOutputTensor, message: ModelInferRequest.InferInputTensor | ModelInferResponse.InferOutputTensor,
value: MemoryType, value: MemoryType,
) -> None: ) -> None:
message.parameters[ICP_MEMORY_TYPE].string_param = MemoryTypeString(value) message.parameters[ICP_MEMORY_TYPE].string_param = value.name
def get_icp_memory_type( def get_icp_memory_type(
...@@ -124,9 +112,7 @@ def get_icp_memory_type( ...@@ -124,9 +112,7 @@ def get_icp_memory_type(
) -> MemoryType | None: ) -> MemoryType | None:
if ICP_MEMORY_TYPE not in message.parameters: if ICP_MEMORY_TYPE not in message.parameters:
return None return None
return STRING_TO_TRITON_MEMORY_TYPE[ return string_to_memory_type(message.parameters[ICP_MEMORY_TYPE].string_param)
message.parameters[ICP_MEMORY_TYPE].string_param
]
def set_icp_memory_type_id( def set_icp_memory_type_id(
...@@ -163,9 +149,7 @@ def set_icp_tensor_contents( ...@@ -163,9 +149,7 @@ def set_icp_tensor_contents(
with cupy.cuda.Device(tensor.memory_buffer.memory_type_id): with cupy.cuda.Device(tensor.memory_buffer.memory_type_id):
array = cupy.from_dlpack(tensor) array = cupy.from_dlpack(tensor)
else: else:
raise InvalidArgumentError( raise ValueError(f"Invalid Tensor Memory Type {tensor.memory_type}")
f"Invalid Tensor Memory Type {tensor.memory_type}"
)
message.contents.bytes_contents.append(array.tobytes()) message.contents.bytes_contents.append(array.tobytes())
...@@ -193,7 +177,7 @@ def get_icp_tensor_contents( ...@@ -193,7 +177,7 @@ def get_icp_tensor_contents(
array = numpy.array( array = numpy.array(
numpy.frombuffer( numpy.frombuffer(
message.contents.bytes_contents[0], message.contents.bytes_contents[0],
dtype=TRITON_TO_NUMPY_DTYPE[datatype], dtype=DATA_TYPE_TO_NUMPY_DTYPE[datatype],
) )
) )
tensor = Tensor(datatype, shape, MemoryBuffer.from_dlpack(array)) tensor = Tensor(datatype, shape, MemoryBuffer.from_dlpack(array))
......
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations
from enum import IntEnum
import numpy
from triton_distributed.icp._custom_key_error_dict import CustomKeyErrorDict
# Wire-format element types; values start at 0 (INVALID) and follow
# declaration order, so they are stable across processes.
DataType = IntEnum(
    "DataType",
    "INVALID BOOL UINT8 UINT16 UINT32 UINT64 INT8 INT16 INT32 INT64 "
    "FP16 FP32 FP64 BYTES BF16",
    start=0,
)


def string_to_data_type(data_type_string: str) -> DataType:
    """Look up a DataType member by its exact name (e.g. "FP32").

    Raises ValueError (with context suppressed) for unknown names.
    """
    member = DataType.__members__.get(data_type_string)
    if member is None:
        raise ValueError(
            f"Unsupported Data Type String. Can't convert {data_type_string} to DataType"
        ) from None
    return member
# Numpy scalar type -> DataType. bool and numpy.bool_ both map to BOOL;
# the three string-ish types (bytes_, str_, object_) all collapse to BYTES.
NUMPY_TO_DATA_TYPE: dict[type, DataType] = CustomKeyErrorDict(
    "Numpy dtype",
    "Data type",
    {
        bool: DataType.BOOL,
        numpy.bool_: DataType.BOOL,
        numpy.int8: DataType.INT8,
        numpy.int16: DataType.INT16,
        numpy.int32: DataType.INT32,
        numpy.int64: DataType.INT64,
        numpy.uint8: DataType.UINT8,
        numpy.uint16: DataType.UINT16,
        numpy.uint32: DataType.UINT32,
        numpy.uint64: DataType.UINT64,
        numpy.float16: DataType.FP16,
        numpy.float32: DataType.FP32,
        numpy.float64: DataType.FP64,
        numpy.bytes_: DataType.BYTES,
        numpy.str_: DataType.BYTES,
        numpy.object_: DataType.BYTES,
    },
)

# Inverse map. The forward table has duplicate values, so inversion picks
# an arbitrary key per DataType; BYTES and BOOL are pinned explicitly
# afterwards to make the choice deterministic.
DATA_TYPE_TO_NUMPY_DTYPE: dict[DataType, type] = CustomKeyErrorDict(
    "Data type",
    "Numpy dtype",
    {
        **{value: key for key, value in NUMPY_TO_DATA_TYPE.items()},
        **{DataType.BYTES: numpy.object_},
        **{DataType.BOOL: numpy.bool_},
    },
)
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations
from dataclasses import dataclass
from typing import Any
from triton_distributed.icp._dlpack import DLPackObject
from triton_distributed.icp.memory_type import MemoryType
@dataclass
class MemoryBuffer:
    """Memory allocated for a Tensor.

    This object does not own the memory but holds a reference to the
    owner.

    Parameters
    ----------
    data_ptr : int
        Pointer to the allocated memory.
    memory_type : MemoryType
        memory type
    memory_type_id : int
        memory type id (typically the same as device id)
    size : int
        Size of the allocated memory in bytes.
    owner : Any
        Object that owns or manages the memory buffer. Allocated
        memory must not be freed while a reference to the owner is
        held.

    Examples
    --------
    >>> buffer = MemoryBuffer.from_dlpack(numpy.array([100],dtype=numpy.uint8))

    """

    data_ptr: int
    memory_type: MemoryType
    memory_type_id: int
    size: int
    owner: Any

    @staticmethod
    def from_dlpack(owner: Any) -> MemoryBuffer:
        """Create a MemoryBuffer viewing a DLPack-capable object's memory.

        Raises ValueError when `owner` lacks ``__dlpack__`` or its memory
        is not contiguous.
        """
        if not hasattr(owner, "__dlpack__"):
            raise ValueError("Object does not support DLpack protocol")
        dlpack_object = DLPackObject(owner)
        return MemoryBuffer._from_dlpack_object(owner, dlpack_object)

    @staticmethod
    def _from_dlpack_object(owner: Any, dlpack_object: DLPackObject) -> MemoryBuffer:
        # Only contiguous (compact, row-major) memory can be described by a
        # single (pointer, size) pair.
        if not dlpack_object.contiguous:
            raise ValueError("Only contiguous memory is supported")
        return MemoryBuffer(
            int(dlpack_object.data_ptr),
            dlpack_object.memory_type,
            dlpack_object.memory_type_id,
            dlpack_object.byte_size,
            owner,
        )
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from enum import IntEnum
# Memory locations a buffer can live in; values follow declaration order
# starting at 0 (CPU=0, CPU_PINNED=1, GPU=2).
MemoryType = IntEnum("MemoryType", "CPU CPU_PINNED GPU", start=0)


def string_to_memory_type(memory_type_string: str) -> MemoryType:
    """Look up a MemoryType member by its exact name (e.g. "GPU").

    Raises ValueError (with context suppressed) for unknown names.
    """
    member = MemoryType.__members__.get(memory_type_string)
    if member is None:
        raise ValueError(
            f"Unsupported Memory Type String. Can't convert {memory_type_string} to MemoryType"
        ) from None
    return member
...@@ -25,7 +25,6 @@ from typing import Dict, Optional ...@@ -25,7 +25,6 @@ from typing import Dict, Optional
from urllib.parse import urlsplit, urlunsplit from urllib.parse import urlsplit, urlunsplit
import nats import nats
from tritonserver import InvalidArgumentError
from triton_distributed.icp.protos.icp_pb2 import ModelInferRequest, ModelInferResponse from triton_distributed.icp.protos.icp_pb2 import ModelInferRequest, ModelInferResponse
from triton_distributed.icp.request_plane import ( from triton_distributed.icp.request_plane import (
...@@ -203,7 +202,7 @@ class NatsRequestPlane(RequestPlane): ...@@ -203,7 +202,7 @@ class NatsRequestPlane(RequestPlane):
Optional[nats.js.JetStreamContext.PullSubscription], Optional[nats.js.JetStreamContext.PullSubscription],
]: ]:
if self._jet_stream is None: if self._jet_stream is None:
raise InvalidArgumentError( raise ValueError(
"Failed to get model stream: NATS Jetstream not connected!" "Failed to get model stream: NATS Jetstream not connected!"
) )
...@@ -335,19 +334,15 @@ class NatsRequestPlane(RequestPlane): ...@@ -335,19 +334,15 @@ class NatsRequestPlane(RequestPlane):
responses: AsyncIterator[ModelInferResponse] | ModelInferResponse, responses: AsyncIterator[ModelInferResponse] | ModelInferResponse,
): ):
if self._jet_stream is None: if self._jet_stream is None:
raise InvalidArgumentError( raise ValueError("Failed to post response: NATS Jetstream not connected!")
"Failed to post response: NATS Jetstream not connected!"
)
request_id = get_icp_request_id(request) request_id = get_icp_request_id(request)
if request_id is None: if request_id is None:
raise InvalidArgumentError("ICP request must have request id") raise ValueError("ICP request must have request id")
response_to_uri = get_icp_response_to_uri(request) response_to_uri = get_icp_response_to_uri(request)
if not response_to_uri: if not response_to_uri:
raise InvalidArgumentError( raise ValueError("Attempting to send a response when non requested")
"Attempting to send a response when non requested"
)
parsed = urlsplit(response_to_uri) parsed = urlsplit(response_to_uri)
response_stream = parsed.path.replace("/", "") response_stream = parsed.path.replace("/", "")
...@@ -378,12 +373,10 @@ class NatsRequestPlane(RequestPlane): ...@@ -378,12 +373,10 @@ class NatsRequestPlane(RequestPlane):
] = None, ] = None,
) -> AsyncIterator[ModelInferResponse]: ) -> AsyncIterator[ModelInferResponse]:
if self._jet_stream is None: if self._jet_stream is None:
raise InvalidArgumentError( raise ValueError("Failed to post request: NATS Jetstream not connected!")
"Failed to post request: NATS Jetstream not connected!"
)
if response_iterator and response_handler: if response_iterator and response_handler:
raise InvalidArgumentError( raise ValueError(
"Can only specify either response handler or response iterator" "Can only specify either response handler or response iterator"
) )
......
...@@ -13,14 +13,12 @@ ...@@ -13,14 +13,12 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
"""Abstract Class for interacting with the Triton Inference Serving Platform Inter-Component Protocol Control Plane""" """Abstract Class for interacting with the Triton Distributed Inter-Component Protocol Control Plane"""
import abc import abc
import uuid import uuid
from typing import AsyncIterator, Awaitable, Callable, Optional from typing import AsyncIterator, Awaitable, Callable, Optional
from tritonserver import TritonError
from triton_distributed.icp.protos.icp_pb2 import ModelInferRequest, ModelInferResponse from triton_distributed.icp.protos.icp_pb2 import ModelInferRequest, ModelInferResponse
ICP_REQUEST_ID = "icp_request_id" ICP_REQUEST_ID = "icp_request_id"
...@@ -33,6 +31,10 @@ ICP_REQUEST_CANCELLED = "icp_request_cancelled" ...@@ -33,6 +31,10 @@ ICP_REQUEST_CANCELLED = "icp_request_cancelled"
ICP_ERROR = "icp_response_error" ICP_ERROR = "icp_response_error"
class RequestPlaneError(Exception):
pass
def get_icp_request_id( def get_icp_request_id(
message: ModelInferRequest | ModelInferResponse, message: ModelInferRequest | ModelInferResponse,
) -> uuid.UUID | None: ) -> uuid.UUID | None:
...@@ -47,13 +49,15 @@ def set_icp_request_id( ...@@ -47,13 +49,15 @@ def set_icp_request_id(
message.parameters[ICP_REQUEST_ID].string_param = str(value) message.parameters[ICP_REQUEST_ID].string_param = str(value)
def get_icp_response_error(message: ModelInferResponse) -> TritonError | None: def get_icp_response_error(message: ModelInferResponse) -> RequestPlaneError | None:
if ICP_ERROR not in message.parameters: if ICP_ERROR not in message.parameters:
return None return None
return TritonError(message.parameters[ICP_ERROR].string_param) return RequestPlaneError(message.parameters[ICP_ERROR].string_param)
def set_icp_response_error(message: ModelInferResponse, value: TritonError) -> None: def set_icp_response_error(
message: ModelInferResponse, value: RequestPlaneError
) -> None:
message.parameters[ICP_ERROR].string_param = str(value) message.parameters[ICP_ERROR].string_param = str(value)
......
This diff is collapsed.
...@@ -25,10 +25,8 @@ from urllib.parse import urlsplit ...@@ -25,10 +25,8 @@ from urllib.parse import urlsplit
import cupy import cupy
import numpy import numpy
import tritonserver
import ucp import ucp
from cupy_backends.cuda.api.runtime import CUDARuntimeError from cupy_backends.cuda.api.runtime import CUDARuntimeError
from tritonserver import InvalidArgumentError, MemoryBuffer, MemoryType, Tensor
from triton_distributed.icp.data_plane import ( from triton_distributed.icp.data_plane import (
DataPlane, DataPlane,
...@@ -48,7 +46,11 @@ from triton_distributed.icp.data_plane import ( ...@@ -48,7 +46,11 @@ from triton_distributed.icp.data_plane import (
set_icp_tensor_size, set_icp_tensor_size,
set_icp_tensor_uri, set_icp_tensor_uri,
) )
from triton_distributed.icp.data_type import DataType
from triton_distributed.icp.memory_buffer import MemoryBuffer
from triton_distributed.icp.memory_type import MemoryType
from triton_distributed.icp.protos.icp_pb2 import ModelInferRequest, ModelInferResponse from triton_distributed.icp.protos.icp_pb2 import ModelInferRequest, ModelInferResponse
from triton_distributed.icp.tensor import Tensor
LOGGER = logging.getLogger(__name__) LOGGER = logging.getLogger(__name__)
...@@ -175,20 +177,18 @@ class _UcpDataPlane(DataPlane): ...@@ -175,20 +177,18 @@ class _UcpDataPlane(DataPlane):
if tensor_id in self._tensor_store: if tensor_id in self._tensor_store:
tensor = self._tensor_store[tensor_id] tensor = self._tensor_store[tensor_id]
array_module = numpy array_module = numpy
if tensor.memory_type == tritonserver.MemoryType.CPU: if tensor.memory_type == MemoryType.CPU:
array_module = numpy array_module = numpy
device_manager = contextlib.nullcontext() device_manager = contextlib.nullcontext()
elif tensor.memory_type == tritonserver.MemoryType.GPU: elif tensor.memory_type == MemoryType.GPU:
array_module = cupy array_module = cupy
device_manager = cupy.cuda.Device( device_manager = cupy.cuda.Device(
tensor.memory_buffer.memory_type_id tensor.memory_buffer.memory_type_id
) )
else: else:
raise InvalidArgumentError( raise ValueError(f"Invalid Memory Type {tensor.memory_type}")
f"Invalid Memory Type {tensor.memory_type}"
)
with device_manager: with device_manager:
if tensor.data_type == tritonserver.DataType.BYTES: if tensor.data_type == DataType.BYTES:
array = tensor.memory_buffer.owner array = tensor.memory_buffer.owner
else: else:
array = array_module.from_dlpack(tensor) array = array_module.from_dlpack(tensor)
...@@ -343,7 +343,7 @@ class _UcpDataPlane(DataPlane): ...@@ -343,7 +343,7 @@ class _UcpDataPlane(DataPlane):
if requested_memory_type is not None: if requested_memory_type is not None:
memory_type = requested_memory_type memory_type = requested_memory_type
if memory_type == tritonserver.MemoryType.GPU and self._cuda_is_available: if memory_type == MemoryType.GPU and self._cuda_is_available:
array_module = cupy array_module = cupy
if requested_memory_type_id is not None: if requested_memory_type_id is not None:
device_manager = cupy.cuda.Device(requested_memory_type_id) device_manager = cupy.cuda.Device(requested_memory_type_id)
......
...@@ -24,10 +24,11 @@ import numpy ...@@ -24,10 +24,11 @@ import numpy
import pytest import pytest
import ucp import ucp
from cupy_backends.cuda.api.runtime import CUDARuntimeError from cupy_backends.cuda.api.runtime import CUDARuntimeError
from tritonserver import DataType, MemoryType, Tensor
from tritonserver._api._datautils import TRITON_TO_NUMPY_DTYPE
from triton_distributed.icp.data_plane import DataPlaneError from triton_distributed.icp.data_plane import DataPlaneError
from triton_distributed.icp.data_type import DATA_TYPE_TO_NUMPY_DTYPE, DataType
from triton_distributed.icp.memory_type import MemoryType
from triton_distributed.icp.tensor import Tensor
from triton_distributed.icp.ucp_data_plane import ( from triton_distributed.icp.ucp_data_plane import (
UcpDataPlane, UcpDataPlane,
get_icp_tensor_uri, get_icp_tensor_uri,
...@@ -283,7 +284,7 @@ def test_requested_memory_type(memory_type, memory_type_id, request): ...@@ -283,7 +284,7 @@ def test_requested_memory_type(memory_type, memory_type_id, request):
def _get_random_tensor(data_type: DataType, size: Sequence[int]): def _get_random_tensor(data_type: DataType, size: Sequence[int]):
dtype = TRITON_TO_NUMPY_DTYPE[data_type] dtype = DATA_TYPE_TO_NUMPY_DTYPE[data_type]
value = numpy.random.rand(*size) value = numpy.random.rand(*size)
return value.astype(dtype) return value.astype(dtype)
......
[build-system]
requires = ["setuptools>=65.0", "setuptools-scm>=8"]
build-backend = "setuptools.build_meta"
[project]
name = "triton_distributed.runtime"
dynamic = ["version"]
dependencies = ["triton_distributed.icp >= 0"]
[tool.setuptools_scm]
version_file = "src/triton_distributed/runtime/_version.py"
root = "../.."
[tool.setuptools.packages.find]
where = ["src"]
include = ["triton_distributed.runtime*"]
namespaces= true
...@@ -24,8 +24,13 @@ from triton_distributed.runtime.remote_request import ( ...@@ -24,8 +24,13 @@ from triton_distributed.runtime.remote_request import (
from triton_distributed.runtime.remote_response import ( from triton_distributed.runtime.remote_response import (
RemoteInferenceResponse as RemoteInferenceResponse, RemoteInferenceResponse as RemoteInferenceResponse,
) )
from triton_distributed.runtime.triton_core_operator import (
TritonCoreOperator as TritonCoreOperator, try:
) from triton_distributed.runtime.triton_core_operator import (
TritonCoreOperator as TritonCoreOperator,
)
except ImportError:
pass
from triton_distributed.runtime.worker import Worker as Worker from triton_distributed.runtime.worker import Worker as Worker
from triton_distributed.runtime.worker import WorkerConfig as WorkerConfig from triton_distributed.runtime.worker import WorkerConfig as WorkerConfig
...@@ -16,8 +16,6 @@ import multiprocessing ...@@ -16,8 +16,6 @@ import multiprocessing
from pprint import pformat from pprint import pformat
from typing import Optional, Type from typing import Optional, Type
from tritonserver import InvalidArgumentError
from triton_distributed.icp import ( from triton_distributed.icp import (
DataPlane, DataPlane,
NatsRequestPlane, NatsRequestPlane,
...@@ -71,7 +69,7 @@ class Deployment: ...@@ -71,7 +69,7 @@ class Deployment:
if self._default_request_plane == NatsRequestPlane: if self._default_request_plane == NatsRequestPlane:
self.request_plane_server = NatsServer(log_dir=self._default_log_dir) self.request_plane_server = NatsServer(log_dir=self._default_log_dir)
else: else:
raise InvalidArgumentError( raise ValueError(
f"Unknown Request Plane Type, can not initialize {self._default_request_plane}" f"Unknown Request Plane Type, can not initialize {self._default_request_plane}"
) )
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment