Commit ccfe101e authored by Neelay Shah's avatar Neelay Shah Committed by GitHub
Browse files

build: pip based installation of icp and runtime. Also make tritonserver optional


Signed-off-by: default avatarNeelay Shah <neelays@nvidia.com>
Co-authored-by: default avatarRyan McCormick <rmccormick@nvidia.com>
parent 5701753a
...@@ -154,8 +154,20 @@ COPY . /workspace ...@@ -154,8 +154,20 @@ COPY . /workspace
RUN cd runtime/rust && cargo build --release --locked && cargo doc --no-deps RUN cd runtime/rust && cargo build --release --locked && cargo doc --no-deps
RUN /workspace/icp/protos/gen_python.sh RUN /workspace/icp/protos/gen_python.sh
# Install python packages
ARG PYTHON_PACKAGE_VERSION=0.0.1.dev+unknown
# SETUPTOOLS_SCM_PRETEND_VERSION allows dynamically setting the package versions during build/install.
# This allows having versioned packages during development between releases, such as commit IDs.
#
# Normally SCM version is taken directly from .git but this is not available in the Dockerfile
# and so we pass in via a buildarg
RUN SETUPTOOLS_SCM_PRETEND_VERSION_FOR_TRITON_DISTRIBUTED_ICP=${PYTHON_PACKAGE_VERSION} pip install -e /workspace/icp/python
RUN SETUPTOOLS_SCM_PRETEND_VERSION_FOR_TRITON_DISTRIBUTED_RUNTIME=${PYTHON_PACKAGE_VERSION} pip install -e /workspace/runtime/python
# Sets pythonpath for python modules # Sets pythonpath for python modules
ENV PYTHONPATH="${PYTHONPATH}:/workspace/icp/python/src:/workspace/runtime/python/src:/workspace/examples/python:/opt/tritonserver/python/openai/openai_frontend" ENV PYTHONPATH="${PYTHONPATH}:/workspace/examples/python:/opt/tritonserver/python/openai/openai_frontend"
# Enable system UCX # Enable system UCX
ENV RAPIDS_LIBUCX_PREFER_SYSTEM_LIBRARY=true ENV RAPIDS_LIBUCX_PREFER_SYSTEM_LIBRARY=true
......
...@@ -22,12 +22,16 @@ PLATFORM=linux/amd64 ...@@ -22,12 +22,16 @@ PLATFORM=linux/amd64
# Get short commit hash # Get short commit hash
commit_id=$(git rev-parse --short HEAD) commit_id=$(git rev-parse --short HEAD)
# Attempt to get current tag # if COMMIT_ID matches a TAG use that
current_tag=$(git describe --tags --exact-match 2>/dev/null) || true current_tag=$(git describe --tags --exact-match 2>/dev/null | sed 's/^v//') || true
# Use tag if available, otherwise use commit hash # Get latest TAG and add COMMIT_ID for dev
VERSION=${current_tag:-$commit_id} latest_tag=$(git describe --tags --abbrev=0 $(git rev-list --tags --max-count=1 main) | sed 's/^v//') || true
# Use tag if available, otherwise use latest_tag.dev.commit_id
VERSION=v${current_tag:-$latest_tag.dev.$commit_id}
PYTHON_PACKAGE_VERSION=${current_tag:-$latest_tag.dev+$commit_id}
# Frameworks # Frameworks
# #
...@@ -244,7 +248,7 @@ get_options "$@" ...@@ -244,7 +248,7 @@ get_options "$@"
# BUILD DEV IMAGE # BUILD DEV IMAGE
BUILD_ARGS+=" --build-arg BASE_IMAGE=$BASE_IMAGE --build-arg BASE_IMAGE_TAG=$BASE_IMAGE_TAG --build-arg FRAMEWORK=$FRAMEWORK --build-arg ${FRAMEWORK}_FRAMEWORK=1" BUILD_ARGS+=" --build-arg BASE_IMAGE=$BASE_IMAGE --build-arg BASE_IMAGE_TAG=$BASE_IMAGE_TAG --build-arg FRAMEWORK=$FRAMEWORK --build-arg ${FRAMEWORK}_FRAMEWORK=1 --build-arg VERSION=$VERSION --build-arg PYTHON_PACKAGE_VERSION=$PYTHON_PACKAGE_VERSION"
if [ ! -z ${GITHUB_TOKEN} ]; then if [ ! -z ${GITHUB_TOKEN} ]; then
BUILD_ARGS+=" --build-arg GITHUB_TOKEN=${GITHUB_TOKEN} " BUILD_ARGS+=" --build-arg GITHUB_TOKEN=${GITHUB_TOKEN} "
......
...@@ -8,12 +8,12 @@ class EncodeDecodeOperator(Operator): ...@@ -8,12 +8,12 @@ class EncodeDecodeOperator(Operator):
self, self,
name, name,
version, version,
triton_core,
request_plane, request_plane,
data_plane, data_plane,
parameters, parameters,
repository, repository,
logger, logger,
triton_core,
): ):
self._encoder = RemoteOperator("encoder", request_plane, data_plane) self._encoder = RemoteOperator("encoder", request_plane, data_plane)
self._decoder = RemoteOperator("decoder", request_plane, data_plane) self._decoder = RemoteOperator("decoder", request_plane, data_plane)
......
...@@ -41,12 +41,12 @@ class EncodeDecodeOperator(Operator): ...@@ -41,12 +41,12 @@ class EncodeDecodeOperator(Operator):
self, self,
name, name,
version, version,
triton_core,
request_plane, request_plane,
data_plane, data_plane,
parameters, parameters,
repository, repository,
logger, logger,
triton_core,
): ):
self._encoder = RemoteOperator("encoder", request_plane, data_plane) self._encoder = RemoteOperator("encoder", request_plane, data_plane)
self._decoder = RemoteOperator("decoder", request_plane, data_plane) self._decoder = RemoteOperator("decoder", request_plane, data_plane)
......
...@@ -2,7 +2,7 @@ import argparse ...@@ -2,7 +2,7 @@ import argparse
import json import json
import logging import logging
from dataclasses import field from dataclasses import field
from typing import AsyncGenerator, List, Optional from typing import Any, AsyncGenerator, List, Optional
import numpy as np import numpy as np
...@@ -23,7 +23,6 @@ class VllmOperator(Operator): ...@@ -23,7 +23,6 @@ class VllmOperator(Operator):
self, self,
name: str, name: str,
version: int, version: int,
triton_core,
request_plane: RequestPlane, request_plane: RequestPlane,
data_plane: DataPlane, data_plane: DataPlane,
parameters: Optional[dict[str, str | int | bool | bytes]] = field( parameters: Optional[dict[str, str | int | bool | bytes]] = field(
...@@ -31,6 +30,7 @@ class VllmOperator(Operator): ...@@ -31,6 +30,7 @@ class VllmOperator(Operator):
), ),
repository: Optional[str] = None, repository: Optional[str] = None,
logger: Optional[logging.Logger] = None, logger: Optional[logging.Logger] = None,
triton_core: Optional[Any] = None,
): ):
self.name = name self.name = name
self.version = version self.version = version
......
[build-system]
requires = ["setuptools>=65.0", "setuptools-scm>=8"]
build-backend = "setuptools.build_meta"
[project]
name = "triton_distributed.icp"
dynamic = ["version"]
[tool.setuptools_scm]
version_file = "src/triton_distributed/icp/_version.py"
root = "../.."
[tool.setuptools.packages.find]
where = ["src"]
include = ["triton_distributed.icp*"]
namespaces = true
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Type
class CustomKeyErrorDict(dict):
    """Dictionary that raises a descriptive, configurable exception on a miss.

    Used for conversion tables (e.g. framework <-> wire types) so that a
    failed lookup reports which conversion was attempted instead of a bare
    ``KeyError``.
    """

    def __init__(
        self,
        from_name: str,
        to_name: str,
        *args,
        exception: Type[Exception] = ValueError,
        **kwargs,
    ):
        """Create the dict.

        Parameters
        ----------
        from_name : str
            Human-readable name of the key domain (appears in the error).
        to_name : str
            Human-readable name of the value domain (appears in the error).
        exception : Type[Exception]
            Exception class raised on a missing key (default ``ValueError``).
        *args, **kwargs
            Forwarded unchanged to ``dict``.
        """
        super().__init__(*args, **kwargs)
        self._from_name = from_name
        self._to_name = to_name
        self._exception = exception

    def __getitem__(self, key):
        # Replace the ordinary KeyError with the configured exception type,
        # carrying a message that names the failed conversion.
        if key in self:
            return super().__getitem__(key)
        raise self._exception(
            f"Unsupported {self._from_name}. Can't convert {key} to {self._to_name}"
        ) from None
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
################################################################################
# This file contains the DLPack API wrapped in Python style (see
# 'dlpack.h' for detail) and the utilities for Triton client to interact
# with DLPack
#
# Ref:
# https://github.com/dmlc/dlpack/blob/main/include/dlpack/dlpack.h
# https://github.com/dmlc/dlpack/blob/main/apps/numpy_dlpack/dlpack/from_numpy.py
################################################################################
import ctypes
from typing import Union
from triton_distributed.icp._custom_key_error_dict import CustomKeyErrorDict
from triton_distributed.icp.data_type import DataType
from triton_distributed.icp.memory_type import MemoryType, string_to_memory_type
# cupy is optional: it is only needed for CUDA-side synchronization when
# importing GPU tensors (see DLPackObject.__init__).
try:
    import cupy
except ImportError:
    cupy = None
# Explicitly set the result/argument types of the pythonapi functions used
# below; ctypes defaults them to c_int, which truncates pointers on 64-bit
# platforms.
ctypes.pythonapi.PyMem_RawMalloc.restype = ctypes.c_void_p
ctypes.pythonapi.PyMem_RawFree.argtypes = [ctypes.c_void_p]
ctypes.pythonapi.PyCapsule_New.restype = ctypes.py_object
ctypes.pythonapi.PyCapsule_New.argtypes = [
    ctypes.c_void_p,
    ctypes.c_char_p,
    ctypes.c_void_p,
]
ctypes.pythonapi.PyCapsule_GetPointer.restype = ctypes.c_void_p
ctypes.pythonapi.PyCapsule_GetPointer.argtypes = [ctypes.py_object, ctypes.c_char_p]
# Capsule name used by the DLPack protocol for exported tensors.
c_str_dltensor = b"dltensor"


class DLDeviceType(ctypes.c_int):
    """DLPack device type codes (mirrors the DLDeviceType enum in dlpack.h).

    Values are class attributes so they can be compared directly against the
    integer device type returned by ``__dlpack_device__``.
    """

    kDLCPU = 1
    kDLCUDA = 2
    kDLCUDAHost = 3
    kDLOpenCL = 4
    kDLVulkan = 7
    kDLMetal = 8
    kDLVPI = 9
    kDLROCM = 10
    kDLROCMHost = 11
    kDLExtDev = 12
    kDLCUDAManaged = 13
    kDLOneAPI = 14
    kDLWebGPU = 15
    kDLHexagon = 16
# Accepted ways to specify a memory location: a (MemoryType, id) tuple, a
# bare MemoryType, a (DLDeviceType, id) tuple, or a string such as "gpu:0"
# (normalized by parse_device_or_memory_type below).
DeviceOrMemoryType = Union[
    tuple[MemoryType, int], MemoryType, tuple[DLDeviceType, int], str
]
class DLDevice(ctypes.Structure):
    """ctypes mirror of the DLDevice struct in dlpack.h (type + device id)."""

    _fields_ = [
        ("device_type", ctypes.c_int),
        ("device_id", ctypes.c_int),
    ]
class DLDataTypeCode(ctypes.c_uint8):
    """DLPack data type-code values (mirrors DLDataTypeCode in dlpack.h)."""

    kDLInt = 0
    kDLUInt = 1
    kDLFloat = 2
    kDLOpaquePointer = 3
    kDLBfloat = 4
    kDLComplex = 5
    kDLBool = 6
class DLDataType(ctypes.Structure):
    """ctypes mirror of the DLDataType struct (code, bit width, vector lanes)."""

    _fields_ = [
        ("type_code", ctypes.c_uint8),
        ("bits", ctypes.c_uint8),
        ("lanes", ctypes.c_uint16),
    ]
class DLTensor(ctypes.Structure):
    """ctypes mirror of the DLTensor struct in dlpack.h.

    ``strides`` may be NULL, which by DLPack convention means compact
    row-major layout (see is_contiguous_data).
    """

    _fields_ = [
        ("data", ctypes.c_void_p),
        ("device", DLDevice),
        ("ndim", ctypes.c_int),
        ("dtype", DLDataType),
        ("shape", ctypes.POINTER(ctypes.c_int64)),
        ("strides", ctypes.POINTER(ctypes.c_int64)),
        ("byte_offset", ctypes.c_uint64),
    ]
class DLManagedTensor(ctypes.Structure):
    """ctypes mirror of DLManagedTensor: a DLTensor plus ownership hooks.

    ``deleter`` is invoked (with this struct's address) by the consumer when
    it is done with the tensor; ``manager_ctx`` is an opaque producer handle.
    """

    _fields_ = [
        ("dl_tensor", DLTensor),
        ("manager_ctx", ctypes.c_void_p),
        ("deleter", ctypes.CFUNCTYPE(None, ctypes.c_void_p)),
    ]
# Utilities
def _raise_error(msg):
"""
Raise error with the provided message
"""
raise Exception(msg) from None
# Use as managed context in DLPack that doesn't hold ownership of the
# data content.
class DataViewContext:
    """Manager context for exported DLPack tensors that only *view* data.

    Keeps the shape/strides ctypes arrays alive for as long as the exported
    DLManagedTensor references them; it does not own the data buffer itself.
    """

    def __init__(self, shape) -> None:
        # Convert the Python shape sequence to the ctypes objects expected by
        # DLPack.
        self._shape = (ctypes.c_int64 * len(shape))(*shape)
        # NULL strides pointer: DLPack interprets this as compact, row-major.
        self._strides = ctypes.POINTER(ctypes.c_int64)()

    def as_manager_ctx(self) -> ctypes.c_void_p:
        # Take strong references on this object AND on the pointer cell that
        # wraps it, so both survive until managed_tensor_deleter decrefs them.
        py_obj = ctypes.py_object(self)
        py_obj_ptr = ctypes.pointer(py_obj)
        ctypes.pythonapi.Py_IncRef(py_obj)
        ctypes.pythonapi.Py_IncRef(ctypes.py_object(py_obj_ptr))
        return ctypes.cast(py_obj_ptr, ctypes.c_void_p)
@ctypes.CFUNCTYPE(None, ctypes.c_void_p)
def managed_tensor_deleter(handle: ctypes.c_void_p) -> None:
    # DLManagedTensor.deleter callback: undoes the two Py_IncRef calls made
    # in DataViewContext.as_manager_ctx, then frees the raw allocation that
    # holds the DLManagedTensor struct itself.
    dl_managed_tensor = DLManagedTensor.from_address(handle)  # type: ignore
    py_obj_ptr = ctypes.cast(
        dl_managed_tensor.manager_ctx, ctypes.POINTER(ctypes.py_object)
    )
    py_obj = py_obj_ptr.contents
    ctypes.pythonapi.Py_DecRef(py_obj)
    ctypes.pythonapi.Py_DecRef(ctypes.py_object(py_obj_ptr))
    ctypes.pythonapi.PyMem_RawFree(handle)
@ctypes.CFUNCTYPE(None, ctypes.c_void_p)
def pycapsule_deleter(handle: ctypes.c_void_p) -> None:
    # Capsule destructor: if the capsule is still valid (i.e. no consumer
    # took ownership and renamed it), release the managed tensor ourselves
    # and clear the destructor to avoid a second invocation.
    pycapsule: ctypes.py_object = ctypes.cast(handle, ctypes.py_object)
    if ctypes.pythonapi.PyCapsule_IsValid(pycapsule, c_str_dltensor):
        dl_managed_tensor = ctypes.pythonapi.PyCapsule_GetPointer(
            pycapsule, c_str_dltensor
        )
        managed_tensor_deleter(dl_managed_tensor)
        ctypes.pythonapi.PyCapsule_SetDestructor(pycapsule, None)
def is_contiguous_data(
    ndim: ctypes.c_int,
    shape: ctypes.POINTER(ctypes.c_int64),  # type: ignore
    stride: ctypes.POINTER(ctypes.c_int64),  # type: ignore
):
    """Return True if (shape, stride) describe compact row-major memory.

    A NULL/absent stride pointer is treated as contiguous, per the DLPack
    convention.
    """
    if stride is None or not bool(stride):
        return True
    # Walk dimensions innermost-first, accumulating the stride a compact
    # row-major layout would require, and compare dimension by dimension.
    expected = 1
    for axis in reversed(range(ndim)):  # type: ignore
        if stride[axis] != expected:
            return False
        expected *= shape[axis]
    return True
def get_byte_size(
    dtype: DLDataType, ndim: ctypes.c_int, shape: ctypes.POINTER(ctypes.c_int64)  # type: ignore
):
    """Return the tensor's total size in bytes.

    Computed as element byte size (bits * lanes / 8 — assumes 8-bit bytes)
    multiplied by the product of all dimensions.
    """
    total = dtype.bits * dtype.lanes // 8
    for axis in range(ndim):  # type: ignore
        total *= shape[axis]
    return total
def get_dlpack_capsule(dlpack_obj, stream=None):
    """Extract the DLPack PyCapsule from *dlpack_obj*.

    Objects implementing the modern protocol must define both ``__dlpack__``
    and ``__dlpack_device__``; anything without ``__dlpack__`` is assumed to
    already be a PyCapsule and is returned unchanged.
    """
    if not hasattr(dlpack_obj, "__dlpack__"):
        # Old interface where PyCapsule object is passed directly
        return dlpack_obj
    if not hasattr(dlpack_obj, "__dlpack_device__"):
        _raise_error(
            "DLPack expects '__dlpack_device__' if '__dlpack__' has been defined"
        )
    device = dlpack_obj.__dlpack_device__()
    # BUG FIX: __dlpack_device__ returns a (device_type, device_id) tuple, so
    # the previous `device != DLDeviceType.kDLCUDA` compared a tuple to an int
    # and was always True — the stream was never forwarded for CUDA tensors.
    # Compare the device-type field instead. Some implementations (e.g. numpy)
    # don't accept a 'stream' argument, hence the conditional.
    if device[0] != DLDeviceType.kDLCUDA:
        return dlpack_obj.__dlpack__()
    return dlpack_obj.__dlpack__(stream)
def get_dlpack_device(dlpack_obj):
    """Return the (device_type, device_id) pair reported by *dlpack_obj*,
    or None when the object does not implement ``__dlpack_device__``."""
    getter = getattr(dlpack_obj, "__dlpack_device__", None)
    return None if getter is None else getter()
def get_managed_tensor(dlcapsule):
    """Dereference a 'dltensor' PyCapsule into its DLManagedTensor struct.

    The returned struct aliases memory owned by the capsule's producer; keep
    the capsule alive while using it.
    """
    ptr = ctypes.pythonapi.PyCapsule_GetPointer(dlcapsule, c_str_dltensor)
    return DLManagedTensor.from_address(ptr)
class DLPackObject:
    """Imports any DLPack-capable object and exposes its tensor metadata
    (shape, dtype, device, byte size, contiguity) via properties.

    The wrapped DLManagedTensor is borrowed from the producer's capsule;
    this object keeps the capsule alive but does not own the data.
    """

    def __init__(self, value) -> None:
        try:
            stream = None
            device, device_id = value.__dlpack_device__()
            if device == DLDeviceType.kDLCUDA:
                # CUDA producers need a stream passed to __dlpack__ for
                # synchronization; selecting the device requires cupy.
                if cupy is None:
                    raise ValueError(
                        f"DLPack synchronization on device {device,device_id} not supported"
                    )
                with cupy.cuda.Device(device_id):
                    stream = 1  # legacy default stream
                self._capsule = get_dlpack_capsule(value, stream)
                self._tensor = get_managed_tensor(self._capsule).dl_tensor
            else:
                self._capsule = get_dlpack_capsule(value)
                self._tensor = get_managed_tensor(self._capsule).dl_tensor
        except Exception as e:
            # Any failure (missing protocol, bad capsule, ...) is surfaced as
            # a single ValueError with the original cause in the message.
            raise ValueError(f"Object does not support DLPack protocol: {e}") from None

    def __eq__(self, other) -> bool:
        # Two DLPackObjects are equal when all exposed metadata AND the data
        # pointer match (i.e. they describe the same view of the same memory).
        if not isinstance(other, DLPackObject):
            return False
        if self.byte_size != other.byte_size:
            return False
        if self.memory_type != other.memory_type:
            return False
        if self.memory_type_id != other.memory_type_id:
            return False
        if self.shape != other.shape:
            return False
        if self.data_ptr != other.data_ptr:
            return False
        if self.contiguous != other.contiguous:
            return False
        if self.data_type != other.data_type:
            return False
        return True

    @property
    def byte_size(self) -> int:
        """Total size of the tensor data in bytes."""
        return get_byte_size(self._tensor.dtype, self._tensor.ndim, self._tensor.shape)

    @property
    def memory_type(self) -> MemoryType:
        """MemoryType derived from the DLPack device type (raises on
        unsupported device types via CustomKeyErrorDict)."""
        return DLPACK_DEVICE_TYPE_TO_MEMORY_TYPE[self._tensor.device.device_type]

    @property
    def memory_type_id(self) -> int:
        """Device id (e.g. GPU ordinal) of the tensor's device."""
        return self._tensor.device.device_id

    @property
    def shape(self) -> list[int]:
        """Tensor dimensions as a plain Python list."""
        return [self._tensor.shape[i] for i in range(self._tensor.ndim)]

    @property
    def data_type(self) -> DataType:
        """DataType derived from the DLPack (type_code, bits) pair."""
        return DLPACK_TO_DATA_TYPE[self.dlpack_data_type]

    @property
    def dlpack_data_type(self) -> tuple[DLDataTypeCode, int]:
        """Raw DLPack (type_code, bits) pair."""
        return (self._tensor.dtype.type_code, self._tensor.dtype.bits)

    @property
    def data_ptr(self) -> ctypes.c_void_p:
        # Note: byte_offset is folded in, so this points at the first element.
        return self._tensor.data + self._tensor.byte_offset

    @property
    def contiguous(self) -> bool:
        """True when the tensor is compact row-major (or strides are NULL)."""
        return is_contiguous_data(
            self._tensor.ndim, self._tensor.shape, self._tensor.strides
        )
# Conversion tables between DLPack device types and MemoryType. Lookups of
# unsupported values raise ValueError (via CustomKeyErrorDict) with a message
# naming the failed conversion.
DLPACK_DEVICE_TYPE_TO_MEMORY_TYPE: dict[DLDeviceType, MemoryType] = CustomKeyErrorDict(
    "DLPack device type",
    "Memory type",
    {
        DLDeviceType.kDLCUDA: MemoryType.GPU,
        DLDeviceType.kDLCPU: MemoryType.CPU,
    },
)

# Inverse mapping. NOTE: CPU_PINNED is exported as plain kDLCPU here (not as
# a separate pinned-host device type).
MEMORY_TYPE_TO_DLPACK_DEVICE_TYPE: dict[MemoryType, DLDeviceType] = CustomKeyErrorDict(
    "Memory type",
    "DLPack device type",
    {
        **{value: key for key, value in DLPACK_DEVICE_TYPE_TO_MEMORY_TYPE.items()},
        **{MemoryType.CPU_PINNED: DLDeviceType.kDLCPU},
    },
)
def parse_device_or_memory_type(
    device_or_memory_type: DeviceOrMemoryType,
) -> tuple[MemoryType, int]:
    """Normalize a device/memory specification into (MemoryType, id).

    Accepts a (MemoryType, id) tuple, a (DLDeviceType, id) tuple, a bare
    MemoryType (id defaults to 0), or a string "type" / "type:id"
    (e.g. "gpu:1"; the type part is upper-cased before lookup).

    Raises
    ------
    ValueError
        If the value is malformed or of an unsupported type.
    """
    memory_type_id = 0
    if isinstance(device_or_memory_type, tuple):
        if isinstance(device_or_memory_type[0], MemoryType):
            memory_type = device_or_memory_type[0]
            memory_type_id = device_or_memory_type[1]
        elif isinstance(device_or_memory_type[0], DLDeviceType):
            memory_type = DLPACK_DEVICE_TYPE_TO_MEMORY_TYPE[device_or_memory_type[0]]
            memory_type_id = device_or_memory_type[1]
        else:
            raise ValueError(f"Invalid memory type {device_or_memory_type}")
    elif isinstance(device_or_memory_type, MemoryType):
        memory_type = device_or_memory_type
    elif isinstance(device_or_memory_type, str):
        memory_str_tuple = device_or_memory_type.split(":")
        if len(memory_str_tuple) > 2:
            raise ValueError(f"Invalid memory type string {device_or_memory_type}")
        memory_type = string_to_memory_type(memory_str_tuple[0].upper())
        if len(memory_str_tuple) == 2:
            try:
                memory_type_id = int(memory_str_tuple[1])
            except ValueError:
                raise ValueError(
                    f"Invalid memory type string {device_or_memory_type}"
                ) from None
    else:
        # BUG FIX: previously fell through and returned (None, 0), violating
        # the declared return type; fail loudly on unsupported inputs instead.
        raise ValueError(f"Invalid memory type {device_or_memory_type}")
    return (memory_type, memory_type_id)
# DLPack (type_code, bits) pair -> DataType. Lookups of unsupported pairs
# raise ValueError (via CustomKeyErrorDict). Note: no BYTES entry — DLPack
# has no variable-length string type.
DLPACK_TO_DATA_TYPE: dict[tuple[DLDataTypeCode, int], DataType] = CustomKeyErrorDict(
    "DLPack data type",
    "Data type",
    {
        (DLDataTypeCode.kDLBool, 8): DataType.BOOL,
        (DLDataTypeCode.kDLInt, 8): DataType.INT8,
        (
            DLDataTypeCode.kDLInt,
            16,
        ): DataType.INT16,
        (
            DLDataTypeCode.kDLInt,
            32,
        ): DataType.INT32,
        (
            DLDataTypeCode.kDLInt,
            64,
        ): DataType.INT64,
        (
            DLDataTypeCode.kDLUInt,
            8,
        ): DataType.UINT8,
        (
            DLDataTypeCode.kDLUInt,
            16,
        ): DataType.UINT16,
        (
            DLDataTypeCode.kDLUInt,
            32,
        ): DataType.UINT32,
        (
            DLDataTypeCode.kDLUInt,
            64,
        ): DataType.UINT64,
        (
            DLDataTypeCode.kDLFloat,
            16,
        ): DataType.FP16,
        (
            DLDataTypeCode.kDLFloat,
            32,
        ): DataType.FP32,
        (
            DLDataTypeCode.kDLFloat,
            64,
        ): DataType.FP64,
        (
            DLDataTypeCode.kDLBfloat,
            16,
        ): DataType.BF16,
    },
)

# Inverse mapping producing full DLDataType structs (lanes fixed to 1).
DATA_TYPE_TO_DLPACK_DTYPE: dict[DataType, DLDataType] = CustomKeyErrorDict(
    "Data type",
    "DLPack data type",
    {
        value: DLDataType(type_code=key[0], bits=key[1], lanes=1)
        for key, value in DLPACK_TO_DATA_TYPE.items()
    },
)
...@@ -13,7 +13,7 @@ ...@@ -13,7 +13,7 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
"""Abstract Class for interacting with Triton Inference Serving Platform Inter-Component Protocol Data Plane""" """Abstract Class for interacting with Triton Distributed Inter-Component Protocol Data Plane"""
import abc import abc
import uuid import uuid
...@@ -21,28 +21,16 @@ from typing import Optional, Sequence ...@@ -21,28 +21,16 @@ from typing import Optional, Sequence
import cupy import cupy
import numpy import numpy
from tritonserver import (
from triton_distributed.icp.data_type import (
DATA_TYPE_TO_NUMPY_DTYPE,
DataType, DataType,
InvalidArgumentError, string_to_data_type,
MemoryBuffer,
MemoryType,
Tensor,
)
from tritonserver._api._datautils import (
STRING_TO_TRITON_MEMORY_TYPE,
TRITON_TO_NUMPY_DTYPE,
)
from tritonserver._c.triton_bindings import (
TRITONSERVER_DataTypeString as DataTypeString,
)
from tritonserver._c.triton_bindings import (
TRITONSERVER_MemoryTypeString as MemoryTypeString,
) )
from tritonserver._c.triton_bindings import ( from triton_distributed.icp.memory_buffer import MemoryBuffer
TRITONSERVER_StringToDataType as StringToDataType, from triton_distributed.icp.memory_type import MemoryType, string_to_memory_type
)
from triton_distributed.icp.protos.icp_pb2 import ModelInferRequest, ModelInferResponse from triton_distributed.icp.protos.icp_pb2 import ModelInferRequest, ModelInferResponse
from triton_distributed.icp.tensor import Tensor
class DataPlaneError(Exception): class DataPlaneError(Exception):
...@@ -73,13 +61,13 @@ def set_icp_data_type( ...@@ -73,13 +61,13 @@ def set_icp_data_type(
message: ModelInferRequest.InferInputTensor | ModelInferResponse.InferOutputTensor, message: ModelInferRequest.InferInputTensor | ModelInferResponse.InferOutputTensor,
value: DataType, value: DataType,
) -> None: ) -> None:
message.datatype = DataTypeString(value) message.datatype = value.name
def get_icp_data_type( def get_icp_data_type(
message: ModelInferRequest.InferInputTensor | ModelInferResponse.InferOutputTensor, message: ModelInferRequest.InferInputTensor | ModelInferResponse.InferOutputTensor,
) -> DataType: ) -> DataType:
return StringToDataType(message.datatype) return string_to_data_type(message.datatype)
def set_icp_tensor_uri( def set_icp_tensor_uri(
...@@ -116,7 +104,7 @@ def set_icp_memory_type( ...@@ -116,7 +104,7 @@ def set_icp_memory_type(
message: ModelInferRequest.InferInputTensor | ModelInferResponse.InferOutputTensor, message: ModelInferRequest.InferInputTensor | ModelInferResponse.InferOutputTensor,
value: MemoryType, value: MemoryType,
) -> None: ) -> None:
message.parameters[ICP_MEMORY_TYPE].string_param = MemoryTypeString(value) message.parameters[ICP_MEMORY_TYPE].string_param = value.name
def get_icp_memory_type( def get_icp_memory_type(
...@@ -124,9 +112,7 @@ def get_icp_memory_type( ...@@ -124,9 +112,7 @@ def get_icp_memory_type(
) -> MemoryType | None: ) -> MemoryType | None:
if ICP_MEMORY_TYPE not in message.parameters: if ICP_MEMORY_TYPE not in message.parameters:
return None return None
return STRING_TO_TRITON_MEMORY_TYPE[ return string_to_memory_type(message.parameters[ICP_MEMORY_TYPE].string_param)
message.parameters[ICP_MEMORY_TYPE].string_param
]
def set_icp_memory_type_id( def set_icp_memory_type_id(
...@@ -163,9 +149,7 @@ def set_icp_tensor_contents( ...@@ -163,9 +149,7 @@ def set_icp_tensor_contents(
with cupy.cuda.Device(tensor.memory_buffer.memory_type_id): with cupy.cuda.Device(tensor.memory_buffer.memory_type_id):
array = cupy.from_dlpack(tensor) array = cupy.from_dlpack(tensor)
else: else:
raise InvalidArgumentError( raise ValueError(f"Invalid Tensor Memory Type {tensor.memory_type}")
f"Invalid Tensor Memory Type {tensor.memory_type}"
)
message.contents.bytes_contents.append(array.tobytes()) message.contents.bytes_contents.append(array.tobytes())
...@@ -193,7 +177,7 @@ def get_icp_tensor_contents( ...@@ -193,7 +177,7 @@ def get_icp_tensor_contents(
array = numpy.array( array = numpy.array(
numpy.frombuffer( numpy.frombuffer(
message.contents.bytes_contents[0], message.contents.bytes_contents[0],
dtype=TRITON_TO_NUMPY_DTYPE[datatype], dtype=DATA_TYPE_TO_NUMPY_DTYPE[datatype],
) )
) )
tensor = Tensor(datatype, shape, MemoryBuffer.from_dlpack(array)) tensor = Tensor(datatype, shape, MemoryBuffer.from_dlpack(array))
......
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations
from enum import IntEnum
import numpy
from triton_distributed.icp._custom_key_error_dict import CustomKeyErrorDict
class DataType(IntEnum):
    """Tensor element types used on the wire (names match the protocol's
    datatype strings; integer values are stable and start at INVALID = 0)."""

    INVALID = 0
    BOOL = 1
    UINT8 = 2
    UINT16 = 3
    UINT32 = 4
    UINT64 = 5
    INT8 = 6
    INT16 = 7
    INT32 = 8
    INT64 = 9
    FP16 = 10
    FP32 = 11
    FP64 = 12
    BYTES = 13
    BF16 = 14


def string_to_data_type(data_type_string: str) -> DataType:
    """Look up a DataType by its exact name, raising ValueError for
    unknown names."""
    member = DataType.__members__.get(data_type_string)
    if member is None:
        raise ValueError(
            f"Unsupported Data Type String. Can't convert {data_type_string} to DataType"
        ) from None
    return member
# numpy scalar type -> DataType. Lookups of unsupported dtypes raise
# ValueError (via CustomKeyErrorDict). bytes_/str_/object_ all collapse to
# BYTES since the protocol has a single variable-length type.
NUMPY_TO_DATA_TYPE: dict[type, DataType] = CustomKeyErrorDict(
    "Numpy dtype",
    "Data type",
    {
        bool: DataType.BOOL,
        numpy.bool_: DataType.BOOL,
        numpy.int8: DataType.INT8,
        numpy.int16: DataType.INT16,
        numpy.int32: DataType.INT32,
        numpy.int64: DataType.INT64,
        numpy.uint8: DataType.UINT8,
        numpy.uint16: DataType.UINT16,
        numpy.uint32: DataType.UINT32,
        numpy.uint64: DataType.UINT64,
        numpy.float16: DataType.FP16,
        numpy.float32: DataType.FP32,
        numpy.float64: DataType.FP64,
        numpy.bytes_: DataType.BYTES,
        numpy.str_: DataType.BYTES,
        numpy.object_: DataType.BYTES,
    },
)

# Inverse of NUMPY_TO_DATA_TYPE; the many-to-one entries (BYTES, BOOL) are
# pinned explicitly so the reverse lookup is deterministic.
DATA_TYPE_TO_NUMPY_DTYPE: dict[DataType, type] = CustomKeyErrorDict(
    "Data type",
    "Numpy dtype",
    {
        **{value: key for key, value in NUMPY_TO_DATA_TYPE.items()},
        **{DataType.BYTES: numpy.object_},
        **{DataType.BOOL: numpy.bool_},
    },
)
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations
from dataclasses import dataclass
from typing import Any
from triton_distributed.icp._dlpack import DLPackObject
from triton_distributed.icp.memory_type import MemoryType
@dataclass
class MemoryBuffer:
    """Memory allocated for a Tensor.

    This object does not own the memory but holds a reference to the
    owner.

    Parameters
    ----------
    data_ptr : int
        Pointer to the allocated memory.
    memory_type : MemoryType
        memory type
    memory_type_id : int
        memory type id (typically the same as device id)
    size : int
        Size of the allocated memory in bytes.
    owner : Any
        Object that owns or manages the memory buffer. Allocated
        memory must not be freed while a reference to the owner is
        held.

    Examples
    --------
    >>> buffer = MemoryBuffer.from_dlpack(numpy.array([100],dtype=numpy.uint8))

    """

    data_ptr: int
    memory_type: MemoryType
    memory_type_id: int
    size: int
    owner: Any

    @staticmethod
    def from_dlpack(owner: Any) -> MemoryBuffer:
        """Create a MemoryBuffer viewing the memory of a DLPack-capable
        object. Raises ValueError if *owner* lacks ``__dlpack__``."""
        if not hasattr(owner, "__dlpack__"):
            raise ValueError("Object does not support DLpack protocol")
        dlpack_object = DLPackObject(owner)
        return MemoryBuffer._from_dlpack_object(owner, dlpack_object)

    @staticmethod
    def _from_dlpack_object(owner: Any, dlpack_object: DLPackObject) -> MemoryBuffer:
        """Build the buffer from an already-imported DLPackObject.

        Only contiguous (compact row-major) memory is supported, since a
        MemoryBuffer describes the data with a single (pointer, size) pair.
        """
        if not dlpack_object.contiguous:
            raise ValueError("Only contiguous memory is supported")
        return MemoryBuffer(
            int(dlpack_object.data_ptr),
            dlpack_object.memory_type,
            dlpack_object.memory_type_id,
            dlpack_object.byte_size,
            owner,
        )
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from enum import IntEnum
class MemoryType(IntEnum):
    """Physical location of a buffer: host memory, pinned host memory,
    or device (GPU) memory."""

    CPU = 0
    CPU_PINNED = 1
    GPU = 2


def string_to_memory_type(memory_type_string: str) -> MemoryType:
    """Look up a MemoryType by its exact name, raising ValueError for
    unknown names."""
    member = MemoryType.__members__.get(memory_type_string)
    if member is None:
        raise ValueError(
            f"Unsupported Memory Type String. Can't convert {memory_type_string} to MemoryType"
        ) from None
    return member
...@@ -25,7 +25,6 @@ from typing import Dict, Optional ...@@ -25,7 +25,6 @@ from typing import Dict, Optional
from urllib.parse import urlsplit, urlunsplit from urllib.parse import urlsplit, urlunsplit
import nats import nats
from tritonserver import InvalidArgumentError
from triton_distributed.icp.protos.icp_pb2 import ModelInferRequest, ModelInferResponse from triton_distributed.icp.protos.icp_pb2 import ModelInferRequest, ModelInferResponse
from triton_distributed.icp.request_plane import ( from triton_distributed.icp.request_plane import (
...@@ -203,7 +202,7 @@ class NatsRequestPlane(RequestPlane): ...@@ -203,7 +202,7 @@ class NatsRequestPlane(RequestPlane):
Optional[nats.js.JetStreamContext.PullSubscription], Optional[nats.js.JetStreamContext.PullSubscription],
]: ]:
if self._jet_stream is None: if self._jet_stream is None:
raise InvalidArgumentError( raise ValueError(
"Failed to get model stream: NATS Jetstream not connected!" "Failed to get model stream: NATS Jetstream not connected!"
) )
...@@ -335,19 +334,15 @@ class NatsRequestPlane(RequestPlane): ...@@ -335,19 +334,15 @@ class NatsRequestPlane(RequestPlane):
responses: AsyncIterator[ModelInferResponse] | ModelInferResponse, responses: AsyncIterator[ModelInferResponse] | ModelInferResponse,
): ):
if self._jet_stream is None: if self._jet_stream is None:
raise InvalidArgumentError( raise ValueError("Failed to post response: NATS Jetstream not connected!")
"Failed to post response: NATS Jetstream not connected!"
)
request_id = get_icp_request_id(request) request_id = get_icp_request_id(request)
if request_id is None: if request_id is None:
raise InvalidArgumentError("ICP request must have request id") raise ValueError("ICP request must have request id")
response_to_uri = get_icp_response_to_uri(request) response_to_uri = get_icp_response_to_uri(request)
if not response_to_uri: if not response_to_uri:
raise InvalidArgumentError( raise ValueError("Attempting to send a response when non requested")
"Attempting to send a response when non requested"
)
parsed = urlsplit(response_to_uri) parsed = urlsplit(response_to_uri)
response_stream = parsed.path.replace("/", "") response_stream = parsed.path.replace("/", "")
...@@ -378,12 +373,10 @@ class NatsRequestPlane(RequestPlane): ...@@ -378,12 +373,10 @@ class NatsRequestPlane(RequestPlane):
] = None, ] = None,
) -> AsyncIterator[ModelInferResponse]: ) -> AsyncIterator[ModelInferResponse]:
if self._jet_stream is None: if self._jet_stream is None:
raise InvalidArgumentError( raise ValueError("Failed to post request: NATS Jetstream not connected!")
"Failed to post request: NATS Jetstream not connected!"
)
if response_iterator and response_handler: if response_iterator and response_handler:
raise InvalidArgumentError( raise ValueError(
"Can only specify either response handler or response iterator" "Can only specify either response handler or response iterator"
) )
......
...@@ -13,14 +13,12 @@ ...@@ -13,14 +13,12 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
"""Abstract Class for interacting with the Triton Inference Serving Platform Inter-Component Protocol Control Plane""" """Abstract Class for interacting with the Triton Distributed Inter-Component Protocol Control Plane"""
import abc import abc
import uuid import uuid
from typing import AsyncIterator, Awaitable, Callable, Optional from typing import AsyncIterator, Awaitable, Callable, Optional
from tritonserver import TritonError
from triton_distributed.icp.protos.icp_pb2 import ModelInferRequest, ModelInferResponse from triton_distributed.icp.protos.icp_pb2 import ModelInferRequest, ModelInferResponse
ICP_REQUEST_ID = "icp_request_id" ICP_REQUEST_ID = "icp_request_id"
...@@ -33,6 +31,10 @@ ICP_REQUEST_CANCELLED = "icp_request_cancelled" ...@@ -33,6 +31,10 @@ ICP_REQUEST_CANCELLED = "icp_request_cancelled"
ICP_ERROR = "icp_response_error" ICP_ERROR = "icp_response_error"
class RequestPlaneError(Exception):
pass
def get_icp_request_id( def get_icp_request_id(
message: ModelInferRequest | ModelInferResponse, message: ModelInferRequest | ModelInferResponse,
) -> uuid.UUID | None: ) -> uuid.UUID | None:
...@@ -47,13 +49,15 @@ def set_icp_request_id( ...@@ -47,13 +49,15 @@ def set_icp_request_id(
message.parameters[ICP_REQUEST_ID].string_param = str(value) message.parameters[ICP_REQUEST_ID].string_param = str(value)
def get_icp_response_error(message: ModelInferResponse) -> TritonError | None: def get_icp_response_error(message: ModelInferResponse) -> RequestPlaneError | None:
if ICP_ERROR not in message.parameters: if ICP_ERROR not in message.parameters:
return None return None
return TritonError(message.parameters[ICP_ERROR].string_param) return RequestPlaneError(message.parameters[ICP_ERROR].string_param)
def set_icp_response_error(message: ModelInferResponse, value: TritonError) -> None: def set_icp_response_error(
message: ModelInferResponse, value: RequestPlaneError
) -> None:
message.parameters[ICP_ERROR].string_param = str(value) message.parameters[ICP_ERROR].string_param = str(value)
......
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations
import ctypes
import struct
from dataclasses import dataclass
from typing import Any, Callable, ClassVar, Sequence
import numpy
from triton_distributed.icp._dlpack import (
DATA_TYPE_TO_DLPACK_DTYPE,
MEMORY_TYPE_TO_DLPACK_DEVICE_TYPE,
DeviceOrMemoryType,
DLDevice,
DLDeviceType,
DLManagedTensor,
DLPackObject,
c_str_dltensor,
parse_device_or_memory_type,
)
from triton_distributed.icp.data_type import NUMPY_TO_DATA_TYPE, DataType
from triton_distributed.icp.memory_buffer import MemoryBuffer
from triton_distributed.icp.memory_type import MemoryType
try:
import cupy
except ImportError:
cupy = None
@dataclass
class Tensor:
    """Class representing a Tensor.

    Parameters
    ----------
    data_type : DataType
        Data type of the tensor.
    shape : Sequence[int]
        Shape of the tensor.
    memory_buffer : MemoryBuffer
        Memory buffer containing the tensor data.
    """

    data_type: DataType
    shape: Sequence[int]
    memory_buffer: MemoryBuffer

    @property
    def data_ptr(self) -> int:
        """Get the pointer to the tensor's data.

        Returns
        -------
        int
            The pointer to the tensor's data.
        """
        return self.memory_buffer.data_ptr

    @property
    def memory_type(self) -> MemoryType:
        """Get the memory type of the tensor.

        Returns
        -------
        MemoryType
            The memory type of the tensor.
        """
        return self.memory_buffer.memory_type

    @property
    def memory_type_id(self) -> int:
        """Get the ID representing the memory type of the tensor.

        Returns
        -------
        int
            The ID representing the memory type of the tensor.
        """
        return self.memory_buffer.memory_type_id

    @property
    def size(self) -> int:
        """Get the size of the tensor's data in bytes.

        Returns
        -------
        int
            The size of the tensor's data in bytes.
        """
        return self.memory_buffer.size

    def _sync_on_requested_stream(self, requested_stream_ptr):
        """Stream synchronization based on cupy implementation.

        Record event on requested stream if different from current
        stream.

        See
        `https://data-apis.org/array-api/latest/API_specification/generated/array_api.array.__dlpack__.html`
        and
        `https://github.com/cupy/cupy/blob/f9563bcd5c674623f80ce975b590f9c860b44ed6/cupy/_core/core.pyx#L281`.
        for more details.

        Parameters
        ----------
        requested_stream_ptr :
            requested stream as defined by DLPack protocol

        Raises
        ------
        unsupported
            If synchronization can not be done
        """
        current_stream = None
        unsupported = ValueError(
            f"DLPack stream synchronization on memory type {self.memory_type} and stream {requested_stream_ptr} not supported"
        )
        if requested_stream_ptr is not None and not isinstance(
            requested_stream_ptr, int
        ):
            raise unsupported
        if self.memory_type != MemoryType.GPU:
            # For host memory only the "no synchronization" values are valid.
            if requested_stream_ptr not in (None, 0):
                raise unsupported
            return
        if cupy is None:
            raise unsupported

        # NOTE: Technically this is not required by the protocol. It is the
        # responsibility of the caller(consumer) to ensure that
        # we are on the correct device. Added to ensure
        # the semantics are correct - but should be a no-op.
        # May be removed in the future.
        with cupy.cuda.Device(self.memory_type_id):
            current_stream = cupy.cuda.get_current_stream()

        curr_stream_ptr = current_stream.ptr

        # Based on cupy documentation
        # cupy.cuda.Stream.null.ptr is legacy default stream
        # cupy.cuda.Stream.ptds.ptr is per thread default stream
        if curr_stream_ptr == 0:
            curr_stream_ptr = cupy.cuda.Stream.null.ptr

        if requested_stream_ptr in (None, 0, 1):
            requested_stream_ptr = cupy.cuda.Stream.null.ptr

        if requested_stream_ptr in (2,):
            requested_stream_ptr = cupy.cuda.Stream.ptds.ptr

        if requested_stream_ptr >= 0 and curr_stream_ptr != requested_stream_ptr:
            next_stream = cupy.cuda.ExternalStream(requested_stream_ptr)
            event = current_stream.record()
            next_stream.wait_event(event)

    def __dlpack__(self, *, stream=None):
        """Convert the tensor to a DLPack-compatible object.

        Parameters
        ----------
        stream : Any, optional
            Consumer stream (as defined by the DLPack protocol) to
            synchronize the tensor's data with before export, by default
            None. See ``_sync_on_requested_stream`` for the supported
            values.

        Returns
        -------
        Any
            A DLPack-compatible object representing the tensor.
        """
        self._sync_on_requested_stream(stream)

        dl_managed_tensor = self._create_managed_tensor()

        pycapsule = ctypes.pythonapi.PyCapsule_New(
            ctypes.byref(dl_managed_tensor),
            c_str_dltensor,
            Tensor._pycapsule_deleter,
        )
        return pycapsule

    def __dlpack_device__(self) -> tuple[DLDeviceType, int]:
        """Get the DLPack device information for the tensor.

        Returns
        -------
        tuple[DLDeviceType, int]
            A tuple representing the DLPack device information (device type, device ID).
        """
        return (
            MEMORY_TYPE_TO_DLPACK_DEVICE_TYPE[self.memory_type],
            self.memory_type_id,
        )

    def to_string_array(self) -> numpy.ndarray:
        """Deserialize BYTES Tensor into numpy array of strings.

        If memory is not on the host the tensor data will be copied to
        the host before deserialization.

        Returns
        -------
        numpy.ndarray
            A numpy array of objects representing the BYTES tensor.

        Examples
        --------
        numpy_ndarray = response.outputs["text_output"].to_string_array()
        """
        return self.to_bytes_array().astype(str)

    def to_bytes_array(self) -> numpy.ndarray:
        """Deserialize BYTES Tensor into numpy array.

        If memory is not on the host the tensor data will be copied to
        the host before deserialization.

        Returns
        -------
        numpy.ndarray
            A numpy array of objects representing the BYTES tensor.

        Raises
        ------
        ValueError
            If the tensor's data type is not BYTES.

        Examples
        --------
        numpy_ndarray = response.outputs["text_output"].to_bytes_array()
        """
        if self.data_type != DataType.BYTES:
            raise ValueError(
                f"Tensor has data type {self.data_type} not {DataType.BYTES}"
            )

        # Temporarily reinterpret the tensor as a flat UINT8 array so the
        # raw serialized bytes can be copied to the host. Restore the
        # original metadata in a finally block so a failure during the host
        # copy can not leave the tensor with corrupted data_type/shape.
        original_data_type = self.data_type
        original_shape = self.shape
        try:
            self.data_type = DataType.UINT8
            self.shape = [self.size]
            numpy_ndarray = self._to_numpy_on_host()
        finally:
            self.shape = original_shape
            self.data_type = original_data_type

        # Deserialize bytes array and reshape
        return Tensor._deserialize_bytes_array(numpy_ndarray).reshape(self.shape)

    @staticmethod
    def from_string_array(string_array: list[str] | numpy.ndarray) -> Tensor:
        """Create BYTES Tensor from numpy array of strings or list of strings.

        Creates a tensor of type BYTES from a list of strings,
        or numpy array of type str_. The
        method allocates new host memory to store the serialized
        tensor.

        Parameters
        ----------
        string_array : list[str] | numpy.ndarray
            an array like object to convert

        Returns
        -------
        Tensor

        Raises
        ------
        ValueError
            If the given object can not be converted.

        Examples
        --------
        tensor = Tensor.from_string_array(numpy.array(["hello"]))
        tensor = Tensor.from_string_array(["hello"])
        """
        return Tensor.from_bytes_array(string_array)

    @staticmethod
    def from_bytes_array(
        bytes_array: list[str] | list[bytes] | numpy.ndarray,
    ) -> Tensor:
        """Create BYTES Tensor from numpy array or list

        Creates a tensor of type BYTES from a list of strings,
        bytes or a numpy array of type object_, bytes_, or str_. The
        method allocates new host memory to store the serialized
        tensor.

        Parameters
        ----------
        bytes_array : list[str | bytes] | numpy.ndarray
            an array like object to convert

        Returns
        -------
        Tensor

        Raises
        ------
        ValueError
            If the given object can not be converted.

        Examples
        --------
        tensor = Tensor.from_bytes_array(numpy.array(["hello"]))
        tensor = Tensor.from_bytes_array(["hello"])
        """
        result = Tensor._from_object(bytes_array)
        if result.data_type != DataType.BYTES:
            raise ValueError(
                f"Unsupported conversion from {bytes_array} to BYTES Tensor. Got {result.data_type}"
            )
        return result

    @staticmethod
    def _from_object(obj: list[Any] | numpy.ndarray | Any) -> Tensor:
        """Create a tensor from an object.

        Creates a tensor from an object using specific conversion
        methods if available or falls back to using __from_dlpack__.

        Specific conversions are currently supported for:

        list[obj: Any] : implicitly converted to numpy.array()

        numpy.ndarray : serialized if required to BYTES tensor

        Parameters
        ----------
        obj : list[Any] | numpy.ndarray | Any
            The input object to create the tensor from.

        Returns
        -------
        Tensor
            A new tensor created from the specified object.

        Examples
        --------
        tensor = Tensor.from_object(numpy.array(["hello"]))
        tensor = Tensor.from_object(["hello"])
        """
        if type(obj) in Tensor._from_converters:
            return Tensor._from_converters[type(obj)](obj)
        elif hasattr(obj, "__dlpack__"):
            return Tensor.from_dlpack(obj)
        else:
            raise ValueError(
                f"Input type {type(obj)} not supported. Must be one of {list(Tensor._from_converters.keys())} or the type must support __dlpack__"
            )

    @staticmethod
    def from_dlpack(obj: Any) -> Tensor:
        """Create a tensor from a DLPack-compatible object.

        Parameters
        ----------
        obj : Any
            The DLPack-compatible object.

        Returns
        -------
        Tensor
            A new tensor created from the DLPack-compatible object.

        Examples
        --------
        tensor = Tensor.from_dlpack(numpy.array([0,1,2], dtype=numpy.float16))
        tensor = Tensor.from_dlpack(torch.zeros(100, dtype=torch.float16))
        """
        dlpack_object = DLPackObject(obj)
        data_type = dlpack_object.data_type
        shape = dlpack_object.shape
        memory_buffer = MemoryBuffer._from_dlpack_object(
            obj, dlpack_object=dlpack_object
        )
        return Tensor(data_type, shape, memory_buffer)

    def to_host(self) -> Tensor:
        """Move the tensor to CPU memory from device memory

        Returns
        -------
        Tensor
            The tensor moved to the CPU.

        Examples
        --------
        tensor = Tensor.from_dlpack(torch.zeros(100, dtype=torch.float16).to("cuda"))
        numpy_nd_array = numpy.array(tensor.to_host())
        """
        return self.to_device("cpu")

    def to_device(self, device: DeviceOrMemoryType) -> Tensor:
        """Move the tensor to the specified device.

        Parameters
        ----------
        device : DeviceOrMemoryType
            The target device. Device can be specified as a string,
            MemoryType, tuple [MemoryType, memory_type__id], or
            tuple[DLDeviceType, device_id].

        Returns
        -------
        Tensor
            The tensor moved to the specified device.

        Raises
        ------
        ValueError
            If the conversion between memory types is not supported
            (e.g. GPU transfers when cupy is unavailable).

        Examples
        --------
        tensor_cpu = tritonserver.Tensor.from_dlpack(numpy.array([0,1,2], dtype=numpy.float16))

        # Different ways to specify the device

        tensor_gpu = tensor_cpu.to_device(MemoryType.GPU)

        tensor_gpu = tensor_cpu.to_device((MemoryType.GPU,0))

        tensor_gpu = tensor_cpu.to_device((DLDeviceType.kDLCUDA,0))

        tensor_gpu = tensor_cpu.to_device("gpu")

        tensor_gpu = tensor_cpu.to_device("gpu:0")

        ndarray_gpu = cupy.from_dlpack(tensor_gpu)

        ndarray_gpu[0] = ndarray_gpu.mean()

        tensor_cpu = tensor_gpu.to_device("cpu")

        ndarray_cpu = numpy.from_dlpack(tensor_cpu)

        assert ndarray_cpu[0] == ndarray_gpu[0]
        """
        memory_type, memory_type_id = parse_device_or_memory_type(device)
        # Already on the requested device: return self without copying.
        if self.memory_type == memory_type and self.memory_type_id == memory_type_id:
            return self
        # Pinned host memory is directly addressable from the CPU.
        if self.memory_type == MemoryType.CPU_PINNED and memory_type == MemoryType.CPU:
            return self
        if cupy is not None:
            if self.memory_type in (MemoryType.CPU, MemoryType.CPU_PINNED):
                ndarray = numpy.from_dlpack(self)
            else:
                ndarray = cupy.from_dlpack(self)

            if memory_type == MemoryType.CPU:
                return Tensor.from_dlpack(cupy.asnumpy(ndarray))

            if memory_type == MemoryType.GPU:
                with cupy.cuda.Device(memory_type_id):
                    return Tensor.from_dlpack(cupy.asarray(ndarray))

        raise ValueError(
            f"Conversion from {(self.memory_type,self.memory_type_id)} to {(memory_type, memory_type_id)} not supported."
        )

    def _to_numpy_on_host(self) -> numpy.ndarray:
        # Zero-copy view when the data is already host-accessible;
        # otherwise a device-to-host copy through cupy.
        if self.memory_type in (MemoryType.CPU, MemoryType.CPU_PINNED):
            return numpy.from_dlpack(self)
        if cupy is not None:
            return cupy.asnumpy(cupy.from_dlpack(self))
        raise ValueError(
            f"Conversion from {self.memory_type} to numpy array not supported."
        )

    @staticmethod
    def _deserialize_bytes_array(numpy_ndarray: numpy.ndarray) -> numpy.ndarray:
        # Inverse of _serialize_numpy_bytes_array: each element is encoded
        # as a native unsigned 32-bit length followed by that many bytes.
        result = []
        _buffer = memoryview(numpy_ndarray)
        offset = 0
        while offset < len(_buffer):
            (item_length,) = struct.unpack_from("@I", _buffer, offset)
            offset += 4
            result.append(bytes(_buffer[offset : offset + item_length]))
            offset += item_length
        return numpy.array(result, dtype=numpy.object_)

    @staticmethod
    def _serialize_numpy_bytes_array(array: numpy.ndarray) -> numpy.ndarray:
        # Length-prefix encoding in C (row-major) order; non-bytes items
        # are stringified and UTF-8 encoded.
        result = []
        for array_item in numpy.nditer(array, flags=["refs_ok"], order="C"):
            item = array_item.item()  # type: ignore
            if not isinstance(item, bytes):
                item = str(item).encode("utf-8")
            result.append(struct.pack("@I", len(item)))
            result.append(item)
        return numpy.frombuffer(b"".join(result), dtype=numpy.byte)

    @staticmethod
    def _from_list(obj: list[Any]) -> Tensor:
        try:
            return Tensor._from_numpy(numpy.array(obj))
        except Exception as e:
            raise ValueError(f"Conversion from {obj} to tensor not supported.") from e

    @staticmethod
    def _from_numpy(obj: numpy.ndarray | numpy.generic) -> Tensor:
        data_type = NUMPY_TO_DATA_TYPE[obj.dtype.type]
        # Capture the logical shape before any BYTES serialization flattens
        # the backing array.
        shape = obj.shape
        if isinstance(obj, numpy.generic):
            obj = numpy.asarray(obj)
        if data_type == DataType.BYTES:
            obj = Tensor._serialize_numpy_bytes_array(obj)

        memory_buffer = MemoryBuffer(
            data_ptr=obj.ctypes.data,
            memory_type=MemoryType.CPU,
            memory_type_id=0,
            size=obj.itemsize * obj.size,
            owner=obj,
        )

        return Tensor(data_type, shape, memory_buffer)

    def _create_managed_tensor(self) -> DLManagedTensor:
        # Allocates space for a managed tensor object
        # and fills in the fields
        #
        # To ensure the lifetime of the managed tensor we create a
        # context object that includes a newly created shape array and a
        # reference to self
        size = ctypes.c_size_t(ctypes.sizeof(DLManagedTensor))
        address = ctypes.pythonapi.PyMem_RawMalloc(size)
        dl_managed_tensor = DLManagedTensor.from_address(address)
        dl_managed_tensor.dl_tensor.data = self.data_ptr
        dl_managed_tensor.dl_tensor.device = DLDevice(
            MEMORY_TYPE_TO_DLPACK_DEVICE_TYPE[self.memory_type],
            self.memory_type_id,
        )
        dl_managed_tensor.dl_tensor.dtype = DATA_TYPE_TO_DLPACK_DTYPE[self.data_type]
        dl_managed_tensor.dl_tensor.ndim = len(self.shape)
        manager_ctx = _ManagerCtx(self)
        dl_managed_tensor.dl_tensor.shape = manager_ctx.shape
        dl_managed_tensor.dl_tensor.strides = manager_ctx.strides
        dl_managed_tensor.dl_tensor.byte_offset = 0
        dl_managed_tensor.deleter = Tensor._managed_tensor_deleter
        dl_managed_tensor.manager_ctx = manager_ctx.reference()
        return dl_managed_tensor

    @staticmethod
    @ctypes.CFUNCTYPE(None, ctypes.c_void_p)
    def _managed_tensor_deleter(handle: int) -> None:
        # Deleter installed on the DLManagedTensor: drops the keep-alive
        # reference taken in _create_managed_tensor and frees the raw
        # allocation made with PyMem_RawMalloc.
        dl_managed_tensor = DLManagedTensor.from_address(handle)
        _ManagerCtx.release(dl_managed_tensor.manager_ctx)
        ctypes.pythonapi.PyMem_RawFree(handle)

    @staticmethod
    @ctypes.CFUNCTYPE(None, ctypes.c_void_p)
    def _pycapsule_deleter(handle: ctypes.c_void_p) -> None:
        # Capsule destructor: only runs the managed tensor deleter if the
        # capsule was never consumed (consumers rename the capsule, making
        # PyCapsule_IsValid false).
        try:
            pycapsule: ctypes.py_object = ctypes.cast(handle, ctypes.py_object)
            if ctypes.pythonapi.PyCapsule_IsValid(pycapsule, c_str_dltensor):
                dl_managed_tensor = ctypes.pythonapi.PyCapsule_GetPointer(
                    pycapsule, c_str_dltensor
                )
                Tensor._managed_tensor_deleter(dl_managed_tensor)
                ctypes.pythonapi.PyCapsule_SetDestructor(pycapsule, None)
        except Exception as e:
            print(f"Exception occurred while deleting capsule: {e}")
            raise e

    _from_converters: ClassVar[dict[type, Callable[[Any], Tensor]]] = dict(
        {numpy.ndarray: _from_numpy, numpy.generic: _from_numpy, list: _from_list},
    )
class _ManagerCtx:
    """Keep-alive context for a DLManagedTensor exported by Tensor.__dlpack__.

    Owns the ctypes shape/strides arrays referenced by the exported
    DLTensor and holds a reference to the producing Tensor so the
    underlying memory outlives the capsule.
    """

    # To ensure the lifetime of the managed tensor we create a
    # context object that includes a newly created shape array and a
    # reference to self
    def __init__(self, tensor: Tensor) -> None:
        self._tensor = tensor
        # ctypes int64 array (one entry per dimension); referenced by the
        # DLTensor's shape field, so it must live as long as this context.
        self.shape = (ctypes.c_int64 * len(tensor.shape))(*tensor.shape)
        # NULL strides pointer — per the DLPack convention this indicates
        # a compact, row-major layout.
        self.strides = ctypes.POINTER(ctypes.c_int64)()

    def reference(self) -> ctypes.c_void_p:
        """Return a raw pointer to this object with an extra refcount held.

        The pointer is stored in the DLManagedTensor's manager_ctx field;
        the added reference is dropped later via release().
        """
        py_obj = ctypes.py_object(self)
        ctypes.pythonapi.Py_IncRef(py_obj)

        # Note: Could not find a direct way to cast a python object
        # to a c_void_p. The mechanism is to either use id(self) or
        # cast as described here:
        #
        # https://groups.google.com/g/dev-python/c/QRRqVC7gkf4/m/zH7l1gTXBwAJ
        #
        # To avoid relying on the behavior of id() we use the casting mechanism
        return ctypes.POINTER(ctypes.c_void_p)(py_obj)[0]  # type: ignore

    @staticmethod
    def release(reference: ctypes.c_void_p) -> None:
        """Drop the reference previously taken by reference()."""
        py_obj = ctypes.cast(reference, ctypes.py_object)
        ctypes.pythonapi.Py_DecRef(py_obj)
...@@ -25,10 +25,8 @@ from urllib.parse import urlsplit ...@@ -25,10 +25,8 @@ from urllib.parse import urlsplit
import cupy import cupy
import numpy import numpy
import tritonserver
import ucp import ucp
from cupy_backends.cuda.api.runtime import CUDARuntimeError from cupy_backends.cuda.api.runtime import CUDARuntimeError
from tritonserver import InvalidArgumentError, MemoryBuffer, MemoryType, Tensor
from triton_distributed.icp.data_plane import ( from triton_distributed.icp.data_plane import (
DataPlane, DataPlane,
...@@ -48,7 +46,11 @@ from triton_distributed.icp.data_plane import ( ...@@ -48,7 +46,11 @@ from triton_distributed.icp.data_plane import (
set_icp_tensor_size, set_icp_tensor_size,
set_icp_tensor_uri, set_icp_tensor_uri,
) )
from triton_distributed.icp.data_type import DataType
from triton_distributed.icp.memory_buffer import MemoryBuffer
from triton_distributed.icp.memory_type import MemoryType
from triton_distributed.icp.protos.icp_pb2 import ModelInferRequest, ModelInferResponse from triton_distributed.icp.protos.icp_pb2 import ModelInferRequest, ModelInferResponse
from triton_distributed.icp.tensor import Tensor
LOGGER = logging.getLogger(__name__) LOGGER = logging.getLogger(__name__)
...@@ -175,20 +177,18 @@ class _UcpDataPlane(DataPlane): ...@@ -175,20 +177,18 @@ class _UcpDataPlane(DataPlane):
if tensor_id in self._tensor_store: if tensor_id in self._tensor_store:
tensor = self._tensor_store[tensor_id] tensor = self._tensor_store[tensor_id]
array_module = numpy array_module = numpy
if tensor.memory_type == tritonserver.MemoryType.CPU: if tensor.memory_type == MemoryType.CPU:
array_module = numpy array_module = numpy
device_manager = contextlib.nullcontext() device_manager = contextlib.nullcontext()
elif tensor.memory_type == tritonserver.MemoryType.GPU: elif tensor.memory_type == MemoryType.GPU:
array_module = cupy array_module = cupy
device_manager = cupy.cuda.Device( device_manager = cupy.cuda.Device(
tensor.memory_buffer.memory_type_id tensor.memory_buffer.memory_type_id
) )
else: else:
raise InvalidArgumentError( raise ValueError(f"Invalid Memory Type {tensor.memory_type}")
f"Invalid Memory Type {tensor.memory_type}"
)
with device_manager: with device_manager:
if tensor.data_type == tritonserver.DataType.BYTES: if tensor.data_type == DataType.BYTES:
array = tensor.memory_buffer.owner array = tensor.memory_buffer.owner
else: else:
array = array_module.from_dlpack(tensor) array = array_module.from_dlpack(tensor)
...@@ -343,7 +343,7 @@ class _UcpDataPlane(DataPlane): ...@@ -343,7 +343,7 @@ class _UcpDataPlane(DataPlane):
if requested_memory_type is not None: if requested_memory_type is not None:
memory_type = requested_memory_type memory_type = requested_memory_type
if memory_type == tritonserver.MemoryType.GPU and self._cuda_is_available: if memory_type == MemoryType.GPU and self._cuda_is_available:
array_module = cupy array_module = cupy
if requested_memory_type_id is not None: if requested_memory_type_id is not None:
device_manager = cupy.cuda.Device(requested_memory_type_id) device_manager = cupy.cuda.Device(requested_memory_type_id)
......
...@@ -24,10 +24,11 @@ import numpy ...@@ -24,10 +24,11 @@ import numpy
import pytest import pytest
import ucp import ucp
from cupy_backends.cuda.api.runtime import CUDARuntimeError from cupy_backends.cuda.api.runtime import CUDARuntimeError
from tritonserver import DataType, MemoryType, Tensor
from tritonserver._api._datautils import TRITON_TO_NUMPY_DTYPE
from triton_distributed.icp.data_plane import DataPlaneError from triton_distributed.icp.data_plane import DataPlaneError
from triton_distributed.icp.data_type import DATA_TYPE_TO_NUMPY_DTYPE, DataType
from triton_distributed.icp.memory_type import MemoryType
from triton_distributed.icp.tensor import Tensor
from triton_distributed.icp.ucp_data_plane import ( from triton_distributed.icp.ucp_data_plane import (
UcpDataPlane, UcpDataPlane,
get_icp_tensor_uri, get_icp_tensor_uri,
...@@ -283,7 +284,7 @@ def test_requested_memory_type(memory_type, memory_type_id, request): ...@@ -283,7 +284,7 @@ def test_requested_memory_type(memory_type, memory_type_id, request):
def _get_random_tensor(data_type: DataType, size: Sequence[int]): def _get_random_tensor(data_type: DataType, size: Sequence[int]):
dtype = TRITON_TO_NUMPY_DTYPE[data_type] dtype = DATA_TYPE_TO_NUMPY_DTYPE[data_type]
value = numpy.random.rand(*size) value = numpy.random.rand(*size)
return value.astype(dtype) return value.astype(dtype)
......
# Packaging configuration for the triton_distributed.runtime Python package.
[build-system]
# setuptools-scm derives the package version from SCM metadata at build time.
requires = ["setuptools>=65.0", "setuptools-scm>=8"]
build-backend = "setuptools.build_meta"

[project]
name = "triton_distributed.runtime"
# Version is dynamic: supplied by setuptools-scm (or the
# SETUPTOOLS_SCM_PRETEND_VERSION_* env var) instead of being hard-coded here.
dynamic = ["version"]
# Depends on the companion ICP package; any published version is accepted.
dependencies = ["triton_distributed.icp >= 0"]

[tool.setuptools_scm]
# Version module generated at build time.
version_file = "src/triton_distributed/runtime/_version.py"
# SCM root is two directories above this pyproject (the repository root).
root = "../.."

[tool.setuptools.packages.find]
where = ["src"]
include = ["triton_distributed.runtime*"]
# triton_distributed is a namespace package shared with triton_distributed.icp.
namespaces= true
...@@ -24,8 +24,13 @@ from triton_distributed.runtime.remote_request import ( ...@@ -24,8 +24,13 @@ from triton_distributed.runtime.remote_request import (
from triton_distributed.runtime.remote_response import ( from triton_distributed.runtime.remote_response import (
RemoteInferenceResponse as RemoteInferenceResponse, RemoteInferenceResponse as RemoteInferenceResponse,
) )
from triton_distributed.runtime.triton_core_operator import (
TritonCoreOperator as TritonCoreOperator, try:
) from triton_distributed.runtime.triton_core_operator import (
TritonCoreOperator as TritonCoreOperator,
)
except ImportError:
pass
from triton_distributed.runtime.worker import Worker as Worker from triton_distributed.runtime.worker import Worker as Worker
from triton_distributed.runtime.worker import WorkerConfig as WorkerConfig from triton_distributed.runtime.worker import WorkerConfig as WorkerConfig
...@@ -16,8 +16,6 @@ import multiprocessing ...@@ -16,8 +16,6 @@ import multiprocessing
from pprint import pformat from pprint import pformat
from typing import Optional, Type from typing import Optional, Type
from tritonserver import InvalidArgumentError
from triton_distributed.icp import ( from triton_distributed.icp import (
DataPlane, DataPlane,
NatsRequestPlane, NatsRequestPlane,
...@@ -71,7 +69,7 @@ class Deployment: ...@@ -71,7 +69,7 @@ class Deployment:
if self._default_request_plane == NatsRequestPlane: if self._default_request_plane == NatsRequestPlane:
self.request_plane_server = NatsServer(log_dir=self._default_log_dir) self.request_plane_server = NatsServer(log_dir=self._default_log_dir)
else: else:
raise InvalidArgumentError( raise ValueError(
f"Unknown Request Plane Type, can not initialize {self._default_request_plane}" f"Unknown Request Plane Type, can not initialize {self._default_request_plane}"
) )
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment