Commit ccfe101e authored by Neelay Shah's avatar Neelay Shah Committed by GitHub
Browse files

build: pip based installation of icp and runtime. Also make tritonserver optional


Signed-off-by: default avatarNeelay Shah <neelays@nvidia.com>
Co-authored-by: default avatarRyan McCormick <rmccormick@nvidia.com>
parent 5701753a
...@@ -154,8 +154,20 @@ COPY . /workspace ...@@ -154,8 +154,20 @@ COPY . /workspace
RUN cd runtime/rust && cargo build --release --locked && cargo doc --no-deps RUN cd runtime/rust && cargo build --release --locked && cargo doc --no-deps
RUN /workspace/icp/protos/gen_python.sh RUN /workspace/icp/protos/gen_python.sh
# Install python packages
ARG PYTHON_PACKAGE_VERSION=0.0.1.dev+unknown
# SETUPTOOLS_SCM_PRETEND_VERSION allows dynamically setting the package versions during build/install.
# This allows having versioned packages during development between releases, such as commit IDs.
#
# Normally SCM version is taken directly from .git but this is not available in the Dockerfile
# and so we pass in via a buildarg
RUN SETUPTOOLS_SCM_PRETEND_VERSION_FOR_TRITON_DISTRIBUTED_ICP=${PYTHON_PACKAGE_VERSION} pip install -e /workspace/icp/python
RUN SETUPTOOLS_SCM_PRETEND_VERSION_FOR_TRITON_DISTRIBUTED_RUNTIME=${PYTHON_PACKAGE_VERSION} pip install -e /workspace/runtime/python
# Sets pythonpath for python modules # Sets pythonpath for python modules
ENV PYTHONPATH="${PYTHONPATH}:/workspace/icp/python/src:/workspace/runtime/python/src:/workspace/examples/python:/opt/tritonserver/python/openai/openai_frontend" ENV PYTHONPATH="${PYTHONPATH}:/workspace/examples/python:/opt/tritonserver/python/openai/openai_frontend"
# Enable system UCX # Enable system UCX
ENV RAPIDS_LIBUCX_PREFER_SYSTEM_LIBRARY=true ENV RAPIDS_LIBUCX_PREFER_SYSTEM_LIBRARY=true
......
...@@ -22,12 +22,16 @@ PLATFORM=linux/amd64 ...@@ -22,12 +22,16 @@ PLATFORM=linux/amd64
# Get short commit hash # Get short commit hash
commit_id=$(git rev-parse --short HEAD) commit_id=$(git rev-parse --short HEAD)
# Attempt to get current tag # if COMMIT_ID matches a TAG use that
current_tag=$(git describe --tags --exact-match 2>/dev/null) || true current_tag=$(git describe --tags --exact-match 2>/dev/null | sed 's/^v//') || true
# Use tag if available, otherwise use commit hash # Get latest TAG and add COMMIT_ID for dev
VERSION=${current_tag:-$commit_id} latest_tag=$(git describe --tags --abbrev=0 $(git rev-list --tags --max-count=1 main) | sed 's/^v//') || true
# Use tag if available, otherwise use latest_tag.dev.commit_id
VERSION=v${current_tag:-$latest_tag.dev.$commit_id}
PYTHON_PACKAGE_VERSION=${current_tag:-$latest_tag.dev+$commit_id}
# Frameworks # Frameworks
# #
...@@ -244,7 +248,7 @@ get_options "$@" ...@@ -244,7 +248,7 @@ get_options "$@"
# BUILD DEV IMAGE # BUILD DEV IMAGE
BUILD_ARGS+=" --build-arg BASE_IMAGE=$BASE_IMAGE --build-arg BASE_IMAGE_TAG=$BASE_IMAGE_TAG --build-arg FRAMEWORK=$FRAMEWORK --build-arg ${FRAMEWORK}_FRAMEWORK=1" BUILD_ARGS+=" --build-arg BASE_IMAGE=$BASE_IMAGE --build-arg BASE_IMAGE_TAG=$BASE_IMAGE_TAG --build-arg FRAMEWORK=$FRAMEWORK --build-arg ${FRAMEWORK}_FRAMEWORK=1 --build-arg VERSION=$VERSION --build-arg PYTHON_PACKAGE_VERSION=$PYTHON_PACKAGE_VERSION"
if [ ! -z ${GITHUB_TOKEN} ]; then if [ ! -z ${GITHUB_TOKEN} ]; then
BUILD_ARGS+=" --build-arg GITHUB_TOKEN=${GITHUB_TOKEN} " BUILD_ARGS+=" --build-arg GITHUB_TOKEN=${GITHUB_TOKEN} "
......
...@@ -8,12 +8,12 @@ class EncodeDecodeOperator(Operator): ...@@ -8,12 +8,12 @@ class EncodeDecodeOperator(Operator):
self, self,
name, name,
version, version,
triton_core,
request_plane, request_plane,
data_plane, data_plane,
parameters, parameters,
repository, repository,
logger, logger,
triton_core,
): ):
self._encoder = RemoteOperator("encoder", request_plane, data_plane) self._encoder = RemoteOperator("encoder", request_plane, data_plane)
self._decoder = RemoteOperator("decoder", request_plane, data_plane) self._decoder = RemoteOperator("decoder", request_plane, data_plane)
......
...@@ -41,12 +41,12 @@ class EncodeDecodeOperator(Operator): ...@@ -41,12 +41,12 @@ class EncodeDecodeOperator(Operator):
self, self,
name, name,
version, version,
triton_core,
request_plane, request_plane,
data_plane, data_plane,
parameters, parameters,
repository, repository,
logger, logger,
triton_core,
): ):
self._encoder = RemoteOperator("encoder", request_plane, data_plane) self._encoder = RemoteOperator("encoder", request_plane, data_plane)
self._decoder = RemoteOperator("decoder", request_plane, data_plane) self._decoder = RemoteOperator("decoder", request_plane, data_plane)
......
...@@ -2,7 +2,7 @@ import argparse ...@@ -2,7 +2,7 @@ import argparse
import json import json
import logging import logging
from dataclasses import field from dataclasses import field
from typing import AsyncGenerator, List, Optional from typing import Any, AsyncGenerator, List, Optional
import numpy as np import numpy as np
...@@ -23,7 +23,6 @@ class VllmOperator(Operator): ...@@ -23,7 +23,6 @@ class VllmOperator(Operator):
self, self,
name: str, name: str,
version: int, version: int,
triton_core,
request_plane: RequestPlane, request_plane: RequestPlane,
data_plane: DataPlane, data_plane: DataPlane,
parameters: Optional[dict[str, str | int | bool | bytes]] = field( parameters: Optional[dict[str, str | int | bool | bytes]] = field(
...@@ -31,6 +30,7 @@ class VllmOperator(Operator): ...@@ -31,6 +30,7 @@ class VllmOperator(Operator):
), ),
repository: Optional[str] = None, repository: Optional[str] = None,
logger: Optional[logging.Logger] = None, logger: Optional[logging.Logger] = None,
triton_core: Optional[Any] = None,
): ):
self.name = name self.name = name
self.version = version self.version = version
......
[build-system]
requires = ["setuptools>=65.0", "setuptools-scm>=8"]
build-backend = "setuptools.build_meta"
[project]
name = "triton_distributed.icp"
dynamic = ["version"]
[tool.setuptools_scm]
version_file = "src/triton_distributed/icp/_version.py"
root = "../.."
[tool.setuptools.packages.find]
where = ["src"]
include = ["triton_distributed.icp*"]
namespaces = true
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Type
class CustomKeyErrorDict(dict):
    """Dictionary that raises a descriptive, configurable exception on a miss.

    Used for conversion tables (e.g. framework <-> wire types) so that a
    failed lookup reports which conversion was attempted instead of a bare
    ``KeyError``.
    """

    def __init__(
        self,
        from_name: str,
        to_name: str,
        *args,
        exception: Type[Exception] = ValueError,
        **kwargs,
    ):
        """Create the dict.

        Parameters
        ----------
        from_name : str
            Human-readable name of the key domain (appears in the error).
        to_name : str
            Human-readable name of the value domain (appears in the error).
        exception : Type[Exception]
            Exception class raised on a missing key (default ``ValueError``).
        *args, **kwargs
            Forwarded unchanged to ``dict``.
        """
        super().__init__(*args, **kwargs)
        self._from_name = from_name
        self._to_name = to_name
        self._exception = exception

    def __getitem__(self, key):
        # Replace the ordinary KeyError with the configured exception type,
        # carrying a message that names the failed conversion.
        if key in self:
            return super().__getitem__(key)
        raise self._exception(
            f"Unsupported {self._from_name}. Can't convert {key} to {self._to_name}"
        ) from None
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
################################################################################
# This file contains the DLPack API wrapped in Python style (see
# 'dlpack.h' for detail) and the utilities for Triton client to interact
# with DLPack
#
# Ref:
# https://github.com/dmlc/dlpack/blob/main/include/dlpack/dlpack.h
# https://github.com/dmlc/dlpack/blob/main/apps/numpy_dlpack/dlpack/from_numpy.py
################################################################################
import ctypes
from typing import Union
from triton_distributed.icp._custom_key_error_dict import CustomKeyErrorDict
from triton_distributed.icp.data_type import DataType
from triton_distributed.icp.memory_type import MemoryType, string_to_memory_type
# cupy is optional: it is only needed for CUDA-side synchronization when
# importing GPU tensors (see DLPackObject.__init__).
try:
    import cupy
except ImportError:
    cupy = None
# Explicitly set the result/argument types of the pythonapi functions used
# below; ctypes defaults them to c_int, which truncates pointers on 64-bit
# platforms.
ctypes.pythonapi.PyMem_RawMalloc.restype = ctypes.c_void_p
ctypes.pythonapi.PyMem_RawFree.argtypes = [ctypes.c_void_p]
ctypes.pythonapi.PyCapsule_New.restype = ctypes.py_object
ctypes.pythonapi.PyCapsule_New.argtypes = [
    ctypes.c_void_p,
    ctypes.c_char_p,
    ctypes.c_void_p,
]
ctypes.pythonapi.PyCapsule_GetPointer.restype = ctypes.c_void_p
ctypes.pythonapi.PyCapsule_GetPointer.argtypes = [ctypes.py_object, ctypes.c_char_p]
# Capsule name used by the DLPack protocol for exported tensors.
c_str_dltensor = b"dltensor"


class DLDeviceType(ctypes.c_int):
    """DLPack device type codes (mirrors the DLDeviceType enum in dlpack.h).

    Values are class attributes so they can be compared directly against the
    integer device type returned by ``__dlpack_device__``.
    """

    kDLCPU = 1
    kDLCUDA = 2
    kDLCUDAHost = 3
    kDLOpenCL = 4
    kDLVulkan = 7
    kDLMetal = 8
    kDLVPI = 9
    kDLROCM = 10
    kDLROCMHost = 11
    kDLExtDev = 12
    kDLCUDAManaged = 13
    kDLOneAPI = 14
    kDLWebGPU = 15
    kDLHexagon = 16
# Accepted ways to specify a memory location: a (MemoryType, id) tuple, a
# bare MemoryType, a (DLDeviceType, id) tuple, or a string such as "gpu:0"
# (normalized by parse_device_or_memory_type below).
DeviceOrMemoryType = Union[
    tuple[MemoryType, int], MemoryType, tuple[DLDeviceType, int], str
]
class DLDevice(ctypes.Structure):
    """ctypes mirror of the DLDevice struct in dlpack.h (type + device id)."""

    _fields_ = [
        ("device_type", ctypes.c_int),
        ("device_id", ctypes.c_int),
    ]
class DLDataTypeCode(ctypes.c_uint8):
    """DLPack data type-code values (mirrors DLDataTypeCode in dlpack.h)."""

    kDLInt = 0
    kDLUInt = 1
    kDLFloat = 2
    kDLOpaquePointer = 3
    kDLBfloat = 4
    kDLComplex = 5
    kDLBool = 6
class DLDataType(ctypes.Structure):
    """ctypes mirror of the DLDataType struct (code, bit width, vector lanes)."""

    _fields_ = [
        ("type_code", ctypes.c_uint8),
        ("bits", ctypes.c_uint8),
        ("lanes", ctypes.c_uint16),
    ]
class DLTensor(ctypes.Structure):
    """ctypes mirror of the DLTensor struct in dlpack.h.

    ``strides`` may be NULL, which by DLPack convention means compact
    row-major layout (see is_contiguous_data).
    """

    _fields_ = [
        ("data", ctypes.c_void_p),
        ("device", DLDevice),
        ("ndim", ctypes.c_int),
        ("dtype", DLDataType),
        ("shape", ctypes.POINTER(ctypes.c_int64)),
        ("strides", ctypes.POINTER(ctypes.c_int64)),
        ("byte_offset", ctypes.c_uint64),
    ]
class DLManagedTensor(ctypes.Structure):
    """ctypes mirror of DLManagedTensor: a DLTensor plus ownership hooks.

    ``deleter`` is invoked (with this struct's address) by the consumer when
    it is done with the tensor; ``manager_ctx`` is an opaque producer handle.
    """

    _fields_ = [
        ("dl_tensor", DLTensor),
        ("manager_ctx", ctypes.c_void_p),
        ("deleter", ctypes.CFUNCTYPE(None, ctypes.c_void_p)),
    ]
# Utilities
def _raise_error(msg):
"""
Raise error with the provided message
"""
raise Exception(msg) from None
# Use as managed context in DLPack that doesn't hold ownership of the
# data content.
class DataViewContext:
    """Manager context for exported DLPack tensors that only *view* data.

    Keeps the shape/strides ctypes arrays alive for as long as the exported
    DLManagedTensor references them; it does not own the data buffer itself.
    """

    def __init__(self, shape) -> None:
        # Convert the Python shape sequence to the ctypes objects expected by
        # DLPack.
        self._shape = (ctypes.c_int64 * len(shape))(*shape)
        # NULL strides pointer: DLPack interprets this as compact, row-major.
        self._strides = ctypes.POINTER(ctypes.c_int64)()

    def as_manager_ctx(self) -> ctypes.c_void_p:
        # Take strong references on this object AND on the pointer cell that
        # wraps it, so both survive until managed_tensor_deleter decrefs them.
        py_obj = ctypes.py_object(self)
        py_obj_ptr = ctypes.pointer(py_obj)
        ctypes.pythonapi.Py_IncRef(py_obj)
        ctypes.pythonapi.Py_IncRef(ctypes.py_object(py_obj_ptr))
        return ctypes.cast(py_obj_ptr, ctypes.c_void_p)
@ctypes.CFUNCTYPE(None, ctypes.c_void_p)
def managed_tensor_deleter(handle: ctypes.c_void_p) -> None:
    # DLManagedTensor.deleter callback: undoes the two Py_IncRef calls made
    # in DataViewContext.as_manager_ctx, then frees the raw allocation that
    # holds the DLManagedTensor struct itself.
    dl_managed_tensor = DLManagedTensor.from_address(handle)  # type: ignore
    py_obj_ptr = ctypes.cast(
        dl_managed_tensor.manager_ctx, ctypes.POINTER(ctypes.py_object)
    )
    py_obj = py_obj_ptr.contents
    ctypes.pythonapi.Py_DecRef(py_obj)
    ctypes.pythonapi.Py_DecRef(ctypes.py_object(py_obj_ptr))
    ctypes.pythonapi.PyMem_RawFree(handle)
@ctypes.CFUNCTYPE(None, ctypes.c_void_p)
def pycapsule_deleter(handle: ctypes.c_void_p) -> None:
    # Capsule destructor: if the capsule is still valid (i.e. no consumer
    # took ownership and renamed it), release the managed tensor ourselves
    # and clear the destructor to avoid a second invocation.
    pycapsule: ctypes.py_object = ctypes.cast(handle, ctypes.py_object)
    if ctypes.pythonapi.PyCapsule_IsValid(pycapsule, c_str_dltensor):
        dl_managed_tensor = ctypes.pythonapi.PyCapsule_GetPointer(
            pycapsule, c_str_dltensor
        )
        managed_tensor_deleter(dl_managed_tensor)
        ctypes.pythonapi.PyCapsule_SetDestructor(pycapsule, None)
def is_contiguous_data(
    ndim: ctypes.c_int,
    shape: ctypes.POINTER(ctypes.c_int64),  # type: ignore
    stride: ctypes.POINTER(ctypes.c_int64),  # type: ignore
):
    """Return True if (shape, stride) describe compact row-major memory.

    A NULL/absent stride pointer is treated as contiguous, per the DLPack
    convention.
    """
    if stride is None or not bool(stride):
        return True
    # Walk dimensions innermost-first, accumulating the stride a compact
    # row-major layout would require, and compare dimension by dimension.
    expected = 1
    for axis in reversed(range(ndim)):  # type: ignore
        if stride[axis] != expected:
            return False
        expected *= shape[axis]
    return True
def get_byte_size(
    dtype: DLDataType, ndim: ctypes.c_int, shape: ctypes.POINTER(ctypes.c_int64)  # type: ignore
):
    """Return the tensor's total size in bytes.

    Computed as element byte size (bits * lanes / 8 — assumes 8-bit bytes)
    multiplied by the product of all dimensions.
    """
    total = dtype.bits * dtype.lanes // 8
    for axis in range(ndim):  # type: ignore
        total *= shape[axis]
    return total
def get_dlpack_capsule(dlpack_obj, stream=None):
    """Extract the DLPack PyCapsule from *dlpack_obj*.

    Objects implementing the modern protocol must define both ``__dlpack__``
    and ``__dlpack_device__``; anything without ``__dlpack__`` is assumed to
    already be a PyCapsule and is returned unchanged.
    """
    if not hasattr(dlpack_obj, "__dlpack__"):
        # Old interface where PyCapsule object is passed directly
        return dlpack_obj
    if not hasattr(dlpack_obj, "__dlpack_device__"):
        _raise_error(
            "DLPack expects '__dlpack_device__' if '__dlpack__' has been defined"
        )
    device = dlpack_obj.__dlpack_device__()
    # BUG FIX: __dlpack_device__ returns a (device_type, device_id) tuple, so
    # the previous `device != DLDeviceType.kDLCUDA` compared a tuple to an int
    # and was always True — the stream was never forwarded for CUDA tensors.
    # Compare the device-type field instead. Some implementations (e.g. numpy)
    # don't accept a 'stream' argument, hence the conditional.
    if device[0] != DLDeviceType.kDLCUDA:
        return dlpack_obj.__dlpack__()
    return dlpack_obj.__dlpack__(stream)
def get_dlpack_device(dlpack_obj):
    """Return the (device_type, device_id) pair reported by *dlpack_obj*,
    or None when the object does not implement ``__dlpack_device__``."""
    getter = getattr(dlpack_obj, "__dlpack_device__", None)
    return None if getter is None else getter()
def get_managed_tensor(dlcapsule):
    """Dereference a 'dltensor' PyCapsule into its DLManagedTensor struct.

    The returned struct aliases memory owned by the capsule's producer; keep
    the capsule alive while using it.
    """
    ptr = ctypes.pythonapi.PyCapsule_GetPointer(dlcapsule, c_str_dltensor)
    return DLManagedTensor.from_address(ptr)
class DLPackObject:
    """Imports any DLPack-capable object and exposes its tensor metadata
    (shape, dtype, device, byte size, contiguity) via properties.

    The wrapped DLManagedTensor is borrowed from the producer's capsule;
    this object keeps the capsule alive but does not own the data.
    """

    def __init__(self, value) -> None:
        try:
            stream = None
            device, device_id = value.__dlpack_device__()
            if device == DLDeviceType.kDLCUDA:
                # CUDA producers need a stream passed to __dlpack__ for
                # synchronization; selecting the device requires cupy.
                if cupy is None:
                    raise ValueError(
                        f"DLPack synchronization on device {device,device_id} not supported"
                    )
                with cupy.cuda.Device(device_id):
                    stream = 1  # legacy default stream
                self._capsule = get_dlpack_capsule(value, stream)
                self._tensor = get_managed_tensor(self._capsule).dl_tensor
            else:
                self._capsule = get_dlpack_capsule(value)
                self._tensor = get_managed_tensor(self._capsule).dl_tensor
        except Exception as e:
            # Any failure (missing protocol, bad capsule, ...) is surfaced as
            # a single ValueError with the original cause in the message.
            raise ValueError(f"Object does not support DLPack protocol: {e}") from None

    def __eq__(self, other) -> bool:
        # Two DLPackObjects are equal when all exposed metadata AND the data
        # pointer match (i.e. they describe the same view of the same memory).
        if not isinstance(other, DLPackObject):
            return False
        if self.byte_size != other.byte_size:
            return False
        if self.memory_type != other.memory_type:
            return False
        if self.memory_type_id != other.memory_type_id:
            return False
        if self.shape != other.shape:
            return False
        if self.data_ptr != other.data_ptr:
            return False
        if self.contiguous != other.contiguous:
            return False
        if self.data_type != other.data_type:
            return False
        return True

    @property
    def byte_size(self) -> int:
        """Total size of the tensor data in bytes."""
        return get_byte_size(self._tensor.dtype, self._tensor.ndim, self._tensor.shape)

    @property
    def memory_type(self) -> MemoryType:
        """MemoryType derived from the DLPack device type (raises on
        unsupported device types via CustomKeyErrorDict)."""
        return DLPACK_DEVICE_TYPE_TO_MEMORY_TYPE[self._tensor.device.device_type]

    @property
    def memory_type_id(self) -> int:
        """Device id (e.g. GPU ordinal) of the tensor's device."""
        return self._tensor.device.device_id

    @property
    def shape(self) -> list[int]:
        """Tensor dimensions as a plain Python list."""
        return [self._tensor.shape[i] for i in range(self._tensor.ndim)]

    @property
    def data_type(self) -> DataType:
        """DataType derived from the DLPack (type_code, bits) pair."""
        return DLPACK_TO_DATA_TYPE[self.dlpack_data_type]

    @property
    def dlpack_data_type(self) -> tuple[DLDataTypeCode, int]:
        """Raw DLPack (type_code, bits) pair."""
        return (self._tensor.dtype.type_code, self._tensor.dtype.bits)

    @property
    def data_ptr(self) -> ctypes.c_void_p:
        # Note: byte_offset is folded in, so this points at the first element.
        return self._tensor.data + self._tensor.byte_offset

    @property
    def contiguous(self) -> bool:
        """True when the tensor is compact row-major (or strides are NULL)."""
        return is_contiguous_data(
            self._tensor.ndim, self._tensor.shape, self._tensor.strides
        )
# Conversion tables between DLPack device types and MemoryType. Lookups of
# unsupported values raise ValueError (via CustomKeyErrorDict) with a message
# naming the failed conversion.
DLPACK_DEVICE_TYPE_TO_MEMORY_TYPE: dict[DLDeviceType, MemoryType] = CustomKeyErrorDict(
    "DLPack device type",
    "Memory type",
    {
        DLDeviceType.kDLCUDA: MemoryType.GPU,
        DLDeviceType.kDLCPU: MemoryType.CPU,
    },
)

# Inverse mapping. NOTE: CPU_PINNED is exported as plain kDLCPU here (not as
# a separate pinned-host device type).
MEMORY_TYPE_TO_DLPACK_DEVICE_TYPE: dict[MemoryType, DLDeviceType] = CustomKeyErrorDict(
    "Memory type",
    "DLPack device type",
    {
        **{value: key for key, value in DLPACK_DEVICE_TYPE_TO_MEMORY_TYPE.items()},
        **{MemoryType.CPU_PINNED: DLDeviceType.kDLCPU},
    },
)
def parse_device_or_memory_type(
    device_or_memory_type: DeviceOrMemoryType,
) -> tuple[MemoryType, int]:
    """Normalize a device/memory specification into (MemoryType, id).

    Accepts a (MemoryType, id) tuple, a (DLDeviceType, id) tuple, a bare
    MemoryType (id defaults to 0), or a string "type" / "type:id"
    (e.g. "gpu:1"; the type part is upper-cased before lookup).

    Raises
    ------
    ValueError
        If the value is malformed or of an unsupported type.
    """
    memory_type_id = 0
    if isinstance(device_or_memory_type, tuple):
        if isinstance(device_or_memory_type[0], MemoryType):
            memory_type = device_or_memory_type[0]
            memory_type_id = device_or_memory_type[1]
        elif isinstance(device_or_memory_type[0], DLDeviceType):
            memory_type = DLPACK_DEVICE_TYPE_TO_MEMORY_TYPE[device_or_memory_type[0]]
            memory_type_id = device_or_memory_type[1]
        else:
            raise ValueError(f"Invalid memory type {device_or_memory_type}")
    elif isinstance(device_or_memory_type, MemoryType):
        memory_type = device_or_memory_type
    elif isinstance(device_or_memory_type, str):
        memory_str_tuple = device_or_memory_type.split(":")
        if len(memory_str_tuple) > 2:
            raise ValueError(f"Invalid memory type string {device_or_memory_type}")
        memory_type = string_to_memory_type(memory_str_tuple[0].upper())
        if len(memory_str_tuple) == 2:
            try:
                memory_type_id = int(memory_str_tuple[1])
            except ValueError:
                raise ValueError(
                    f"Invalid memory type string {device_or_memory_type}"
                ) from None
    else:
        # BUG FIX: previously fell through and returned (None, 0), violating
        # the declared return type; fail loudly on unsupported inputs instead.
        raise ValueError(f"Invalid memory type {device_or_memory_type}")
    return (memory_type, memory_type_id)
# DLPack (type_code, bits) pair -> DataType. Lookups of unsupported pairs
# raise ValueError (via CustomKeyErrorDict). Note: no BYTES entry — DLPack
# has no variable-length string type.
DLPACK_TO_DATA_TYPE: dict[tuple[DLDataTypeCode, int], DataType] = CustomKeyErrorDict(
    "DLPack data type",
    "Data type",
    {
        (DLDataTypeCode.kDLBool, 8): DataType.BOOL,
        (DLDataTypeCode.kDLInt, 8): DataType.INT8,
        (
            DLDataTypeCode.kDLInt,
            16,
        ): DataType.INT16,
        (
            DLDataTypeCode.kDLInt,
            32,
        ): DataType.INT32,
        (
            DLDataTypeCode.kDLInt,
            64,
        ): DataType.INT64,
        (
            DLDataTypeCode.kDLUInt,
            8,
        ): DataType.UINT8,
        (
            DLDataTypeCode.kDLUInt,
            16,
        ): DataType.UINT16,
        (
            DLDataTypeCode.kDLUInt,
            32,
        ): DataType.UINT32,
        (
            DLDataTypeCode.kDLUInt,
            64,
        ): DataType.UINT64,
        (
            DLDataTypeCode.kDLFloat,
            16,
        ): DataType.FP16,
        (
            DLDataTypeCode.kDLFloat,
            32,
        ): DataType.FP32,
        (
            DLDataTypeCode.kDLFloat,
            64,
        ): DataType.FP64,
        (
            DLDataTypeCode.kDLBfloat,
            16,
        ): DataType.BF16,
    },
)

# Inverse mapping producing full DLDataType structs (lanes fixed to 1).
DATA_TYPE_TO_DLPACK_DTYPE: dict[DataType, DLDataType] = CustomKeyErrorDict(
    "Data type",
    "DLPack data type",
    {
        value: DLDataType(type_code=key[0], bits=key[1], lanes=1)
        for key, value in DLPACK_TO_DATA_TYPE.items()
    },
)
...@@ -13,7 +13,7 @@ ...@@ -13,7 +13,7 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
"""Abstract Class for interacting with Triton Inference Serving Platform Inter-Component Protocol Data Plane""" """Abstract Class for interacting with Triton Distributed Inter-Component Protocol Data Plane"""
import abc import abc
import uuid import uuid
...@@ -21,28 +21,16 @@ from typing import Optional, Sequence ...@@ -21,28 +21,16 @@ from typing import Optional, Sequence
import cupy import cupy
import numpy import numpy
from tritonserver import (
from triton_distributed.icp.data_type import (
DATA_TYPE_TO_NUMPY_DTYPE,
DataType, DataType,
InvalidArgumentError, string_to_data_type,
MemoryBuffer,
MemoryType,
Tensor,
)
from tritonserver._api._datautils import (
STRING_TO_TRITON_MEMORY_TYPE,
TRITON_TO_NUMPY_DTYPE,
)
from tritonserver._c.triton_bindings import (
TRITONSERVER_DataTypeString as DataTypeString,
)
from tritonserver._c.triton_bindings import (
TRITONSERVER_MemoryTypeString as MemoryTypeString,
) )
from tritonserver._c.triton_bindings import ( from triton_distributed.icp.memory_buffer import MemoryBuffer
TRITONSERVER_StringToDataType as StringToDataType, from triton_distributed.icp.memory_type import MemoryType, string_to_memory_type
)
from triton_distributed.icp.protos.icp_pb2 import ModelInferRequest, ModelInferResponse from triton_distributed.icp.protos.icp_pb2 import ModelInferRequest, ModelInferResponse
from triton_distributed.icp.tensor import Tensor
class DataPlaneError(Exception): class DataPlaneError(Exception):
...@@ -73,13 +61,13 @@ def set_icp_data_type( ...@@ -73,13 +61,13 @@ def set_icp_data_type(
message: ModelInferRequest.InferInputTensor | ModelInferResponse.InferOutputTensor, message: ModelInferRequest.InferInputTensor | ModelInferResponse.InferOutputTensor,
value: DataType, value: DataType,
) -> None: ) -> None:
message.datatype = DataTypeString(value) message.datatype = value.name
def get_icp_data_type( def get_icp_data_type(
message: ModelInferRequest.InferInputTensor | ModelInferResponse.InferOutputTensor, message: ModelInferRequest.InferInputTensor | ModelInferResponse.InferOutputTensor,
) -> DataType: ) -> DataType:
return StringToDataType(message.datatype) return string_to_data_type(message.datatype)
def set_icp_tensor_uri( def set_icp_tensor_uri(
...@@ -116,7 +104,7 @@ def set_icp_memory_type( ...@@ -116,7 +104,7 @@ def set_icp_memory_type(
message: ModelInferRequest.InferInputTensor | ModelInferResponse.InferOutputTensor, message: ModelInferRequest.InferInputTensor | ModelInferResponse.InferOutputTensor,
value: MemoryType, value: MemoryType,
) -> None: ) -> None:
message.parameters[ICP_MEMORY_TYPE].string_param = MemoryTypeString(value) message.parameters[ICP_MEMORY_TYPE].string_param = value.name
def get_icp_memory_type( def get_icp_memory_type(
...@@ -124,9 +112,7 @@ def get_icp_memory_type( ...@@ -124,9 +112,7 @@ def get_icp_memory_type(
) -> MemoryType | None: ) -> MemoryType | None:
if ICP_MEMORY_TYPE not in message.parameters: if ICP_MEMORY_TYPE not in message.parameters:
return None return None
return STRING_TO_TRITON_MEMORY_TYPE[ return string_to_memory_type(message.parameters[ICP_MEMORY_TYPE].string_param)
message.parameters[ICP_MEMORY_TYPE].string_param
]
def set_icp_memory_type_id( def set_icp_memory_type_id(
...@@ -163,9 +149,7 @@ def set_icp_tensor_contents( ...@@ -163,9 +149,7 @@ def set_icp_tensor_contents(
with cupy.cuda.Device(tensor.memory_buffer.memory_type_id): with cupy.cuda.Device(tensor.memory_buffer.memory_type_id):
array = cupy.from_dlpack(tensor) array = cupy.from_dlpack(tensor)
else: else:
raise InvalidArgumentError( raise ValueError(f"Invalid Tensor Memory Type {tensor.memory_type}")
f"Invalid Tensor Memory Type {tensor.memory_type}"
)
message.contents.bytes_contents.append(array.tobytes()) message.contents.bytes_contents.append(array.tobytes())
...@@ -193,7 +177,7 @@ def get_icp_tensor_contents( ...@@ -193,7 +177,7 @@ def get_icp_tensor_contents(
array = numpy.array( array = numpy.array(
numpy.frombuffer( numpy.frombuffer(
message.contents.bytes_contents[0], message.contents.bytes_contents[0],
dtype=TRITON_TO_NUMPY_DTYPE[datatype], dtype=DATA_TYPE_TO_NUMPY_DTYPE[datatype],
) )
) )
tensor = Tensor(datatype, shape, MemoryBuffer.from_dlpack(array)) tensor = Tensor(datatype, shape, MemoryBuffer.from_dlpack(array))
......
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations
from enum import IntEnum
import numpy
from triton_distributed.icp._custom_key_error_dict import CustomKeyErrorDict
class DataType(IntEnum):
    """Tensor element types used on the wire (names match the protocol's
    datatype strings; integer values are stable and start at INVALID = 0)."""

    INVALID = 0
    BOOL = 1
    UINT8 = 2
    UINT16 = 3
    UINT32 = 4
    UINT64 = 5
    INT8 = 6
    INT16 = 7
    INT32 = 8
    INT64 = 9
    FP16 = 10
    FP32 = 11
    FP64 = 12
    BYTES = 13
    BF16 = 14


def string_to_data_type(data_type_string: str) -> DataType:
    """Look up a DataType by its exact name, raising ValueError for
    unknown names."""
    member = DataType.__members__.get(data_type_string)
    if member is None:
        raise ValueError(
            f"Unsupported Data Type String. Can't convert {data_type_string} to DataType"
        ) from None
    return member
# numpy scalar type -> DataType. Lookups of unsupported dtypes raise
# ValueError (via CustomKeyErrorDict). bytes_/str_/object_ all collapse to
# BYTES since the protocol has a single variable-length type.
NUMPY_TO_DATA_TYPE: dict[type, DataType] = CustomKeyErrorDict(
    "Numpy dtype",
    "Data type",
    {
        bool: DataType.BOOL,
        numpy.bool_: DataType.BOOL,
        numpy.int8: DataType.INT8,
        numpy.int16: DataType.INT16,
        numpy.int32: DataType.INT32,
        numpy.int64: DataType.INT64,
        numpy.uint8: DataType.UINT8,
        numpy.uint16: DataType.UINT16,
        numpy.uint32: DataType.UINT32,
        numpy.uint64: DataType.UINT64,
        numpy.float16: DataType.FP16,
        numpy.float32: DataType.FP32,
        numpy.float64: DataType.FP64,
        numpy.bytes_: DataType.BYTES,
        numpy.str_: DataType.BYTES,
        numpy.object_: DataType.BYTES,
    },
)

# Inverse of NUMPY_TO_DATA_TYPE; the many-to-one entries (BYTES, BOOL) are
# pinned explicitly so the reverse lookup is deterministic.
DATA_TYPE_TO_NUMPY_DTYPE: dict[DataType, type] = CustomKeyErrorDict(
    "Data type",
    "Numpy dtype",
    {
        **{value: key for key, value in NUMPY_TO_DATA_TYPE.items()},
        **{DataType.BYTES: numpy.object_},
        **{DataType.BOOL: numpy.bool_},
    },
)
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations
from dataclasses import dataclass
from typing import Any
from triton_distributed.icp._dlpack import DLPackObject
from triton_distributed.icp.memory_type import MemoryType
@dataclass
class MemoryBuffer:
    """Memory allocated for a Tensor.

    This object does not own the memory but holds a reference to the
    owner.

    Parameters
    ----------
    data_ptr : int
        Pointer to the allocated memory.
    memory_type : MemoryType
        memory type
    memory_type_id : int
        memory type id (typically the same as device id)
    size : int
        Size of the allocated memory in bytes.
    owner : Any
        Object that owns or manages the memory buffer. Allocated
        memory must not be freed while a reference to the owner is
        held.

    Examples
    --------
    >>> buffer = MemoryBuffer.from_dlpack(numpy.array([100],dtype=numpy.uint8))

    """

    data_ptr: int
    memory_type: MemoryType
    memory_type_id: int
    size: int
    owner: Any

    @staticmethod
    def from_dlpack(owner: Any) -> MemoryBuffer:
        """Create a MemoryBuffer viewing the memory of a DLPack-capable
        object. Raises ValueError if *owner* lacks ``__dlpack__``."""
        if not hasattr(owner, "__dlpack__"):
            raise ValueError("Object does not support DLpack protocol")
        dlpack_object = DLPackObject(owner)
        return MemoryBuffer._from_dlpack_object(owner, dlpack_object)

    @staticmethod
    def _from_dlpack_object(owner: Any, dlpack_object: DLPackObject) -> MemoryBuffer:
        """Build the buffer from an already-imported DLPackObject.

        Only contiguous (compact row-major) memory is supported, since a
        MemoryBuffer describes the data with a single (pointer, size) pair.
        """
        if not dlpack_object.contiguous:
            raise ValueError("Only contiguous memory is supported")
        return MemoryBuffer(
            int(dlpack_object.data_ptr),
            dlpack_object.memory_type,
            dlpack_object.memory_type_id,
            dlpack_object.byte_size,
            owner,
        )
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from enum import IntEnum
class MemoryType(IntEnum):
    """Physical location of a buffer: host memory, pinned host memory,
    or device (GPU) memory."""

    CPU = 0
    CPU_PINNED = 1
    GPU = 2


def string_to_memory_type(memory_type_string: str) -> MemoryType:
    """Look up a MemoryType by its exact name, raising ValueError for
    unknown names."""
    member = MemoryType.__members__.get(memory_type_string)
    if member is None:
        raise ValueError(
            f"Unsupported Memory Type String. Can't convert {memory_type_string} to MemoryType"
        ) from None
    return member
...@@ -25,7 +25,6 @@ from typing import Dict, Optional ...@@ -25,7 +25,6 @@ from typing import Dict, Optional
from urllib.parse import urlsplit, urlunsplit from urllib.parse import urlsplit, urlunsplit
import nats import nats
from tritonserver import InvalidArgumentError
from triton_distributed.icp.protos.icp_pb2 import ModelInferRequest, ModelInferResponse from triton_distributed.icp.protos.icp_pb2 import ModelInferRequest, ModelInferResponse
from triton_distributed.icp.request_plane import ( from triton_distributed.icp.request_plane import (
...@@ -203,7 +202,7 @@ class NatsRequestPlane(RequestPlane): ...@@ -203,7 +202,7 @@ class NatsRequestPlane(RequestPlane):
Optional[nats.js.JetStreamContext.PullSubscription], Optional[nats.js.JetStreamContext.PullSubscription],
]: ]:
if self._jet_stream is None: if self._jet_stream is None:
raise InvalidArgumentError( raise ValueError(
"Failed to get model stream: NATS Jetstream not connected!" "Failed to get model stream: NATS Jetstream not connected!"
) )
...@@ -335,19 +334,15 @@ class NatsRequestPlane(RequestPlane): ...@@ -335,19 +334,15 @@ class NatsRequestPlane(RequestPlane):
responses: AsyncIterator[ModelInferResponse] | ModelInferResponse, responses: AsyncIterator[ModelInferResponse] | ModelInferResponse,
): ):
if self._jet_stream is None: if self._jet_stream is None:
raise InvalidArgumentError( raise ValueError("Failed to post response: NATS Jetstream not connected!")
"Failed to post response: NATS Jetstream not connected!"
)
request_id = get_icp_request_id(request) request_id = get_icp_request_id(request)
if request_id is None: if request_id is None:
raise InvalidArgumentError("ICP request must have request id") raise ValueError("ICP request must have request id")
response_to_uri = get_icp_response_to_uri(request) response_to_uri = get_icp_response_to_uri(request)
if not response_to_uri: if not response_to_uri:
raise InvalidArgumentError( raise ValueError("Attempting to send a response when non requested")
"Attempting to send a response when non requested"
)
parsed = urlsplit(response_to_uri) parsed = urlsplit(response_to_uri)
response_stream = parsed.path.replace("/", "") response_stream = parsed.path.replace("/", "")
...@@ -378,12 +373,10 @@ class NatsRequestPlane(RequestPlane): ...@@ -378,12 +373,10 @@ class NatsRequestPlane(RequestPlane):
] = None, ] = None,
) -> AsyncIterator[ModelInferResponse]: ) -> AsyncIterator[ModelInferResponse]:
if self._jet_stream is None: if self._jet_stream is None:
raise InvalidArgumentError( raise ValueError("Failed to post request: NATS Jetstream not connected!")
"Failed to post request: NATS Jetstream not connected!"
)
if response_iterator and response_handler: if response_iterator and response_handler:
raise InvalidArgumentError( raise ValueError(
"Can only specify either response handler or response iterator" "Can only specify either response handler or response iterator"
) )
......
...@@ -13,14 +13,12 @@ ...@@ -13,14 +13,12 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
"""Abstract Class for interacting with the Triton Inference Serving Platform Inter-Component Protocol Control Plane""" """Abstract Class for interacting with the Triton Distributed Inter-Component Protocol Control Plane"""
import abc import abc
import uuid import uuid
from typing import AsyncIterator, Awaitable, Callable, Optional from typing import AsyncIterator, Awaitable, Callable, Optional
from tritonserver import TritonError
from triton_distributed.icp.protos.icp_pb2 import ModelInferRequest, ModelInferResponse from triton_distributed.icp.protos.icp_pb2 import ModelInferRequest, ModelInferResponse
ICP_REQUEST_ID = "icp_request_id" ICP_REQUEST_ID = "icp_request_id"
...@@ -33,6 +31,10 @@ ICP_REQUEST_CANCELLED = "icp_request_cancelled" ...@@ -33,6 +31,10 @@ ICP_REQUEST_CANCELLED = "icp_request_cancelled"
ICP_ERROR = "icp_response_error" ICP_ERROR = "icp_response_error"
class RequestPlaneError(Exception):
pass
def get_icp_request_id( def get_icp_request_id(
message: ModelInferRequest | ModelInferResponse, message: ModelInferRequest | ModelInferResponse,
) -> uuid.UUID | None: ) -> uuid.UUID | None:
...@@ -47,13 +49,15 @@ def set_icp_request_id( ...@@ -47,13 +49,15 @@ def set_icp_request_id(
message.parameters[ICP_REQUEST_ID].string_param = str(value) message.parameters[ICP_REQUEST_ID].string_param = str(value)
def get_icp_response_error(message: ModelInferResponse) -> TritonError | None: def get_icp_response_error(message: ModelInferResponse) -> RequestPlaneError | None:
if ICP_ERROR not in message.parameters: if ICP_ERROR not in message.parameters:
return None return None
return TritonError(message.parameters[ICP_ERROR].string_param) return RequestPlaneError(message.parameters[ICP_ERROR].string_param)
def set_icp_response_error(message: ModelInferResponse, value: TritonError) -> None: def set_icp_response_error(
message: ModelInferResponse, value: RequestPlaneError
) -> None:
message.parameters[ICP_ERROR].string_param = str(value) message.parameters[ICP_ERROR].string_param = str(value)
......
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations
import ctypes
import struct
from dataclasses import dataclass
from typing import Any, Callable, ClassVar, Sequence
import numpy
from triton_distributed.icp._dlpack import (
DATA_TYPE_TO_DLPACK_DTYPE,
MEMORY_TYPE_TO_DLPACK_DEVICE_TYPE,
DeviceOrMemoryType,
DLDevice,
DLDeviceType,
DLManagedTensor,
DLPackObject,
c_str_dltensor,
parse_device_or_memory_type,
)
from triton_distributed.icp.data_type import NUMPY_TO_DATA_TYPE, DataType
from triton_distributed.icp.memory_buffer import MemoryBuffer
from triton_distributed.icp.memory_type import MemoryType
try:
import cupy
except ImportError:
cupy = None
@dataclass
class Tensor:
    """Class representing a Tensor.

    Parameters
    ----------
    data_type : DataType
        Data type of the tensor.
    shape : Sequence[int]
        Shape of the tensor.
    memory_buffer : MemoryBuffer
        Memory buffer containing the tensor data.
    """

    data_type: DataType
    shape: Sequence[int]
    memory_buffer: MemoryBuffer

    @property
    def data_ptr(self) -> int:
        """Get the pointer to the tensor's data.

        Returns
        -------
        int
            The pointer to the tensor's data.
        """
        return self.memory_buffer.data_ptr

    @property
    def memory_type(self) -> MemoryType:
        """Get the memory type of the tensor.

        Returns
        -------
        MemoryType
            The memory type of the tensor.
        """
        return self.memory_buffer.memory_type

    @property
    def memory_type_id(self) -> int:
        """Get the ID representing the memory type of the tensor.

        Returns
        -------
        int
            The ID representing the memory type of the tensor.
        """
        return self.memory_buffer.memory_type_id

    @property
    def size(self) -> int:
        """Get the size of the tensor's data in bytes.

        Returns
        -------
        int
            The size of the tensor's data in bytes.
        """
        return self.memory_buffer.size

    def _sync_on_requested_stream(self, requested_stream_ptr):
        """Stream synchronization based on cupy implementation.

        Record event on requested stream if different from current
        stream.

        See
        `https://data-apis.org/array-api/latest/API_specification/generated/array_api.array.__dlpack__.html`
        and
        `https://github.com/cupy/cupy/blob/f9563bcd5c674623f80ce975b590f9c860b44ed6/cupy/_core/core.pyx#L281`.
        for more details.

        Parameters
        ----------
        requested_stream_ptr :
            requested stream as defined by DLPack protocol

        Raises
        ------
        unsupported
            If synchronization can not be done
        """
        current_stream = None
        unsupported = ValueError(
            f"DLPack stream synchronization on memory type {self.memory_type} and stream {requested_stream_ptr} not supported"
        )
        if requested_stream_ptr is not None and not isinstance(
            requested_stream_ptr, int
        ):
            raise unsupported
        if self.memory_type != MemoryType.GPU:
            # For host memory only the "no synchronization" values are valid.
            if requested_stream_ptr not in (None, 0):
                raise unsupported
            return
        if cupy is None:
            raise unsupported

        # NOTE: Technically this is not required by the protocol. It is the
        # responsibility of the caller(consumer) to ensure that
        # we are on the correct device. Added to ensure
        # the semantics are correct - but should be a no-op.
        # May be removed in the future.
        with cupy.cuda.Device(self.memory_type_id):
            current_stream = cupy.cuda.get_current_stream()

        curr_stream_ptr = current_stream.ptr

        # Based on cupy documentation
        # cupy.cuda.Stream.null.ptr is legacy default stream
        # cupy.cuda.Stream.ptds.ptr is per thread default stream
        if curr_stream_ptr == 0:
            curr_stream_ptr = cupy.cuda.Stream.null.ptr

        if requested_stream_ptr in (None, 0, 1):
            requested_stream_ptr = cupy.cuda.Stream.null.ptr

        if requested_stream_ptr in (2,):
            requested_stream_ptr = cupy.cuda.Stream.ptds.ptr

        if requested_stream_ptr >= 0 and curr_stream_ptr != requested_stream_ptr:
            next_stream = cupy.cuda.ExternalStream(requested_stream_ptr)
            event = current_stream.record()
            next_stream.wait_event(event)

    def __dlpack__(self, *, stream=None):
        """Convert the tensor to a DLPack-compatible object.

        Parameters
        ----------
        stream : Any, optional
            Consumer stream (as defined by the DLPack protocol) to
            synchronize the tensor's data with before export, by default
            None. See ``_sync_on_requested_stream`` for the supported
            values.

        Returns
        -------
        Any
            A DLPack-compatible object representing the tensor.
        """
        self._sync_on_requested_stream(stream)

        dl_managed_tensor = self._create_managed_tensor()

        pycapsule = ctypes.pythonapi.PyCapsule_New(
            ctypes.byref(dl_managed_tensor),
            c_str_dltensor,
            Tensor._pycapsule_deleter,
        )
        return pycapsule

    def __dlpack_device__(self) -> tuple[DLDeviceType, int]:
        """Get the DLPack device information for the tensor.

        Returns
        -------
        tuple[DLDeviceType, int]
            A tuple representing the DLPack device information (device type, device ID).
        """
        return (
            MEMORY_TYPE_TO_DLPACK_DEVICE_TYPE[self.memory_type],
            self.memory_type_id,
        )

    def to_string_array(self) -> numpy.ndarray:
        """Deserialize BYTES Tensor into numpy array of strings.

        If memory is not on the host the tensor data will be copied to
        the host before deserialization.

        Returns
        -------
        numpy.ndarray
            A numpy array of objects representing the BYTES tensor.

        Examples
        --------
        numpy_ndarray = response.outputs["text_output"].to_string_array()
        """
        return self.to_bytes_array().astype(str)

    def to_bytes_array(self) -> numpy.ndarray:
        """Deserialize BYTES Tensor into numpy array.

        If memory is not on the host the tensor data will be copied to
        the host before deserialization.

        Returns
        -------
        numpy.ndarray
            A numpy array of objects representing the BYTES tensor.

        Raises
        ------
        ValueError
            If the tensor's data type is not BYTES.

        Examples
        --------
        numpy_ndarray = response.outputs["text_output"].to_bytes_array()
        """
        if self.data_type != DataType.BYTES:
            raise ValueError(
                f"Tensor has data type {self.data_type} not {DataType.BYTES}"
            )

        # Temporarily reinterpret the tensor as a flat UINT8 array so the
        # raw serialized bytes can be copied to the host. Restore the
        # original metadata in a finally block so a failure during the host
        # copy can not leave the tensor with corrupted data_type/shape.
        original_data_type = self.data_type
        original_shape = self.shape
        try:
            self.data_type = DataType.UINT8
            self.shape = [self.size]
            numpy_ndarray = self._to_numpy_on_host()
        finally:
            self.shape = original_shape
            self.data_type = original_data_type

        # Deserialize bytes array and reshape
        return Tensor._deserialize_bytes_array(numpy_ndarray).reshape(self.shape)

    @staticmethod
    def from_string_array(string_array: list[str] | numpy.ndarray) -> Tensor:
        """Create BYTES Tensor from numpy array of strings or list of strings.

        Creates a tensor of type BYTES from a list of strings,
        or numpy array of type str_. The
        method allocates new host memory to store the serialized
        tensor.

        Parameters
        ----------
        string_array : list[str] | numpy.ndarray
            an array like object to convert

        Returns
        -------
        Tensor

        Raises
        ------
        ValueError
            If the given object can not be converted.

        Examples
        --------
        tensor = Tensor.from_string_array(numpy.array(["hello"]))
        tensor = Tensor.from_string_array(["hello"])
        """
        return Tensor.from_bytes_array(string_array)

    @staticmethod
    def from_bytes_array(
        bytes_array: list[str] | list[bytes] | numpy.ndarray,
    ) -> Tensor:
        """Create BYTES Tensor from numpy array or list

        Creates a tensor of type BYTES from a list of strings,
        bytes or a numpy array of type object_, bytes_, or str_. The
        method allocates new host memory to store the serialized
        tensor.

        Parameters
        ----------
        bytes_array : list[str | bytes] | numpy.ndarray
            an array like object to convert

        Returns
        -------
        Tensor

        Raises
        ------
        ValueError
            If the given object can not be converted.

        Examples
        --------
        tensor = Tensor.from_bytes_array(numpy.array(["hello"]))
        tensor = Tensor.from_bytes_array(["hello"])
        """
        result = Tensor._from_object(bytes_array)
        if result.data_type != DataType.BYTES:
            raise ValueError(
                f"Unsupported conversion from {bytes_array} to BYTES Tensor. Got {result.data_type}"
            )
        return result

    @staticmethod
    def _from_object(obj: list[Any] | numpy.ndarray | Any) -> Tensor:
        """Create a tensor from an object.

        Creates a tensor from an object using specific conversion
        methods if available or falls back to using __from_dlpack__.

        Specific conversions are currently supported for:

        list[obj: Any] : implicitly converted to numpy.array()

        numpy.ndarray : serialized if required to BYTES tensor

        Parameters
        ----------
        obj : list[Any] | numpy.ndarray | Any
            The input object to create the tensor from.

        Returns
        -------
        Tensor
            A new tensor created from the specified object.

        Examples
        --------
        tensor = Tensor.from_object(numpy.array(["hello"]))
        tensor = Tensor.from_object(["hello"])
        """
        if type(obj) in Tensor._from_converters:
            return Tensor._from_converters[type(obj)](obj)
        elif hasattr(obj, "__dlpack__"):
            return Tensor.from_dlpack(obj)
        else:
            raise ValueError(
                f"Input type {type(obj)} not supported. Must be one of {list(Tensor._from_converters.keys())} or the type must support __dlpack__"
            )

    @staticmethod
    def from_dlpack(obj: Any) -> Tensor:
        """Create a tensor from a DLPack-compatible object.

        Parameters
        ----------
        obj : Any
            The DLPack-compatible object.

        Returns
        -------
        Tensor
            A new tensor created from the DLPack-compatible object.

        Examples
        --------
        tensor = Tensor.from_dlpack(numpy.array([0,1,2], dtype=numpy.float16))
        tensor = Tensor.from_dlpack(torch.zeros(100, dtype=torch.float16))
        """
        dlpack_object = DLPackObject(obj)
        data_type = dlpack_object.data_type
        shape = dlpack_object.shape
        memory_buffer = MemoryBuffer._from_dlpack_object(
            obj, dlpack_object=dlpack_object
        )
        return Tensor(data_type, shape, memory_buffer)

    def to_host(self) -> Tensor:
        """Move the tensor to CPU memory from device memory

        Returns
        -------
        Tensor
            The tensor moved to the CPU.

        Examples
        --------
        tensor = Tensor.from_dlpack(torch.zeros(100, dtype=torch.float16).to("cuda"))
        numpy_nd_array = numpy.array(tensor.to_host())
        """
        return self.to_device("cpu")

    def to_device(self, device: DeviceOrMemoryType) -> Tensor:
        """Move the tensor to the specified device.

        Parameters
        ----------
        device : DeviceOrMemoryType
            The target device. Device can be specified as a string,
            MemoryType, tuple [MemoryType, memory_type__id], or
            tuple[DLDeviceType, device_id].

        Returns
        -------
        Tensor
            The tensor moved to the specified device.

        Raises
        ------
        ValueError
            If the conversion between memory types is not supported
            (e.g. GPU transfers when cupy is unavailable).

        Examples
        --------
        tensor_cpu = tritonserver.Tensor.from_dlpack(numpy.array([0,1,2], dtype=numpy.float16))

        # Different ways to specify the device

        tensor_gpu = tensor_cpu.to_device(MemoryType.GPU)

        tensor_gpu = tensor_cpu.to_device((MemoryType.GPU,0))

        tensor_gpu = tensor_cpu.to_device((DLDeviceType.kDLCUDA,0))

        tensor_gpu = tensor_cpu.to_device("gpu")

        tensor_gpu = tensor_cpu.to_device("gpu:0")

        ndarray_gpu = cupy.from_dlpack(tensor_gpu)

        ndarray_gpu[0] = ndarray_gpu.mean()

        tensor_cpu = tensor_gpu.to_device("cpu")

        ndarray_cpu = numpy.from_dlpack(tensor_cpu)

        assert ndarray_cpu[0] == ndarray_gpu[0]
        """
        memory_type, memory_type_id = parse_device_or_memory_type(device)
        # Already on the requested device: return self without copying.
        if self.memory_type == memory_type and self.memory_type_id == memory_type_id:
            return self
        # Pinned host memory is directly addressable from the CPU.
        if self.memory_type == MemoryType.CPU_PINNED and memory_type == MemoryType.CPU:
            return self
        if cupy is not None:
            if self.memory_type in (MemoryType.CPU, MemoryType.CPU_PINNED):
                ndarray = numpy.from_dlpack(self)
            else:
                ndarray = cupy.from_dlpack(self)

            if memory_type == MemoryType.CPU:
                return Tensor.from_dlpack(cupy.asnumpy(ndarray))

            if memory_type == MemoryType.GPU:
                with cupy.cuda.Device(memory_type_id):
                    return Tensor.from_dlpack(cupy.asarray(ndarray))

        raise ValueError(
            f"Conversion from {(self.memory_type,self.memory_type_id)} to {(memory_type, memory_type_id)} not supported."
        )

    def _to_numpy_on_host(self) -> numpy.ndarray:
        # Zero-copy view when the data is already host-accessible;
        # otherwise a device-to-host copy through cupy.
        if self.memory_type in (MemoryType.CPU, MemoryType.CPU_PINNED):
            return numpy.from_dlpack(self)
        if cupy is not None:
            return cupy.asnumpy(cupy.from_dlpack(self))
        raise ValueError(
            f"Conversion from {self.memory_type} to numpy array not supported."
        )

    @staticmethod
    def _deserialize_bytes_array(numpy_ndarray: numpy.ndarray) -> numpy.ndarray:
        # Inverse of _serialize_numpy_bytes_array: each element is encoded
        # as a native unsigned 32-bit length followed by that many bytes.
        result = []
        _buffer = memoryview(numpy_ndarray)
        offset = 0
        while offset < len(_buffer):
            (item_length,) = struct.unpack_from("@I", _buffer, offset)
            offset += 4
            result.append(bytes(_buffer[offset : offset + item_length]))
            offset += item_length
        return numpy.array(result, dtype=numpy.object_)

    @staticmethod
    def _serialize_numpy_bytes_array(array: numpy.ndarray) -> numpy.ndarray:
        # Length-prefix encoding in C (row-major) order; non-bytes items
        # are stringified and UTF-8 encoded.
        result = []
        for array_item in numpy.nditer(array, flags=["refs_ok"], order="C"):
            item = array_item.item()  # type: ignore
            if not isinstance(item, bytes):
                item = str(item).encode("utf-8")
            result.append(struct.pack("@I", len(item)))
            result.append(item)
        return numpy.frombuffer(b"".join(result), dtype=numpy.byte)

    @staticmethod
    def _from_list(obj: list[Any]) -> Tensor:
        try:
            return Tensor._from_numpy(numpy.array(obj))
        except Exception as e:
            raise ValueError(f"Conversion from {obj} to tensor not supported.") from e

    @staticmethod
    def _from_numpy(obj: numpy.ndarray | numpy.generic) -> Tensor:
        data_type = NUMPY_TO_DATA_TYPE[obj.dtype.type]
        # Capture the logical shape before any BYTES serialization flattens
        # the backing array.
        shape = obj.shape
        if isinstance(obj, numpy.generic):
            obj = numpy.asarray(obj)
        if data_type == DataType.BYTES:
            obj = Tensor._serialize_numpy_bytes_array(obj)

        memory_buffer = MemoryBuffer(
            data_ptr=obj.ctypes.data,
            memory_type=MemoryType.CPU,
            memory_type_id=0,
            size=obj.itemsize * obj.size,
            owner=obj,
        )

        return Tensor(data_type, shape, memory_buffer)

    def _create_managed_tensor(self) -> DLManagedTensor:
        # Allocates space for a managed tensor object
        # and fills in the fields
        #
        # To ensure the lifetime of the managed tensor we create a
        # context object that includes a newly created shape array and a
        # reference to self
        size = ctypes.c_size_t(ctypes.sizeof(DLManagedTensor))
        address = ctypes.pythonapi.PyMem_RawMalloc(size)
        dl_managed_tensor = DLManagedTensor.from_address(address)
        dl_managed_tensor.dl_tensor.data = self.data_ptr
        dl_managed_tensor.dl_tensor.device = DLDevice(
            MEMORY_TYPE_TO_DLPACK_DEVICE_TYPE[self.memory_type],
            self.memory_type_id,
        )
        dl_managed_tensor.dl_tensor.dtype = DATA_TYPE_TO_DLPACK_DTYPE[self.data_type]
        dl_managed_tensor.dl_tensor.ndim = len(self.shape)
        manager_ctx = _ManagerCtx(self)
        dl_managed_tensor.dl_tensor.shape = manager_ctx.shape
        dl_managed_tensor.dl_tensor.strides = manager_ctx.strides
        dl_managed_tensor.dl_tensor.byte_offset = 0
        dl_managed_tensor.deleter = Tensor._managed_tensor_deleter
        dl_managed_tensor.manager_ctx = manager_ctx.reference()
        return dl_managed_tensor

    @staticmethod
    @ctypes.CFUNCTYPE(None, ctypes.c_void_p)
    def _managed_tensor_deleter(handle: int) -> None:
        # Deleter installed on the DLManagedTensor: drops the keep-alive
        # reference taken in _create_managed_tensor and frees the raw
        # allocation made with PyMem_RawMalloc.
        dl_managed_tensor = DLManagedTensor.from_address(handle)
        _ManagerCtx.release(dl_managed_tensor.manager_ctx)
        ctypes.pythonapi.PyMem_RawFree(handle)

    @staticmethod
    @ctypes.CFUNCTYPE(None, ctypes.c_void_p)
    def _pycapsule_deleter(handle: ctypes.c_void_p) -> None:
        # Capsule destructor: only runs the managed tensor deleter if the
        # capsule was never consumed (consumers rename the capsule, making
        # PyCapsule_IsValid false).
        try:
            pycapsule: ctypes.py_object = ctypes.cast(handle, ctypes.py_object)
            if ctypes.pythonapi.PyCapsule_IsValid(pycapsule, c_str_dltensor):
                dl_managed_tensor = ctypes.pythonapi.PyCapsule_GetPointer(
                    pycapsule, c_str_dltensor
                )
                Tensor._managed_tensor_deleter(dl_managed_tensor)
                ctypes.pythonapi.PyCapsule_SetDestructor(pycapsule, None)
        except Exception as e:
            print(f"Exception occurred while deleting capsule: {e}")
            raise e

    _from_converters: ClassVar[dict[type, Callable[[Any], Tensor]]] = dict(
        {numpy.ndarray: _from_numpy, numpy.generic: _from_numpy, list: _from_list},
    )
class _ManagerCtx:
    """Keep-alive context for a DLManagedTensor exported by Tensor.__dlpack__.

    Owns the ctypes shape/strides arrays referenced by the exported
    DLTensor and holds a reference to the producing Tensor so the
    underlying memory outlives the capsule.
    """

    # To ensure the lifetime of the managed tensor we create a
    # context object that includes a newly created shape array and a
    # reference to self
    def __init__(self, tensor: Tensor) -> None:
        self._tensor = tensor
        # ctypes int64 array (one entry per dimension); referenced by the
        # DLTensor's shape field, so it must live as long as this context.
        self.shape = (ctypes.c_int64 * len(tensor.shape))(*tensor.shape)
        # NULL strides pointer — per the DLPack convention this indicates
        # a compact, row-major layout.
        self.strides = ctypes.POINTER(ctypes.c_int64)()

    def reference(self) -> ctypes.c_void_p:
        """Return a raw pointer to this object with an extra refcount held.

        The pointer is stored in the DLManagedTensor's manager_ctx field;
        the added reference is dropped later via release().
        """
        py_obj = ctypes.py_object(self)
        ctypes.pythonapi.Py_IncRef(py_obj)

        # Note: Could not find a direct way to cast a python object
        # to a c_void_p. The mechanism is to either use id(self) or
        # cast as described here:
        #
        # https://groups.google.com/g/dev-python/c/QRRqVC7gkf4/m/zH7l1gTXBwAJ
        #
        # To avoid relying on the behavior of id() we use the casting mechanism
        return ctypes.POINTER(ctypes.c_void_p)(py_obj)[0]  # type: ignore

    @staticmethod
    def release(reference: ctypes.c_void_p) -> None:
        """Drop the reference previously taken by reference()."""
        py_obj = ctypes.cast(reference, ctypes.py_object)
        ctypes.pythonapi.Py_DecRef(py_obj)
...@@ -25,10 +25,8 @@ from urllib.parse import urlsplit ...@@ -25,10 +25,8 @@ from urllib.parse import urlsplit
import cupy import cupy
import numpy import numpy
import tritonserver
import ucp import ucp
from cupy_backends.cuda.api.runtime import CUDARuntimeError from cupy_backends.cuda.api.runtime import CUDARuntimeError
from tritonserver import InvalidArgumentError, MemoryBuffer, MemoryType, Tensor
from triton_distributed.icp.data_plane import ( from triton_distributed.icp.data_plane import (
DataPlane, DataPlane,
...@@ -48,7 +46,11 @@ from triton_distributed.icp.data_plane import ( ...@@ -48,7 +46,11 @@ from triton_distributed.icp.data_plane import (
set_icp_tensor_size, set_icp_tensor_size,
set_icp_tensor_uri, set_icp_tensor_uri,
) )
from triton_distributed.icp.data_type import DataType
from triton_distributed.icp.memory_buffer import MemoryBuffer
from triton_distributed.icp.memory_type import MemoryType
from triton_distributed.icp.protos.icp_pb2 import ModelInferRequest, ModelInferResponse from triton_distributed.icp.protos.icp_pb2 import ModelInferRequest, ModelInferResponse
from triton_distributed.icp.tensor import Tensor
LOGGER = logging.getLogger(__name__) LOGGER = logging.getLogger(__name__)
...@@ -175,20 +177,18 @@ class _UcpDataPlane(DataPlane): ...@@ -175,20 +177,18 @@ class _UcpDataPlane(DataPlane):
if tensor_id in self._tensor_store: if tensor_id in self._tensor_store:
tensor = self._tensor_store[tensor_id] tensor = self._tensor_store[tensor_id]
array_module = numpy array_module = numpy
if tensor.memory_type == tritonserver.MemoryType.CPU: if tensor.memory_type == MemoryType.CPU:
array_module = numpy array_module = numpy
device_manager = contextlib.nullcontext() device_manager = contextlib.nullcontext()
elif tensor.memory_type == tritonserver.MemoryType.GPU: elif tensor.memory_type == MemoryType.GPU:
array_module = cupy array_module = cupy
device_manager = cupy.cuda.Device( device_manager = cupy.cuda.Device(
tensor.memory_buffer.memory_type_id tensor.memory_buffer.memory_type_id
) )
else: else:
raise InvalidArgumentError( raise ValueError(f"Invalid Memory Type {tensor.memory_type}")
f"Invalid Memory Type {tensor.memory_type}"
)
with device_manager: with device_manager:
if tensor.data_type == tritonserver.DataType.BYTES: if tensor.data_type == DataType.BYTES:
array = tensor.memory_buffer.owner array = tensor.memory_buffer.owner
else: else:
array = array_module.from_dlpack(tensor) array = array_module.from_dlpack(tensor)
...@@ -343,7 +343,7 @@ class _UcpDataPlane(DataPlane): ...@@ -343,7 +343,7 @@ class _UcpDataPlane(DataPlane):
if requested_memory_type is not None: if requested_memory_type is not None:
memory_type = requested_memory_type memory_type = requested_memory_type
if memory_type == tritonserver.MemoryType.GPU and self._cuda_is_available: if memory_type == MemoryType.GPU and self._cuda_is_available:
array_module = cupy array_module = cupy
if requested_memory_type_id is not None: if requested_memory_type_id is not None:
device_manager = cupy.cuda.Device(requested_memory_type_id) device_manager = cupy.cuda.Device(requested_memory_type_id)
......
...@@ -24,10 +24,11 @@ import numpy ...@@ -24,10 +24,11 @@ import numpy
import pytest import pytest
import ucp import ucp
from cupy_backends.cuda.api.runtime import CUDARuntimeError from cupy_backends.cuda.api.runtime import CUDARuntimeError
from tritonserver import DataType, MemoryType, Tensor
from tritonserver._api._datautils import TRITON_TO_NUMPY_DTYPE
from triton_distributed.icp.data_plane import DataPlaneError from triton_distributed.icp.data_plane import DataPlaneError
from triton_distributed.icp.data_type import DATA_TYPE_TO_NUMPY_DTYPE, DataType
from triton_distributed.icp.memory_type import MemoryType
from triton_distributed.icp.tensor import Tensor
from triton_distributed.icp.ucp_data_plane import ( from triton_distributed.icp.ucp_data_plane import (
UcpDataPlane, UcpDataPlane,
get_icp_tensor_uri, get_icp_tensor_uri,
...@@ -283,7 +284,7 @@ def test_requested_memory_type(memory_type, memory_type_id, request): ...@@ -283,7 +284,7 @@ def test_requested_memory_type(memory_type, memory_type_id, request):
def _get_random_tensor(data_type: DataType, size: Sequence[int]): def _get_random_tensor(data_type: DataType, size: Sequence[int]):
dtype = TRITON_TO_NUMPY_DTYPE[data_type] dtype = DATA_TYPE_TO_NUMPY_DTYPE[data_type]
value = numpy.random.rand(*size) value = numpy.random.rand(*size)
return value.astype(dtype) return value.astype(dtype)
......
# Packaging configuration for the triton_distributed.runtime Python package.
[build-system]
# setuptools-scm derives the package version from SCM metadata at build time.
requires = ["setuptools>=65.0", "setuptools-scm>=8"]
build-backend = "setuptools.build_meta"

[project]
name = "triton_distributed.runtime"
# Version is dynamic: supplied by setuptools-scm (or the
# SETUPTOOLS_SCM_PRETEND_VERSION_* env var) instead of being hard-coded here.
dynamic = ["version"]
# Depends on the companion ICP package; any published version is accepted.
dependencies = ["triton_distributed.icp >= 0"]

[tool.setuptools_scm]
# Version module generated at build time.
version_file = "src/triton_distributed/runtime/_version.py"
# SCM root is two directories above this pyproject (the repository root).
root = "../.."

[tool.setuptools.packages.find]
where = ["src"]
include = ["triton_distributed.runtime*"]
# triton_distributed is a namespace package shared with triton_distributed.icp.
namespaces= true
...@@ -24,8 +24,13 @@ from triton_distributed.runtime.remote_request import ( ...@@ -24,8 +24,13 @@ from triton_distributed.runtime.remote_request import (
from triton_distributed.runtime.remote_response import ( from triton_distributed.runtime.remote_response import (
RemoteInferenceResponse as RemoteInferenceResponse, RemoteInferenceResponse as RemoteInferenceResponse,
) )
from triton_distributed.runtime.triton_core_operator import (
TritonCoreOperator as TritonCoreOperator, try:
) from triton_distributed.runtime.triton_core_operator import (
TritonCoreOperator as TritonCoreOperator,
)
except ImportError:
pass
from triton_distributed.runtime.worker import Worker as Worker from triton_distributed.runtime.worker import Worker as Worker
from triton_distributed.runtime.worker import WorkerConfig as WorkerConfig from triton_distributed.runtime.worker import WorkerConfig as WorkerConfig
...@@ -16,8 +16,6 @@ import multiprocessing ...@@ -16,8 +16,6 @@ import multiprocessing
from pprint import pformat from pprint import pformat
from typing import Optional, Type from typing import Optional, Type
from tritonserver import InvalidArgumentError
from triton_distributed.icp import ( from triton_distributed.icp import (
DataPlane, DataPlane,
NatsRequestPlane, NatsRequestPlane,
...@@ -71,7 +69,7 @@ class Deployment: ...@@ -71,7 +69,7 @@ class Deployment:
if self._default_request_plane == NatsRequestPlane: if self._default_request_plane == NatsRequestPlane:
self.request_plane_server = NatsServer(log_dir=self._default_log_dir) self.request_plane_server = NatsServer(log_dir=self._default_log_dir)
else: else:
raise InvalidArgumentError( raise ValueError(
f"Unknown Request Plane Type, can not initialize {self._default_request_plane}" f"Unknown Request Plane Type, can not initialize {self._default_request_plane}"
) )
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment