Commit ccfe101e authored by Neelay Shah's avatar Neelay Shah Committed by GitHub
Browse files

build: pip based installation of icp and runtime. Also make tritonserver optional


Signed-off-by: Neelay Shah <neelays@nvidia.com>
Co-authored-by: Ryan McCormick <rmccormick@nvidia.com>
parent 5701753a
......@@ -154,8 +154,20 @@ COPY . /workspace
RUN cd runtime/rust && cargo build --release --locked && cargo doc --no-deps
RUN /workspace/icp/protos/gen_python.sh
# Install python packages
ARG PYTHON_PACKAGE_VERSION=0.0.1.dev+unknown
# SETUPTOOLS_SCM_PRETEND_VERSION allows dynamically setting the package versions during build/install.
# This allows having versioned packages during development between releases, such as commit IDs.
#
# Normally SCM version is taken directly from .git but this is not available in the Dockerfile
# and so we pass in via a buildarg
RUN SETUPTOOLS_SCM_PRETEND_VERSION_FOR_TRITON_DISTRIBUTED_ICP=${PYTHON_PACKAGE_VERSION} pip install -e /workspace/icp/python
RUN SETUPTOOLS_SCM_PRETEND_VERSION_FOR_TRITON_DISTRIBUTED_RUNTIME=${PYTHON_PACKAGE_VERSION} pip install -e /workspace/runtime/python
# Sets pythonpath for python modules
ENV PYTHONPATH="${PYTHONPATH}:/workspace/icp/python/src:/workspace/runtime/python/src:/workspace/examples/python:/opt/tritonserver/python/openai/openai_frontend"
ENV PYTHONPATH="${PYTHONPATH}:/workspace/examples/python:/opt/tritonserver/python/openai/openai_frontend"
# Enable system UCX
ENV RAPIDS_LIBUCX_PREFER_SYSTEM_LIBRARY=true
......
......@@ -22,12 +22,16 @@ PLATFORM=linux/amd64
# Get short commit hash
commit_id=$(git rev-parse --short HEAD)
# Attempt to get current tag
current_tag=$(git describe --tags --exact-match 2>/dev/null) || true
# if COMMIT_ID matches a TAG use that
current_tag=$(git describe --tags --exact-match 2>/dev/null | sed 's/^v//') || true
# Use tag if available, otherwise use commit hash
VERSION=${current_tag:-$commit_id}
# Get latest TAG and add COMMIT_ID for dev
latest_tag=$(git describe --tags --abbrev=0 $(git rev-list --tags --max-count=1 main) | sed 's/^v//') || true
# Use tag if available, otherwise use latest_tag.dev.commit_id
VERSION=v${current_tag:-$latest_tag.dev.$commit_id}
PYTHON_PACKAGE_VERSION=${current_tag:-$latest_tag.dev+$commit_id}
# Frameworks
#
......@@ -244,7 +248,7 @@ get_options "$@"
# BUILD DEV IMAGE
BUILD_ARGS+=" --build-arg BASE_IMAGE=$BASE_IMAGE --build-arg BASE_IMAGE_TAG=$BASE_IMAGE_TAG --build-arg FRAMEWORK=$FRAMEWORK --build-arg ${FRAMEWORK}_FRAMEWORK=1"
BUILD_ARGS+=" --build-arg BASE_IMAGE=$BASE_IMAGE --build-arg BASE_IMAGE_TAG=$BASE_IMAGE_TAG --build-arg FRAMEWORK=$FRAMEWORK --build-arg ${FRAMEWORK}_FRAMEWORK=1 --build-arg VERSION=$VERSION --build-arg PYTHON_PACKAGE_VERSION=$PYTHON_PACKAGE_VERSION"
if [ ! -z ${GITHUB_TOKEN} ]; then
BUILD_ARGS+=" --build-arg GITHUB_TOKEN=${GITHUB_TOKEN} "
......
......@@ -8,12 +8,12 @@ class EncodeDecodeOperator(Operator):
self,
name,
version,
triton_core,
request_plane,
data_plane,
parameters,
repository,
logger,
triton_core,
):
self._encoder = RemoteOperator("encoder", request_plane, data_plane)
self._decoder = RemoteOperator("decoder", request_plane, data_plane)
......
......@@ -41,12 +41,12 @@ class EncodeDecodeOperator(Operator):
self,
name,
version,
triton_core,
request_plane,
data_plane,
parameters,
repository,
logger,
triton_core,
):
self._encoder = RemoteOperator("encoder", request_plane, data_plane)
self._decoder = RemoteOperator("decoder", request_plane, data_plane)
......
......@@ -2,7 +2,7 @@ import argparse
import json
import logging
from dataclasses import field
from typing import AsyncGenerator, List, Optional
from typing import Any, AsyncGenerator, List, Optional
import numpy as np
......@@ -23,7 +23,6 @@ class VllmOperator(Operator):
self,
name: str,
version: int,
triton_core,
request_plane: RequestPlane,
data_plane: DataPlane,
parameters: Optional[dict[str, str | int | bool | bytes]] = field(
......@@ -31,6 +30,7 @@ class VllmOperator(Operator):
),
repository: Optional[str] = None,
logger: Optional[logging.Logger] = None,
triton_core: Optional[Any] = None,
):
self.name = name
self.version = version
......
[build-system]
requires = ["setuptools>=65.0", "setuptools-scm>=8"]
build-backend = "setuptools.build_meta"
[project]
name = "triton_distributed.icp"
dynamic = ["version"]
[tool.setuptools_scm]
version_file = "src/triton_distributed/icp/_version.py"
root = "../.."
[tool.setuptools.packages.find]
where = ["src"]
include = ["triton_distributed.icp*"]
namespaces = true
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Type
class CustomKeyErrorDict(dict):
    """Dict that raises a domain-specific exception instead of KeyError.

    Used for the conversion tables in this package so that a missing entry
    surfaces as e.g. ``ValueError("Unsupported X. Can't convert k to Y")``
    rather than a bare ``KeyError``.
    """

    def __init__(
        self,
        from_name: str,
        to_name: str,
        *args,
        exception: Type[Exception] = ValueError,
        **kwargs,
    ):
        super().__init__(*args, **kwargs)
        self._from_name = from_name
        self._to_name = to_name
        self._exception = exception

    def __getitem__(self, key):
        if key in self:
            return super().__getitem__(key)
        # ``from None`` keeps the traceback free of the internal lookup detail.
        raise self._exception(
            f"Unsupported {self._from_name}. Can't convert {key} to {self._to_name}"
        ) from None
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
################################################################################
# This file contains the DLPack API wrapped in Python style (see
# 'dlpack.h' for detail) and the utilities for Triton client to interact
# with DLPack
#
# Ref:
# https://github.com/dmlc/dlpack/blob/main/include/dlpack/dlpack.h
# https://github.com/dmlc/dlpack/blob/main/apps/numpy_dlpack/dlpack/from_numpy.py
################################################################################
import ctypes
from typing import Union
from triton_distributed.icp._custom_key_error_dict import CustomKeyErrorDict
from triton_distributed.icp.data_type import DataType
from triton_distributed.icp.memory_type import MemoryType, string_to_memory_type
try:
import cupy
except ImportError:
cupy = None
# Need to explicit set the res / arg types for pythonapi functions to
# work properly (ctypes defaults every result/argument to c_int, which
# truncates 64-bit pointers).
ctypes.pythonapi.PyMem_RawMalloc.restype = ctypes.c_void_p
ctypes.pythonapi.PyMem_RawFree.argtypes = [ctypes.c_void_p]
ctypes.pythonapi.PyCapsule_New.restype = ctypes.py_object
ctypes.pythonapi.PyCapsule_New.argtypes = [
    ctypes.c_void_p,
    ctypes.c_char_p,
    ctypes.c_void_p,
]
ctypes.pythonapi.PyCapsule_GetPointer.restype = ctypes.c_void_p
ctypes.pythonapi.PyCapsule_GetPointer.argtypes = [ctypes.py_object, ctypes.c_char_p]

# Capsule name used by the DLPack protocol for an unconsumed tensor capsule.
c_str_dltensor = b"dltensor"
# Device types from dlpack.h (DLDeviceType enum).
# NOTE: the kDL* members are plain int class attributes, not instances of
# this ctypes type.
class DLDeviceType(ctypes.c_int):
    kDLCPU = 1
    kDLCUDA = 2
    kDLCUDAHost = 3
    kDLOpenCL = 4
    kDLVulkan = 7
    kDLMetal = 8
    kDLVPI = 9
    kDLROCM = 10
    kDLROCMHost = 11
    kDLExtDev = 12
    kDLCUDAManaged = 13
    kDLOneAPI = 14
    kDLWebGPU = 15
    kDLHexagon = 16
# Accepted caller-facing ways to name a device / memory location; normalized
# by parse_device_or_memory_type() below.
DeviceOrMemoryType = Union[
    tuple[MemoryType, int], MemoryType, tuple[DLDeviceType, int], str
]


# The ctypes structures below mirror the C definitions in dlpack.h.


class DLDevice(ctypes.Structure):
    _fields_ = [
        ("device_type", ctypes.c_int),
        ("device_id", ctypes.c_int),
    ]


# Data type categories from dlpack.h (DLDataTypeCode enum); the kDL* members
# are plain int class attributes.
class DLDataTypeCode(ctypes.c_uint8):
    kDLInt = 0
    kDLUInt = 1
    kDLFloat = 2
    kDLOpaquePointer = 3
    kDLBfloat = 4
    kDLComplex = 5
    kDLBool = 6


class DLDataType(ctypes.Structure):
    _fields_ = [
        ("type_code", ctypes.c_uint8),
        ("bits", ctypes.c_uint8),
        ("lanes", ctypes.c_uint16),
    ]


class DLTensor(ctypes.Structure):
    _fields_ = [
        ("data", ctypes.c_void_p),
        ("device", DLDevice),
        ("ndim", ctypes.c_int),
        ("dtype", DLDataType),
        ("shape", ctypes.POINTER(ctypes.c_int64)),
        # NULL strides mean compact row-major data (see is_contiguous_data).
        ("strides", ctypes.POINTER(ctypes.c_int64)),
        ("byte_offset", ctypes.c_uint64),
    ]


class DLManagedTensor(ctypes.Structure):
    _fields_ = [
        ("dl_tensor", DLTensor),
        # Opaque pointer owned by the producer; passed back to 'deleter'.
        ("manager_ctx", ctypes.c_void_p),
        ("deleter", ctypes.CFUNCTYPE(None, ctypes.c_void_p)),
    ]
# Utilities


def _raise_error(msg):
    """
    Raise error with the provided message
    """
    # NOTE(review): raises the base Exception type; DLPackObject.__init__
    # catches Exception broadly, so a dedicated error class may be clearer.
    raise Exception(msg) from None
# Use as managed context in DLPack that doesn't hold ownership of the
# data content.
class DataViewContext:
    """Manager context for exporting a view over externally-owned memory.

    Keeps the ctypes shape/stride arrays alive for as long as the exported
    DLManagedTensor references them.
    """

    def __init__(self, shape) -> None:
        # Convert the Python object to ctypes objects expected by
        # DLPack
        self._shape = (ctypes.c_int64 * len(shape))(*shape)
        # No strides: compact and row-major
        self._strides = ctypes.POINTER(ctypes.c_int64)()

    def as_manager_ctx(self) -> ctypes.c_void_p:
        """Return a void* suitable for DLManagedTensor.manager_ctx.

        Takes one reference on this object and one on the pointer wrapper;
        managed_tensor_deleter releases both.
        """
        py_obj = ctypes.py_object(self)
        py_obj_ptr = ctypes.pointer(py_obj)
        ctypes.pythonapi.Py_IncRef(py_obj)
        ctypes.pythonapi.Py_IncRef(ctypes.py_object(py_obj_ptr))
        return ctypes.cast(py_obj_ptr, ctypes.c_void_p)
@ctypes.CFUNCTYPE(None, ctypes.c_void_p)
def managed_tensor_deleter(handle: ctypes.c_void_p) -> None:
    """DLManagedTensor.deleter callback.

    Releases the two references taken in DataViewContext.as_manager_ctx and
    frees the raw allocation holding the DLManagedTensor struct itself.
    """
    dl_managed_tensor = DLManagedTensor.from_address(handle)  # type: ignore
    py_obj_ptr = ctypes.cast(
        dl_managed_tensor.manager_ctx, ctypes.POINTER(ctypes.py_object)
    )
    py_obj = py_obj_ptr.contents
    # Mirrors the two Py_IncRef calls in DataViewContext.as_manager_ctx.
    ctypes.pythonapi.Py_DecRef(py_obj)
    ctypes.pythonapi.Py_DecRef(ctypes.py_object(py_obj_ptr))
    # NOTE(review): assumes the struct was allocated with PyMem_RawMalloc
    # (its restype is configured above); the allocation site is not in view.
    ctypes.pythonapi.PyMem_RawFree(handle)
@ctypes.CFUNCTYPE(None, ctypes.c_void_p)
def pycapsule_deleter(handle: ctypes.c_void_p) -> None:
    """Capsule destructor: free the tensor only if nobody consumed the capsule.

    Per the DLPack convention a consumer renames the capsule, so a capsule
    still named "dltensor" means ownership was never transferred.
    """
    pycapsule: ctypes.py_object = ctypes.cast(handle, ctypes.py_object)
    if ctypes.pythonapi.PyCapsule_IsValid(pycapsule, c_str_dltensor):
        dl_managed_tensor = ctypes.pythonapi.PyCapsule_GetPointer(
            pycapsule, c_str_dltensor
        )
        managed_tensor_deleter(dl_managed_tensor)
        # NOTE(review): restype/argtypes are configured above only for
        # PyCapsule_New/GetPointer, not IsValid/SetDestructor — confirm the
        # ctypes defaults are safe for these two calls.
        ctypes.pythonapi.PyCapsule_SetDestructor(pycapsule, None)
def is_contiguous_data(
    ndim: ctypes.c_int,
    shape: ctypes.POINTER(ctypes.c_int64),  # type: ignore
    stride: ctypes.POINTER(ctypes.c_int64),  # type: ignore
):
    """Return True when the strides describe compact row-major data.

    A missing or NULL stride pointer means the data is compact row-major
    by DLPack convention.
    """
    # NULL / absent strides => contiguous by definition.
    if stride is None or not bool(stride):
        return True
    # Walk axes from innermost to outermost, accumulating the stride a
    # compact row-major layout would have.
    expected_stride = 1
    for axis in range(ndim - 1, -1, -1):  # type: ignore
        if stride[axis] != expected_stride:
            return False
        expected_stride *= shape[axis]
    return True
def get_byte_size(
    dtype: DLDataType, ndim: ctypes.c_int, shape: ctypes.POINTER(ctypes.c_int64)  # type: ignore
):
    """Total tensor size in bytes: per-element byte size times element count.

    Assumes 8 bits per byte; sub-byte types truncate via integer division,
    matching the original behavior.
    """
    total_bytes = dtype.bits * dtype.lanes // 8
    for axis in range(ndim):  # type: ignore
        total_bytes *= shape[axis]
    return total_bytes
def get_dlpack_capsule(dlpack_obj, stream=None):
    """Extract the DLPack PyCapsule from *dlpack_obj*.

    Objects implementing the modern protocol expose ``__dlpack__`` (and must
    also expose ``__dlpack_device__``); legacy producers hand the capsule
    over directly.

    Parameters
    ----------
    dlpack_obj : Any
        A DLPack-capable object, or a raw PyCapsule (legacy interface).
    stream : Optional[int]
        Stream to pass to ``__dlpack__`` for CUDA tensors.
    """
    # Extract PyCapsule of the DLPack object
    if hasattr(dlpack_obj, "__dlpack__"):
        if not hasattr(dlpack_obj, "__dlpack_device__"):
            _raise_error(
                "DLPack expects '__dlpack_device__' if '__dlpack__' has been defined"
            )
        # BUGFIX: __dlpack_device__ returns a (device_type, device_id) pair
        # (see DLPackObject.__init__). The previous code compared the whole
        # tuple against the int kDLCUDA, which is always unequal, so the
        # stream was never forwarded for CUDA tensors.
        device_type, _device_id = dlpack_obj.__dlpack_device__()
        # Have to condition on the device type as, using numpy as example,
        # some DLPack implementation doesn't accept 'stream' as arguments
        if device_type != DLDeviceType.kDLCUDA:
            return dlpack_obj.__dlpack__()
        else:
            return dlpack_obj.__dlpack__(stream)
    else:
        # Old interface where PyCapsule object is passed directly
        return dlpack_obj
def get_dlpack_device(dlpack_obj):
    """Return the object's (device_type, device_id) pair, or None for
    producers that do not implement ``__dlpack_device__``."""
    device_getter = getattr(dlpack_obj, "__dlpack_device__", None)
    return device_getter() if device_getter is not None else None
def get_managed_tensor(dlcapsule):
    """Dereference a "dltensor" capsule into its DLManagedTensor struct."""
    ptr = ctypes.pythonapi.PyCapsule_GetPointer(dlcapsule, c_str_dltensor)
    return DLManagedTensor.from_address(ptr)
class DLPackObject:
    """Parsed, read-only view over any object implementing the DLPack protocol.

    Holds the producer's capsule plus the underlying DLTensor and exposes the
    tensor's metadata (shape, dtype, memory type, ...) as properties.
    """

    def __init__(self, value) -> None:
        try:
            stream = None
            # __dlpack_device__ returns a (device_type, device_id) pair.
            device, device_id = value.__dlpack_device__()
            if device == DLDeviceType.kDLCUDA:
                if cupy is None:
                    # Without cupy we cannot activate the CUDA device.
                    raise ValueError(
                        f"DLPack synchronization on device {device,device_id} not supported"
                    )
                with cupy.cuda.Device(device_id):
                    stream = 1  # legacy default stream
                    self._capsule = get_dlpack_capsule(value, stream)
                    self._tensor = get_managed_tensor(self._capsule).dl_tensor
            else:
                self._capsule = get_dlpack_capsule(value)
                self._tensor = get_managed_tensor(self._capsule).dl_tensor
        except Exception as e:
            # Any failure (missing protocol, producer error) surfaces uniformly.
            raise ValueError(f"Object does not support DLPack protocol: {e}") from None

    def __eq__(self, other) -> bool:
        """Metadata equality: same size, location, shape, pointer, layout, type."""
        if not isinstance(other, DLPackObject):
            return False
        if self.byte_size != other.byte_size:
            return False
        if self.memory_type != other.memory_type:
            return False
        if self.memory_type_id != other.memory_type_id:
            return False
        if self.shape != other.shape:
            return False
        if self.data_ptr != other.data_ptr:
            return False
        if self.contiguous != other.contiguous:
            return False
        if self.data_type != other.data_type:
            return False
        return True

    @property
    def byte_size(self) -> int:
        # Total tensor size in bytes.
        return get_byte_size(self._tensor.dtype, self._tensor.ndim, self._tensor.shape)

    @property
    def memory_type(self) -> MemoryType:
        # Raises ValueError for device types other than CPU / CUDA.
        return DLPACK_DEVICE_TYPE_TO_MEMORY_TYPE[self._tensor.device.device_type]

    @property
    def memory_type_id(self) -> int:
        # Device ordinal (e.g. the CUDA device index).
        return self._tensor.device.device_id

    @property
    def shape(self) -> list[int]:
        return [self._tensor.shape[i] for i in range(self._tensor.ndim)]

    @property
    def data_type(self) -> DataType:
        # Raises ValueError for unsupported (type_code, bits) combinations.
        return DLPACK_TO_DATA_TYPE[self.dlpack_data_type]

    @property
    def dlpack_data_type(self) -> tuple[DLDataTypeCode, int]:
        # ctypes struct fields read back as plain ints, so this is
        # effectively an (int, int) pair — matching the keys of
        # DLPACK_TO_DATA_TYPE.
        return (self._tensor.dtype.type_code, self._tensor.dtype.bits)

    @property
    def data_ptr(self) -> int:
        # Start address of the data: base pointer plus byte offset.
        # NOTE(review): a NULL base pointer reads back as None here, making
        # this None + int -> TypeError — confirm producers never export NULL.
        return self._tensor.data + self._tensor.byte_offset

    @property
    def contiguous(self) -> bool:
        # True when strides are absent or describe compact row-major data.
        return is_contiguous_data(
            self._tensor.ndim, self._tensor.shape, self._tensor.strides
        )
# DLPack device type (int constant) -> MemoryType. Only CPU and CUDA are
# representable; any other device type raises ValueError via
# CustomKeyErrorDict.
DLPACK_DEVICE_TYPE_TO_MEMORY_TYPE: dict[DLDeviceType, MemoryType] = CustomKeyErrorDict(
    "DLPack device type",
    "Memory type",
    {
        DLDeviceType.kDLCUDA: MemoryType.GPU,
        DLDeviceType.kDLCPU: MemoryType.CPU,
    },
)

# Inverse mapping. CPU_PINNED has no DLPack equivalent and is exported as
# plain kDLCPU.
MEMORY_TYPE_TO_DLPACK_DEVICE_TYPE: dict[MemoryType, DLDeviceType] = CustomKeyErrorDict(
    "Memory type",
    "DLPack device type",
    {
        **{value: key for key, value in DLPACK_DEVICE_TYPE_TO_MEMORY_TYPE.items()},
        **{MemoryType.CPU_PINNED: DLDeviceType.kDLCPU},
    },
)
def parse_device_or_memory_type(
    device_or_memory_type: DeviceOrMemoryType,
) -> tuple[MemoryType, int]:
    """Normalize the accepted device specifications to (memory_type, id).

    Accepts:
      * ``(MemoryType, id)`` tuples,
      * ``(DLPack device type, id)`` tuples — the kDL* constants on
        DLDeviceType are plain ints, so ints are accepted here,
      * a bare ``MemoryType`` (id defaults to 0),
      * strings such as ``"cpu"`` or ``"gpu:1"``.

    Raises
    ------
    ValueError
        For any other argument, an unknown device type, or a malformed
        string.
    """
    memory_type = None
    memory_type_id = 0
    if isinstance(device_or_memory_type, tuple):
        if isinstance(device_or_memory_type[0], MemoryType):
            memory_type = device_or_memory_type[0]
            memory_type_id = device_or_memory_type[1]
        elif isinstance(device_or_memory_type[0], (DLDeviceType, int)):
            # BUGFIX: DLDeviceType.kDLCUDA etc. are plain int class
            # attributes, not DLDeviceType instances, so the previous
            # isinstance(..., DLDeviceType) check rejected the documented
            # tuple[DLDeviceType, int] form. Accept ints, and unwrap a real
            # ctypes instance to its .value so the int-keyed lookup works.
            device_type = device_or_memory_type[0]
            if isinstance(device_type, DLDeviceType):
                device_type = device_type.value
            memory_type = DLPACK_DEVICE_TYPE_TO_MEMORY_TYPE[device_type]
            memory_type_id = device_or_memory_type[1]
        else:
            raise ValueError(f"Invalid memory type {device_or_memory_type}")
    elif isinstance(device_or_memory_type, MemoryType):
        memory_type = device_or_memory_type
        memory_type_id = 0
    elif isinstance(device_or_memory_type, str):
        # "type" or "type:id", e.g. "gpu:1".
        memory_str_tuple = device_or_memory_type.split(":")
        if len(memory_str_tuple) > 2:
            raise ValueError(f"Invalid memory type string {device_or_memory_type}")
        memory_type = string_to_memory_type(memory_str_tuple[0].upper())
        if len(memory_str_tuple) == 2:
            try:
                memory_type_id = int(memory_str_tuple[1])
            except ValueError:
                raise ValueError(
                    f"Invalid memory type string {device_or_memory_type}"
                ) from None
        else:
            memory_type_id = 0
    else:
        # BUGFIX: previously an unrecognized argument fell through and
        # returned (None, 0), violating the declared return type.
        raise ValueError(f"Invalid memory type {device_or_memory_type}")
    return (memory_type, memory_type_id)
# (DLDataTypeCode, bits) -> DataType. Combinations not listed here (e.g.
# complex types, sub-byte widths) raise ValueError via CustomKeyErrorDict.
DLPACK_TO_DATA_TYPE: dict[tuple[DLDataTypeCode, int], DataType] = CustomKeyErrorDict(
    "DLPack data type",
    "Data type",
    {
        (DLDataTypeCode.kDLBool, 8): DataType.BOOL,
        (DLDataTypeCode.kDLInt, 8): DataType.INT8,
        (DLDataTypeCode.kDLInt, 16): DataType.INT16,
        (DLDataTypeCode.kDLInt, 32): DataType.INT32,
        (DLDataTypeCode.kDLInt, 64): DataType.INT64,
        (DLDataTypeCode.kDLUInt, 8): DataType.UINT8,
        (DLDataTypeCode.kDLUInt, 16): DataType.UINT16,
        (DLDataTypeCode.kDLUInt, 32): DataType.UINT32,
        (DLDataTypeCode.kDLUInt, 64): DataType.UINT64,
        (DLDataTypeCode.kDLFloat, 16): DataType.FP16,
        (DLDataTypeCode.kDLFloat, 32): DataType.FP32,
        (DLDataTypeCode.kDLFloat, 64): DataType.FP64,
        (DLDataTypeCode.kDLBfloat, 16): DataType.BF16,
    },
)

# DataType -> DLDataType struct (lanes fixed to 1). Derived from the table
# above, so the two stay in sync; DataType.BYTES and INVALID have no DLPack
# representation and raise ValueError.
DATA_TYPE_TO_DLPACK_DTYPE: dict[DataType, DLDataType] = CustomKeyErrorDict(
    "Data type",
    "DLPack data type",
    {
        value: DLDataType(type_code=key[0], bits=key[1], lanes=1)
        for key, value in DLPACK_TO_DATA_TYPE.items()
    },
)
......@@ -13,7 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""Abstract Class for interacting with Triton Inference Serving Platform Inter-Component Protocol Data Plane"""
"""Abstract Class for interacting with Triton Distributed Inter-Component Protocol Data Plane"""
import abc
import uuid
......@@ -21,28 +21,16 @@ from typing import Optional, Sequence
import cupy
import numpy
from tritonserver import (
from triton_distributed.icp.data_type import (
DATA_TYPE_TO_NUMPY_DTYPE,
DataType,
InvalidArgumentError,
MemoryBuffer,
MemoryType,
Tensor,
)
from tritonserver._api._datautils import (
STRING_TO_TRITON_MEMORY_TYPE,
TRITON_TO_NUMPY_DTYPE,
)
from tritonserver._c.triton_bindings import (
TRITONSERVER_DataTypeString as DataTypeString,
)
from tritonserver._c.triton_bindings import (
TRITONSERVER_MemoryTypeString as MemoryTypeString,
string_to_data_type,
)
from tritonserver._c.triton_bindings import (
TRITONSERVER_StringToDataType as StringToDataType,
)
from triton_distributed.icp.memory_buffer import MemoryBuffer
from triton_distributed.icp.memory_type import MemoryType, string_to_memory_type
from triton_distributed.icp.protos.icp_pb2 import ModelInferRequest, ModelInferResponse
from triton_distributed.icp.tensor import Tensor
class DataPlaneError(Exception):
......@@ -73,13 +61,13 @@ def set_icp_data_type(
message: ModelInferRequest.InferInputTensor | ModelInferResponse.InferOutputTensor,
value: DataType,
) -> None:
message.datatype = DataTypeString(value)
message.datatype = value.name
def get_icp_data_type(
message: ModelInferRequest.InferInputTensor | ModelInferResponse.InferOutputTensor,
) -> DataType:
return StringToDataType(message.datatype)
return string_to_data_type(message.datatype)
def set_icp_tensor_uri(
......@@ -116,7 +104,7 @@ def set_icp_memory_type(
message: ModelInferRequest.InferInputTensor | ModelInferResponse.InferOutputTensor,
value: MemoryType,
) -> None:
message.parameters[ICP_MEMORY_TYPE].string_param = MemoryTypeString(value)
message.parameters[ICP_MEMORY_TYPE].string_param = value.name
def get_icp_memory_type(
......@@ -124,9 +112,7 @@ def get_icp_memory_type(
) -> MemoryType | None:
if ICP_MEMORY_TYPE not in message.parameters:
return None
return STRING_TO_TRITON_MEMORY_TYPE[
message.parameters[ICP_MEMORY_TYPE].string_param
]
return string_to_memory_type(message.parameters[ICP_MEMORY_TYPE].string_param)
def set_icp_memory_type_id(
......@@ -163,9 +149,7 @@ def set_icp_tensor_contents(
with cupy.cuda.Device(tensor.memory_buffer.memory_type_id):
array = cupy.from_dlpack(tensor)
else:
raise InvalidArgumentError(
f"Invalid Tensor Memory Type {tensor.memory_type}"
)
raise ValueError(f"Invalid Tensor Memory Type {tensor.memory_type}")
message.contents.bytes_contents.append(array.tobytes())
......@@ -193,7 +177,7 @@ def get_icp_tensor_contents(
array = numpy.array(
numpy.frombuffer(
message.contents.bytes_contents[0],
dtype=TRITON_TO_NUMPY_DTYPE[datatype],
dtype=DATA_TYPE_TO_NUMPY_DTYPE[datatype],
)
)
tensor = Tensor(datatype, shape, MemoryBuffer.from_dlpack(array))
......
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations
from enum import IntEnum
import numpy
from triton_distributed.icp._custom_key_error_dict import CustomKeyErrorDict
class DataType(IntEnum):
    """Tensor element types used on the ICP wire format.

    The member names double as the wire-level strings (see
    string_to_data_type); the integer values start at 0 with INVALID.
    """

    INVALID = 0
    BOOL = 1
    UINT8 = 2
    UINT16 = 3
    UINT32 = 4
    UINT64 = 5
    INT8 = 6
    INT16 = 7
    INT32 = 8
    INT64 = 9
    FP16 = 10
    FP32 = 11
    FP64 = 12
    BYTES = 13
    BF16 = 14


def string_to_data_type(data_type_string: str) -> DataType:
    """Convert a data-type name (e.g. "FP32") into a DataType member.

    Raises ValueError for names that are not DataType members.
    """
    member = DataType.__members__.get(data_type_string)
    if member is None:
        raise ValueError(
            f"Unsupported Data Type String. Can't convert {data_type_string} to DataType"
        )
    return member
# numpy scalar type -> DataType. Strings and arbitrary objects travel as
# BYTES; unknown dtypes raise ValueError via CustomKeyErrorDict.
NUMPY_TO_DATA_TYPE: dict[type, DataType] = CustomKeyErrorDict(
    "Numpy dtype",
    "Data type",
    {
        bool: DataType.BOOL,
        numpy.bool_: DataType.BOOL,
        numpy.int8: DataType.INT8,
        numpy.int16: DataType.INT16,
        numpy.int32: DataType.INT32,
        numpy.int64: DataType.INT64,
        numpy.uint8: DataType.UINT8,
        numpy.uint16: DataType.UINT16,
        numpy.uint32: DataType.UINT32,
        numpy.uint64: DataType.UINT64,
        numpy.float16: DataType.FP16,
        numpy.float32: DataType.FP32,
        numpy.float64: DataType.FP64,
        numpy.bytes_: DataType.BYTES,
        numpy.str_: DataType.BYTES,
        numpy.object_: DataType.BYTES,
    },
)

# Inverse mapping. The reversed comprehension keeps only the LAST numpy type
# inserted for each DataType, so the many-to-one entries (BOOL, BYTES) are
# pinned explicitly to the canonical numpy type.
DATA_TYPE_TO_NUMPY_DTYPE: dict[DataType, type] = CustomKeyErrorDict(
    "Data type",
    "Numpy dtype",
    {
        **{value: key for key, value in NUMPY_TO_DATA_TYPE.items()},
        **{DataType.BYTES: numpy.object_},
        **{DataType.BOOL: numpy.bool_},
    },
)
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations
from dataclasses import dataclass
from typing import Any
from triton_distributed.icp._dlpack import DLPackObject
from triton_distributed.icp.memory_type import MemoryType
@dataclass
class MemoryBuffer:
    """A non-owning handle to the memory backing a Tensor.

    The buffer never frees the memory itself; it keeps a reference to
    ``owner`` so the real allocator cannot release the region while this
    handle is alive.

    Parameters
    ----------
    data_ptr : int
        Address of the first byte of the region.
    memory_type : MemoryType
        Kind of memory the region lives in.
    memory_type_id : int
        Memory type id (typically the same as the device id).
    size : int
        Region size in bytes.
    owner : Any
        Object that owns or manages the allocation; it must stay alive for
        the lifetime of this buffer.

    Examples
    --------
    >>> buffer = MemoryBuffer.from_dlpack(numpy.array([100],dtype=numpy.uint8))
    """

    data_ptr: int
    memory_type: MemoryType
    memory_type_id: int
    size: int
    owner: Any

    @staticmethod
    def from_dlpack(owner: Any) -> MemoryBuffer:
        """Build a buffer over any DLPack-capable object, kept alive as owner."""
        if not hasattr(owner, "__dlpack__"):
            raise ValueError("Object does not support DLpack protocol")
        return MemoryBuffer._from_dlpack_object(owner, DLPackObject(owner))

    @staticmethod
    def _from_dlpack_object(owner: Any, dlpack_object: DLPackObject) -> MemoryBuffer:
        """Translate an already-parsed DLPackObject into a MemoryBuffer."""
        # Strided views cannot be described by (pointer, size) alone.
        if not dlpack_object.contiguous:
            raise ValueError("Only contiguous memory is supported")
        return MemoryBuffer(
            data_ptr=int(dlpack_object.data_ptr),
            memory_type=dlpack_object.memory_type,
            memory_type_id=dlpack_object.memory_type_id,
            size=dlpack_object.byte_size,
            owner=owner,
        )
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from enum import IntEnum
class MemoryType(IntEnum):
    """Where a buffer lives: host memory, page-locked host memory, or device
    memory. Member names double as the wire-level strings."""

    CPU = 0
    CPU_PINNED = 1
    GPU = 2


def string_to_memory_type(memory_type_string: str) -> MemoryType:
    """Convert a memory-type name (e.g. "GPU") into a MemoryType member.

    Raises ValueError for names that are not MemoryType members.
    """
    member = MemoryType.__members__.get(memory_type_string)
    if member is None:
        raise ValueError(
            f"Unsupported Memory Type String. Can't convert {memory_type_string} to MemoryType"
        )
    return member
......@@ -25,7 +25,6 @@ from typing import Dict, Optional
from urllib.parse import urlsplit, urlunsplit
import nats
from tritonserver import InvalidArgumentError
from triton_distributed.icp.protos.icp_pb2 import ModelInferRequest, ModelInferResponse
from triton_distributed.icp.request_plane import (
......@@ -203,7 +202,7 @@ class NatsRequestPlane(RequestPlane):
Optional[nats.js.JetStreamContext.PullSubscription],
]:
if self._jet_stream is None:
raise InvalidArgumentError(
raise ValueError(
"Failed to get model stream: NATS Jetstream not connected!"
)
......@@ -335,19 +334,15 @@ class NatsRequestPlane(RequestPlane):
responses: AsyncIterator[ModelInferResponse] | ModelInferResponse,
):
if self._jet_stream is None:
raise InvalidArgumentError(
"Failed to post response: NATS Jetstream not connected!"
)
raise ValueError("Failed to post response: NATS Jetstream not connected!")
request_id = get_icp_request_id(request)
if request_id is None:
raise InvalidArgumentError("ICP request must have request id")
raise ValueError("ICP request must have request id")
response_to_uri = get_icp_response_to_uri(request)
if not response_to_uri:
raise InvalidArgumentError(
"Attempting to send a response when non requested"
)
raise ValueError("Attempting to send a response when non requested")
parsed = urlsplit(response_to_uri)
response_stream = parsed.path.replace("/", "")
......@@ -378,12 +373,10 @@ class NatsRequestPlane(RequestPlane):
] = None,
) -> AsyncIterator[ModelInferResponse]:
if self._jet_stream is None:
raise InvalidArgumentError(
"Failed to post request: NATS Jetstream not connected!"
)
raise ValueError("Failed to post request: NATS Jetstream not connected!")
if response_iterator and response_handler:
raise InvalidArgumentError(
raise ValueError(
"Can only specify either response handler or response iterator"
)
......
......@@ -13,14 +13,12 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""Abstract Class for interacting with the Triton Inference Serving Platform Inter-Component Protocol Control Plane"""
"""Abstract Class for interacting with the Triton Distributed Inter-Component Protocol Control Plane"""
import abc
import uuid
from typing import AsyncIterator, Awaitable, Callable, Optional
from tritonserver import TritonError
from triton_distributed.icp.protos.icp_pb2 import ModelInferRequest, ModelInferResponse
ICP_REQUEST_ID = "icp_request_id"
......@@ -33,6 +31,10 @@ ICP_REQUEST_CANCELLED = "icp_request_cancelled"
ICP_ERROR = "icp_response_error"
class RequestPlaneError(Exception):
pass
def get_icp_request_id(
message: ModelInferRequest | ModelInferResponse,
) -> uuid.UUID | None:
......@@ -47,13 +49,15 @@ def set_icp_request_id(
message.parameters[ICP_REQUEST_ID].string_param = str(value)
def get_icp_response_error(message: ModelInferResponse) -> TritonError | None:
def get_icp_response_error(message: ModelInferResponse) -> RequestPlaneError | None:
if ICP_ERROR not in message.parameters:
return None
return TritonError(message.parameters[ICP_ERROR].string_param)
return RequestPlaneError(message.parameters[ICP_ERROR].string_param)
def set_icp_response_error(message: ModelInferResponse, value: TritonError) -> None:
def set_icp_response_error(
message: ModelInferResponse, value: RequestPlaneError
) -> None:
message.parameters[ICP_ERROR].string_param = str(value)
......
This diff is collapsed.
......@@ -25,10 +25,8 @@ from urllib.parse import urlsplit
import cupy
import numpy
import tritonserver
import ucp
from cupy_backends.cuda.api.runtime import CUDARuntimeError
from tritonserver import InvalidArgumentError, MemoryBuffer, MemoryType, Tensor
from triton_distributed.icp.data_plane import (
DataPlane,
......@@ -48,7 +46,11 @@ from triton_distributed.icp.data_plane import (
set_icp_tensor_size,
set_icp_tensor_uri,
)
from triton_distributed.icp.data_type import DataType
from triton_distributed.icp.memory_buffer import MemoryBuffer
from triton_distributed.icp.memory_type import MemoryType
from triton_distributed.icp.protos.icp_pb2 import ModelInferRequest, ModelInferResponse
from triton_distributed.icp.tensor import Tensor
LOGGER = logging.getLogger(__name__)
......@@ -175,20 +177,18 @@ class _UcpDataPlane(DataPlane):
if tensor_id in self._tensor_store:
tensor = self._tensor_store[tensor_id]
array_module = numpy
if tensor.memory_type == tritonserver.MemoryType.CPU:
if tensor.memory_type == MemoryType.CPU:
array_module = numpy
device_manager = contextlib.nullcontext()
elif tensor.memory_type == tritonserver.MemoryType.GPU:
elif tensor.memory_type == MemoryType.GPU:
array_module = cupy
device_manager = cupy.cuda.Device(
tensor.memory_buffer.memory_type_id
)
else:
raise InvalidArgumentError(
f"Invalid Memory Type {tensor.memory_type}"
)
raise ValueError(f"Invalid Memory Type {tensor.memory_type}")
with device_manager:
if tensor.data_type == tritonserver.DataType.BYTES:
if tensor.data_type == DataType.BYTES:
array = tensor.memory_buffer.owner
else:
array = array_module.from_dlpack(tensor)
......@@ -343,7 +343,7 @@ class _UcpDataPlane(DataPlane):
if requested_memory_type is not None:
memory_type = requested_memory_type
if memory_type == tritonserver.MemoryType.GPU and self._cuda_is_available:
if memory_type == MemoryType.GPU and self._cuda_is_available:
array_module = cupy
if requested_memory_type_id is not None:
device_manager = cupy.cuda.Device(requested_memory_type_id)
......
......@@ -24,10 +24,11 @@ import numpy
import pytest
import ucp
from cupy_backends.cuda.api.runtime import CUDARuntimeError
from tritonserver import DataType, MemoryType, Tensor
from tritonserver._api._datautils import TRITON_TO_NUMPY_DTYPE
from triton_distributed.icp.data_plane import DataPlaneError
from triton_distributed.icp.data_type import DATA_TYPE_TO_NUMPY_DTYPE, DataType
from triton_distributed.icp.memory_type import MemoryType
from triton_distributed.icp.tensor import Tensor
from triton_distributed.icp.ucp_data_plane import (
UcpDataPlane,
get_icp_tensor_uri,
......@@ -283,7 +284,7 @@ def test_requested_memory_type(memory_type, memory_type_id, request):
def _get_random_tensor(data_type: DataType, size: Sequence[int]):
dtype = TRITON_TO_NUMPY_DTYPE[data_type]
dtype = DATA_TYPE_TO_NUMPY_DTYPE[data_type]
value = numpy.random.rand(*size)
return value.astype(dtype)
......
[build-system]
requires = ["setuptools>=65.0", "setuptools-scm>=8"]
build-backend = "setuptools.build_meta"
[project]
name = "triton_distributed.runtime"
dynamic = ["version"]
dependencies = ["triton_distributed.icp >= 0"]
[tool.setuptools_scm]
version_file = "src/triton_distributed/runtime/_version.py"
root = "../.."
[tool.setuptools.packages.find]
where = ["src"]
include = ["triton_distributed.runtime*"]
namespaces = true
......@@ -24,8 +24,13 @@ from triton_distributed.runtime.remote_request import (
from triton_distributed.runtime.remote_response import (
RemoteInferenceResponse as RemoteInferenceResponse,
)
from triton_distributed.runtime.triton_core_operator import (
try:
from triton_distributed.runtime.triton_core_operator import (
TritonCoreOperator as TritonCoreOperator,
)
)
except ImportError:
pass
from triton_distributed.runtime.worker import Worker as Worker
from triton_distributed.runtime.worker import WorkerConfig as WorkerConfig
......@@ -16,8 +16,6 @@ import multiprocessing
from pprint import pformat
from typing import Optional, Type
from tritonserver import InvalidArgumentError
from triton_distributed.icp import (
DataPlane,
NatsRequestPlane,
......@@ -71,7 +69,7 @@ class Deployment:
if self._default_request_plane == NatsRequestPlane:
self.request_plane_server = NatsServer(log_dir=self._default_log_dir)
else:
raise InvalidArgumentError(
raise ValueError(
f"Unknown Request Plane Type, can not initialize {self._default_request_plane}"
)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment